ffs_snapshot.c revision 1.66
1/* $NetBSD: ffs_snapshot.c,v 1.66 2008/04/17 09:52:47 hannken Exp $ */ 2 3/* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40#include <sys/cdefs.h> 41__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.66 2008/04/17 09:52:47 hannken Exp $"); 42 43#if defined(_KERNEL_OPT) 44#include "opt_ffs.h" 45#endif 46 47#include <sys/param.h> 48#include <sys/kernel.h> 49#include <sys/systm.h> 50#include <sys/conf.h> 51#include <sys/buf.h> 52#include <sys/proc.h> 53#include <sys/namei.h> 54#include <sys/sched.h> 55#include <sys/stat.h> 56#include <sys/malloc.h> 57#include <sys/mount.h> 58#include <sys/resource.h> 59#include <sys/resourcevar.h> 60#include <sys/vnode.h> 61#include <sys/kauth.h> 62#include <sys/fstrans.h> 63 64#include <miscfs/specfs/specdev.h> 65 66#include <ufs/ufs/quota.h> 67#include <ufs/ufs/ufsmount.h> 68#include <ufs/ufs/inode.h> 69#include <ufs/ufs/ufs_extern.h> 70#include <ufs/ufs/ufs_bswap.h> 71 72#include <ufs/ffs/fs.h> 73#include <ufs/ffs/ffs_extern.h> 74 75/* FreeBSD -> NetBSD conversion */ 76#define KERNCRED lwp0.l_cred 77#define ufs1_daddr_t int32_t 78#define ufs2_daddr_t int64_t 79#define ufs_lbn_t daddr_t 80#define VI_MTX(v) (&(v)->v_interlock) 81#define VI_LOCK(v) mutex_enter(&(v)->v_interlock) 82#define VI_UNLOCK(v) mutex_exit(&(v)->v_interlock) 83#define MNT_ILOCK(v) mutex_enter(&mntvnode_lock) 84#define MNT_IUNLOCK(v) mutex_exit(&mntvnode_lock) 85 86#if !defined(FFS_NO_SNAPSHOT) 87static int cgaccount(int, struct vnode *, void *, int); 88static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 89 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 90 ufs_lbn_t, int), int); 91static int indiracct_ufs1(struct vnode *, struct vnode *, int, 92 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 93 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 94 ufs_lbn_t, int), int); 95static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 96 struct fs *, 
ufs_lbn_t, int); 97static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 98 struct fs *, ufs_lbn_t, int); 99static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 100 struct fs *, ufs_lbn_t, int); 101static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 102 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 103 ufs_lbn_t, int), int); 104static int indiracct_ufs2(struct vnode *, struct vnode *, int, 105 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 106 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 107 ufs_lbn_t, int), int); 108static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 109 struct fs *, ufs_lbn_t, int); 110static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 111 struct fs *, ufs_lbn_t, int); 112static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 113 struct fs *, ufs_lbn_t, int); 114static int readvnblk(struct vnode *, void *, ufs2_daddr_t); 115#endif /* !defined(FFS_NO_SNAPSHOT) */ 116 117static int ffs_copyonwrite(void *, struct buf *, bool); 118static int readfsblk(struct vnode *, void *, ufs2_daddr_t); 119static int writevnblk(struct vnode *, void *, ufs2_daddr_t); 120static inline int cow_enter(void); 121static inline void cow_leave(int); 122static inline ufs2_daddr_t db_get(struct inode *, int); 123static inline void db_assign(struct inode *, int, ufs2_daddr_t); 124static inline ufs2_daddr_t idb_get(struct inode *, void *, int); 125static inline void idb_assign(struct inode *, void *, int, ufs2_daddr_t); 126 127struct snap_info { 128 kmutex_t si_lock; /* Lock this snapinfo */ 129 struct vnlock si_vnlock; /* Snapshot vnode common lock */ 130 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 131 daddr_t *si_snapblklist; /* Snapshot block hints list */ 132 uint32_t si_gen; /* Incremented on change */ 133}; 134 135#ifdef DEBUG 136static int snapdebug = 0; 
137#endif 138 139int 140ffs_snapshot_init(struct ufsmount *ump) 141{ 142 struct snap_info *si; 143 144 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); 145 if (si == NULL) 146 return ENOMEM; 147 148 TAILQ_INIT(&si->si_snapshots); 149 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); 150 rw_init(&si->si_vnlock.vl_lock); 151 si->si_vnlock.vl_canrecurse = 1; 152 si->si_vnlock.vl_recursecnt = 0; 153 si->si_gen = 0; 154 si->si_snapblklist = NULL; 155 156 return 0; 157} 158 159void 160ffs_snapshot_fini(struct ufsmount *ump) 161{ 162 struct snap_info *si; 163 164 si = ump->um_snapinfo; 165 ump->um_snapinfo = NULL; 166 167 KASSERT(TAILQ_EMPTY(&si->si_snapshots)); 168 mutex_destroy(&si->si_lock); 169 rw_destroy(&si->si_vnlock.vl_lock); 170 KASSERT(si->si_snapblklist == NULL); 171 kmem_free(si, sizeof(*si)); 172} 173 174/* 175 * Create a snapshot file and initialize it for the filesystem. 176 * Vnode is locked on entry and return. 177 */ 178int 179ffs_snapshot(struct mount *mp, struct vnode *vp, 180 struct timespec *ctime) 181{ 182#if defined(FFS_NO_SNAPSHOT) 183 return EOPNOTSUPP; 184} 185#else /* defined(FFS_NO_SNAPSHOT) */ 186 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist; 187 int error, ns, cg, snaploc; 188 int i, size, len, loc; 189 int flag = mp->mnt_flag; 190 struct timeval starttime; 191#ifdef DEBUG 192 struct timeval endtime; 193#endif 194 struct timespec ts; 195 long redo = 0; 196 int32_t *lp; 197 void *space; 198 void *sbbuf = NULL; 199 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 200 struct lwp *l = curlwp; 201 struct inode *ip, *xp; 202 struct buf *bp, *ibp, *nbp; 203 struct vattr vat; 204 struct vnode *xvp, *mvp, *devvp; 205 struct snap_info *si; 206 207 ns = UFS_FSNEEDSWAP(fs); 208 si = VFSTOUFS(mp)->um_snapinfo; 209 210 /* 211 * Need to serialize access to snapshot code per filesystem. 212 */ 213 /* 214 * If the vnode already is a snapshot, return. 
215 */ 216 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 217 if (ctime) { 218 ctime->tv_sec = DIP(VTOI(vp), mtime); 219 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 220 } 221 return 0; 222 } 223 /* 224 * Check mount, exclusive reference and owner. 225 */ 226 if (vp->v_mount != mp) 227 return EXDEV; 228 if (vp->v_usecount != 1 || vp->v_writecount != 0) 229 return EBUSY; 230 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 231 NULL) != 0 && 232 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred)) 233 return EACCES; 234 235 if (vp->v_size != 0) { 236 error = ffs_truncate(vp, 0, 0, NOCRED); 237 if (error) 238 return error; 239 } 240 /* 241 * Assign a snapshot slot in the superblock. 242 */ 243 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 244 if (fs->fs_snapinum[snaploc] == 0) 245 break; 246 if (snaploc == FSMAXSNAP) 247 return (ENOSPC); 248 ip = VTOI(vp); 249 devvp = ip->i_devvp; 250 /* 251 * Write an empty list of preallocated blocks to the end of 252 * the snapshot to set size to at least that of the filesystem. 253 */ 254 numblks = howmany(fs->fs_size, fs->fs_frag); 255 blkno = 1; 256 blkno = ufs_rw64(blkno, ns); 257 error = vn_rdwr(UIO_WRITE, vp, 258 (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 259 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 260 if (error) 261 goto out; 262 /* 263 * Preallocate critical data structures so that we can copy 264 * them in without further allocation after we suspend all 265 * operations on the filesystem. We would like to just release 266 * the allocated buffers without writing them since they will 267 * be filled in below once we are ready to go, but this upsets 268 * the soft update code, so we go ahead and write the new buffers. 269 * 270 * Allocate all indirect blocks and mark all of them as not 271 * needing to be copied. 
272 */ 273 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 274 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 275 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 276 if (error) 277 goto out; 278 bawrite(ibp); 279 } 280 /* 281 * Allocate copies for the superblock and its summary information. 282 */ 283 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 284 0, &nbp); 285 if (error) 286 goto out; 287 bawrite(nbp); 288 blkno = fragstoblks(fs, fs->fs_csaddr); 289 len = howmany(fs->fs_cssize, fs->fs_bsize); 290 for (loc = 0; loc < len; loc++) { 291 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), 292 fs->fs_bsize, KERNCRED, 0, &nbp); 293 if (error) 294 goto out; 295 bawrite(nbp); 296 } 297 /* 298 * Copy all the cylinder group maps. Although the 299 * filesystem is still active, we hope that only a few 300 * cylinder groups will change between now and when we 301 * suspend operations. Thus, we will be able to quickly 302 * touch up the few cylinder groups that changed during 303 * the suspension period. 304 */ 305 len = howmany(fs->fs_ncg, NBBY); 306 fs->fs_active = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO); 307 for (cg = 0; cg < fs->fs_ncg; cg++) { 308 if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 309 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 310 goto out; 311 error = cgaccount(cg, vp, nbp->b_data, 1); 312 bawrite(nbp); 313 if (error) 314 goto out; 315 } 316 /* 317 * Change inode to snapshot type file. 318 */ 319 ip->i_flags |= SF_SNAPSHOT; 320 DIP_ASSIGN(ip, flags, ip->i_flags); 321 ip->i_flag |= IN_CHANGE | IN_UPDATE; 322 /* 323 * Ensure that the snapshot is completely on disk. 324 * Since we have marked it as a snapshot it is safe to 325 * unlock it as no process will be allowed to write to it. 326 */ 327 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0)) != 0) 328 goto out; 329 VOP_UNLOCK(vp, 0); 330 /* 331 * All allocations are done, so we can now snapshot the system. 
332 * 333 * Suspend operation on filesystem. 334 */ 335 if ((error = vfs_suspend(vp->v_mount, 0)) != 0) { 336 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 337 goto out; 338 } 339 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 340 getmicrotime(&starttime); 341 /* 342 * First, copy all the cylinder group maps that have changed. 343 */ 344 for (cg = 0; cg < fs->fs_ncg; cg++) { 345 if (ACTIVECG_ISSET(fs, cg)) 346 continue; 347 redo++; 348 if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 349 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 350 goto out1; 351 error = cgaccount(cg, vp, nbp->b_data, 2); 352 bawrite(nbp); 353 if (error) 354 goto out1; 355 } 356 /* 357 * Grab a copy of the superblock and its summary information. 358 * We delay writing it until the suspension is released below. 359 */ 360 sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 361 loc = blkoff(fs, fs->fs_sblockloc); 362 if (loc > 0) 363 memset(sbbuf, 0, loc); 364 copy_fs = (struct fs *)((char *)sbbuf + loc); 365 bcopy(fs, copy_fs, fs->fs_sbsize); 366 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 367 if (fs->fs_sbsize < size) 368 memset((char *)sbbuf + loc + fs->fs_sbsize, 0, 369 size - fs->fs_sbsize); 370 size = blkroundup(fs, fs->fs_cssize); 371 if (fs->fs_contigsumsize > 0) 372 size += fs->fs_ncg * sizeof(int32_t); 373 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 374 copy_fs->fs_csp = space; 375 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 376 space = (char *)space + fs->fs_cssize; 377 loc = howmany(fs->fs_cssize, fs->fs_fsize); 378 i = fs->fs_frag - loc % fs->fs_frag; 379 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 380 if (len > 0) { 381 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 382 len, KERNCRED, &bp)) != 0) { 383 brelse(bp, 0); 384 free(copy_fs->fs_csp, M_UFSMNT); 385 goto out1; 386 } 387 bcopy(bp->b_data, space, (u_int)len); 388 space = (char *)space + len; 389 brelse(bp, BC_INVAL | BC_NOCACHE); 390 } 391 if (fs->fs_contigsumsize > 0) { 392 copy_fs->fs_maxcluster = lp = space; 393 for (i = 0; i < fs->fs_ncg; i++) 394 *lp++ = fs->fs_contigsumsize; 395 } 396 /* 397 * We must check for active files that have been unlinked 398 * (e.g., with a zero link count). We have to expunge all 399 * trace of these files from the snapshot so that they are 400 * not reclaimed prematurely by fsck or unnecessarily dumped. 401 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 402 * spec_strategy about writing on a suspended filesystem. 403 * Note that we skip unlinked snapshot files as they will 404 * be handled separately below. 405 * 406 * We also calculate the needed size for the snapshot list. 407 */ 408 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 409 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 410 /* Allocate a marker vnode */ 411 if ((mvp = vnalloc(mp)) == NULL) { 412 error = ENOMEM; 413 goto out1; 414 } 415 MNT_ILOCK(mp); 416 /* 417 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 418 * and vclean() can be called indirectly 419 */ 420 for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { 421 vmark(mvp, xvp); 422 /* 423 * Make sure this vnode wasn't reclaimed in getnewvnode(). 424 * Start over if it has (it won't be on the list anymore). 
425 */ 426 if (xvp->v_mount != mp || vismarker(xvp)) 427 continue; 428 VI_LOCK(xvp); 429 if ((xvp->v_iflag & VI_XLOCK) || 430 xvp->v_usecount == 0 || xvp->v_type == VNON || 431 VTOI(xvp) == NULL || 432 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 433 VI_UNLOCK(xvp); 434 continue; 435 } 436 MNT_IUNLOCK(mp); 437 /* 438 * XXXAD should increase vnode ref count to prevent it 439 * disappearing or being recycled. 440 */ 441 VI_UNLOCK(xvp); 442#ifdef DEBUG 443 if (snapdebug) 444 vprint("ffs_snapshot: busy vnode", xvp); 445#endif 446 if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && 447 vat.va_nlink > 0) { 448 MNT_ILOCK(mp); 449 continue; 450 } 451 xp = VTOI(xvp); 452 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 453 MNT_ILOCK(mp); 454 continue; 455 } 456 /* 457 * If there is a fragment, clear it here. 458 */ 459 blkno = 0; 460 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 461 if (loc < NDADDR) { 462 len = fragroundup(fs, blkoff(fs, xp->i_size)); 463 if (len > 0 && len < fs->fs_bsize) { 464 ffs_blkfree(copy_fs, vp, db_get(xp, loc), 465 len, xp->i_number); 466 blkno = db_get(xp, loc); 467 db_assign(xp, loc, 0); 468 } 469 } 470 snaplistsize += 1; 471 if (xp->i_ump->um_fstype == UFS1) 472 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 473 BLK_NOCOPY); 474 else 475 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 476 BLK_NOCOPY); 477 if (blkno) 478 db_assign(xp, loc, blkno); 479 if (!error) 480 error = ffs_freefile(copy_fs, vp, xp->i_number, 481 xp->i_mode); 482 if (error) { 483 free(copy_fs->fs_csp, M_UFSMNT); 484 (void)vunmark(mvp); 485 goto out1; 486 } 487 MNT_ILOCK(mp); 488 } 489 MNT_IUNLOCK(mp); 490 vnfree(mvp); 491 /* 492 * Acquire the snapshot lock and give up our original private lock. 
493 */ 494 VI_LOCK(vp); 495 vp->v_vnlock = &si->si_vnlock; 496 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 497 vlockmgr(&vp->v_lock, LK_RELEASE); 498 /* 499 * If this is the first snapshot on this filesystem, then we need 500 * to allocate the space for the list of preallocated snapshot blocks. 501 * This list will be refined below, but this preliminary one will 502 * keep us out of deadlock until the full one is ready. 503 */ 504 mutex_enter(&si->si_lock); 505 if ((xp = TAILQ_FIRST(&si->si_snapshots)) == NULL) { 506 mutex_exit(&si->si_lock); 507 snapblklist = malloc( 508 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 509 blkp = &snapblklist[1]; 510 *blkp++ = lblkno(fs, fs->fs_sblockloc); 511 blkno = fragstoblks(fs, fs->fs_csaddr); 512 for (cg = 0; cg < fs->fs_ncg; cg++) { 513 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 514 break; 515 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 516 } 517 len = howmany(fs->fs_cssize, fs->fs_bsize); 518 for (loc = 0; loc < len; loc++) 519 *blkp++ = blkno + loc; 520 for (; cg < fs->fs_ncg; cg++) 521 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 522 snapblklist[0] = blkp - snapblklist; 523 mutex_enter(&si->si_lock); 524 if (si->si_snapblklist != NULL) 525 panic("ffs_snapshot: non-empty list"); 526 si->si_snapblklist = snapblklist; 527 } 528 /* 529 * Record snapshot inode. Since this is the newest snapshot, 530 * it must be placed at the end of the list. 531 */ 532 fs->fs_snapinum[snaploc] = ip->i_number; 533 if (ip->i_nextsnap.tqe_prev != 0) 534 panic("ffs_snapshot: %llu already on list", 535 (unsigned long long)ip->i_number); 536 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 537 if (xp == NULL) 538 fscow_establish(mp, ffs_copyonwrite, devvp); 539 si->si_gen++; 540 mutex_exit(&si->si_lock); 541 vp->v_vflag |= VV_SYSTEM; 542out1: 543 /* 544 * Resume operation on filesystem. 545 */ 546 vfs_resume(vp->v_mount); 547 /* 548 * Set the mtime to the time the snapshot has been taken. 
549 */ 550 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 551 if (ctime) 552 *ctime = ts; 553 DIP_ASSIGN(ip, mtime, ts.tv_sec); 554 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 555 ip->i_flag |= IN_CHANGE | IN_UPDATE; 556 557#ifdef DEBUG 558 if (starttime.tv_sec > 0) { 559 getmicrotime(&endtime); 560 timersub(&endtime, &starttime, &endtime); 561 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 562 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 563 endtime.tv_usec / 1000, redo, fs->fs_ncg); 564 } 565#endif 566 if (error) 567 goto out; 568 /* 569 * Copy allocation information from all the snapshots in 570 * this snapshot and then expunge them from its view. 571 */ 572 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { 573 if (xp == ip) 574 break; 575 if (xp->i_ump->um_fstype == UFS1) 576 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 577 BLK_SNAP); 578 else 579 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 580 BLK_SNAP); 581 if (error == 0 && xp->i_ffs_effnlink == 0) 582 error = ffs_freefile(copy_fs, vp, 583 xp->i_number, xp->i_mode); 584 if (error) { 585 fs->fs_snapinum[snaploc] = 0; 586 goto done; 587 } 588 } 589 /* 590 * Allocate space for the full list of preallocated snapshot blocks. 591 */ 592 snapblklist = malloc(snaplistsize * sizeof(ufs2_daddr_t), 593 M_UFSMNT, M_WAITOK); 594 ip->i_snapblklist = &snapblklist[1]; 595 /* 596 * Expunge the blocks used by the snapshots from the set of 597 * blocks marked as used in the snapshot bitmaps. Also, collect 598 * the list of allocated blocks in i_snapblklist. 
599 */ 600 if (ip->i_ump->um_fstype == UFS1) 601 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 602 else 603 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 604 if (error) { 605 fs->fs_snapinum[snaploc] = 0; 606 FREE(snapblklist, M_UFSMNT); 607 goto done; 608 } 609 if (snaplistsize < ip->i_snapblklist - snapblklist) 610 panic("ffs_snapshot: list too small"); 611 snaplistsize = ip->i_snapblklist - snapblklist; 612 snapblklist[0] = snaplistsize; 613 ip->i_snapblklist = &snapblklist[0]; 614 /* 615 * Write out the list of allocated blocks to the end of the snapshot. 616 */ 617 for (i = 0; i < snaplistsize; i++) 618 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 619 error = vn_rdwr(UIO_WRITE, vp, (void *)snapblklist, 620 snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks), 621 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 622 for (i = 0; i < snaplistsize; i++) 623 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 624 if (error) { 625 fs->fs_snapinum[snaploc] = 0; 626 FREE(snapblklist, M_UFSMNT); 627 goto done; 628 } 629 /* 630 * Write the superblock and its summary information 631 * to the snapshot. 632 */ 633 blkno = fragstoblks(fs, fs->fs_csaddr); 634 len = howmany(fs->fs_cssize, fs->fs_bsize); 635 space = copy_fs->fs_csp; 636#ifdef FFS_EI 637 if (ns) { 638 ffs_sb_swap(copy_fs, copy_fs); 639 ffs_csum_swap(space, space, fs->fs_cssize); 640 } 641#endif 642 for (loc = 0; loc < len; loc++) { 643 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 644 if (error) { 645 brelse(nbp, 0); 646 fs->fs_snapinum[snaploc] = 0; 647 FREE(snapblklist, M_UFSMNT); 648 goto done; 649 } 650 bcopy(space, nbp->b_data, fs->fs_bsize); 651 space = (char *)space + fs->fs_bsize; 652 bawrite(nbp); 653 } 654 /* 655 * As this is the newest list, it is the most inclusive, so 656 * should replace the previous list. If this is the first snapshot 657 * free the preliminary list. 
658 */ 659 mutex_enter(&si->si_lock); 660 space = si->si_snapblklist; 661 si->si_snapblklist = snapblklist; 662 if (TAILQ_FIRST(&si->si_snapshots) == ip) 663 FREE(space, M_UFSMNT); 664 si->si_gen++; 665 mutex_exit(&si->si_lock); 666done: 667 free(copy_fs->fs_csp, M_UFSMNT); 668 if (!error) { 669 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 670 KERNCRED, &nbp); 671 if (error) { 672 brelse(nbp, 0); 673 fs->fs_snapinum[snaploc] = 0; 674 } 675 bcopy(sbbuf, nbp->b_data, fs->fs_bsize); 676 bawrite(nbp); 677 } 678out: 679 /* 680 * Invalidate and free all pages on the snapshot vnode. 681 * All metadata has been written through the buffer cache. 682 * Clean all dirty buffers now to avoid UBC inconsistencies. 683 */ 684 if (!error) { 685 mutex_enter(&vp->v_interlock); 686 error = VOP_PUTPAGES(vp, 0, 0, 687 PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 688 } 689 if (!error) { 690 mutex_enter(&bufcache_lock); 691 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 692 nbp = LIST_NEXT(bp, b_vnbufs); 693 bp->b_cflags |= BC_BUSY|BC_VFLUSH; 694 if (LIST_FIRST(&bp->b_dep) == NULL) 695 bp->b_cflags |= BC_NOCACHE; 696 mutex_exit(&bufcache_lock); 697 bwrite(bp); 698 mutex_enter(&bufcache_lock); 699 } 700 mutex_exit(&bufcache_lock); 701 702 mutex_enter(&vp->v_interlock); 703 while (vp->v_numoutput > 0) 704 cv_wait(&vp->v_cv, &vp->v_interlock); 705 mutex_exit(&vp->v_interlock); 706 } 707 if (sbbuf) 708 free(sbbuf, M_UFSMNT); 709 if (fs->fs_active != 0) { 710 FREE(fs->fs_active, M_DEVBUF); 711 fs->fs_active = 0; 712 } 713 mp->mnt_flag = flag; 714 if (error) 715 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 716 else 717 vref(vp); 718 return (error); 719} 720 721/* 722 * Copy a cylinder group map. All the unallocated blocks are marked 723 * BLK_NOCOPY so that the snapshot knows that it need not copy them 724 * if they are later written. If passno is one, then this is a first 725 * pass, so only setting needs to be done. 
If passno is 2, then this 726 * is a revision to a previous pass which must be undone as the 727 * replacement pass is done. 728 */ 729static int 730cgaccount(int cg, struct vnode *vp, void *data, int passno) 731{ 732 struct buf *bp, *ibp; 733 struct inode *ip; 734 struct cg *cgp; 735 struct fs *fs; 736 ufs2_daddr_t base, numblks; 737 int error, len, loc, ns, indiroff; 738 739 ip = VTOI(vp); 740 fs = ip->i_fs; 741 ns = UFS_FSNEEDSWAP(fs); 742 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 743 (int)fs->fs_cgsize, KERNCRED, &bp); 744 if (error) { 745 brelse(bp, 0); 746 return (error); 747 } 748 cgp = (struct cg *)bp->b_data; 749 if (!cg_chkmagic(cgp, ns)) { 750 brelse(bp, 0); 751 return (EIO); 752 } 753 ACTIVECG_SET(fs, cg); 754 755 bcopy(bp->b_data, data, fs->fs_cgsize); 756 brelse(bp, 0); 757 if (fs->fs_cgsize < fs->fs_bsize) 758 memset((char *)data + fs->fs_cgsize, 0, 759 fs->fs_bsize - fs->fs_cgsize); 760 numblks = howmany(fs->fs_size, fs->fs_frag); 761 len = howmany(fs->fs_fpg, fs->fs_frag); 762 base = cg * fs->fs_fpg / fs->fs_frag; 763 if (base + len >= numblks) 764 len = numblks - base - 1; 765 loc = 0; 766 if (base < NDADDR) { 767 for ( ; loc < NDADDR; loc++) { 768 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 769 db_assign(ip, loc, BLK_NOCOPY); 770 else if (db_get(ip, loc) == BLK_NOCOPY) { 771 if (passno == 2) 772 db_assign(ip, loc, 0); 773 else if (passno == 1) 774 panic("ffs_snapshot: lost direct block"); 775 } 776 } 777 } 778 if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), 779 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 780 return (error); 781 indiroff = (base + loc - NDADDR) % NINDIR(fs); 782 for ( ; loc < len; loc++, indiroff++) { 783 if (indiroff >= NINDIR(fs)) { 784 bawrite(ibp); 785 if ((error = ffs_balloc(vp, 786 lblktosize(fs, (off_t)(base + loc)), 787 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 788 return (error); 789 indiroff = 0; 790 } 791 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 792 idb_assign(ip, 
ibp->b_data, indiroff, BLK_NOCOPY); 793 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 794 if (passno == 2) 795 idb_assign(ip, ibp->b_data, indiroff, 0); 796 else if (passno == 1) 797 panic("ffs_snapshot: lost indirect block"); 798 } 799 } 800 bdwrite(ibp); 801 return (0); 802} 803 804/* 805 * Before expunging a snapshot inode, note all the 806 * blocks that it claims with BLK_SNAP so that fsck will 807 * be able to account for those blocks properly and so 808 * that this snapshot knows that it need not copy them 809 * if the other snapshot holding them is freed. This code 810 * is reproduced once each for UFS1 and UFS2. 811 */ 812static int 813expunge_ufs1(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 814 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 815 struct fs *, ufs_lbn_t, int), 816 int expungetype) 817{ 818 int i, s, error, ns, indiroff; 819 ufs_lbn_t lbn, rlbn; 820 ufs2_daddr_t len, blkno, numblks, blksperindir; 821 struct ufs1_dinode *dip; 822 struct buf *bp; 823 void *bf; 824 825 ns = UFS_FSNEEDSWAP(fs); 826 /* 827 * Prepare to expunge the inode. If its inode block has not 828 * yet been copied, then allocate and fill the copy. 
829 */ 830 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 831 blkno = 0; 832 if (lbn < NDADDR) { 833 blkno = db_get(VTOI(snapvp), lbn); 834 } else { 835 s = cow_enter(); 836 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 837 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 838 cow_leave(s); 839 if (error) 840 return (error); 841 indiroff = (lbn - NDADDR) % NINDIR(fs); 842 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 843 brelse(bp, 0); 844 } 845 bf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 846 if (blkno != 0) 847 error = readvnblk(snapvp, bf, lbn); 848 else 849 error = readfsblk(snapvp, bf, lbn); 850 if (error) { 851 free(bf, M_UFSMNT); 852 return error; 853 } 854 /* 855 * Set a snapshot inode to be a zero length file, regular files 856 * or unlinked snapshots to be completely unallocated. 857 */ 858 dip = (struct ufs1_dinode *)bf + ino_to_fsbo(fs, cancelip->i_number); 859 if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0) 860 dip->di_mode = 0; 861 dip->di_size = 0; 862 dip->di_blocks = 0; 863 dip->di_flags = 864 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 865 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 866 error = writevnblk(snapvp, bf, lbn); 867 free(bf, M_UFSMNT); 868 if (error) 869 return error; 870 /* 871 * Now go through and expunge all the blocks in the file 872 * using the function requested. 
873 */ 874 numblks = howmany(cancelip->i_size, fs->fs_bsize); 875 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0], 876 &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype))) 877 return (error); 878 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0], 879 &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype))) 880 return (error); 881 blksperindir = 1; 882 lbn = -NDADDR; 883 len = numblks - NDADDR; 884 rlbn = NDADDR; 885 for (i = 0; len > 0 && i < NIADDR; i++) { 886 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 887 ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len, 888 blksperindir, fs, acctfunc, expungetype); 889 if (error) 890 return (error); 891 blksperindir *= NINDIR(fs); 892 lbn -= blksperindir + 1; 893 len -= blksperindir; 894 rlbn += blksperindir; 895 } 896 return (0); 897} 898 899/* 900 * Descend an indirect block chain for vnode cancelvp accounting for all 901 * its indirect blocks in snapvp. 902 */ 903static int 904indiracct_ufs1(struct vnode *snapvp, struct vnode *cancelvp, int level, 905 ufs1_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, 906 ufs_lbn_t blksperindir, struct fs *fs, 907 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 908 struct fs *, ufs_lbn_t, int), 909 int expungetype) 910{ 911 int error, ns, num, i; 912 ufs_lbn_t subblksperindir; 913 struct indir indirs[NIADDR + 2]; 914 ufs1_daddr_t last, *bap; 915 struct buf *bp; 916 917 ns = UFS_FSNEEDSWAP(fs); 918 919 if (blkno == 0) { 920 if (expungetype == BLK_NOCOPY) 921 return (0); 922 panic("indiracct_ufs1: missing indir"); 923 } 924 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 925 return (error); 926 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 927 panic("indiracct_ufs1: botched params"); 928 /* 929 * We have to expand bread here since it will deadlock looking 930 * up the block number for any blocks that are not in the cache. 
931 */ 932 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 933 bp->b_blkno = fsbtodb(fs, blkno); 934 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && 935 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 936 brelse(bp, 0); 937 return (error); 938 } 939 /* 940 * Account for the block pointers in this indirect block. 941 */ 942 last = howmany(remblks, blksperindir); 943 if (last > NINDIR(fs)) 944 last = NINDIR(fs); 945 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 946 bcopy(bp->b_data, (void *)bap, fs->fs_bsize); 947 brelse(bp, 0); 948 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 949 level == 0 ? rlbn : -1, expungetype); 950 if (error || level == 0) 951 goto out; 952 /* 953 * Account for the block pointers in each of the indirect blocks 954 * in the levels below us. 955 */ 956 subblksperindir = blksperindir / NINDIR(fs); 957 for (lbn++, level--, i = 0; i < last; i++) { 958 error = indiracct_ufs1(snapvp, cancelvp, level, 959 ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 960 fs, acctfunc, expungetype); 961 if (error) 962 goto out; 963 rlbn += blksperindir; 964 lbn -= blksperindir; 965 remblks -= blksperindir; 966 } 967out: 968 FREE(bap, M_DEVBUF); 969 return (error); 970} 971 972/* 973 * Do both snap accounting and map accounting. 974 */ 975static int 976fullacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 977 struct fs *fs, ufs_lbn_t lblkno, 978 int exptype /* BLK_SNAP or BLK_NOCOPY */) 979{ 980 int error; 981 982 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 983 return (error); 984 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 985} 986 987/* 988 * Identify a set of blocks allocated in a snapshot inode. 
989 */ 990static int 991snapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 992 struct fs *fs, ufs_lbn_t lblkno, 993 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 994{ 995 struct inode *ip = VTOI(vp); 996 ufs1_daddr_t blkno, *blkp; 997 ufs_lbn_t lbn; 998 struct buf *ibp; 999 int error, ns; 1000 1001 ns = UFS_FSNEEDSWAP(fs); 1002 1003 for ( ; oldblkp < lastblkp; oldblkp++) { 1004 blkno = ufs_rw32(*oldblkp, ns); 1005 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1006 continue; 1007 lbn = fragstoblks(fs, blkno); 1008 if (lbn < NDADDR) { 1009 blkp = &ip->i_ffs1_db[lbn]; 1010 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1011 } else { 1012 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1013 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1014 if (error) 1015 return (error); 1016 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 1017 [(lbn - NDADDR) % NINDIR(fs)]; 1018 } 1019 /* 1020 * If we are expunging a snapshot vnode and we 1021 * find a block marked BLK_NOCOPY, then it is 1022 * one that has been allocated to this snapshot after 1023 * we took our current snapshot and can be ignored. 1024 */ 1025 blkno = ufs_rw32(*blkp, ns); 1026 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1027 if (lbn >= NDADDR) 1028 brelse(ibp, 0); 1029 } else { 1030 if (blkno != 0) 1031 panic("snapacct_ufs1: bad block"); 1032 *blkp = ufs_rw32(expungetype, ns); 1033 if (lbn >= NDADDR) 1034 bdwrite(ibp); 1035 } 1036 } 1037 return (0); 1038} 1039 1040/* 1041 * Account for a set of blocks allocated in a snapshot inode. 
1042 */ 1043static int 1044mapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 1045 struct fs *fs, ufs_lbn_t lblkno, int expungetype) 1046{ 1047 ufs1_daddr_t blkno; 1048 struct inode *ip; 1049 ino_t inum; 1050 int acctit, ns; 1051 1052 ns = UFS_FSNEEDSWAP(fs); 1053 ip = VTOI(vp); 1054 inum = ip->i_number; 1055 if (lblkno == -1) 1056 acctit = 0; 1057 else 1058 acctit = 1; 1059 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1060 blkno = ufs_rw32(*oldblkp, ns); 1061 if (blkno == 0 || blkno == BLK_NOCOPY) 1062 continue; 1063 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1064 *ip->i_snapblklist++ = lblkno; 1065 if (blkno == BLK_SNAP) 1066 blkno = blkstofrags(fs, lblkno); 1067 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1068 } 1069 return (0); 1070} 1071 1072/* 1073 * Before expunging a snapshot inode, note all the 1074 * blocks that it claims with BLK_SNAP so that fsck will 1075 * be able to account for those blocks properly and so 1076 * that this snapshot knows that it need not copy them 1077 * if the other snapshot holding them is freed. This code 1078 * is reproduced once each for UFS1 and UFS2. 1079 */ 1080static int 1081expunge_ufs2(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 1082 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1083 struct fs *, ufs_lbn_t, int), 1084 int expungetype) 1085{ 1086 int i, s, error, ns, indiroff; 1087 ufs_lbn_t lbn, rlbn; 1088 ufs2_daddr_t len, blkno, numblks, blksperindir; 1089 struct ufs2_dinode *dip; 1090 struct buf *bp; 1091 void *bf; 1092 1093 ns = UFS_FSNEEDSWAP(fs); 1094 /* 1095 * Prepare to expunge the inode. If its inode block has not 1096 * yet been copied, then allocate and fill the copy. 
1097 */ 1098 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1099 blkno = 0; 1100 if (lbn < NDADDR) { 1101 blkno = db_get(VTOI(snapvp), lbn); 1102 } else { 1103 s = cow_enter(); 1104 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 1105 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 1106 cow_leave(s); 1107 if (error) 1108 return (error); 1109 indiroff = (lbn - NDADDR) % NINDIR(fs); 1110 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 1111 brelse(bp, 0); 1112 } 1113 bf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1114 if (blkno != 0) 1115 error = readvnblk(snapvp, bf, lbn); 1116 else 1117 error = readfsblk(snapvp, bf, lbn); 1118 if (error) { 1119 free(bf, M_UFSMNT); 1120 return error; 1121 } 1122 /* 1123 * Set a snapshot inode to be a zero length file, regular files 1124 * or unlinked snapshots to be completely unallocated. 1125 */ 1126 dip = (struct ufs2_dinode *)bf + ino_to_fsbo(fs, cancelip->i_number); 1127 if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0) 1128 dip->di_mode = 0; 1129 dip->di_size = 0; 1130 dip->di_blocks = 0; 1131 dip->di_flags = 1132 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 1133 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1134 error = writevnblk(snapvp, bf, lbn); 1135 free(bf, M_UFSMNT); 1136 if (error) 1137 return error; 1138 /* 1139 * Now go through and expunge all the blocks in the file 1140 * using the function requested. 
1141 */ 1142 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1143 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0], 1144 &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype))) 1145 return (error); 1146 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0], 1147 &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype))) 1148 return (error); 1149 blksperindir = 1; 1150 lbn = -NDADDR; 1151 len = numblks - NDADDR; 1152 rlbn = NDADDR; 1153 for (i = 0; len > 0 && i < NIADDR; i++) { 1154 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1155 ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len, 1156 blksperindir, fs, acctfunc, expungetype); 1157 if (error) 1158 return (error); 1159 blksperindir *= NINDIR(fs); 1160 lbn -= blksperindir + 1; 1161 len -= blksperindir; 1162 rlbn += blksperindir; 1163 } 1164 return (0); 1165} 1166 1167/* 1168 * Descend an indirect block chain for vnode cancelvp accounting for all 1169 * its indirect blocks in snapvp. 1170 */ 1171static int 1172indiracct_ufs2(struct vnode *snapvp, struct vnode *cancelvp, int level, 1173 ufs2_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, 1174 ufs_lbn_t blksperindir, struct fs *fs, 1175 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1176 struct fs *, ufs_lbn_t, int), 1177 int expungetype) 1178{ 1179 int error, ns, num, i; 1180 ufs_lbn_t subblksperindir; 1181 struct indir indirs[NIADDR + 2]; 1182 ufs2_daddr_t last, *bap; 1183 struct buf *bp; 1184 1185 ns = UFS_FSNEEDSWAP(fs); 1186 1187 if (blkno == 0) { 1188 if (expungetype == BLK_NOCOPY) 1189 return (0); 1190 panic("indiracct_ufs2: missing indir"); 1191 } 1192 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1193 return (error); 1194 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1195 panic("indiracct_ufs2: botched params"); 1196 /* 1197 * We have to expand bread here since it will deadlock looking 1198 * up the block number for any blocks that are not in the cache. 
1199 */ 1200 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 1201 bp->b_blkno = fsbtodb(fs, blkno); 1202 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && 1203 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 1204 brelse(bp, 0); 1205 return (error); 1206 } 1207 /* 1208 * Account for the block pointers in this indirect block. 1209 */ 1210 last = howmany(remblks, blksperindir); 1211 if (last > NINDIR(fs)) 1212 last = NINDIR(fs); 1213 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 1214 bcopy(bp->b_data, (void *)bap, fs->fs_bsize); 1215 brelse(bp, 0); 1216 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1217 level == 0 ? rlbn : -1, expungetype); 1218 if (error || level == 0) 1219 goto out; 1220 /* 1221 * Account for the block pointers in each of the indirect blocks 1222 * in the levels below us. 1223 */ 1224 subblksperindir = blksperindir / NINDIR(fs); 1225 for (lbn++, level--, i = 0; i < last; i++) { 1226 error = indiracct_ufs2(snapvp, cancelvp, level, 1227 ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 1228 fs, acctfunc, expungetype); 1229 if (error) 1230 goto out; 1231 rlbn += blksperindir; 1232 lbn -= blksperindir; 1233 remblks -= blksperindir; 1234 } 1235out: 1236 FREE(bap, M_DEVBUF); 1237 return (error); 1238} 1239 1240/* 1241 * Do both snap accounting and map accounting. 1242 */ 1243static int 1244fullacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, 1245 struct fs *fs, ufs_lbn_t lblkno, 1246 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1247{ 1248 int error; 1249 1250 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1251 return (error); 1252 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1253} 1254 1255/* 1256 * Identify a set of blocks allocated in a snapshot inode. 
1257 */ 1258static int 1259snapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, 1260 struct fs *fs, ufs_lbn_t lblkno, 1261 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1262{ 1263 struct inode *ip = VTOI(vp); 1264 ufs2_daddr_t blkno, *blkp; 1265 ufs_lbn_t lbn; 1266 struct buf *ibp; 1267 int error, ns; 1268 1269 ns = UFS_FSNEEDSWAP(fs); 1270 1271 for ( ; oldblkp < lastblkp; oldblkp++) { 1272 blkno = ufs_rw64(*oldblkp, ns); 1273 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1274 continue; 1275 lbn = fragstoblks(fs, blkno); 1276 if (lbn < NDADDR) { 1277 blkp = &ip->i_ffs2_db[lbn]; 1278 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1279 } else { 1280 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1281 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1282 if (error) 1283 return (error); 1284 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1285 [(lbn - NDADDR) % NINDIR(fs)]; 1286 } 1287 /* 1288 * If we are expunging a snapshot vnode and we 1289 * find a block marked BLK_NOCOPY, then it is 1290 * one that has been allocated to this snapshot after 1291 * we took our current snapshot and can be ignored. 1292 */ 1293 blkno = ufs_rw64(*blkp, ns); 1294 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1295 if (lbn >= NDADDR) 1296 brelse(ibp, 0); 1297 } else { 1298 if (blkno != 0) 1299 panic("snapacct_ufs2: bad block"); 1300 *blkp = ufs_rw64(expungetype, ns); 1301 if (lbn >= NDADDR) 1302 bdwrite(ibp); 1303 } 1304 } 1305 return (0); 1306} 1307 1308/* 1309 * Account for a set of blocks allocated in a snapshot inode. 
 *
 * Walks the pointers in [oldblkp, lastblkp), skipping 0 and BLK_NOCOPY.
 * When lblkno is not -1 and expungetype is BLK_SNAP, the logical block
 * number of every pointer that is not BLK_SNAP is appended to the
 * inode's i_snapblklist.  A BLK_SNAP pointer is replaced by the address
 * derived from its logical block number, and each remaining block is
 * handed to ffs_blkfree().  Always returns 0.
 */
static int
mapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno, int expungetype)
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 marks a range whose block numbers are not recorded. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw64(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 *
 * The reference is dropped only if ip is found on the in-core snapshot
 * list.  The inode number is removed from fs_snapinum[] in either case,
 * and si_gen is bumped.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	/* xp is NULL when the loop ran off the end without a match. */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the following entries down and zero the vacated slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 *
 * Takes the snapshot off the active list (if it is on it), gives the
 * vnode back its private lock, clears BLK_NOCOPY/BLK_SNAP markers from
 * its block pointers, offers claimed blocks to ffs_snapblkfree(), and
 * finally clears SF_SNAPSHOT on the inode.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct vnlock *lkp;
	struct buf *ibp;
	struct snap_info *si;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, ns, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		mutex_enter(&si->si_lock);
		/* Take the private lock before leaving the shared one. */
		vlockmgr(&vp->v_lock, LK_EXCLUSIVE);
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		lkp = vp->v_vnlock;
		KASSERT(lkp == &si->si_vnlock);
		vp->v_vnlock = &vp->v_lock;
		vlockmgr(lkp, LK_RELEASE);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
		} else {
			/*
			 * That was the last snapshot.  si_lock is dropped
			 * around the fscow_disestablish() call.
			 */
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
			mutex_enter(&si->si_lock);
		}
		si->si_gen++;
		mutex_exit(&si->si_lock);
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		/* An indirect block that cannot be fetched is skipped. */
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			/*
			 * NOTE(review): the comparison uses blkno, not
			 * blkno + loc, so it can only match the first
			 * entry of each indirect block -- confirm that
			 * this is intended.
			 */
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed.
 * Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because it
 * must always have been allocated from a BLK_NOCOPY location.
1504 */ 1505int 1506ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, 1507 long size, ino_t inum) 1508{ 1509 struct mount *mp = devvp->v_specmountpoint; 1510 struct buf *ibp; 1511 struct inode *ip; 1512 struct vnode *vp = NULL; 1513 struct snap_info *si; 1514 void *saved_data = NULL; 1515 ufs_lbn_t lbn; 1516 ufs2_daddr_t blkno; 1517 uint32_t gen; 1518 int s, indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1519 1520 si = VFSTOUFS(mp)->um_snapinfo; 1521 lbn = fragstoblks(fs, bno); 1522 mutex_enter(&si->si_lock); 1523retry: 1524 gen = si->si_gen; 1525 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1526 vp = ITOV(ip); 1527 if (snapshot_locked == 0) { 1528 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1529 mutex_exit(&si->si_lock); 1530 kpause("snaplock", false, 1, NULL); 1531 mutex_enter(&si->si_lock); 1532 goto retry; 1533 } 1534 snapshot_locked = 1; 1535 if (gen != si->si_gen) 1536 goto retry; 1537 } 1538 /* 1539 * Lookup block being written. 1540 */ 1541 if (lbn < NDADDR) { 1542 blkno = db_get(ip, lbn); 1543 } else { 1544 mutex_exit(&si->si_lock); 1545 s = cow_enter(); 1546 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1547 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1548 cow_leave(s); 1549 if (error) { 1550 mutex_enter(&si->si_lock); 1551 break; 1552 } 1553 indiroff = (lbn - NDADDR) % NINDIR(fs); 1554 blkno = idb_get(ip, ibp->b_data, indiroff); 1555 mutex_enter(&si->si_lock); 1556 if (gen != si->si_gen) { 1557 brelse(ibp, 0); 1558 goto retry; 1559 } 1560 } 1561 /* 1562 * Check to see if block needs to be copied. 1563 */ 1564 if (blkno == 0) { 1565 /* 1566 * A block that we map is being freed. If it has not 1567 * been claimed yet, we will claim or copy it (below). 1568 */ 1569 claimedblk = 1; 1570 } else if (blkno == BLK_SNAP) { 1571 /* 1572 * No previous snapshot claimed the block, 1573 * so it will be freed and become a BLK_NOCOPY 1574 * (don't care) for us. 
1575 */ 1576 if (claimedblk) 1577 panic("snapblkfree: inconsistent block type"); 1578 if (lbn < NDADDR) { 1579 db_assign(ip, lbn, BLK_NOCOPY); 1580 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1581 } else { 1582 idb_assign(ip, ibp->b_data, indiroff, 1583 BLK_NOCOPY); 1584 mutex_exit(&si->si_lock); 1585 bwrite(ibp); 1586 mutex_enter(&si->si_lock); 1587 if (gen != si->si_gen) 1588 goto retry; 1589 } 1590 continue; 1591 } else /* BLK_NOCOPY or default */ { 1592 /* 1593 * If the snapshot has already copied the block 1594 * (default), or does not care about the block, 1595 * it is not needed. 1596 */ 1597 if (lbn >= NDADDR) 1598 brelse(ibp, 0); 1599 continue; 1600 } 1601 /* 1602 * If this is a full size block, we will just grab it 1603 * and assign it to the snapshot inode. Otherwise we 1604 * will proceed to copy it. See explanation for this 1605 * routine as to why only a single snapshot needs to 1606 * claim this block. 1607 */ 1608 if (size == fs->fs_bsize) { 1609#ifdef DEBUG 1610 if (snapdebug) 1611 printf("%s %llu lbn %" PRId64 1612 "from inum %llu\n", 1613 "Grabonremove: snapino", 1614 (unsigned long long)ip->i_number, 1615 lbn, (unsigned long long)inum); 1616#endif 1617 mutex_exit(&si->si_lock); 1618 if (lbn < NDADDR) { 1619 db_assign(ip, lbn, bno); 1620 } else { 1621 idb_assign(ip, ibp->b_data, indiroff, bno); 1622 bwrite(ibp); 1623 } 1624 DIP_ADD(ip, blocks, btodb(size)); 1625 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1626 VOP_UNLOCK(vp, 0); 1627 return (1); 1628 } 1629 if (lbn >= NDADDR) 1630 brelse(ibp, 0); 1631#ifdef DEBUG 1632 if (snapdebug) 1633 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", 1634 "Copyonremove: snapino ", 1635 (unsigned long long)ip->i_number, 1636 lbn, "for inum", (unsigned long long)inum, size); 1637#endif 1638 /* 1639 * If we have already read the old block contents, then 1640 * simply copy them to the new block. 
Note that we need 1641 * to synchronously write snapshots that have not been 1642 * unlinked, and hence will be visible after a crash, 1643 * to ensure their integrity. 1644 */ 1645 mutex_exit(&si->si_lock); 1646 if (saved_data == NULL) { 1647 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1648 if ((error = readfsblk(vp, saved_data, lbn)) != 0) { 1649 free(saved_data, M_UFSMNT); 1650 saved_data = NULL; 1651 mutex_enter(&si->si_lock); 1652 break; 1653 } 1654 } 1655 error = writevnblk(vp, saved_data, lbn); 1656 mutex_enter(&si->si_lock); 1657 if (error) 1658 break; 1659 if (gen != si->si_gen) 1660 goto retry; 1661 } 1662 mutex_exit(&si->si_lock); 1663 if (saved_data) 1664 free(saved_data, M_UFSMNT); 1665 /* 1666 * If we have been unable to allocate a block in which to do 1667 * the copy, then return non-zero so that the fragment will 1668 * not be freed. Although space will be lost, the snapshot 1669 * will stay consistent. 1670 */ 1671 if (snapshot_locked) 1672 VOP_UNLOCK(vp, 0); 1673 return (error); 1674} 1675 1676/* 1677 * Associate snapshot files when mounting. 1678 */ 1679void 1680ffs_snapshot_mount(struct mount *mp) 1681{ 1682 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1683 struct fs *fs = VFSTOUFS(mp)->um_fs; 1684 struct lwp *l = curlwp; 1685 struct vnode *vp; 1686 struct inode *ip, *xp; 1687 struct snap_info *si; 1688 ufs2_daddr_t snaplistsize, *snapblklist; 1689 int i, error, ns, snaploc, loc; 1690 1691 /* 1692 * No persistent snapshots on apple ufs file systems. 1693 */ 1694 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) 1695 return; 1696 1697 si = VFSTOUFS(mp)->um_snapinfo; 1698 ns = UFS_FSNEEDSWAP(fs); 1699 /* 1700 * XXX The following needs to be set before ffs_truncate or 1701 * VOP_READ can be called. 1702 */ 1703 mp->mnt_stat.f_iosize = fs->fs_bsize; 1704 /* 1705 * Process each snapshot listed in the superblock. 
1706 */ 1707 vp = NULL; 1708 mutex_enter(&si->si_lock); 1709 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1710 if (fs->fs_snapinum[snaploc] == 0) 1711 break; 1712 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1713 &vp)) != 0) { 1714 printf("ffs_snapshot_mount: vget failed %d\n", error); 1715 continue; 1716 } 1717 ip = VTOI(vp); 1718 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1719 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1720 fs->fs_snapinum[snaploc]); 1721 vput(vp); 1722 vp = NULL; 1723 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1724 if (fs->fs_snapinum[loc] == 0) 1725 break; 1726 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1727 } 1728 fs->fs_snapinum[loc - 1] = 0; 1729 snaploc--; 1730 continue; 1731 } 1732 1733 /* 1734 * Read the block hints list. Use an empty list on 1735 * read errors. 1736 */ 1737 error = vn_rdwr(UIO_READ, vp, 1738 (void *)&snaplistsize, sizeof(snaplistsize), 1739 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1740 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, 1741 l->l_cred, NULL, NULL); 1742 if (error) { 1743 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1744 snaplistsize = 1; 1745 } else 1746 snaplistsize = ufs_rw64(snaplistsize, ns); 1747 snapblklist = malloc( 1748 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 1749 if (error) 1750 snapblklist[0] = 1; 1751 else { 1752 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, 1753 snaplistsize * sizeof(ufs2_daddr_t), 1754 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1755 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, 1756 l->l_cred, NULL, NULL); 1757 for (i = 0; i < snaplistsize; i++) 1758 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 1759 if (error) { 1760 printf("ffs_snapshot_mount: read_2 failed %d\n", 1761 error); 1762 snapblklist[0] = 1; 1763 } 1764 } 1765 ip->i_snapblklist = &snapblklist[0]; 1766 1767 /* 1768 * Acquire the snapshot lock and give up our original 1769 * private lock. 
1770 */ 1771 VI_LOCK(vp); 1772 vp->v_vnlock = &si->si_vnlock; 1773 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 1774 vlockmgr(&vp->v_lock, LK_RELEASE); 1775 /* 1776 * Link it onto the active snapshot list. 1777 */ 1778 if (ip->i_nextsnap.tqe_prev != 0) 1779 panic("ffs_snapshot_mount: %llu already on list", 1780 (unsigned long long)ip->i_number); 1781 else 1782 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 1783 vp->v_vflag |= VV_SYSTEM; 1784 VOP_UNLOCK(vp, 0); 1785 } 1786 /* 1787 * No usable snapshots found. 1788 */ 1789 if (vp == NULL) { 1790 mutex_exit(&si->si_lock); 1791 return; 1792 } 1793 /* 1794 * Attach the block hints list. We always want to 1795 * use the list from the newest snapshot. 1796 */ 1797 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1798 si->si_snapblklist = xp->i_snapblklist; 1799 fscow_establish(mp, ffs_copyonwrite, devvp); 1800 si->si_gen++; 1801 mutex_exit(&si->si_lock); 1802} 1803 1804/* 1805 * Disassociate snapshot files when unmounting. 1806 */ 1807void 1808ffs_snapshot_unmount(struct mount *mp) 1809{ 1810 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1811 struct inode *xp; 1812 struct vnode *vp = NULL; 1813 struct snap_info *si; 1814 1815 si = VFSTOUFS(mp)->um_snapinfo; 1816 mutex_enter(&si->si_lock); 1817 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) { 1818 vp = ITOV(xp); 1819 vp->v_vnlock = &vp->v_lock; 1820 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap); 1821 xp->i_nextsnap.tqe_prev = 0; 1822 if (xp->i_snapblklist == si->si_snapblklist) 1823 si->si_snapblklist = NULL; 1824 FREE(xp->i_snapblklist, M_UFSMNT); 1825 if (xp->i_ffs_effnlink > 0) { 1826 si->si_gen++; 1827 mutex_exit(&si->si_lock); 1828 vrele(vp); 1829 mutex_enter(&si->si_lock); 1830 } 1831 } 1832 if (vp) 1833 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1834 si->si_gen++; 1835 mutex_exit(&si->si_lock); 1836} 1837 1838/* 1839 * Check for need to copy block that is about to be written, 1840 * copying the block if necessary. 
 *
 * v is the device vnode the handler was registered with, bp the buffer
 * about to be written, and data_valid says whether bp->b_data may be
 * used as the old block contents.  Returns 0 when nothing had to be
 * copied or all copies succeeded, otherwise the error that stopped the
 * scan.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct buf *ibp;
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, s, ns, indiroff, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 *
	 * Entry 0 of the list holds its length; entries 1 .. length-1
	 * are binary-searched for lbn.
	 */
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = si->si_snapblklist;
	upper = si->si_snapblklist[0] - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	/* The loop only exits with lower <= upper via the match break. */
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 * A full-block buffer with valid data can serve as the copy
	 * source directly.
	 */
	 if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/*
	 * si_gen is sampled here and rechecked every time si_lock has
	 * been dropped; a change restarts the scan from the top.
	 */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		if (snapshot_locked == 0) {
			/* Never sleep on the vnode lock with si_lock held. */
			if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
				mutex_exit(&si->si_lock);
				kpause("snaplock", false, 1, NULL);
				mutex_enter(&si->si_lock);
				goto retry;
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;
		}
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			s = cow_enter();
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			cow_leave(s);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			brelse(ibp, 0);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* A non-zero pointer means this snapshot needs no copy. */
		if (blkno != 0)
			continue;
#ifdef DIAGNOSTIC
		if (curlwp->l_pflag & LP_UFSCOW)
			printf("ffs_copyonwrite: recursive call\n");
#endif
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = writevnblk(vp, saved_data, lbn);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	mutex_exit(&si->si_lock);
	/* Only free the copy buffer if it is ours, not the caller's data. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	return error;
}

/*
 * Read the specified block from disk. Vp is usually a snapshot vnode.
2013 */ 2014static int 2015readfsblk(struct vnode *vp, void *data, ufs2_daddr_t lbn) 2016{ 2017 int error; 2018 struct inode *ip = VTOI(vp); 2019 struct fs *fs = ip->i_fs; 2020 struct buf *nbp; 2021 2022 nbp = getiobuf(NULL, true); 2023 nbp->b_flags = B_READ; 2024 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 2025 nbp->b_error = 0; 2026 nbp->b_data = data; 2027 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 2028 nbp->b_proc = NULL; 2029 nbp->b_dev = ip->i_devvp->v_rdev; 2030 2031 bdev_strategy(nbp); 2032 2033 error = biowait(nbp); 2034 2035 putiobuf(nbp); 2036 2037 return error; 2038} 2039 2040#if !defined(FFS_NO_SNAPSHOT) 2041/* 2042 * Read the specified block. Bypass UBC to prevent deadlocks. 2043 */ 2044static int 2045readvnblk(struct vnode *vp, void *data, ufs2_daddr_t lbn) 2046{ 2047 int error; 2048 daddr_t bn; 2049 off_t offset; 2050 struct inode *ip = VTOI(vp); 2051 struct fs *fs = ip->i_fs; 2052 2053 error = VOP_BMAP(vp, lbn, NULL, &bn, NULL); 2054 if (error) 2055 return error; 2056 2057 if (bn != (daddr_t)-1) { 2058 offset = dbtob(bn); 2059 mutex_enter(&vp->v_interlock); 2060 error = VOP_PUTPAGES(vp, trunc_page(offset), 2061 round_page(offset+fs->fs_bsize), 2062 PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2063 if (error) 2064 return error; 2065 2066 return readfsblk(vp, data, fragstoblks(fs, dbtofsb(fs, bn))); 2067 } 2068 2069 bzero(data, fs->fs_bsize); 2070 2071 return 0; 2072} 2073#endif /* !defined(FFS_NO_SNAPSHOT) */ 2074 2075/* 2076 * Write the specified block. Bypass UBC to prevent deadlocks. 
2077 */ 2078static int 2079writevnblk(struct vnode *vp, void *data, ufs2_daddr_t lbn) 2080{ 2081 int s, error; 2082 off_t offset; 2083 struct buf *bp; 2084 struct inode *ip = VTOI(vp); 2085 struct fs *fs = ip->i_fs; 2086 2087 offset = lblktosize(fs, (off_t)lbn); 2088 s = cow_enter(); 2089 mutex_enter(&vp->v_interlock); 2090 error = VOP_PUTPAGES(vp, trunc_page(offset), 2091 round_page(offset+fs->fs_bsize), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2092 if (error == 0) 2093 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 2094 fs->fs_bsize, KERNCRED, B_SYNC, &bp); 2095 cow_leave(s); 2096 if (error) 2097 return error; 2098 2099 bcopy(data, bp->b_data, fs->fs_bsize); 2100 mutex_enter(&bufcache_lock); 2101 /* XXX Shouldn't need to lock for this, NOCACHE is only read later. */ 2102 bp->b_cflags |= BC_NOCACHE; 2103 mutex_exit(&bufcache_lock); 2104 2105 return bwrite(bp); 2106} 2107 2108/* 2109 * Set/reset lwp's LP_UFSCOW flag. 2110 * May be called recursive. 2111 */ 2112static inline int 2113cow_enter(void) 2114{ 2115 struct lwp *l = curlwp; 2116 2117 if (l->l_pflag & LP_UFSCOW) { 2118 return 0; 2119 } else { 2120 l->l_pflag |= LP_UFSCOW; 2121 return LP_UFSCOW; 2122 } 2123} 2124 2125static inline void 2126cow_leave(int flag) 2127{ 2128 struct lwp *l = curlwp; 2129 2130 l->l_pflag &= ~flag; 2131} 2132 2133/* 2134 * Get/Put direct block from inode or buffer containing disk addresses. Take 2135 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2136 * into a global include. 
2137 */ 2138static inline ufs2_daddr_t 2139db_get(struct inode *ip, int loc) 2140{ 2141 if (ip->i_ump->um_fstype == UFS1) 2142 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2143 else 2144 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2145} 2146 2147static inline void 2148db_assign(struct inode *ip, int loc, ufs2_daddr_t val) 2149{ 2150 if (ip->i_ump->um_fstype == UFS1) 2151 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2152 else 2153 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2154} 2155 2156static inline ufs2_daddr_t 2157idb_get(struct inode *ip, void *bf, int loc) 2158{ 2159 if (ip->i_ump->um_fstype == UFS1) 2160 return ufs_rw32(((ufs1_daddr_t *)(bf))[loc], 2161 UFS_IPNEEDSWAP(ip)); 2162 else 2163 return ufs_rw64(((ufs2_daddr_t *)(bf))[loc], 2164 UFS_IPNEEDSWAP(ip)); 2165} 2166 2167static inline void 2168idb_assign(struct inode *ip, void *bf, int loc, ufs2_daddr_t val) 2169{ 2170 if (ip->i_ump->um_fstype == UFS1) 2171 ((ufs1_daddr_t *)(bf))[loc] = 2172 ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2173 else 2174 ((ufs2_daddr_t *)(bf))[loc] = 2175 ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2176} 2177