ffs_snapshot.c revision 1.110
1/* $NetBSD: ffs_snapshot.c,v 1.110 2011/02/24 09:38:57 hannken Exp $ */ 2 3/* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40#include <sys/cdefs.h> 41__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.110 2011/02/24 09:38:57 hannken Exp $"); 42 43#if defined(_KERNEL_OPT) 44#include "opt_ffs.h" 45#endif 46 47#include <sys/param.h> 48#include <sys/kernel.h> 49#include <sys/systm.h> 50#include <sys/conf.h> 51#include <sys/buf.h> 52#include <sys/proc.h> 53#include <sys/namei.h> 54#include <sys/sched.h> 55#include <sys/stat.h> 56#include <sys/malloc.h> 57#include <sys/mount.h> 58#include <sys/resource.h> 59#include <sys/resourcevar.h> 60#include <sys/vnode.h> 61#include <sys/kauth.h> 62#include <sys/fstrans.h> 63#include <sys/wapbl.h> 64 65#include <miscfs/specfs/specdev.h> 66 67#include <ufs/ufs/quota.h> 68#include <ufs/ufs/ufsmount.h> 69#include <ufs/ufs/inode.h> 70#include <ufs/ufs/ufs_extern.h> 71#include <ufs/ufs/ufs_bswap.h> 72#include <ufs/ufs/ufs_wapbl.h> 73 74#include <ufs/ffs/fs.h> 75#include <ufs/ffs/ffs_extern.h> 76 77#include <uvm/uvm.h> 78 79struct snap_info { 80 kmutex_t si_lock; /* Lock this snapinfo */ 81 kmutex_t si_snaplock; /* Snapshot vnode common lock */ 82 lwp_t *si_owner; /* Sanplock owner */ 83 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 84 daddr_t *si_snapblklist; /* Snapshot block hints list */ 85 uint32_t si_gen; /* Incremented on change */ 86}; 87 88#if !defined(FFS_NO_SNAPSHOT) 89typedef int (*acctfunc_t) 90 (struct vnode *, void *, int, int, struct fs *, daddr_t, int); 91 92static int snapshot_setup(struct mount *, struct vnode *); 93static int snapshot_copyfs(struct mount *, struct vnode *, void **); 94static int snapshot_expunge(struct mount *, struct vnode *, 95 struct fs *, daddr_t *, daddr_t **); 96static int snapshot_expunge_snap(struct mount *, struct vnode *, 97 struct fs *, daddr_t); 98static int snapshot_writefs(struct mount *, struct vnode *, void *); 99static int cgaccount(struct vnode *, int, int *); 100static int cgaccount1(int, struct vnode *, void *, int); 101static int expunge(struct vnode *, struct inode *, struct fs *, 102 acctfunc_t, int); 103static int indiracct(struct vnode *, struct vnode *, int, daddr_t, 104 daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int); 105static int fullacct(struct vnode *, void *, int, int, struct fs *, 106 daddr_t, int); 107static int snapacct(struct vnode *, void *, int, int, struct fs *, 108 daddr_t, int); 109static int mapacct(struct vnode *, void *, int, int, struct fs *, 110 daddr_t, int); 111#endif /* !defined(FFS_NO_SNAPSHOT) */ 112 113static int ffs_copyonwrite(void *, struct buf *, bool); 114static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); 115static int rwfsblk(struct vnode *, int, void *, daddr_t); 116static int syncsnap(struct vnode *); 117static int wrsnapblk(struct vnode *, void *, daddr_t); 118#if !defined(FFS_NO_SNAPSHOT) 119static int blocks_in_journal(struct fs *); 120#endif 121 122static inline bool is_active_snapshot(struct snap_info *, struct inode *); 123static inline daddr_t db_get(struct inode *, int); 124static inline void db_assign(struct inode *, int, daddr_t); 125static inline daddr_t ib_get(struct inode *, int); 126static inline void ib_assign(struct inode *, int, daddr_t); 127static inline daddr_t idb_get(struct inode *, void *, int); 128static inline void idb_assign(struct inode *, void *, int, daddr_t); 129 130#ifdef DEBUG 131static int snapdebug = 0; 132#endif 133 134int 135ffs_snapshot_init(struct ufsmount *ump) 136{ 137 struct snap_info *si; 138 139 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); 140 if (si == NULL) 141 return ENOMEM; 142 143 TAILQ_INIT(&si->si_snapshots); 144 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); 145 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); 146 si->si_owner = NULL; 147 si->si_gen = 0; 148 si->si_snapblklist = NULL; 149 150 return 0; 151} 152 153void 154ffs_snapshot_fini(struct ufsmount *ump) 155{ 156 struct snap_info *si; 157 158 si = ump->um_snapinfo; 159 ump->um_snapinfo = NULL; 160 161 KASSERT(TAILQ_EMPTY(&si->si_snapshots)); 162 mutex_destroy(&si->si_lock); 163 mutex_destroy(&si->si_snaplock); 164 KASSERT(si->si_snapblklist == NULL); 165 kmem_free(si, sizeof(*si)); 166} 167 168/* 169 * Create a snapshot file and initialize it for the filesystem. 170 * Vnode is locked on entry and return. 171 */ 172int 173ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) 174{ 175#if defined(FFS_NO_SNAPSHOT) 176 return EOPNOTSUPP; 177} 178#else /* defined(FFS_NO_SNAPSHOT) */ 179 bool suspended = false; 180 int error, redo = 0, snaploc; 181 void *sbbuf = NULL; 182 daddr_t *snaplist = NULL, snaplistsize = 0; 183 struct buf *bp, *nbp; 184 struct fs *copy_fs = NULL; 185 struct fs *fs = VFSTOUFS(mp)->um_fs; 186 struct inode *ip = VTOI(vp); 187 struct lwp *l = curlwp; 188 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 189 struct timespec ts; 190 struct timeval starttime; 191#ifdef DEBUG 192 struct timeval endtime; 193#endif 194 struct vnode *devvp = ip->i_devvp; 195 196 /* 197 * If the vnode already is a snapshot, return. 198 */ 199 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 200 if (ctime) { 201 ctime->tv_sec = DIP(VTOI(vp), mtime); 202 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 203 } 204 return 0; 205 } 206 /* 207 * Check for free snapshot slot in the superblock. 208 */ 209 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 210 if (fs->fs_snapinum[snaploc] == 0) 211 break; 212 if (snaploc == FSMAXSNAP) 213 return (ENOSPC); 214 /* 215 * Prepare the vnode to become a snapshot. 216 */ 217 error = snapshot_setup(mp, vp); 218 if (error) 219 goto out; 220 /* 221 * Change inode to snapshot type file. 222 */ 223 ip->i_flags |= SF_SNAPSHOT; 224 DIP_ASSIGN(ip, flags, ip->i_flags); 225 ip->i_flag |= IN_CHANGE | IN_UPDATE; 226 /* 227 * Copy all the cylinder group maps. Although the 228 * filesystem is still active, we hope that only a few 229 * cylinder groups will change between now and when we 230 * suspend operations. Thus, we will be able to quickly 231 * touch up the few cylinder groups that changed during 232 * the suspension period. 233 */ 234 error = cgaccount(vp, 1, NULL); 235 if (error) 236 goto out; 237 /* 238 * Ensure that the snapshot is completely on disk. 239 * Since we have marked it as a snapshot it is safe to 240 * unlock it as no process will be allowed to write to it. 241 */ 242 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 243 if (error) 244 goto out; 245 VOP_UNLOCK(vp); 246 /* 247 * All allocations are done, so we can now suspend the filesystem. 248 */ 249 error = vfs_suspend(vp->v_mount, 0); 250 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 251 if (error) 252 goto out; 253 suspended = true; 254 getmicrotime(&starttime); 255 /* 256 * First, copy all the cylinder group maps that have changed. 257 */ 258 error = cgaccount(vp, 2, &redo); 259 if (error) 260 goto out; 261 /* 262 * Create a copy of the superblock and its summary information. 263 */ 264 error = snapshot_copyfs(mp, vp, &sbbuf); 265 copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); 266 if (error) 267 goto out; 268 /* 269 * Expunge unlinked files from our view. 270 */ 271 error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist); 272 if (error) 273 goto out; 274 /* 275 * Record snapshot inode. Since this is the newest snapshot, 276 * it must be placed at the end of the list. 277 */ 278 if (ip->i_nlink > 0) 279 fs->fs_snapinum[snaploc] = ip->i_number; 280 281 mutex_enter(&si->si_lock); 282 if (is_active_snapshot(si, ip)) 283 panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number); 284 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 285 if (TAILQ_FIRST(&si->si_snapshots) == ip) { 286 /* 287 * If this is the first snapshot on this filesystem, put the 288 * preliminary list in place and establish the cow handler. 289 */ 290 si->si_snapblklist = snaplist; 291 fscow_establish(mp, ffs_copyonwrite, devvp); 292 } 293 si->si_gen++; 294 mutex_exit(&si->si_lock); 295 296 vp->v_vflag |= VV_SYSTEM; 297 /* 298 * Set the mtime to the time the snapshot has been taken. 299 */ 300 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 301 if (ctime) 302 *ctime = ts; 303 DIP_ASSIGN(ip, mtime, ts.tv_sec); 304 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 305 ip->i_flag |= IN_CHANGE | IN_UPDATE; 306 /* 307 * Copy allocation information from all snapshots and then 308 * expunge them from our view. 309 */ 310 error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize); 311 if (error) 312 goto out; 313 /* 314 * Write the superblock and its summary information to the snapshot. 315 */ 316 error = snapshot_writefs(mp, vp, sbbuf); 317 if (error) 318 goto out; 319 /* 320 * We're nearly done, ensure that the snapshot is completely on disk. 321 */ 322 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 323 if (error) 324 goto out; 325 /* 326 * Invalidate and free all pages on the snapshot vnode. 327 * We will read and write through the buffercache. 328 */ 329 mutex_enter(&vp->v_interlock); 330 error = VOP_PUTPAGES(vp, 0, 0, 331 PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE); 332 if (error) 333 goto out; 334 /* 335 * Invalidate short ( < fs_bsize ) buffers. We will always read 336 * full size buffers later. 337 */ 338 mutex_enter(&bufcache_lock); 339 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); 340 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 341 nbp = LIST_NEXT(bp, b_vnbufs); 342 KASSERT((bp->b_cflags & BC_BUSY) == 0); 343 if (bp->b_bcount < fs->fs_bsize) { 344 bp->b_cflags |= BC_BUSY; 345 brelsel(bp, BC_INVAL | BC_VFLUSH); 346 } 347 } 348 mutex_exit(&bufcache_lock); 349 350out: 351 if (sbbuf != NULL) { 352 free(copy_fs->fs_csp, M_UFSMNT); 353 free(sbbuf, M_UFSMNT); 354 } 355 if (fs->fs_active != NULL) { 356 free(fs->fs_active, M_DEVBUF); 357 fs->fs_active = NULL; 358 } 359 360 mutex_enter(&si->si_lock); 361 if (snaplist != NULL) { 362 if (si->si_snapblklist == snaplist) 363 si->si_snapblklist = NULL; 364 free(snaplist, M_UFSMNT); 365 } 366 if (error) { 367 fs->fs_snapinum[snaploc] = 0; 368 } else { 369 /* 370 * As this is the newest list, it is the most inclusive, so 371 * should replace the previous list. 372 */ 373 si->si_snapblklist = ip->i_snapblklist; 374 } 375 si->si_gen++; 376 mutex_exit(&si->si_lock); 377 378 if (suspended) { 379 vfs_resume(vp->v_mount); 380#ifdef DEBUG 381 getmicrotime(&endtime); 382 timersub(&endtime, &starttime, &endtime); 383 printf("%s: suspended %lld.%03d sec, redo %d of %d\n", 384 mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec, 385 endtime.tv_usec / 1000, redo, fs->fs_ncg); 386#endif 387 } 388 if (error) { 389 if (!UFS_WAPBL_BEGIN(mp)) { 390 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 391 UFS_WAPBL_END(mp); 392 } 393 } else if (ip->i_nlink > 0) 394 vref(vp); 395 return (error); 396} 397 398/* 399 * Prepare vnode to become a snapshot. 400 */ 401static int 402snapshot_setup(struct mount *mp, struct vnode *vp) 403{ 404 int error, n, len, loc; 405 daddr_t blkno, numblks; 406 struct buf *ibp, *nbp; 407 struct fs *fs = VFSTOUFS(mp)->um_fs; 408 struct lwp *l = curlwp; 409 const int wbreak = blocks_in_journal(fs)/8; 410 411 /* 412 * Check mount, exclusive reference and owner. 413 */ 414 if (vp->v_mount != mp) 415 return EXDEV; 416 if (vp->v_usecount != 1 || vp->v_writecount != 0) 417 return EBUSY; 418 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 419 NULL) != 0 && 420 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred)) 421 return EACCES; 422 423 if (vp->v_size != 0) { 424 error = ffs_truncate(vp, 0, 0, NOCRED); 425 if (error) 426 return error; 427 } 428 /* 429 * Write an empty list of preallocated blocks to the end of 430 * the snapshot to set size to at least that of the filesystem. 431 */ 432 numblks = howmany(fs->fs_size, fs->fs_frag); 433 blkno = 1; 434 blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs)); 435 error = vn_rdwr(UIO_WRITE, vp, 436 (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 437 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 438 if (error) 439 return error; 440 /* 441 * Preallocate critical data structures so that we can copy 442 * them in without further allocation after we suspend all 443 * operations on the filesystem. We would like to just release 444 * the allocated buffers without writing them since they will 445 * be filled in below once we are ready to go, but this upsets 446 * the soft update code, so we go ahead and write the new buffers. 447 * 448 * Allocate all indirect blocks and mark all of them as not 449 * needing to be copied. 450 */ 451 error = UFS_WAPBL_BEGIN(mp); 452 if (error) 453 return error; 454 for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) { 455 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 456 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 457 if (error) 458 goto out; 459 brelse(ibp, 0); 460 if (wbreak > 0 && (++n % wbreak) == 0) { 461 UFS_WAPBL_END(mp); 462 error = UFS_WAPBL_BEGIN(mp); 463 if (error) 464 return error; 465 } 466 } 467 /* 468 * Allocate copies for the superblock and its summary information. 469 */ 470 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred, 471 0, &nbp); 472 if (error) 473 goto out; 474 bawrite(nbp); 475 blkno = fragstoblks(fs, fs->fs_csaddr); 476 len = howmany(fs->fs_cssize, fs->fs_bsize); 477 for (loc = 0; loc < len; loc++) { 478 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), 479 fs->fs_bsize, l->l_cred, 0, &nbp); 480 if (error) 481 goto out; 482 bawrite(nbp); 483 } 484 485out: 486 UFS_WAPBL_END(mp); 487 return error; 488} 489 490/* 491 * Create a copy of the superblock and its summary information. 492 * It is up to the caller to free copyfs and copy_fs->fs_csp. 493 */ 494static int 495snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf) 496{ 497 int error, i, len, loc, size; 498 void *space; 499 int32_t *lp; 500 struct buf *bp; 501 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 502 struct lwp *l = curlwp; 503 struct vnode *devvp = VTOI(vp)->i_devvp; 504 505 /* 506 * Grab a copy of the superblock and its summary information. 507 * We delay writing it until the suspension is released below. 508 */ 509 *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 510 loc = blkoff(fs, fs->fs_sblockloc); 511 if (loc > 0) 512 memset(*sbbuf, 0, loc); 513 copyfs = (struct fs *)((char *)(*sbbuf) + loc); 514 memcpy(copyfs, fs, fs->fs_sbsize); 515 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 516 if (fs->fs_sbsize < size) 517 memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, 518 size - fs->fs_sbsize); 519 size = blkroundup(fs, fs->fs_cssize); 520 if (fs->fs_contigsumsize > 0) 521 size += fs->fs_ncg * sizeof(int32_t); 522 space = malloc(size, M_UFSMNT, M_WAITOK); 523 copyfs->fs_csp = space; 524 memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize); 525 space = (char *)space + fs->fs_cssize; 526 loc = howmany(fs->fs_cssize, fs->fs_fsize); 527 i = fs->fs_frag - loc % fs->fs_frag; 528 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 529 if (len > 0) { 530 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 531 len, l->l_cred, 0, &bp)) != 0) { 532 brelse(bp, 0); 533 free(copyfs->fs_csp, M_UFSMNT); 534 free(*sbbuf, M_UFSMNT); 535 *sbbuf = NULL; 536 return error; 537 } 538 memcpy(space, bp->b_data, (u_int)len); 539 space = (char *)space + len; 540 brelse(bp, BC_INVAL | BC_NOCACHE); 541 } 542 if (fs->fs_contigsumsize > 0) { 543 copyfs->fs_maxcluster = lp = space; 544 for (i = 0; i < fs->fs_ncg; i++) 545 *lp++ = fs->fs_contigsumsize; 546 } 547 if (mp->mnt_wapbl) 548 copyfs->fs_flags &= ~FS_DOWAPBL; 549 return 0; 550} 551 552/* 553 * We must check for active files that have been unlinked (e.g., with a zero 554 * link count). We have to expunge all trace of these files from the snapshot 555 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped. 556 * Note that we skip unlinked snapshot files as they will be handled separately. 557 * Calculate the snapshot list size and create a preliminary list. 558 */ 559static int 560snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs, 561 daddr_t *snaplistsize, daddr_t **snaplist) 562{ 563 int cg, error = 0, len, loc; 564 daddr_t blkno, *blkp; 565 struct fs *fs = VFSTOUFS(mp)->um_fs; 566 struct inode *xp; 567 struct lwp *l = curlwp; 568 struct vattr vat; 569 struct vnode *logvp = NULL, *mvp = NULL, *xvp; 570 571 *snaplist = NULL; 572 /* 573 * Get the log inode if any. 574 */ 575 if ((fs->fs_flags & FS_DOWAPBL) && 576 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { 577 error = VFS_VGET(mp, 578 fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp); 579 if (error) 580 goto out; 581 } 582 /* 583 * Allocate a marker vnode. 584 */ 585 if ((mvp = vnalloc(mp)) == NULL) { 586 error = ENOMEM; 587 goto out; 588 } 589 /* 590 * We also calculate the needed size for the snapshot list. 591 */ 592 *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 593 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 594 mutex_enter(&mntvnode_lock); 595 /* 596 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 597 * and vclean() can be called indirectly 598 */ 599 for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { 600 vmark(mvp, xvp); 601 /* 602 * Make sure this vnode wasn't reclaimed in getnewvnode(). 603 * Start over if it has (it won't be on the list anymore). 604 */ 605 if (xvp->v_mount != mp || vismarker(xvp)) 606 continue; 607 mutex_enter(&xvp->v_interlock); 608 if ((xvp->v_iflag & VI_XLOCK) || 609 xvp->v_usecount == 0 || xvp->v_type == VNON || 610 VTOI(xvp) == NULL || 611 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 612 mutex_exit(&xvp->v_interlock); 613 continue; 614 } 615 mutex_exit(&mntvnode_lock); 616 /* 617 * XXXAD should increase vnode ref count to prevent it 618 * disappearing or being recycled. 619 */ 620 mutex_exit(&xvp->v_interlock); 621#ifdef DEBUG 622 if (snapdebug) 623 vprint("ffs_snapshot: busy vnode", xvp); 624#endif 625 xp = VTOI(xvp); 626 if (xvp != logvp) { 627 if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && 628 vat.va_nlink > 0) { 629 mutex_enter(&mntvnode_lock); 630 continue; 631 } 632 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 633 mutex_enter(&mntvnode_lock); 634 continue; 635 } 636 } 637 /* 638 * If there is a fragment, clear it here. 639 */ 640 blkno = 0; 641 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 642 if (loc < NDADDR) { 643 len = fragroundup(fs, blkoff(fs, xp->i_size)); 644 if (len > 0 && len < fs->fs_bsize) { 645 error = UFS_WAPBL_BEGIN(mp); 646 if (error) { 647 (void)vunmark(mvp); 648 goto out; 649 } 650 ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc), 651 len, xp->i_number); 652 blkno = db_get(xp, loc); 653 db_assign(xp, loc, 0); 654 UFS_WAPBL_END(mp); 655 } 656 } 657 *snaplistsize += 1; 658 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); 659 if (blkno) 660 db_assign(xp, loc, blkno); 661 if (!error) { 662 error = UFS_WAPBL_BEGIN(mp); 663 if (!error) { 664 error = ffs_freefile_snap(copy_fs, vp, 665 xp->i_number, xp->i_mode); 666 UFS_WAPBL_END(mp); 667 } 668 } 669 if (error) { 670 (void)vunmark(mvp); 671 goto out; 672 } 673 mutex_enter(&mntvnode_lock); 674 } 675 mutex_exit(&mntvnode_lock); 676 /* 677 * Create a preliminary list of preallocated snapshot blocks. 678 */ 679 *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 680 blkp = &(*snaplist)[1]; 681 *blkp++ = lblkno(fs, fs->fs_sblockloc); 682 blkno = fragstoblks(fs, fs->fs_csaddr); 683 for (cg = 0; cg < fs->fs_ncg; cg++) { 684 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 685 break; 686 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 687 } 688 len = howmany(fs->fs_cssize, fs->fs_bsize); 689 for (loc = 0; loc < len; loc++) 690 *blkp++ = blkno + loc; 691 for (; cg < fs->fs_ncg; cg++) 692 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 693 (*snaplist)[0] = blkp - &(*snaplist)[0]; 694 695out: 696 if (mvp != NULL) 697 vnfree(mvp); 698 if (logvp != NULL) 699 vput(logvp); 700 if (error && *snaplist != NULL) { 701 free(*snaplist, M_UFSMNT); 702 *snaplist = NULL; 703 } 704 705 return error; 706} 707 708/* 709 * Copy allocation information from all the snapshots in this snapshot and 710 * then expunge them from its view. Also, collect the list of allocated 711 * blocks in i_snapblklist. 712 */ 713static int 714snapshot_expunge_snap(struct mount *mp, struct vnode *vp, 715 struct fs *copy_fs, daddr_t snaplistsize) 716{ 717 int error = 0, i; 718 daddr_t numblks, *snaplist = NULL; 719 struct fs *fs = VFSTOUFS(mp)->um_fs; 720 struct inode *ip = VTOI(vp), *xp; 721 struct lwp *l = curlwp; 722 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 723 724 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { 725 if (xp != ip) { 726 error = expunge(vp, xp, fs, snapacct, BLK_SNAP); 727 if (error) 728 break; 729 } 730 if (xp->i_nlink != 0) 731 continue; 732 error = UFS_WAPBL_BEGIN(mp); 733 if (error) 734 break; 735 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); 736 UFS_WAPBL_END(mp); 737 if (error) 738 break; 739 } 740 if (error) 741 goto out; 742 /* 743 * Allocate space for the full list of preallocated snapshot blocks. 744 */ 745 snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 746 ip->i_snapblklist = &snaplist[1]; 747 /* 748 * Expunge the blocks used by the snapshots from the set of 749 * blocks marked as used in the snapshot bitmaps. Also, collect 750 * the list of allocated blocks in i_snapblklist. 751 */ 752 error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); 753 if (error) 754 goto out; 755 if (snaplistsize < ip->i_snapblklist - snaplist) 756 panic("ffs_snapshot: list too small"); 757 snaplistsize = ip->i_snapblklist - snaplist; 758 snaplist[0] = snaplistsize; 759 ip->i_snapblklist = &snaplist[0]; 760 /* 761 * Write out the list of allocated blocks to the end of the snapshot. 762 */ 763 numblks = howmany(fs->fs_size, fs->fs_frag); 764 for (i = 0; i < snaplistsize; i++) 765 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 766 error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist, 767 snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks), 768 UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL); 769 for (i = 0; i < snaplistsize; i++) 770 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 771out: 772 if (error && snaplist != NULL) { 773 free(snaplist, M_UFSMNT); 774 ip->i_snapblklist = NULL; 775 } 776 return error; 777} 778 779/* 780 * Write the superblock and its summary information to the snapshot. 781 * Make sure, the first NDADDR blocks get copied to the snapshot. 782 */ 783static int 784snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf) 785{ 786 int error, len, loc; 787 void *space; 788 daddr_t blkno; 789 struct buf *bp; 790 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 791 struct inode *ip = VTOI(vp); 792 struct lwp *l = curlwp; 793 794 copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); 795 796 /* 797 * Write the superblock and its summary information 798 * to the snapshot. 799 */ 800 blkno = fragstoblks(fs, fs->fs_csaddr); 801 len = howmany(fs->fs_cssize, fs->fs_bsize); 802 space = copyfs->fs_csp; 803#ifdef FFS_EI 804 if (UFS_FSNEEDSWAP(fs)) { 805 ffs_sb_swap(copyfs, copyfs); 806 ffs_csum_swap(space, space, fs->fs_cssize); 807 } 808#endif 809 error = UFS_WAPBL_BEGIN(mp); 810 if (error) 811 return error; 812 for (loc = 0; loc < len; loc++) { 813 error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred, 814 B_MODIFY, &bp); 815 if (error) { 816 brelse(bp, 0); 817 break; 818 } 819 memcpy(bp->b_data, space, fs->fs_bsize); 820 space = (char *)space + fs->fs_bsize; 821 bawrite(bp); 822 } 823 if (error) 824 goto out; 825 error = bread(vp, lblkno(fs, fs->fs_sblockloc), 826 fs->fs_bsize, l->l_cred, B_MODIFY, &bp); 827 if (error) { 828 brelse(bp, 0); 829 goto out; 830 } else { 831 memcpy(bp->b_data, sbbuf, fs->fs_bsize); 832 bawrite(bp); 833 } 834 /* 835 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite() 836 * and ffs_snapblkfree() will always work on indirect blocks. 837 */ 838 for (loc = 0; loc < NDADDR; loc++) { 839 if (db_get(ip, loc) != 0) 840 continue; 841 error = ffs_balloc(vp, lblktosize(fs, (off_t)loc), 842 fs->fs_bsize, l->l_cred, 0, &bp); 843 if (error) 844 break; 845 error = rwfsblk(vp, B_READ, bp->b_data, loc); 846 if (error) { 847 brelse(bp, 0); 848 break; 849 } 850 bawrite(bp); 851 } 852 853out: 854 UFS_WAPBL_END(mp); 855 return error; 856} 857 858/* 859 * Copy all cylinder group maps. 860 */ 861static int 862cgaccount(struct vnode *vp, int passno, int *redo) 863{ 864 int cg, error = 0; 865 struct buf *nbp; 866 struct fs *fs = VTOI(vp)->i_fs; 867 868 if (redo != NULL) 869 *redo = 0; 870 if (passno == 1) 871 fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY), 872 M_DEVBUF, M_WAITOK | M_ZERO); 873 for (cg = 0; cg < fs->fs_ncg; cg++) { 874 if (passno == 2 && ACTIVECG_ISSET(fs, cg)) 875 continue; 876 877 if (redo != NULL) 878 *redo += 1; 879 error = UFS_WAPBL_BEGIN(vp->v_mount); 880 if (error) 881 return error; 882 error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 883 fs->fs_bsize, curlwp->l_cred, 0, &nbp); 884 if (error) { 885 UFS_WAPBL_END(vp->v_mount); 886 break; 887 } 888 error = cgaccount1(cg, vp, nbp->b_data, passno); 889 bawrite(nbp); 890 UFS_WAPBL_END(vp->v_mount); 891 if (error) 892 break; 893 } 894 return error; 895} 896 897/* 898 * Copy a cylinder group map. All the unallocated blocks are marked 899 * BLK_NOCOPY so that the snapshot knows that it need not copy them 900 * if they are later written. If passno is one, then this is a first 901 * pass, so only setting needs to be done. If passno is 2, then this 902 * is a revision to a previous pass which must be undone as the 903 * replacement pass is done. 904 */ 905static int 906cgaccount1(int cg, struct vnode *vp, void *data, int passno) 907{ 908 struct buf *bp, *ibp; 909 struct inode *ip; 910 struct cg *cgp; 911 struct fs *fs; 912 struct lwp *l = curlwp; 913 daddr_t base, numblks; 914 int error, len, loc, ns, indiroff; 915 916 ip = VTOI(vp); 917 fs = ip->i_fs; 918 ns = UFS_FSNEEDSWAP(fs); 919 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 920 (int)fs->fs_cgsize, l->l_cred, 0, &bp); 921 if (error) { 922 brelse(bp, 0); 923 return (error); 924 } 925 cgp = (struct cg *)bp->b_data; 926 if (!cg_chkmagic(cgp, ns)) { 927 brelse(bp, 0); 928 return (EIO); 929 } 930 ACTIVECG_SET(fs, cg); 931 932 memcpy(data, bp->b_data, fs->fs_cgsize); 933 brelse(bp, 0); 934 if (fs->fs_cgsize < fs->fs_bsize) 935 memset((char *)data + fs->fs_cgsize, 0, 936 fs->fs_bsize - fs->fs_cgsize); 937 numblks = howmany(fs->fs_size, fs->fs_frag); 938 len = howmany(fs->fs_fpg, fs->fs_frag); 939 base = cg * fs->fs_fpg / fs->fs_frag; 940 if (base + len >= numblks) 941 len = numblks - base - 1; 942 loc = 0; 943 if (base < NDADDR) { 944 for ( ; loc < NDADDR; loc++) { 945 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 946 db_assign(ip, loc, BLK_NOCOPY); 947 else if (db_get(ip, loc) == BLK_NOCOPY) { 948 if (passno == 2) 949 db_assign(ip, loc, 0); 950 else if (passno == 1) 951 panic("ffs_snapshot: lost direct block"); 952 } 953 } 954 } 955 if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), 956 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 957 return (error); 958 indiroff = (base + loc - NDADDR) % NINDIR(fs); 959 for ( ; loc < len; loc++, indiroff++) { 960 if (indiroff >= NINDIR(fs)) { 961 bawrite(ibp); 962 if ((error = ffs_balloc(vp, 963 lblktosize(fs, (off_t)(base + loc)), 964 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 965 return (error); 966 indiroff = 0; 967 } 968 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 969 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 970 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 971 if (passno == 2) 972 idb_assign(ip, ibp->b_data, indiroff, 0); 973 else if (passno == 1) 974 panic("ffs_snapshot: lost indirect block"); 975 } 976 } 977 bdwrite(ibp); 978 return (0); 979} 980 981/* 982 * Before expunging a snapshot inode, note all the 983 * blocks that it claims with BLK_SNAP so that fsck will 984 * be able to account for those blocks properly and so 985 * that this snapshot knows that it need not copy them 986 * if the other snapshot holding them is freed. 987 */ 988static int 989expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 990 acctfunc_t acctfunc, int expungetype) 991{ 992 int i, error, ns; 993 daddr_t lbn, rlbn; 994 daddr_t len, blkno, numblks, blksperindir; 995 struct ufs1_dinode *dip1; 996 struct ufs2_dinode *dip2; 997 struct lwp *l = curlwp; 998 void *bap; 999 struct buf *bp; 1000 struct mount *mp; 1001 1002 ns = UFS_FSNEEDSWAP(fs); 1003 mp = snapvp->v_mount; 1004 1005 error = UFS_WAPBL_BEGIN(mp); 1006 if (error) 1007 return error; 1008 /* 1009 * Prepare to expunge the inode. If its inode block has not 1010 * yet been copied, then allocate and fill the copy. 1011 */ 1012 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1013 error = snapblkaddr(snapvp, lbn, &blkno); 1014 if (error) 1015 return error; 1016 if (blkno != 0) { 1017 error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred, 1018 B_MODIFY, &bp); 1019 } else { 1020 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 1021 fs->fs_bsize, l->l_cred, 0, &bp); 1022 if (! error) 1023 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); 1024 } 1025 if (error) { 1026 UFS_WAPBL_END(mp); 1027 return error; 1028 } 1029 /* 1030 * Set a snapshot inode to be a zero length file, regular files 1031 * or unlinked snapshots to be completely unallocated. 1032 */ 1033 if (fs->fs_magic == FS_UFS1_MAGIC) { 1034 dip1 = (struct ufs1_dinode *)bp->b_data + 1035 ino_to_fsbo(fs, cancelip->i_number); 1036 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1037 dip1->di_mode = 0; 1038 dip1->di_size = 0; 1039 dip1->di_blocks = 0; 1040 dip1->di_flags = 1041 ufs_rw32(ufs_rw32(dip1->di_flags, ns) & ~SF_SNAPSHOT, ns); 1042 memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t)); 1043 } else { 1044 dip2 = (struct ufs2_dinode *)bp->b_data + 1045 ino_to_fsbo(fs, cancelip->i_number); 1046 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1047 dip2->di_mode = 0; 1048 dip2->di_size = 0; 1049 dip2->di_blocks = 0; 1050 dip2->di_flags = 1051 ufs_rw32(ufs_rw32(dip2->di_flags, ns) & ~SF_SNAPSHOT, ns); 1052 memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t)); 1053 } 1054 bdwrite(bp); 1055 UFS_WAPBL_END(mp); 1056 /* 1057 * Now go through and expunge all the blocks in the file 1058 * using the function requested. 1059 */ 1060 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1061 if (fs->fs_magic == FS_UFS1_MAGIC) 1062 bap = &cancelip->i_ffs1_db[0]; 1063 else 1064 bap = &cancelip->i_ffs2_db[0]; 1065 error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype); 1066 if (error) 1067 return (error); 1068 if (fs->fs_magic == FS_UFS1_MAGIC) 1069 bap = &cancelip->i_ffs1_ib[0]; 1070 else 1071 bap = &cancelip->i_ffs2_ib[0]; 1072 error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype); 1073 if (error) 1074 return (error); 1075 blksperindir = 1; 1076 lbn = -NDADDR; 1077 len = numblks - NDADDR; 1078 rlbn = NDADDR; 1079 for (i = 0; len > 0 && i < NIADDR; i++) { 1080 error = indiracct(snapvp, ITOV(cancelip), i, 1081 ib_get(cancelip, i), lbn, rlbn, len, 1082 blksperindir, fs, acctfunc, expungetype); 1083 if (error) 1084 return (error); 1085 blksperindir *= NINDIR(fs); 1086 lbn -= blksperindir + 1; 1087 len -= blksperindir; 1088 rlbn += blksperindir; 1089 } 1090 return (0); 1091} 1092 1093/* 1094 * Descend an indirect block chain for vnode cancelvp accounting for all 1095 * its indirect blocks in snapvp. 1096 */ 1097static int 1098indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, 1099 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, 1100 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) 1101{ 1102 int error, num, i; 1103 daddr_t subblksperindir; 1104 struct indir indirs[NIADDR + 2]; 1105 daddr_t last; 1106 void *bap; 1107 struct buf *bp; 1108 1109 if (blkno == 0) { 1110 if (expungetype == BLK_NOCOPY) 1111 return (0); 1112 panic("indiracct: missing indir"); 1113 } 1114 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1115 return (error); 1116 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1117 panic("indiracct: botched params"); 1118 /* 1119 * We have to expand bread here since it will deadlock looking 1120 * up the block number for any blocks that are not in the cache. 1121 */ 1122 error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, 1123 false, &bp); 1124 if (error) 1125 return error; 1126 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 1127 rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { 1128 brelse(bp, 0); 1129 return (error); 1130 } 1131 /* 1132 * Account for the block pointers in this indirect block. 1133 */ 1134 last = howmany(remblks, blksperindir); 1135 if (last > NINDIR(fs)) 1136 last = NINDIR(fs); 1137 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); 1138 memcpy((void *)bap, bp->b_data, fs->fs_bsize); 1139 brelse(bp, 0); 1140 error = (*acctfunc)(snapvp, bap, 0, last, 1141 fs, level == 0 ? rlbn : -1, expungetype); 1142 if (error || level == 0) 1143 goto out; 1144 /* 1145 * Account for the block pointers in each of the indirect blocks 1146 * in the levels below us. 1147 */ 1148 subblksperindir = blksperindir / NINDIR(fs); 1149 for (lbn++, level--, i = 0; i < last; i++) { 1150 error = indiracct(snapvp, cancelvp, level, 1151 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, 1152 subblksperindir, fs, acctfunc, expungetype); 1153 if (error) 1154 goto out; 1155 rlbn += blksperindir; 1156 lbn -= blksperindir; 1157 remblks -= blksperindir; 1158 } 1159out: 1160 free(bap, M_DEVBUF); 1161 return (error); 1162} 1163 1164/* 1165 * Do both snap accounting and map accounting. 1166 */ 1167static int 1168fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1169 struct fs *fs, daddr_t lblkno, 1170 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1171{ 1172 int error; 1173 1174 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) 1175 return (error); 1176 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); 1177} 1178 1179/* 1180 * Identify a set of blocks allocated in a snapshot inode. 1181 */ 1182static int 1183snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1184 struct fs *fs, daddr_t lblkno, 1185 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1186{ 1187 struct inode *ip = VTOI(vp); 1188 struct lwp *l = curlwp; 1189 struct mount *mp = vp->v_mount; 1190 daddr_t blkno; 1191 daddr_t lbn; 1192 struct buf *ibp; 1193 int error, n; 1194 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; 1195 1196 error = UFS_WAPBL_BEGIN(mp); 1197 if (error) 1198 return error; 1199 for ( n = 0; oldblkp < lastblkp; oldblkp++) { 1200 blkno = idb_get(ip, bap, oldblkp); 1201 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1202 continue; 1203 lbn = fragstoblks(fs, blkno); 1204 if (lbn < NDADDR) { 1205 blkno = db_get(ip, lbn); 1206 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1207 } else { 1208 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1209 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1210 if (error) 1211 break; 1212 blkno = idb_get(ip, ibp->b_data, 1213 (lbn - NDADDR) % NINDIR(fs)); 1214 } 1215 /* 1216 * If we are expunging a snapshot vnode and we 1217 * find a block marked BLK_NOCOPY, then it is 1218 * one that has been allocated to this snapshot after 1219 * we took our current snapshot and can be ignored. 1220 */ 1221 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1222 if (lbn >= NDADDR) 1223 brelse(ibp, 0); 1224 } else { 1225 if (blkno != 0) 1226 panic("snapacct: bad block"); 1227 if (lbn < NDADDR) 1228 db_assign(ip, lbn, expungetype); 1229 else { 1230 idb_assign(ip, ibp->b_data, 1231 (lbn - NDADDR) % NINDIR(fs), expungetype); 1232 bdwrite(ibp); 1233 } 1234 } 1235 if (wbreak > 0 && (++n % wbreak) == 0) { 1236 UFS_WAPBL_END(mp); 1237 error = UFS_WAPBL_BEGIN(mp); 1238 if (error) 1239 return error; 1240 } 1241 } 1242 UFS_WAPBL_END(mp); 1243 return error; 1244} 1245 1246/* 1247 * Account for a set of blocks allocated in a snapshot inode. 1248 */ 1249static int 1250mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1251 struct fs *fs, daddr_t lblkno, int expungetype) 1252{ 1253 daddr_t blkno; 1254 struct inode *ip; 1255 struct mount *mp = vp->v_mount; 1256 ino_t inum; 1257 int acctit, error, n; 1258 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; 1259 1260 error = UFS_WAPBL_BEGIN(mp); 1261 if (error) 1262 return error; 1263 ip = VTOI(vp); 1264 inum = ip->i_number; 1265 if (lblkno == -1) 1266 acctit = 0; 1267 else 1268 acctit = 1; 1269 for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) { 1270 blkno = idb_get(ip, bap, oldblkp); 1271 if (blkno == 0 || blkno == BLK_NOCOPY) 1272 continue; 1273 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1274 *ip->i_snapblklist++ = lblkno; 1275 if (blkno == BLK_SNAP) 1276 blkno = blkstofrags(fs, lblkno); 1277 ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum); 1278 if (wbreak > 0 && (++n % wbreak) == 0) { 1279 UFS_WAPBL_END(mp); 1280 error = UFS_WAPBL_BEGIN(mp); 1281 if (error) 1282 return error; 1283 } 1284 } 1285 UFS_WAPBL_END(mp); 1286 return (0); 1287} 1288 1289/* 1290 * Number of blocks that fit into the journal or zero if not logging. 1291 */ 1292static int 1293blocks_in_journal(struct fs *fs) 1294{ 1295 off_t bpj; 1296 1297 if ((fs->fs_flags & FS_DOWAPBL) == 0) 1298 return 0; 1299 bpj = 1; 1300 if (fs->fs_journal_version == UFS_WAPBL_VERSION) { 1301 switch (fs->fs_journal_location) { 1302 case UFS_WAPBL_JOURNALLOC_END_PARTITION: 1303 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]* 1304 fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; 1305 break; 1306 case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: 1307 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]* 1308 fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; 1309 break; 1310 } 1311 } 1312 bpj /= fs->fs_bsize; 1313 return (bpj > 0 ? bpj : 1); 1314} 1315#endif /* defined(FFS_NO_SNAPSHOT) */ 1316 1317/* 1318 * Decrement extra reference on snapshot when last name is removed. 1319 * It will not be freed until the last open reference goes away. 1320 */ 1321void 1322ffs_snapgone(struct inode *ip) 1323{ 1324 struct mount *mp = ip->i_devvp->v_specmountpoint; 1325 struct inode *xp; 1326 struct fs *fs; 1327 struct snap_info *si; 1328 int snaploc; 1329 1330 si = VFSTOUFS(mp)->um_snapinfo; 1331 1332 /* 1333 * Find snapshot in incore list. 1334 */ 1335 mutex_enter(&si->si_lock); 1336 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) 1337 if (xp == ip) 1338 break; 1339 mutex_exit(&si->si_lock); 1340 if (xp != NULL) 1341 vrele(ITOV(ip)); 1342#ifdef DEBUG 1343 else if (snapdebug) 1344 printf("ffs_snapgone: lost snapshot vnode %llu\n", 1345 (unsigned long long)ip->i_number); 1346#endif 1347 /* 1348 * Delete snapshot inode from superblock. Keep list dense. 1349 */ 1350 mutex_enter(&si->si_lock); 1351 fs = ip->i_fs; 1352 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1353 if (fs->fs_snapinum[snaploc] == ip->i_number) 1354 break; 1355 if (snaploc < FSMAXSNAP) { 1356 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1357 if (fs->fs_snapinum[snaploc] == 0) 1358 break; 1359 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1360 } 1361 fs->fs_snapinum[snaploc - 1] = 0; 1362 } 1363 si->si_gen++; 1364 mutex_exit(&si->si_lock); 1365} 1366 1367/* 1368 * Prepare a snapshot file for being removed. 1369 */ 1370void 1371ffs_snapremove(struct vnode *vp) 1372{ 1373 struct inode *ip = VTOI(vp), *xp; 1374 struct vnode *devvp = ip->i_devvp; 1375 struct fs *fs = ip->i_fs; 1376 struct mount *mp = devvp->v_specmountpoint; 1377 struct buf *ibp; 1378 struct snap_info *si; 1379 struct lwp *l = curlwp; 1380 daddr_t numblks, blkno, dblk; 1381 int error, loc, last; 1382 1383 si = VFSTOUFS(mp)->um_snapinfo; 1384 /* 1385 * If active, delete from incore list (this snapshot may 1386 * already have been in the process of being deleted, so 1387 * would not have been active). 1388 * 1389 * Clear copy-on-write flag if last snapshot. 1390 */ 1391 mutex_enter(&si->si_snaplock); 1392 mutex_enter(&si->si_lock); 1393 if (is_active_snapshot(si, ip)) { 1394 TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap); 1395 if (TAILQ_FIRST(&si->si_snapshots) != 0) { 1396 /* Roll back the list of preallocated blocks. */ 1397 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1398 si->si_snapblklist = xp->i_snapblklist; 1399 si->si_gen++; 1400 mutex_exit(&si->si_lock); 1401 mutex_exit(&si->si_snaplock); 1402 } else { 1403 si->si_snapblklist = 0; 1404 si->si_gen++; 1405 mutex_exit(&si->si_lock); 1406 mutex_exit(&si->si_snaplock); 1407 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1408 } 1409 if (ip->i_snapblklist != NULL) { 1410 free(ip->i_snapblklist, M_UFSMNT); 1411 ip->i_snapblklist = NULL; 1412 } 1413 } else { 1414 mutex_exit(&si->si_lock); 1415 mutex_exit(&si->si_snaplock); 1416 } 1417 /* 1418 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1419 * snapshots that want them (see ffs_snapblkfree below). 1420 */ 1421 for (blkno = 1; blkno < NDADDR; blkno++) { 1422 dblk = db_get(ip, blkno); 1423 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1424 db_assign(ip, blkno, 0); 1425 else if ((dblk == blkstofrags(fs, blkno) && 1426 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1427 ip->i_number))) { 1428 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1429 db_assign(ip, blkno, 0); 1430 } 1431 } 1432 numblks = howmany(ip->i_size, fs->fs_bsize); 1433 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 1434 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 1435 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1436 if (error) 1437 continue; 1438 if (fs->fs_size - blkno > NINDIR(fs)) 1439 last = NINDIR(fs); 1440 else 1441 last = fs->fs_size - blkno; 1442 for (loc = 0; loc < last; loc++) { 1443 dblk = idb_get(ip, ibp->b_data, loc); 1444 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1445 idb_assign(ip, ibp->b_data, loc, 0); 1446 else if (dblk == blkstofrags(fs, blkno) && 1447 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1448 fs->fs_bsize, ip->i_number)) { 1449 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1450 idb_assign(ip, ibp->b_data, loc, 0); 1451 } 1452 } 1453 bawrite(ibp); 1454 UFS_WAPBL_END(mp); 1455 error = UFS_WAPBL_BEGIN(mp); 1456 KASSERT(error == 0); 1457 } 1458 /* 1459 * Clear snapshot flag and drop reference. 1460 */ 1461 ip->i_flags &= ~SF_SNAPSHOT; 1462 DIP_ASSIGN(ip, flags, ip->i_flags); 1463 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1464} 1465 1466/* 1467 * Notification that a block is being freed. Return zero if the free 1468 * should be allowed to proceed. Return non-zero if the snapshot file 1469 * wants to claim the block. The block will be claimed if it is an 1470 * uncopied part of one of the snapshots. It will be freed if it is 1471 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1472 * If a fragment is being freed, then all snapshots that care about 1473 * it must make a copy since a snapshot file can only claim full sized 1474 * blocks. Note that if more than one snapshot file maps the block, 1475 * we can pick one at random to claim it. Since none of the snapshots 1476 * can change, we are assurred that they will all see the same unmodified 1477 * image. When deleting a snapshot file (see ffs_snapremove above), we 1478 * must push any of these claimed blocks to one of the other snapshots 1479 * that maps it. These claimed blocks are easily identified as they will 1480 * have a block number equal to their logical block number within the 1481 * snapshot. A copied block can never have this property because they 1482 * must always have been allocated from a BLK_NOCOPY location. 1483 */ 1484int 1485ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, 1486 long size, ino_t inum) 1487{ 1488 struct mount *mp = devvp->v_specmountpoint; 1489 struct buf *ibp; 1490 struct inode *ip; 1491 struct vnode *vp = NULL; 1492 struct snap_info *si; 1493 void *saved_data = NULL; 1494 daddr_t lbn; 1495 daddr_t blkno; 1496 uint32_t gen; 1497 int indiroff = 0, error = 0, claimedblk = 0; 1498 1499 si = VFSTOUFS(mp)->um_snapinfo; 1500 lbn = fragstoblks(fs, bno); 1501 mutex_enter(&si->si_snaplock); 1502 mutex_enter(&si->si_lock); 1503 si->si_owner = curlwp; 1504 1505retry: 1506 gen = si->si_gen; 1507 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1508 vp = ITOV(ip); 1509 /* 1510 * Lookup block being written. 1511 */ 1512 if (lbn < NDADDR) { 1513 blkno = db_get(ip, lbn); 1514 } else { 1515 mutex_exit(&si->si_lock); 1516 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1517 fs->fs_bsize, FSCRED, B_METAONLY, &ibp); 1518 if (error) { 1519 mutex_enter(&si->si_lock); 1520 break; 1521 } 1522 indiroff = (lbn - NDADDR) % NINDIR(fs); 1523 blkno = idb_get(ip, ibp->b_data, indiroff); 1524 mutex_enter(&si->si_lock); 1525 if (gen != si->si_gen) { 1526 brelse(ibp, 0); 1527 goto retry; 1528 } 1529 } 1530 /* 1531 * Check to see if block needs to be copied. 1532 */ 1533 if (blkno == 0) { 1534 /* 1535 * A block that we map is being freed. If it has not 1536 * been claimed yet, we will claim or copy it (below). 1537 */ 1538 claimedblk = 1; 1539 } else if (blkno == BLK_SNAP) { 1540 /* 1541 * No previous snapshot claimed the block, 1542 * so it will be freed and become a BLK_NOCOPY 1543 * (don't care) for us. 1544 */ 1545 if (claimedblk) 1546 panic("snapblkfree: inconsistent block type"); 1547 if (lbn < NDADDR) { 1548 db_assign(ip, lbn, BLK_NOCOPY); 1549 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1550 } else { 1551 idb_assign(ip, ibp->b_data, indiroff, 1552 BLK_NOCOPY); 1553 mutex_exit(&si->si_lock); 1554 if (ip->i_nlink > 0) 1555 bwrite(ibp); 1556 else 1557 bdwrite(ibp); 1558 mutex_enter(&si->si_lock); 1559 if (gen != si->si_gen) 1560 goto retry; 1561 } 1562 continue; 1563 } else /* BLK_NOCOPY or default */ { 1564 /* 1565 * If the snapshot has already copied the block 1566 * (default), or does not care about the block, 1567 * it is not needed. 1568 */ 1569 if (lbn >= NDADDR) 1570 brelse(ibp, 0); 1571 continue; 1572 } 1573 /* 1574 * If this is a full size block, we will just grab it 1575 * and assign it to the snapshot inode. Otherwise we 1576 * will proceed to copy it. See explanation for this 1577 * routine as to why only a single snapshot needs to 1578 * claim this block. 1579 */ 1580 if (size == fs->fs_bsize) { 1581#ifdef DEBUG 1582 if (snapdebug) 1583 printf("%s %llu lbn %" PRId64 1584 "from inum %llu\n", 1585 "Grabonremove: snapino", 1586 (unsigned long long)ip->i_number, 1587 lbn, (unsigned long long)inum); 1588#endif 1589 mutex_exit(&si->si_lock); 1590 if (lbn < NDADDR) { 1591 db_assign(ip, lbn, bno); 1592 } else { 1593 idb_assign(ip, ibp->b_data, indiroff, bno); 1594 if (ip->i_nlink > 0) 1595 bwrite(ibp); 1596 else 1597 bdwrite(ibp); 1598 } 1599 DIP_ADD(ip, blocks, btodb(size)); 1600 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1601 if (ip->i_nlink > 0 && mp->mnt_wapbl) 1602 error = syncsnap(vp); 1603 else 1604 error = 0; 1605 mutex_enter(&si->si_lock); 1606 si->si_owner = NULL; 1607 mutex_exit(&si->si_lock); 1608 mutex_exit(&si->si_snaplock); 1609 return (error == 0); 1610 } 1611 if (lbn >= NDADDR) 1612 brelse(ibp, 0); 1613#ifdef DEBUG 1614 if (snapdebug) 1615 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", 1616 "Copyonremove: snapino ", 1617 (unsigned long long)ip->i_number, 1618 lbn, "for inum", (unsigned long long)inum, size); 1619#endif 1620 /* 1621 * If we have already read the old block contents, then 1622 * simply copy them to the new block. Note that we need 1623 * to synchronously write snapshots that have not been 1624 * unlinked, and hence will be visible after a crash, 1625 * to ensure their integrity. 1626 */ 1627 mutex_exit(&si->si_lock); 1628 if (saved_data == NULL) { 1629 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1630 error = rwfsblk(vp, B_READ, saved_data, lbn); 1631 if (error) { 1632 free(saved_data, M_UFSMNT); 1633 saved_data = NULL; 1634 mutex_enter(&si->si_lock); 1635 break; 1636 } 1637 } 1638 error = wrsnapblk(vp, saved_data, lbn); 1639 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) 1640 error = syncsnap(vp); 1641 mutex_enter(&si->si_lock); 1642 if (error) 1643 break; 1644 if (gen != si->si_gen) 1645 goto retry; 1646 } 1647 si->si_owner = NULL; 1648 mutex_exit(&si->si_lock); 1649 mutex_exit(&si->si_snaplock); 1650 if (saved_data) 1651 free(saved_data, M_UFSMNT); 1652 /* 1653 * If we have been unable to allocate a block in which to do 1654 * the copy, then return non-zero so that the fragment will 1655 * not be freed. Although space will be lost, the snapshot 1656 * will stay consistent. 1657 */ 1658 return (error); 1659} 1660 1661/* 1662 * Associate snapshot files when mounting. 1663 */ 1664void 1665ffs_snapshot_mount(struct mount *mp) 1666{ 1667 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1668 struct fs *fs = VFSTOUFS(mp)->um_fs; 1669 struct lwp *l = curlwp; 1670 struct vnode *vp; 1671 struct inode *ip, *xp; 1672 struct snap_info *si; 1673 daddr_t snaplistsize, *snapblklist; 1674 int i, error, ns, snaploc, loc; 1675 1676 /* 1677 * No persistent snapshots on apple ufs file systems. 1678 */ 1679 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) 1680 return; 1681 1682 si = VFSTOUFS(mp)->um_snapinfo; 1683 ns = UFS_FSNEEDSWAP(fs); 1684 /* 1685 * XXX The following needs to be set before ffs_truncate or 1686 * VOP_READ can be called. 1687 */ 1688 mp->mnt_stat.f_iosize = fs->fs_bsize; 1689 /* 1690 * Process each snapshot listed in the superblock. 1691 */ 1692 vp = NULL; 1693 mutex_enter(&si->si_lock); 1694 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1695 if (fs->fs_snapinum[snaploc] == 0) 1696 break; 1697 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1698 &vp)) != 0) { 1699 printf("ffs_snapshot_mount: vget failed %d\n", error); 1700 continue; 1701 } 1702 ip = VTOI(vp); 1703 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1704 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1705 fs->fs_snapinum[snaploc]); 1706 vput(vp); 1707 vp = NULL; 1708 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1709 if (fs->fs_snapinum[loc] == 0) 1710 break; 1711 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1712 } 1713 fs->fs_snapinum[loc - 1] = 0; 1714 snaploc--; 1715 continue; 1716 } 1717 1718 /* 1719 * Read the block hints list. Use an empty list on 1720 * read errors. 1721 */ 1722 error = vn_rdwr(UIO_READ, vp, 1723 (void *)&snaplistsize, sizeof(snaplistsize), 1724 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1725 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, 1726 l->l_cred, NULL, NULL); 1727 if (error) { 1728 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1729 snaplistsize = 1; 1730 } else 1731 snaplistsize = ufs_rw64(snaplistsize, ns); 1732 snapblklist = malloc( 1733 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 1734 if (error) 1735 snapblklist[0] = 1; 1736 else { 1737 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, 1738 snaplistsize * sizeof(daddr_t), 1739 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1740 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, 1741 l->l_cred, NULL, NULL); 1742 for (i = 0; i < snaplistsize; i++) 1743 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 1744 if (error) { 1745 printf("ffs_snapshot_mount: read_2 failed %d\n", 1746 error); 1747 snapblklist[0] = 1; 1748 } 1749 } 1750 ip->i_snapblklist = &snapblklist[0]; 1751 1752 /* 1753 * Link it onto the active snapshot list. 1754 */ 1755 if (is_active_snapshot(si, ip)) 1756 panic("ffs_snapshot_mount: %"PRIu64" already on list", 1757 ip->i_number); 1758 else 1759 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 1760 vp->v_vflag |= VV_SYSTEM; 1761 VOP_UNLOCK(vp); 1762 } 1763 /* 1764 * No usable snapshots found. 1765 */ 1766 if (vp == NULL) { 1767 mutex_exit(&si->si_lock); 1768 return; 1769 } 1770 /* 1771 * Attach the block hints list. We always want to 1772 * use the list from the newest snapshot. 1773 */ 1774 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1775 si->si_snapblklist = xp->i_snapblklist; 1776 fscow_establish(mp, ffs_copyonwrite, devvp); 1777 si->si_gen++; 1778 mutex_exit(&si->si_lock); 1779} 1780 1781/* 1782 * Disassociate snapshot files when unmounting. 1783 */ 1784void 1785ffs_snapshot_unmount(struct mount *mp) 1786{ 1787 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1788 struct inode *xp; 1789 struct vnode *vp = NULL; 1790 struct snap_info *si; 1791 1792 si = VFSTOUFS(mp)->um_snapinfo; 1793 mutex_enter(&si->si_lock); 1794 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) { 1795 vp = ITOV(xp); 1796 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap); 1797 if (xp->i_snapblklist == si->si_snapblklist) 1798 si->si_snapblklist = NULL; 1799 free(xp->i_snapblklist, M_UFSMNT); 1800 if (xp->i_nlink > 0) { 1801 si->si_gen++; 1802 mutex_exit(&si->si_lock); 1803 vrele(vp); 1804 mutex_enter(&si->si_lock); 1805 } 1806 } 1807 si->si_gen++; 1808 mutex_exit(&si->si_lock); 1809 if (vp) 1810 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1811} 1812 1813/* 1814 * Check for need to copy block that is about to be written, 1815 * copying the block if necessary. 1816 */ 1817static int 1818ffs_copyonwrite(void *v, struct buf *bp, bool data_valid) 1819{ 1820 struct fs *fs; 1821 struct inode *ip; 1822 struct vnode *devvp = v, *vp = NULL; 1823 struct mount *mp = devvp->v_specmountpoint; 1824 struct snap_info *si; 1825 void *saved_data = NULL; 1826 daddr_t lbn, blkno, *snapblklist; 1827 uint32_t gen; 1828 int lower, upper, mid, snapshot_locked = 0, error = 0; 1829 1830 /* 1831 * Check for valid snapshots. 1832 */ 1833 si = VFSTOUFS(mp)->um_snapinfo; 1834 mutex_enter(&si->si_lock); 1835 ip = TAILQ_FIRST(&si->si_snapshots); 1836 if (ip == NULL) { 1837 mutex_exit(&si->si_lock); 1838 return 0; 1839 } 1840 /* 1841 * First check to see if it is after the file system or 1842 * in the preallocated list. 1843 * By doing this check we avoid several potential deadlocks. 1844 */ 1845 fs = ip->i_fs; 1846 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 1847 if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) { 1848 mutex_exit(&si->si_lock); 1849 return 0; 1850 } 1851 snapblklist = si->si_snapblklist; 1852 upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0); 1853 lower = 1; 1854 while (lower <= upper) { 1855 mid = (lower + upper) / 2; 1856 if (snapblklist[mid] == lbn) 1857 break; 1858 if (snapblklist[mid] < lbn) 1859 lower = mid + 1; 1860 else 1861 upper = mid - 1; 1862 } 1863 if (lower <= upper) { 1864 mutex_exit(&si->si_lock); 1865 return 0; 1866 } 1867 /* 1868 * Not in the precomputed list, so check the snapshots. 1869 */ 1870 if (si->si_owner != curlwp) { 1871 if (!mutex_tryenter(&si->si_snaplock)) { 1872 mutex_exit(&si->si_lock); 1873 mutex_enter(&si->si_snaplock); 1874 mutex_enter(&si->si_lock); 1875 } 1876 si->si_owner = curlwp; 1877 snapshot_locked = 1; 1878 } 1879 if (data_valid && bp->b_bcount == fs->fs_bsize) 1880 saved_data = bp->b_data; 1881retry: 1882 gen = si->si_gen; 1883 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1884 vp = ITOV(ip); 1885 /* 1886 * We ensure that everything of our own that needs to be 1887 * copied will be done at the time that ffs_snapshot is 1888 * called. Thus we can skip the check here which can 1889 * deadlock in doing the lookup in ffs_balloc. 1890 */ 1891 if (bp->b_vp == vp) 1892 continue; 1893 /* 1894 * Check to see if block needs to be copied. 1895 */ 1896 if (lbn < NDADDR) { 1897 blkno = db_get(ip, lbn); 1898 } else { 1899 mutex_exit(&si->si_lock); 1900 blkno = 0; /* XXX: GCC */ 1901 if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) { 1902 mutex_enter(&si->si_lock); 1903 break; 1904 } 1905 mutex_enter(&si->si_lock); 1906 if (gen != si->si_gen) 1907 goto retry; 1908 } 1909#ifdef DIAGNOSTIC 1910 if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 1911 panic("ffs_copyonwrite: bad copy block"); 1912#endif 1913 if (blkno != 0) 1914 continue; 1915 1916 if (curlwp == uvm.pagedaemon_lwp) { 1917 error = ENOMEM; 1918 break; 1919 } 1920 /* Only one level of recursion allowed. */ 1921 KASSERT(snapshot_locked); 1922 /* 1923 * Allocate the block into which to do the copy. Since 1924 * multiple processes may all try to copy the same block, 1925 * we have to recheck our need to do a copy if we sleep 1926 * waiting for the lock. 1927 * 1928 * Because all snapshots on a filesystem share a single 1929 * lock, we ensure that we will never be in competition 1930 * with another process to allocate a block. 1931 */ 1932#ifdef DEBUG 1933 if (snapdebug) { 1934 printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ", 1935 (unsigned long long)ip->i_number, lbn); 1936 if (bp->b_vp == devvp) 1937 printf("fs metadata"); 1938 else 1939 printf("inum %llu", (unsigned long long) 1940 VTOI(bp->b_vp)->i_number); 1941 printf(" lblkno %" PRId64 "\n", bp->b_lblkno); 1942 } 1943#endif 1944 /* 1945 * If we have already read the old block contents, then 1946 * simply copy them to the new block. Note that we need 1947 * to synchronously write snapshots that have not been 1948 * unlinked, and hence will be visible after a crash, 1949 * to ensure their integrity. 1950 */ 1951 mutex_exit(&si->si_lock); 1952 if (saved_data == NULL) { 1953 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1954 error = rwfsblk(vp, B_READ, saved_data, lbn); 1955 if (error) { 1956 free(saved_data, M_UFSMNT); 1957 saved_data = NULL; 1958 mutex_enter(&si->si_lock); 1959 break; 1960 } 1961 } 1962 error = wrsnapblk(vp, saved_data, lbn); 1963 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) 1964 error = syncsnap(vp); 1965 mutex_enter(&si->si_lock); 1966 if (error) 1967 break; 1968 if (gen != si->si_gen) 1969 goto retry; 1970 } 1971 /* 1972 * Note that we need to synchronously write snapshots that 1973 * have not been unlinked, and hence will be visible after 1974 * a crash, to ensure their integrity. 1975 */ 1976 if (snapshot_locked) { 1977 si->si_owner = NULL; 1978 mutex_exit(&si->si_lock); 1979 mutex_exit(&si->si_snaplock); 1980 } else 1981 mutex_exit(&si->si_lock); 1982 if (saved_data && saved_data != bp->b_data) 1983 free(saved_data, M_UFSMNT); 1984 return error; 1985} 1986 1987/* 1988 * Read from a snapshot. 1989 */ 1990int 1991ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag) 1992{ 1993 struct inode *ip = VTOI(vp); 1994 struct fs *fs = ip->i_fs; 1995 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo; 1996 struct buf *bp; 1997 daddr_t lbn, nextlbn; 1998 off_t fsbytes, bytesinfile; 1999 long size, xfersize, blkoffset; 2000 int error; 2001 2002 fstrans_start(vp->v_mount, FSTRANS_SHARED); 2003 mutex_enter(&si->si_snaplock); 2004 2005 if (ioflag & IO_ALTSEMANTICS) 2006 fsbytes = ip->i_size; 2007 else 2008 fsbytes = lfragtosize(fs, fs->fs_size); 2009 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 2010 bytesinfile = fsbytes - uio->uio_offset; 2011 if (bytesinfile <= 0) 2012 break; 2013 lbn = lblkno(fs, uio->uio_offset); 2014 nextlbn = lbn + 1; 2015 size = fs->fs_bsize; 2016 blkoffset = blkoff(fs, uio->uio_offset); 2017 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), 2018 bytesinfile); 2019 2020 if (lblktosize(fs, nextlbn + 1) >= fsbytes) { 2021 if (lblktosize(fs, lbn) + size > fsbytes) 2022 size = fragroundup(fs, 2023 fsbytes - lblktosize(fs, lbn)); 2024 error = bread(vp, lbn, size, NOCRED, 0, &bp); 2025 } else { 2026 int nextsize = fs->fs_bsize; 2027 error = breadn(vp, lbn, 2028 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); 2029 } 2030 if (error) 2031 break; 2032 2033 /* 2034 * We should only get non-zero b_resid when an I/O error 2035 * has occurred, which should cause us to break above. 2036 * However, if the short read did not cause an error, 2037 * then we want to ensure that we do not uiomove bad 2038 * or uninitialized data. 2039 */ 2040 size -= bp->b_resid; 2041 if (size < blkoffset + xfersize) { 2042 xfersize = size - blkoffset; 2043 if (xfersize <= 0) 2044 break; 2045 } 2046 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 2047 if (error) 2048 break; 2049 brelse(bp, BC_AGE); 2050 } 2051 if (bp != NULL) 2052 brelse(bp, BC_AGE); 2053 2054 mutex_exit(&si->si_snaplock); 2055 fstrans_done(vp->v_mount); 2056 return error; 2057} 2058 2059/* 2060 * Lookup a snapshots data block address. 2061 * Simpler than UFS_BALLOC() as we know all metadata is already allocated 2062 * and safe even for the pagedaemon where we cannot bread(). 2063 */ 2064static int 2065snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res) 2066{ 2067 struct indir indirs[NIADDR + 2]; 2068 struct inode *ip = VTOI(vp); 2069 struct fs *fs = ip->i_fs; 2070 struct buf *bp; 2071 int error, num; 2072 2073 KASSERT(lbn >= 0); 2074 2075 if (lbn < NDADDR) { 2076 *res = db_get(ip, lbn); 2077 return 0; 2078 } 2079 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 2080 return error; 2081 if (curlwp == uvm.pagedaemon_lwp) { 2082 mutex_enter(&bufcache_lock); 2083 bp = incore(vp, indirs[num-1].in_lbn); 2084 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) { 2085 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); 2086 error = 0; 2087 } else 2088 error = ENOMEM; 2089 mutex_exit(&bufcache_lock); 2090 return error; 2091 } 2092 error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp); 2093 if (error == 0) 2094 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); 2095 brelse(bp, 0); 2096 2097 return error; 2098} 2099 2100/* 2101 * Read or write the specified block of the filesystem vp resides on 2102 * from or to the disk bypassing the buffer cache. 2103 */ 2104static int 2105rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn) 2106{ 2107 int error; 2108 struct inode *ip = VTOI(vp); 2109 struct fs *fs = ip->i_fs; 2110 struct buf *nbp; 2111 2112 nbp = getiobuf(NULL, true); 2113 nbp->b_flags = flags; 2114 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 2115 nbp->b_error = 0; 2116 nbp->b_data = data; 2117 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 2118 nbp->b_proc = NULL; 2119 nbp->b_dev = ip->i_devvp->v_rdev; 2120 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ 2121 2122 bdev_strategy(nbp); 2123 2124 error = biowait(nbp); 2125 2126 putiobuf(nbp); 2127 2128 return error; 2129} 2130 2131/* 2132 * Write all dirty buffers to disk and invalidate them. 2133 */ 2134static int 2135syncsnap(struct vnode *vp) 2136{ 2137 int error; 2138 buf_t *bp; 2139 struct fs *fs = VTOI(vp)->i_fs; 2140 2141 mutex_enter(&bufcache_lock); 2142 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) { 2143 error = bbusy(bp, false, 0, NULL); 2144 if (error == EPASSTHROUGH) 2145 continue; 2146 else if (error != 0) { 2147 mutex_exit(&bufcache_lock); 2148 return error; 2149 } 2150 KASSERT(bp->b_bcount == fs->fs_bsize); 2151 mutex_exit(&bufcache_lock); 2152 error = rwfsblk(vp, B_WRITE, bp->b_data, 2153 fragstoblks(fs, dbtofsb(fs, bp->b_blkno))); 2154 brelse(bp, BC_INVAL | BC_VFLUSH); 2155 if (error) 2156 return error; 2157 mutex_enter(&bufcache_lock); 2158 } 2159 mutex_exit(&bufcache_lock); 2160 2161 return 0; 2162} 2163 2164/* 2165 * Write the specified block to a snapshot. 2166 */ 2167static int 2168wrsnapblk(struct vnode *vp, void *data, daddr_t lbn) 2169{ 2170 struct inode *ip = VTOI(vp); 2171 struct fs *fs = ip->i_fs; 2172 struct buf *bp; 2173 int error; 2174 2175 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, 2176 FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp); 2177 if (error) 2178 return error; 2179 memcpy(bp->b_data, data, fs->fs_bsize); 2180 if (ip->i_nlink > 0) 2181 error = bwrite(bp); 2182 else 2183 bawrite(bp); 2184 2185 return error; 2186} 2187 2188/* 2189 * Check if this inode is present on the active snapshot list. 2190 * Must be called with snapinfo locked. 2191 */ 2192static inline bool 2193is_active_snapshot(struct snap_info *si, struct inode *ip) 2194{ 2195 struct inode *xp; 2196 2197 KASSERT(mutex_owned(&si->si_lock)); 2198 2199 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) 2200 if (xp == ip) 2201 return true; 2202 return false; 2203} 2204 2205/* 2206 * Get/Put direct block from inode or buffer containing disk addresses. Take 2207 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2208 * into a global include. 2209 */ 2210static inline daddr_t 2211db_get(struct inode *ip, int loc) 2212{ 2213 if (ip->i_ump->um_fstype == UFS1) 2214 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2215 else 2216 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2217} 2218 2219static inline void 2220db_assign(struct inode *ip, int loc, daddr_t val) 2221{ 2222 if (ip->i_ump->um_fstype == UFS1) 2223 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2224 else 2225 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2226} 2227 2228static inline daddr_t 2229ib_get(struct inode *ip, int loc) 2230{ 2231 if (ip->i_ump->um_fstype == UFS1) 2232 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip)); 2233 else 2234 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip)); 2235} 2236 2237static inline void 2238ib_assign(struct inode *ip, int loc, daddr_t val) 2239{ 2240 if (ip->i_ump->um_fstype == UFS1) 2241 ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2242 else 2243 ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2244} 2245 2246static inline daddr_t 2247idb_get(struct inode *ip, void *bf, int loc) 2248{ 2249 if (ip->i_ump->um_fstype == UFS1) 2250 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); 2251 else 2252 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); 2253} 2254 2255static inline void 2256idb_assign(struct inode *ip, void *bf, int loc, daddr_t val) 2257{ 2258 if (ip->i_ump->um_fstype == UFS1) 2259 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2260 else 2261 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2262} 2263