ffs_snapshot.c revision 1.90
1/* $NetBSD: ffs_snapshot.c,v 1.90 2009/01/03 15:29:08 hannken Exp $ */ 2 3/* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40#include <sys/cdefs.h> 41__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.90 2009/01/03 15:29:08 hannken Exp $"); 42 43#if defined(_KERNEL_OPT) 44#include "opt_ffs.h" 45#endif 46 47#include <sys/param.h> 48#include <sys/kernel.h> 49#include <sys/systm.h> 50#include <sys/conf.h> 51#include <sys/buf.h> 52#include <sys/proc.h> 53#include <sys/namei.h> 54#include <sys/sched.h> 55#include <sys/stat.h> 56#include <sys/malloc.h> 57#include <sys/mount.h> 58#include <sys/resource.h> 59#include <sys/resourcevar.h> 60#include <sys/vnode.h> 61#include <sys/kauth.h> 62#include <sys/fstrans.h> 63#include <sys/wapbl.h> 64 65#include <miscfs/specfs/specdev.h> 66 67#include <ufs/ufs/quota.h> 68#include <ufs/ufs/ufsmount.h> 69#include <ufs/ufs/inode.h> 70#include <ufs/ufs/ufs_extern.h> 71#include <ufs/ufs/ufs_bswap.h> 72#include <ufs/ufs/ufs_wapbl.h> 73 74#include <ufs/ffs/fs.h> 75#include <ufs/ffs/ffs_extern.h> 76 77#include <uvm/uvm.h> 78 79#if !defined(FFS_NO_SNAPSHOT) 80typedef int (*acctfunc_t) 81 (struct vnode *, void *, int, int, struct fs *, daddr_t, int); 82 83static int snapshot_setup(struct mount *, struct vnode *); 84static int snapshot_copyfs(struct mount *, struct vnode *, void **); 85static int snapshot_expunge(struct mount *, struct vnode *, 86 struct fs *, daddr_t *, daddr_t **); 87static int snapshot_expunge_snap(struct mount *, struct vnode *, 88 struct fs *, daddr_t); 89static int snapshot_writefs(struct mount *, struct vnode *, void *); 90static int cgaccount(struct vnode *, int, int *); 91static int cgaccount1(int, struct vnode *, void *, int); 92static int expunge(struct vnode *, struct inode *, struct fs *, 93 acctfunc_t, int); 94static int indiracct(struct vnode *, struct vnode *, int, daddr_t, 95 daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int); 96static int fullacct(struct vnode *, void *, int, int, 
struct fs *, 97 daddr_t, int); 98static int snapacct(struct vnode *, void *, int, int, struct fs *, 99 daddr_t, int); 100static int mapacct(struct vnode *, void *, int, int, struct fs *, 101 daddr_t, int); 102#endif /* !defined(FFS_NO_SNAPSHOT) */ 103 104static int ffs_copyonwrite(void *, struct buf *, bool); 105static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); 106static int rwfsblk(struct vnode *, int, void *, daddr_t); 107static int syncsnap(struct vnode *); 108static int wrsnapblk(struct vnode *, void *, daddr_t); 109 110static inline daddr_t db_get(struct inode *, int); 111static inline void db_assign(struct inode *, int, daddr_t); 112static inline daddr_t ib_get(struct inode *, int); 113static inline void ib_assign(struct inode *, int, daddr_t); 114static inline daddr_t idb_get(struct inode *, void *, int); 115static inline void idb_assign(struct inode *, void *, int, daddr_t); 116 117struct snap_info { 118 kmutex_t si_lock; /* Lock this snapinfo */ 119 kmutex_t si_snaplock; /* Snapshot vnode common lock */ 120 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 121 daddr_t *si_snapblklist; /* Snapshot block hints list */ 122 uint32_t si_gen; /* Incremented on change */ 123}; 124 125#ifdef DEBUG 126static int snapdebug = 0; 127#endif 128 129int 130ffs_snapshot_init(struct ufsmount *ump) 131{ 132 struct snap_info *si; 133 134 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); 135 if (si == NULL) 136 return ENOMEM; 137 138 TAILQ_INIT(&si->si_snapshots); 139 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); 140 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); 141 si->si_gen = 0; 142 si->si_snapblklist = NULL; 143 144 return 0; 145} 146 147void 148ffs_snapshot_fini(struct ufsmount *ump) 149{ 150 struct snap_info *si; 151 152 si = ump->um_snapinfo; 153 ump->um_snapinfo = NULL; 154 155 KASSERT(TAILQ_EMPTY(&si->si_snapshots)); 156 mutex_destroy(&si->si_lock); 157 mutex_destroy(&si->si_snaplock); 158 
KASSERT(si->si_snapblklist == NULL); 159 kmem_free(si, sizeof(*si)); 160} 161 162/* 163 * Create a snapshot file and initialize it for the filesystem. 164 * Vnode is locked on entry and return. 165 */ 166int 167ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) 168{ 169#if defined(FFS_NO_SNAPSHOT) 170 return EOPNOTSUPP; 171} 172#else /* defined(FFS_NO_SNAPSHOT) */ 173 bool suspended = false; 174 bool snapshot_locked = false; 175 int error, redo = 0, snaploc; 176 void *sbbuf = NULL; 177 daddr_t *snaplist = NULL, snaplistsize = 0; 178 struct buf *bp, *nbp; 179 struct fs *copy_fs, *fs = VFSTOUFS(mp)->um_fs; 180 struct inode *ip = VTOI(vp); 181 struct lwp *l = curlwp; 182 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 183 struct timespec ts; 184 struct timeval starttime; 185#ifdef DEBUG 186 struct timeval endtime; 187#endif 188 struct vnode *devvp = ip->i_devvp; 189 190 /* 191 * If the vnode already is a snapshot, return. 192 */ 193 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 194 if (ctime) { 195 ctime->tv_sec = DIP(VTOI(vp), mtime); 196 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 197 } 198 return 0; 199 } 200 /* 201 * Check for free snapshot slot in the superblock. 202 */ 203 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 204 if (fs->fs_snapinum[snaploc] == 0) 205 break; 206 if (snaploc == FSMAXSNAP) 207 return (ENOSPC); 208 /* 209 * Prepare the vnode to become a snapshot. 210 */ 211 error = snapshot_setup(mp, vp); 212 if (error) 213 goto out; 214 /* 215 * Change inode to snapshot type file. 216 */ 217 ip->i_flags |= SF_SNAPSHOT; 218 DIP_ASSIGN(ip, flags, ip->i_flags); 219 ip->i_flag |= IN_CHANGE | IN_UPDATE; 220 /* 221 * Copy all the cylinder group maps. Although the 222 * filesystem is still active, we hope that only a few 223 * cylinder groups will change between now and when we 224 * suspend operations. Thus, we will be able to quickly 225 * touch up the few cylinder groups that changed during 226 * the suspension period. 
227 */ 228 error = cgaccount(vp, 1, NULL); 229 if (error) 230 goto out; 231 /* 232 * Ensure that the snapshot is completely on disk. 233 * Since we have marked it as a snapshot it is safe to 234 * unlock it as no process will be allowed to write to it. 235 */ 236 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 237 if (error) 238 goto out; 239 VOP_UNLOCK(vp, 0); 240 /* 241 * All allocations are done, so we can now suspend the filesystem. 242 */ 243 error = vfs_suspend(vp->v_mount, 0); 244 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 245 if (error) 246 goto out; 247 suspended = true; 248 getmicrotime(&starttime); 249 /* 250 * First, copy all the cylinder group maps that have changed. 251 */ 252 error = cgaccount(vp, 2, &redo); 253 if (error) 254 goto out; 255 /* 256 * Create a copy of the superblock and its summary information. 257 */ 258 error = snapshot_copyfs(mp, vp, &sbbuf); 259 copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); 260 if (error) 261 goto out; 262 /* 263 * Expunge unlinked files from our view. 264 */ 265 error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist); 266 if (error) 267 goto out; 268 /* 269 * Acquire the snapshot lock. 270 */ 271 mutex_enter(&si->si_snaplock); 272 snapshot_locked = true; 273 /* 274 * Record snapshot inode. Since this is the newest snapshot, 275 * it must be placed at the end of the list. 276 */ 277 fs->fs_snapinum[snaploc] = ip->i_number; 278 279 mutex_enter(&si->si_lock); 280 if (ip->i_nextsnap.tqe_prev != 0) 281 panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number); 282 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 283 if (TAILQ_FIRST(&si->si_snapshots) == ip) { 284 /* 285 * If this is the first snapshot on this filesystem, put the 286 * preliminary list in place and establish the cow handler. 
287 */ 288 si->si_snapblklist = snaplist; 289 fscow_establish(mp, ffs_copyonwrite, devvp); 290 } 291 si->si_gen++; 292 mutex_exit(&si->si_lock); 293 294 vp->v_vflag |= VV_SYSTEM; 295 /* 296 * Set the mtime to the time the snapshot has been taken. 297 */ 298 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 299 if (ctime) 300 *ctime = ts; 301 DIP_ASSIGN(ip, mtime, ts.tv_sec); 302 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 303 ip->i_flag |= IN_CHANGE | IN_UPDATE; 304 /* 305 * Copy allocation information from all snapshots and then 306 * expunge them from our view. 307 */ 308 error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize); 309 if (error) 310 goto out; 311 /* 312 * Write the superblock and its summary information to the snapshot. 313 */ 314 error = snapshot_writefs(mp, vp, sbbuf); 315 if (error) 316 goto out; 317 /* 318 * We're nearly done, ensure that the snapshot is completely on disk. 319 */ 320 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 321 if (error) 322 goto out; 323 /* 324 * Invalidate and free all pages on the snapshot vnode. 325 * We will read and write through the buffercache. 326 */ 327 mutex_enter(&vp->v_interlock); 328 error = VOP_PUTPAGES(vp, 0, 0, 329 PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE); 330 if (error) 331 goto out; 332 /* 333 * Invalidate short ( < fs_bsize ) buffers. We will always read 334 * full size buffers later. 
335 */ 336 mutex_enter(&bufcache_lock); 337 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); 338 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 339 nbp = LIST_NEXT(bp, b_vnbufs); 340 KASSERT((bp->b_cflags & BC_BUSY) == 0); 341 if (bp->b_bcount < fs->fs_bsize) { 342 bp->b_cflags |= BC_BUSY; 343 brelsel(bp, BC_INVAL | BC_VFLUSH); 344 } 345 } 346 mutex_exit(&bufcache_lock); 347 348out: 349 if (sbbuf != NULL) { 350 free(copy_fs->fs_csp, M_UFSMNT); 351 free(sbbuf, M_UFSMNT); 352 } 353 if (fs->fs_active != NULL) { 354 free(fs->fs_active, M_DEVBUF); 355 fs->fs_active = NULL; 356 } 357 358 mutex_enter(&si->si_lock); 359 if (snaplist != NULL) { 360 if (si->si_snapblklist == snaplist) 361 si->si_snapblklist = NULL; 362 free(snaplist, M_UFSMNT); 363 } 364 if (error) { 365 fs->fs_snapinum[snaploc] = 0; 366 } else { 367 /* 368 * As this is the newest list, it is the most inclusive, so 369 * should replace the previous list. 370 */ 371 si->si_snapblklist = ip->i_snapblklist; 372 } 373 si->si_gen++; 374 mutex_exit(&si->si_lock); 375 376 if (snapshot_locked) 377 mutex_exit(&si->si_snaplock); 378 if (suspended) { 379 vfs_resume(vp->v_mount); 380#ifdef DEBUG 381 getmicrotime(&endtime); 382 timersub(&endtime, &starttime, &endtime); 383 printf("%s: suspended %ld.%03ld sec, redo %d of %d\n", 384 mp->mnt_stat.f_mntonname, (long)endtime.tv_sec, 385 endtime.tv_usec / 1000, redo, fs->fs_ncg); 386#endif 387 } 388 if (error) { 389 if (!UFS_WAPBL_BEGIN(mp)) { 390 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 391 UFS_WAPBL_END(mp); 392 } 393 } else 394 vref(vp); 395 return (error); 396} 397 398/* 399 * Prepare vnode to become a snapshot. 400 */ 401static int 402snapshot_setup(struct mount *mp, struct vnode *vp) 403{ 404 int error, i, len, loc; 405 daddr_t blkno, numblks; 406 struct buf *ibp, *nbp; 407 struct fs *fs = VFSTOUFS(mp)->um_fs; 408 struct lwp *l = curlwp; 409 410 /* 411 * Check mount, exclusive reference and owner. 
412 */ 413 if (vp->v_mount != mp) 414 return EXDEV; 415 if (vp->v_usecount != 1 || vp->v_writecount != 0) 416 return EBUSY; 417 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 418 NULL) != 0 && 419 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred)) 420 return EACCES; 421 422 if (vp->v_size != 0) { 423 error = ffs_truncate(vp, 0, 0, NOCRED); 424 if (error) 425 return error; 426 } 427 /* 428 * Write an empty list of preallocated blocks to the end of 429 * the snapshot to set size to at least that of the filesystem. 430 */ 431 numblks = howmany(fs->fs_size, fs->fs_frag); 432 blkno = 1; 433 blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs)); 434 error = vn_rdwr(UIO_WRITE, vp, 435 (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 436 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 437 if (error) 438 return error; 439 /* 440 * Preallocate critical data structures so that we can copy 441 * them in without further allocation after we suspend all 442 * operations on the filesystem. We would like to just release 443 * the allocated buffers without writing them since they will 444 * be filled in below once we are ready to go, but this upsets 445 * the soft update code, so we go ahead and write the new buffers. 446 * 447 * Allocate all indirect blocks and mark all of them as not 448 * needing to be copied. 449 */ 450 error = UFS_WAPBL_BEGIN(mp); 451 if (error) 452 return error; 453 for (blkno = NDADDR, i = 0; blkno < numblks; blkno += NINDIR(fs)) { 454 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 455 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 456 if (error) 457 goto out; 458 if (DOINGSOFTDEP(vp)) 459 bawrite(ibp); 460 else 461 brelse(ibp, 0); 462 if ((++i % 16) == 0) { 463 UFS_WAPBL_END(mp); 464 error = UFS_WAPBL_BEGIN(mp); 465 if (error) 466 return error; 467 } 468 } 469 /* 470 * Allocate copies for the superblock and its summary information. 
471 */ 472 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred, 473 0, &nbp); 474 if (error) 475 goto out; 476 bawrite(nbp); 477 blkno = fragstoblks(fs, fs->fs_csaddr); 478 len = howmany(fs->fs_cssize, fs->fs_bsize); 479 for (loc = 0; loc < len; loc++) { 480 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), 481 fs->fs_bsize, l->l_cred, 0, &nbp); 482 if (error) 483 goto out; 484 bawrite(nbp); 485 } 486 487out: 488 UFS_WAPBL_END(mp); 489 return error; 490} 491 492/* 493 * Create a copy of the superblock and its summary information. 494 * It is up to the caller to free copyfs and copy_fs->fs_csp. 495 */ 496static int 497snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf) 498{ 499 int error, i, len, loc, size; 500 void *space; 501 int32_t *lp; 502 struct buf *bp; 503 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 504 struct lwp *l = curlwp; 505 struct vnode *devvp = VTOI(vp)->i_devvp; 506 507 /* 508 * Grab a copy of the superblock and its summary information. 509 * We delay writing it until the suspension is released below. 510 */ 511 *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 512 loc = blkoff(fs, fs->fs_sblockloc); 513 if (loc > 0) 514 memset(*sbbuf, 0, loc); 515 copyfs = (struct fs *)((char *)(*sbbuf) + loc); 516 bcopy(fs, copyfs, fs->fs_sbsize); 517 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 518 if (fs->fs_sbsize < size) 519 memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, 520 size - fs->fs_sbsize); 521 size = blkroundup(fs, fs->fs_cssize); 522 if (fs->fs_contigsumsize > 0) 523 size += fs->fs_ncg * sizeof(int32_t); 524 space = malloc(size, M_UFSMNT, M_WAITOK); 525 copyfs->fs_csp = space; 526 bcopy(fs->fs_csp, copyfs->fs_csp, fs->fs_cssize); 527 space = (char *)space + fs->fs_cssize; 528 loc = howmany(fs->fs_cssize, fs->fs_fsize); 529 i = fs->fs_frag - loc % fs->fs_frag; 530 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 531 if (len > 0) { 532 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 533 len, l->l_cred, 0, &bp)) != 0) { 534 brelse(bp, 0); 535 free(copyfs->fs_csp, M_UFSMNT); 536 free(*sbbuf, M_UFSMNT); 537 *sbbuf = NULL; 538 return error; 539 } 540 bcopy(bp->b_data, space, (u_int)len); 541 space = (char *)space + len; 542 brelse(bp, BC_INVAL | BC_NOCACHE); 543 } 544 if (fs->fs_contigsumsize > 0) { 545 copyfs->fs_maxcluster = lp = space; 546 for (i = 0; i < fs->fs_ncg; i++) 547 *lp++ = fs->fs_contigsumsize; 548 } 549 if (mp->mnt_wapbl) 550 copyfs->fs_flags &= ~FS_DOWAPBL; 551 return 0; 552} 553 554/* 555 * We must check for active files that have been unlinked (e.g., with a zero 556 * link count). We have to expunge all trace of these files from the snapshot 557 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped. 558 * Note that we skip unlinked snapshot files as they will be handled separately. 559 * Calculate the snapshot list size and create a preliminary list. 560 */ 561static int 562snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs, 563 daddr_t *snaplistsize, daddr_t **snaplist) 564{ 565 bool has_wapbl = false; 566 int cg, error, len, loc; 567 daddr_t blkno, *blkp; 568 struct fs *fs = VFSTOUFS(mp)->um_fs; 569 struct inode *xp; 570 struct lwp *l = curlwp; 571 struct vattr vat; 572 struct vnode *logvp = NULL, *mvp = NULL, *xvp; 573 574 *snaplist = NULL; 575 /* 576 * Get the log inode if any. 577 */ 578 if ((fs->fs_flags & FS_DOWAPBL) && 579 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { 580 error = VFS_VGET(mp, 581 fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp); 582 if (error) 583 goto out; 584 } 585 /* 586 * Allocate a marker vnode. 587 */ 588 if ((mvp = vnalloc(mp)) == NULL) { 589 error = ENOMEM; 590 goto out; 591 } 592 /* 593 * We also calculate the needed size for the snapshot list. 
594 */ 595 *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 596 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 597 error = UFS_WAPBL_BEGIN(mp); 598 if (error) 599 goto out; 600 has_wapbl = true; 601 mutex_enter(&mntvnode_lock); 602 /* 603 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 604 * and vclean() can be called indirectly 605 */ 606 for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { 607 vmark(mvp, xvp); 608 /* 609 * Make sure this vnode wasn't reclaimed in getnewvnode(). 610 * Start over if it has (it won't be on the list anymore). 611 */ 612 if (xvp->v_mount != mp || vismarker(xvp)) 613 continue; 614 mutex_enter(&xvp->v_interlock); 615 if ((xvp->v_iflag & VI_XLOCK) || 616 xvp->v_usecount == 0 || xvp->v_type == VNON || 617 VTOI(xvp) == NULL || 618 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 619 mutex_exit(&xvp->v_interlock); 620 continue; 621 } 622 mutex_exit(&mntvnode_lock); 623 /* 624 * XXXAD should increase vnode ref count to prevent it 625 * disappearing or being recycled. 626 */ 627 mutex_exit(&xvp->v_interlock); 628#ifdef DEBUG 629 if (snapdebug) 630 vprint("ffs_snapshot: busy vnode", xvp); 631#endif 632 xp = VTOI(xvp); 633 if (xvp != logvp) { 634 if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && 635 vat.va_nlink > 0) { 636 mutex_enter(&mntvnode_lock); 637 continue; 638 } 639 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 640 mutex_enter(&mntvnode_lock); 641 continue; 642 } 643 } 644 /* 645 * If there is a fragment, clear it here. 
646 */ 647 blkno = 0; 648 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 649 if (loc < NDADDR) { 650 len = fragroundup(fs, blkoff(fs, xp->i_size)); 651 if (len > 0 && len < fs->fs_bsize) { 652 ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc), 653 len, xp->i_number); 654 blkno = db_get(xp, loc); 655 db_assign(xp, loc, 0); 656 } 657 } 658 *snaplistsize += 1; 659 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); 660 if (blkno) 661 db_assign(xp, loc, blkno); 662 if (!error) 663 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, 664 xp->i_mode); 665 if (error) { 666 (void)vunmark(mvp); 667 goto out; 668 } 669 mutex_enter(&mntvnode_lock); 670 } 671 mutex_exit(&mntvnode_lock); 672 /* 673 * Create a preliminary list of preallocated snapshot blocks. 674 */ 675 *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 676 blkp = &(*snaplist)[1]; 677 *blkp++ = lblkno(fs, fs->fs_sblockloc); 678 blkno = fragstoblks(fs, fs->fs_csaddr); 679 for (cg = 0; cg < fs->fs_ncg; cg++) { 680 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 681 break; 682 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 683 } 684 len = howmany(fs->fs_cssize, fs->fs_bsize); 685 for (loc = 0; loc < len; loc++) 686 *blkp++ = blkno + loc; 687 for (; cg < fs->fs_ncg; cg++) 688 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 689 690out: 691 if (has_wapbl) 692 UFS_WAPBL_END(mp); 693 if (mvp != NULL) 694 vnfree(mvp); 695 if (logvp != NULL) 696 vput(logvp); 697 if (error && *snaplist != NULL) { 698 free(*snaplist, M_UFSMNT); 699 *snaplist = NULL; 700 } 701 702 return error; 703} 704 705/* 706 * Copy allocation information from all the snapshots in this snapshot and 707 * then expunge them from its view. Also, collect the list of allocated 708 * blocks in i_snapblklist. 
709 */ 710static int 711snapshot_expunge_snap(struct mount *mp, struct vnode *vp, 712 struct fs *copy_fs, daddr_t snaplistsize) 713{ 714 int error, i; 715 daddr_t numblks, *snaplist = NULL; 716 struct fs *fs = VFSTOUFS(mp)->um_fs; 717 struct inode *ip = VTOI(vp), *xp; 718 struct lwp *l = curlwp; 719 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 720 721 error = UFS_WAPBL_BEGIN(mp); 722 if (error) 723 return error; 724 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { 725 if (xp == ip) 726 break; 727 error = expunge(vp, xp, fs, snapacct, BLK_SNAP); 728 if (error) 729 break; 730 if (xp->i_ffs_effnlink != 0) 731 continue; 732 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); 733 if (error) 734 break; 735 } 736 if (error) 737 goto out; 738 /* 739 * Allocate space for the full list of preallocated snapshot blocks. 740 */ 741 snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 742 ip->i_snapblklist = &snaplist[1]; 743 /* 744 * Expunge the blocks used by the snapshots from the set of 745 * blocks marked as used in the snapshot bitmaps. Also, collect 746 * the list of allocated blocks in i_snapblklist. 747 */ 748 error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); 749 if (error) 750 goto out; 751 if (snaplistsize < ip->i_snapblklist - snaplist) 752 panic("ffs_snapshot: list too small"); 753 snaplistsize = ip->i_snapblklist - snaplist; 754 snaplist[0] = snaplistsize; 755 ip->i_snapblklist = &snaplist[0]; 756 /* 757 * Write out the list of allocated blocks to the end of the snapshot. 
758 */ 759 numblks = howmany(fs->fs_size, fs->fs_frag); 760 for (i = 0; i < snaplistsize; i++) 761 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 762 error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist, 763 snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks), 764 UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED | IO_UNIT, 765 l->l_cred, NULL, NULL); 766 for (i = 0; i < snaplistsize; i++) 767 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 768out: 769 UFS_WAPBL_END(mp); 770 if (error && snaplist != NULL) { 771 free(snaplist, M_UFSMNT); 772 ip->i_snapblklist = NULL; 773 } 774 return error; 775} 776 777/* 778 * Write the superblock and its summary information to the snapshot. 779 * Make sure, the first NDADDR blocks get copied to the snapshot. 780 */ 781static int 782snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf) 783{ 784 int error, len, loc; 785 void *space; 786 daddr_t blkno; 787 struct buf *bp; 788 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 789 struct inode *ip = VTOI(vp); 790 struct lwp *l = curlwp; 791 792 copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); 793 794 /* 795 * Write the superblock and its summary information 796 * to the snapshot. 
797 */ 798 blkno = fragstoblks(fs, fs->fs_csaddr); 799 len = howmany(fs->fs_cssize, fs->fs_bsize); 800 space = copyfs->fs_csp; 801#ifdef FFS_EI 802 if (UFS_FSNEEDSWAP(fs)) { 803 ffs_sb_swap(copyfs, copyfs); 804 ffs_csum_swap(space, space, fs->fs_cssize); 805 } 806#endif 807 error = UFS_WAPBL_BEGIN(mp); 808 if (error) 809 return error; 810 for (loc = 0; loc < len; loc++) { 811 error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred, 812 B_MODIFY, &bp); 813 if (error) { 814 brelse(bp, 0); 815 break; 816 } 817 bcopy(space, bp->b_data, fs->fs_bsize); 818 space = (char *)space + fs->fs_bsize; 819 bawrite(bp); 820 } 821 if (error) 822 goto out; 823 error = bread(vp, lblkno(fs, fs->fs_sblockloc), 824 fs->fs_bsize, l->l_cred, B_MODIFY, &bp); 825 if (error) { 826 brelse(bp, 0); 827 goto out; 828 } else { 829 bcopy(sbbuf, bp->b_data, fs->fs_bsize); 830 bawrite(bp); 831 } 832 /* 833 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite() 834 * and ffs_snapblkfree() will always work on indirect blocks. 835 */ 836 for (loc = 0; loc < NDADDR; loc++) { 837 if (db_get(ip, loc) != 0) 838 continue; 839 error = ffs_balloc(vp, lblktosize(fs, (off_t)loc), 840 fs->fs_bsize, l->l_cred, 0, &bp); 841 if (error) 842 break; 843 error = rwfsblk(vp, B_READ, bp->b_data, loc); 844 if (error) { 845 brelse(bp, 0); 846 break; 847 } 848 bawrite(bp); 849 } 850 851out: 852 UFS_WAPBL_END(mp); 853 return error; 854} 855 856/* 857 * Copy all cylinder group maps. 
858 */ 859static int 860cgaccount(struct vnode *vp, int passno, int *redo) 861{ 862 int cg, error; 863 struct buf *nbp; 864 struct fs *fs = VTOI(vp)->i_fs; 865 866 error = UFS_WAPBL_BEGIN(vp->v_mount); 867 if (error) 868 return error; 869 if (redo != NULL) 870 *redo = 0; 871 if (passno == 1) 872 fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY), 873 M_DEVBUF, M_WAITOK | M_ZERO); 874 for (cg = 0; cg < fs->fs_ncg; cg++) { 875 if (passno == 2 && ACTIVECG_ISSET(fs, cg)) 876 continue; 877 if (redo != NULL) 878 *redo += 1; 879 error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 880 fs->fs_bsize, curlwp->l_cred, 0, &nbp); 881 if (error) 882 break; 883 error = cgaccount1(cg, vp, nbp->b_data, passno); 884 bawrite(nbp); 885 if (error) 886 break; 887 } 888 UFS_WAPBL_END(vp->v_mount); 889 return error; 890} 891 892/* 893 * Copy a cylinder group map. All the unallocated blocks are marked 894 * BLK_NOCOPY so that the snapshot knows that it need not copy them 895 * if they are later written. If passno is one, then this is a first 896 * pass, so only setting needs to be done. If passno is 2, then this 897 * is a revision to a previous pass which must be undone as the 898 * replacement pass is done. 
899 */ 900static int 901cgaccount1(int cg, struct vnode *vp, void *data, int passno) 902{ 903 struct buf *bp, *ibp; 904 struct inode *ip; 905 struct cg *cgp; 906 struct fs *fs; 907 struct lwp *l = curlwp; 908 daddr_t base, numblks; 909 int error, len, loc, ns, indiroff; 910 911 ip = VTOI(vp); 912 fs = ip->i_fs; 913 ns = UFS_FSNEEDSWAP(fs); 914 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 915 (int)fs->fs_cgsize, l->l_cred, 0, &bp); 916 if (error) { 917 brelse(bp, 0); 918 return (error); 919 } 920 cgp = (struct cg *)bp->b_data; 921 if (!cg_chkmagic(cgp, ns)) { 922 brelse(bp, 0); 923 return (EIO); 924 } 925 ACTIVECG_SET(fs, cg); 926 927 bcopy(bp->b_data, data, fs->fs_cgsize); 928 brelse(bp, 0); 929 if (fs->fs_cgsize < fs->fs_bsize) 930 memset((char *)data + fs->fs_cgsize, 0, 931 fs->fs_bsize - fs->fs_cgsize); 932 numblks = howmany(fs->fs_size, fs->fs_frag); 933 len = howmany(fs->fs_fpg, fs->fs_frag); 934 base = cg * fs->fs_fpg / fs->fs_frag; 935 if (base + len >= numblks) 936 len = numblks - base - 1; 937 loc = 0; 938 if (base < NDADDR) { 939 for ( ; loc < NDADDR; loc++) { 940 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 941 db_assign(ip, loc, BLK_NOCOPY); 942 else if (db_get(ip, loc) == BLK_NOCOPY) { 943 if (passno == 2) 944 db_assign(ip, loc, 0); 945 else if (passno == 1) 946 panic("ffs_snapshot: lost direct block"); 947 } 948 } 949 } 950 if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), 951 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 952 return (error); 953 indiroff = (base + loc - NDADDR) % NINDIR(fs); 954 for ( ; loc < len; loc++, indiroff++) { 955 if (indiroff >= NINDIR(fs)) { 956 bawrite(ibp); 957 if ((error = ffs_balloc(vp, 958 lblktosize(fs, (off_t)(base + loc)), 959 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 960 return (error); 961 indiroff = 0; 962 } 963 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 964 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 965 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) 
{ 966 if (passno == 2) 967 idb_assign(ip, ibp->b_data, indiroff, 0); 968 else if (passno == 1) 969 panic("ffs_snapshot: lost indirect block"); 970 } 971 } 972 bdwrite(ibp); 973 return (0); 974} 975 976/* 977 * Before expunging a snapshot inode, note all the 978 * blocks that it claims with BLK_SNAP so that fsck will 979 * be able to account for those blocks properly and so 980 * that this snapshot knows that it need not copy them 981 * if the other snapshot holding them is freed. 982 */ 983static int 984expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 985 acctfunc_t acctfunc, int expungetype) 986{ 987 int i, error, ns; 988 daddr_t lbn, rlbn; 989 daddr_t len, blkno, numblks, blksperindir; 990 struct ufs1_dinode *dip1; 991 struct ufs2_dinode *dip2; 992 struct lwp *l = curlwp; 993 void *bap; 994 struct buf *bp; 995 996 ns = UFS_FSNEEDSWAP(fs); 997 /* 998 * Prepare to expunge the inode. If its inode block has not 999 * yet been copied, then allocate and fill the copy. 1000 */ 1001 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1002 error = snapblkaddr(snapvp, lbn, &blkno); 1003 if (error) 1004 return error; 1005 if (blkno != 0) { 1006 error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred, 1007 B_MODIFY, &bp); 1008 } else { 1009 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 1010 fs->fs_bsize, l->l_cred, 0, &bp); 1011 if (! error) 1012 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); 1013 } 1014 if (error) 1015 return error; 1016 /* 1017 * Set a snapshot inode to be a zero length file, regular files 1018 * or unlinked snapshots to be completely unallocated. 
1019 */ 1020 if (fs->fs_magic == FS_UFS1_MAGIC) { 1021 dip1 = (struct ufs1_dinode *)bp->b_data + 1022 ino_to_fsbo(fs, cancelip->i_number); 1023 if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0) 1024 dip1->di_mode = 0; 1025 dip1->di_size = 0; 1026 dip1->di_blocks = 0; 1027 dip1->di_flags = 1028 ufs_rw32(ufs_rw32(dip1->di_flags, ns) & ~SF_SNAPSHOT, ns); 1029 bzero(&dip1->di_db[0], (NDADDR + NIADDR) * sizeof(int32_t)); 1030 } else { 1031 dip2 = (struct ufs2_dinode *)bp->b_data + 1032 ino_to_fsbo(fs, cancelip->i_number); 1033 if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0) 1034 dip2->di_mode = 0; 1035 dip2->di_size = 0; 1036 dip2->di_blocks = 0; 1037 dip2->di_flags = 1038 ufs_rw32(ufs_rw32(dip2->di_flags, ns) & ~SF_SNAPSHOT, ns); 1039 bzero(&dip2->di_db[0], (NDADDR + NIADDR) * sizeof(int64_t)); 1040 } 1041 bdwrite(bp); 1042 /* 1043 * Now go through and expunge all the blocks in the file 1044 * using the function requested. 1045 */ 1046 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1047 if (fs->fs_magic == FS_UFS1_MAGIC) 1048 bap = &cancelip->i_ffs1_db[0]; 1049 else 1050 bap = &cancelip->i_ffs2_db[0]; 1051 if ((error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype))) 1052 return (error); 1053 if (fs->fs_magic == FS_UFS1_MAGIC) 1054 bap = &cancelip->i_ffs1_ib[0]; 1055 else 1056 bap = &cancelip->i_ffs2_ib[0]; 1057 if ((error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype))) 1058 return (error); 1059 blksperindir = 1; 1060 lbn = -NDADDR; 1061 len = numblks - NDADDR; 1062 rlbn = NDADDR; 1063 for (i = 0; len > 0 && i < NIADDR; i++) { 1064 error = indiracct(snapvp, ITOV(cancelip), i, 1065 ib_get(cancelip, i), lbn, rlbn, len, 1066 blksperindir, fs, acctfunc, expungetype); 1067 if (error) 1068 return (error); 1069 blksperindir *= NINDIR(fs); 1070 lbn -= blksperindir + 1; 1071 len -= blksperindir; 1072 rlbn += blksperindir; 1073 } 1074 return (0); 1075} 1076 1077/* 1078 * Descend an indirect block chain for vnode cancelvp 
accounting for all 1079 * its indirect blocks in snapvp. 1080 */ 1081static int 1082indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, 1083 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, 1084 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) 1085{ 1086 int error, num, i; 1087 daddr_t subblksperindir; 1088 struct indir indirs[NIADDR + 2]; 1089 daddr_t last; 1090 void *bap; 1091 struct buf *bp; 1092 1093 if (blkno == 0) { 1094 if (expungetype == BLK_NOCOPY) 1095 return (0); 1096 panic("indiracct: missing indir"); 1097 } 1098 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1099 return (error); 1100 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1101 panic("indiracct: botched params"); 1102 /* 1103 * We have to expand bread here since it will deadlock looking 1104 * up the block number for any blocks that are not in the cache. 1105 */ 1106 error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, 1107 false, &bp); 1108 if (error) 1109 return error; 1110 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 1111 rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { 1112 brelse(bp, 0); 1113 return (error); 1114 } 1115 /* 1116 * Account for the block pointers in this indirect block. 1117 */ 1118 last = howmany(remblks, blksperindir); 1119 if (last > NINDIR(fs)) 1120 last = NINDIR(fs); 1121 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); 1122 bcopy(bp->b_data, (void *)bap, fs->fs_bsize); 1123 brelse(bp, 0); 1124 error = (*acctfunc)(snapvp, bap, 0, last, 1125 fs, level == 0 ? rlbn : -1, expungetype); 1126 if (error || level == 0) 1127 goto out; 1128 /* 1129 * Account for the block pointers in each of the indirect blocks 1130 * in the levels below us. 
1131 */ 1132 subblksperindir = blksperindir / NINDIR(fs); 1133 for (lbn++, level--, i = 0; i < last; i++) { 1134 error = indiracct(snapvp, cancelvp, level, 1135 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, 1136 subblksperindir, fs, acctfunc, expungetype); 1137 if (error) 1138 goto out; 1139 rlbn += blksperindir; 1140 lbn -= blksperindir; 1141 remblks -= blksperindir; 1142 } 1143out: 1144 free(bap, M_DEVBUF); 1145 return (error); 1146} 1147 1148/* 1149 * Do both snap accounting and map accounting. 1150 */ 1151static int 1152fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1153 struct fs *fs, daddr_t lblkno, 1154 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1155{ 1156 int error; 1157 1158 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) 1159 return (error); 1160 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); 1161} 1162 1163/* 1164 * Identify a set of blocks allocated in a snapshot inode. 1165 */ 1166static int 1167snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1168 struct fs *fs, daddr_t lblkno, 1169 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1170{ 1171 struct inode *ip = VTOI(vp); 1172 struct lwp *l = curlwp; 1173 daddr_t blkno; 1174 daddr_t lbn; 1175 struct buf *ibp; 1176 int error; 1177 1178 for ( ; oldblkp < lastblkp; oldblkp++) { 1179 blkno = idb_get(ip, bap, oldblkp); 1180 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1181 continue; 1182 lbn = fragstoblks(fs, blkno); 1183 if (lbn < NDADDR) { 1184 blkno = db_get(ip, lbn); 1185 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1186 } else { 1187 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1188 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1189 if (error) 1190 return (error); 1191 blkno = idb_get(ip, ibp->b_data, 1192 (lbn - NDADDR) % NINDIR(fs)); 1193 } 1194 /* 1195 * If we are expunging a snapshot vnode and we 1196 * find a block marked BLK_NOCOPY, then it is 1197 * one that has been allocated to this snapshot after 1198 * 
we took our current snapshot and can be ignored. 1199 */ 1200 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1201 if (lbn >= NDADDR) 1202 brelse(ibp, 0); 1203 } else { 1204 if (blkno != 0) 1205 panic("snapacct: bad block"); 1206 if (lbn < NDADDR) 1207 db_assign(ip, lbn, expungetype); 1208 else { 1209 idb_assign(ip, ibp->b_data, 1210 (lbn - NDADDR) % NINDIR(fs), expungetype); 1211 bdwrite(ibp); 1212 } 1213 } 1214 } 1215 return (0); 1216} 1217 1218/* 1219 * Account for a set of blocks allocated in a snapshot inode. 1220 */ 1221static int 1222mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1223 struct fs *fs, daddr_t lblkno, int expungetype) 1224{ 1225 daddr_t blkno; 1226 struct inode *ip; 1227 ino_t inum; 1228 int acctit; 1229 1230 ip = VTOI(vp); 1231 inum = ip->i_number; 1232 if (lblkno == -1) 1233 acctit = 0; 1234 else 1235 acctit = 1; 1236 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1237 blkno = idb_get(ip, bap, oldblkp); 1238 if (blkno == 0 || blkno == BLK_NOCOPY) 1239 continue; 1240 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1241 *ip->i_snapblklist++ = lblkno; 1242 if (blkno == BLK_SNAP) 1243 blkno = blkstofrags(fs, lblkno); 1244 ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum); 1245 } 1246 return (0); 1247} 1248#endif /* defined(FFS_NO_SNAPSHOT) */ 1249 1250/* 1251 * Decrement extra reference on snapshot when last name is removed. 1252 * It will not be freed until the last open reference goes away. 1253 */ 1254void 1255ffs_snapgone(struct inode *ip) 1256{ 1257 struct mount *mp = ip->i_devvp->v_specmountpoint; 1258 struct inode *xp; 1259 struct fs *fs; 1260 struct snap_info *si; 1261 int snaploc; 1262 1263 si = VFSTOUFS(mp)->um_snapinfo; 1264 1265 /* 1266 * Find snapshot in incore list. 
1267 */ 1268 mutex_enter(&si->si_lock); 1269 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) 1270 if (xp == ip) 1271 break; 1272 mutex_exit(&si->si_lock); 1273 if (xp != NULL) 1274 vrele(ITOV(ip)); 1275#ifdef DEBUG 1276 else if (snapdebug) 1277 printf("ffs_snapgone: lost snapshot vnode %llu\n", 1278 (unsigned long long)ip->i_number); 1279#endif 1280 /* 1281 * Delete snapshot inode from superblock. Keep list dense. 1282 */ 1283 mutex_enter(&si->si_lock); 1284 fs = ip->i_fs; 1285 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1286 if (fs->fs_snapinum[snaploc] == ip->i_number) 1287 break; 1288 if (snaploc < FSMAXSNAP) { 1289 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1290 if (fs->fs_snapinum[snaploc] == 0) 1291 break; 1292 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1293 } 1294 fs->fs_snapinum[snaploc - 1] = 0; 1295 } 1296 si->si_gen++; 1297 mutex_exit(&si->si_lock); 1298} 1299 1300/* 1301 * Prepare a snapshot file for being removed. 1302 */ 1303void 1304ffs_snapremove(struct vnode *vp) 1305{ 1306 struct inode *ip = VTOI(vp), *xp; 1307 struct vnode *devvp = ip->i_devvp; 1308 struct fs *fs = ip->i_fs; 1309 struct mount *mp = devvp->v_specmountpoint; 1310 struct buf *ibp; 1311 struct snap_info *si; 1312 struct lwp *l = curlwp; 1313 daddr_t numblks, blkno, dblk; 1314 int error, loc, last; 1315 1316 si = VFSTOUFS(mp)->um_snapinfo; 1317 mutex_enter(&si->si_snaplock); 1318 /* 1319 * If active, delete from incore list (this snapshot may 1320 * already have been in the process of being deleted, so 1321 * would not have been active). 1322 * 1323 * Clear copy-on-write flag if last snapshot. 1324 */ 1325 if (ip->i_nextsnap.tqe_prev != 0) { 1326 mutex_enter(&si->si_lock); 1327 TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap); 1328 ip->i_nextsnap.tqe_prev = 0; 1329 if (TAILQ_FIRST(&si->si_snapshots) != 0) { 1330 /* Roll back the list of preallocated blocks. 
*/ 1331 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1332 si->si_snapblklist = xp->i_snapblklist; 1333 } else { 1334 si->si_snapblklist = 0; 1335 si->si_gen++; 1336 mutex_exit(&si->si_lock); 1337 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1338 mutex_enter(&si->si_lock); 1339 } 1340 si->si_gen++; 1341 mutex_exit(&si->si_lock); 1342 if (ip->i_snapblklist != NULL) { 1343 free(ip->i_snapblklist, M_UFSMNT); 1344 ip->i_snapblklist = NULL; 1345 } 1346 } 1347 mutex_exit(&si->si_snaplock); 1348 /* 1349 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1350 * snapshots that want them (see ffs_snapblkfree below). 1351 */ 1352 for (blkno = 1; blkno < NDADDR; blkno++) { 1353 dblk = db_get(ip, blkno); 1354 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1355 db_assign(ip, blkno, 0); 1356 else if ((dblk == blkstofrags(fs, blkno) && 1357 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1358 ip->i_number))) { 1359 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1360 db_assign(ip, blkno, 0); 1361 } 1362 } 1363 numblks = howmany(ip->i_size, fs->fs_bsize); 1364 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 1365 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 1366 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1367 if (error) 1368 continue; 1369 if (fs->fs_size - blkno > NINDIR(fs)) 1370 last = NINDIR(fs); 1371 else 1372 last = fs->fs_size - blkno; 1373 for (loc = 0; loc < last; loc++) { 1374 dblk = idb_get(ip, ibp->b_data, loc); 1375 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1376 idb_assign(ip, ibp->b_data, loc, 0); 1377 else if (dblk == blkstofrags(fs, blkno) && 1378 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1379 fs->fs_bsize, ip->i_number)) { 1380 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1381 idb_assign(ip, ibp->b_data, loc, 0); 1382 } 1383 } 1384 bawrite(ibp); 1385 } 1386 /* 1387 * Clear snapshot flag and drop reference. 
1388 */ 1389 ip->i_flags &= ~SF_SNAPSHOT; 1390 DIP_ASSIGN(ip, flags, ip->i_flags); 1391 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1392} 1393 1394/* 1395 * Notification that a block is being freed. Return zero if the free 1396 * should be allowed to proceed. Return non-zero if the snapshot file 1397 * wants to claim the block. The block will be claimed if it is an 1398 * uncopied part of one of the snapshots. It will be freed if it is 1399 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1400 * If a fragment is being freed, then all snapshots that care about 1401 * it must make a copy since a snapshot file can only claim full sized 1402 * blocks. Note that if more than one snapshot file maps the block, 1403 * we can pick one at random to claim it. Since none of the snapshots 1404 * can change, we are assurred that they will all see the same unmodified 1405 * image. When deleting a snapshot file (see ffs_snapremove above), we 1406 * must push any of these claimed blocks to one of the other snapshots 1407 * that maps it. These claimed blocks are easily identified as they will 1408 * have a block number equal to their logical block number within the 1409 * snapshot. A copied block can never have this property because they 1410 * must always have been allocated from a BLK_NOCOPY location. 
1411 */ 1412int 1413ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, 1414 long size, ino_t inum) 1415{ 1416 struct mount *mp = devvp->v_specmountpoint; 1417 struct buf *ibp; 1418 struct inode *ip; 1419 struct vnode *vp = NULL; 1420 struct snap_info *si; 1421 void *saved_data = NULL; 1422 daddr_t lbn; 1423 daddr_t blkno; 1424 uint32_t gen; 1425 int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1426 1427 si = VFSTOUFS(mp)->um_snapinfo; 1428 lbn = fragstoblks(fs, bno); 1429 mutex_enter(&si->si_lock); 1430retry: 1431 gen = si->si_gen; 1432 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1433 vp = ITOV(ip); 1434 if (snapshot_locked == 0) { 1435 if (!mutex_tryenter(&si->si_snaplock)) { 1436 mutex_exit(&si->si_lock); 1437 mutex_enter(&si->si_snaplock); 1438 mutex_enter(&si->si_lock); 1439 } 1440 snapshot_locked = 1; 1441 if (gen != si->si_gen) 1442 goto retry; 1443 } 1444 /* 1445 * Lookup block being written. 1446 */ 1447 if (lbn < NDADDR) { 1448 blkno = db_get(ip, lbn); 1449 } else { 1450 mutex_exit(&si->si_lock); 1451 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1452 fs->fs_bsize, FSCRED, B_METAONLY, &ibp); 1453 if (error) { 1454 mutex_enter(&si->si_lock); 1455 break; 1456 } 1457 indiroff = (lbn - NDADDR) % NINDIR(fs); 1458 blkno = idb_get(ip, ibp->b_data, indiroff); 1459 mutex_enter(&si->si_lock); 1460 if (gen != si->si_gen) { 1461 brelse(ibp, 0); 1462 goto retry; 1463 } 1464 } 1465 /* 1466 * Check to see if block needs to be copied. 1467 */ 1468 if (blkno == 0) { 1469 /* 1470 * A block that we map is being freed. If it has not 1471 * been claimed yet, we will claim or copy it (below). 1472 */ 1473 claimedblk = 1; 1474 } else if (blkno == BLK_SNAP) { 1475 /* 1476 * No previous snapshot claimed the block, 1477 * so it will be freed and become a BLK_NOCOPY 1478 * (don't care) for us. 
1479 */ 1480 if (claimedblk) 1481 panic("snapblkfree: inconsistent block type"); 1482 if (lbn < NDADDR) { 1483 db_assign(ip, lbn, BLK_NOCOPY); 1484 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1485 } else { 1486 idb_assign(ip, ibp->b_data, indiroff, 1487 BLK_NOCOPY); 1488 mutex_exit(&si->si_lock); 1489 if (ip->i_ffs_effnlink > 0) 1490 bwrite(ibp); 1491 else 1492 bdwrite(ibp); 1493 mutex_enter(&si->si_lock); 1494 if (gen != si->si_gen) 1495 goto retry; 1496 } 1497 continue; 1498 } else /* BLK_NOCOPY or default */ { 1499 /* 1500 * If the snapshot has already copied the block 1501 * (default), or does not care about the block, 1502 * it is not needed. 1503 */ 1504 if (lbn >= NDADDR) 1505 brelse(ibp, 0); 1506 continue; 1507 } 1508 /* 1509 * If this is a full size block, we will just grab it 1510 * and assign it to the snapshot inode. Otherwise we 1511 * will proceed to copy it. See explanation for this 1512 * routine as to why only a single snapshot needs to 1513 * claim this block. 1514 */ 1515 if (size == fs->fs_bsize) { 1516#ifdef DEBUG 1517 if (snapdebug) 1518 printf("%s %llu lbn %" PRId64 1519 "from inum %llu\n", 1520 "Grabonremove: snapino", 1521 (unsigned long long)ip->i_number, 1522 lbn, (unsigned long long)inum); 1523#endif 1524 mutex_exit(&si->si_lock); 1525 if (lbn < NDADDR) { 1526 db_assign(ip, lbn, bno); 1527 } else { 1528 idb_assign(ip, ibp->b_data, indiroff, bno); 1529 if (ip->i_ffs_effnlink > 0) 1530 bwrite(ibp); 1531 else 1532 bdwrite(ibp); 1533 } 1534 DIP_ADD(ip, blocks, btodb(size)); 1535 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1536 if (ip->i_ffs_effnlink > 0 && mp->mnt_wapbl) 1537 error = syncsnap(vp); 1538 else 1539 error = 0; 1540 mutex_exit(&si->si_snaplock); 1541 return (error == 0); 1542 } 1543 if (lbn >= NDADDR) 1544 brelse(ibp, 0); 1545#ifdef DEBUG 1546 if (snapdebug) 1547 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", 1548 "Copyonremove: snapino ", 1549 (unsigned long long)ip->i_number, 1550 lbn, "for inum", (unsigned long long)inum, size); 
1551#endif 1552 /* 1553 * If we have already read the old block contents, then 1554 * simply copy them to the new block. Note that we need 1555 * to synchronously write snapshots that have not been 1556 * unlinked, and hence will be visible after a crash, 1557 * to ensure their integrity. 1558 */ 1559 mutex_exit(&si->si_lock); 1560 if (saved_data == NULL) { 1561 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1562 error = rwfsblk(vp, B_READ, saved_data, lbn); 1563 if (error) { 1564 free(saved_data, M_UFSMNT); 1565 saved_data = NULL; 1566 mutex_enter(&si->si_lock); 1567 break; 1568 } 1569 } 1570 error = wrsnapblk(vp, saved_data, lbn); 1571 if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl) 1572 error = syncsnap(vp); 1573 mutex_enter(&si->si_lock); 1574 if (error) 1575 break; 1576 if (gen != si->si_gen) 1577 goto retry; 1578 } 1579 mutex_exit(&si->si_lock); 1580 if (saved_data) 1581 free(saved_data, M_UFSMNT); 1582 /* 1583 * If we have been unable to allocate a block in which to do 1584 * the copy, then return non-zero so that the fragment will 1585 * not be freed. Although space will be lost, the snapshot 1586 * will stay consistent. 1587 */ 1588 if (snapshot_locked) 1589 mutex_exit(&si->si_snaplock); 1590 return (error); 1591} 1592 1593/* 1594 * Associate snapshot files when mounting. 1595 */ 1596void 1597ffs_snapshot_mount(struct mount *mp) 1598{ 1599 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1600 struct fs *fs = VFSTOUFS(mp)->um_fs; 1601 struct lwp *l = curlwp; 1602 struct vnode *vp; 1603 struct inode *ip, *xp; 1604 struct snap_info *si; 1605 daddr_t snaplistsize, *snapblklist; 1606 int i, error, ns, snaploc, loc; 1607 1608 /* 1609 * No persistent snapshots on apple ufs file systems. 1610 */ 1611 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) 1612 return; 1613 1614 si = VFSTOUFS(mp)->um_snapinfo; 1615 ns = UFS_FSNEEDSWAP(fs); 1616 /* 1617 * XXX The following needs to be set before ffs_truncate or 1618 * VOP_READ can be called. 
1619 */ 1620 mp->mnt_stat.f_iosize = fs->fs_bsize; 1621 /* 1622 * Process each snapshot listed in the superblock. 1623 */ 1624 vp = NULL; 1625 mutex_enter(&si->si_lock); 1626 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1627 if (fs->fs_snapinum[snaploc] == 0) 1628 break; 1629 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1630 &vp)) != 0) { 1631 printf("ffs_snapshot_mount: vget failed %d\n", error); 1632 continue; 1633 } 1634 ip = VTOI(vp); 1635 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1636 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1637 fs->fs_snapinum[snaploc]); 1638 vput(vp); 1639 vp = NULL; 1640 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1641 if (fs->fs_snapinum[loc] == 0) 1642 break; 1643 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1644 } 1645 fs->fs_snapinum[loc - 1] = 0; 1646 snaploc--; 1647 continue; 1648 } 1649 1650 /* 1651 * Read the block hints list. Use an empty list on 1652 * read errors. 1653 */ 1654 error = vn_rdwr(UIO_READ, vp, 1655 (void *)&snaplistsize, sizeof(snaplistsize), 1656 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1657 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, 1658 l->l_cred, NULL, NULL); 1659 if (error) { 1660 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1661 snaplistsize = 1; 1662 } else 1663 snaplistsize = ufs_rw64(snaplistsize, ns); 1664 snapblklist = malloc( 1665 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 1666 if (error) 1667 snapblklist[0] = 1; 1668 else { 1669 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, 1670 snaplistsize * sizeof(daddr_t), 1671 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1672 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, 1673 l->l_cred, NULL, NULL); 1674 for (i = 0; i < snaplistsize; i++) 1675 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 1676 if (error) { 1677 printf("ffs_snapshot_mount: read_2 failed %d\n", 1678 error); 1679 snapblklist[0] = 1; 1680 } 1681 } 1682 ip->i_snapblklist = &snapblklist[0]; 1683 1684 /* 1685 * 
Link it onto the active snapshot list. 1686 */ 1687 if (ip->i_nextsnap.tqe_prev != 0) 1688 panic("ffs_snapshot_mount: %llu already on list", 1689 (unsigned long long)ip->i_number); 1690 else 1691 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 1692 vp->v_vflag |= VV_SYSTEM; 1693 VOP_UNLOCK(vp, 0); 1694 } 1695 /* 1696 * No usable snapshots found. 1697 */ 1698 if (vp == NULL) { 1699 mutex_exit(&si->si_lock); 1700 return; 1701 } 1702 /* 1703 * Attach the block hints list. We always want to 1704 * use the list from the newest snapshot. 1705 */ 1706 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1707 si->si_snapblklist = xp->i_snapblklist; 1708 fscow_establish(mp, ffs_copyonwrite, devvp); 1709 si->si_gen++; 1710 mutex_exit(&si->si_lock); 1711} 1712 1713/* 1714 * Disassociate snapshot files when unmounting. 1715 */ 1716void 1717ffs_snapshot_unmount(struct mount *mp) 1718{ 1719 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1720 struct inode *xp; 1721 struct vnode *vp = NULL; 1722 struct snap_info *si; 1723 1724 si = VFSTOUFS(mp)->um_snapinfo; 1725 mutex_enter(&si->si_lock); 1726 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) { 1727 vp = ITOV(xp); 1728 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap); 1729 xp->i_nextsnap.tqe_prev = 0; 1730 if (xp->i_snapblklist == si->si_snapblklist) 1731 si->si_snapblklist = NULL; 1732 free(xp->i_snapblklist, M_UFSMNT); 1733 if (xp->i_ffs_effnlink > 0) { 1734 si->si_gen++; 1735 mutex_exit(&si->si_lock); 1736 vrele(vp); 1737 mutex_enter(&si->si_lock); 1738 } 1739 } 1740 if (vp) 1741 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1742 si->si_gen++; 1743 mutex_exit(&si->si_lock); 1744} 1745 1746/* 1747 * Check for need to copy block that is about to be written, 1748 * copying the block if necessary. 
 *
 * v is the device vnode this handler is registered with (see the
 * fscow_establish() call in this file), bp the buffer about to be
 * written.  When data_valid is set and bp covers a full file system
 * block, bp->b_data is used as the old contents instead of reading the
 * block from disk.
 *
 * Returns 0 if nothing had to be done or all copies succeeded, ENOMEM
 * if a copy would be required while running as the pagedaemon, or the
 * error from snapblkaddr(), rwfsblk(), wrsnapblk() or syncsnap().
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system or
	 * in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Binary search of the hint list.  Element 0 holds the list
	 * length, so the searchable entries are 1 .. snapblklist[0] - 1.
	 */
	snapblklist = si->si_snapblklist;
	upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	/* The loop only ends with lower <= upper when lbn was found. */
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/*
	 * Remember the list generation; whenever si_lock has been dropped
	 * and the generation changed, the walk is restarted from here.
	 */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero: already copied, claimed or not of interest. */
		if (blkno != 0)
			continue;

		/* A copy is needed, which is refused for the pagedaemon. */
		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}

		if (snapshot_locked == 0) {
			/*
			 * Try si_snaplock without sleeping while si_lock
			 * is held; otherwise drop si_lock, sleep on
			 * si_snaplock and retake si_lock.
			 */
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;

			/* Check again if block still needs to be copied */
			if (lbn < NDADDR) {
				blkno = db_get(ip, lbn);
			} else {
				mutex_exit(&si->si_lock);
				if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
					mutex_enter(&si->si_lock);
					break;
				}
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}

			if (blkno != 0)
				continue;
		}
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	mutex_exit(&si->si_lock);
	/* Only free saved_data if it was allocated here, not borrowed. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return error;
}

/*
 * Read from a snapshot.
 *
 * Copies data from snapshot vnode vp into uio, one file system block at
 * a time, while holding si_snaplock inside a shared fstrans section.
 * With IO_ALTSEMANTICS set in ioflag the readable range ends at
 * ip->i_size; otherwise it ends at the size of the file system
 * (fs->fs_size fragments).
 *
 * Returns 0 or the error from bread(), breadn() or uiomove().
 */
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
	struct buf *bp;
	daddr_t lbn, nextlbn;
	off_t fsbytes, bytesinfile;
	long size, xfersize, blkoffset;
	int error;

	fstrans_start(vp->v_mount, FSTRANS_SHARED);
	mutex_enter(&si->si_snaplock);

	if (ioflag & IO_ALTSEMANTICS)
		fsbytes = ip->i_size;
	else
		fsbytes = lfragtosize(fs, fs->fs_size);
	/* bp is reset to NULL each pass so the cleanup below is exact. */
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = fsbytes - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = fs->fs_bsize;
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		/*
		 * Near the end of the readable range read a single,
		 * possibly short, block; otherwise also start a
		 * read-ahead of the following block.
		 */
		if (lblktosize(fs, nextlbn + 1) >= fsbytes) {
			if (lblktosize(fs, lbn) + size > fsbytes)
				size = fragroundup(fs,
				    fsbytes - lblktosize(fs, lbn));
			error = bread(vp, lbn, size, NOCRED, 0, &bp);
		} else {
			int nextsize = fs->fs_bsize;
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < blkoffset + xfersize) {
			xfersize = size - blkoffset;
			if (xfersize <= 0)
				break;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp, BC_AGE);
	}
	/* Release the buffer still held when the loop was left early. */
	if (bp != NULL)
		brelse(bp, BC_AGE);

	mutex_exit(&si->si_snaplock);
	fstrans_done(vp->v_mount);
	return error;
}

/*
 * Lookup a snapshots data block address.
 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
 * and safe even for the pagedaemon where we cannot bread().
 *
 * On success the address stored for logical block lbn of snapshot vp is
 * written to *res and 0 is returned.  Otherwise the error from
 * ufs_getlbns() or bread() is returned, or ENOMEM when running as the
 * pagedaemon and the indirect block is not valid in the buffer cache.
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	/* Direct blocks are stored in the inode itself. */
	if (lbn < NDADDR) {
		*res = db_get(ip, lbn);
		return 0;
	}
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
	if (curlwp == uvm.pagedaemon_lwp) {
		/* Only peek at the cache; never start I/O from here. */
		mutex_enter(&bufcache_lock);
		bp = incore(vp, indirs[num-1].in_lbn);
		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
			error = 0;
		} else
			error = ENOMEM;
		mutex_exit(&bufcache_lock);
		return error;
	}
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
	if (error == 0)
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
	/* The buffer is released whether or not the read succeeded. */
	brelse(bp, 0);

	return error;
}

/*
 * Read or write the specified block of the filesystem vp resides on
 * from or to the disk bypassing the buffer cache.
 *
 * flags is stored in b_flags of a temporary I/O buffer (callers in this
 * file pass B_READ or B_WRITE), data is the memory to transfer and lbn
 * the file system block number.  The transfer is always fs->fs_bsize
 * bytes and goes to the device of ip->i_devvp.  Returns the result of
 * biowait().
 */
static int
rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
{
	int error;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *nbp;

	nbp = getiobuf(NULL, true);
	nbp->b_flags = flags;
	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
	nbp->b_error = 0;
	nbp->b_data = data;
	nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
	nbp->b_proc = NULL;
	nbp->b_dev = ip->i_devvp->v_rdev;
	SET(nbp->b_cflags, BC_BUSY);	/* mark buffer busy */

	bdev_strategy(nbp);

	error = biowait(nbp);

	putiobuf(nbp);

	return error;
}

/*
 * Write all dirty buffers to disk and invalidate them.
 *
 * Each buffer on vp's dirty list is written with rwfsblk() and then
 * released with BC_INVAL | BC_VFLUSH.  Returns 0 or the first write
 * error, in which case the remaining buffers are left untouched.
 */
static int
syncsnap(struct vnode *vp)
{
	int error;
	buf_t *bp;
	struct fs *fs = VTOI(vp)->i_fs;

	mutex_enter(&bufcache_lock);
	while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
		KASSERT((bp->b_cflags & BC_BUSY) == 0);
		KASSERT(bp->b_bcount == fs->fs_bsize);
		/* Mark the buffer busy before bufcache_lock is dropped. */
		bp->b_cflags |= BC_BUSY;
		mutex_exit(&bufcache_lock);
		error = rwfsblk(vp, B_WRITE, bp->b_data,
		    fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
		brelse(bp, BC_INVAL | BC_VFLUSH);
		if (error)
			return error;
		mutex_enter(&bufcache_lock);
	}
	mutex_exit(&bufcache_lock);

	return 0;
}

/*
 * Write the specified block to a snapshot.
 *
 * Allocates logical block lbn in snapshot vp with ffs_balloc() and
 * copies fs->fs_bsize bytes from data into it.  If the snapshot is still
 * linked (i_ffs_effnlink > 0) the allocation uses B_SYNC and the buffer
 * is written with bwrite(), whose result is returned; otherwise the
 * buffer is written with bawrite() and 0 is returned.  An ffs_balloc()
 * failure is returned unchanged.
 */
static int
wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error;

	error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
	    FSCRED, (ip->i_ffs_effnlink > 0 ? B_SYNC : 0), &bp);
	if (error)
		return error;
	bcopy(data, bp->b_data, fs->fs_bsize);
	if (ip->i_ffs_effnlink > 0)
		error = bwrite(bp);
	else
		bawrite(bp);

	return error;
}

/*
 * Get/Put direct block from inode or buffer containing disk addresses. Take
 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
 * into a global include.
 */

/* Return direct block pointer loc of ip. */
static inline daddr_t
db_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}

/* Set direct block pointer loc of ip to val. */
static inline void
db_assign(struct inode *ip, int loc, daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

/* Return indirect block pointer loc of ip. */
static inline daddr_t
ib_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}

/* Set indirect block pointer loc of ip to val. */
static inline void
ib_assign(struct inode *ip, int loc, daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

/*
 * Return entry loc of the block pointer array in bf; ip only selects
 * the entry width (32 or 64 bit) and the byte order.
 */
static inline daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
}

/*
 * Set entry loc of the block pointer array in bf to val; ip only
 * selects the entry width (32 or 64 bit) and the byte order.
 */
static inline void
idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}