1/* $NetBSD: ffs_snapshot.c,v 1.117 2011/07/01 14:28:21 hannken Exp $ */ 2 3/* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40#include <sys/cdefs.h> 41__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.117 2011/07/01 14:28:21 hannken Exp $"); 42 43#if defined(_KERNEL_OPT) 44#include "opt_ffs.h" 45#include "opt_quota.h" 46#endif 47 48#include <sys/param.h> 49#include <sys/kernel.h> 50#include <sys/systm.h> 51#include <sys/conf.h> 52#include <sys/buf.h> 53#include <sys/proc.h> 54#include <sys/namei.h> 55#include <sys/sched.h> 56#include <sys/stat.h> 57#include <sys/malloc.h> 58#include <sys/mount.h> 59#include <sys/resource.h> 60#include <sys/resourcevar.h> 61#include <sys/vnode.h> 62#include <sys/kauth.h> 63#include <sys/fstrans.h> 64#include <sys/wapbl.h> 65 66#include <miscfs/specfs/specdev.h> 67 68#include <ufs/ufs/quota.h> 69#include <ufs/ufs/ufsmount.h> 70#include <ufs/ufs/inode.h> 71#include <ufs/ufs/ufs_extern.h> 72#include <ufs/ufs/ufs_bswap.h> 73#include <ufs/ufs/ufs_wapbl.h> 74 75#include <ufs/ffs/fs.h> 76#include <ufs/ffs/ffs_extern.h> 77 78#include <uvm/uvm.h> 79 80struct snap_info { 81 kmutex_t si_lock; /* Lock this snapinfo */ 82 kmutex_t si_snaplock; /* Snapshot vnode common lock */ 83 lwp_t *si_owner; /* Sanplock owner */ 84 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 85 daddr_t *si_snapblklist; /* Snapshot block hints list */ 86 uint32_t si_gen; /* Incremented on change */ 87}; 88 89#if !defined(FFS_NO_SNAPSHOT) 90typedef int (*acctfunc_t) 91 (struct vnode *, void *, int, int, struct fs *, daddr_t, int); 92 93static int snapshot_setup(struct mount *, struct vnode *); 94static int snapshot_copyfs(struct mount *, struct vnode *, void **); 95static int snapshot_expunge(struct mount *, struct vnode *, 96 struct fs *, daddr_t *, daddr_t **); 97static int snapshot_expunge_snap(struct mount *, struct vnode *, 98 struct fs *, daddr_t); 99static int snapshot_writefs(struct mount *, struct vnode *, void *); 100static 
int cgaccount(struct vnode *, int, int *); 101static int cgaccount1(int, struct vnode *, void *, int); 102static int expunge(struct vnode *, struct inode *, struct fs *, 103 acctfunc_t, int); 104static int indiracct(struct vnode *, struct vnode *, int, daddr_t, 105 daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int); 106static int fullacct(struct vnode *, void *, int, int, struct fs *, 107 daddr_t, int); 108static int snapacct(struct vnode *, void *, int, int, struct fs *, 109 daddr_t, int); 110static int mapacct(struct vnode *, void *, int, int, struct fs *, 111 daddr_t, int); 112#endif /* !defined(FFS_NO_SNAPSHOT) */ 113 114static int ffs_copyonwrite(void *, struct buf *, bool); 115static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); 116static int rwfsblk(struct vnode *, int, void *, daddr_t); 117static int syncsnap(struct vnode *); 118static int wrsnapblk(struct vnode *, void *, daddr_t); 119#if !defined(FFS_NO_SNAPSHOT) 120static int blocks_in_journal(struct fs *); 121#endif 122 123static inline bool is_active_snapshot(struct snap_info *, struct inode *); 124static inline daddr_t db_get(struct inode *, int); 125static inline void db_assign(struct inode *, int, daddr_t); 126static inline daddr_t ib_get(struct inode *, int); 127static inline void ib_assign(struct inode *, int, daddr_t); 128static inline daddr_t idb_get(struct inode *, void *, int); 129static inline void idb_assign(struct inode *, void *, int, daddr_t); 130 131#ifdef DEBUG 132static int snapdebug = 0; 133#endif 134 135int 136ffs_snapshot_init(struct ufsmount *ump) 137{ 138 struct snap_info *si; 139 140 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); 141 if (si == NULL) 142 return ENOMEM; 143 144 TAILQ_INIT(&si->si_snapshots); 145 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); 146 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); 147 si->si_owner = NULL; 148 si->si_gen = 0; 149 si->si_snapblklist = NULL; 150 151 return 0; 152} 153 154void 
155ffs_snapshot_fini(struct ufsmount *ump) 156{ 157 struct snap_info *si; 158 159 si = ump->um_snapinfo; 160 ump->um_snapinfo = NULL; 161 162 KASSERT(TAILQ_EMPTY(&si->si_snapshots)); 163 mutex_destroy(&si->si_lock); 164 mutex_destroy(&si->si_snaplock); 165 KASSERT(si->si_snapblklist == NULL); 166 kmem_free(si, sizeof(*si)); 167} 168 169/* 170 * Create a snapshot file and initialize it for the filesystem. 171 * Vnode is locked on entry and return. 172 */ 173int 174ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) 175{ 176#if defined(FFS_NO_SNAPSHOT) 177 return EOPNOTSUPP; 178} 179#else /* defined(FFS_NO_SNAPSHOT) */ 180 bool suspended = false; 181 int error, redo = 0, snaploc; 182 void *sbbuf = NULL; 183 daddr_t *snaplist = NULL, snaplistsize = 0; 184 struct buf *bp, *nbp; 185 struct fs *copy_fs = NULL; 186 struct fs *fs = VFSTOUFS(mp)->um_fs; 187 struct inode *ip = VTOI(vp); 188 struct lwp *l = curlwp; 189 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 190 struct timespec ts; 191 struct timeval starttime; 192#ifdef DEBUG 193 struct timeval endtime; 194#endif 195 struct vnode *devvp = ip->i_devvp; 196 197 /* 198 * If the vnode already is a snapshot, return. 199 */ 200 if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) { 201 if ((VTOI(vp)->i_flags & SF_SNAPINVAL)) 202 return EINVAL; 203 if (ctime) { 204 ctime->tv_sec = DIP(VTOI(vp), mtime); 205 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 206 } 207 return 0; 208 } 209 /* 210 * Check for free snapshot slot in the superblock. 211 */ 212 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 213 if (fs->fs_snapinum[snaploc] == 0) 214 break; 215 if (snaploc == FSMAXSNAP) 216 return (ENOSPC); 217 /* 218 * Prepare the vnode to become a snapshot. 219 */ 220 error = snapshot_setup(mp, vp); 221 if (error) 222 goto out; 223 224 /* 225 * Copy all the cylinder group maps. 
Although the 226 * filesystem is still active, we hope that only a few 227 * cylinder groups will change between now and when we 228 * suspend operations. Thus, we will be able to quickly 229 * touch up the few cylinder groups that changed during 230 * the suspension period. 231 */ 232 error = cgaccount(vp, 1, NULL); 233 if (error) 234 goto out; 235 236 /* 237 * snapshot is now valid 238 */ 239 ip->i_flags &= ~SF_SNAPINVAL; 240 DIP_ASSIGN(ip, flags, ip->i_flags); 241 ip->i_flag |= IN_CHANGE | IN_UPDATE; 242 243 /* 244 * Ensure that the snapshot is completely on disk. 245 * Since we have marked it as a snapshot it is safe to 246 * unlock it as no process will be allowed to write to it. 247 */ 248 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 249 if (error) 250 goto out; 251 VOP_UNLOCK(vp); 252 /* 253 * All allocations are done, so we can now suspend the filesystem. 254 */ 255 error = vfs_suspend(vp->v_mount, 0); 256 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 257 if (error) 258 goto out; 259 suspended = true; 260 getmicrotime(&starttime); 261 /* 262 * First, copy all the cylinder group maps that have changed. 263 */ 264 error = cgaccount(vp, 2, &redo); 265 if (error) 266 goto out; 267 /* 268 * Create a copy of the superblock and its summary information. 269 */ 270 error = snapshot_copyfs(mp, vp, &sbbuf); 271 copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); 272 if (error) 273 goto out; 274 /* 275 * Expunge unlinked files from our view. 276 */ 277 error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist); 278 if (error) 279 goto out; 280 /* 281 * Record snapshot inode. Since this is the newest snapshot, 282 * it must be placed at the end of the list. 
283 */ 284 if (ip->i_nlink > 0) 285 fs->fs_snapinum[snaploc] = ip->i_number; 286 287 mutex_enter(&si->si_lock); 288 if (is_active_snapshot(si, ip)) 289 panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number); 290 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 291 if (TAILQ_FIRST(&si->si_snapshots) == ip) { 292 /* 293 * If this is the first snapshot on this filesystem, put the 294 * preliminary list in place and establish the cow handler. 295 */ 296 si->si_snapblklist = snaplist; 297 fscow_establish(mp, ffs_copyonwrite, devvp); 298 } 299 si->si_gen++; 300 mutex_exit(&si->si_lock); 301 302 vp->v_vflag |= VV_SYSTEM; 303 /* 304 * Set the mtime to the time the snapshot has been taken. 305 */ 306 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 307 if (ctime) 308 *ctime = ts; 309 DIP_ASSIGN(ip, mtime, ts.tv_sec); 310 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 311 ip->i_flag |= IN_CHANGE | IN_UPDATE; 312 /* 313 * Copy allocation information from all snapshots and then 314 * expunge them from our view. 315 */ 316 error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize); 317 if (error) 318 goto out; 319 /* 320 * Write the superblock and its summary information to the snapshot. 321 */ 322 error = snapshot_writefs(mp, vp, sbbuf); 323 if (error) 324 goto out; 325 /* 326 * We're nearly done, ensure that the snapshot is completely on disk. 327 */ 328 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 329 if (error) 330 goto out; 331 /* 332 * Invalidate and free all pages on the snapshot vnode. 333 * We will read and write through the buffercache. 334 */ 335 mutex_enter(vp->v_interlock); 336 error = VOP_PUTPAGES(vp, 0, 0, 337 PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE); 338 if (error) 339 goto out; 340 /* 341 * Invalidate short ( < fs_bsize ) buffers. We will always read 342 * full size buffers later. 
343 */ 344 mutex_enter(&bufcache_lock); 345 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); 346 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 347 nbp = LIST_NEXT(bp, b_vnbufs); 348 KASSERT((bp->b_cflags & BC_BUSY) == 0); 349 if (bp->b_bcount < fs->fs_bsize) { 350 bp->b_cflags |= BC_BUSY; 351 brelsel(bp, BC_INVAL | BC_VFLUSH); 352 } 353 } 354 mutex_exit(&bufcache_lock); 355 356out: 357 if (sbbuf != NULL) { 358 free(copy_fs->fs_csp, M_UFSMNT); 359 free(sbbuf, M_UFSMNT); 360 } 361 if (fs->fs_active != NULL) { 362 free(fs->fs_active, M_DEVBUF); 363 fs->fs_active = NULL; 364 } 365 366 mutex_enter(&si->si_lock); 367 if (snaplist != NULL) { 368 if (si->si_snapblklist == snaplist) 369 si->si_snapblklist = NULL; 370 free(snaplist, M_UFSMNT); 371 } 372 if (error) { 373 fs->fs_snapinum[snaploc] = 0; 374 } else { 375 /* 376 * As this is the newest list, it is the most inclusive, so 377 * should replace the previous list. 378 */ 379 si->si_snapblklist = ip->i_snapblklist; 380 } 381 si->si_gen++; 382 mutex_exit(&si->si_lock); 383 384 if (suspended) { 385 VOP_UNLOCK(vp); 386 vfs_resume(vp->v_mount); 387 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 388#ifdef DEBUG 389 getmicrotime(&endtime); 390 timersub(&endtime, &starttime, &endtime); 391 printf("%s: suspended %lld.%03d sec, redo %d of %d\n", 392 mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec, 393 endtime.tv_usec / 1000, redo, fs->fs_ncg); 394#endif 395 } 396 if (error) { 397 if (!UFS_WAPBL_BEGIN(mp)) { 398 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 399 UFS_WAPBL_END(mp); 400 } 401 } else if (ip->i_nlink > 0) 402 vref(vp); 403 return (error); 404} 405 406/* 407 * Prepare vnode to become a snapshot. 
408 */ 409static int 410snapshot_setup(struct mount *mp, struct vnode *vp) 411{ 412 int error, n, len, loc, cg; 413 daddr_t blkno, numblks; 414 struct buf *ibp, *nbp; 415 struct fs *fs = VFSTOUFS(mp)->um_fs; 416 struct lwp *l = curlwp; 417 const int wbreak = blocks_in_journal(fs)/8; 418 struct inode *ip = VTOI(vp); 419 420 /* 421 * Check mount, exclusive reference and owner. 422 */ 423 if (vp->v_mount != mp) 424 return EXDEV; 425 if (vp->v_usecount != 1 || vp->v_writecount != 0) 426 return EBUSY; 427 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 428 NULL) != 0 && 429 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred)) 430 return EACCES; 431 432 if (vp->v_size != 0) { 433 error = ffs_truncate(vp, 0, 0, NOCRED); 434 if (error) 435 return error; 436 } 437 438 /* Change inode to snapshot type file. */ 439 error = UFS_WAPBL_BEGIN(mp); 440 if (error) 441 return error; 442#if defined(QUOTA) || defined(QUOTA2) 443 /* shapshot inodes are not accounted in quotas */ 444 chkiq(ip, -1, l->l_cred, 0); 445#endif 446 ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL); 447 DIP_ASSIGN(ip, flags, ip->i_flags); 448 ip->i_flag |= IN_CHANGE | IN_UPDATE; 449 ffs_update(vp, NULL, NULL, UPDATE_WAIT); 450 UFS_WAPBL_END(mp); 451 452 KASSERT(ip->i_flags & SF_SNAPSHOT); 453 /* 454 * Write an empty list of preallocated blocks to the end of 455 * the snapshot to set size to at least that of the filesystem. 456 */ 457 numblks = howmany(fs->fs_size, fs->fs_frag); 458 blkno = 1; 459 blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs)); 460 error = vn_rdwr(UIO_WRITE, vp, 461 (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 462 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 463 if (error) 464 return error; 465 /* 466 * Preallocate critical data structures so that we can copy 467 * them in without further allocation after we suspend all 468 * operations on the filesystem. 
We would like to just release 469 * the allocated buffers without writing them since they will 470 * be filled in below once we are ready to go, but this upsets 471 * the soft update code, so we go ahead and write the new buffers. 472 * 473 * Allocate all indirect blocks and mark all of them as not 474 * needing to be copied. 475 */ 476 error = UFS_WAPBL_BEGIN(mp); 477 if (error) 478 return error; 479 for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) { 480 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 481 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 482 if (error) 483 goto out; 484 brelse(ibp, 0); 485 if (wbreak > 0 && (++n % wbreak) == 0) { 486 UFS_WAPBL_END(mp); 487 error = UFS_WAPBL_BEGIN(mp); 488 if (error) 489 return error; 490 } 491 } 492 /* 493 * Allocate copies for the superblock and its summary information. 494 */ 495 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred, 496 0, &nbp); 497 if (error) 498 goto out; 499 bawrite(nbp); 500 blkno = fragstoblks(fs, fs->fs_csaddr); 501 len = howmany(fs->fs_cssize, fs->fs_bsize); 502 for (loc = 0; loc < len; loc++) { 503 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), 504 fs->fs_bsize, l->l_cred, 0, &nbp); 505 if (error) 506 goto out; 507 bawrite(nbp); 508 if (wbreak > 0 && (++n % wbreak) == 0) { 509 UFS_WAPBL_END(mp); 510 error = UFS_WAPBL_BEGIN(mp); 511 if (error) 512 return error; 513 } 514 } 515 /* 516 * Allocate all cylinder group blocks. 517 */ 518 for (cg = 0; cg < fs->fs_ncg; cg++) { 519 error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 520 fs->fs_bsize, l->l_cred, 0, &nbp); 521 if (error) 522 goto out; 523 bawrite(nbp); 524 if (wbreak > 0 && (++n % wbreak) == 0) { 525 UFS_WAPBL_END(mp); 526 error = UFS_WAPBL_BEGIN(mp); 527 if (error) 528 return error; 529 } 530 } 531 532out: 533 UFS_WAPBL_END(mp); 534 return error; 535} 536 537/* 538 * Create a copy of the superblock and its summary information. 
539 * It is up to the caller to free copyfs and copy_fs->fs_csp. 540 */ 541static int 542snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf) 543{ 544 int error, i, len, loc, size; 545 void *space; 546 int32_t *lp; 547 struct buf *bp; 548 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 549 struct lwp *l = curlwp; 550 struct vnode *devvp = VTOI(vp)->i_devvp; 551 552 /* 553 * Grab a copy of the superblock and its summary information. 554 * We delay writing it until the suspension is released below. 555 */ 556 *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 557 loc = blkoff(fs, fs->fs_sblockloc); 558 if (loc > 0) 559 memset(*sbbuf, 0, loc); 560 copyfs = (struct fs *)((char *)(*sbbuf) + loc); 561 memcpy(copyfs, fs, fs->fs_sbsize); 562 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 563 if (fs->fs_sbsize < size) 564 memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, 565 size - fs->fs_sbsize); 566 size = blkroundup(fs, fs->fs_cssize); 567 if (fs->fs_contigsumsize > 0) 568 size += fs->fs_ncg * sizeof(int32_t); 569 space = malloc(size, M_UFSMNT, M_WAITOK); 570 copyfs->fs_csp = space; 571 memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize); 572 space = (char *)space + fs->fs_cssize; 573 loc = howmany(fs->fs_cssize, fs->fs_fsize); 574 i = fs->fs_frag - loc % fs->fs_frag; 575 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 576 if (len > 0) { 577 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 578 len, l->l_cred, 0, &bp)) != 0) { 579 brelse(bp, 0); 580 free(copyfs->fs_csp, M_UFSMNT); 581 free(*sbbuf, M_UFSMNT); 582 *sbbuf = NULL; 583 return error; 584 } 585 memcpy(space, bp->b_data, (u_int)len); 586 space = (char *)space + len; 587 brelse(bp, BC_INVAL | BC_NOCACHE); 588 } 589 if (fs->fs_contigsumsize > 0) { 590 copyfs->fs_maxcluster = lp = space; 591 for (i = 0; i < fs->fs_ncg; i++) 592 *lp++ = fs->fs_contigsumsize; 593 } 594 if (mp->mnt_wapbl) 595 copyfs->fs_flags &= ~FS_DOWAPBL; 596 return 0; 597} 598 599/* 600 * We must check for active files that have been unlinked (e.g., with a zero 601 * link count). We have to expunge all trace of these files from the snapshot 602 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped. 603 * Note that we skip unlinked snapshot files as they will be handled separately. 604 * Calculate the snapshot list size and create a preliminary list. 605 */ 606static int 607snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs, 608 daddr_t *snaplistsize, daddr_t **snaplist) 609{ 610 int cg, error = 0, len, loc; 611 daddr_t blkno, *blkp; 612 struct fs *fs = VFSTOUFS(mp)->um_fs; 613 struct inode *xp; 614 struct lwp *l = curlwp; 615 struct vattr vat; 616 struct vnode *logvp = NULL, *mvp = NULL, *xvp; 617 618 *snaplist = NULL; 619 /* 620 * Get the log inode if any. 621 */ 622 if ((fs->fs_flags & FS_DOWAPBL) && 623 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { 624 error = VFS_VGET(mp, 625 fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp); 626 if (error) 627 goto out; 628 } 629 /* 630 * Allocate a marker vnode. 631 */ 632 mvp = vnalloc(mp); 633 /* 634 * We also calculate the needed size for the snapshot list. 
635 */ 636 *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 637 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 638 mutex_enter(&mntvnode_lock); 639 /* 640 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 641 * and vclean() can be called indirectly 642 */ 643 for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { 644 vmark(mvp, xvp); 645 /* 646 * Make sure this vnode wasn't reclaimed in getnewvnode(). 647 * Start over if it has (it won't be on the list anymore). 648 */ 649 if (xvp->v_mount != mp || vismarker(xvp)) 650 continue; 651 mutex_enter(xvp->v_interlock); 652 if ((xvp->v_iflag & VI_XLOCK) || 653 xvp->v_usecount == 0 || xvp->v_type == VNON || 654 VTOI(xvp) == NULL || 655 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 656 mutex_exit(xvp->v_interlock); 657 continue; 658 } 659 mutex_exit(&mntvnode_lock); 660 /* 661 * XXXAD should increase vnode ref count to prevent it 662 * disappearing or being recycled. 663 */ 664 mutex_exit(xvp->v_interlock); 665#ifdef DEBUG 666 if (snapdebug) 667 vprint("ffs_snapshot: busy vnode", xvp); 668#endif 669 xp = VTOI(xvp); 670 if (xvp != logvp) { 671 if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && 672 vat.va_nlink > 0) { 673 mutex_enter(&mntvnode_lock); 674 continue; 675 } 676 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 677 mutex_enter(&mntvnode_lock); 678 continue; 679 } 680 } 681 /* 682 * If there is a fragment, clear it here. 
683 */ 684 blkno = 0; 685 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 686 if (loc < NDADDR) { 687 len = fragroundup(fs, blkoff(fs, xp->i_size)); 688 if (len > 0 && len < fs->fs_bsize) { 689 error = UFS_WAPBL_BEGIN(mp); 690 if (error) { 691 (void)vunmark(mvp); 692 goto out; 693 } 694 ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc), 695 len, xp->i_number); 696 blkno = db_get(xp, loc); 697 db_assign(xp, loc, 0); 698 UFS_WAPBL_END(mp); 699 } 700 } 701 *snaplistsize += 1; 702 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); 703 if (blkno) 704 db_assign(xp, loc, blkno); 705 if (!error) { 706 error = UFS_WAPBL_BEGIN(mp); 707 if (!error) { 708 error = ffs_freefile_snap(copy_fs, vp, 709 xp->i_number, xp->i_mode); 710 UFS_WAPBL_END(mp); 711 } 712 } 713 if (error) { 714 (void)vunmark(mvp); 715 goto out; 716 } 717 mutex_enter(&mntvnode_lock); 718 } 719 mutex_exit(&mntvnode_lock); 720 /* 721 * Create a preliminary list of preallocated snapshot blocks. 722 */ 723 *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 724 blkp = &(*snaplist)[1]; 725 *blkp++ = lblkno(fs, fs->fs_sblockloc); 726 blkno = fragstoblks(fs, fs->fs_csaddr); 727 for (cg = 0; cg < fs->fs_ncg; cg++) { 728 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 729 break; 730 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 731 } 732 len = howmany(fs->fs_cssize, fs->fs_bsize); 733 for (loc = 0; loc < len; loc++) 734 *blkp++ = blkno + loc; 735 for (; cg < fs->fs_ncg; cg++) 736 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 737 (*snaplist)[0] = blkp - &(*snaplist)[0]; 738 739out: 740 if (mvp != NULL) 741 vnfree(mvp); 742 if (logvp != NULL) 743 vput(logvp); 744 if (error && *snaplist != NULL) { 745 free(*snaplist, M_UFSMNT); 746 *snaplist = NULL; 747 } 748 749 return error; 750} 751 752/* 753 * Copy allocation information from all the snapshots in this snapshot and 754 * then expunge them from its view. Also, collect the list of allocated 755 * blocks in i_snapblklist. 
756 */ 757static int 758snapshot_expunge_snap(struct mount *mp, struct vnode *vp, 759 struct fs *copy_fs, daddr_t snaplistsize) 760{ 761 int error = 0, i; 762 daddr_t numblks, *snaplist = NULL; 763 struct fs *fs = VFSTOUFS(mp)->um_fs; 764 struct inode *ip = VTOI(vp), *xp; 765 struct lwp *l = curlwp; 766 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 767 768 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { 769 if (xp != ip) { 770 error = expunge(vp, xp, fs, snapacct, BLK_SNAP); 771 if (error) 772 break; 773 } 774 if (xp->i_nlink != 0) 775 continue; 776 error = UFS_WAPBL_BEGIN(mp); 777 if (error) 778 break; 779 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); 780 UFS_WAPBL_END(mp); 781 if (error) 782 break; 783 } 784 if (error) 785 goto out; 786 /* 787 * Allocate space for the full list of preallocated snapshot blocks. 788 */ 789 snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 790 ip->i_snapblklist = &snaplist[1]; 791 /* 792 * Expunge the blocks used by the snapshots from the set of 793 * blocks marked as used in the snapshot bitmaps. Also, collect 794 * the list of allocated blocks in i_snapblklist. 795 */ 796 error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); 797 if (error) 798 goto out; 799 if (snaplistsize < ip->i_snapblklist - snaplist) 800 panic("ffs_snapshot: list too small"); 801 snaplistsize = ip->i_snapblklist - snaplist; 802 snaplist[0] = snaplistsize; 803 ip->i_snapblklist = &snaplist[0]; 804 /* 805 * Write out the list of allocated blocks to the end of the snapshot. 
806 */ 807 numblks = howmany(fs->fs_size, fs->fs_frag); 808 for (i = 0; i < snaplistsize; i++) 809 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 810 error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist, 811 snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks), 812 UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL); 813 for (i = 0; i < snaplistsize; i++) 814 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 815out: 816 if (error && snaplist != NULL) { 817 free(snaplist, M_UFSMNT); 818 ip->i_snapblklist = NULL; 819 } 820 return error; 821} 822 823/* 824 * Write the superblock and its summary information to the snapshot. 825 * Make sure, the first NDADDR blocks get copied to the snapshot. 826 */ 827static int 828snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf) 829{ 830 int error, len, loc; 831 void *space; 832 daddr_t blkno; 833 struct buf *bp; 834 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 835 struct inode *ip = VTOI(vp); 836 struct lwp *l = curlwp; 837 838 copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); 839 840 /* 841 * Write the superblock and its summary information 842 * to the snapshot. 
843 */ 844 blkno = fragstoblks(fs, fs->fs_csaddr); 845 len = howmany(fs->fs_cssize, fs->fs_bsize); 846 space = copyfs->fs_csp; 847#ifdef FFS_EI 848 if (UFS_FSNEEDSWAP(fs)) { 849 ffs_sb_swap(copyfs, copyfs); 850 ffs_csum_swap(space, space, fs->fs_cssize); 851 } 852#endif 853 error = UFS_WAPBL_BEGIN(mp); 854 if (error) 855 return error; 856 for (loc = 0; loc < len; loc++) { 857 error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred, 858 B_MODIFY, &bp); 859 if (error) { 860 brelse(bp, 0); 861 break; 862 } 863 memcpy(bp->b_data, space, fs->fs_bsize); 864 space = (char *)space + fs->fs_bsize; 865 bawrite(bp); 866 } 867 if (error) 868 goto out; 869 error = bread(vp, lblkno(fs, fs->fs_sblockloc), 870 fs->fs_bsize, l->l_cred, B_MODIFY, &bp); 871 if (error) { 872 brelse(bp, 0); 873 goto out; 874 } else { 875 memcpy(bp->b_data, sbbuf, fs->fs_bsize); 876 bawrite(bp); 877 } 878 /* 879 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite() 880 * and ffs_snapblkfree() will always work on indirect blocks. 881 */ 882 for (loc = 0; loc < NDADDR; loc++) { 883 if (db_get(ip, loc) != 0) 884 continue; 885 error = ffs_balloc(vp, lblktosize(fs, (off_t)loc), 886 fs->fs_bsize, l->l_cred, 0, &bp); 887 if (error) 888 break; 889 error = rwfsblk(vp, B_READ, bp->b_data, loc); 890 if (error) { 891 brelse(bp, 0); 892 break; 893 } 894 bawrite(bp); 895 } 896 897out: 898 UFS_WAPBL_END(mp); 899 return error; 900} 901 902/* 903 * Copy all cylinder group maps. 
904 */ 905static int 906cgaccount(struct vnode *vp, int passno, int *redo) 907{ 908 int cg, error = 0; 909 struct buf *nbp; 910 struct fs *fs = VTOI(vp)->i_fs; 911 912 if (redo != NULL) 913 *redo = 0; 914 if (passno == 1) 915 fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY), 916 M_DEVBUF, M_WAITOK | M_ZERO); 917 for (cg = 0; cg < fs->fs_ncg; cg++) { 918 if (passno == 2 && ACTIVECG_ISSET(fs, cg)) 919 continue; 920 921 if (redo != NULL) 922 *redo += 1; 923 error = UFS_WAPBL_BEGIN(vp->v_mount); 924 if (error) 925 return error; 926 error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 927 fs->fs_bsize, curlwp->l_cred, 0, &nbp); 928 if (error) { 929 UFS_WAPBL_END(vp->v_mount); 930 break; 931 } 932 error = cgaccount1(cg, vp, nbp->b_data, passno); 933 bawrite(nbp); 934 UFS_WAPBL_END(vp->v_mount); 935 if (error) 936 break; 937 } 938 return error; 939} 940 941/* 942 * Copy a cylinder group map. All the unallocated blocks are marked 943 * BLK_NOCOPY so that the snapshot knows that it need not copy them 944 * if they are later written. If passno is one, then this is a first 945 * pass, so only setting needs to be done. If passno is 2, then this 946 * is a revision to a previous pass which must be undone as the 947 * replacement pass is done. 
948 */ 949static int 950cgaccount1(int cg, struct vnode *vp, void *data, int passno) 951{ 952 struct buf *bp, *ibp; 953 struct inode *ip; 954 struct cg *cgp; 955 struct fs *fs; 956 struct lwp *l = curlwp; 957 daddr_t base, numblks; 958 int error, len, loc, ns, indiroff; 959 960 ip = VTOI(vp); 961 fs = ip->i_fs; 962 ns = UFS_FSNEEDSWAP(fs); 963 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 964 (int)fs->fs_cgsize, l->l_cred, 0, &bp); 965 if (error) { 966 brelse(bp, 0); 967 return (error); 968 } 969 cgp = (struct cg *)bp->b_data; 970 if (!cg_chkmagic(cgp, ns)) { 971 brelse(bp, 0); 972 return (EIO); 973 } 974 ACTIVECG_SET(fs, cg); 975 976 memcpy(data, bp->b_data, fs->fs_cgsize); 977 brelse(bp, 0); 978 if (fs->fs_cgsize < fs->fs_bsize) 979 memset((char *)data + fs->fs_cgsize, 0, 980 fs->fs_bsize - fs->fs_cgsize); 981 numblks = howmany(fs->fs_size, fs->fs_frag); 982 len = howmany(fs->fs_fpg, fs->fs_frag); 983 base = cg * fs->fs_fpg / fs->fs_frag; 984 if (base + len >= numblks) 985 len = numblks - base - 1; 986 loc = 0; 987 if (base < NDADDR) { 988 for ( ; loc < NDADDR; loc++) { 989 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 990 db_assign(ip, loc, BLK_NOCOPY); 991 else if (db_get(ip, loc) == BLK_NOCOPY) { 992 if (passno == 2) 993 db_assign(ip, loc, 0); 994 else if (passno == 1) 995 panic("ffs_snapshot: lost direct block"); 996 } 997 } 998 } 999 if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), 1000 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 1001 return (error); 1002 indiroff = (base + loc - NDADDR) % NINDIR(fs); 1003 for ( ; loc < len; loc++, indiroff++) { 1004 if (indiroff >= NINDIR(fs)) { 1005 bawrite(ibp); 1006 if ((error = ffs_balloc(vp, 1007 lblktosize(fs, (off_t)(base + loc)), 1008 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 1009 return (error); 1010 indiroff = 0; 1011 } 1012 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 1013 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 1014 else if (idb_get(ip, ibp->b_data, 
indiroff) == BLK_NOCOPY) { 1015 if (passno == 2) 1016 idb_assign(ip, ibp->b_data, indiroff, 0); 1017 else if (passno == 1) 1018 panic("ffs_snapshot: lost indirect block"); 1019 } 1020 } 1021 bdwrite(ibp); 1022 return (0); 1023} 1024 1025/* 1026 * Before expunging a snapshot inode, note all the 1027 * blocks that it claims with BLK_SNAP so that fsck will 1028 * be able to account for those blocks properly and so 1029 * that this snapshot knows that it need not copy them 1030 * if the other snapshot holding them is freed. 1031 */ 1032static int 1033expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 1034 acctfunc_t acctfunc, int expungetype) 1035{ 1036 int i, error, ns; 1037 daddr_t lbn, rlbn; 1038 daddr_t len, blkno, numblks, blksperindir; 1039 struct ufs1_dinode *dip1; 1040 struct ufs2_dinode *dip2; 1041 struct lwp *l = curlwp; 1042 void *bap; 1043 struct buf *bp; 1044 struct mount *mp; 1045 1046 ns = UFS_FSNEEDSWAP(fs); 1047 mp = snapvp->v_mount; 1048 1049 error = UFS_WAPBL_BEGIN(mp); 1050 if (error) 1051 return error; 1052 /* 1053 * Prepare to expunge the inode. If its inode block has not 1054 * yet been copied, then allocate and fill the copy. 1055 */ 1056 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1057 error = snapblkaddr(snapvp, lbn, &blkno); 1058 if (error) 1059 return error; 1060 if (blkno != 0) { 1061 error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred, 1062 B_MODIFY, &bp); 1063 } else { 1064 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 1065 fs->fs_bsize, l->l_cred, 0, &bp); 1066 if (! error) 1067 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); 1068 } 1069 if (error) { 1070 UFS_WAPBL_END(mp); 1071 return error; 1072 } 1073 /* 1074 * Set a snapshot inode to be a zero length file, regular files 1075 * or unlinked snapshots to be completely unallocated. 
1076 */ 1077 if (fs->fs_magic == FS_UFS1_MAGIC) { 1078 dip1 = (struct ufs1_dinode *)bp->b_data + 1079 ino_to_fsbo(fs, cancelip->i_number); 1080 if (cancelip->i_flags & SF_SNAPSHOT) { 1081 dip1->di_flags = 1082 ufs_rw32(ufs_rw32(dip1->di_flags, ns) | 1083 SF_SNAPINVAL, ns); 1084 } 1085 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1086 dip1->di_mode = 0; 1087 dip1->di_size = 0; 1088 dip1->di_blocks = 0; 1089 memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t)); 1090 } else { 1091 dip2 = (struct ufs2_dinode *)bp->b_data + 1092 ino_to_fsbo(fs, cancelip->i_number); 1093 if (cancelip->i_flags & SF_SNAPSHOT) { 1094 dip2->di_flags = 1095 ufs_rw32(ufs_rw32(dip2->di_flags, ns) | 1096 SF_SNAPINVAL, ns); 1097 } 1098 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1099 dip2->di_mode = 0; 1100 dip2->di_size = 0; 1101 dip2->di_blocks = 0; 1102 memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t)); 1103 } 1104 bdwrite(bp); 1105 UFS_WAPBL_END(mp); 1106 /* 1107 * Now go through and expunge all the blocks in the file 1108 * using the function requested. 
1109 */ 1110 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1111 if (fs->fs_magic == FS_UFS1_MAGIC) 1112 bap = &cancelip->i_ffs1_db[0]; 1113 else 1114 bap = &cancelip->i_ffs2_db[0]; 1115 error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype); 1116 if (error) 1117 return (error); 1118 if (fs->fs_magic == FS_UFS1_MAGIC) 1119 bap = &cancelip->i_ffs1_ib[0]; 1120 else 1121 bap = &cancelip->i_ffs2_ib[0]; 1122 error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype); 1123 if (error) 1124 return (error); 1125 blksperindir = 1; 1126 lbn = -NDADDR; 1127 len = numblks - NDADDR; 1128 rlbn = NDADDR; 1129 for (i = 0; len > 0 && i < NIADDR; i++) { 1130 error = indiracct(snapvp, ITOV(cancelip), i, 1131 ib_get(cancelip, i), lbn, rlbn, len, 1132 blksperindir, fs, acctfunc, expungetype); 1133 if (error) 1134 return (error); 1135 blksperindir *= NINDIR(fs); 1136 lbn -= blksperindir + 1; 1137 len -= blksperindir; 1138 rlbn += blksperindir; 1139 } 1140 return (0); 1141} 1142 1143/* 1144 * Descend an indirect block chain for vnode cancelvp accounting for all 1145 * its indirect blocks in snapvp. 1146 */ 1147static int 1148indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, 1149 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, 1150 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) 1151{ 1152 int error, num, i; 1153 daddr_t subblksperindir; 1154 struct indir indirs[NIADDR + 2]; 1155 daddr_t last; 1156 void *bap; 1157 struct buf *bp; 1158 1159 if (blkno == 0) { 1160 if (expungetype == BLK_NOCOPY) 1161 return (0); 1162 panic("indiracct: missing indir"); 1163 } 1164 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1165 return (error); 1166 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1167 panic("indiracct: botched params"); 1168 /* 1169 * We have to expand bread here since it will deadlock looking 1170 * up the block number for any blocks that are not in the cache. 
1171 */ 1172 error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, 1173 false, &bp); 1174 if (error) 1175 return error; 1176 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 1177 rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { 1178 brelse(bp, 0); 1179 return (error); 1180 } 1181 /* 1182 * Account for the block pointers in this indirect block. 1183 */ 1184 last = howmany(remblks, blksperindir); 1185 if (last > NINDIR(fs)) 1186 last = NINDIR(fs); 1187 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); 1188 memcpy((void *)bap, bp->b_data, fs->fs_bsize); 1189 brelse(bp, 0); 1190 error = (*acctfunc)(snapvp, bap, 0, last, 1191 fs, level == 0 ? rlbn : -1, expungetype); 1192 if (error || level == 0) 1193 goto out; 1194 /* 1195 * Account for the block pointers in each of the indirect blocks 1196 * in the levels below us. 1197 */ 1198 subblksperindir = blksperindir / NINDIR(fs); 1199 for (lbn++, level--, i = 0; i < last; i++) { 1200 error = indiracct(snapvp, cancelvp, level, 1201 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, 1202 subblksperindir, fs, acctfunc, expungetype); 1203 if (error) 1204 goto out; 1205 rlbn += blksperindir; 1206 lbn -= blksperindir; 1207 remblks -= blksperindir; 1208 } 1209out: 1210 free(bap, M_DEVBUF); 1211 return (error); 1212} 1213 1214/* 1215 * Do both snap accounting and map accounting. 1216 */ 1217static int 1218fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1219 struct fs *fs, daddr_t lblkno, 1220 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1221{ 1222 int error; 1223 1224 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) 1225 return (error); 1226 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); 1227} 1228 1229/* 1230 * Identify a set of blocks allocated in a snapshot inode. 
1231 */ 1232static int 1233snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1234 struct fs *fs, daddr_t lblkno, 1235 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1236{ 1237 struct inode *ip = VTOI(vp); 1238 struct lwp *l = curlwp; 1239 struct mount *mp = vp->v_mount; 1240 daddr_t blkno; 1241 daddr_t lbn; 1242 struct buf *ibp; 1243 int error, n; 1244 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; 1245 1246 error = UFS_WAPBL_BEGIN(mp); 1247 if (error) 1248 return error; 1249 for ( n = 0; oldblkp < lastblkp; oldblkp++) { 1250 blkno = idb_get(ip, bap, oldblkp); 1251 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1252 continue; 1253 lbn = fragstoblks(fs, blkno); 1254 if (lbn < NDADDR) { 1255 blkno = db_get(ip, lbn); 1256 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1257 } else { 1258 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1259 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1260 if (error) 1261 break; 1262 blkno = idb_get(ip, ibp->b_data, 1263 (lbn - NDADDR) % NINDIR(fs)); 1264 } 1265 /* 1266 * If we are expunging a snapshot vnode and we 1267 * find a block marked BLK_NOCOPY, then it is 1268 * one that has been allocated to this snapshot after 1269 * we took our current snapshot and can be ignored. 1270 */ 1271 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1272 if (lbn >= NDADDR) 1273 brelse(ibp, 0); 1274 } else { 1275 if (blkno != 0) 1276 panic("snapacct: bad block"); 1277 if (lbn < NDADDR) 1278 db_assign(ip, lbn, expungetype); 1279 else { 1280 idb_assign(ip, ibp->b_data, 1281 (lbn - NDADDR) % NINDIR(fs), expungetype); 1282 bdwrite(ibp); 1283 } 1284 } 1285 if (wbreak > 0 && (++n % wbreak) == 0) { 1286 UFS_WAPBL_END(mp); 1287 error = UFS_WAPBL_BEGIN(mp); 1288 if (error) 1289 return error; 1290 } 1291 } 1292 UFS_WAPBL_END(mp); 1293 return error; 1294} 1295 1296/* 1297 * Account for a set of blocks allocated in a snapshot inode. 
1298 */ 1299static int 1300mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1301 struct fs *fs, daddr_t lblkno, int expungetype) 1302{ 1303 daddr_t blkno; 1304 struct inode *ip; 1305 struct mount *mp = vp->v_mount; 1306 ino_t inum; 1307 int acctit, error, n; 1308 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; 1309 1310 error = UFS_WAPBL_BEGIN(mp); 1311 if (error) 1312 return error; 1313 ip = VTOI(vp); 1314 inum = ip->i_number; 1315 if (lblkno == -1) 1316 acctit = 0; 1317 else 1318 acctit = 1; 1319 for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) { 1320 blkno = idb_get(ip, bap, oldblkp); 1321 if (blkno == 0 || blkno == BLK_NOCOPY) 1322 continue; 1323 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1324 *ip->i_snapblklist++ = lblkno; 1325 if (blkno == BLK_SNAP) 1326 blkno = blkstofrags(fs, lblkno); 1327 ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum); 1328 if (wbreak > 0 && (++n % wbreak) == 0) { 1329 UFS_WAPBL_END(mp); 1330 error = UFS_WAPBL_BEGIN(mp); 1331 if (error) 1332 return error; 1333 } 1334 } 1335 UFS_WAPBL_END(mp); 1336 return (0); 1337} 1338 1339/* 1340 * Number of blocks that fit into the journal or zero if not logging. 1341 */ 1342static int 1343blocks_in_journal(struct fs *fs) 1344{ 1345 off_t bpj; 1346 1347 if ((fs->fs_flags & FS_DOWAPBL) == 0) 1348 return 0; 1349 bpj = 1; 1350 if (fs->fs_journal_version == UFS_WAPBL_VERSION) { 1351 switch (fs->fs_journal_location) { 1352 case UFS_WAPBL_JOURNALLOC_END_PARTITION: 1353 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]* 1354 fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; 1355 break; 1356 case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: 1357 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]* 1358 fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; 1359 break; 1360 } 1361 } 1362 bpj /= fs->fs_bsize; 1363 return (bpj > 0 ? bpj : 1); 1364} 1365#endif /* defined(FFS_NO_SNAPSHOT) */ 1366 1367/* 1368 * Decrement extra reference on snapshot when last name is removed. 
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	/* xp is NULL when the loop ran to the end without a match. */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the following entries down by one slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	/* Tell list walkers that dropped si_lock to start over. */
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 *
 * Takes the snapshot off the active list, releases its block hints
 * list, clears the BLK_NOCOPY/BLK_SNAP markers in its block pointers,
 * offers claimed blocks to ffs_snapblkfree() and finally clears the
 * snapshot flags on the inode.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct snap_info *si;
	struct lwp *l = curlwp;
	daddr_t numblks, blkno, dblk;
	int error, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip)) {
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
		} else {
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			/* Called only after both locks have been dropped. */
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
		}
		if (ip->i_snapblklist != NULL) {
			free(ip->i_snapblklist, M_UFSMNT);
			ip->i_snapblklist = NULL;
		}
	} else {
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	/* The scan of the direct pointers starts at index 1, not 0. */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	/* One pass per indirect block; blkno is its first logical block. */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		/* Best effort: an indirect block we cannot get is skipped. */
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			/*
			 * NOTE(review): the comparison uses blkno, the first
			 * logical block of this indirect block, and not
			 * blkno + loc -- confirm this is intended.
			 */
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
		/*
		 * Close and reopen the journal transaction after each
		 * indirect block.  This presumes the caller entered with
		 * a transaction open -- TODO confirm against callers.
		 */
		UFS_WAPBL_END(mp);
		error = UFS_WAPBL_BEGIN(mp);
		KASSERT(error == 0);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
	chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
	chkiq(ip, 1, l->l_cred, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn;
	daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = fragstoblks(fs, bno);
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	/* ffs_copyonwrite() checks si_owner before taking si_snaplock. */
	si->si_owner = curlwp;

retry:
	/*
	 * si_lock is dropped around every operation below that may
	 * sleep.  si_gen is sampled here and compared after si_lock is
	 * taken again; if it moved the list walk starts over.
	 */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				/* Synchronous write while the snapshot has a name. */
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    "from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
#endif
			mutex_exit(&si->si_lock);
			if (lbn < NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (ip->i_nlink > 0 && mp->mnt_wapbl)
				error = syncsnap(vp);
			else
				error = 0;
			mutex_enter(&si->si_lock);
			si->si_owner = NULL;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			/* 1 (block claimed) unless syncsnap() failed. */
			return (error == 0);
		}
		if (lbn >= NDADDR)
			brelse(ibp, 0);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
			    "Copyonremove: snapino ",
			    (unsigned long long)ip->i_number,
			    lbn, "for inum", (unsigned long long)inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			/* Read once, reuse for the remaining snapshots. */
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	si->si_owner = NULL;
	mutex_exit(&si->si_lock);
	mutex_exit(&si->si_snaplock);
	if (saved_data)
		free(saved_data, M_UFSMNT);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 *
 * Walks fs_snapinum[] in the superblock, drops entries whose inode
 * is not a valid snapshot, reads the block hints list of each valid
 * one, links it onto the active list and, if at least one was found,
 * establishes the copy-on-write handler.
 */
void
ffs_snapshot_mount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct snap_info *si;
	daddr_t snaplistsize, *snapblklist;
	int i, error, ns, snaploc, loc;

	/*
	 * No persistent snapshots on apple ufs file systems.
	 */
	if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
		return;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	mutex_enter(&si->si_lock);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		/* Accept only SF_SNAPSHOT without SF_SNAPINVAL. */
		if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
		    SF_SNAPSHOT) {
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			/* Close the gap so the list stays dense. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			/* Look at this slot again, it holds a new entry. */
			snaploc--;
			continue;
		}

		/*
		 * Read the block hints list. Use an empty list on
		 * read errors.
		 */
		/* First read: only the leading element, the list length. */
		error = vn_rdwr(UIO_READ, vp,
		    (void *)&snaplistsize, sizeof(snaplistsize),
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
		    l->l_cred, NULL, NULL);
		if (error) {
			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
			snaplistsize = 1;
		} else
			snaplistsize = ufs_rw64(snaplistsize, ns);
		snapblklist = malloc(
		    snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
		/* A list holding only its own length (1) has no hints. */
		if (error)
			snapblklist[0] = 1;
		else {
			/* Second read: the whole list, length included. */
			error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
			    snaplistsize * sizeof(daddr_t),
			    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
			    l->l_cred, NULL, NULL);
			for (i = 0; i < snaplistsize; i++)
				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
			if (error) {
				printf("ffs_snapshot_mount: read_2 failed %d\n",
				    error);
				snapblklist[0] = 1;
			}
		}
		ip->i_snapblklist = &snapblklist[0];

		/*
		 * Link it onto the active snapshot list.
		 */
		if (is_active_snapshot(si, ip))
			panic("ffs_snapshot_mount: %"PRIu64" already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VOP_UNLOCK(vp);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL) {
		mutex_exit(&si->si_lock);
		return;
	}
	/*
	 * Attach the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
	si->si_snapblklist = xp->i_snapblklist;
	fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct inode *xp;
	struct vnode *vp = NULL;
	struct snap_info *si;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	/* Empty the active list, freeing each block hints list. */
	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
		vp = ITOV(xp);
		TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
		if (xp->i_snapblklist == si->si_snapblklist)
			si->si_snapblklist = NULL;
		free(xp->i_snapblklist, M_UFSMNT);
		if (xp->i_nlink > 0) {
			/* vrele() is called with si_lock dropped. */
			si->si_gen++;
			mutex_exit(&si->si_lock);
			vrele(vp);
			mutex_enter(&si->si_lock);
		}
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
	/* vp is non-NULL only if the list held at least one snapshot. */
	if (vp)
		fscow_disestablish(mp, ffs_copyonwrite, devvp);
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 *	v		device vnode, as registered with fscow_establish()
 *	bp		buffer about to be written
 *	data_valid	true if bp->b_data already holds the old contents
 *
 * Returns 0 or an errno; ENOMEM when run by the pagedaemon and a copy
 * would be required.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system,
	 * in the journal or in the preallocated list.
	 * By doing these checks we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		off_t blk_off, log_start, log_end;

		/* Byte range [log_start, log_end) of the in-fs journal. */
		log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		blk_off = dbtob(bp->b_blkno);
		if (blk_off >= log_start && blk_off < log_end) {
			mutex_exit(&si->si_lock);
			return 0;
		}
	}
	/*
	 * Binary search of the hints list for lbn.  Element 0 holds the
	 * number of elements, so the search covers indices 1 .. [0]-1.
	 */
	snapblklist = si->si_snapblklist;
	upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	/* Still lower <= upper means the loop left through the break. */
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	if (si->si_owner != curlwp) {
		/*
		 * Try si_snaplock without blocking while si_lock is
		 * held; otherwise drop si_lock and take both in the
		 * order si_snaplock, si_lock.
		 */
		if (!mutex_tryenter(&si->si_snaplock)) {
			mutex_exit(&si->si_lock);
			mutex_enter(&si->si_snaplock);
			mutex_enter(&si->si_lock);
		}
		si->si_owner = curlwp;
		snapshot_locked = 1;
	}
	/* A full block with valid data needs no read from disk. */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/* Restart the walk if si_gen moves while si_lock is dropped. */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			blkno = 0; /* XXX: GCC */
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero: already copied, claimed or marked don't-care. */
		if (blkno != 0)
			continue;

		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}
		/* Only one level of recursion allowed. */
		KASSERT(snapshot_locked);
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (snapshot_locked) {
		si->si_owner = NULL;
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	} else
		mutex_exit(&si->si_lock);
	/* saved_data may alias bp->b_data, which is not ours to free. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	return error;
}

/*
 * Read from a snapshot.
2058 */ 2059int 2060ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag) 2061{ 2062 struct inode *ip = VTOI(vp); 2063 struct fs *fs = ip->i_fs; 2064 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo; 2065 struct buf *bp; 2066 daddr_t lbn, nextlbn; 2067 off_t fsbytes, bytesinfile; 2068 long size, xfersize, blkoffset; 2069 int error; 2070 2071 fstrans_start(vp->v_mount, FSTRANS_SHARED); 2072 mutex_enter(&si->si_snaplock); 2073 2074 if (ioflag & IO_ALTSEMANTICS) 2075 fsbytes = ip->i_size; 2076 else 2077 fsbytes = lfragtosize(fs, fs->fs_size); 2078 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 2079 bytesinfile = fsbytes - uio->uio_offset; 2080 if (bytesinfile <= 0) 2081 break; 2082 lbn = lblkno(fs, uio->uio_offset); 2083 nextlbn = lbn + 1; 2084 size = fs->fs_bsize; 2085 blkoffset = blkoff(fs, uio->uio_offset); 2086 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), 2087 bytesinfile); 2088 2089 if (lblktosize(fs, nextlbn + 1) >= fsbytes) { 2090 if (lblktosize(fs, lbn) + size > fsbytes) 2091 size = fragroundup(fs, 2092 fsbytes - lblktosize(fs, lbn)); 2093 error = bread(vp, lbn, size, NOCRED, 0, &bp); 2094 } else { 2095 int nextsize = fs->fs_bsize; 2096 error = breadn(vp, lbn, 2097 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); 2098 } 2099 if (error) 2100 break; 2101 2102 /* 2103 * We should only get non-zero b_resid when an I/O error 2104 * has occurred, which should cause us to break above. 2105 * However, if the short read did not cause an error, 2106 * then we want to ensure that we do not uiomove bad 2107 * or uninitialized data. 
2108 */ 2109 size -= bp->b_resid; 2110 if (size < blkoffset + xfersize) { 2111 xfersize = size - blkoffset; 2112 if (xfersize <= 0) 2113 break; 2114 } 2115 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 2116 if (error) 2117 break; 2118 brelse(bp, BC_AGE); 2119 } 2120 if (bp != NULL) 2121 brelse(bp, BC_AGE); 2122 2123 mutex_exit(&si->si_snaplock); 2124 fstrans_done(vp->v_mount); 2125 return error; 2126} 2127 2128/* 2129 * Lookup a snapshots data block address. 2130 * Simpler than UFS_BALLOC() as we know all metadata is already allocated 2131 * and safe even for the pagedaemon where we cannot bread(). 2132 */ 2133static int 2134snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res) 2135{ 2136 struct indir indirs[NIADDR + 2]; 2137 struct inode *ip = VTOI(vp); 2138 struct fs *fs = ip->i_fs; 2139 struct buf *bp; 2140 int error, num; 2141 2142 KASSERT(lbn >= 0); 2143 2144 if (lbn < NDADDR) { 2145 *res = db_get(ip, lbn); 2146 return 0; 2147 } 2148 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 2149 return error; 2150 if (curlwp == uvm.pagedaemon_lwp) { 2151 mutex_enter(&bufcache_lock); 2152 bp = incore(vp, indirs[num-1].in_lbn); 2153 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) { 2154 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); 2155 error = 0; 2156 } else 2157 error = ENOMEM; 2158 mutex_exit(&bufcache_lock); 2159 return error; 2160 } 2161 error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp); 2162 if (error == 0) 2163 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); 2164 brelse(bp, 0); 2165 2166 return error; 2167} 2168 2169/* 2170 * Read or write the specified block of the filesystem vp resides on 2171 * from or to the disk bypassing the buffer cache. 
2172 */ 2173static int 2174rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn) 2175{ 2176 int error; 2177 struct inode *ip = VTOI(vp); 2178 struct fs *fs = ip->i_fs; 2179 struct buf *nbp; 2180 2181 nbp = getiobuf(NULL, true); 2182 nbp->b_flags = flags; 2183 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 2184 nbp->b_error = 0; 2185 nbp->b_data = data; 2186 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 2187 nbp->b_proc = NULL; 2188 nbp->b_dev = ip->i_devvp->v_rdev; 2189 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ 2190 2191 bdev_strategy(nbp); 2192 2193 error = biowait(nbp); 2194 2195 putiobuf(nbp); 2196 2197 return error; 2198} 2199 2200/* 2201 * Write all dirty buffers to disk and invalidate them. 2202 */ 2203static int 2204syncsnap(struct vnode *vp) 2205{ 2206 int error; 2207 buf_t *bp; 2208 struct fs *fs = VTOI(vp)->i_fs; 2209 2210 mutex_enter(&bufcache_lock); 2211 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) { 2212 error = bbusy(bp, false, 0, NULL); 2213 if (error == EPASSTHROUGH) 2214 continue; 2215 else if (error != 0) { 2216 mutex_exit(&bufcache_lock); 2217 return error; 2218 } 2219 KASSERT(bp->b_bcount == fs->fs_bsize); 2220 mutex_exit(&bufcache_lock); 2221 error = rwfsblk(vp, B_WRITE, bp->b_data, 2222 fragstoblks(fs, dbtofsb(fs, bp->b_blkno))); 2223 brelse(bp, BC_INVAL | BC_VFLUSH); 2224 if (error) 2225 return error; 2226 mutex_enter(&bufcache_lock); 2227 } 2228 mutex_exit(&bufcache_lock); 2229 2230 return 0; 2231} 2232 2233/* 2234 * Write the specified block to a snapshot. 2235 */ 2236static int 2237wrsnapblk(struct vnode *vp, void *data, daddr_t lbn) 2238{ 2239 struct inode *ip = VTOI(vp); 2240 struct fs *fs = ip->i_fs; 2241 struct buf *bp; 2242 int error; 2243 2244 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, 2245 FSCRED, (ip->i_nlink > 0 ? 
B_SYNC : 0), &bp); 2246 if (error) 2247 return error; 2248 memcpy(bp->b_data, data, fs->fs_bsize); 2249 if (ip->i_nlink > 0) 2250 error = bwrite(bp); 2251 else 2252 bawrite(bp); 2253 2254 return error; 2255} 2256 2257/* 2258 * Check if this inode is present on the active snapshot list. 2259 * Must be called with snapinfo locked. 2260 */ 2261static inline bool 2262is_active_snapshot(struct snap_info *si, struct inode *ip) 2263{ 2264 struct inode *xp; 2265 2266 KASSERT(mutex_owned(&si->si_lock)); 2267 2268 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) 2269 if (xp == ip) 2270 return true; 2271 return false; 2272} 2273 2274/* 2275 * Get/Put direct block from inode or buffer containing disk addresses. Take 2276 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2277 * into a global include. 2278 */ 2279static inline daddr_t 2280db_get(struct inode *ip, int loc) 2281{ 2282 if (ip->i_ump->um_fstype == UFS1) 2283 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2284 else 2285 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2286} 2287 2288static inline void 2289db_assign(struct inode *ip, int loc, daddr_t val) 2290{ 2291 if (ip->i_ump->um_fstype == UFS1) 2292 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2293 else 2294 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2295} 2296 2297static inline daddr_t 2298ib_get(struct inode *ip, int loc) 2299{ 2300 if (ip->i_ump->um_fstype == UFS1) 2301 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip)); 2302 else 2303 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip)); 2304} 2305 2306static inline void 2307ib_assign(struct inode *ip, int loc, daddr_t val) 2308{ 2309 if (ip->i_ump->um_fstype == UFS1) 2310 ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2311 else 2312 ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2313} 2314 2315static inline daddr_t 2316idb_get(struct inode *ip, void *bf, int loc) 2317{ 2318 if (ip->i_ump->um_fstype == UFS1) 2319 
return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); 2320 else 2321 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); 2322} 2323 2324static inline void 2325idb_assign(struct inode *ip, void *bf, int loc, daddr_t val) 2326{ 2327 if (ip->i_ump->um_fstype == UFS1) 2328 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2329 else 2330 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2331} 2332