ffs_snapshot.c revision 1.75
1/* $NetBSD: ffs_snapshot.c,v 1.75 2008/08/22 10:48:22 hannken Exp $ */ 2 3/* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40#include <sys/cdefs.h> 41__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.75 2008/08/22 10:48:22 hannken Exp $"); 42 43#if defined(_KERNEL_OPT) 44#include "opt_ffs.h" 45#include "opt_wapbl.h" 46#endif 47 48#include <sys/param.h> 49#include <sys/kernel.h> 50#include <sys/systm.h> 51#include <sys/conf.h> 52#include <sys/buf.h> 53#include <sys/proc.h> 54#include <sys/namei.h> 55#include <sys/sched.h> 56#include <sys/stat.h> 57#include <sys/malloc.h> 58#include <sys/mount.h> 59#include <sys/resource.h> 60#include <sys/resourcevar.h> 61#include <sys/vnode.h> 62#include <sys/kauth.h> 63#include <sys/fstrans.h> 64#include <sys/wapbl.h> 65 66#include <miscfs/specfs/specdev.h> 67 68#include <ufs/ufs/quota.h> 69#include <ufs/ufs/ufsmount.h> 70#include <ufs/ufs/inode.h> 71#include <ufs/ufs/ufs_extern.h> 72#include <ufs/ufs/ufs_bswap.h> 73#include <ufs/ufs/ufs_wapbl.h> 74 75#include <ufs/ffs/fs.h> 76#include <ufs/ffs/ffs_extern.h> 77 78#include <uvm/uvm.h> 79 80/* FreeBSD -> NetBSD conversion */ 81#define KERNCRED lwp0.l_cred 82#define ufs1_daddr_t int32_t 83#define ufs2_daddr_t int64_t 84#define ufs_lbn_t daddr_t 85#define VI_MTX(v) (&(v)->v_interlock) 86#define VI_LOCK(v) mutex_enter(&(v)->v_interlock) 87#define VI_UNLOCK(v) mutex_exit(&(v)->v_interlock) 88#define MNT_ILOCK(v) mutex_enter(&mntvnode_lock) 89#define MNT_IUNLOCK(v) mutex_exit(&mntvnode_lock) 90 91#if !defined(FFS_NO_SNAPSHOT) 92static int cgaccount(int, struct vnode *, void *, int); 93static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 94 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 95 ufs_lbn_t, int), int); 96static int indiracct_ufs1(struct vnode *, struct vnode *, int, 97 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 98 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 99 ufs_lbn_t, 
int), int); 100static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 101 struct fs *, ufs_lbn_t, int); 102static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 103 struct fs *, ufs_lbn_t, int); 104static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 105 struct fs *, ufs_lbn_t, int); 106static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 107 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 108 ufs_lbn_t, int), int); 109static int indiracct_ufs2(struct vnode *, struct vnode *, int, 110 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 111 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 112 ufs_lbn_t, int), int); 113static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 114 struct fs *, ufs_lbn_t, int); 115static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 116 struct fs *, ufs_lbn_t, int); 117static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 118 struct fs *, ufs_lbn_t, int); 119#endif /* !defined(FFS_NO_SNAPSHOT) */ 120 121static int ffs_copyonwrite(void *, struct buf *, bool); 122static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); 123static int rwfsblk(struct vnode *, int, void *, ufs2_daddr_t); 124static int syncsnap(struct vnode *); 125static int wrsnapblk(struct vnode *, void *, ufs2_daddr_t); 126static inline ufs2_daddr_t db_get(struct inode *, int); 127static inline void db_assign(struct inode *, int, ufs2_daddr_t); 128static inline ufs2_daddr_t idb_get(struct inode *, void *, int); 129static inline void idb_assign(struct inode *, void *, int, ufs2_daddr_t); 130 131struct snap_info { 132 kmutex_t si_lock; /* Lock this snapinfo */ 133 kmutex_t si_snaplock; /* Snapshot vnode common lock */ 134 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 135 daddr_t *si_snapblklist; /* Snapshot block hints list */ 136 uint32_t si_gen; /* Incremented 
on change */
};

#ifdef DEBUG
/* Non-zero enables verbose snapshot diagnostics (busy-vnode vprint below). */
static int snapdebug = 0;
#endif

/*
 * Allocate and initialize the per-mount snapshot state (struct snap_info).
 * Called once per mount; torn down by ffs_snapshot_fini().
 * Returns 0 on success, ENOMEM if the allocation failed.
 */
int
ffs_snapshot_init(struct ufsmount *ump)
{
	struct snap_info *si;

	/*
	 * NOTE(review): kmem_alloc() with KM_SLEEP is documented not to
	 * return NULL on NetBSD, so the ENOMEM path below looks like dead
	 * code -- confirm against kmem(9) before relying on it.
	 */
	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
	if (si == NULL)
		return ENOMEM;

	TAILQ_INIT(&si->si_snapshots);
	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
	si->si_gen = 0;
	si->si_snapblklist = NULL;

	return 0;
}

/*
 * Release the per-mount snapshot state allocated by ffs_snapshot_init().
 * All snapshots must already be deactivated and the preallocated block
 * list released; both conditions are asserted below.
 */
void
ffs_snapshot_fini(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo;
	ump->um_snapinfo = NULL;

	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
	mutex_destroy(&si->si_lock);
	mutex_destroy(&si->si_snaplock);
	KASSERT(si->si_snapblklist == NULL);
	kmem_free(si, sizeof(*si));
}

/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 *
 * mp	 - mount point being snapshotted
 * vp	 - locked vnode of the file that will become the snapshot
 * ctime - if non-NULL, receives the time the snapshot was taken
 *
 * Returns 0 on success or an errno.  On failure the partially built
 * snapshot file is truncated back to zero length before returning.
 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp,
    struct timespec *ctime)
{
/*
 * When snapshots are configured out, the whole body is replaced by a
 * stub; the matching #endif for the #else arm lies beyond this block.
 */
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist;
	int error, ns, cg, snaploc;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct timespec ts;
	long redo = 0;
	int32_t *lp;
	void *space;
	void *sbbuf = NULL;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct inode *ip, *xp;
	struct buf *bp, *ibp, *nbp;
	struct vattr vat;
	struct vnode *xvp, *mvp, *logvp, *devvp;
	struct snap_info *si;
	bool suspended = false;
	bool snapshot_locked = false;

	ns = UFS_FSNEEDSWAP(fs);
	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * If the vnode already is a snapshot, return.
	 */
	if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
		if (ctime) {
			ctime->tv_sec = DIP(VTOI(vp), mtime);
			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
		}
		return 0;
	}
	/*
	 * Check mount, exclusive reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_usecount != 1 || vp->v_writecount != 0)
		return EBUSY;
	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL) != 0 &&
	    VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
		return EACCES;

	/* Start from an empty file. */
	if (vp->v_size != 0) {
		error = ffs_truncate(vp, 0, 0, NOCRED);
		if (error)
			return error;
	}
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * If the WAPBL journal lives inside the filesystem, get its vnode
	 * so that the journal file can be exempted from expunging below.
	 */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp,
		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
		if (error)
			return error;
	} else
		logvp = NULL;
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, ns);
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		goto out;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		goto out;
	for (blkno = NDADDR, i = 0; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error) {
			UFS_WAPBL_END(mp);
			goto out;
		}
		if (DOINGSOFTDEP(vp))
			bawrite(ibp);
		else
			brelse(ibp, 0);
		/*
		 * Cycle the journal transaction every 16 allocations,
		 * presumably to bound its size -- TODO confirm.
		 */
		if ((++i % 16) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				goto out;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error) {
		UFS_WAPBL_END(mp);
		goto out;
	}
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(mp);
			goto out;
		}
		bawrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	fs->fs_active = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp)) != 0)
			break;
		error = cgaccount(cg, vp, nbp->b_data, 1);
		bawrite(nbp);
		if (error)
			break;
	}
	UFS_WAPBL_END(mp);
	if (error)
		goto out;
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Suspend operation on filesystem.
	 */
	if ((error = vfs_suspend(vp->v_mount, 0)) != 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		goto out;
	}
	suspended = true;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	getmicrotime(&starttime);
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		goto out;
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (ACTIVECG_ISSET(fs, cg))
			continue;
		redo++;
		if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp)) != 0)
			break;
		error = cgaccount(cg, vp, nbp->b_data, 2);
		bawrite(nbp);
		if (error)
			break;
	}
	if (error) {
		UFS_WAPBL_END(mp);
		goto out;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(sbbuf, 0, loc);
	copy_fs = (struct fs *)((char *)sbbuf + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		memset((char *)sbbuf + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	/*
	 * Pull in any trailing partial fragment of the summary area
	 * directly from the device.
	 */
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, 0, &bp)) != 0) {
			brelse(bp, 0);
			free(copy_fs->fs_csp, M_UFSMNT);
			goto out;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	/* Allocate a marker vnode */
	if ((mvp = vnalloc(mp)) == NULL) {
		error = ENOMEM;
		goto out;
	}
	MNT_ILOCK(mp);
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() can be called indirectly
	 */
	for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
		vmark(mvp, xvp);
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp || vismarker(xvp))
			continue;
		VI_LOCK(xvp);
		if ((xvp->v_iflag & VI_XLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    VTOI(xvp) == NULL ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			VI_UNLOCK(xvp);
			continue;
		}
		MNT_IUNLOCK(mp);
		/*
		 * XXXAD should increase vnode ref count to prevent it
		 * disappearing or being recycled.
		 */
		VI_UNLOCK(xvp);
#ifdef DEBUG
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
#endif
		/* Skip files still linked (and the journal file). */
		if (xvp != logvp && VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
		    vat.va_nlink > 0) {
			MNT_ILOCK(mp);
			continue;
		}
		xp = VTOI(xvp);
		if (xvp != logvp &&
		    ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs,
			    fullacct_ufs1, BLK_NOCOPY);
		else
			error = expunge_ufs2(vp, xp, copy_fs,
			    fullacct_ufs2, BLK_NOCOPY);
		/* Restore the fragment pointer saved above. */
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			(void)vunmark(mvp);
			goto out;
		}
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vnfree(mvp);
	UFS_WAPBL_END(mp);
	/*
	 * Acquire the snapshot lock.
	 */
	mutex_enter(&si->si_snaplock);
	snapshot_locked = true;
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	mutex_enter(&si->si_lock);
	if ((xp = TAILQ_FIRST(&si->si_snapshots)) == NULL) {
		/* Drop si_lock around the sleeping allocation. */
		mutex_exit(&si->si_lock);
		snapblklist = malloc(
		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		/* Element 0 holds the list length. */
		snapblklist[0] = blkp - snapblklist;
		mutex_enter(&si->si_lock);
		if (si->si_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		si->si_snapblklist = snapblklist;
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %llu already on list",
		    (unsigned long long)ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	/* First snapshot on this fs: hook up the copy-on-write handler. */
	if (xp == NULL)
		fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp == ip)
			break;
		if ((error = UFS_WAPBL_BEGIN(mp)) == 0) {
			if (xp->i_ump->um_fstype == UFS1)
				error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
				    BLK_SNAP);
			else
				error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
				    BLK_SNAP);
			if (error == 0 && xp->i_ffs_effnlink == 0)
				error = ffs_freefile(copy_fs, vp,
				    xp->i_number, xp->i_mode);
			UFS_WAPBL_END(mp);
		}
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snapblklist = malloc(snaplistsize * sizeof(ufs2_daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if ((error = UFS_WAPBL_BEGIN(mp)) == 0) {
		if (ip->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
			    BLK_SNAP);
		UFS_WAPBL_END(mp);
	}
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = &snapblklist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 * The list is byte-swapped to on-disk order for the write, then
	 * swapped back for in-core use.
	 */
	for (i = 0; i < snaplistsize; i++)
		snapblklist[i] = ufs_rw64(snapblklist[i], ns);
	error = vn_rdwr(UIO_WRITE, vp, (void *)snapblklist,
	    snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snapblklist[i] = ufs_rw64(snapblklist[i], ns);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
#ifdef FFS_EI
	if (ns) {
		ffs_sb_swap(copy_fs, copy_fs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &nbp);
		if (error) {
			brelse(nbp, 0);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
	 * and ffs_snapblkfree() will always work on indirect blocks.
	 */
	for (loc = 0; loc < NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, nbp->b_data, loc);
		if (error) {
			brelse(nbp, 0);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bawrite(nbp);
	}
	UFS_WAPBL_END(mp);
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list. If this is the first snapshot
	 * free the preliminary list.
	 */
	mutex_enter(&si->si_lock);
	space = si->si_snapblklist;
	si->si_snapblklist = snapblklist;
	if (TAILQ_FIRST(&si->si_snapshots) == ip)
		FREE(space, M_UFSMNT);
	si->si_gen++;
	mutex_exit(&si->si_lock);
done:
	if (mp->mnt_wapbl)
		copy_fs->fs_flags &= ~FS_DOWAPBL;
	free(copy_fs->fs_csp, M_UFSMNT);
	if (!error) {
		/* Write the saved superblock copy into the snapshot. */
		error = UFS_WAPBL_BEGIN(mp);
		if (!error) {
			error = bread(vp, lblkno(fs, fs->fs_sblockloc),
			    fs->fs_bsize, KERNCRED, B_MODIFY, &nbp);
			if (error) {
				brelse(nbp, 0);
			} else {
				bcopy(sbbuf, nbp->b_data, fs->fs_bsize);
				bawrite(nbp);
			}
			UFS_WAPBL_END(mp);
		}
		if (error)
			fs->fs_snapinum[snaploc] = 0;
	}
out:
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 */
	if (!error) {
		mutex_enter(&vp->v_interlock);
		error = VOP_PUTPAGES(vp, 0, 0,
		    PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
	}
#ifdef WAPBL
	if (!error && mp->mnt_wapbl)
		error = wapbl_flush(mp->mnt_wapbl, 1);
#endif
	if (suspended) {
		vfs_resume(vp->v_mount);
#ifdef DEBUG
		if (starttime.tv_sec > 0) {
			getmicrotime(&endtime);
			timersub(&endtime, &starttime, &endtime);
			printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
			    vp->v_mount->mnt_stat.f_mntonname,
			    (long)endtime.tv_sec, endtime.tv_usec / 1000,
			    redo, fs->fs_ncg);
		}
#endif
	}
	if (sbbuf)
		free(sbbuf, M_UFSMNT);
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	/* On failure, undo the allocation: truncate back to empty. */
	if (error) {
		if (!UFS_WAPBL_BEGIN(mp)) {
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else
		vref(vp);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
/*
 * cg	  - cylinder group number to copy
 * vp	  - snapshot vnode
 * data	  - destination buffer (one fs block) receiving the cg map copy
 * passno - 1 on the initial pass, 2 on the post-suspension revision pass
 * Returns 0 or an error from bread()/ffs_balloc(); EIO on bad cg magic.
 */
static int
cgaccount(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, ns, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	/* Read the on-disk cylinder group. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, 0, &bp);
	if (error) {
		brelse(bp, 0);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	/* Mark this cg as captured so pass 2 can skip it. */
	ACTIVECG_SET(fs, cg);

	bcopy(bp->b_data, data, fs->fs_cgsize);
	brelse(bp, 0);
	/*
	 * NOTE(review): cgp still points into bp->b_data after the
	 * brelse() above, yet cg_blksfree(cgp, ...) is used below --
	 * confirm the buffer contents remain valid here.
	 */
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/* Direct-block portion of the snapshot inode. */
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	/* Indirect-block portion: walk one indirect block at a time. */
	if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
/*
 * snapvp	- vnode of the snapshot being built
 * cancelip	- inode whose blocks are to be expunged from the snapshot
 * fs		- superblock (copy) to account against
 * acctfunc	- per-block accounting callback (full/snap/map acct)
 * expungetype	- BLK_NOCOPY or BLK_SNAP, passed to acctfunc
 */
static int
expunge_ufs1(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
		    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int i, error, ns;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &bp);
	} else {
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error)
		return error;
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	/* Clear SF_SNAPSHOT in on-disk byte order. */
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0],
	    &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0],
	    &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/* Walk each level of indirection. */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
/*
 * Recursively account for the indirect block "blkno" of cancelvp at
 * indirection depth "level" (0 == leaf indirect), invoking acctfunc on
 * each pointer it holds.  lbn/rlbn are the (negative) metadata and real
 * logical block numbers, remblks the data blocks still to cover, and
 * blksperindir how many data blocks one pointer at this level spans.
 * Returns 0 or an error from the lookup/read/accounting path.
 */
static int
indiracct_ufs1(struct vnode *snapvp, struct vnode *cancelvp, int level,
    ufs1_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks,
    ufs_lbn_t blksperindir, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		/* A missing indirect is legal only when expunging NOCOPY. */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	/* Sanity: caller's lbn must match the computed indirect chain. */
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	/* Buffer neither valid nor dirty: fill it directly from disk. */
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Work on a private copy so the buffer can be released early. */
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (void *)bap, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level,
		    ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	/* snapacct first; mapacct frees blocks, so order matters. */
	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 * Marks each block claimed by [oldblkp, lastblkp) in snapshot vp's own
 * block map with "expungetype" (BLK_SNAP or BLK_NOCOPY).  Returns 0 or
 * an error from ffs_balloc.
 */
static int
snapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw32(*oldblkp, ns);
		/* Holes and already-marked entries need no work. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_ffs1_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* B_METAONLY gets the indirect block holding lbn. */
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw32(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp, 0);
		} else {
			/* Slot must be empty before it is claimed. */
			if (blkno != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = ufs_rw32(expungetype, ns);
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 * Frees every real block in [oldblkp, lastblkp); when walking data
 * blocks (lblkno != -1) of a snapshot being expunged, also records the
 * logical block numbers on ip->i_snapblklist for the hints list.
 */
static int
mapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno, int expungetype)
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 means metadata pass: free, but do not list. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw32(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/* BLK_SNAP claims map to their own logical block address. */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
/*
 * UFS2 twin of expunge_ufs1: zero cancelip's inode in the snapshot copy
 * of its inode block, then run "acctfunc" over every block pointer the
 * cancelled inode holds.  Returns 0 or the first error encountered.
 */
static int
expunge_ufs2(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int i, error, ns;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		/* Already copied: just read the snapshot's copy. */
		error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &bp);
	} else {
		/* Allocate the copy and fill it from the real device. */
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error)
		return error;
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	/* Strip SF_SNAPSHOT; double ufs_rw32 keeps on-disk byte order. */
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct blocks first, then the indirect pointers (lblkno -1). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0],
	    &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0],
	    &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	/* One pass per indirection level, each spanning NINDIR(fs)x more. */
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 * UFS2 twin of indiracct_ufs1; see that function for the parameter
 * contract.  64-bit block pointers, otherwise identical structure.
 */
static int
indiracct_ufs2(struct vnode *snapvp, struct vnode *cancelvp, int level,
    ufs2_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks,
    ufs_lbn_t blksperindir, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		/* A missing indirect is legal only when expunging NOCOPY. */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	/* Sanity: caller's lbn must match the computed indirect chain. */
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	/* Buffer neither valid nor dirty: fill it directly from disk. */
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Work on a private copy so the buffer can be released early. */
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (void *)bap, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level,
		    ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	/* snapacct first; mapacct frees blocks, so order matters. */
	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
/*
 * UFS2 twin of snapacct_ufs1: mark each block claimed by
 * [oldblkp, lastblkp) in snapshot vp's own block map with
 * "expungetype".  Returns 0 or an error from ffs_balloc.
 */
static int
snapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw64(*oldblkp, ns);
		/* Holes and already-marked entries need no work. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_ffs2_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* B_METAONLY gets the indirect block holding lbn. */
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw64(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp, 0);
		} else {
			/* Slot must be empty before it is claimed. */
			if (blkno != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = ufs_rw64(expungetype, ns);
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 * UFS2 twin of mapacct_ufs1: free every real block in the range and,
 * on the data pass (lblkno != -1), record BLK_SNAP claims on
 * ip->i_snapblklist for the hints list.
 */
static int
mapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno, int expungetype)
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 means metadata pass: free, but do not list. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw64(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/* BLK_SNAP claims map to their own logical block address. */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	/* Drop the list's reference only if the inode was found on it. */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the removed slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	/* Bump the generation so lockless walkers notice the change. */
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct snap_info *si;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, ns, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	mutex_enter(&si->si_snaplock);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		mutex_enter(&si->si_lock);
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
		} else {
			/* Last snapshot gone: stop copy-on-write entirely. */
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
			mutex_enter(&si->si_lock);
		}
		si->si_gen++;
		mutex_exit(&si->si_lock);
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	mutex_exit(&si->si_snaplock);
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		/* A claimed block equals its own logical address. */
		else if ((dblk == blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = fragstoblks(fs, bno);
	mutex_enter(&si->si_lock);
retry:
	/*
	 * si_gen is sampled here and rechecked after every point where
	 * si_lock is dropped; any change means the snapshot list moved
	 * under us and the whole scan must restart.
	 */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		if (snapshot_locked == 0) {
			/*
			 * Take si_snaplock without holding si_lock to keep
			 * the lock order consistent; retry if the list
			 * changed while we slept.
			 */
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;
		}
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				/* Visible snapshots must be written sync. */
				if (ip->i_ffs_effnlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    "from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
#endif
			mutex_exit(&si->si_lock);
			if (lbn < NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				if (ip->i_ffs_effnlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
				error = syncsnap(vp);
			else
				error = 0;
			mutex_exit(&si->si_snaplock);
			/* Non-zero return: the free is suppressed. */
			return (error == 0);
		}
		if (lbn >= NDADDR)
			brelse(ibp, 0);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
			    "Copyonremove: snapino ",
			    (unsigned long long)ip->i_number,
			    lbn, "for inum", (unsigned long long)inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	mutex_exit(&si->si_lock);
	if (saved_data)
		free(saved_data, M_UFSMNT);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct snap_info *si;
	ufs2_daddr_t snaplistsize, *snapblklist;
	int i, error, ns, snaploc, loc;

	/*
	 * No persistent snapshots on apple ufs file systems.
	 */
	if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
		return;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	mutex_enter(&si->si_lock);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			/* Stale entry: drop it and compact the table. */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}

		/*
		 * Read the block hints list. Use an empty list on
		 * read errors.
		 */
		error = vn_rdwr(UIO_READ, vp,
		    (void *)&snaplistsize, sizeof(snaplistsize),
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
		    l->l_cred, NULL, NULL);
		if (error) {
			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
			snaplistsize = 1;
		} else
			snaplistsize = ufs_rw64(snaplistsize, ns);
		snapblklist = malloc(
		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
		if (error)
			/* snapblklist[0] holds the list length incl. itself. */
			snapblklist[0] = 1;
		else {
			error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
			    snaplistsize * sizeof(ufs2_daddr_t),
			    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
			    l->l_cred, NULL, NULL);
			for (i = 0; i < snaplistsize; i++)
				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
			if (error) {
				printf("ffs_snapshot_mount: read_2 failed %d\n",
				    error);
				snapblklist[0] = 1;
			}
		}
		ip->i_snapblklist = &snapblklist[0];

		/*
		 * Link it onto the active snapshot list.
		 */
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %llu already on list",
			    (unsigned long long)ip->i_number);
		else
			TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL) {
		mutex_exit(&si->si_lock);
		return;
	}
	/*
	 * Attach the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
	si->si_snapblklist = xp->i_snapblklist;
	fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct inode *xp;
	struct vnode *vp = NULL;
	struct snap_info *si;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
		vp = ITOV(xp);
		/* Revert the vnode to its private lock. */
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_snapblklist == si->si_snapblklist)
			si->si_snapblklist = NULL;
		FREE(xp->i_snapblklist, M_UFSMNT);
		/* Only still-linked snapshots hold a list reference. */
		if (xp->i_ffs_effnlink > 0) {
			si->si_gen++;
			mutex_exit(&si->si_lock);
			vrele(vp);
			mutex_enter(&si->si_lock);
		}
	}
	if (vp)
		fscow_disestablish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Lookup a snapshots data block address.
 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
 * and safe even for the pagedaemon where we cannot bread().
 */
/*
 * Resolve logical block "lbn" of snapshot vp to its block address and
 * store it in *res.  For the pagedaemon only already-cached indirect
 * blocks are consulted (returns ENOMEM otherwise); everyone else may
 * bread().  Returns 0 on success or an error code.
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	if (lbn < NDADDR) {
		*res = db_get(ip, lbn);
		return 0;
	}
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
	if (curlwp == uvm.pagedaemon_lwp) {
		/* Cannot sleep in bread: use the cache or fail. */
		mutex_enter(&bufcache_lock);
		bp = incore(vp, indirs[num-1].in_lbn);
		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
			error = 0;
		} else
			error = ENOMEM;
		mutex_exit(&bufcache_lock);
		return error;
	}
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
	if (error == 0)
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
	brelse(bp, 0);

	return error;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, ns, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system or
	 * in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Binary search of the sorted hints list; element 0 is its
	 * length, real entries start at index 1.
	 */
	snapblklist = si->si_snapblklist;
	upper = si->si_snapblklist[0] - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	/* Reuse the caller's data if it is a full, valid block. */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/* gen detects list changes across any si_lock release. */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;

		/* The copy path below must sleep; pagedaemon may not. */
		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}

		if (snapshot_locked == 0) {
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;

			/* Check again if block still needs to be copied */
			if (lbn < NDADDR) {
				blkno = db_get(ip, lbn);
			} else {
				mutex_exit(&si->si_lock);
				if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
					mutex_enter(&si->si_lock);
					break;
				}
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}

			if (blkno != 0)
				continue;
		}
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	mutex_exit(&si->si_lock);
	/* Only free the buffer if we allocated it ourselves. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return error;
}

/*
 * Read from a snapshot.
2118 */ 2119int 2120ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag) 2121{ 2122 struct inode *ip = VTOI(vp); 2123 struct fs *fs = ip->i_fs; 2124 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo; 2125 struct buf *bp; 2126 daddr_t lbn, nextlbn; 2127 off_t bytesinfile; 2128 long size, xfersize, blkoffset; 2129 int error; 2130 2131 fstrans_start(vp->v_mount, FSTRANS_SHARED); 2132 mutex_enter(&si->si_snaplock); 2133 2134 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 2135 bytesinfile = ip->i_size - uio->uio_offset; 2136 if (bytesinfile <= 0) 2137 break; 2138 lbn = lblkno(fs, uio->uio_offset); 2139 nextlbn = lbn + 1; 2140 size = blksize(fs, ip, lbn); 2141 blkoffset = blkoff(fs, uio->uio_offset); 2142 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), 2143 bytesinfile); 2144 2145 if (lblktosize(fs, nextlbn) >= ip->i_size) 2146 error = bread(vp, lbn, size, NOCRED, 0, &bp); 2147 else { 2148 int nextsize = blksize(fs, ip, nextlbn); 2149 error = breadn(vp, lbn, 2150 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); 2151 } 2152 if (error) 2153 break; 2154 2155 /* 2156 * We should only get non-zero b_resid when an I/O error 2157 * has occurred, which should cause us to break above. 2158 * However, if the short read did not cause an error, 2159 * then we want to ensure that we do not uiomove bad 2160 * or uninitialized data. 2161 */ 2162 size -= bp->b_resid; 2163 if (size < xfersize) { 2164 if (size == 0) 2165 break; 2166 xfersize = size; 2167 } 2168 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 2169 if (error) 2170 break; 2171 brelse(bp, BC_AGE); 2172 } 2173 if (bp != NULL) 2174 brelse(bp, BC_AGE); 2175 2176 mutex_exit(&si->si_snaplock); 2177 fstrans_done(vp->v_mount); 2178 return error; 2179} 2180 2181/* 2182 * Read or write the specified block of the filesystem vp resides on 2183 * from or to the disk bypassing the buffer cache. 
2184 */ 2185static int 2186rwfsblk(struct vnode *vp, int flags, void *data, ufs2_daddr_t lbn) 2187{ 2188 int error; 2189 struct inode *ip = VTOI(vp); 2190 struct fs *fs = ip->i_fs; 2191 struct buf *nbp; 2192 2193 nbp = getiobuf(NULL, true); 2194 nbp->b_flags = flags; 2195 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 2196 nbp->b_error = 0; 2197 nbp->b_data = data; 2198 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 2199 nbp->b_proc = NULL; 2200 nbp->b_dev = ip->i_devvp->v_rdev; 2201 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ 2202 2203 bdev_strategy(nbp); 2204 2205 error = biowait(nbp); 2206 2207 putiobuf(nbp); 2208 2209 return error; 2210} 2211 2212/* 2213 * Write all dirty buffers to disk and invalidate them. 2214 */ 2215static int 2216syncsnap(struct vnode *vp) 2217{ 2218 int error; 2219 buf_t *bp; 2220 struct fs *fs = VTOI(vp)->i_fs; 2221 2222 mutex_enter(&bufcache_lock); 2223 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) { 2224 KASSERT((bp->b_cflags & BC_BUSY) == 0); 2225 KASSERT(bp->b_bcount == fs->fs_bsize); 2226 bp->b_cflags |= BC_BUSY; 2227 mutex_exit(&bufcache_lock); 2228 error = rwfsblk(vp, B_WRITE, bp->b_data, 2229 fragstoblks(fs, dbtofsb(fs, bp->b_blkno))); 2230 brelse(bp, BC_INVAL | BC_VFLUSH); 2231 if (error) 2232 return error; 2233 mutex_enter(&bufcache_lock); 2234 } 2235 mutex_exit(&bufcache_lock); 2236 2237 return 0; 2238} 2239 2240/* 2241 * Write the specified block to a snapshot. 2242 */ 2243static int 2244wrsnapblk(struct vnode *vp, void *data, ufs2_daddr_t lbn) 2245{ 2246 struct inode *ip = VTOI(vp); 2247 struct fs *fs = ip->i_fs; 2248 struct buf *bp; 2249 int error; 2250 2251 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, 2252 KERNCRED, (ip->i_ffs_effnlink > 0 ? 
B_SYNC : 0), &bp); 2253 if (error) 2254 return error; 2255 bcopy(data, bp->b_data, fs->fs_bsize); 2256 if (ip->i_ffs_effnlink > 0) 2257 error = bwrite(bp); 2258 else 2259 bawrite(bp); 2260 2261 return error; 2262} 2263 2264/* 2265 * Get/Put direct block from inode or buffer containing disk addresses. Take 2266 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2267 * into a global include. 2268 */ 2269static inline ufs2_daddr_t 2270db_get(struct inode *ip, int loc) 2271{ 2272 if (ip->i_ump->um_fstype == UFS1) 2273 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2274 else 2275 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2276} 2277 2278static inline void 2279db_assign(struct inode *ip, int loc, ufs2_daddr_t val) 2280{ 2281 if (ip->i_ump->um_fstype == UFS1) 2282 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2283 else 2284 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2285} 2286 2287static inline ufs2_daddr_t 2288idb_get(struct inode *ip, void *bf, int loc) 2289{ 2290 if (ip->i_ump->um_fstype == UFS1) 2291 return ufs_rw32(((ufs1_daddr_t *)(bf))[loc], 2292 UFS_IPNEEDSWAP(ip)); 2293 else 2294 return ufs_rw64(((ufs2_daddr_t *)(bf))[loc], 2295 UFS_IPNEEDSWAP(ip)); 2296} 2297 2298static inline void 2299idb_assign(struct inode *ip, void *bf, int loc, ufs2_daddr_t val) 2300{ 2301 if (ip->i_ump->um_fstype == UFS1) 2302 ((ufs1_daddr_t *)(bf))[loc] = 2303 ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2304 else 2305 ((ufs2_daddr_t *)(bf))[loc] = 2306 ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2307} 2308