ffs_snapshot.c revision 1.76
1/* $NetBSD: ffs_snapshot.c,v 1.76 2008/08/24 09:51:47 hannken Exp $ */ 2 3/* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40#include <sys/cdefs.h> 41__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.76 2008/08/24 09:51:47 hannken Exp $"); 42 43#if defined(_KERNEL_OPT) 44#include "opt_ffs.h" 45#include "opt_wapbl.h" 46#endif 47 48#include <sys/param.h> 49#include <sys/kernel.h> 50#include <sys/systm.h> 51#include <sys/conf.h> 52#include <sys/buf.h> 53#include <sys/proc.h> 54#include <sys/namei.h> 55#include <sys/sched.h> 56#include <sys/stat.h> 57#include <sys/malloc.h> 58#include <sys/mount.h> 59#include <sys/resource.h> 60#include <sys/resourcevar.h> 61#include <sys/vnode.h> 62#include <sys/kauth.h> 63#include <sys/fstrans.h> 64#include <sys/wapbl.h> 65 66#include <miscfs/specfs/specdev.h> 67 68#include <ufs/ufs/quota.h> 69#include <ufs/ufs/ufsmount.h> 70#include <ufs/ufs/inode.h> 71#include <ufs/ufs/ufs_extern.h> 72#include <ufs/ufs/ufs_bswap.h> 73#include <ufs/ufs/ufs_wapbl.h> 74 75#include <ufs/ffs/fs.h> 76#include <ufs/ffs/ffs_extern.h> 77 78#include <uvm/uvm.h> 79 80/* FreeBSD -> NetBSD conversion */ 81#define KERNCRED lwp0.l_cred 82#define ufs1_daddr_t int32_t 83#define ufs2_daddr_t int64_t 84#define ufs_lbn_t daddr_t 85#define VI_MTX(v) (&(v)->v_interlock) 86#define VI_LOCK(v) mutex_enter(&(v)->v_interlock) 87#define VI_UNLOCK(v) mutex_exit(&(v)->v_interlock) 88#define MNT_ILOCK(v) mutex_enter(&mntvnode_lock) 89#define MNT_IUNLOCK(v) mutex_exit(&mntvnode_lock) 90 91#if !defined(FFS_NO_SNAPSHOT) 92typedef int (*acctfunc_t) 93 (struct vnode *, void *, int, int, struct fs *, daddr_t, int); 94 95static int cgaccount(int, struct vnode *, void *, int); 96static int expunge(struct vnode *, struct inode *, struct fs *, 97 acctfunc_t, int); 98static int indiracct(struct vnode *, struct vnode *, int, daddr_t, 99 daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int); 100static int fullacct(struct vnode *, void *, int, int, struct fs *, 101 
daddr_t, int); 102static int snapacct(struct vnode *, void *, int, int, struct fs *, 103 daddr_t, int); 104static int mapacct(struct vnode *, void *, int, int, struct fs *, 105 daddr_t, int); 106#endif /* !defined(FFS_NO_SNAPSHOT) */ 107 108static int ffs_copyonwrite(void *, struct buf *, bool); 109static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); 110static int rwfsblk(struct vnode *, int, void *, ufs2_daddr_t); 111static int syncsnap(struct vnode *); 112static int wrsnapblk(struct vnode *, void *, ufs2_daddr_t); 113 114static inline ufs2_daddr_t db_get(struct inode *, int); 115static inline void db_assign(struct inode *, int, ufs2_daddr_t); 116static inline ufs2_daddr_t ib_get(struct inode *, int); 117static inline void ib_assign(struct inode *, int, ufs2_daddr_t); 118static inline ufs2_daddr_t idb_get(struct inode *, void *, int); 119static inline void idb_assign(struct inode *, void *, int, ufs2_daddr_t); 120 121struct snap_info { 122 kmutex_t si_lock; /* Lock this snapinfo */ 123 kmutex_t si_snaplock; /* Snapshot vnode common lock */ 124 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 125 daddr_t *si_snapblklist; /* Snapshot block hints list */ 126 uint32_t si_gen; /* Incremented on change */ 127}; 128 129#ifdef DEBUG 130static int snapdebug = 0; 131#endif 132 133int 134ffs_snapshot_init(struct ufsmount *ump) 135{ 136 struct snap_info *si; 137 138 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); 139 if (si == NULL) 140 return ENOMEM; 141 142 TAILQ_INIT(&si->si_snapshots); 143 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); 144 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); 145 si->si_gen = 0; 146 si->si_snapblklist = NULL; 147 148 return 0; 149} 150 151void 152ffs_snapshot_fini(struct ufsmount *ump) 153{ 154 struct snap_info *si; 155 156 si = ump->um_snapinfo; 157 ump->um_snapinfo = NULL; 158 159 KASSERT(TAILQ_EMPTY(&si->si_snapshots)); 160 mutex_destroy(&si->si_lock); 161 mutex_destroy(&si->si_snaplock); 
162 KASSERT(si->si_snapblklist == NULL); 163 kmem_free(si, sizeof(*si)); 164} 165 166/* 167 * Create a snapshot file and initialize it for the filesystem. 168 * Vnode is locked on entry and return. 169 */ 170int 171ffs_snapshot(struct mount *mp, struct vnode *vp, 172 struct timespec *ctime) 173{ 174#if defined(FFS_NO_SNAPSHOT) 175 return EOPNOTSUPP; 176} 177#else /* defined(FFS_NO_SNAPSHOT) */ 178 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist; 179 int error, ns, cg, snaploc; 180 int i, size, len, loc; 181 int flag = mp->mnt_flag; 182 struct timeval starttime; 183#ifdef DEBUG 184 struct timeval endtime; 185#endif 186 struct timespec ts; 187 long redo = 0; 188 int32_t *lp; 189 void *space; 190 void *sbbuf = NULL; 191 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 192 struct lwp *l = curlwp; 193 struct inode *ip, *xp; 194 struct buf *bp, *ibp, *nbp; 195 struct vattr vat; 196 struct vnode *xvp, *mvp, *logvp, *devvp; 197 struct snap_info *si; 198 bool suspended = false; 199 bool snapshot_locked = false; 200 201 ns = UFS_FSNEEDSWAP(fs); 202 si = VFSTOUFS(mp)->um_snapinfo; 203 204 /* 205 * Need to serialize access to snapshot code per filesystem. 206 */ 207 /* 208 * If the vnode already is a snapshot, return. 209 */ 210 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 211 if (ctime) { 212 ctime->tv_sec = DIP(VTOI(vp), mtime); 213 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 214 } 215 return 0; 216 } 217 /* 218 * Check mount, exclusive reference and owner. 219 */ 220 if (vp->v_mount != mp) 221 return EXDEV; 222 if (vp->v_usecount != 1 || vp->v_writecount != 0) 223 return EBUSY; 224 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 225 NULL) != 0 && 226 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred)) 227 return EACCES; 228 229 if (vp->v_size != 0) { 230 error = ffs_truncate(vp, 0, 0, NOCRED); 231 if (error) 232 return error; 233 } 234 /* 235 * Assign a snapshot slot in the superblock. 
236 */ 237 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 238 if (fs->fs_snapinum[snaploc] == 0) 239 break; 240 if (snaploc == FSMAXSNAP) 241 return (ENOSPC); 242 ip = VTOI(vp); 243 devvp = ip->i_devvp; 244 if ((fs->fs_flags & FS_DOWAPBL) && 245 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { 246 error = VFS_VGET(mp, 247 fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp); 248 if (error) 249 return error; 250 } else 251 logvp = NULL; 252 /* 253 * Write an empty list of preallocated blocks to the end of 254 * the snapshot to set size to at least that of the filesystem. 255 */ 256 numblks = howmany(fs->fs_size, fs->fs_frag); 257 blkno = 1; 258 blkno = ufs_rw64(blkno, ns); 259 error = vn_rdwr(UIO_WRITE, vp, 260 (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 261 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 262 if (error) 263 goto out; 264 /* 265 * Preallocate critical data structures so that we can copy 266 * them in without further allocation after we suspend all 267 * operations on the filesystem. We would like to just release 268 * the allocated buffers without writing them since they will 269 * be filled in below once we are ready to go, but this upsets 270 * the soft update code, so we go ahead and write the new buffers. 271 * 272 * Allocate all indirect blocks and mark all of them as not 273 * needing to be copied. 274 */ 275 error = UFS_WAPBL_BEGIN(mp); 276 if (error) 277 goto out; 278 for (blkno = NDADDR, i = 0; blkno < numblks; blkno += NINDIR(fs)) { 279 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 280 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 281 if (error) { 282 UFS_WAPBL_END(mp); 283 goto out; 284 } 285 if (DOINGSOFTDEP(vp)) 286 bawrite(ibp); 287 else 288 brelse(ibp, 0); 289 if ((++i % 16) == 0) { 290 UFS_WAPBL_END(mp); 291 error = UFS_WAPBL_BEGIN(mp); 292 if (error) 293 goto out; 294 } 295 } 296 /* 297 * Allocate copies for the superblock and its summary information. 
298 */ 299 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 300 0, &nbp); 301 if (error) { 302 UFS_WAPBL_END(mp); 303 goto out; 304 } 305 bawrite(nbp); 306 blkno = fragstoblks(fs, fs->fs_csaddr); 307 len = howmany(fs->fs_cssize, fs->fs_bsize); 308 for (loc = 0; loc < len; loc++) { 309 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), 310 fs->fs_bsize, KERNCRED, 0, &nbp); 311 if (error) { 312 UFS_WAPBL_END(mp); 313 goto out; 314 } 315 bawrite(nbp); 316 } 317 /* 318 * Copy all the cylinder group maps. Although the 319 * filesystem is still active, we hope that only a few 320 * cylinder groups will change between now and when we 321 * suspend operations. Thus, we will be able to quickly 322 * touch up the few cylinder groups that changed during 323 * the suspension period. 324 */ 325 len = howmany(fs->fs_ncg, NBBY); 326 fs->fs_active = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO); 327 for (cg = 0; cg < fs->fs_ncg; cg++) { 328 if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 329 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 330 break; 331 error = cgaccount(cg, vp, nbp->b_data, 1); 332 bawrite(nbp); 333 if (error) 334 break; 335 } 336 UFS_WAPBL_END(mp); 337 if (error) 338 goto out; 339 /* 340 * Change inode to snapshot type file. 341 */ 342 ip->i_flags |= SF_SNAPSHOT; 343 DIP_ASSIGN(ip, flags, ip->i_flags); 344 ip->i_flag |= IN_CHANGE | IN_UPDATE; 345 /* 346 * Ensure that the snapshot is completely on disk. 347 * Since we have marked it as a snapshot it is safe to 348 * unlock it as no process will be allowed to write to it. 349 */ 350 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0)) != 0) 351 goto out; 352 VOP_UNLOCK(vp, 0); 353 /* 354 * All allocations are done, so we can now snapshot the system. 355 * 356 * Suspend operation on filesystem. 
357 */ 358 if ((error = vfs_suspend(vp->v_mount, 0)) != 0) { 359 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 360 goto out; 361 } 362 suspended = true; 363 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 364 getmicrotime(&starttime); 365 error = UFS_WAPBL_BEGIN(mp); 366 if (error) 367 goto out; 368 /* 369 * First, copy all the cylinder group maps that have changed. 370 */ 371 for (cg = 0; cg < fs->fs_ncg; cg++) { 372 if (ACTIVECG_ISSET(fs, cg)) 373 continue; 374 redo++; 375 if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 376 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 377 break; 378 error = cgaccount(cg, vp, nbp->b_data, 2); 379 bawrite(nbp); 380 if (error) 381 break; 382 } 383 if (error) { 384 UFS_WAPBL_END(mp); 385 goto out; 386 } 387 /* 388 * Grab a copy of the superblock and its summary information. 389 * We delay writing it until the suspension is released below. 390 */ 391 sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 392 loc = blkoff(fs, fs->fs_sblockloc); 393 if (loc > 0) 394 memset(sbbuf, 0, loc); 395 copy_fs = (struct fs *)((char *)sbbuf + loc); 396 bcopy(fs, copy_fs, fs->fs_sbsize); 397 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 398 if (fs->fs_sbsize < size) 399 memset((char *)sbbuf + loc + fs->fs_sbsize, 0, 400 size - fs->fs_sbsize); 401 size = blkroundup(fs, fs->fs_cssize); 402 if (fs->fs_contigsumsize > 0) 403 size += fs->fs_ncg * sizeof(int32_t); 404 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 405 copy_fs->fs_csp = space; 406 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 407 space = (char *)space + fs->fs_cssize; 408 loc = howmany(fs->fs_cssize, fs->fs_fsize); 409 i = fs->fs_frag - loc % fs->fs_frag; 410 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 411 if (len > 0) { 412 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 413 len, KERNCRED, 0, &bp)) != 0) { 414 brelse(bp, 0); 415 free(copy_fs->fs_csp, M_UFSMNT); 416 goto out; 417 } 418 bcopy(bp->b_data, space, (u_int)len); 419 space = (char *)space + len; 420 brelse(bp, BC_INVAL | BC_NOCACHE); 421 } 422 if (fs->fs_contigsumsize > 0) { 423 copy_fs->fs_maxcluster = lp = space; 424 for (i = 0; i < fs->fs_ncg; i++) 425 *lp++ = fs->fs_contigsumsize; 426 } 427 /* 428 * We must check for active files that have been unlinked 429 * (e.g., with a zero link count). We have to expunge all 430 * trace of these files from the snapshot so that they are 431 * not reclaimed prematurely by fsck or unnecessarily dumped. 432 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 433 * spec_strategy about writing on a suspended filesystem. 434 * Note that we skip unlinked snapshot files as they will 435 * be handled separately below. 436 * 437 * We also calculate the needed size for the snapshot list. 438 */ 439 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 440 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 441 /* Allocate a marker vnode */ 442 if ((mvp = vnalloc(mp)) == NULL) { 443 error = ENOMEM; 444 goto out; 445 } 446 MNT_ILOCK(mp); 447 /* 448 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 449 * and vclean() can be called indirectly 450 */ 451 for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { 452 vmark(mvp, xvp); 453 /* 454 * Make sure this vnode wasn't reclaimed in getnewvnode(). 455 * Start over if it has (it won't be on the list anymore). 
456 */ 457 if (xvp->v_mount != mp || vismarker(xvp)) 458 continue; 459 VI_LOCK(xvp); 460 if ((xvp->v_iflag & VI_XLOCK) || 461 xvp->v_usecount == 0 || xvp->v_type == VNON || 462 VTOI(xvp) == NULL || 463 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 464 VI_UNLOCK(xvp); 465 continue; 466 } 467 MNT_IUNLOCK(mp); 468 /* 469 * XXXAD should increase vnode ref count to prevent it 470 * disappearing or being recycled. 471 */ 472 VI_UNLOCK(xvp); 473#ifdef DEBUG 474 if (snapdebug) 475 vprint("ffs_snapshot: busy vnode", xvp); 476#endif 477 if (xvp != logvp && VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && 478 vat.va_nlink > 0) { 479 MNT_ILOCK(mp); 480 continue; 481 } 482 xp = VTOI(xvp); 483 if (xvp != logvp && 484 ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 485 MNT_ILOCK(mp); 486 continue; 487 } 488 /* 489 * If there is a fragment, clear it here. 490 */ 491 blkno = 0; 492 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 493 if (loc < NDADDR) { 494 len = fragroundup(fs, blkoff(fs, xp->i_size)); 495 if (len > 0 && len < fs->fs_bsize) { 496 ffs_blkfree(copy_fs, vp, db_get(xp, loc), 497 len, xp->i_number); 498 blkno = db_get(xp, loc); 499 db_assign(xp, loc, 0); 500 } 501 } 502 snaplistsize += 1; 503 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); 504 if (blkno) 505 db_assign(xp, loc, blkno); 506 if (!error) 507 error = ffs_freefile(copy_fs, vp, xp->i_number, 508 xp->i_mode); 509 if (error) { 510 free(copy_fs->fs_csp, M_UFSMNT); 511 (void)vunmark(mvp); 512 goto out; 513 } 514 MNT_ILOCK(mp); 515 } 516 MNT_IUNLOCK(mp); 517 vnfree(mvp); 518 UFS_WAPBL_END(mp); 519 /* 520 * Acquire the snapshot lock. 521 */ 522 mutex_enter(&si->si_snaplock); 523 snapshot_locked = true; 524 /* 525 * If this is the first snapshot on this filesystem, then we need 526 * to allocate the space for the list of preallocated snapshot blocks. 527 * This list will be refined below, but this preliminary one will 528 * keep us out of deadlock until the full one is ready. 
529 */ 530 mutex_enter(&si->si_lock); 531 if ((xp = TAILQ_FIRST(&si->si_snapshots)) == NULL) { 532 mutex_exit(&si->si_lock); 533 snapblklist = malloc( 534 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 535 blkp = &snapblklist[1]; 536 *blkp++ = lblkno(fs, fs->fs_sblockloc); 537 blkno = fragstoblks(fs, fs->fs_csaddr); 538 for (cg = 0; cg < fs->fs_ncg; cg++) { 539 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 540 break; 541 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 542 } 543 len = howmany(fs->fs_cssize, fs->fs_bsize); 544 for (loc = 0; loc < len; loc++) 545 *blkp++ = blkno + loc; 546 for (; cg < fs->fs_ncg; cg++) 547 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 548 snapblklist[0] = blkp - snapblklist; 549 mutex_enter(&si->si_lock); 550 if (si->si_snapblklist != NULL) 551 panic("ffs_snapshot: non-empty list"); 552 si->si_snapblklist = snapblklist; 553 } 554 /* 555 * Record snapshot inode. Since this is the newest snapshot, 556 * it must be placed at the end of the list. 557 */ 558 fs->fs_snapinum[snaploc] = ip->i_number; 559 if (ip->i_nextsnap.tqe_prev != 0) 560 panic("ffs_snapshot: %llu already on list", 561 (unsigned long long)ip->i_number); 562 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 563 if (xp == NULL) 564 fscow_establish(mp, ffs_copyonwrite, devvp); 565 si->si_gen++; 566 mutex_exit(&si->si_lock); 567 vp->v_vflag |= VV_SYSTEM; 568 /* 569 * Set the mtime to the time the snapshot has been taken. 570 */ 571 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 572 if (ctime) 573 *ctime = ts; 574 DIP_ASSIGN(ip, mtime, ts.tv_sec); 575 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 576 ip->i_flag |= IN_CHANGE | IN_UPDATE; 577 /* 578 * Copy allocation information from all the snapshots in 579 * this snapshot and then expunge them from its view. 
580 */ 581 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { 582 if (xp == ip) 583 break; 584 if ((error = UFS_WAPBL_BEGIN(mp)) == 0) { 585 error = expunge(vp, xp, fs, snapacct, BLK_SNAP); 586 if (error == 0 && xp->i_ffs_effnlink == 0) 587 error = ffs_freefile(copy_fs, vp, 588 xp->i_number, xp->i_mode); 589 UFS_WAPBL_END(mp); 590 } 591 if (error) { 592 fs->fs_snapinum[snaploc] = 0; 593 goto done; 594 } 595 } 596 /* 597 * Allocate space for the full list of preallocated snapshot blocks. 598 */ 599 snapblklist = malloc(snaplistsize * sizeof(ufs2_daddr_t), 600 M_UFSMNT, M_WAITOK); 601 ip->i_snapblklist = &snapblklist[1]; 602 /* 603 * Expunge the blocks used by the snapshots from the set of 604 * blocks marked as used in the snapshot bitmaps. Also, collect 605 * the list of allocated blocks in i_snapblklist. 606 */ 607 if ((error = UFS_WAPBL_BEGIN(mp)) == 0) { 608 expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); 609 UFS_WAPBL_END(mp); 610 } 611 if (error) { 612 fs->fs_snapinum[snaploc] = 0; 613 FREE(snapblklist, M_UFSMNT); 614 goto done; 615 } 616 if (snaplistsize < ip->i_snapblklist - snapblklist) 617 panic("ffs_snapshot: list too small"); 618 snaplistsize = ip->i_snapblklist - snapblklist; 619 snapblklist[0] = snaplistsize; 620 ip->i_snapblklist = &snapblklist[0]; 621 /* 622 * Write out the list of allocated blocks to the end of the snapshot. 623 */ 624 for (i = 0; i < snaplistsize; i++) 625 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 626 error = vn_rdwr(UIO_WRITE, vp, (void *)snapblklist, 627 snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks), 628 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 629 for (i = 0; i < snaplistsize; i++) 630 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 631 if (error) { 632 fs->fs_snapinum[snaploc] = 0; 633 FREE(snapblklist, M_UFSMNT); 634 goto done; 635 } 636 /* 637 * Write the superblock and its summary information 638 * to the snapshot. 
639 */ 640 blkno = fragstoblks(fs, fs->fs_csaddr); 641 len = howmany(fs->fs_cssize, fs->fs_bsize); 642 space = copy_fs->fs_csp; 643#ifdef FFS_EI 644 if (ns) { 645 ffs_sb_swap(copy_fs, copy_fs); 646 ffs_csum_swap(space, space, fs->fs_cssize); 647 } 648#endif 649 error = UFS_WAPBL_BEGIN(mp); 650 if (error) { 651 fs->fs_snapinum[snaploc] = 0; 652 FREE(snapblklist, M_UFSMNT); 653 goto done; 654 } 655 for (loc = 0; loc < len; loc++) { 656 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, 657 B_MODIFY, &nbp); 658 if (error) { 659 brelse(nbp, 0); 660 fs->fs_snapinum[snaploc] = 0; 661 FREE(snapblklist, M_UFSMNT); 662 goto done; 663 } 664 bcopy(space, nbp->b_data, fs->fs_bsize); 665 space = (char *)space + fs->fs_bsize; 666 bawrite(nbp); 667 } 668 /* 669 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite() 670 * and ffs_snapblkfree() will always work on indirect blocks. 671 */ 672 for (loc = 0; loc < NDADDR; loc++) { 673 if (db_get(ip, loc) != 0) 674 continue; 675 error = ffs_balloc(vp, lblktosize(fs, (off_t)loc), 676 fs->fs_bsize, KERNCRED, 0, &nbp); 677 if (error) 678 break; 679 error = rwfsblk(vp, B_READ, nbp->b_data, loc); 680 if (error) { 681 brelse(nbp, 0); 682 fs->fs_snapinum[snaploc] = 0; 683 FREE(snapblklist, M_UFSMNT); 684 goto done; 685 } 686 bawrite(nbp); 687 } 688 UFS_WAPBL_END(mp); 689 /* 690 * As this is the newest list, it is the most inclusive, so 691 * should replace the previous list. If this is the first snapshot 692 * free the preliminary list. 
693 */ 694 mutex_enter(&si->si_lock); 695 space = si->si_snapblklist; 696 si->si_snapblklist = snapblklist; 697 if (TAILQ_FIRST(&si->si_snapshots) == ip) 698 FREE(space, M_UFSMNT); 699 si->si_gen++; 700 mutex_exit(&si->si_lock); 701done: 702 if (mp->mnt_wapbl) 703 copy_fs->fs_flags &= ~FS_DOWAPBL; 704 free(copy_fs->fs_csp, M_UFSMNT); 705 if (!error) { 706 error = UFS_WAPBL_BEGIN(mp); 707 if (!error) { 708 error = bread(vp, lblkno(fs, fs->fs_sblockloc), 709 fs->fs_bsize, KERNCRED, B_MODIFY, &nbp); 710 if (error) { 711 brelse(nbp, 0); 712 } else { 713 bcopy(sbbuf, nbp->b_data, fs->fs_bsize); 714 bawrite(nbp); 715 } 716 UFS_WAPBL_END(mp); 717 } 718 if (error) 719 fs->fs_snapinum[snaploc] = 0; 720 } 721out: 722 /* 723 * Invalidate and free all pages on the snapshot vnode. 724 * We will read and write through the buffercache. 725 */ 726 if (!error) { 727 mutex_enter(&vp->v_interlock); 728 error = VOP_PUTPAGES(vp, 0, 0, 729 PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 730 } 731#ifdef WAPBL 732 if (!error && mp->mnt_wapbl) 733 error = wapbl_flush(mp->mnt_wapbl, 1); 734#endif 735 if (suspended) { 736 vfs_resume(vp->v_mount); 737#ifdef DEBUG 738 if (starttime.tv_sec > 0) { 739 getmicrotime(&endtime); 740 timersub(&endtime, &starttime, &endtime); 741 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 742 vp->v_mount->mnt_stat.f_mntonname, 743 (long)endtime.tv_sec, endtime.tv_usec / 1000, 744 redo, fs->fs_ncg); 745 } 746#endif 747 } 748 if (sbbuf) 749 free(sbbuf, M_UFSMNT); 750 if (fs->fs_active != 0) { 751 FREE(fs->fs_active, M_DEVBUF); 752 fs->fs_active = 0; 753 } 754 mp->mnt_flag = flag; 755 if (error) { 756 if (!UFS_WAPBL_BEGIN(mp)) { 757 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 758 UFS_WAPBL_END(mp); 759 } 760 } else 761 vref(vp); 762 if (snapshot_locked) 763 mutex_exit(&si->si_snaplock); 764 return (error); 765} 766 767/* 768 * Copy a cylinder group map. 
 * All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 *
 * cg     - cylinder group number to copy
 * vp     - the snapshot vnode
 * data   - destination buffer (one fs block) receiving the cg map
 * passno - 1 = pre-suspension pass, 2 = fixup pass after suspension
 */
static int
cgaccount(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, ns, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	/* Read the on-disk cylinder group and validate its magic number. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, 0, &bp);
	if (error) {
		brelse(bp, 0);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	/* Mark this cg as captured so pass 2 can skip unchanged groups. */
	ACTIVECG_SET(fs, cg);

	bcopy(bp->b_data, data, fs->fs_cgsize);
	brelse(bp, 0);
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	/* Range of snapshot logical blocks covered by this cg. */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/* Direct block pointers of the snapshot inode, if this cg maps any. */
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	/* Remaining pointers live in (preallocated) indirect blocks. */
	if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Crossed into the next indirect block. */
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
 *
 * snapvp      - the snapshot being created
 * cancelip    - inode whose blocks are to be expunged from snapvp's view
 * fs          - superblock (copy) to do the accounting against
 * acctfunc    - per-block accounting callback (fullacct/snapacct/mapacct)
 * expungetype - BLK_NOCOPY or BLK_SNAP, passed through to acctfunc
 */
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    acctfunc_t acctfunc, int expungetype)
{
	int i, error, ns;
	daddr_t lbn, rlbn;
	daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip1;
	struct ufs2_dinode *dip2;
	void *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &bp);
	} else {
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error)
		return error;
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC) {
		dip1 = (struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0)
			dip1->di_mode = 0;
		dip1->di_size = 0;
		dip1->di_blocks = 0;
		dip1->di_flags =
		    ufs_rw32(ufs_rw32(dip1->di_flags, ns) & ~SF_SNAPSHOT, ns);
		bzero(&dip1->di_db[0], (NDADDR + NIADDR) * sizeof(int32_t));
	} else {
		dip2 = (struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0)
			dip2->di_mode = 0;
		dip2->di_size = 0;
		dip2->di_blocks = 0;
		dip2->di_flags =
		    ufs_rw32(ufs_rw32(dip2->di_flags, ns) & ~SF_SNAPSHOT, ns);
		bzero(&dip2->di_db[0], (NDADDR + NIADDR) * sizeof(int64_t));
	}
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* First the direct block pointers ... */
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_db[0];
	else
		bap = &cancelip->i_ffs2_db[0];
	if ((error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype)))
		return (error);
	/* ... then the indirect block pointers themselves (lblkno -1). */
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_ib[0];
	else
		bap = &cancelip->i_ffs2_ib[0];
	if ((error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype)))
		return (error);
	/* Finally descend each level of indirection. */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i,
		    ib_get(cancelip, i), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
953 */ 954static int 955indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, 956 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, 957 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) 958{ 959 int error, num, i; 960 daddr_t subblksperindir; 961 struct indir indirs[NIADDR + 2]; 962 daddr_t last; 963 void *bap; 964 struct buf *bp; 965 966 if (blkno == 0) { 967 if (expungetype == BLK_NOCOPY) 968 return (0); 969 panic("indiracct: missing indir"); 970 } 971 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 972 return (error); 973 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 974 panic("indiracct: botched params"); 975 /* 976 * We have to expand bread here since it will deadlock looking 977 * up the block number for any blocks that are not in the cache. 978 */ 979 error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, 980 false, &bp); 981 if (error) 982 return error; 983 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 984 rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { 985 brelse(bp, 0); 986 return (error); 987 } 988 /* 989 * Account for the block pointers in this indirect block. 990 */ 991 last = howmany(remblks, blksperindir); 992 if (last > NINDIR(fs)) 993 last = NINDIR(fs); 994 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 995 bcopy(bp->b_data, (void *)bap, fs->fs_bsize); 996 brelse(bp, 0); 997 error = (*acctfunc)(snapvp, bap, 0, last, 998 fs, level == 0 ? rlbn : -1, expungetype); 999 if (error || level == 0) 1000 goto out; 1001 /* 1002 * Account for the block pointers in each of the indirect blocks 1003 * in the levels below us. 
1004 */ 1005 subblksperindir = blksperindir / NINDIR(fs); 1006 for (lbn++, level--, i = 0; i < last; i++) { 1007 error = indiracct(snapvp, cancelvp, level, 1008 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, 1009 subblksperindir, fs, acctfunc, expungetype); 1010 if (error) 1011 goto out; 1012 rlbn += blksperindir; 1013 lbn -= blksperindir; 1014 remblks -= blksperindir; 1015 } 1016out: 1017 FREE(bap, M_DEVBUF); 1018 return (error); 1019} 1020 1021/* 1022 * Do both snap accounting and map accounting. 1023 */ 1024static int 1025fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1026 struct fs *fs, daddr_t lblkno, 1027 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1028{ 1029 int error; 1030 1031 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) 1032 return (error); 1033 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); 1034} 1035 1036/* 1037 * Identify a set of blocks allocated in a snapshot inode. 1038 */ 1039static int 1040snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1041 struct fs *fs, daddr_t lblkno, 1042 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1043{ 1044 struct inode *ip = VTOI(vp); 1045 daddr_t blkno; 1046 daddr_t lbn; 1047 struct buf *ibp; 1048 int error; 1049 1050 for ( ; oldblkp < lastblkp; oldblkp++) { 1051 blkno = idb_get(ip, bap, oldblkp); 1052 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1053 continue; 1054 lbn = fragstoblks(fs, blkno); 1055 if (lbn < NDADDR) { 1056 blkno = db_get(ip, lbn); 1057 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1058 } else { 1059 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1060 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1061 if (error) 1062 return (error); 1063 blkno = idb_get(ip, ibp->b_data, 1064 (lbn - NDADDR) % NINDIR(fs)); 1065 } 1066 /* 1067 * If we are expunging a snapshot vnode and we 1068 * find a block marked BLK_NOCOPY, then it is 1069 * one that has been allocated to this snapshot after 1070 * we took our current snapshot 
and can be ignored.
		 */
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp, 0);
		} else {
			if (blkno != 0)
				panic("snapacct: bad block");
			if (lbn < NDADDR)
				db_assign(ip, lbn, expungetype);
			else {
				idb_assign(ip, ibp->b_data,
				    (lbn - NDADDR) % NINDIR(fs), expungetype);
				bdwrite(ibp);
			}
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno, int expungetype)
{
	daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	/*
	 * lblkno == -1 means the caller is walking an interior indirect
	 * block whose logical offsets are not data offsets; skip the
	 * preallocated-block hint list accounting in that case.
	 */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}
#endif /* !defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	/* Drop the extra reference held for the incore snapshot list. */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the removed slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct snap_info *si;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_snaplock);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		mutex_enter(&si->si_lock);
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
		} else {
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			/* Last snapshot gone: stop intercepting writes. */
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
			mutex_enter(&si->si_lock);
		}
		si->si_gen++;
		mutex_exit(&si->si_lock);
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	mutex_exit(&si->si_snaplock);
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			/* Another snapshot claimed the block; release it. */
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
1280 */ 1281int 1282ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, 1283 long size, ino_t inum) 1284{ 1285 struct mount *mp = devvp->v_specmountpoint; 1286 struct buf *ibp; 1287 struct inode *ip; 1288 struct vnode *vp = NULL; 1289 struct snap_info *si; 1290 void *saved_data = NULL; 1291 ufs_lbn_t lbn; 1292 ufs2_daddr_t blkno; 1293 uint32_t gen; 1294 int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1295 1296 si = VFSTOUFS(mp)->um_snapinfo; 1297 lbn = fragstoblks(fs, bno); 1298 mutex_enter(&si->si_lock); 1299retry: 1300 gen = si->si_gen; 1301 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1302 vp = ITOV(ip); 1303 if (snapshot_locked == 0) { 1304 if (!mutex_tryenter(&si->si_snaplock)) { 1305 mutex_exit(&si->si_lock); 1306 mutex_enter(&si->si_snaplock); 1307 mutex_enter(&si->si_lock); 1308 } 1309 snapshot_locked = 1; 1310 if (gen != si->si_gen) 1311 goto retry; 1312 } 1313 /* 1314 * Lookup block being written. 1315 */ 1316 if (lbn < NDADDR) { 1317 blkno = db_get(ip, lbn); 1318 } else { 1319 mutex_exit(&si->si_lock); 1320 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1321 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1322 if (error) { 1323 mutex_enter(&si->si_lock); 1324 break; 1325 } 1326 indiroff = (lbn - NDADDR) % NINDIR(fs); 1327 blkno = idb_get(ip, ibp->b_data, indiroff); 1328 mutex_enter(&si->si_lock); 1329 if (gen != si->si_gen) { 1330 brelse(ibp, 0); 1331 goto retry; 1332 } 1333 } 1334 /* 1335 * Check to see if block needs to be copied. 1336 */ 1337 if (blkno == 0) { 1338 /* 1339 * A block that we map is being freed. If it has not 1340 * been claimed yet, we will claim or copy it (below). 1341 */ 1342 claimedblk = 1; 1343 } else if (blkno == BLK_SNAP) { 1344 /* 1345 * No previous snapshot claimed the block, 1346 * so it will be freed and become a BLK_NOCOPY 1347 * (don't care) for us. 
1348 */ 1349 if (claimedblk) 1350 panic("snapblkfree: inconsistent block type"); 1351 if (lbn < NDADDR) { 1352 db_assign(ip, lbn, BLK_NOCOPY); 1353 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1354 } else { 1355 idb_assign(ip, ibp->b_data, indiroff, 1356 BLK_NOCOPY); 1357 mutex_exit(&si->si_lock); 1358 if (ip->i_ffs_effnlink > 0) 1359 bwrite(ibp); 1360 else 1361 bdwrite(ibp); 1362 mutex_enter(&si->si_lock); 1363 if (gen != si->si_gen) 1364 goto retry; 1365 } 1366 continue; 1367 } else /* BLK_NOCOPY or default */ { 1368 /* 1369 * If the snapshot has already copied the block 1370 * (default), or does not care about the block, 1371 * it is not needed. 1372 */ 1373 if (lbn >= NDADDR) 1374 brelse(ibp, 0); 1375 continue; 1376 } 1377 /* 1378 * If this is a full size block, we will just grab it 1379 * and assign it to the snapshot inode. Otherwise we 1380 * will proceed to copy it. See explanation for this 1381 * routine as to why only a single snapshot needs to 1382 * claim this block. 1383 */ 1384 if (size == fs->fs_bsize) { 1385#ifdef DEBUG 1386 if (snapdebug) 1387 printf("%s %llu lbn %" PRId64 1388 "from inum %llu\n", 1389 "Grabonremove: snapino", 1390 (unsigned long long)ip->i_number, 1391 lbn, (unsigned long long)inum); 1392#endif 1393 mutex_exit(&si->si_lock); 1394 if (lbn < NDADDR) { 1395 db_assign(ip, lbn, bno); 1396 } else { 1397 idb_assign(ip, ibp->b_data, indiroff, bno); 1398 if (ip->i_ffs_effnlink > 0) 1399 bwrite(ibp); 1400 else 1401 bdwrite(ibp); 1402 } 1403 DIP_ADD(ip, blocks, btodb(size)); 1404 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1405 if (ip->i_ffs_effnlink > 0 && mp->mnt_wapbl) 1406 error = syncsnap(vp); 1407 else 1408 error = 0; 1409 mutex_exit(&si->si_snaplock); 1410 return (error == 0); 1411 } 1412 if (lbn >= NDADDR) 1413 brelse(ibp, 0); 1414#ifdef DEBUG 1415 if (snapdebug) 1416 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", 1417 "Copyonremove: snapino ", 1418 (unsigned long long)ip->i_number, 1419 lbn, "for inum", (unsigned long long)inum, size); 
1420#endif 1421 /* 1422 * If we have already read the old block contents, then 1423 * simply copy them to the new block. Note that we need 1424 * to synchronously write snapshots that have not been 1425 * unlinked, and hence will be visible after a crash, 1426 * to ensure their integrity. 1427 */ 1428 mutex_exit(&si->si_lock); 1429 if (saved_data == NULL) { 1430 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1431 error = rwfsblk(vp, B_READ, saved_data, lbn); 1432 if (error) { 1433 free(saved_data, M_UFSMNT); 1434 saved_data = NULL; 1435 mutex_enter(&si->si_lock); 1436 break; 1437 } 1438 } 1439 error = wrsnapblk(vp, saved_data, lbn); 1440 if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl) 1441 error = syncsnap(vp); 1442 mutex_enter(&si->si_lock); 1443 if (error) 1444 break; 1445 if (gen != si->si_gen) 1446 goto retry; 1447 } 1448 mutex_exit(&si->si_lock); 1449 if (saved_data) 1450 free(saved_data, M_UFSMNT); 1451 /* 1452 * If we have been unable to allocate a block in which to do 1453 * the copy, then return non-zero so that the fragment will 1454 * not be freed. Although space will be lost, the snapshot 1455 * will stay consistent. 1456 */ 1457 if (snapshot_locked) 1458 mutex_exit(&si->si_snaplock); 1459 return (error); 1460} 1461 1462/* 1463 * Associate snapshot files when mounting. 1464 */ 1465void 1466ffs_snapshot_mount(struct mount *mp) 1467{ 1468 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1469 struct fs *fs = VFSTOUFS(mp)->um_fs; 1470 struct lwp *l = curlwp; 1471 struct vnode *vp; 1472 struct inode *ip, *xp; 1473 struct snap_info *si; 1474 ufs2_daddr_t snaplistsize, *snapblklist; 1475 int i, error, ns, snaploc, loc; 1476 1477 /* 1478 * No persistent snapshots on apple ufs file systems. 1479 */ 1480 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) 1481 return; 1482 1483 si = VFSTOUFS(mp)->um_snapinfo; 1484 ns = UFS_FSNEEDSWAP(fs); 1485 /* 1486 * XXX The following needs to be set before ffs_truncate or 1487 * VOP_READ can be called. 
 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	mutex_enter(&si->si_lock);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			/* Drop the bogus entry; keep the list dense. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}

		/*
		 * Read the block hints list. Use an empty list on
		 * read errors.
		 */
		error = vn_rdwr(UIO_READ, vp,
		    (void *)&snaplistsize, sizeof(snaplistsize),
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
		    l->l_cred, NULL, NULL);
		if (error) {
			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
			snaplistsize = 1;
		} else
			snaplistsize = ufs_rw64(snaplistsize, ns);
		snapblklist = malloc(
		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
		if (error)
			/* Entry 0 is the list length including itself. */
			snapblklist[0] = 1;
		else {
			error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
			    snaplistsize * sizeof(ufs2_daddr_t),
			    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
			    l->l_cred, NULL, NULL);
			for (i = 0; i < snaplistsize; i++)
				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
			if (error) {
				printf("ffs_snapshot_mount: read_2 failed %d\n",
				    error);
				snapblklist[0] = 1;
			}
		}
		ip->i_snapblklist = &snapblklist[0];

		/*
		 * Link it onto the active snapshot list.
		 */
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %llu already on list",
			    (unsigned long long)ip->i_number);
		else
			TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL) {
		mutex_exit(&si->si_lock);
		return;
	}
	/*
	 * Attach the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
	si->si_snapblklist = xp->i_snapblklist;
	fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct inode *xp;
	struct vnode *vp = NULL;
	struct snap_info *si;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
		vp = ITOV(xp);
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_snapblklist == si->si_snapblklist)
			si->si_snapblklist = NULL;
		FREE(xp->i_snapblklist, M_UFSMNT);
		if (xp->i_ffs_effnlink > 0) {
			/* Drop the list reference for linked snapshots. */
			si->si_gen++;
			mutex_exit(&si->si_lock);
			vrele(vp);
			mutex_enter(&si->si_lock);
		}
	}
	if (vp)
		fscow_disestablish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Lookup a snapshots data block address.
 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
 * and safe even for the pagedaemon where we cannot bread().
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	if (lbn < NDADDR) {
		*res = db_get(ip, lbn);
		return 0;
	}
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
	if (curlwp == uvm.pagedaemon_lwp) {
		/*
		 * The pagedaemon must not wait for buffer I/O;
		 * answer only from buffers already valid in core.
		 */
		mutex_enter(&bufcache_lock);
		bp = incore(vp, indirs[num-1].in_lbn);
		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
			error = 0;
		} else
			error = ENOMEM;
		mutex_exit(&bufcache_lock);
		return error;
	}
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
	if (error == 0)
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
	brelse(bp, 0);

	return error;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system or
	 * in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/* Binary search the sorted hint list; entry 0 holds its length. */
	snapblklist = si->si_snapblklist;
	upper = si->si_snapblklist[0] - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		/* Found in the hint list: block already handled. */
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;

		if (curlwp == uvm.pagedaemon_lwp) {
			/* Pagedaemon may not sleep for the snaplock. */
			error = ENOMEM;
			break;
		}

		if (snapshot_locked == 0) {
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;

			/* Check again if block still needs to be copied */
			if (lbn < NDADDR) {
				blkno = db_get(ip, lbn);
			} else {
				mutex_exit(&si->si_lock);
				if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
					mutex_enter(&si->si_lock);
					break;
				}
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}

			if (blkno != 0)
				continue;
		}
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	mutex_exit(&si->si_lock);
	/* Only free the copy buffer if it was ours, not the caller's. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return error;
}

/*
 * Read from a snapshot.
 */
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
	struct buf *bp;
	daddr_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error;

	/* Hold the snapshot lock over the whole transfer so the
	 * snapshot cannot change underneath us. */
	fstrans_start(vp->v_mount, FSTRANS_SHARED);
	mutex_enter(&si->si_snaplock);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = ip->i_size - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		/* Read ahead the next block unless this is the last one. */
		if (lblktosize(fs, nextlbn) >= ip->i_size)
			error = bread(vp, lbn, size, NOCRED, 0, &bp);
		else {
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp, BC_AGE);
	}
	if (bp != NULL)
		brelse(bp, BC_AGE);

	mutex_exit(&si->si_snaplock);
	fstrans_done(vp->v_mount);
	return error;
}

/*
 * Read or write the specified block of the filesystem vp resides on
 * from or to the disk bypassing the buffer cache.
 */
static int
rwfsblk(struct vnode *vp, int flags, void *data, ufs2_daddr_t lbn)
{
	int error;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *nbp;

	/* Build a private iobuf so the buffer cache is never involved. */
	nbp = getiobuf(NULL, true);
	nbp->b_flags = flags;
	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
	nbp->b_error = 0;
	nbp->b_data = data;
	nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
	nbp->b_proc = NULL;
	nbp->b_dev = ip->i_devvp->v_rdev;
	SET(nbp->b_cflags, BC_BUSY);	/* mark buffer busy */

	bdev_strategy(nbp);

	error = biowait(nbp);

	putiobuf(nbp);

	return error;
}

/*
 * Write all dirty buffers to disk and invalidate them.
 */
static int
syncsnap(struct vnode *vp)
{
	int error;
	buf_t *bp;
	struct fs *fs = VTOI(vp)->i_fs;

	mutex_enter(&bufcache_lock);
	while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
		KASSERT((bp->b_cflags & BC_BUSY) == 0);
		KASSERT(bp->b_bcount == fs->fs_bsize);
		bp->b_cflags |= BC_BUSY;
		mutex_exit(&bufcache_lock);
		/* Push the block straight to disk, then toss the buffer. */
		error = rwfsblk(vp, B_WRITE, bp->b_data,
		    fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
		brelse(bp, BC_INVAL | BC_VFLUSH);
		if (error)
			return error;
		mutex_enter(&bufcache_lock);
	}
	mutex_exit(&bufcache_lock);

	return 0;
}

/*
 * Write the specified block to a snapshot.
 */
static int
wrsnapblk(struct vnode *vp, void *data, ufs2_daddr_t lbn)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error;

	error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
	    KERNCRED, (ip->i_ffs_effnlink > 0 ? B_SYNC : 0), &bp);
	if (error)
		return error;
	bcopy(data, bp->b_data, fs->fs_bsize);
	/* Linked (crash-visible) snapshots are written synchronously. */
	if (ip->i_ffs_effnlink > 0)
		error = bwrite(bp);
	else
		bawrite(bp);

	return error;
}

/*
 * Get/Put direct block from inode or buffer containing disk addresses. Take
 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
 * into a global include.
 */
static inline ufs2_daddr_t
db_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
db_assign(struct inode *ip, int loc, ufs2_daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

static inline ufs2_daddr_t
ib_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
ib_assign(struct inode *ip, int loc, ufs2_daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

static inline ufs2_daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(((ufs1_daddr_t *)(bf))[loc],
		    UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(((ufs2_daddr_t *)(bf))[loc],
		    UFS_IPNEEDSWAP(ip));
}

static inline void
idb_assign(struct inode *ip, void *bf, int loc, ufs2_daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		((ufs1_daddr_t *)(bf))[loc] =
		    ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		((ufs2_daddr_t *)(bf))[loc] =
		    ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}