ffs_snapshot.c revision 1.1
1/* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 * 35 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 36 */ 37 38#include <sys/cdefs.h> 39__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.1 2004/05/25 14:54:59 hannken Exp $"); 40 41#include <sys/param.h> 42#include <sys/kernel.h> 43#include <sys/systm.h> 44#include <sys/conf.h> 45#include <sys/buf.h> 46#include <sys/proc.h> 47#include <sys/namei.h> 48#include <sys/sched.h> 49#include <sys/stat.h> 50#include <sys/malloc.h> 51#include <sys/mount.h> 52#include <sys/resource.h> 53#include <sys/resourcevar.h> 54#include <sys/vnode.h> 55 56#include <miscfs/specfs/specdev.h> 57 58#include <ufs/ufs/quota.h> 59#include <ufs/ufs/ufsmount.h> 60#include <ufs/ufs/inode.h> 61#include <ufs/ufs/ufs_extern.h> 62#include <ufs/ufs/ufs_bswap.h> 63 64#include <ufs/ffs/fs.h> 65#include <ufs/ffs/ffs_extern.h> 66 67/* FreeBSD -> NetBSD conversion */ 68#define KERNCRED proc0.p_ucred 69#define ufs1_daddr_t int32_t 70#define ufs2_daddr_t int64_t 71#define ufs_lbn_t daddr_t 72#define VI_MTX(v) (&(v)->v_interlock) 73#define VI_LOCK(v) simple_lock(&(v)->v_interlock) 74#define VI_UNLOCK(v) simple_unlock(&(v)->v_interlock) 75#define MNT_ILOCK(v) simple_lock(&mntvnode_slock) 76#define MNT_IUNLOCK(v) simple_unlock(&mntvnode_slock) 77 78static int cgaccount(int, struct vnode *, caddr_t, int); 79static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 80 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 81 ufs_lbn_t, int), int); 82static int indiracct_ufs1(struct vnode *, struct vnode *, int, 83 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 84 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 85 ufs_lbn_t, int), int); 86static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 87 struct fs *, ufs_lbn_t, int); 88static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 89 struct fs *, ufs_lbn_t, int); 90static int 
mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 91 struct fs *, ufs_lbn_t, int); 92static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 93 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 94 ufs_lbn_t, int), int); 95static int indiracct_ufs2(struct vnode *, struct vnode *, int, 96 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 97 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 98 ufs_lbn_t, int), int); 99static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 100 struct fs *, ufs_lbn_t, int); 101static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 102 struct fs *, ufs_lbn_t, int); 103static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 104 struct fs *, ufs_lbn_t, int); 105static int ffs_copyonwrite(void *, struct buf *); 106static int readfsblk(struct vnode *, caddr_t, ufs2_daddr_t); 107static int readvnblk(struct vnode *, caddr_t, ufs2_daddr_t); 108static int writevnblk(struct vnode *, caddr_t, ufs2_daddr_t); 109static inline ufs2_daddr_t db_get(struct inode *, int); 110static inline void db_assign(struct inode *, int, ufs2_daddr_t); 111static inline ufs2_daddr_t idb_get(struct inode *, caddr_t, int); 112static inline void idb_assign(struct inode *, caddr_t, int, ufs2_daddr_t); 113 114/* 115 * To ensure the consistency of snapshots across crashes, we must 116 * synchronously write out copied blocks before allowing the 117 * originals to be modified. Because of the rather severe speed 118 * penalty that this imposes, the following flag allows this 119 * crash persistence to be disabled. 120 */ 121static int dopersistence = 1; 122 123#ifdef DEBUG 124static int snapdebug = 0; 125#endif 126 127/* 128 * Create a snapshot file and initialize it for the filesystem. 129 * Vnode is locked on entry and unlocked on return. 
130 */ 131int 132ffs_snapshot(mp, vp, ctime) 133 struct mount *mp; 134 struct vnode *vp; 135 struct timespec *ctime; 136{ 137 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist; 138 int error, ns, cg, snaploc; 139 int i, size, len, loc; 140 int flag = mp->mnt_flag; 141 struct timeval starttime; 142#ifdef DEBUG 143 struct timeval endtime; 144#endif 145 struct timespec ts; 146 long redo = 0; 147 int32_t *lp; 148 void *space; 149 caddr_t cgbuf; 150 struct ufsmount *ump = VFSTOUFS(mp); 151 struct fs *copy_fs = NULL, *fs = ump->um_fs; 152 struct proc *p = curproc; 153 struct inode *ip, *xp; 154 struct buf *bp, *ibp; 155 struct vattr vat; 156 struct vnode *xvp, *nvp, *devvp; 157 struct vop_vfree_args args; 158 159 ns = UFS_FSNEEDSWAP(fs); 160 /* 161 * Need to serialize access to snapshot code per filesystem. 162 */ 163 /* 164 * If the vnode already is a snapshot, return. 165 */ 166 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 167 if (ctime) { 168 ctime->tv_sec = DIP(VTOI(vp), mtime); 169 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 170 } 171 VOP_UNLOCK(vp, 0); 172 return 0; 173 } 174 /* 175 * Check mount and check for exclusive reference. 176 */ 177 if (vp->v_mount != mp) { 178 VOP_UNLOCK(vp, 0); 179 return EXDEV; 180 } 181 if (vp->v_usecount != 1 || vp->v_writecount != 0) { 182 VOP_UNLOCK(vp, 0); 183 return EBUSY; 184 } 185 if (vp->v_size != 0) { 186 error = VOP_TRUNCATE(vp, 0, 0, NOCRED, p); 187 if (error) { 188 VOP_UNLOCK(vp, 0); 189 return error; 190 } 191 } 192 /* 193 * Assign a snapshot slot in the superblock. 194 */ 195 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 196 if (fs->fs_snapinum[snaploc] == 0) 197 break; 198 if (snaploc == FSMAXSNAP) 199 return (ENOSPC); 200 ip = VTOI(vp); 201 devvp = ip->i_devvp; 202 /* 203 * Allocate and copy the last block contents so as to be able 204 * to set size to that of the filesystem. 
205 */ 206 numblks = howmany(fs->fs_size, fs->fs_frag); 207 cgbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 208 if ((error = readfsblk(vp, cgbuf, numblks - 1)) != 0) 209 goto out; 210 error = vn_rdwr(UIO_WRITE, vp, 211 cgbuf, fs->fs_bsize, lblktosize(fs, (off_t)(numblks - 1)), 212 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, p); 213 if (error) 214 goto out; 215 /* 216 * Preallocate critical data structures so that we can copy 217 * them in without further allocation after we suspend all 218 * operations on the filesystem. We would like to just release 219 * the allocated buffers without writing them since they will 220 * be filled in below once we are ready to go, but this upsets 221 * the soft update code, so we go ahead and write the new buffers. 222 * 223 * Allocate all indirect blocks and mark all of them as not 224 * needing to be copied. 225 */ 226 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 227 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 228 fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); 229 if (error) 230 goto out; 231 bwrite(ibp); 232 } 233 /* 234 * Allocate copies for the superblock and its summary information. 235 */ 236 bzero(cgbuf, fs->fs_bsize); 237 blkno = lblkno(fs, fs->fs_sblockloc); 238 for (loc = 0; loc < howmany(fs->fs_sbsize, fs->fs_bsize); loc++) 239 if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0) 240 goto out; 241 blkno = fragstoblks(fs, fs->fs_csaddr); 242 for (loc = 0; loc < howmany(fs->fs_cssize, fs->fs_bsize); loc++) 243 if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0) 244 goto out; 245 /* 246 * Allocate all cylinder group blocks. 247 */ 248 for (cg = 0; cg < fs->fs_ncg; cg++) 249 if ((error = writevnblk(vp, cgbuf, 250 fragstoblks(fs, cgtod(fs, cg)))) != 0) 251 goto out; 252 /* 253 * Copy all the cylinder group maps. Although the 254 * filesystem is still active, we hope that only a few 255 * cylinder groups will change between now and when we 256 * suspend operations. 
Thus, we will be able to quickly 257 * touch up the few cylinder groups that changed during 258 * the suspension period. 259 */ 260 len = howmany(fs->fs_ncg, NBBY); 261 MALLOC(fs->fs_active, u_char *, len, M_DEVBUF, M_WAITOK | M_ZERO); 262 for (cg = 0; cg < fs->fs_ncg; cg++) { 263 if ((error = cgaccount(cg, vp, cgbuf, 1)) != 0) 264 goto out; 265 if ((error = writevnblk(vp, cgbuf, 266 fragstoblks(fs, cgtod(fs, cg)))) != 0) 267 goto out; 268 } 269 /* 270 * Change inode to snapshot type file. 271 */ 272 ip->i_flags |= SF_SNAPSHOT; 273 DIP_ASSIGN(ip, flags, ip->i_flags); 274 ip->i_flag |= IN_CHANGE | IN_UPDATE; 275 /* 276 * Ensure that the snapshot is completely on disk. 277 * Since we have marked it as a snapshot it is safe to 278 * unlock it as no process will be allowed to write to it. 279 */ 280 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0, p)) != 0) 281 goto out; 282 VOP_UNLOCK(vp, 0); 283 /* 284 * All allocations are done, so we can now snapshot the system. 285 * 286 * Suspend operation on filesystem. 287 */ 288 if ((error = vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) != 0) { 289 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 290 goto out; 291 } 292 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 293 microtime(&starttime); 294 /* 295 * First, copy all the cylinder group maps that have changed. 296 */ 297 for (cg = 0; cg < fs->fs_ncg; cg++) { 298 if (ACTIVECG_ISSET(fs, cg)) 299 continue; 300 redo++; 301 if ((error = cgaccount(cg, vp, cgbuf, 2)) != 0) 302 goto out1; 303 if ((error = writevnblk(vp, cgbuf, 304 fragstoblks(fs, cgtod(fs, cg)))) != 0) 305 goto out1; 306 } 307 /* 308 * Grab a copy of the superblock and its summary information. 309 * We delay writing it until the suspension is released below. 310 */ 311 loc = blkoff(fs, fs->fs_sblockloc); 312 if (loc > 0) 313 bzero(&cgbuf[0], loc); 314 copy_fs = (struct fs *)(cgbuf + loc); 315 bcopy(fs, copy_fs, fs->fs_sbsize); 316 size = fs->fs_bsize < SBLOCKSIZE ? 
fs->fs_bsize : SBLOCKSIZE; 317 if (fs->fs_sbsize < size) 318 bzero(&cgbuf[loc + fs->fs_sbsize], size - fs->fs_sbsize); 319 size = blkroundup(fs, fs->fs_cssize); 320 if (fs->fs_contigsumsize > 0) 321 size += fs->fs_ncg * sizeof(int32_t); 322 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 323 copy_fs->fs_csp = space; 324 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 325 (char *)space += fs->fs_cssize; 326 loc = howmany(fs->fs_cssize, fs->fs_fsize); 327 i = fs->fs_frag - loc % fs->fs_frag; 328 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 329 if (len > 0) { 330 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 331 len, KERNCRED, &bp)) != 0) { 332 brelse(bp); 333 free(copy_fs->fs_csp, M_UFSMNT); 334 goto out1; 335 } 336 bcopy(bp->b_data, space, (u_int)len); 337 (char *)space += len; 338 bp->b_flags |= B_INVAL | B_NOCACHE; 339 brelse(bp); 340 } 341 if (fs->fs_contigsumsize > 0) { 342 copy_fs->fs_maxcluster = lp = space; 343 for (i = 0; i < fs->fs_ncg; i++) 344 *lp++ = fs->fs_contigsumsize; 345 } 346 /* 347 * We must check for active files that have been unlinked 348 * (e.g., with a zero link count). We have to expunge all 349 * trace of these files from the snapshot so that they are 350 * not reclaimed prematurely by fsck or unnecessarily dumped. 351 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 352 * spec_strategy about writing on a suspended filesystem. 353 * Note that we skip unlinked snapshot files as they will 354 * be handled separately below. 355 * 356 * We also calculate the needed size for the snapshot list. 357 */ 358 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 359 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 360 MNT_ILOCK(mp); 361loop: 362 for (xvp = LIST_FIRST(&mp->mnt_vnodelist); xvp; xvp = nvp) { 363 /* 364 * Make sure this vnode wasn't reclaimed in getnewvnode(). 365 * Start over if it has (it won't be on the list anymore). 
366 */ 367 if (xvp->v_mount != mp) 368 goto loop; 369 nvp = LIST_NEXT(xvp, v_mntvnodes); 370 VI_LOCK(xvp); 371 MNT_IUNLOCK(mp); 372 if ((xvp->v_flag & VXLOCK) || 373 xvp->v_usecount == 0 || xvp->v_type == VNON || 374 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 375 VI_UNLOCK(xvp); 376 MNT_ILOCK(mp); 377 continue; 378 } 379 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { 380 MNT_ILOCK(mp); 381 goto loop; 382 } 383#ifdef DEBUG 384 if (snapdebug) 385 vprint("ffs_snapshot: busy vnode", xvp); 386#endif 387 if (VOP_GETATTR(xvp, &vat, p->p_ucred, p) == 0 && 388 vat.va_nlink > 0) { 389 VOP_UNLOCK(xvp, 0); 390 MNT_ILOCK(mp); 391 continue; 392 } 393 xp = VTOI(xvp); 394 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 395 VOP_UNLOCK(xvp, 0); 396 MNT_ILOCK(mp); 397 continue; 398 } 399 /* 400 * If there is a fragment, clear it here. 401 */ 402 blkno = 0; 403 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 404 if (loc < NDADDR) { 405 len = fragroundup(fs, blkoff(fs, xp->i_size)); 406 if (len < fs->fs_bsize) { 407 ffs_blkfree(copy_fs, vp, db_get(xp, loc), 408 len, xp->i_number); 409 blkno = db_get(xp, loc); 410 db_assign(xp, loc, 0); 411 } 412 } 413 snaplistsize += 1; 414 if (xp->i_ump->um_fstype == UFS1) 415 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 416 BLK_NOCOPY); 417 else 418 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 419 BLK_NOCOPY); 420 if (blkno) 421 db_assign(xp, loc, blkno); 422 if (!error) { 423 args.a_pvp = vp; 424 args.a_ino = xp->i_number; 425 args.a_mode = xp->i_mode; 426 error = ffs_freefile(&args); 427 } 428 VOP_UNLOCK(xvp, 0); 429 if (error) { 430 free(copy_fs->fs_csp, M_UFSMNT); 431 goto out1; 432 } 433 MNT_ILOCK(mp); 434 } 435 MNT_IUNLOCK(mp); 436 /* 437 * If there already exist snapshots on this filesystem, grab a 438 * reference to their shared lock. If this is the first snapshot 439 * on this filesystem, we need to allocate a lock for the snapshots 440 * to share. 
In either case, acquire the snapshot lock and give 441 * up our original private lock. 442 */ 443 VI_LOCK(devvp); 444 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 445 struct lock *lkp; 446 447 lkp = ITOV(xp)->v_vnlock; 448 VI_UNLOCK(devvp); 449 VI_LOCK(vp); 450 vp->v_vnlock = lkp; 451 } else { 452 struct lock *lkp; 453 454 VI_UNLOCK(devvp); 455 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 456 M_WAITOK); 457 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 458 VI_LOCK(vp); 459 vp->v_vnlock = lkp; 460 } 461 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 462 transferlockers(&vp->v_lock, vp->v_vnlock); 463 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 464 /* 465 * If this is the first snapshot on this filesystem, then we need 466 * to allocate the space for the list of preallocated snapshot blocks. 467 * This list will be refined below, but this preliminary one will 468 * keep us out of deadlock until the full one is ready. 469 */ 470 if (xp == NULL) { 471 MALLOC(snapblklist, ufs2_daddr_t *, 472 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 473 blkp = &snapblklist[1]; 474 *blkp++ = ufs_rw64(lblkno(fs, fs->fs_sblockloc), ns); 475 blkno = fragstoblks(fs, fs->fs_csaddr); 476 for (cg = 0; cg < fs->fs_ncg; cg++) { 477 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 478 break; 479 *blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns); 480 } 481 len = howmany(fs->fs_cssize, fs->fs_bsize); 482 for (loc = 0; loc < len; loc++) 483 *blkp++ = ufs_rw64(blkno + loc, ns); 484 for (; cg < fs->fs_ncg; cg++) 485 *blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns); 486 snapblklist[0] = ufs_rw64(blkp - snapblklist, ns); 487 VI_LOCK(devvp); 488 if (ump->um_snapblklist != NULL) 489 panic("ffs_snapshot: non-empty list"); 490 ump->um_snapblklist = snapblklist; 491 ump->um_snaplistsize = blkp - snapblklist; 492 VI_UNLOCK(devvp); 493 } 494 /* 495 * Record snapshot inode. Since this is the newest snapshot, 496 * it must be placed at the end of the list. 
497 */ 498 VI_LOCK(devvp); 499 fs->fs_snapinum[snaploc] = ip->i_number; 500 if (ip->i_nextsnap.tqe_prev != 0) 501 panic("ffs_snapshot: %d already on list", ip->i_number); 502 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 503 VI_UNLOCK(devvp); 504 if (xp == NULL) 505 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 506 vp->v_flag |= VSYSTEM; 507out1: 508 /* 509 * Resume operation on filesystem. 510 */ 511 vfs_write_resume(vp->v_mount); 512 /* 513 * Set the mtime to the time the snapshot has been taken. 514 */ 515 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 516 if (ctime) 517 *ctime = ts; 518 DIP_ASSIGN(ip, mtime, ts.tv_sec); 519 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 520 ip->i_flag |= IN_CHANGE | IN_UPDATE; 521 522#ifdef DEBUG 523 if (starttime.tv_sec > 0) { 524 microtime(&endtime); 525 timersub(&endtime, &starttime, &endtime); 526 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 527 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 528 endtime.tv_usec / 1000, redo, fs->fs_ncg); 529 } 530#endif 531 if (error) 532 goto out; 533 /* 534 * Copy allocation information from all the snapshots in 535 * this snapshot and then expunge them from its view. 536 */ 537 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) { 538 if (xp == ip) 539 break; 540 if (xp->i_ump->um_fstype == UFS1) 541 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 542 BLK_SNAP); 543 else 544 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 545 BLK_SNAP); 546 if (error) { 547 fs->fs_snapinum[snaploc] = 0; 548 goto done; 549 } 550 } 551 /* 552 * Allocate space for the full list of preallocated snapshot blocks. 553 */ 554 MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t), 555 M_UFSMNT, M_WAITOK); 556 ip->i_snapblklist = &snapblklist[1]; 557 /* 558 * Expunge the blocks used by the snapshots from the set of 559 * blocks marked as used in the snapshot bitmaps. Also, collect 560 * the list of allocated blocks in i_snapblklist. 
561 */ 562 if (ip->i_ump->um_fstype == UFS1) 563 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 564 else 565 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 566 if (error) { 567 fs->fs_snapinum[snaploc] = 0; 568 FREE(snapblklist, M_UFSMNT); 569 goto done; 570 } 571 if (snaplistsize < ip->i_snapblklist - snapblklist) 572 panic("ffs_snapshot: list too small"); 573 snaplistsize = ip->i_snapblklist - snapblklist; 574 snapblklist[0] = ufs_rw64(snaplistsize, ns); 575 ip->i_snapblklist = 0; 576 /* 577 * Write out the list of allocated blocks to the end of the snapshot. 578 */ 579 error = vn_rdwr(UIO_WRITE, vp, 580 (caddr_t)snapblklist, snaplistsize*sizeof(ufs2_daddr_t), ip->i_size, 581 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, p); 582 if (error) { 583 fs->fs_snapinum[snaploc] = 0; 584 FREE(snapblklist, M_UFSMNT); 585 goto done; 586 } 587 /* 588 * Write the superblock and its summary information 589 * to the snapshot. 590 */ 591 blkno = fragstoblks(fs, fs->fs_csaddr); 592 len = howmany(fs->fs_cssize, fs->fs_bsize); 593 space = copy_fs->fs_csp; 594 if (ns) { 595 ffs_sb_swap(copy_fs, copy_fs); 596 ffs_csum_swap(space, space, fs->fs_cssize); 597 } 598 for (loc = 0; loc < len; loc++) { 599 if ((error = writevnblk(vp, space, blkno + loc)) != 0) { 600 fs->fs_snapinum[snaploc] = 0; 601 FREE(snapblklist, M_UFSMNT); 602 goto done; 603 } 604 space = (char *)space + fs->fs_bsize; 605 } 606 /* 607 * As this is the newest list, it is the most inclusive, so 608 * should replace the previous list. 
609 */ 610 VI_LOCK(devvp); 611 space = ump->um_snapblklist; 612 ump->um_snapblklist = snapblklist; 613 ump->um_snaplistsize = snaplistsize; 614 VI_UNLOCK(devvp); 615 if (space != NULL) 616 FREE(space, M_UFSMNT); 617done: 618 free(copy_fs->fs_csp, M_UFSMNT); 619 blkno = lblkno(fs, fs->fs_sblockloc); 620 if (error == 0 && (error = writevnblk(vp, cgbuf, blkno)) != 0) 621 fs->fs_snapinum[snaploc] = 0; 622out: 623 if (cgbuf) 624 free(cgbuf, M_UFSMNT); 625 if (fs->fs_active != 0) { 626 FREE(fs->fs_active, M_DEVBUF); 627 fs->fs_active = 0; 628 } 629 mp->mnt_flag = flag; 630 if (error) 631 (void) VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, p); 632 else 633 vref(vp); 634 VOP_UNLOCK(vp, 0); 635 return (error); 636} 637 638/* 639 * Copy a cylinder group map. All the unallocated blocks are marked 640 * BLK_NOCOPY so that the snapshot knows that it need not copy them 641 * if they are later written. If passno is one, then this is a first 642 * pass, so only setting needs to be done. If passno is 2, then this 643 * is a revision to a previous pass which must be undone as the 644 * replacement pass is done. 
645 */ 646static int 647cgaccount(cg, vp, data, passno) 648 int cg; 649 struct vnode *vp; 650 caddr_t data; 651 int passno; 652{ 653 struct buf *bp, *ibp; 654 struct inode *ip; 655 struct cg *cgp; 656 struct fs *fs; 657 ufs2_daddr_t base, numblks; 658 int error, len, loc, ns, indiroff; 659 660 ip = VTOI(vp); 661 fs = ip->i_fs; 662 ns = UFS_FSNEEDSWAP(fs); 663 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 664 (int)fs->fs_cgsize, KERNCRED, &bp); 665 if (error) { 666 brelse(bp); 667 return (error); 668 } 669 cgp = (struct cg *)bp->b_data; 670 if (!cg_chkmagic(cgp, ns)) { 671 brelse(bp); 672 return (EIO); 673 } 674 ACTIVECG_SET(fs, cg); 675 676 bcopy(bp->b_data, data, fs->fs_cgsize); 677 brelse(bp); 678 if (fs->fs_cgsize < fs->fs_bsize) 679 bzero(&data[fs->fs_cgsize], 680 fs->fs_bsize - fs->fs_cgsize); 681 numblks = howmany(fs->fs_size, fs->fs_frag); 682 len = howmany(fs->fs_fpg, fs->fs_frag); 683 base = cg * fs->fs_fpg / fs->fs_frag; 684 if (base + len >= numblks) 685 len = numblks - base - 1; 686 loc = 0; 687 if (base < NDADDR) { 688 for ( ; loc < NDADDR; loc++) { 689 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 690 db_assign(ip, loc, BLK_NOCOPY); 691 else if (db_get(ip, loc) == BLK_NOCOPY) { 692 if (passno == 2) 693 db_assign(ip, loc, 0); 694 else if (passno == 1) 695 panic("ffs_snapshot: lost direct block"); 696 } 697 } 698 } 699 if ((error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 700 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 701 return (error); 702 indiroff = (base + loc - NDADDR) % NINDIR(fs); 703 for ( ; loc < len; loc++, indiroff++) { 704 if (indiroff >= NINDIR(fs)) { 705 bwrite(ibp); 706 if ((error = VOP_BALLOC(vp, 707 lblktosize(fs, (off_t)(base + loc)), 708 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 709 return (error); 710 indiroff = 0; 711 } 712 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 713 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 714 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 715 if 
(passno == 2) 716 idb_assign(ip, ibp->b_data, indiroff, 0); 717 else if (passno == 1) 718 panic("ffs_snapshot: lost indirect block"); 719 } 720 } 721 bwrite(ibp); 722 return (0); 723} 724 725/* 726 * Before expunging a snapshot inode, note all the 727 * blocks that it claims with BLK_SNAP so that fsck will 728 * be able to account for those blocks properly and so 729 * that this snapshot knows that it need not copy them 730 * if the other snapshot holding them is freed. This code 731 * is reproduced once each for UFS1 and UFS2. 732 */ 733static int 734expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 735 struct vnode *snapvp; 736 struct inode *cancelip; 737 struct fs *fs; 738 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 739 struct fs *, ufs_lbn_t, int); 740 int expungetype; 741{ 742 int i, error, ns, indiroff; 743 ufs_lbn_t lbn, rlbn; 744 ufs2_daddr_t len, blkno, numblks, blksperindir; 745 struct ufs1_dinode *dip; 746 struct buf *bp; 747 caddr_t buf; 748 749 ns = UFS_FSNEEDSWAP(fs); 750 /* 751 * Prepare to expunge the inode. If its inode block has not 752 * yet been copied, then allocate and fill the copy. 753 */ 754 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 755 blkno = 0; 756 if (lbn < NDADDR) { 757 blkno = db_get(VTOI(snapvp), lbn); 758 } else { 759 error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 760 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 761 if (error) 762 return (error); 763 indiroff = (lbn - NDADDR) % NINDIR(fs); 764 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 765 brelse(bp); 766 } 767 buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 768 if (blkno != 0) 769 error = readvnblk(snapvp, buf, lbn); 770 else 771 error = readfsblk(snapvp, buf, lbn); 772 if (error) { 773 free(buf, M_UFSMNT); 774 return error; 775 } 776 /* 777 * Set a snapshot inode to be a zero length file, regular files 778 * to be completely unallocated. 
779 */ 780 dip = (struct ufs1_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number); 781 if (expungetype == BLK_NOCOPY) 782 dip->di_mode = 0; 783 dip->di_size = 0; 784 dip->di_blocks = 0; 785 dip->di_flags = 786 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 787 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 788 error = writevnblk(snapvp, buf, lbn); 789 free(buf, M_UFSMNT); 790 if (error) 791 return error; 792 /* 793 * Now go through and expunge all the blocks in the file 794 * using the function requested. 795 */ 796 numblks = howmany(cancelip->i_size, fs->fs_bsize); 797 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0], 798 &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype))) 799 return (error); 800 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0], 801 &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype))) 802 return (error); 803 blksperindir = 1; 804 lbn = -NDADDR; 805 len = numblks - NDADDR; 806 rlbn = NDADDR; 807 for (i = 0; len > 0 && i < NIADDR; i++) { 808 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 809 ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len, 810 blksperindir, fs, acctfunc, expungetype); 811 if (error) 812 return (error); 813 blksperindir *= NINDIR(fs); 814 lbn -= blksperindir + 1; 815 len -= blksperindir; 816 rlbn += blksperindir; 817 } 818 return (0); 819} 820 821/* 822 * Descend an indirect block chain for vnode cancelvp accounting for all 823 * its indirect blocks in snapvp. 
824 */ 825static int 826indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 827 blksperindir, fs, acctfunc, expungetype) 828 struct vnode *snapvp; 829 struct vnode *cancelvp; 830 int level; 831 ufs1_daddr_t blkno; 832 ufs_lbn_t lbn; 833 ufs_lbn_t rlbn; 834 ufs_lbn_t remblks; 835 ufs_lbn_t blksperindir; 836 struct fs *fs; 837 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 838 struct fs *, ufs_lbn_t, int); 839 int expungetype; 840{ 841 int error, ns, num, i; 842 ufs_lbn_t subblksperindir; 843 struct indir indirs[NIADDR + 2]; 844 ufs1_daddr_t last, *bap; 845 struct buf *bp; 846 847 ns = UFS_FSNEEDSWAP(fs); 848 849 if (blkno == 0) { 850 if (expungetype == BLK_NOCOPY) 851 return (0); 852 panic("indiracct_ufs1: missing indir"); 853 } 854 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 855 return (error); 856 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 857 panic("indiracct_ufs1: botched params"); 858 /* 859 * We have to expand bread here since it will deadlock looking 860 * up the block number for any blocks that are not in the cache. 861 */ 862 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 863 bp->b_blkno = fsbtodb(fs, blkno); 864 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 865 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 866 brelse(bp); 867 return (error); 868 } 869 /* 870 * Account for the block pointers in this indirect block. 871 */ 872 last = howmany(remblks, blksperindir); 873 if (last > NINDIR(fs)) 874 last = NINDIR(fs); 875 MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 876 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 877 brelse(bp); 878 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 879 level == 0 ? rlbn : -1, expungetype); 880 if (error || level == 0) 881 goto out; 882 /* 883 * Account for the block pointers in each of the indirect blocks 884 * in the levels below us. 
885 */ 886 subblksperindir = blksperindir / NINDIR(fs); 887 for (lbn++, level--, i = 0; i < last; i++) { 888 error = indiracct_ufs1(snapvp, cancelvp, level, 889 ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 890 fs, acctfunc, expungetype); 891 if (error) 892 goto out; 893 rlbn += blksperindir; 894 lbn -= blksperindir; 895 remblks -= blksperindir; 896 } 897out: 898 FREE(bap, M_DEVBUF); 899 return (error); 900} 901 902/* 903 * Do both snap accounting and map accounting. 904 */ 905static int 906fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 907 struct vnode *vp; 908 ufs1_daddr_t *oldblkp, *lastblkp; 909 struct fs *fs; 910 ufs_lbn_t lblkno; 911 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 912{ 913 int error; 914 915 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 916 return (error); 917 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 918} 919 920/* 921 * Identify a set of blocks allocated in a snapshot inode. 922 */ 923static int 924snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 925 struct vnode *vp; 926 ufs1_daddr_t *oldblkp, *lastblkp; 927 struct fs *fs; 928 ufs_lbn_t lblkno; 929 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 930{ 931 struct inode *ip = VTOI(vp); 932 ufs1_daddr_t blkno, *blkp; 933 ufs_lbn_t lbn; 934 struct buf *ibp; 935 int error, ns; 936 937 ns = UFS_FSNEEDSWAP(fs); 938 939 for ( ; oldblkp < lastblkp; oldblkp++) { 940 blkno = ufs_rw32(*oldblkp, ns); 941 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 942 continue; 943 lbn = fragstoblks(fs, blkno); 944 if (lbn < NDADDR) { 945 blkp = &ip->i_ffs1_db[lbn]; 946 ip->i_flag |= IN_CHANGE | IN_UPDATE; 947 } else { 948 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 949 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 950 if (error) 951 return (error); 952 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 953 [(lbn - NDADDR) % NINDIR(fs)]; 954 } 955 /* 956 * If we are expunging a snapshot vnode and we 957 * find a block marked 
BLK_NOCOPY, then it is 958 * one that has been allocated to this snapshot after 959 * we took our current snapshot and can be ignored. 960 */ 961 blkno = ufs_rw32(*blkp, ns); 962 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 963 if (lbn >= NDADDR) 964 brelse(ibp); 965 } else { 966 if (blkno != 0) 967 panic("snapacct_ufs1: bad block"); 968 *blkp = ufs_rw32(expungetype, ns); 969 if (lbn >= NDADDR) 970 bwrite(ibp); 971 } 972 } 973 return (0); 974} 975 976/* 977 * Account for a set of blocks allocated in a snapshot inode. 978 */ 979static int 980mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 981 struct vnode *vp; 982 ufs1_daddr_t *oldblkp, *lastblkp; 983 struct fs *fs; 984 ufs_lbn_t lblkno; 985 int expungetype; 986{ 987 ufs1_daddr_t blkno; 988 struct inode *ip; 989 ino_t inum; 990 int acctit, ns; 991 992 ns = UFS_FSNEEDSWAP(fs); 993 ip = VTOI(vp); 994 inum = ip->i_number; 995 if (lblkno == -1) 996 acctit = 0; 997 else 998 acctit = 1; 999 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1000 blkno = ufs_rw32(*oldblkp, ns); 1001 if (blkno == 0 || blkno == BLK_NOCOPY) 1002 continue; 1003 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1004 *ip->i_snapblklist++ = ufs_rw64(lblkno, ns); 1005 if (blkno == BLK_SNAP) 1006 blkno = blkstofrags(fs, lblkno); 1007 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1008 } 1009 return (0); 1010} 1011 1012/* 1013 * Before expunging a snapshot inode, note all the 1014 * blocks that it claims with BLK_SNAP so that fsck will 1015 * be able to account for those blocks properly and so 1016 * that this snapshot knows that it need not copy them 1017 * if the other snapshot holding them is freed. This code 1018 * is reproduced once each for UFS1 and UFS2. 
1019 */ 1020static int 1021expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 1022 struct vnode *snapvp; 1023 struct inode *cancelip; 1024 struct fs *fs; 1025 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1026 struct fs *, ufs_lbn_t, int); 1027 int expungetype; 1028{ 1029 int i, error, ns, indiroff; 1030 ufs_lbn_t lbn, rlbn; 1031 ufs2_daddr_t len, blkno, numblks, blksperindir; 1032 struct ufs2_dinode *dip; 1033 struct buf *bp; 1034 caddr_t buf; 1035 1036 ns = UFS_FSNEEDSWAP(fs); 1037 /* 1038 * Prepare to expunge the inode. If its inode block has not 1039 * yet been copied, then allocate and fill the copy. 1040 */ 1041 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1042 blkno = 0; 1043 if (lbn < NDADDR) { 1044 blkno = db_get(VTOI(snapvp), lbn); 1045 } else { 1046 error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1047 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 1048 if (error) 1049 return (error); 1050 indiroff = (lbn - NDADDR) % NINDIR(fs); 1051 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 1052 brelse(bp); 1053 } 1054 buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1055 if (blkno != 0) 1056 error = readvnblk(snapvp, buf, lbn); 1057 else 1058 error = readfsblk(snapvp, buf, lbn); 1059 if (error) { 1060 free(buf, M_UFSMNT); 1061 return error; 1062 } 1063 /* 1064 * Set a snapshot inode to be a zero length file, regular files 1065 * to be completely unallocated. 1066 */ 1067 dip = (struct ufs2_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number); 1068 if (expungetype == BLK_NOCOPY) 1069 dip->di_mode = 0; 1070 dip->di_size = 0; 1071 dip->di_blocks = 0; 1072 dip->di_flags = 1073 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 1074 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1075 error = writevnblk(snapvp, buf, lbn); 1076 free(buf, M_UFSMNT); 1077 if (error) 1078 return error; 1079 /* 1080 * Now go through and expunge all the blocks in the file 1081 * using the function requested. 
1082 */ 1083 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1084 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0], 1085 &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype))) 1086 return (error); 1087 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0], 1088 &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype))) 1089 return (error); 1090 blksperindir = 1; 1091 lbn = -NDADDR; 1092 len = numblks - NDADDR; 1093 rlbn = NDADDR; 1094 for (i = 0; len > 0 && i < NIADDR; i++) { 1095 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1096 ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len, 1097 blksperindir, fs, acctfunc, expungetype); 1098 if (error) 1099 return (error); 1100 blksperindir *= NINDIR(fs); 1101 lbn -= blksperindir + 1; 1102 len -= blksperindir; 1103 rlbn += blksperindir; 1104 } 1105 return (0); 1106} 1107 1108/* 1109 * Descend an indirect block chain for vnode cancelvp accounting for all 1110 * its indirect blocks in snapvp. 1111 */ 1112static int 1113indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1114 blksperindir, fs, acctfunc, expungetype) 1115 struct vnode *snapvp; 1116 struct vnode *cancelvp; 1117 int level; 1118 ufs2_daddr_t blkno; 1119 ufs_lbn_t lbn; 1120 ufs_lbn_t rlbn; 1121 ufs_lbn_t remblks; 1122 ufs_lbn_t blksperindir; 1123 struct fs *fs; 1124 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1125 struct fs *, ufs_lbn_t, int); 1126 int expungetype; 1127{ 1128 int error, ns, num, i; 1129 ufs_lbn_t subblksperindir; 1130 struct indir indirs[NIADDR + 2]; 1131 ufs2_daddr_t last, *bap; 1132 struct buf *bp; 1133 1134 ns = UFS_FSNEEDSWAP(fs); 1135 1136 if (blkno == 0) { 1137 if (expungetype == BLK_NOCOPY) 1138 return (0); 1139 panic("indiracct_ufs2: missing indir"); 1140 } 1141 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1142 return (error); 1143 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1144 panic("indiracct_ufs2: botched params"); 1145 /* 1146 * We have to expand bread here since 
it will deadlock looking 1147 * up the block number for any blocks that are not in the cache. 1148 */ 1149 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 1150 bp->b_blkno = fsbtodb(fs, blkno); 1151 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1152 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 1153 brelse(bp); 1154 return (error); 1155 } 1156 /* 1157 * Account for the block pointers in this indirect block. 1158 */ 1159 last = howmany(remblks, blksperindir); 1160 if (last > NINDIR(fs)) 1161 last = NINDIR(fs); 1162 MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 1163 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1164 brelse(bp); 1165 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1166 level == 0 ? rlbn : -1, expungetype); 1167 if (error || level == 0) 1168 goto out; 1169 /* 1170 * Account for the block pointers in each of the indirect blocks 1171 * in the levels below us. 1172 */ 1173 subblksperindir = blksperindir / NINDIR(fs); 1174 for (lbn++, level--, i = 0; i < last; i++) { 1175 error = indiracct_ufs2(snapvp, cancelvp, level, 1176 ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 1177 fs, acctfunc, expungetype); 1178 if (error) 1179 goto out; 1180 rlbn += blksperindir; 1181 lbn -= blksperindir; 1182 remblks -= blksperindir; 1183 } 1184out: 1185 FREE(bap, M_DEVBUF); 1186 return (error); 1187} 1188 1189/* 1190 * Do both snap accounting and map accounting. 1191 */ 1192static int 1193fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1194 struct vnode *vp; 1195 ufs2_daddr_t *oldblkp, *lastblkp; 1196 struct fs *fs; 1197 ufs_lbn_t lblkno; 1198 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1199{ 1200 int error; 1201 1202 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1203 return (error); 1204 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1205} 1206 1207/* 1208 * Identify a set of blocks allocated in a snapshot inode. 
1209 */ 1210static int 1211snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1212 struct vnode *vp; 1213 ufs2_daddr_t *oldblkp, *lastblkp; 1214 struct fs *fs; 1215 ufs_lbn_t lblkno; 1216 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1217{ 1218 struct inode *ip = VTOI(vp); 1219 ufs2_daddr_t blkno, *blkp; 1220 ufs_lbn_t lbn; 1221 struct buf *ibp; 1222 int error, ns; 1223 1224 ns = UFS_FSNEEDSWAP(fs); 1225 1226 for ( ; oldblkp < lastblkp; oldblkp++) { 1227 blkno = ufs_rw64(*oldblkp, ns); 1228 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1229 continue; 1230 lbn = fragstoblks(fs, blkno); 1231 if (lbn < NDADDR) { 1232 blkp = &ip->i_ffs2_db[lbn]; 1233 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1234 } else { 1235 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1236 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1237 if (error) 1238 return (error); 1239 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1240 [(lbn - NDADDR) % NINDIR(fs)]; 1241 } 1242 /* 1243 * If we are expunging a snapshot vnode and we 1244 * find a block marked BLK_NOCOPY, then it is 1245 * one that has been allocated to this snapshot after 1246 * we took our current snapshot and can be ignored. 1247 */ 1248 blkno = ufs_rw64(*blkp, ns); 1249 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1250 if (lbn >= NDADDR) 1251 brelse(ibp); 1252 } else { 1253 if (blkno != 0) 1254 panic("snapacct_ufs2: bad block"); 1255 *blkp = ufs_rw64(expungetype, ns); 1256 if (lbn >= NDADDR) 1257 bwrite(ibp); 1258 } 1259 } 1260 return (0); 1261} 1262 1263/* 1264 * Account for a set of blocks allocated in a snapshot inode. 
 */
/*
 * mapacct_ufs2: each non-zero, non-BLK_NOCOPY pointer in
 * [oldblkp, lastblkp) is handed to ffs_blkfree().  lblkno is the
 * logical block number of the first pointer and advances with the
 * scan; a starting value of -1 disables recording in i_snapblklist.
 * Always returns 0.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	/* Decided once, before lblkno starts advancing in the loop. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw64(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Append the logical block number to the snapshot's list. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = ufs_rw64(lblkno, ns);
		/* A BLK_SNAP marker is replaced by the block's own address. */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 *
 * Also removes the inode number from the superblock's fs_snapinum[]
 * table, shifting later entries down.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct ufsmount *ump = VFSTOUFS(ip->i_devvp->v_specmountpoint);
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	/* xp is NULL here when the loop ran off the end of the list. */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the following entries down over the removed one. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		/* Clear the slot vacated by the last shifted entry. */
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 *
 * Unlinks the inode from the in-core snapshot list, gives the vnode
 * back its private lock, tears down the copy-on-write hook when no
 * snapshot remains, clears the BLK_NOCOPY/BLK_SNAP markers in the
 * block map and finally clears SF_SNAPSHOT.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip = VTOI(vp);
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct lock *lkp;
	struct buf *ibp;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, ns, loc, last;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		/*
		 * Take the vnode's private lock (LK_INTERLOCK hands the
		 * devvp interlock to lockmgr), then re-take the interlock
		 * for the list manipulation.
		 */
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp));
		VI_LOCK(devvp);
		TAILQ_REMOVE(&ump->um_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Point the vnode back at its own lock, drop the old one. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL);
		if (TAILQ_FIRST(&ump->um_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/*
			 * That was the last snapshot: detach the hint list,
			 * drain and free the lock previously held in
			 * v_vnlock, and remove the copy-on-write hook.
			 */
			snapblklist = ump->um_snapblklist;
			ump->um_snapblklist = 0;
			ump->um_snaplistsize = 0;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
			lockmgr(lkp, LK_RELEASE, NULL);
			vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	/* Direct blocks; the scan starts at index 1, not 0. */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			/* Another snapshot claimed it: drop it from ours. */
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	/* Indirect blocks, one NINDIR(fs)-entry block per iteration. */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		/* Best effort: an unreadable indirect block is skipped. */
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			/*
			 * NOTE(review): the comparison uses blkno, the first
			 * logical block of this indirect block, for every
			 * loc; entry loc presumably maps blkno + loc.
			 * Confirm whether this is intended.
			 */
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bwrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks.
Note that if more than one snapshot file maps the block, 1444 * we can pick one at random to claim it. Since none of the snapshots 1445 * can change, we are assurred that they will all see the same unmodified 1446 * image. When deleting a snapshot file (see ffs_snapremove above), we 1447 * must push any of these claimed blocks to one of the other snapshots 1448 * that maps it. These claimed blocks are easily identified as they will 1449 * have a block number equal to their logical block number within the 1450 * snapshot. A copied block can never have this property because they 1451 * must always have been allocated from a BLK_NOCOPY location. 1452 */ 1453int 1454ffs_snapblkfree(fs, devvp, bno, size, inum) 1455 struct fs *fs; 1456 struct vnode *devvp; 1457 ufs2_daddr_t bno; 1458 long size; 1459 ino_t inum; 1460{ 1461 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1462 struct buf *ibp; 1463 struct inode *ip; 1464 struct vnode *vp = NULL, *saved_vp = NULL; 1465 caddr_t saved_data = NULL; 1466 ufs_lbn_t lbn; 1467 ufs2_daddr_t blkno; 1468 int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1469 1470 lbn = fragstoblks(fs, bno); 1471retry: 1472 VI_LOCK(devvp); 1473 TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) { 1474 vp = ITOV(ip); 1475 /* 1476 * Lookup block being written. 1477 */ 1478 if (lbn < NDADDR) { 1479 blkno = db_get(ip, lbn); 1480 } else { 1481 if (snapshot_locked == 0 && 1482 lockmgr(vp->v_vnlock, 1483 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1484 VI_MTX(devvp)) != 0) 1485 goto retry; 1486 snapshot_locked = 1; 1487 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1488 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1489 if (error) 1490 break; 1491 indiroff = (lbn - NDADDR) % NINDIR(fs); 1492 blkno = idb_get(ip, ibp->b_data, indiroff); 1493 } 1494 /* 1495 * Check to see if block needs to be copied. 1496 */ 1497 if (blkno == 0) { 1498 /* 1499 * A block that we map is being freed. 
If it has not 1500 * been claimed yet, we will claim or copy it (below). 1501 */ 1502 claimedblk = 1; 1503 } else if (blkno == BLK_SNAP) { 1504 /* 1505 * No previous snapshot claimed the block, 1506 * so it will be freed and become a BLK_NOCOPY 1507 * (don't care) for us. 1508 */ 1509 if (claimedblk) 1510 panic("snapblkfree: inconsistent block type"); 1511 if (snapshot_locked == 0 && 1512 lockmgr(vp->v_vnlock, 1513 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1514 VI_MTX(devvp)) != 0) { 1515 if (lbn >= NDADDR) 1516 brelse(ibp); 1517 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL); 1518 goto retry; 1519 } 1520 snapshot_locked = 1; 1521 if (lbn < NDADDR) { 1522 db_assign(ip, lbn, BLK_NOCOPY); 1523 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1524 } else { 1525 idb_assign(ip, ibp->b_data, indiroff, 1526 BLK_NOCOPY); 1527 bwrite(ibp); 1528 } 1529 continue; 1530 } else /* BLK_NOCOPY or default */ { 1531 /* 1532 * If the snapshot has already copied the block 1533 * (default), or does not care about the block, 1534 * it is not needed. 1535 */ 1536 if (lbn >= NDADDR) 1537 brelse(ibp); 1538 continue; 1539 } 1540 /* 1541 * If this is a full size block, we will just grab it 1542 * and assign it to the snapshot inode. Otherwise we 1543 * will proceed to copy it. See explanation for this 1544 * routine as to why only a single snapshot needs to 1545 * claim this block. 
1546 */ 1547 if (snapshot_locked == 0 && 1548 lockmgr(vp->v_vnlock, 1549 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1550 VI_MTX(devvp)) != 0) { 1551 if (lbn >= NDADDR) 1552 brelse(ibp); 1553 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL); 1554 goto retry; 1555 } 1556 snapshot_locked = 1; 1557 if (size == fs->fs_bsize) { 1558#ifdef DEBUG 1559 if (snapdebug) 1560 printf("%s %d lbn %jd from inum %d\n", 1561 "Grabonremove: snapino", ip->i_number, 1562 (intmax_t)lbn, inum); 1563#endif 1564 if (lbn < NDADDR) { 1565 db_assign(ip, lbn, bno); 1566 } else { 1567 idb_assign(ip, ibp->b_data, indiroff, bno); 1568 bwrite(ibp); 1569 } 1570 DIP_ADD(ip, blocks, btodb(size)); 1571 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1572 VOP_UNLOCK(vp, 0); 1573 return (1); 1574 } 1575 if (lbn >= NDADDR) 1576 brelse(ibp); 1577#ifdef DEBUG 1578 if (snapdebug) 1579 printf("%s%d lbn %jd %s %d size %ld\n", 1580 "Copyonremove: snapino ", ip->i_number, 1581 (intmax_t)lbn, "for inum", inum, size); 1582#endif 1583 /* 1584 * If we have already read the old block contents, then 1585 * simply copy them to the new block. Note that we need 1586 * to synchronously write snapshots that have not been 1587 * unlinked, and hence will be visible after a crash, 1588 * to ensure their integrity. 1589 */ 1590 if (saved_data) { 1591 error = writevnblk(vp, saved_data, lbn); 1592 if (error) 1593 break; 1594 continue; 1595 } 1596 /* 1597 * Otherwise, read the old block contents into the buffer. 1598 */ 1599 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1600 saved_vp = vp; 1601 if ((error = readfsblk(vp, saved_data, lbn)) != 0) { 1602 free(saved_data, M_UFSMNT); 1603 saved_data = NULL; 1604 break; 1605 } 1606 } 1607 /* 1608 * Note that we need to synchronously write snapshots that 1609 * have not been unlinked, and hence will be visible after 1610 * a crash, to ensure their integrity. 
1611 */ 1612 if (saved_data) { 1613 error = writevnblk(saved_vp, saved_data, lbn); 1614 free(saved_data, M_UFSMNT); 1615 } 1616 /* 1617 * If we have been unable to allocate a block in which to do 1618 * the copy, then return non-zero so that the fragment will 1619 * not be freed. Although space will be lost, the snapshot 1620 * will stay consistent. 1621 */ 1622 if (snapshot_locked) 1623 VOP_UNLOCK(vp, 0); 1624 else 1625 VI_UNLOCK(devvp); 1626 return (error); 1627} 1628 1629/* 1630 * Associate snapshot files when mounting. 1631 */ 1632void 1633ffs_snapshot_mount(mp) 1634 struct mount *mp; 1635{ 1636 struct ufsmount *ump = VFSTOUFS(mp); 1637 struct vnode *devvp = ump->um_devvp; 1638 struct fs *fs = ump->um_fs; 1639 struct proc *p = curproc; 1640 struct vnode *vp; 1641 struct inode *ip, *xp; 1642 ufs2_daddr_t snaplistsize, *snapblklist; 1643 int error, ns, snaploc, loc; 1644 1645 ns = UFS_FSNEEDSWAP(fs); 1646 /* 1647 * XXX The following needs to be set before VOP_TRUNCATE or 1648 * VOP_READ can be called. 1649 */ 1650 mp->mnt_stat.f_iosize = fs->fs_bsize; 1651 /* 1652 * Process each snapshot listed in the superblock. 1653 */ 1654 vp = NULL; 1655 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1656 if (fs->fs_snapinum[snaploc] == 0) 1657 break; 1658 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1659 &vp)) != 0) { 1660 printf("ffs_snapshot_mount: vget failed %d\n", error); 1661 continue; 1662 } 1663 ip = VTOI(vp); 1664 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1665 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1666 fs->fs_snapinum[snaploc]); 1667 vput(vp); 1668 vp = NULL; 1669 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1670 if (fs->fs_snapinum[loc] == 0) 1671 break; 1672 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1673 } 1674 fs->fs_snapinum[loc - 1] = 0; 1675 snaploc--; 1676 continue; 1677 } 1678 /* 1679 * If there already exist snapshots on this filesystem, grab a 1680 * reference to their shared lock. 
If this is the first snapshot 1681 * on this filesystem, we need to allocate a lock for the 1682 * snapshots to share. In either case, acquire the snapshot 1683 * lock and give up our original private lock. 1684 */ 1685 VI_LOCK(devvp); 1686 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 1687 struct lock *lkp; 1688 1689 lkp = ITOV(xp)->v_vnlock; 1690 VI_UNLOCK(devvp); 1691 VI_LOCK(vp); 1692 vp->v_vnlock = lkp; 1693 } else { 1694 struct lock *lkp; 1695 1696 VI_UNLOCK(devvp); 1697 MALLOC(lkp, struct lock *, sizeof(struct lock), 1698 M_UFSMNT, M_WAITOK); 1699 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 1700 VI_LOCK(vp); 1701 vp->v_vnlock = lkp; 1702 } 1703 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 1704 transferlockers(&vp->v_lock, vp->v_vnlock); 1705 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 1706 /* 1707 * Link it onto the active snapshot list. 1708 */ 1709 VI_LOCK(devvp); 1710 if (ip->i_nextsnap.tqe_prev != 0) 1711 panic("ffs_snapshot_mount: %d already on list", 1712 ip->i_number); 1713 else 1714 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 1715 vp->v_flag |= VSYSTEM; 1716 VI_UNLOCK(devvp); 1717 VOP_UNLOCK(vp, 0); 1718 } 1719 /* 1720 * No usable snapshots found. 1721 */ 1722 if (vp == NULL) 1723 return; 1724 /* 1725 * Allocate the space for the block hints list. We always want to 1726 * use the list from the newest snapshot. 
1727 */ 1728 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1729 error = vn_rdwr(UIO_READ, vp, 1730 (caddr_t)&snaplistsize, sizeof(snaplistsize), 1731 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1732 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, p); 1733 if (error) { 1734 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1735 VOP_UNLOCK(vp, 0); 1736 return; 1737 } 1738 snaplistsize = ufs_rw64(snaplistsize, ns); 1739 MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t), 1740 M_UFSMNT, M_WAITOK); 1741 error = vn_rdwr(UIO_READ, vp, 1742 (caddr_t)snapblklist, snaplistsize * sizeof(ufs2_daddr_t), 1743 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1744 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, p); 1745 if (error) { 1746 printf("ffs_snapshot_mount: read_2 failed %d\n", error); 1747 VOP_UNLOCK(vp, 0); 1748 FREE(snapblklist, M_UFSMNT); 1749 return; 1750 } 1751 VOP_UNLOCK(vp, 0); 1752 VI_LOCK(devvp); 1753 ump->um_snaplistsize = snaplistsize; 1754 ump->um_snapblklist = snapblklist; 1755 VI_UNLOCK(devvp); 1756 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 1757} 1758 1759/* 1760 * Disassociate snapshot files when unmounting. 
 */
/*
 * ffs_snapshot_unmount: empties um_snapshots, pointing each vnode back
 * at its private lock, frees the block hint list and, if at least one
 * snapshot was present, removes the copy-on-write hook and frees the
 * lock last seen in v_vnlock.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(&ump->um_snapshots)) != 0) {
		vp = ITOV(xp);
		/* Remember the lock in use; it is freed after the loop. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&ump->um_snapshots, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_ffs_effnlink > 0) {
			/* vrele() is called with the interlock dropped. */
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (ump->um_snapblklist != NULL) {
		FREE(ump->um_snapblklist, M_UFSMNT);
		ump->um_snapblklist = NULL;
		ump->um_snaplistsize = 0;
	}
	VI_UNLOCK(devvp);
	/* lkp is non-NULL only if the list held at least one snapshot. */
	if (lkp != NULL) {
		vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
		FREE(lkp, M_UFSMNT);
	}
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 *	v	the device vnode, as registered with vn_cow_establish()
 *	bp	buffer about to be written
 *
 * Returns 0 when nothing had to be copied or the copy succeeded,
 * otherwise the error from the lookup, read or write.
 */
static int
ffs_copyonwrite(v, bp)
	void *v;
	struct buf *bp;
{
	struct buf *ibp;
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = 0, *saved_vp = NULL;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	caddr_t saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, ns, indiroff, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	VI_LOCK(devvp);
	ip = TAILQ_FIRST(&ump->um_snapshots);
	if (ip == NULL) {
		VI_UNLOCK(devvp);
		return 0;
	}
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = ump->um_snapblklist;
	/*
	 * Binary search over entries 1 .. um_snaplistsize - 1; entry 0 is
	 * never examined (ffs_snapshot_mount reads the list so that the
	 * stored length occupies that slot).
	 */
	upper = ump->um_snaplistsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (ufs_rw64(snapblklist[mid], ns) == lbn)
			break;
		if (ufs_rw64(snapblklist[mid], ns) < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	/* The loop was left by break, i.e. the block is in the list. */
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in VOP_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			/*
			 * On failure the interlock is re-taken here before
			 * restarting, because the retry label sits after
			 * the VI_LOCK() above.
			 */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp)) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			brelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero: already copied or marked, nothing to do. */
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp)) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd\n", (intmax_t)bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (saved_data) {
			error = writevnblk(vp, saved_data, lbn);
			if (error)
				break;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
		saved_vp = vp;
		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
			free(saved_data, M_UFSMNT);
			saved_data = NULL;
			break;
		}
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (saved_data) {
		error = writevnblk(saved_vp, saved_data, lbn);
		free(saved_data, M_UFSMNT);
	}
	/* Release whichever of snapshot lock / interlock is still held. */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	else
		VI_UNLOCK(devvp);
	return error;
}

/*
 * Read the specified block from disk. Vp is usually a snapshot vnode.
 *
 * A private buf is taken from bufpool, pointed at the caller's data
 * area and handed to DEV_STRATEGY(); the call waits for completion
 * with biowait().  lbn is a filesystem block number; data must hold
 * fs_bsize bytes.  Returns the biowait() result.
 */
static int
readfsblk(vp, data, lbn)
	struct vnode *vp;
	caddr_t data;
	ufs2_daddr_t lbn;
{
	int s, error;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *nbp;

	s = splbio();
	nbp = pool_get(&bufpool, PR_WAITOK);
	splx(s);

	BUF_INIT(nbp);
	nbp->b_flags = B_READ;
	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
	nbp->b_error = 0;
	nbp->b_data = data;
	nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
	nbp->b_proc = NULL;
	/* Addressed to the underlying device, not to any vnode. */
	nbp->b_dev = ip->i_devvp->v_rdev;
	nbp->b_vp = NULLVP;

	DEV_STRATEGY(nbp);

	error = biowait(nbp);

	s = splbio();
	pool_put(&bufpool, nbp);
	splx(s);

	return error;
}

/*
 * Read the specified block. This is the needed part of ffs_read.
 *
 * Copies fs_bsize bytes of the file behind vp, starting at logical
 * block lbn, into data through UBC windows.  Returns 0 or the kcopy()
 * error.
 */
static int
readvnblk(vp, data, lbn)
	struct vnode *vp;
	caddr_t data;
	ufs2_daddr_t lbn;
{
	int error = 0;
	vsize_t len, todo;
	off_t off;
	void *win;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;

	off = lblktosize(fs, (off_t)lbn);
	todo = fs->fs_bsize;

	while (todo > 0) {
		/* len is passed by address and used as the copy length. */
		len = todo;
		win = ubc_alloc(&vp->v_uobj, off, &len, UBC_READ);
		error = kcopy(win, data, len);
		ubc_release(win, 0);
		if (error)
			break;
		data += len;
		off += len;
		todo -= len;
	}

	return error;
}

/*
 * Write the specified block. This is the needed part of ffs_write.
 * We must never call VOP_UPDATE() since it will deadlock.
2037 */ 2038static int 2039writevnblk(vp, data, lbn) 2040 struct vnode *vp; 2041 caddr_t data; 2042 ufs2_daddr_t lbn; 2043{ 2044 int error, sync; 2045 vsize_t len, todo; 2046 off_t off; 2047 voff_t offlo, offhi; 2048 void *win; 2049 struct inode *ip = VTOI(vp); 2050 struct fs *fs = ip->i_fs; 2051 2052 sync = (dopersistence && ip->i_ffs_effnlink > 0); 2053 off = lblktosize(fs, (off_t)lbn); 2054 offlo = trunc_page(off); 2055 offhi = round_page(off+fs->fs_bsize); 2056 todo = fs->fs_bsize; 2057 2058 error = ufs_balloc_range(vp, off, todo, KERNCRED, sync ? B_SYNC : 0); 2059 if (error) 2060 return error; 2061 2062 while (todo > 0) { 2063 len = todo; 2064 win = ubc_alloc(&vp->v_uobj, off, &len, UBC_WRITE); 2065 error = kcopy(data, win, len); 2066 ubc_release(win, 0); 2067 if (error) 2068 break; 2069 data += len; 2070 off += len; 2071 todo -= len; 2072 } 2073 2074 if (error == 0 && sync) { 2075 simple_lock(&vp->v_interlock); 2076 error = VOP_PUTPAGES(vp, offlo, offhi, PGO_CLEANIT|PGO_SYNCIO); 2077 } 2078 return error; 2079} 2080 2081/* 2082 * Get/Put direct block from inode or buffer containing disk addresses. Take 2083 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2084 * into a global include. 
2085 */ 2086static inline ufs2_daddr_t 2087db_get(struct inode *ip, int loc) 2088{ 2089 int ns = UFS_IPNEEDSWAP(ip); 2090 2091 if (ip->i_ump->um_fstype == UFS1) 2092 return ufs_rw32(ip->i_ffs1_db[loc], ns); 2093 else 2094 return ufs_rw64(ip->i_ffs2_db[loc], ns); 2095} 2096 2097static inline void 2098db_assign(struct inode *ip, int loc, ufs2_daddr_t val) 2099{ 2100 int ns = UFS_IPNEEDSWAP(ip); 2101 2102 if (ip->i_ump->um_fstype == UFS1) 2103 ip->i_ffs1_db[loc] = ufs_rw32(val, ns); 2104 else 2105 ip->i_ffs2_db[loc] = ufs_rw64(val, ns); 2106} 2107 2108static inline ufs2_daddr_t 2109idb_get(struct inode *ip, caddr_t buf, int loc) 2110{ 2111 int ns = UFS_IPNEEDSWAP(ip); 2112 2113 if (ip->i_ump->um_fstype == UFS1) 2114 return ufs_rw32(((ufs1_daddr_t *)(buf))[loc], ns); 2115 else 2116 return ufs_rw64(((ufs2_daddr_t *)(buf))[loc], ns); 2117} 2118 2119static inline void 2120idb_assign(struct inode *ip, caddr_t buf, int loc, ufs2_daddr_t val) 2121{ 2122 int ns = UFS_IPNEEDSWAP(ip); 2123 2124 if (ip->i_ump->um_fstype == UFS1) 2125 ((ufs1_daddr_t *)(buf))[loc] = ufs_rw32(val, ns); 2126 else 2127 ((ufs2_daddr_t *)(buf))[loc] = ufs_rw64(val, ns); 2128} 2129