#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/* Credential used for snapshot-internal I/O: thread0's ucred. */
#define KERNCRED thread0.td_ucred
#define DEBUG 1

TAILQ_HEAD(snaphead, inode);

/*
 * Per-device state shared by all snapshots on one filesystem: the list
 * of snapshot inodes, the list of blocks already owned by snapshots
 * (consulted by the copy-on-write path), and the lock that every
 * snapshot vnode on the device shares.
 */
struct snapdata {
	struct snaphead sn_head;	/* all snapshot inodes on the device */
	daddr_t sn_listsize;		/* number of entries in sn_blklist */
	daddr_t *sn_blklist;		/* sorted list of snapshot-owned blocks */
	struct lock sn_lock;		/* lock shared by all snapshot vnodes */
};

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
127 */ 128int 129ffs_snapshot(mp, snapfile) 130 struct mount *mp; 131 char *snapfile; 132{ 133 ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 134 int error, cg, snaploc; 135 int i, size, len, loc; 136 int flag = mp->mnt_flag; 137 struct timespec starttime = {0, 0}, endtime; 138 char saved_nice = 0; 139 long redo = 0, snaplistsize = 0; 140 int32_t *lp; 141 void *space; 142 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 143 struct thread *td = curthread; 144 struct inode *ip, *xp; 145 struct buf *bp, *nbp, *ibp, *sbp = NULL; 146 struct nameidata nd; 147 struct mount *wrtmp; 148 struct vattr vat; 149 struct vnode *vp, *xvp, *nvp, *devvp; 150 struct uio auio; 151 struct iovec aiov; 152 struct snapdata *sn; 153 154 /* 155 * XXX: make sure we don't go to out1 before we setup sn 156 */ 157 sn = (void *)0xdeadbeef; 158 159 /* 160 * Need to serialize access to snapshot code per filesystem. 161 */ 162 /* 163 * Assign a snapshot slot in the superblock. 164 */ 165 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 166 if (fs->fs_snapinum[snaploc] == 0) 167 break; 168 if (snaploc == FSMAXSNAP) 169 return (ENOSPC); 170 /* 171 * Create the snapshot file. 
172 */ 173restart: 174 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td); 175 if ((error = namei(&nd)) != 0) 176 return (error); 177 if (nd.ni_vp != NULL) { 178 vput(nd.ni_vp); 179 error = EEXIST; 180 } 181 if (nd.ni_dvp->v_mount != mp) 182 error = EXDEV; 183 if (error) { 184 NDFREE(&nd, NDF_ONLY_PNBUF); 185 if (nd.ni_dvp == nd.ni_vp) 186 vrele(nd.ni_dvp); 187 else 188 vput(nd.ni_dvp); 189 return (error); 190 } 191 VATTR_NULL(&vat); 192 vat.va_type = VREG; 193 vat.va_mode = S_IRUSR; 194 vat.va_vaflags |= VA_EXCLUSIVE; 195 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 196 wrtmp = NULL; 197 if (wrtmp != mp) 198 panic("ffs_snapshot: mount mismatch"); 199 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 200 NDFREE(&nd, NDF_ONLY_PNBUF); 201 vput(nd.ni_dvp); 202 if ((error = vn_start_write(NULL, &wrtmp, 203 V_XSLEEP | PCATCH)) != 0) 204 return (error); 205 goto restart; 206 } 207 VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); 208 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 209 vput(nd.ni_dvp); 210 if (error) { 211 NDFREE(&nd, NDF_ONLY_PNBUF); 212 vn_finished_write(wrtmp); 213 return (error); 214 } 215 vp = nd.ni_vp; 216 ip = VTOI(vp); 217 devvp = ip->i_devvp; 218 /* 219 * Allocate and copy the last block contents so as to be able 220 * to set size to that of the filesystem. 221 */ 222 numblks = howmany(fs->fs_size, fs->fs_frag); 223 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 224 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 225 if (error) 226 goto out; 227 ip->i_size = lblktosize(fs, (off_t)numblks); 228 DIP_SET(ip, i_size, ip->i_size); 229 ip->i_flag |= IN_CHANGE | IN_UPDATE; 230 if ((error = readblock(vp, bp, numblks - 1)) != 0) 231 goto out; 232 bawrite(bp); 233 /* 234 * Preallocate critical data structures so that we can copy 235 * them in without further allocation after we suspend all 236 * operations on the filesystem. 
We would like to just release 237 * the allocated buffers without writing them since they will 238 * be filled in below once we are ready to go, but this upsets 239 * the soft update code, so we go ahead and write the new buffers. 240 * 241 * Allocate all indirect blocks and mark all of them as not 242 * needing to be copied. 243 */ 244 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 245 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 246 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 247 if (error) 248 goto out; 249 bawrite(ibp); 250 } 251 /* 252 * Allocate copies for the superblock and its summary information. 253 */ 254 error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 255 0, &nbp); 256 if (error) 257 goto out; 258 bawrite(nbp); 259 blkno = fragstoblks(fs, fs->fs_csaddr); 260 len = howmany(fs->fs_cssize, fs->fs_bsize); 261 for (loc = 0; loc < len; loc++) { 262 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 263 fs->fs_bsize, KERNCRED, 0, &nbp); 264 if (error) 265 goto out; 266 bawrite(nbp); 267 } 268 /* 269 * Allocate all cylinder group blocks. 270 */ 271 for (cg = 0; cg < fs->fs_ncg; cg++) { 272 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 273 fs->fs_bsize, KERNCRED, 0, &nbp); 274 if (error) 275 goto out; 276 bawrite(nbp); 277 } 278 /* 279 * Copy all the cylinder group maps. Although the 280 * filesystem is still active, we hope that only a few 281 * cylinder groups will change between now and when we 282 * suspend operations. Thus, we will be able to quickly 283 * touch up the few cylinder groups that changed during 284 * the suspension period. 
285 */ 286 len = howmany(fs->fs_ncg, NBBY); 287 MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK); 288 bzero(fs->fs_active, len); 289 for (cg = 0; cg < fs->fs_ncg; cg++) { 290 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 291 fs->fs_bsize, KERNCRED, 0, &nbp); 292 if (error) 293 goto out; 294 error = cgaccount(cg, vp, nbp, 1); 295 bawrite(nbp); 296 if (error) 297 goto out; 298 } 299 /* 300 * Change inode to snapshot type file. 301 */ 302 ip->i_flags |= SF_SNAPSHOT; 303 DIP_SET(ip, i_flags, ip->i_flags); 304 ip->i_flag |= IN_CHANGE | IN_UPDATE; 305 /* 306 * Ensure that the snapshot is completely on disk. 307 * Since we have marked it as a snapshot it is safe to 308 * unlock it as no process will be allowed to write to it. 309 */ 310 if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0) 311 goto out; 312 VOP_UNLOCK(vp, 0, td); 313 /* 314 * All allocations are done, so we can now snapshot the system. 315 * 316 * Recind nice scheduling while running with the filesystem suspended. 317 */ 318 if (td->td_proc->p_nice > 0) { 319 PROC_LOCK(td->td_proc); 320 mtx_lock_spin(&sched_lock); 321 saved_nice = td->td_proc->p_nice; 322 sched_nice(td->td_proc, 0); 323 mtx_unlock_spin(&sched_lock); 324 PROC_UNLOCK(td->td_proc); 325 } 326 /* 327 * Suspend operation on filesystem. 328 */ 329 for (;;) { 330 vn_finished_write(wrtmp); 331 if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 332 vn_start_write(NULL, &wrtmp, V_WAIT); 333 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 334 goto out; 335 } 336 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 337 break; 338 vn_start_write(NULL, &wrtmp, V_WAIT); 339 } 340 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 341 if (collectsnapstats) 342 nanotime(&starttime); 343 /* 344 * First, copy all the cylinder group maps that have changed. 
345 */ 346 for (cg = 0; cg < fs->fs_ncg; cg++) { 347 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 348 continue; 349 redo++; 350 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 351 fs->fs_bsize, KERNCRED, 0, &nbp); 352 if (error) 353 goto out1; 354 error = cgaccount(cg, vp, nbp, 2); 355 bawrite(nbp); 356 if (error) 357 goto out1; 358 } 359 /* 360 * Grab a copy of the superblock and its summary information. 361 * We delay writing it until the suspension is released below. 362 */ 363 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 364 KERNCRED, &sbp); 365 if (error) { 366 brelse(sbp); 367 sbp = NULL; 368 goto out1; 369 } 370 loc = blkoff(fs, fs->fs_sblockloc); 371 copy_fs = (struct fs *)(sbp->b_data + loc); 372 bcopy(fs, copy_fs, fs->fs_sbsize); 373 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 374 copy_fs->fs_clean = 1; 375 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 376 if (fs->fs_sbsize < size) 377 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); 378 size = blkroundup(fs, fs->fs_cssize); 379 if (fs->fs_contigsumsize > 0) 380 size += fs->fs_ncg * sizeof(int32_t); 381 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 382 copy_fs->fs_csp = space; 383 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 384 space = (char *)space + fs->fs_cssize; 385 loc = howmany(fs->fs_cssize, fs->fs_fsize); 386 i = fs->fs_frag - loc % fs->fs_frag; 387 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 388 if (len > 0) { 389 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 390 len, KERNCRED, &bp)) != 0) { 391 brelse(bp); 392 free(copy_fs->fs_csp, M_UFSMNT); 393 bawrite(sbp); 394 sbp = NULL; 395 goto out1; 396 } 397 bcopy(bp->b_data, space, (u_int)len); 398 space = (char *)space + len; 399 bp->b_flags |= B_INVAL | B_NOCACHE; 400 brelse(bp); 401 } 402 if (fs->fs_contigsumsize > 0) { 403 copy_fs->fs_maxcluster = lp = space; 404 for (i = 0; i < fs->fs_ncg; i++) 405 *lp++ = fs->fs_contigsumsize; 406 } 407 /* 408 * We must check for active files that have been unlinked 409 * (e.g., with a zero link count). We have to expunge all 410 * trace of these files from the snapshot so that they are 411 * not reclaimed prematurely by fsck or unnecessarily dumped. 412 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 413 * spec_strategy about writing on a suspended filesystem. 414 * Note that we skip unlinked snapshot files as they will 415 * be handled separately below. 416 * 417 * We also calculate the needed size for the snapshot list. 418 */ 419 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 420 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 421 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 422 MNT_ILOCK(mp); 423loop: 424 MNT_VNODE_FOREACH(xvp, mp, nvp) { 425 VI_LOCK(xvp); 426 MNT_IUNLOCK(mp); 427 if ((xvp->v_iflag & VI_XLOCK) || 428 xvp->v_usecount == 0 || xvp->v_type == VNON || 429 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 430 VI_UNLOCK(xvp); 431 MNT_ILOCK(mp); 432 continue; 433 } 434 /* 435 * We can skip parent directory vnode because it must have 436 * this snapshot file in it. 
437 */ 438 if (xvp == nd.ni_dvp) { 439 VI_UNLOCK(xvp); 440 MNT_ILOCK(mp); 441 continue; 442 } 443 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { 444 MNT_ILOCK(mp); 445 goto loop; 446 } 447 if (snapdebug) 448 vprint("ffs_snapshot: busy vnode", xvp); 449 if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && 450 vat.va_nlink > 0) { 451 VOP_UNLOCK(xvp, 0, td); 452 MNT_ILOCK(mp); 453 continue; 454 } 455 xp = VTOI(xvp); 456 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 457 VOP_UNLOCK(xvp, 0, td); 458 MNT_ILOCK(mp); 459 continue; 460 } 461 /* 462 * If there is a fragment, clear it here. 463 */ 464 blkno = 0; 465 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 466 if (loc < NDADDR) { 467 len = fragroundup(fs, blkoff(fs, xp->i_size)); 468 if (len < fs->fs_bsize) { 469 ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]), 470 len, xp->i_number); 471 blkno = DIP(xp, i_db[loc]); 472 DIP_SET(xp, i_db[loc], 0); 473 } 474 } 475 snaplistsize += 1; 476 if (xp->i_ump->um_fstype == UFS1) 477 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 478 BLK_NOCOPY); 479 else 480 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 481 BLK_NOCOPY); 482 if (blkno) 483 DIP_SET(xp, i_db[loc], blkno); 484 if (!error) 485 error = ffs_freefile(copy_fs, vp, xp->i_number, 486 xp->i_mode); 487 VOP_UNLOCK(xvp, 0, td); 488 if (error) { 489 free(copy_fs->fs_csp, M_UFSMNT); 490 bawrite(sbp); 491 sbp = NULL; 492 goto out1; 493 } 494 MNT_ILOCK(mp); 495 } 496 MNT_IUNLOCK(mp); 497 /* 498 * If there already exist snapshots on this filesystem, grab a 499 * reference to their shared lock. If this is the first snapshot 500 * on this filesystem, we need to allocate a lock for the snapshots 501 * to share. In either case, acquire the snapshot lock and give 502 * up our original private lock. 
503 */ 504 VI_LOCK(devvp); 505 sn = devvp->v_rdev->si_snapdata; 506 if (sn != NULL) { 507 xp = TAILQ_FIRST(&sn->sn_head); 508 VI_UNLOCK(devvp); 509 VI_LOCK(vp); 510 vp->v_vnlock = &sn->sn_lock; 511 } else { 512 VI_UNLOCK(devvp); 513 sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); 514 TAILQ_INIT(&sn->sn_head); 515 lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, 516 LK_CANRECURSE | LK_NOPAUSE); 517 VI_LOCK(vp); 518 vp->v_vnlock = &sn->sn_lock; 519 devvp->v_rdev->si_snapdata = sn; 520 xp = NULL; 521 } 522 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 523 transferlockers(&vp->v_lock, vp->v_vnlock); 524 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 525 /* 526 * If this is the first snapshot on this filesystem, then we need 527 * to allocate the space for the list of preallocated snapshot blocks. 528 * This list will be refined below, but this preliminary one will 529 * keep us out of deadlock until the full one is ready. 530 */ 531 if (xp == NULL) { 532 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 533 M_UFSMNT, M_WAITOK); 534 blkp = &snapblklist[1]; 535 *blkp++ = lblkno(fs, fs->fs_sblockloc); 536 blkno = fragstoblks(fs, fs->fs_csaddr); 537 for (cg = 0; cg < fs->fs_ncg; cg++) { 538 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 539 break; 540 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 541 } 542 len = howmany(fs->fs_cssize, fs->fs_bsize); 543 for (loc = 0; loc < len; loc++) 544 *blkp++ = blkno + loc; 545 for (; cg < fs->fs_ncg; cg++) 546 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 547 snapblklist[0] = blkp - snapblklist; 548 VI_LOCK(devvp); 549 if (sn->sn_blklist != NULL) 550 panic("ffs_snapshot: non-empty list"); 551 sn->sn_blklist = snapblklist; 552 sn->sn_listsize = blkp - snapblklist; 553 VI_UNLOCK(devvp); 554 } 555 /* 556 * Record snapshot inode. Since this is the newest snapshot, 557 * it must be placed at the end of the list. 
558 */ 559 VI_LOCK(devvp); 560 fs->fs_snapinum[snaploc] = ip->i_number; 561 if (ip->i_nextsnap.tqe_prev != 0) 562 panic("ffs_snapshot: %d already on list", ip->i_number); 563 TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 564 devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 565 devvp->v_vflag |= VV_COPYONWRITE; 566 VI_UNLOCK(devvp); 567 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 568 vp->v_vflag |= VV_SYSTEM; 569out1: 570 KASSERT(sn != (void *)0xdeadbeef, ("email phk@ and mckusick@")); 571 /* 572 * Resume operation on filesystem. 573 */ 574 vfs_write_resume(vp->v_mount); 575 vn_start_write(NULL, &wrtmp, V_WAIT); 576 if (collectsnapstats && starttime.tv_sec > 0) { 577 nanotime(&endtime); 578 timespecsub(&endtime, &starttime); 579 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 580 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 581 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 582 } 583 if (sbp == NULL) 584 goto out; 585 /* 586 * Copy allocation information from all the snapshots in 587 * this snapshot and then expunge them from its view. 588 */ 589 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { 590 if (xp == ip) 591 break; 592 if (xp->i_ump->um_fstype == UFS1) 593 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 594 BLK_SNAP); 595 else 596 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 597 BLK_SNAP); 598 if (error) { 599 fs->fs_snapinum[snaploc] = 0; 600 goto done; 601 } 602 } 603 /* 604 * Allocate space for the full list of preallocated snapshot blocks. 605 */ 606 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 607 M_UFSMNT, M_WAITOK); 608 ip->i_snapblklist = &snapblklist[1]; 609 /* 610 * Expunge the blocks used by the snapshots from the set of 611 * blocks marked as used in the snapshot bitmaps. Also, collect 612 * the list of allocated blocks in i_snapblklist. 
613 */ 614 if (ip->i_ump->um_fstype == UFS1) 615 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 616 else 617 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 618 if (error) { 619 fs->fs_snapinum[snaploc] = 0; 620 FREE(snapblklist, M_UFSMNT); 621 goto done; 622 } 623 if (snaplistsize < ip->i_snapblklist - snapblklist) 624 panic("ffs_snapshot: list too small"); 625 snaplistsize = ip->i_snapblklist - snapblklist; 626 snapblklist[0] = snaplistsize; 627 ip->i_snapblklist = 0; 628 /* 629 * Write out the list of allocated blocks to the end of the snapshot. 630 */ 631 auio.uio_iov = &aiov; 632 auio.uio_iovcnt = 1; 633 aiov.iov_base = (void *)snapblklist; 634 aiov.iov_len = snaplistsize * sizeof(daddr_t); 635 auio.uio_resid = aiov.iov_len;; 636 auio.uio_offset = ip->i_size; 637 auio.uio_segflg = UIO_SYSSPACE; 638 auio.uio_rw = UIO_WRITE; 639 auio.uio_td = td; 640 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 641 fs->fs_snapinum[snaploc] = 0; 642 FREE(snapblklist, M_UFSMNT); 643 goto done; 644 } 645 /* 646 * Write the superblock and its summary information 647 * to the snapshot. 648 */ 649 blkno = fragstoblks(fs, fs->fs_csaddr); 650 len = howmany(fs->fs_cssize, fs->fs_bsize); 651 space = copy_fs->fs_csp; 652 for (loc = 0; loc < len; loc++) { 653 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 654 if (error) { 655 brelse(nbp); 656 fs->fs_snapinum[snaploc] = 0; 657 FREE(snapblklist, M_UFSMNT); 658 goto done; 659 } 660 bcopy(space, nbp->b_data, fs->fs_bsize); 661 space = (char *)space + fs->fs_bsize; 662 bawrite(nbp); 663 } 664 /* 665 * As this is the newest list, it is the most inclusive, so 666 * should replace the previous list. 
667 */ 668 VI_LOCK(devvp); 669 space = sn->sn_blklist; 670 sn->sn_blklist = snapblklist; 671 sn->sn_listsize = snaplistsize; 672 VI_UNLOCK(devvp); 673 if (space != NULL) 674 FREE(space, M_UFSMNT); 675done: 676 FREE(copy_fs->fs_csp, M_UFSMNT); 677 bawrite(sbp); 678out: 679 if (saved_nice > 0) { 680 PROC_LOCK(td->td_proc); 681 mtx_lock_spin(&sched_lock); 682 sched_nice(td->td_proc, saved_nice); 683 mtx_unlock_spin(&sched_lock); 684 PROC_UNLOCK(td->td_proc); 685 } 686 if (fs->fs_active != 0) { 687 FREE(fs->fs_active, M_DEVBUF); 688 fs->fs_active = 0; 689 } 690 mp->mnt_flag = flag; 691 if (error) 692 (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 693 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 694 if (error) 695 vput(vp); 696 else 697 VOP_UNLOCK(vp, 0, td); 698 vn_finished_write(wrtmp); 699 return (error); 700} 701 702/* 703 * Copy a cylinder group map. All the unallocated blocks are marked 704 * BLK_NOCOPY so that the snapshot knows that it need not copy them 705 * if they are later written. If passno is one, then this is a first 706 * pass, so only setting needs to be done. If passno is 2, then this 707 * is a revision to a previous pass which must be undone as the 708 * replacement pass is done. 
 */
/*
 * cgaccount(cg, vp, nbp, passno):
 *	cg     -- cylinder group number being copied
 *	vp     -- snapshot vnode
 *	nbp    -- buffer that receives the copy of the cg map
 *	passno -- 1 on the initial pass, 2 when revising a prior pass
 * Returns 0 or an errno from bread()/UFS_BALLOC(), or EIO on a bad
 * cg magic number.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/* Read the on-disk cylinder group and sanity-check its magic. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	/* Mark this cg as captured so the post-suspend pass can skip it. */
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/*
	 * Free blocks covered by the snapshot's direct pointers are
	 * tagged BLK_NOCOPY directly in the snapshot inode.
	 */
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				DIP_SET(ip, i_db[loc], 0);
			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	/* Remaining blocks are tagged through the indirect blocks. */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Advance to the next indirect block. */
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* the snapshot being built */
	struct inode *cancelip;	/* inode whose blocks are being expunged */
	struct fs *fs;		/* (copy of the) superblock to account in */
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_NOCOPY or BLK_SNAP */
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		/* TDP_COWINPROGRESS keeps the COW path from recursing. */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct pointers first, then the indirect pointer slots. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot being built */
	struct vnode *cancelvp;	/* vnode whose indirect chain is walked */
	int level;		/* remaining indirection levels below us */
	ufs1_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* (negative) lbn of this indirect block */
	ufs_lbn_t rlbn;		/* first data lbn covered by this block */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks per pointer at this level */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_NOCOPY or BLK_SNAP */
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Work on a private copy so the buffer can be released early. */
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
1003 */ 1004static int 1005snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1006 struct vnode *vp; 1007 ufs1_daddr_t *oldblkp, *lastblkp; 1008 struct fs *fs; 1009 ufs_lbn_t lblkno; 1010 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1011{ 1012 struct inode *ip = VTOI(vp); 1013 ufs1_daddr_t blkno, *blkp; 1014 ufs_lbn_t lbn; 1015 struct buf *ibp; 1016 int error; 1017 1018 for ( ; oldblkp < lastblkp; oldblkp++) { 1019 blkno = *oldblkp; 1020 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1021 continue; 1022 lbn = fragstoblks(fs, blkno); 1023 if (lbn < NDADDR) { 1024 blkp = &ip->i_din1->di_db[lbn]; 1025 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1026 } else { 1027 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1028 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1029 if (error) 1030 return (error); 1031 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 1032 [(lbn - NDADDR) % NINDIR(fs)]; 1033 } 1034 /* 1035 * If we are expunging a snapshot vnode and we 1036 * find a block marked BLK_NOCOPY, then it is 1037 * one that has been allocated to this snapshot after 1038 * we took our current snapshot and can be ignored. 1039 */ 1040 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 1041 if (lbn >= NDADDR) 1042 brelse(ibp); 1043 } else { 1044 if (*blkp != 0) 1045 panic("snapacct_ufs1: bad block"); 1046 *blkp = expungetype; 1047 if (lbn >= NDADDR) 1048 bdwrite(ibp); 1049 } 1050 } 1051 return (0); 1052} 1053 1054/* 1055 * Account for a set of blocks allocated in a snapshot inode. 
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;	/* first lbn covered; -1 disables list collection */
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 means these are indirect-pointer slots: free only. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record snapshot-owned blocks in i_snapblklist as we go. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* the snapshot being built */
	struct inode *cancelip;	/* inode whose blocks are being expunged */
	struct fs *fs;		/* (copy of the) superblock to account in */
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_NOCOPY or BLK_SNAP */
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
	} else {
		/* TDP_COWINPROGRESS keeps the COW path from recursing. */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct pointers first, then the indirect pointer slots. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs2_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 *
up the block number for any blocks that are not in the cache. 1223 */ 1224 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 1225 bp->b_blkno = fsbtodb(fs, blkno); 1226 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1227 (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 1228 brelse(bp); 1229 return (error); 1230 } 1231 /* 1232 * Account for the block pointers in this indirect block. 1233 */ 1234 last = howmany(remblks, blksperindir); 1235 if (last > NINDIR(fs)) 1236 last = NINDIR(fs); 1237 MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 1238 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1239 bqrelse(bp); 1240 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1241 level == 0 ? rlbn : -1, expungetype); 1242 if (error || level == 0) 1243 goto out; 1244 /* 1245 * Account for the block pointers in each of the indirect blocks 1246 * in the levels below us. 1247 */ 1248 subblksperindir = blksperindir / NINDIR(fs); 1249 for (lbn++, level--, i = 0; i < last; i++) { 1250 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, 1251 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 1252 if (error) 1253 goto out; 1254 rlbn += blksperindir; 1255 lbn -= blksperindir; 1256 remblks -= blksperindir; 1257 } 1258out: 1259 FREE(bap, M_DEVBUF); 1260 return (error); 1261} 1262 1263/* 1264 * Do both snap accounting and map accounting. 1265 */ 1266static int 1267fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1268 struct vnode *vp; 1269 ufs2_daddr_t *oldblkp, *lastblkp; 1270 struct fs *fs; 1271 ufs_lbn_t lblkno; 1272 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1273{ 1274 int error; 1275 1276 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1277 return (error); 1278 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1279} 1280 1281/* 1282 * Identify a set of blocks allocated in a snapshot inode. 
1283 */ 1284static int 1285snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1286 struct vnode *vp; 1287 ufs2_daddr_t *oldblkp, *lastblkp; 1288 struct fs *fs; 1289 ufs_lbn_t lblkno; 1290 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1291{ 1292 struct inode *ip = VTOI(vp); 1293 ufs2_daddr_t blkno, *blkp; 1294 ufs_lbn_t lbn; 1295 struct buf *ibp; 1296 int error; 1297 1298 for ( ; oldblkp < lastblkp; oldblkp++) { 1299 blkno = *oldblkp; 1300 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1301 continue; 1302 lbn = fragstoblks(fs, blkno); 1303 if (lbn < NDADDR) { 1304 blkp = &ip->i_din2->di_db[lbn]; 1305 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1306 } else { 1307 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1308 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1309 if (error) 1310 return (error); 1311 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1312 [(lbn - NDADDR) % NINDIR(fs)]; 1313 } 1314 /* 1315 * If we are expunging a snapshot vnode and we 1316 * find a block marked BLK_NOCOPY, then it is 1317 * one that has been allocated to this snapshot after 1318 * we took our current snapshot and can be ignored. 1319 */ 1320 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 1321 if (lbn >= NDADDR) 1322 brelse(ibp); 1323 } else { 1324 if (*blkp != 0) 1325 panic("snapacct_ufs2: bad block"); 1326 *blkp = expungetype; 1327 if (lbn >= NDADDR) 1328 bdwrite(ibp); 1329 } 1330 } 1331 return (0); 1332} 1333 1334/* 1335 * Account for a set of blocks allocated in a snapshot inode. 
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 marks the indirect-pointer pass: no list entries. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record the logical block in the preallocated-block list. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/* A self-claimed block lives at its own logical address. */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;
	struct snapdata *sn;

	/*
	 * Find snapshot in incore list.
	 */
	xp = NULL;
	sn = ip->i_devvp->v_rdev->si_snapdata;
	if (sn != NULL)
		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
			if (xp == ip)
				break;
	/* Drop the extra hold taken when the snapshot went on the list. */
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift remaining entries down over the vacated slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;
	struct snapdata *sn;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	sn = devvp->v_rdev->si_snapdata;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		/*
		 * LK_INTERLOCK makes lockmgr drop the vnode interlock,
		 * so it is re-taken before touching the snapshot list.
		 */
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Detach from the shared snapshot lock, back to our own. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&sn->sn_head) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/* Last snapshot: tear down copy-on-write state. */
			devvp->v_rdev->si_copyonwrite = 0;
			snapblklist = sn->sn_blklist;
			sn->sn_blklist = 0;
			sn->sn_listsize = 0;
			devvp->v_rdev->si_snapdata = NULL;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			free(sn, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP_SET(ip, i_db[blkno], 0);
		/* Self-claimed block (addr == own lbn): offer it around. */
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
			    btodb(fs->fs_bsize));
			DIP_SET(ip, i_db[blkno], 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assurred that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot.
A copied block can never have this property because they 1543 * must always have been allocated from a BLK_NOCOPY location. 1544 */ 1545int 1546ffs_snapblkfree(fs, devvp, bno, size, inum) 1547 struct fs *fs; 1548 struct vnode *devvp; 1549 ufs2_daddr_t bno; 1550 long size; 1551 ino_t inum; 1552{ 1553 struct buf *ibp, *cbp, *savedcbp = 0; 1554 struct thread *td = curthread; 1555 struct inode *ip; 1556 struct vnode *vp = NULL; 1557 ufs_lbn_t lbn; 1558 ufs2_daddr_t blkno; 1559 int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1560 struct snapdata *sn; 1561 1562 lbn = fragstoblks(fs, bno); 1563retry: 1564 VI_LOCK(devvp); 1565 sn = devvp->v_rdev->si_snapdata;

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/* Kernel credential used for all snapshot-internal I/O. */
#define KERNCRED thread0.td_ucred
#define DEBUG 1

TAILQ_HEAD(snaphead, inode);

/*
 * Per-device snapshot state, hung off the device vnode's si_snapdata.
 * All snapshots on one filesystem share the list, the preallocated
 * block list, and the single snapshot lock.
 */
struct snapdata {
	struct snaphead sn_head;	/* list of active snapshot inodes */
	daddr_t sn_listsize;		/* entries in sn_blklist */
	daddr_t *sn_blklist;		/* sorted list of preallocated blocks */
	struct lock sn_lock;		/* lock shared by all snapshot vnodes */
};

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
127 */ 128int 129ffs_snapshot(mp, snapfile) 130 struct mount *mp; 131 char *snapfile; 132{ 133 ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 134 int error, cg, snaploc; 135 int i, size, len, loc; 136 int flag = mp->mnt_flag; 137 struct timespec starttime = {0, 0}, endtime; 138 char saved_nice = 0; 139 long redo = 0, snaplistsize = 0; 140 int32_t *lp; 141 void *space; 142 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 143 struct thread *td = curthread; 144 struct inode *ip, *xp; 145 struct buf *bp, *nbp, *ibp, *sbp = NULL; 146 struct nameidata nd; 147 struct mount *wrtmp; 148 struct vattr vat; 149 struct vnode *vp, *xvp, *nvp, *devvp; 150 struct uio auio; 151 struct iovec aiov; 152 struct snapdata *sn; 153 154 /* 155 * XXX: make sure we don't go to out1 before we setup sn 156 */ 157 sn = (void *)0xdeadbeef; 158 159 /* 160 * Need to serialize access to snapshot code per filesystem. 161 */ 162 /* 163 * Assign a snapshot slot in the superblock. 164 */ 165 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 166 if (fs->fs_snapinum[snaploc] == 0) 167 break; 168 if (snaploc == FSMAXSNAP) 169 return (ENOSPC); 170 /* 171 * Create the snapshot file. 
172 */ 173restart: 174 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td); 175 if ((error = namei(&nd)) != 0) 176 return (error); 177 if (nd.ni_vp != NULL) { 178 vput(nd.ni_vp); 179 error = EEXIST; 180 } 181 if (nd.ni_dvp->v_mount != mp) 182 error = EXDEV; 183 if (error) { 184 NDFREE(&nd, NDF_ONLY_PNBUF); 185 if (nd.ni_dvp == nd.ni_vp) 186 vrele(nd.ni_dvp); 187 else 188 vput(nd.ni_dvp); 189 return (error); 190 } 191 VATTR_NULL(&vat); 192 vat.va_type = VREG; 193 vat.va_mode = S_IRUSR; 194 vat.va_vaflags |= VA_EXCLUSIVE; 195 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 196 wrtmp = NULL; 197 if (wrtmp != mp) 198 panic("ffs_snapshot: mount mismatch"); 199 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 200 NDFREE(&nd, NDF_ONLY_PNBUF); 201 vput(nd.ni_dvp); 202 if ((error = vn_start_write(NULL, &wrtmp, 203 V_XSLEEP | PCATCH)) != 0) 204 return (error); 205 goto restart; 206 } 207 VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); 208 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 209 vput(nd.ni_dvp); 210 if (error) { 211 NDFREE(&nd, NDF_ONLY_PNBUF); 212 vn_finished_write(wrtmp); 213 return (error); 214 } 215 vp = nd.ni_vp; 216 ip = VTOI(vp); 217 devvp = ip->i_devvp; 218 /* 219 * Allocate and copy the last block contents so as to be able 220 * to set size to that of the filesystem. 221 */ 222 numblks = howmany(fs->fs_size, fs->fs_frag); 223 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 224 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 225 if (error) 226 goto out; 227 ip->i_size = lblktosize(fs, (off_t)numblks); 228 DIP_SET(ip, i_size, ip->i_size); 229 ip->i_flag |= IN_CHANGE | IN_UPDATE; 230 if ((error = readblock(vp, bp, numblks - 1)) != 0) 231 goto out; 232 bawrite(bp); 233 /* 234 * Preallocate critical data structures so that we can copy 235 * them in without further allocation after we suspend all 236 * operations on the filesystem. 
We would like to just release 237 * the allocated buffers without writing them since they will 238 * be filled in below once we are ready to go, but this upsets 239 * the soft update code, so we go ahead and write the new buffers. 240 * 241 * Allocate all indirect blocks and mark all of them as not 242 * needing to be copied. 243 */ 244 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 245 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 246 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 247 if (error) 248 goto out; 249 bawrite(ibp); 250 } 251 /* 252 * Allocate copies for the superblock and its summary information. 253 */ 254 error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 255 0, &nbp); 256 if (error) 257 goto out; 258 bawrite(nbp); 259 blkno = fragstoblks(fs, fs->fs_csaddr); 260 len = howmany(fs->fs_cssize, fs->fs_bsize); 261 for (loc = 0; loc < len; loc++) { 262 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 263 fs->fs_bsize, KERNCRED, 0, &nbp); 264 if (error) 265 goto out; 266 bawrite(nbp); 267 } 268 /* 269 * Allocate all cylinder group blocks. 270 */ 271 for (cg = 0; cg < fs->fs_ncg; cg++) { 272 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 273 fs->fs_bsize, KERNCRED, 0, &nbp); 274 if (error) 275 goto out; 276 bawrite(nbp); 277 } 278 /* 279 * Copy all the cylinder group maps. Although the 280 * filesystem is still active, we hope that only a few 281 * cylinder groups will change between now and when we 282 * suspend operations. Thus, we will be able to quickly 283 * touch up the few cylinder groups that changed during 284 * the suspension period. 
285 */ 286 len = howmany(fs->fs_ncg, NBBY); 287 MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK); 288 bzero(fs->fs_active, len); 289 for (cg = 0; cg < fs->fs_ncg; cg++) { 290 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 291 fs->fs_bsize, KERNCRED, 0, &nbp); 292 if (error) 293 goto out; 294 error = cgaccount(cg, vp, nbp, 1); 295 bawrite(nbp); 296 if (error) 297 goto out; 298 } 299 /* 300 * Change inode to snapshot type file. 301 */ 302 ip->i_flags |= SF_SNAPSHOT; 303 DIP_SET(ip, i_flags, ip->i_flags); 304 ip->i_flag |= IN_CHANGE | IN_UPDATE; 305 /* 306 * Ensure that the snapshot is completely on disk. 307 * Since we have marked it as a snapshot it is safe to 308 * unlock it as no process will be allowed to write to it. 309 */ 310 if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0) 311 goto out; 312 VOP_UNLOCK(vp, 0, td); 313 /* 314 * All allocations are done, so we can now snapshot the system. 315 * 316 * Recind nice scheduling while running with the filesystem suspended. 317 */ 318 if (td->td_proc->p_nice > 0) { 319 PROC_LOCK(td->td_proc); 320 mtx_lock_spin(&sched_lock); 321 saved_nice = td->td_proc->p_nice; 322 sched_nice(td->td_proc, 0); 323 mtx_unlock_spin(&sched_lock); 324 PROC_UNLOCK(td->td_proc); 325 } 326 /* 327 * Suspend operation on filesystem. 328 */ 329 for (;;) { 330 vn_finished_write(wrtmp); 331 if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 332 vn_start_write(NULL, &wrtmp, V_WAIT); 333 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 334 goto out; 335 } 336 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 337 break; 338 vn_start_write(NULL, &wrtmp, V_WAIT); 339 } 340 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 341 if (collectsnapstats) 342 nanotime(&starttime); 343 /* 344 * First, copy all the cylinder group maps that have changed. 
345 */ 346 for (cg = 0; cg < fs->fs_ncg; cg++) { 347 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 348 continue; 349 redo++; 350 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 351 fs->fs_bsize, KERNCRED, 0, &nbp); 352 if (error) 353 goto out1; 354 error = cgaccount(cg, vp, nbp, 2); 355 bawrite(nbp); 356 if (error) 357 goto out1; 358 } 359 /* 360 * Grab a copy of the superblock and its summary information. 361 * We delay writing it until the suspension is released below. 362 */ 363 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 364 KERNCRED, &sbp); 365 if (error) { 366 brelse(sbp); 367 sbp = NULL; 368 goto out1; 369 } 370 loc = blkoff(fs, fs->fs_sblockloc); 371 copy_fs = (struct fs *)(sbp->b_data + loc); 372 bcopy(fs, copy_fs, fs->fs_sbsize); 373 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 374 copy_fs->fs_clean = 1; 375 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 376 if (fs->fs_sbsize < size) 377 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); 378 size = blkroundup(fs, fs->fs_cssize); 379 if (fs->fs_contigsumsize > 0) 380 size += fs->fs_ncg * sizeof(int32_t); 381 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 382 copy_fs->fs_csp = space; 383 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 384 space = (char *)space + fs->fs_cssize; 385 loc = howmany(fs->fs_cssize, fs->fs_fsize); 386 i = fs->fs_frag - loc % fs->fs_frag; 387 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 388 if (len > 0) { 389 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 390 len, KERNCRED, &bp)) != 0) { 391 brelse(bp); 392 free(copy_fs->fs_csp, M_UFSMNT); 393 bawrite(sbp); 394 sbp = NULL; 395 goto out1; 396 } 397 bcopy(bp->b_data, space, (u_int)len); 398 space = (char *)space + len; 399 bp->b_flags |= B_INVAL | B_NOCACHE; 400 brelse(bp); 401 } 402 if (fs->fs_contigsumsize > 0) { 403 copy_fs->fs_maxcluster = lp = space; 404 for (i = 0; i < fs->fs_ncg; i++) 405 *lp++ = fs->fs_contigsumsize; 406 } 407 /* 408 * We must check for active files that have been unlinked 409 * (e.g., with a zero link count). We have to expunge all 410 * trace of these files from the snapshot so that they are 411 * not reclaimed prematurely by fsck or unnecessarily dumped. 412 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 413 * spec_strategy about writing on a suspended filesystem. 414 * Note that we skip unlinked snapshot files as they will 415 * be handled separately below. 416 * 417 * We also calculate the needed size for the snapshot list. 418 */ 419 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 420 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 421 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 422 MNT_ILOCK(mp); 423loop: 424 MNT_VNODE_FOREACH(xvp, mp, nvp) { 425 VI_LOCK(xvp); 426 MNT_IUNLOCK(mp); 427 if ((xvp->v_iflag & VI_XLOCK) || 428 xvp->v_usecount == 0 || xvp->v_type == VNON || 429 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 430 VI_UNLOCK(xvp); 431 MNT_ILOCK(mp); 432 continue; 433 } 434 /* 435 * We can skip parent directory vnode because it must have 436 * this snapshot file in it. 
437 */ 438 if (xvp == nd.ni_dvp) { 439 VI_UNLOCK(xvp); 440 MNT_ILOCK(mp); 441 continue; 442 } 443 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { 444 MNT_ILOCK(mp); 445 goto loop; 446 } 447 if (snapdebug) 448 vprint("ffs_snapshot: busy vnode", xvp); 449 if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && 450 vat.va_nlink > 0) { 451 VOP_UNLOCK(xvp, 0, td); 452 MNT_ILOCK(mp); 453 continue; 454 } 455 xp = VTOI(xvp); 456 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 457 VOP_UNLOCK(xvp, 0, td); 458 MNT_ILOCK(mp); 459 continue; 460 } 461 /* 462 * If there is a fragment, clear it here. 463 */ 464 blkno = 0; 465 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 466 if (loc < NDADDR) { 467 len = fragroundup(fs, blkoff(fs, xp->i_size)); 468 if (len < fs->fs_bsize) { 469 ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]), 470 len, xp->i_number); 471 blkno = DIP(xp, i_db[loc]); 472 DIP_SET(xp, i_db[loc], 0); 473 } 474 } 475 snaplistsize += 1; 476 if (xp->i_ump->um_fstype == UFS1) 477 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 478 BLK_NOCOPY); 479 else 480 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 481 BLK_NOCOPY); 482 if (blkno) 483 DIP_SET(xp, i_db[loc], blkno); 484 if (!error) 485 error = ffs_freefile(copy_fs, vp, xp->i_number, 486 xp->i_mode); 487 VOP_UNLOCK(xvp, 0, td); 488 if (error) { 489 free(copy_fs->fs_csp, M_UFSMNT); 490 bawrite(sbp); 491 sbp = NULL; 492 goto out1; 493 } 494 MNT_ILOCK(mp); 495 } 496 MNT_IUNLOCK(mp); 497 /* 498 * If there already exist snapshots on this filesystem, grab a 499 * reference to their shared lock. If this is the first snapshot 500 * on this filesystem, we need to allocate a lock for the snapshots 501 * to share. In either case, acquire the snapshot lock and give 502 * up our original private lock. 
503 */ 504 VI_LOCK(devvp); 505 sn = devvp->v_rdev->si_snapdata; 506 if (sn != NULL) { 507 xp = TAILQ_FIRST(&sn->sn_head); 508 VI_UNLOCK(devvp); 509 VI_LOCK(vp); 510 vp->v_vnlock = &sn->sn_lock; 511 } else { 512 VI_UNLOCK(devvp); 513 sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); 514 TAILQ_INIT(&sn->sn_head); 515 lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, 516 LK_CANRECURSE | LK_NOPAUSE); 517 VI_LOCK(vp); 518 vp->v_vnlock = &sn->sn_lock; 519 devvp->v_rdev->si_snapdata = sn; 520 xp = NULL; 521 } 522 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 523 transferlockers(&vp->v_lock, vp->v_vnlock); 524 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 525 /* 526 * If this is the first snapshot on this filesystem, then we need 527 * to allocate the space for the list of preallocated snapshot blocks. 528 * This list will be refined below, but this preliminary one will 529 * keep us out of deadlock until the full one is ready. 530 */ 531 if (xp == NULL) { 532 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 533 M_UFSMNT, M_WAITOK); 534 blkp = &snapblklist[1]; 535 *blkp++ = lblkno(fs, fs->fs_sblockloc); 536 blkno = fragstoblks(fs, fs->fs_csaddr); 537 for (cg = 0; cg < fs->fs_ncg; cg++) { 538 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 539 break; 540 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 541 } 542 len = howmany(fs->fs_cssize, fs->fs_bsize); 543 for (loc = 0; loc < len; loc++) 544 *blkp++ = blkno + loc; 545 for (; cg < fs->fs_ncg; cg++) 546 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 547 snapblklist[0] = blkp - snapblklist; 548 VI_LOCK(devvp); 549 if (sn->sn_blklist != NULL) 550 panic("ffs_snapshot: non-empty list"); 551 sn->sn_blklist = snapblklist; 552 sn->sn_listsize = blkp - snapblklist; 553 VI_UNLOCK(devvp); 554 } 555 /* 556 * Record snapshot inode. Since this is the newest snapshot, 557 * it must be placed at the end of the list. 
558 */ 559 VI_LOCK(devvp); 560 fs->fs_snapinum[snaploc] = ip->i_number; 561 if (ip->i_nextsnap.tqe_prev != 0) 562 panic("ffs_snapshot: %d already on list", ip->i_number); 563 TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 564 devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 565 devvp->v_vflag |= VV_COPYONWRITE; 566 VI_UNLOCK(devvp); 567 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 568 vp->v_vflag |= VV_SYSTEM; 569out1: 570 KASSERT(sn != (void *)0xdeadbeef, ("email phk@ and mckusick@")); 571 /* 572 * Resume operation on filesystem. 573 */ 574 vfs_write_resume(vp->v_mount); 575 vn_start_write(NULL, &wrtmp, V_WAIT); 576 if (collectsnapstats && starttime.tv_sec > 0) { 577 nanotime(&endtime); 578 timespecsub(&endtime, &starttime); 579 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 580 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 581 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 582 } 583 if (sbp == NULL) 584 goto out; 585 /* 586 * Copy allocation information from all the snapshots in 587 * this snapshot and then expunge them from its view. 588 */ 589 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { 590 if (xp == ip) 591 break; 592 if (xp->i_ump->um_fstype == UFS1) 593 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 594 BLK_SNAP); 595 else 596 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 597 BLK_SNAP); 598 if (error) { 599 fs->fs_snapinum[snaploc] = 0; 600 goto done; 601 } 602 } 603 /* 604 * Allocate space for the full list of preallocated snapshot blocks. 605 */ 606 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 607 M_UFSMNT, M_WAITOK); 608 ip->i_snapblklist = &snapblklist[1]; 609 /* 610 * Expunge the blocks used by the snapshots from the set of 611 * blocks marked as used in the snapshot bitmaps. Also, collect 612 * the list of allocated blocks in i_snapblklist. 
613 */ 614 if (ip->i_ump->um_fstype == UFS1) 615 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 616 else 617 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 618 if (error) { 619 fs->fs_snapinum[snaploc] = 0; 620 FREE(snapblklist, M_UFSMNT); 621 goto done; 622 } 623 if (snaplistsize < ip->i_snapblklist - snapblklist) 624 panic("ffs_snapshot: list too small"); 625 snaplistsize = ip->i_snapblklist - snapblklist; 626 snapblklist[0] = snaplistsize; 627 ip->i_snapblklist = 0; 628 /* 629 * Write out the list of allocated blocks to the end of the snapshot. 630 */ 631 auio.uio_iov = &aiov; 632 auio.uio_iovcnt = 1; 633 aiov.iov_base = (void *)snapblklist; 634 aiov.iov_len = snaplistsize * sizeof(daddr_t); 635 auio.uio_resid = aiov.iov_len;; 636 auio.uio_offset = ip->i_size; 637 auio.uio_segflg = UIO_SYSSPACE; 638 auio.uio_rw = UIO_WRITE; 639 auio.uio_td = td; 640 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 641 fs->fs_snapinum[snaploc] = 0; 642 FREE(snapblklist, M_UFSMNT); 643 goto done; 644 } 645 /* 646 * Write the superblock and its summary information 647 * to the snapshot. 648 */ 649 blkno = fragstoblks(fs, fs->fs_csaddr); 650 len = howmany(fs->fs_cssize, fs->fs_bsize); 651 space = copy_fs->fs_csp; 652 for (loc = 0; loc < len; loc++) { 653 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 654 if (error) { 655 brelse(nbp); 656 fs->fs_snapinum[snaploc] = 0; 657 FREE(snapblklist, M_UFSMNT); 658 goto done; 659 } 660 bcopy(space, nbp->b_data, fs->fs_bsize); 661 space = (char *)space + fs->fs_bsize; 662 bawrite(nbp); 663 } 664 /* 665 * As this is the newest list, it is the most inclusive, so 666 * should replace the previous list. 
667 */ 668 VI_LOCK(devvp); 669 space = sn->sn_blklist; 670 sn->sn_blklist = snapblklist; 671 sn->sn_listsize = snaplistsize; 672 VI_UNLOCK(devvp); 673 if (space != NULL) 674 FREE(space, M_UFSMNT); 675done: 676 FREE(copy_fs->fs_csp, M_UFSMNT); 677 bawrite(sbp); 678out: 679 if (saved_nice > 0) { 680 PROC_LOCK(td->td_proc); 681 mtx_lock_spin(&sched_lock); 682 sched_nice(td->td_proc, saved_nice); 683 mtx_unlock_spin(&sched_lock); 684 PROC_UNLOCK(td->td_proc); 685 } 686 if (fs->fs_active != 0) { 687 FREE(fs->fs_active, M_DEVBUF); 688 fs->fs_active = 0; 689 } 690 mp->mnt_flag = flag; 691 if (error) 692 (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 693 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 694 if (error) 695 vput(vp); 696 else 697 VOP_UNLOCK(vp, 0, td); 698 vn_finished_write(wrtmp); 699 return (error); 700} 701 702/* 703 * Copy a cylinder group map. All the unallocated blocks are marked 704 * BLK_NOCOPY so that the snapshot knows that it need not copy them 705 * if they are later written. If passno is one, then this is a first 706 * pass, so only setting needs to be done. If passno is 2, then this 707 * is a revision to a previous pass which must be undone as the 708 * replacement pass is done. 
709 */ 710static int 711cgaccount(cg, vp, nbp, passno) 712 int cg; 713 struct vnode *vp; 714 struct buf *nbp; 715 int passno; 716{ 717 struct buf *bp, *ibp; 718 struct inode *ip; 719 struct cg *cgp; 720 struct fs *fs; 721 ufs2_daddr_t base, numblks; 722 int error, len, loc, indiroff; 723 724 ip = VTOI(vp); 725 fs = ip->i_fs; 726 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 727 (int)fs->fs_cgsize, KERNCRED, &bp); 728 if (error) { 729 brelse(bp); 730 return (error); 731 } 732 cgp = (struct cg *)bp->b_data; 733 if (!cg_chkmagic(cgp)) { 734 brelse(bp); 735 return (EIO); 736 } 737 atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 738 bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); 739 if (fs->fs_cgsize < fs->fs_bsize) 740 bzero(&nbp->b_data[fs->fs_cgsize], 741 fs->fs_bsize - fs->fs_cgsize); 742 if (passno == 2) 743 nbp->b_flags |= B_VALIDSUSPWRT; 744 numblks = howmany(fs->fs_size, fs->fs_frag); 745 len = howmany(fs->fs_fpg, fs->fs_frag); 746 base = cg * fs->fs_fpg / fs->fs_frag; 747 if (base + len >= numblks) 748 len = numblks - base - 1; 749 loc = 0; 750 if (base < NDADDR) { 751 for ( ; loc < NDADDR; loc++) { 752 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 753 DIP_SET(ip, i_db[loc], BLK_NOCOPY); 754 else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) 755 DIP_SET(ip, i_db[loc], 0); 756 else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) 757 panic("ffs_snapshot: lost direct block"); 758 } 759 } 760 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 761 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 762 if (error) { 763 brelse(bp); 764 return (error); 765 } 766 indiroff = (base + loc - NDADDR) % NINDIR(fs); 767 for ( ; loc < len; loc++, indiroff++) { 768 if (indiroff >= NINDIR(fs)) { 769 if (passno == 2) 770 ibp->b_flags |= B_VALIDSUSPWRT; 771 bawrite(ibp); 772 error = UFS_BALLOC(vp, 773 lblktosize(fs, (off_t)(base + loc)), 774 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 775 if (error) { 776 brelse(bp); 777 return (error); 778 } 779 indiroff 
= 0; 780 } 781 if (ip->i_ump->um_fstype == UFS1) { 782 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 783 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 784 BLK_NOCOPY; 785 else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) 786 [indiroff] == BLK_NOCOPY) 787 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; 788 else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) 789 [indiroff] == BLK_NOCOPY) 790 panic("ffs_snapshot: lost indirect block"); 791 continue; 792 } 793 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 794 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; 795 else if (passno == 2 && 796 ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 797 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; 798 else if (passno == 1 && 799 ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 800 panic("ffs_snapshot: lost indirect block"); 801 } 802 bqrelse(bp); 803 if (passno == 2) 804 ibp->b_flags |= B_VALIDSUSPWRT; 805 bdwrite(ibp); 806 return (0); 807} 808 809/* 810 * Before expunging a snapshot inode, note all the 811 * blocks that it claims with BLK_SNAP so that fsck will 812 * be able to account for those blocks properly and so 813 * that this snapshot knows that it need not copy them 814 * if the other snapshot holding them is freed. This code 815 * is reproduced once each for UFS1 and UFS2. 816 */ 817static int 818expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 819 struct vnode *snapvp; 820 struct inode *cancelip; 821 struct fs *fs; 822 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 823 struct fs *, ufs_lbn_t, int); 824 int expungetype; 825{ 826 int i, error, indiroff; 827 ufs_lbn_t lbn, rlbn; 828 ufs2_daddr_t len, blkno, numblks, blksperindir; 829 struct ufs1_dinode *dip; 830 struct thread *td = curthread; 831 struct buf *bp; 832 833 /* 834 * Prepare to expunge the inode. If its inode block has not 835 * yet been copied, then allocate and fill the copy. 
836 */ 837 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 838 blkno = 0; 839 if (lbn < NDADDR) { 840 blkno = VTOI(snapvp)->i_din1->di_db[lbn]; 841 } else { 842 td->td_pflags |= TDP_COWINPROGRESS; 843 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 844 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 845 td->td_pflags &= ~TDP_COWINPROGRESS; 846 if (error) 847 return (error); 848 indiroff = (lbn - NDADDR) % NINDIR(fs); 849 blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; 850 bqrelse(bp); 851 } 852 if (blkno != 0) { 853 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 854 return (error); 855 } else { 856 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 857 fs->fs_bsize, KERNCRED, 0, &bp); 858 if (error) 859 return (error); 860 if ((error = readblock(snapvp, bp, lbn)) != 0) 861 return (error); 862 } 863 /* 864 * Set a snapshot inode to be a zero length file, regular files 865 * to be completely unallocated. 866 */ 867 dip = (struct ufs1_dinode *)bp->b_data + 868 ino_to_fsbo(fs, cancelip->i_number); 869 if (expungetype == BLK_NOCOPY) 870 dip->di_mode = 0; 871 dip->di_size = 0; 872 dip->di_blocks = 0; 873 dip->di_flags &= ~SF_SNAPSHOT; 874 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 875 bdwrite(bp); 876 /* 877 * Now go through and expunge all the blocks in the file 878 * using the function requested. 
879 */ 880 numblks = howmany(cancelip->i_size, fs->fs_bsize); 881 if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], 882 &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) 883 return (error); 884 if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], 885 &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) 886 return (error); 887 blksperindir = 1; 888 lbn = -NDADDR; 889 len = numblks - NDADDR; 890 rlbn = NDADDR; 891 for (i = 0; len > 0 && i < NIADDR; i++) { 892 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 893 cancelip->i_din1->di_ib[i], lbn, rlbn, len, 894 blksperindir, fs, acctfunc, expungetype); 895 if (error) 896 return (error); 897 blksperindir *= NINDIR(fs); 898 lbn -= blksperindir + 1; 899 len -= blksperindir; 900 rlbn += blksperindir; 901 } 902 return (0); 903} 904 905/* 906 * Descend an indirect block chain for vnode cancelvp accounting for all 907 * its indirect blocks in snapvp. 908 */ 909static int 910indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 911 blksperindir, fs, acctfunc, expungetype) 912 struct vnode *snapvp; 913 struct vnode *cancelvp; 914 int level; 915 ufs1_daddr_t blkno; 916 ufs_lbn_t lbn; 917 ufs_lbn_t rlbn; 918 ufs_lbn_t remblks; 919 ufs_lbn_t blksperindir; 920 struct fs *fs; 921 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 922 struct fs *, ufs_lbn_t, int); 923 int expungetype; 924{ 925 int error, num, i; 926 ufs_lbn_t subblksperindir; 927 struct indir indirs[NIADDR + 2]; 928 ufs1_daddr_t last, *bap; 929 struct buf *bp; 930 931 if (blkno == 0) { 932 if (expungetype == BLK_NOCOPY) 933 return (0); 934 panic("indiracct_ufs1: missing indir"); 935 } 936 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 937 return (error); 938 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 939 panic("indiracct_ufs1: botched params"); 940 /* 941 * We have to expand bread here since it will deadlock looking 942 * up the block number for any blocks that are not in the cache. 
943 */ 944 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 945 bp->b_blkno = fsbtodb(fs, blkno); 946 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 947 (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 948 brelse(bp); 949 return (error); 950 } 951 /* 952 * Account for the block pointers in this indirect block. 953 */ 954 last = howmany(remblks, blksperindir); 955 if (last > NINDIR(fs)) 956 last = NINDIR(fs); 957 MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 958 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 959 bqrelse(bp); 960 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 961 level == 0 ? rlbn : -1, expungetype); 962 if (error || level == 0) 963 goto out; 964 /* 965 * Account for the block pointers in each of the indirect blocks 966 * in the levels below us. 967 */ 968 subblksperindir = blksperindir / NINDIR(fs); 969 for (lbn++, level--, i = 0; i < last; i++) { 970 error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, 971 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 972 if (error) 973 goto out; 974 rlbn += blksperindir; 975 lbn -= blksperindir; 976 remblks -= blksperindir; 977 } 978out: 979 FREE(bap, M_DEVBUF); 980 return (error); 981} 982 983/* 984 * Do both snap accounting and map accounting. 985 */ 986static int 987fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 988 struct vnode *vp; 989 ufs1_daddr_t *oldblkp, *lastblkp; 990 struct fs *fs; 991 ufs_lbn_t lblkno; 992 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 993{ 994 int error; 995 996 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 997 return (error); 998 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 999} 1000 1001/* 1002 * Identify a set of blocks allocated in a snapshot inode. 
 */
/*
 * snapacct_ufs1: for each block pointer in [oldblkp, lastblkp) that maps a
 * real block, locate the corresponding pointer slot in snapshot vp and
 * mark it with expungetype (BLK_SNAP or BLK_NOCOPY).  lblkno is unused
 * here (kept for acctfunc signature compatibility).  Returns 0 or errno.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and blocks already classified. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Direct pointer lives in the snapshot inode. */
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Indirect pointer: fetch the indirect block. */
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			/* Slot must be empty before it can be claimed. */
			if (*blkp != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
1056 */ 1057static int 1058mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1059 struct vnode *vp; 1060 ufs1_daddr_t *oldblkp, *lastblkp; 1061 struct fs *fs; 1062 ufs_lbn_t lblkno; 1063 int expungetype; 1064{ 1065 ufs1_daddr_t blkno; 1066 struct inode *ip; 1067 ino_t inum; 1068 int acctit; 1069 1070 ip = VTOI(vp); 1071 inum = ip->i_number; 1072 if (lblkno == -1) 1073 acctit = 0; 1074 else 1075 acctit = 1; 1076 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1077 blkno = *oldblkp; 1078 if (blkno == 0 || blkno == BLK_NOCOPY) 1079 continue; 1080 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1081 *ip->i_snapblklist++ = lblkno; 1082 if (blkno == BLK_SNAP) 1083 blkno = blkstofrags(fs, lblkno); 1084 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1085 } 1086 return (0); 1087} 1088 1089/* 1090 * Before expunging a snapshot inode, note all the 1091 * blocks that it claims with BLK_SNAP so that fsck will 1092 * be able to account for those blocks properly and so 1093 * that this snapshot knows that it need not copy them 1094 * if the other snapshot holding them is freed. This code 1095 * is reproduced once each for UFS1 and UFS2. 1096 */ 1097static int 1098expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 1099 struct vnode *snapvp; 1100 struct inode *cancelip; 1101 struct fs *fs; 1102 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1103 struct fs *, ufs_lbn_t, int); 1104 int expungetype; 1105{ 1106 int i, error, indiroff; 1107 ufs_lbn_t lbn, rlbn; 1108 ufs2_daddr_t len, blkno, numblks, blksperindir; 1109 struct ufs2_dinode *dip; 1110 struct thread *td = curthread; 1111 struct buf *bp; 1112 1113 /* 1114 * Prepare to expunge the inode. If its inode block has not 1115 * yet been copied, then allocate and fill the copy. 
1116 */ 1117 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1118 blkno = 0; 1119 if (lbn < NDADDR) { 1120 blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 1121 } else { 1122 td->td_pflags |= TDP_COWINPROGRESS; 1123 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1124 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1125 td->td_pflags &= ~TDP_COWINPROGRESS; 1126 if (error) 1127 return (error); 1128 indiroff = (lbn - NDADDR) % NINDIR(fs); 1129 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 1130 bqrelse(bp); 1131 } 1132 if (blkno != 0) { 1133 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1134 return (error); 1135 } else { 1136 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1137 fs->fs_bsize, KERNCRED, 0, &bp); 1138 if (error) 1139 return (error); 1140 if ((error = readblock(snapvp, bp, lbn)) != 0) 1141 return (error); 1142 } 1143 /* 1144 * Set a snapshot inode to be a zero length file, regular files 1145 * to be completely unallocated. 1146 */ 1147 dip = (struct ufs2_dinode *)bp->b_data + 1148 ino_to_fsbo(fs, cancelip->i_number); 1149 if (expungetype == BLK_NOCOPY) 1150 dip->di_mode = 0; 1151 dip->di_size = 0; 1152 dip->di_blocks = 0; 1153 dip->di_flags &= ~SF_SNAPSHOT; 1154 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1155 bdwrite(bp); 1156 /* 1157 * Now go through and expunge all the blocks in the file 1158 * using the function requested. 
1159 */ 1160 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1161 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], 1162 &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) 1163 return (error); 1164 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], 1165 &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) 1166 return (error); 1167 blksperindir = 1; 1168 lbn = -NDADDR; 1169 len = numblks - NDADDR; 1170 rlbn = NDADDR; 1171 for (i = 0; len > 0 && i < NIADDR; i++) { 1172 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1173 cancelip->i_din2->di_ib[i], lbn, rlbn, len, 1174 blksperindir, fs, acctfunc, expungetype); 1175 if (error) 1176 return (error); 1177 blksperindir *= NINDIR(fs); 1178 lbn -= blksperindir + 1; 1179 len -= blksperindir; 1180 rlbn += blksperindir; 1181 } 1182 return (0); 1183} 1184 1185/* 1186 * Descend an indirect block chain for vnode cancelvp accounting for all 1187 * its indirect blocks in snapvp. 1188 */ 1189static int 1190indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1191 blksperindir, fs, acctfunc, expungetype) 1192 struct vnode *snapvp; 1193 struct vnode *cancelvp; 1194 int level; 1195 ufs2_daddr_t blkno; 1196 ufs_lbn_t lbn; 1197 ufs_lbn_t rlbn; 1198 ufs_lbn_t remblks; 1199 ufs_lbn_t blksperindir; 1200 struct fs *fs; 1201 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1202 struct fs *, ufs_lbn_t, int); 1203 int expungetype; 1204{ 1205 int error, num, i; 1206 ufs_lbn_t subblksperindir; 1207 struct indir indirs[NIADDR + 2]; 1208 ufs2_daddr_t last, *bap; 1209 struct buf *bp; 1210 1211 if (blkno == 0) { 1212 if (expungetype == BLK_NOCOPY) 1213 return (0); 1214 panic("indiracct_ufs2: missing indir"); 1215 } 1216 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1217 return (error); 1218 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1219 panic("indiracct_ufs2: botched params"); 1220 /* 1221 * We have to expand bread here since it will deadlock looking 1222 * 
up the block number for any blocks that are not in the cache. 1223 */ 1224 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 1225 bp->b_blkno = fsbtodb(fs, blkno); 1226 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1227 (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 1228 brelse(bp); 1229 return (error); 1230 } 1231 /* 1232 * Account for the block pointers in this indirect block. 1233 */ 1234 last = howmany(remblks, blksperindir); 1235 if (last > NINDIR(fs)) 1236 last = NINDIR(fs); 1237 MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 1238 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1239 bqrelse(bp); 1240 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1241 level == 0 ? rlbn : -1, expungetype); 1242 if (error || level == 0) 1243 goto out; 1244 /* 1245 * Account for the block pointers in each of the indirect blocks 1246 * in the levels below us. 1247 */ 1248 subblksperindir = blksperindir / NINDIR(fs); 1249 for (lbn++, level--, i = 0; i < last; i++) { 1250 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, 1251 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 1252 if (error) 1253 goto out; 1254 rlbn += blksperindir; 1255 lbn -= blksperindir; 1256 remblks -= blksperindir; 1257 } 1258out: 1259 FREE(bap, M_DEVBUF); 1260 return (error); 1261} 1262 1263/* 1264 * Do both snap accounting and map accounting. 1265 */ 1266static int 1267fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1268 struct vnode *vp; 1269 ufs2_daddr_t *oldblkp, *lastblkp; 1270 struct fs *fs; 1271 ufs_lbn_t lblkno; 1272 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1273{ 1274 int error; 1275 1276 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1277 return (error); 1278 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1279} 1280 1281/* 1282 * Identify a set of blocks allocated in a snapshot inode. 
 */
/*
 * snapacct_ufs2: for each block pointer in [oldblkp, lastblkp) that maps a
 * real block, locate the corresponding pointer slot in snapshot vp and
 * mark it with expungetype (BLK_SNAP or BLK_NOCOPY).  lblkno is unused
 * here (kept for acctfunc signature compatibility).  Returns 0 or errno.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and blocks already classified. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Direct pointer lives in the snapshot inode. */
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Indirect pointer: fetch the indirect block. */
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			/* Slot must be empty before it can be claimed. */
			if (*blkp != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
/*
 * mapacct_ufs2: free each real block in [oldblkp, lastblkp) back to the
 * filesystem, recording its logical block number in i_snapblklist when
 * it belongs to a snapshot (lblkno == -1 suppresses list accounting for
 * the indirect-pointer pass).  Returns 0.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/* A self-claimed block frees at its own logical address. */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;
	struct snapdata *sn;

	/*
	 * Find snapshot in incore list.
	 */
	xp = NULL;
	sn = ip->i_devvp->v_rdev->si_snapdata;
	if (sn != NULL)
		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
			if (xp == ip)
				break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the freed slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;
	struct snapdata *sn;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	sn = devvp->v_rdev->si_snapdata;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		/* LK_INTERLOCK drops the vnode interlock; re-take it. */
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Give vp back its private lock in place of the shared one. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&sn->sn_head) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/* Last snapshot: tear down shared state. */
			devvp->v_rdev->si_copyonwrite = 0;
			snapblklist = sn->sn_blklist;
			sn->sn_blklist = 0;
			sn->sn_listsize = 0;
			devvp->v_rdev->si_snapdata = NULL;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			free(sn, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP_SET(ip, i_db[blkno], 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
			    btodb(fs->fs_bsize));
			DIP_SET(ip, i_db[blkno], 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot.
A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snapdata *sn;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	/*
	 * NOTE(review): sn is dereferenced without a NULL check; this
	 * presumably relies on the caller only invoking us while
	 * si_copyonwrite/snapshots are active -- confirm against callers.
	 */
	sn = devvp->v_rdev->si_snapdata;
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			/* Must hold the snapshot lock to walk indirects. */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			if (lbn < NDADDR) {
				DIP_SET(ip, i_db[lbn], bno);
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			/* Claimed: tell the caller not to free the block. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snapdata *sn;
	struct vnode *vp;
	struct inode *ip;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
1774 */ 1775 mp->mnt_stat.f_iosize = fs->fs_bsize; 1776 /* 1777 * Process each snapshot listed in the superblock. 1778 */ 1779 vp = NULL; 1780 sn = devvp->v_rdev->si_snapdata; 1781 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1782 if (fs->fs_snapinum[snaploc] == 0) 1783 break; 1784 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1785 LK_EXCLUSIVE, &vp)) != 0){ 1786 printf("ffs_snapshot_mount: vget failed %d\n", error); 1787 continue; 1788 } 1789 ip = VTOI(vp); 1790 if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size == 1791 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { 1792 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1793 reason = "non-snapshot"; 1794 } else { 1795 reason = "old format snapshot"; 1796 (void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 1797 (void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 1798 } 1799 printf("ffs_snapshot_mount: %s inode %d\n", 1800 reason, fs->fs_snapinum[snaploc]); 1801 vput(vp); 1802 vp = NULL; 1803 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1804 if (fs->fs_snapinum[loc] == 0) 1805 break; 1806 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1807 } 1808 fs->fs_snapinum[loc - 1] = 0; 1809 snaploc--; 1810 continue; 1811 } 1812 /* 1813 * If there already exist snapshots on this filesystem, grab a 1814 * reference to their shared lock. If this is the first snapshot 1815 * on this filesystem, we need to allocate a lock for the 1816 * snapshots to share. In either case, acquire the snapshot 1817 * lock and give up our original private lock. 
1818 */ 1819 VI_LOCK(devvp); 1820 if (sn != NULL) { 1821 1822 VI_UNLOCK(devvp); 1823 VI_LOCK(vp); 1824 vp->v_vnlock = &sn->sn_lock; 1825 } else { 1826 VI_UNLOCK(devvp); 1827 sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); 1828 TAILQ_INIT(&sn->sn_head); 1829 lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, 1830 LK_CANRECURSE | LK_NOPAUSE); 1831 VI_LOCK(vp); 1832 vp->v_vnlock = &sn->sn_lock; 1833 devvp->v_rdev->si_snapdata = sn; 1834 } 1835 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 1836 transferlockers(&vp->v_lock, vp->v_vnlock); 1837 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 1838 /* 1839 * Link it onto the active snapshot list. 1840 */ 1841 VI_LOCK(devvp); 1842 if (ip->i_nextsnap.tqe_prev != 0) 1843 panic("ffs_snapshot_mount: %d already on list", 1844 ip->i_number); 1845 else 1846 TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 1847 vp->v_vflag |= VV_SYSTEM; 1848 VI_UNLOCK(devvp); 1849 VOP_UNLOCK(vp, 0, td); 1850 } 1851 /* 1852 * No usable snapshots found. 1853 */ 1854 if (vp == NULL) 1855 return; 1856 /* 1857 * Allocate the space for the block hints list. We always want to 1858 * use the list from the newest snapshot. 
1859 */ 1860 auio.uio_iov = &aiov; 1861 auio.uio_iovcnt = 1; 1862 aiov.iov_base = (void *)&snaplistsize; 1863 aiov.iov_len = sizeof(snaplistsize); 1864 auio.uio_resid = aiov.iov_len; 1865 auio.uio_offset = 1866 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); 1867 auio.uio_segflg = UIO_SYSSPACE; 1868 auio.uio_rw = UIO_READ; 1869 auio.uio_td = td; 1870 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1871 if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 1872 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1873 VOP_UNLOCK(vp, 0, td); 1874 return; 1875 } 1876 MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t), 1877 M_UFSMNT, M_WAITOK); 1878 auio.uio_iovcnt = 1; 1879 aiov.iov_base = snapblklist; 1880 aiov.iov_len = snaplistsize * sizeof (daddr_t); 1881 auio.uio_resid = aiov.iov_len; 1882 auio.uio_offset -= sizeof(snaplistsize); 1883 if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 1884 printf("ffs_snapshot_mount: read_2 failed %d\n", error); 1885 VOP_UNLOCK(vp, 0, td); 1886 FREE(snapblklist, M_UFSMNT); 1887 return; 1888 } 1889 VOP_UNLOCK(vp, 0, td); 1890 VI_LOCK(devvp); 1891 ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); 1892 devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 1893 sn->sn_listsize = snaplistsize; 1894 sn->sn_blklist = (daddr_t *)snapblklist; 1895 devvp->v_vflag |= VV_COPYONWRITE; 1896 VI_UNLOCK(devvp); 1897} 1898 1899/* 1900 * Disassociate snapshot files when unmounting. 
1901 */ 1902void 1903ffs_snapshot_unmount(mp) 1904 struct mount *mp; 1905{ 1906 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1907 struct snapdata *sn; 1908 struct inode *xp; 1909 struct vnode *vp; 1910 1911 sn = devvp->v_rdev->si_snapdata; 1912 VI_LOCK(devvp); 1913 while ((xp = TAILQ_FIRST(&sn->sn_head)) != 0) { 1914 vp = ITOV(xp); 1915 vp->v_vnlock = &vp->v_lock; 1916 TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); 1917 xp->i_nextsnap.tqe_prev = 0; 1918 if (xp->i_effnlink > 0) { 1919 VI_UNLOCK(devvp); 1920 vrele(vp); 1921 VI_LOCK(devvp); 1922 } 1923 } 1924 if (sn->sn_blklist != NULL) { 1925 FREE(sn->sn_blklist, M_UFSMNT); 1926 sn->sn_blklist = NULL; 1927 sn->sn_listsize = 0; 1928 } 1929 lockdestroy(&sn->sn_lock); 1930 free(sn, M_UFSMNT); 1931 ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); 1932 devvp->v_rdev->si_copyonwrite = 0; 1933 devvp->v_rdev->si_snapdata = NULL; 1934 devvp->v_vflag &= ~VV_COPYONWRITE; 1935 VI_UNLOCK(devvp); 1936} 1937 1938/* 1939 * Check for need to copy block that is about to be written, 1940 * copying the block if necessary. 1941 */ 1942static int 1943ffs_copyonwrite(devvp, bp) 1944 struct vnode *devvp; 1945 struct buf *bp; 1946{ 1947 struct snapdata *sn; 1948 struct buf *ibp, *cbp, *savedcbp = 0; 1949 struct thread *td = curthread; 1950 struct fs *fs; 1951 struct inode *ip; 1952 struct vnode *vp = 0; 1953 ufs2_daddr_t lbn, blkno, *snapblklist; 1954 int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0; 1955 1956 if (td->td_pflags & TDP_COWINPROGRESS) 1957 panic("ffs_copyonwrite: recursive call"); 1958 /* 1959 * First check to see if it is in the preallocated list. 1960 * By doing this check we avoid several potential deadlocks. 
1961 */ 1962 VI_LOCK(devvp); 1963 sn = devvp->v_rdev->si_snapdata; 1964 ip = TAILQ_FIRST(&sn->sn_head); 1965 fs = ip->i_fs; 1966 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 1967 snapblklist = sn->sn_blklist; 1968 upper = sn->sn_listsize - 1; 1969 lower = 1; 1970 while (lower <= upper) { 1971 mid = (lower + upper) / 2; 1972 if (snapblklist[mid] == lbn) 1973 break; 1974 if (snapblklist[mid] < lbn) 1975 lower = mid + 1; 1976 else 1977 upper = mid - 1; 1978 } 1979 if (lower <= upper) { 1980 VI_UNLOCK(devvp); 1981 return (0); 1982 } 1983 /* 1984 * Not in the precomputed list, so check the snapshots. 1985 */ 1986retry: 1987 TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { 1988 vp = ITOV(ip); 1989 /* 1990 * We ensure that everything of our own that needs to be 1991 * copied will be done at the time that ffs_snapshot is 1992 * called. Thus we can skip the check here which can 1993 * deadlock in doing the lookup in UFS_BALLOC. 1994 */ 1995 if (bp->b_vp == vp) 1996 continue; 1997 /* 1998 * Check to see if block needs to be copied. We do not have 1999 * to hold the snapshot lock while doing this lookup as it 2000 * will never require any additional allocations for the 2001 * snapshot inode. 
2002 */ 2003 if (lbn < NDADDR) { 2004 blkno = DIP(ip, i_db[lbn]); 2005 } else { 2006 if (snapshot_locked == 0 && 2007 lockmgr(vp->v_vnlock, 2008 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 2009 VI_MTX(devvp), td) != 0) { 2010 VI_LOCK(devvp); 2011 goto retry; 2012 } 2013 snapshot_locked = 1; 2014 td->td_pflags |= TDP_COWINPROGRESS; 2015 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2016 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 2017 td->td_pflags &= ~TDP_COWINPROGRESS; 2018 if (error) 2019 break; 2020 indiroff = (lbn - NDADDR) % NINDIR(fs); 2021 if (ip->i_ump->um_fstype == UFS1) 2022 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 2023 else 2024 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 2025 bqrelse(ibp); 2026 } 2027#ifdef DIAGNOSTIC 2028 if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 2029 panic("ffs_copyonwrite: bad copy block"); 2030#endif 2031 if (blkno != 0) 2032 continue; 2033 /* 2034 * Allocate the block into which to do the copy. Since 2035 * multiple processes may all try to copy the same block, 2036 * we have to recheck our need to do a copy if we sleep 2037 * waiting for the lock. 2038 * 2039 * Because all snapshots on a filesystem share a single 2040 * lock, we ensure that we will never be in competition 2041 * with another process to allocate a block. 
2042 */ 2043 if (snapshot_locked == 0 && 2044 lockmgr(vp->v_vnlock, 2045 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 2046 VI_MTX(devvp), td) != 0) { 2047 VI_LOCK(devvp); 2048 goto retry; 2049 } 2050 snapshot_locked = 1; 2051 td->td_pflags |= TDP_COWINPROGRESS; 2052 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2053 fs->fs_bsize, KERNCRED, 0, &cbp); 2054 td->td_pflags &= ~TDP_COWINPROGRESS; 2055 if (error) 2056 break; 2057#ifdef DEBUG 2058 if (snapdebug) { 2059 printf("Copyonwrite: snapino %d lbn %jd for ", 2060 ip->i_number, (intmax_t)lbn); 2061 if (bp->b_vp == devvp) 2062 printf("fs metadata"); 2063 else 2064 printf("inum %d", VTOI(bp->b_vp)->i_number); 2065 printf(" lblkno %jd to blkno %jd\n", 2066 (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); 2067 } 2068#endif 2069 /* 2070 * If we have already read the old block contents, then 2071 * simply copy them to the new block. Note that we need 2072 * to synchronously write snapshots that have not been 2073 * unlinked, and hence will be visible after a crash, 2074 * to ensure their integrity. 2075 */ 2076 if (savedcbp != 0) { 2077 bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 2078 bawrite(cbp); 2079 if (dopersistence && ip->i_effnlink > 0) 2080 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 2081 continue; 2082 } 2083 /* 2084 * Otherwise, read the old block contents into the buffer. 2085 */ 2086 if ((error = readblock(vp, cbp, lbn)) != 0) { 2087 bzero(cbp->b_data, fs->fs_bsize); 2088 bawrite(cbp); 2089 if (dopersistence && ip->i_effnlink > 0) 2090 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 2091 break; 2092 } 2093 savedcbp = cbp; 2094 } 2095 /* 2096 * Note that we need to synchronously write snapshots that 2097 * have not been unlinked, and hence will be visible after 2098 * a crash, to ensure their integrity. 
2099 */ 2100 if (savedcbp) { 2101 vp = savedcbp->b_vp; 2102 bawrite(savedcbp); 2103 if (dopersistence && VTOI(vp)->i_effnlink > 0) 2104 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 2105 } 2106 if (snapshot_locked) 2107 VOP_UNLOCK(vp, 0, td); 2108 else 2109 VI_UNLOCK(devvp); 2110 return (error); 2111} 2112 2113/* 2114 * Read the specified block into the given buffer. 2115 * Much of this boiler-plate comes from bwrite(). 2116 */ 2117static int 2118readblock(vp, bp, lbn) 2119 struct vnode *vp; 2120 struct buf *bp; 2121 ufs2_daddr_t lbn; 2122{ 2123 struct uio auio; 2124 struct iovec aiov; 2125 struct thread *td = curthread; 2126 struct inode *ip = VTOI(vp); 2127 2128 aiov.iov_base = bp->b_data; 2129 aiov.iov_len = bp->b_bcount; 2130 auio.uio_iov = &aiov; 2131 auio.uio_iovcnt = 1; 2132 auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); 2133 auio.uio_resid = bp->b_bcount; 2134 auio.uio_rw = UIO_READ; 2135 auio.uio_segflg = UIO_SYSSPACE; 2136 auio.uio_td = td; 2137 return (physio(ip->i_devvp->v_rdev, &auio, 0)); 2138}
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			/*
			 * Indirect lookups need the snapshot lock; if it
			 * cannot be obtained, restart the scan from the
			 * retry label.
			 */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			/*
			 * Steal the full-sized block being freed and hang
			 * it directly off the snapshot inode.
			 */
			if (lbn < NDADDR) {
				DIP_SET(ip, i_db[lbn], bno);
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			/* Non-zero tells the caller not to free the block. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 *
 * Each snapshot inode recorded in the superblock is revalidated,
 * switched over to the per-device shared snapshot lock, and linked
 * onto the active snapshot list.  Finally, the preallocated-block
 * hint list is read from the end of the newest snapshot file and
 * the device is marked for copy-on-write.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snapdata *sn;
	struct vnode *vp;
	struct inode *ip;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	sn = devvp->v_rdev->si_snapdata;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		/*
		 * Reject inodes that are not snapshots, and snapshots
		 * lacking the trailing hint list (old format); either
		 * way the stale entry is squeezed out of fs_snapinum.
		 */
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if (sn != NULL) {

			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = &sn->sn_lock;
		} else {
			VI_UNLOCK(devvp);
			sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
			TAILQ_INIT(&sn->sn_head);
			lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = &sn->sn_lock;
			devvp->v_rdev->si_snapdata = sn;
		}
		/* Move any sleepers from the private lock to the shared one. */
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	/* The list size word is stored at the very end of the snapshot file. */
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	/* Back up so the size word is re-read as element 0 of the list. */
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	sn->sn_listsize = snaplistsize;
	sn->sn_blklist = (daddr_t *)snapblklist;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snapdata *sn;
	struct inode *xp;
	struct vnode *vp;

	sn = devvp->v_rdev->si_snapdata;
	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(&sn->sn_head)) != 0) {
		vp = ITOV(xp);
		/* Give the vnode back its private lock. */
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		/*
		 * Drop the reference presumably held for linked snapshots;
		 * unlinked ones (i_effnlink == 0) apparently hold none.
		 * NOTE(review): confirm against the snapshot creation path.
		 */
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	/* Release the preallocated-block hint list, if any. */
	if (sn->sn_blklist != NULL) {
		FREE(sn->sn_blklist, M_UFSMNT);
		sn->sn_blklist = NULL;
		sn->sn_listsize = 0;
	}
	lockdestroy(&sn->sn_lock);
	free(sn, M_UFSMNT);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	/* Disarm the copy-on-write hook on the device. */
	devvp->v_rdev->si_copyonwrite = 0;
	devvp->v_rdev->si_snapdata = NULL;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snapdata *sn;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	ip = TAILQ_FIRST(&sn->sn_head);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = sn->sn_blklist;
	upper = sn->sn_listsize - 1;
	/*
	 * Element 0 of sn_blklist holds the list size word (see
	 * ffs_snapshot_mount), so the binary search starts at index 1.
	 */
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero mapping means already copied or not wanted. */
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 *
 * The read is issued directly against the underlying device via
 * physio(), bypassing the snapshot vnode, at the device offset
 * corresponding to logical block lbn of the snapshot file.
 */
static int
readblock(vp, bp, lbn)
	struct vnode *vp;
	struct buf *bp;
	ufs2_daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}
|