ffs_snapshot.c revision 1.9
1/* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 * 35 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 36 */ 37 38#include <sys/cdefs.h> 39__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.9 2005/02/09 16:05:29 hannken Exp $"); 40 41#if defined(_KERNEL_OPT) 42#include "opt_ffs.h" 43#endif 44 45#include <sys/param.h> 46#include <sys/kernel.h> 47#include <sys/systm.h> 48#include <sys/conf.h> 49#include <sys/buf.h> 50#include <sys/proc.h> 51#include <sys/namei.h> 52#include <sys/sched.h> 53#include <sys/stat.h> 54#include <sys/malloc.h> 55#include <sys/mount.h> 56#include <sys/resource.h> 57#include <sys/resourcevar.h> 58#include <sys/vnode.h> 59 60#include <miscfs/specfs/specdev.h> 61 62#include <ufs/ufs/quota.h> 63#include <ufs/ufs/ufsmount.h> 64#include <ufs/ufs/inode.h> 65#include <ufs/ufs/ufs_extern.h> 66#include <ufs/ufs/ufs_bswap.h> 67 68#include <ufs/ffs/fs.h> 69#include <ufs/ffs/ffs_extern.h> 70 71/* FreeBSD -> NetBSD conversion */ 72#define KERNCRED proc0.p_ucred 73#define ufs1_daddr_t int32_t 74#define ufs2_daddr_t int64_t 75#define ufs_lbn_t daddr_t 76#define VI_MTX(v) (&(v)->v_interlock) 77#define VI_LOCK(v) simple_lock(&(v)->v_interlock) 78#define VI_UNLOCK(v) simple_unlock(&(v)->v_interlock) 79#define MNT_ILOCK(v) simple_lock(&mntvnode_slock) 80#define MNT_IUNLOCK(v) simple_unlock(&mntvnode_slock) 81 82static int cgaccount(int, struct vnode *, caddr_t, int); 83static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 84 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 85 ufs_lbn_t, int), int); 86static int indiracct_ufs1(struct vnode *, struct vnode *, int, 87 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 88 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 89 ufs_lbn_t, int), int); 90static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 91 struct fs *, ufs_lbn_t, int); 92static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, 
ufs1_daddr_t *, 93 struct fs *, ufs_lbn_t, int); 94static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 95 struct fs *, ufs_lbn_t, int); 96static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 97 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 98 ufs_lbn_t, int), int); 99static int indiracct_ufs2(struct vnode *, struct vnode *, int, 100 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 101 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 102 ufs_lbn_t, int), int); 103static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 104 struct fs *, ufs_lbn_t, int); 105static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 106 struct fs *, ufs_lbn_t, int); 107static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 108 struct fs *, ufs_lbn_t, int); 109static int ffs_copyonwrite(void *, struct buf *); 110static int readfsblk(struct vnode *, caddr_t, ufs2_daddr_t); 111static int readvnblk(struct vnode *, caddr_t, ufs2_daddr_t); 112static int writevnblk(struct vnode *, caddr_t, ufs2_daddr_t); 113static inline int cow_enter(void); 114static inline void cow_leave(int); 115static inline ufs2_daddr_t db_get(struct inode *, int); 116static inline void db_assign(struct inode *, int, ufs2_daddr_t); 117static inline ufs2_daddr_t idb_get(struct inode *, caddr_t, int); 118static inline void idb_assign(struct inode *, caddr_t, int, ufs2_daddr_t); 119 120#ifdef DEBUG 121static int snapdebug = 0; 122#endif 123 124/* 125 * Create a snapshot file and initialize it for the filesystem. 126 * Vnode is locked on entry and return. 
127 */ 128int 129ffs_snapshot(mp, vp, ctime) 130 struct mount *mp; 131 struct vnode *vp; 132 struct timespec *ctime; 133{ 134 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist; 135 int error, ns, cg, snaploc; 136 int i, size, len, loc; 137 int flag = mp->mnt_flag; 138 struct timeval starttime; 139#ifdef DEBUG 140 struct timeval endtime; 141#endif 142 struct timespec ts; 143 long redo = 0; 144 int32_t *lp; 145 void *space; 146 caddr_t cgbuf; 147 struct ufsmount *ump = VFSTOUFS(mp); 148 struct fs *copy_fs = NULL, *fs = ump->um_fs; 149 struct proc *p = curproc; 150 struct inode *ip, *xp; 151 struct buf *bp, *ibp; 152 struct vattr vat; 153 struct vnode *xvp, *nvp, *devvp; 154 155 ns = UFS_FSNEEDSWAP(fs); 156 /* 157 * Need to serialize access to snapshot code per filesystem. 158 */ 159 /* 160 * If the vnode already is a snapshot, return. 161 */ 162 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 163 if (ctime) { 164 ctime->tv_sec = DIP(VTOI(vp), mtime); 165 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 166 } 167 return 0; 168 } 169 /* 170 * Check mount, exclusive reference and owner. 171 */ 172 if (vp->v_mount != mp) 173 return EXDEV; 174 if (vp->v_usecount != 1 || vp->v_writecount != 0) 175 return EBUSY; 176 if (suser(p->p_ucred, &p->p_acflag) != 0 && 177 VTOI(vp)->i_uid != p->p_ucred->cr_uid) 178 return EACCES; 179 180 if (vp->v_size != 0) { 181 error = VOP_TRUNCATE(vp, 0, 0, NOCRED, p); 182 if (error) 183 return error; 184 } 185 /* 186 * Assign a snapshot slot in the superblock. 187 */ 188 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 189 if (fs->fs_snapinum[snaploc] == 0) 190 break; 191 if (snaploc == FSMAXSNAP) 192 return (ENOSPC); 193 ip = VTOI(vp); 194 devvp = ip->i_devvp; 195 /* 196 * Allocate and copy the last block contents so as to be able 197 * to set size to that of the filesystem. 
198 */ 199 numblks = howmany(fs->fs_size, fs->fs_frag); 200 cgbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 201 if ((error = readfsblk(vp, cgbuf, numblks - 1)) != 0) 202 goto out; 203 error = vn_rdwr(UIO_WRITE, vp, 204 cgbuf, fs->fs_bsize, lblktosize(fs, (off_t)(numblks - 1)), 205 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL); 206 if (error) 207 goto out; 208 /* 209 * Preallocate critical data structures so that we can copy 210 * them in without further allocation after we suspend all 211 * operations on the filesystem. We would like to just release 212 * the allocated buffers without writing them since they will 213 * be filled in below once we are ready to go, but this upsets 214 * the soft update code, so we go ahead and write the new buffers. 215 * 216 * Allocate all indirect blocks and mark all of them as not 217 * needing to be copied. 218 */ 219 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 220 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 221 fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); 222 if (error) 223 goto out; 224 bwrite(ibp); 225 } 226 /* 227 * Allocate copies for the superblock and its summary information. 228 */ 229 bzero(cgbuf, fs->fs_bsize); 230 blkno = lblkno(fs, fs->fs_sblockloc); 231 for (loc = 0; loc < howmany(fs->fs_sbsize, fs->fs_bsize); loc++) 232 if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0) 233 goto out; 234 blkno = fragstoblks(fs, fs->fs_csaddr); 235 for (loc = 0; loc < howmany(fs->fs_cssize, fs->fs_bsize); loc++) 236 if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0) 237 goto out; 238 /* 239 * Allocate all cylinder group blocks. 240 */ 241 for (cg = 0; cg < fs->fs_ncg; cg++) 242 if ((error = writevnblk(vp, cgbuf, 243 fragstoblks(fs, cgtod(fs, cg)))) != 0) 244 goto out; 245 /* 246 * Copy all the cylinder group maps. Although the 247 * filesystem is still active, we hope that only a few 248 * cylinder groups will change between now and when we 249 * suspend operations. 
Thus, we will be able to quickly 250 * touch up the few cylinder groups that changed during 251 * the suspension period. 252 */ 253 len = howmany(fs->fs_ncg, NBBY); 254 MALLOC(fs->fs_active, u_char *, len, M_DEVBUF, M_WAITOK | M_ZERO); 255 for (cg = 0; cg < fs->fs_ncg; cg++) { 256 if ((error = cgaccount(cg, vp, cgbuf, 1)) != 0) 257 goto out; 258 if ((error = writevnblk(vp, cgbuf, 259 fragstoblks(fs, cgtod(fs, cg)))) != 0) 260 goto out; 261 } 262 /* 263 * Change inode to snapshot type file. 264 */ 265 ip->i_flags |= SF_SNAPSHOT; 266 DIP_ASSIGN(ip, flags, ip->i_flags); 267 ip->i_flag |= IN_CHANGE | IN_UPDATE; 268 /* 269 * Ensure that the snapshot is completely on disk. 270 * Since we have marked it as a snapshot it is safe to 271 * unlock it as no process will be allowed to write to it. 272 */ 273 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0, p)) != 0) 274 goto out; 275 VOP_UNLOCK(vp, 0); 276 /* 277 * All allocations are done, so we can now snapshot the system. 278 * 279 * Suspend operation on filesystem. 280 */ 281 if ((error = vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) != 0) { 282 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 283 goto out; 284 } 285 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 286 microtime(&starttime); 287 /* 288 * First, copy all the cylinder group maps that have changed. 289 */ 290 for (cg = 0; cg < fs->fs_ncg; cg++) { 291 if (ACTIVECG_ISSET(fs, cg)) 292 continue; 293 redo++; 294 if ((error = cgaccount(cg, vp, cgbuf, 2)) != 0) 295 goto out1; 296 if ((error = writevnblk(vp, cgbuf, 297 fragstoblks(fs, cgtod(fs, cg)))) != 0) 298 goto out1; 299 } 300 /* 301 * Grab a copy of the superblock and its summary information. 302 * We delay writing it until the suspension is released below. 303 */ 304 loc = blkoff(fs, fs->fs_sblockloc); 305 if (loc > 0) 306 bzero(&cgbuf[0], loc); 307 copy_fs = (struct fs *)(cgbuf + loc); 308 bcopy(fs, copy_fs, fs->fs_sbsize); 309 size = fs->fs_bsize < SBLOCKSIZE ? 
fs->fs_bsize : SBLOCKSIZE; 310 if (fs->fs_sbsize < size) 311 bzero(&cgbuf[loc + fs->fs_sbsize], size - fs->fs_sbsize); 312 size = blkroundup(fs, fs->fs_cssize); 313 if (fs->fs_contigsumsize > 0) 314 size += fs->fs_ncg * sizeof(int32_t); 315 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 316 copy_fs->fs_csp = space; 317 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 318 (char *)space += fs->fs_cssize; 319 loc = howmany(fs->fs_cssize, fs->fs_fsize); 320 i = fs->fs_frag - loc % fs->fs_frag; 321 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 322 if (len > 0) { 323 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 324 len, KERNCRED, &bp)) != 0) { 325 brelse(bp); 326 free(copy_fs->fs_csp, M_UFSMNT); 327 goto out1; 328 } 329 bcopy(bp->b_data, space, (u_int)len); 330 (char *)space += len; 331 bp->b_flags |= B_INVAL | B_NOCACHE; 332 brelse(bp); 333 } 334 if (fs->fs_contigsumsize > 0) { 335 copy_fs->fs_maxcluster = lp = space; 336 for (i = 0; i < fs->fs_ncg; i++) 337 *lp++ = fs->fs_contigsumsize; 338 } 339 /* 340 * We must check for active files that have been unlinked 341 * (e.g., with a zero link count). We have to expunge all 342 * trace of these files from the snapshot so that they are 343 * not reclaimed prematurely by fsck or unnecessarily dumped. 344 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 345 * spec_strategy about writing on a suspended filesystem. 346 * Note that we skip unlinked snapshot files as they will 347 * be handled separately below. 348 * 349 * We also calculate the needed size for the snapshot list. 350 */ 351 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 352 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 353 MNT_ILOCK(mp); 354loop: 355 for (xvp = LIST_FIRST(&mp->mnt_vnodelist); xvp; xvp = nvp) { 356 /* 357 * Make sure this vnode wasn't reclaimed in getnewvnode(). 358 * Start over if it has (it won't be on the list anymore). 
359 */ 360 if (xvp->v_mount != mp) 361 goto loop; 362 nvp = LIST_NEXT(xvp, v_mntvnodes); 363 VI_LOCK(xvp); 364 MNT_IUNLOCK(mp); 365 if ((xvp->v_flag & VXLOCK) || 366 xvp->v_usecount == 0 || xvp->v_type == VNON || 367 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 368 VI_UNLOCK(xvp); 369 MNT_ILOCK(mp); 370 continue; 371 } 372 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { 373 MNT_ILOCK(mp); 374 goto loop; 375 } 376#ifdef DEBUG 377 if (snapdebug) 378 vprint("ffs_snapshot: busy vnode", xvp); 379#endif 380 if (VOP_GETATTR(xvp, &vat, p->p_ucred, p) == 0 && 381 vat.va_nlink > 0) { 382 VOP_UNLOCK(xvp, 0); 383 MNT_ILOCK(mp); 384 continue; 385 } 386 xp = VTOI(xvp); 387 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 388 VOP_UNLOCK(xvp, 0); 389 MNT_ILOCK(mp); 390 continue; 391 } 392 /* 393 * If there is a fragment, clear it here. 394 */ 395 blkno = 0; 396 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 397 if (loc < NDADDR) { 398 len = fragroundup(fs, blkoff(fs, xp->i_size)); 399 if (len > 0 && len < fs->fs_bsize) { 400 ffs_blkfree(copy_fs, vp, db_get(xp, loc), 401 len, xp->i_number); 402 blkno = db_get(xp, loc); 403 db_assign(xp, loc, 0); 404 } 405 } 406 snaplistsize += 1; 407 if (xp->i_ump->um_fstype == UFS1) 408 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 409 BLK_NOCOPY); 410 else 411 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 412 BLK_NOCOPY); 413 if (blkno) 414 db_assign(xp, loc, blkno); 415 if (!error) 416 error = ffs_freefile(copy_fs, vp, xp->i_number, 417 xp->i_mode); 418 VOP_UNLOCK(xvp, 0); 419 if (error) { 420 free(copy_fs->fs_csp, M_UFSMNT); 421 goto out1; 422 } 423 MNT_ILOCK(mp); 424 } 425 MNT_IUNLOCK(mp); 426 /* 427 * If there already exist snapshots on this filesystem, grab a 428 * reference to their shared lock. If this is the first snapshot 429 * on this filesystem, we need to allocate a lock for the snapshots 430 * to share. In either case, acquire the snapshot lock and give 431 * up our original private lock. 
432 */ 433 VI_LOCK(devvp); 434 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 435 struct lock *lkp; 436 437 lkp = ITOV(xp)->v_vnlock; 438 VI_UNLOCK(devvp); 439 VI_LOCK(vp); 440 vp->v_vnlock = lkp; 441 } else { 442 struct lock *lkp; 443 444 VI_UNLOCK(devvp); 445 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 446 M_WAITOK); 447 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 448 VI_LOCK(vp); 449 vp->v_vnlock = lkp; 450 } 451 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 452 transferlockers(&vp->v_lock, vp->v_vnlock); 453 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 454 /* 455 * If this is the first snapshot on this filesystem, then we need 456 * to allocate the space for the list of preallocated snapshot blocks. 457 * This list will be refined below, but this preliminary one will 458 * keep us out of deadlock until the full one is ready. 459 */ 460 if (xp == NULL) { 461 MALLOC(snapblklist, ufs2_daddr_t *, 462 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 463 blkp = &snapblklist[1]; 464 *blkp++ = ufs_rw64(lblkno(fs, fs->fs_sblockloc), ns); 465 blkno = fragstoblks(fs, fs->fs_csaddr); 466 for (cg = 0; cg < fs->fs_ncg; cg++) { 467 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 468 break; 469 *blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns); 470 } 471 len = howmany(fs->fs_cssize, fs->fs_bsize); 472 for (loc = 0; loc < len; loc++) 473 *blkp++ = ufs_rw64(blkno + loc, ns); 474 for (; cg < fs->fs_ncg; cg++) 475 *blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns); 476 snapblklist[0] = ufs_rw64(blkp - snapblklist, ns); 477 VI_LOCK(devvp); 478 if (ump->um_snapblklist != NULL) 479 panic("ffs_snapshot: non-empty list"); 480 ump->um_snapblklist = snapblklist; 481 ump->um_snaplistsize = blkp - snapblklist; 482 VI_UNLOCK(devvp); 483 } 484 /* 485 * Record snapshot inode. Since this is the newest snapshot, 486 * it must be placed at the end of the list. 
487 */ 488 VI_LOCK(devvp); 489 fs->fs_snapinum[snaploc] = ip->i_number; 490 if (ip->i_nextsnap.tqe_prev != 0) 491 panic("ffs_snapshot: %d already on list", ip->i_number); 492 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 493 VI_UNLOCK(devvp); 494 if (xp == NULL) 495 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 496 vp->v_flag |= VSYSTEM; 497out1: 498 /* 499 * Resume operation on filesystem. 500 */ 501 vfs_write_resume(vp->v_mount); 502 /* 503 * Set the mtime to the time the snapshot has been taken. 504 */ 505 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 506 if (ctime) 507 *ctime = ts; 508 DIP_ASSIGN(ip, mtime, ts.tv_sec); 509 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 510 ip->i_flag |= IN_CHANGE | IN_UPDATE; 511 512#ifdef DEBUG 513 if (starttime.tv_sec > 0) { 514 microtime(&endtime); 515 timersub(&endtime, &starttime, &endtime); 516 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 517 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 518 endtime.tv_usec / 1000, redo, fs->fs_ncg); 519 } 520#endif 521 if (error) 522 goto out; 523 /* 524 * Copy allocation information from all the snapshots in 525 * this snapshot and then expunge them from its view. 526 */ 527 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) { 528 if (xp == ip) 529 break; 530 if (xp->i_ump->um_fstype == UFS1) 531 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 532 BLK_SNAP); 533 else 534 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 535 BLK_SNAP); 536 if (error) { 537 fs->fs_snapinum[snaploc] = 0; 538 goto done; 539 } 540 } 541 /* 542 * Allocate space for the full list of preallocated snapshot blocks. 543 */ 544 MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t), 545 M_UFSMNT, M_WAITOK); 546 ip->i_snapblklist = &snapblklist[1]; 547 /* 548 * Expunge the blocks used by the snapshots from the set of 549 * blocks marked as used in the snapshot bitmaps. Also, collect 550 * the list of allocated blocks in i_snapblklist. 
551 */ 552 if (ip->i_ump->um_fstype == UFS1) 553 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 554 else 555 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 556 if (error) { 557 fs->fs_snapinum[snaploc] = 0; 558 FREE(snapblklist, M_UFSMNT); 559 goto done; 560 } 561 if (snaplistsize < ip->i_snapblklist - snapblklist) 562 panic("ffs_snapshot: list too small"); 563 snaplistsize = ip->i_snapblklist - snapblklist; 564 snapblklist[0] = ufs_rw64(snaplistsize, ns); 565 ip->i_snapblklist = 0; 566 /* 567 * Write out the list of allocated blocks to the end of the snapshot. 568 */ 569 error = vn_rdwr(UIO_WRITE, vp, 570 (caddr_t)snapblklist, snaplistsize*sizeof(ufs2_daddr_t), ip->i_size, 571 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL); 572 if (error) { 573 fs->fs_snapinum[snaploc] = 0; 574 FREE(snapblklist, M_UFSMNT); 575 goto done; 576 } 577 /* 578 * Write the superblock and its summary information 579 * to the snapshot. 580 */ 581 blkno = fragstoblks(fs, fs->fs_csaddr); 582 len = howmany(fs->fs_cssize, fs->fs_bsize); 583 space = copy_fs->fs_csp; 584#ifdef FFS_EI 585 if (ns) { 586 ffs_sb_swap(copy_fs, copy_fs); 587 ffs_csum_swap(space, space, fs->fs_cssize); 588 } 589#endif 590 for (loc = 0; loc < len; loc++) { 591 if ((error = writevnblk(vp, space, blkno + loc)) != 0) { 592 fs->fs_snapinum[snaploc] = 0; 593 FREE(snapblklist, M_UFSMNT); 594 goto done; 595 } 596 space = (char *)space + fs->fs_bsize; 597 } 598 /* 599 * As this is the newest list, it is the most inclusive, so 600 * should replace the previous list. 
601 */ 602 VI_LOCK(devvp); 603 space = ump->um_snapblklist; 604 ump->um_snapblklist = snapblklist; 605 ump->um_snaplistsize = snaplistsize; 606 VI_UNLOCK(devvp); 607 if (space != NULL) 608 FREE(space, M_UFSMNT); 609done: 610 free(copy_fs->fs_csp, M_UFSMNT); 611 blkno = lblkno(fs, fs->fs_sblockloc); 612 if (error == 0 && (error = writevnblk(vp, cgbuf, blkno)) != 0) 613 fs->fs_snapinum[snaploc] = 0; 614out: 615 /* 616 * All block address modifications are done. Invalidate and free 617 * all pages on the snapshot vnode. Those coming from read ahead 618 * are no longer valid. 619 */ 620 if (!error) { 621 simple_lock(&vp->v_interlock); 622 error = VOP_PUTPAGES(vp, 0, 0, 623 PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 624 } 625 if (cgbuf) 626 free(cgbuf, M_UFSMNT); 627 if (fs->fs_active != 0) { 628 FREE(fs->fs_active, M_DEVBUF); 629 fs->fs_active = 0; 630 } 631 mp->mnt_flag = flag; 632 if (error) 633 (void) VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, p); 634 else 635 vref(vp); 636 return (error); 637} 638 639/* 640 * Copy a cylinder group map. All the unallocated blocks are marked 641 * BLK_NOCOPY so that the snapshot knows that it need not copy them 642 * if they are later written. If passno is one, then this is a first 643 * pass, so only setting needs to be done. If passno is 2, then this 644 * is a revision to a previous pass which must be undone as the 645 * replacement pass is done. 
646 */ 647static int 648cgaccount(cg, vp, data, passno) 649 int cg; 650 struct vnode *vp; 651 caddr_t data; 652 int passno; 653{ 654 struct buf *bp, *ibp; 655 struct inode *ip; 656 struct cg *cgp; 657 struct fs *fs; 658 ufs2_daddr_t base, numblks; 659 int error, len, loc, ns, indiroff; 660 661 ip = VTOI(vp); 662 fs = ip->i_fs; 663 ns = UFS_FSNEEDSWAP(fs); 664 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 665 (int)fs->fs_cgsize, KERNCRED, &bp); 666 if (error) { 667 brelse(bp); 668 return (error); 669 } 670 cgp = (struct cg *)bp->b_data; 671 if (!cg_chkmagic(cgp, ns)) { 672 brelse(bp); 673 return (EIO); 674 } 675 ACTIVECG_SET(fs, cg); 676 677 bcopy(bp->b_data, data, fs->fs_cgsize); 678 brelse(bp); 679 if (fs->fs_cgsize < fs->fs_bsize) 680 bzero(&data[fs->fs_cgsize], 681 fs->fs_bsize - fs->fs_cgsize); 682 numblks = howmany(fs->fs_size, fs->fs_frag); 683 len = howmany(fs->fs_fpg, fs->fs_frag); 684 base = cg * fs->fs_fpg / fs->fs_frag; 685 if (base + len >= numblks) 686 len = numblks - base - 1; 687 loc = 0; 688 if (base < NDADDR) { 689 for ( ; loc < NDADDR; loc++) { 690 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 691 db_assign(ip, loc, BLK_NOCOPY); 692 else if (db_get(ip, loc) == BLK_NOCOPY) { 693 if (passno == 2) 694 db_assign(ip, loc, 0); 695 else if (passno == 1) 696 panic("ffs_snapshot: lost direct block"); 697 } 698 } 699 } 700 if ((error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 701 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 702 return (error); 703 indiroff = (base + loc - NDADDR) % NINDIR(fs); 704 for ( ; loc < len; loc++, indiroff++) { 705 if (indiroff >= NINDIR(fs)) { 706 bwrite(ibp); 707 if ((error = VOP_BALLOC(vp, 708 lblktosize(fs, (off_t)(base + loc)), 709 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 710 return (error); 711 indiroff = 0; 712 } 713 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 714 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 715 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 716 if 
(passno == 2) 717 idb_assign(ip, ibp->b_data, indiroff, 0); 718 else if (passno == 1) 719 panic("ffs_snapshot: lost indirect block"); 720 } 721 } 722 bwrite(ibp); 723 return (0); 724} 725 726/* 727 * Before expunging a snapshot inode, note all the 728 * blocks that it claims with BLK_SNAP so that fsck will 729 * be able to account for those blocks properly and so 730 * that this snapshot knows that it need not copy them 731 * if the other snapshot holding them is freed. This code 732 * is reproduced once each for UFS1 and UFS2. 733 */ 734static int 735expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 736 struct vnode *snapvp; 737 struct inode *cancelip; 738 struct fs *fs; 739 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 740 struct fs *, ufs_lbn_t, int); 741 int expungetype; 742{ 743 int i, s, error, ns, indiroff; 744 ufs_lbn_t lbn, rlbn; 745 ufs2_daddr_t len, blkno, numblks, blksperindir; 746 struct ufs1_dinode *dip; 747 struct buf *bp; 748 caddr_t buf; 749 750 ns = UFS_FSNEEDSWAP(fs); 751 /* 752 * Prepare to expunge the inode. If its inode block has not 753 * yet been copied, then allocate and fill the copy. 754 */ 755 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 756 blkno = 0; 757 if (lbn < NDADDR) { 758 blkno = db_get(VTOI(snapvp), lbn); 759 } else { 760 s = cow_enter(); 761 error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 762 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 763 cow_leave(s); 764 if (error) 765 return (error); 766 indiroff = (lbn - NDADDR) % NINDIR(fs); 767 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 768 brelse(bp); 769 } 770 buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 771 if (blkno != 0) 772 error = readvnblk(snapvp, buf, lbn); 773 else 774 error = readfsblk(snapvp, buf, lbn); 775 if (error) { 776 free(buf, M_UFSMNT); 777 return error; 778 } 779 /* 780 * Set a snapshot inode to be a zero length file, regular files 781 * to be completely unallocated. 
782 */ 783 dip = (struct ufs1_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number); 784 if (expungetype == BLK_NOCOPY) 785 dip->di_mode = 0; 786 dip->di_size = 0; 787 dip->di_blocks = 0; 788 dip->di_flags = 789 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 790 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 791 error = writevnblk(snapvp, buf, lbn); 792 free(buf, M_UFSMNT); 793 if (error) 794 return error; 795 /* 796 * Now go through and expunge all the blocks in the file 797 * using the function requested. 798 */ 799 numblks = howmany(cancelip->i_size, fs->fs_bsize); 800 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0], 801 &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype))) 802 return (error); 803 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0], 804 &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype))) 805 return (error); 806 blksperindir = 1; 807 lbn = -NDADDR; 808 len = numblks - NDADDR; 809 rlbn = NDADDR; 810 for (i = 0; len > 0 && i < NIADDR; i++) { 811 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 812 ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len, 813 blksperindir, fs, acctfunc, expungetype); 814 if (error) 815 return (error); 816 blksperindir *= NINDIR(fs); 817 lbn -= blksperindir + 1; 818 len -= blksperindir; 819 rlbn += blksperindir; 820 } 821 return (0); 822} 823 824/* 825 * Descend an indirect block chain for vnode cancelvp accounting for all 826 * its indirect blocks in snapvp. 
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot being built */
	struct vnode *cancelvp;	/* vnode whose indirect chain is walked */
	int level;		/* remaining indirection levels below us */
	ufs1_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* logical block number of this indirect */
	ufs_lbn_t rlbn;		/* first data lbn covered by this indirect */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks covered per pointer here */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		/* A missing indirect is only legal when expunging NOCOPY. */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Copy the pointers out so the buffer can be released early. */
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	brelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level,
		    ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	/* Snap accounting first; map accounting only if that succeeds. */
	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;	/* snapshot vnode being annotated */
	ufs1_daddr_t *oldblkp, *lastblkp;	/* range of pointers to scan */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw32(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_ffs1_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw32(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (blkno != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = ufs_rw32(expungetype, ns);
			if (lbn >= NDADDR)
				bwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;	/* range of pointers to scan */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* lbn of first pointer; -1 disables listing */
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw32(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = ufs_rw64(lblkno, ns);
		/* BLK_SNAP entries free the block at its logical address. */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot doing the expunging */
	struct inode *cancelip;	/* inode being expunged from snapvp's view */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int i, s, error, ns, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct buf *bp;
	caddr_t buf;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = db_get(VTOI(snapvp), lbn);
	} else {
		/* cow_enter() suppresses recursive copy-on-write. */
		s = cow_enter();
		error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		cow_leave(s);
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff);
		brelse(bp);
	}
	buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	/*
	 * blkno != 0 means the snapshot already holds a copy of the
	 * inode block; otherwise read it from the filesystem proper.
	 */
	if (blkno != 0)
		error = readvnblk(snapvp, buf, lbn);
	else
		error = readfsblk(snapvp, buf, lbn);
	if (error) {
		free(buf, M_UFSMNT);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	error = writevnblk(snapvp, buf, lbn);
	free(buf, M_UFSMNT);
	if (error)
		return error;
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct blocks: logical offsets 0..NDADDR-1. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0],
	    &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype)))
		return (error);
	/* Indirect pointers themselves; lblkno -1 disables list recording. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0],
	    &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;	/* negative lbns name indirect blocks */
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot doing the accounting */
	struct vnode *cancelvp;	/* vnode whose blocks are being accounted */
	int level;		/* remaining levels of indirection */
	ufs2_daddr_t blkno;	/* physical block of this indirect block */
	ufs_lbn_t lbn;		/* (negative) logical block of indirect blk */
	ufs_lbn_t rlbn;		/* first data lbn covered by this subtree */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks per pointer at this level */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		/* A hole is fine when fully expunging; otherwise fatal. */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Copy out so the buffer can be released before recursing. */
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	brelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level,
		    ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 *
 * Convenience wrapper used as an acctfunc callback: first mark the
 * blocks in the snapshot inode (snapacct), then release them back to
 * the cylinder group maps (mapacct).  Returns the first error seen.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	/*
	 * For every block in [*oldblkp, *lastblkp) claimed by the inode
	 * being expunged, tag the corresponding slot of snapshot vp with
	 * expungetype so vp knows it need not copy that block.
	 */
	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw64(*oldblkp, ns);
		/* Skip holes and already-tagged entries. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Direct block: point at the in-core inode slot. */
			blkp = &ip->i_ffs2_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/*
			 * Indirect block: blkp aliases ibp->b_data, so ibp
			 * must stay held until the slot is updated below.
			 */
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw64(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (blkno != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = ufs_rw64(expungetype, ns);
			/* Synchronous write keeps the snapshot consistent. */
			if (lbn >= NDADDR)
				bwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 marks the indirect-pointer pass: no list recording. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw64(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = ufs_rw64(lblkno, ns);
		/*
		 * A self-claimed block (BLK_SNAP) is stored at its own
		 * logical block number; recover the physical address.
		 */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct ufsmount *ump = VFSTOUFS(ip->i_devvp->v_specmountpoint);
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the hole. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip = VTOI(vp);
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct lock *lkp;
	struct buf *ibp;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, ns, loc, last;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		/* lockmgr() with LK_INTERLOCK drops the interlock ... */
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp));
		/* ... so it must be re-taken before touching the list. */
		VI_LOCK(devvp);
		TAILQ_REMOVE(&ump->um_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Give up the shared snapshot lock, revert to private. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL);
		if (TAILQ_FIRST(&ump->um_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/* Last snapshot: tear down COW and the hints list. */
			snapblklist = ump->um_snapblklist;
			ump->um_snapblklist = 0;
			ump->um_snaplistsize = 0;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
			lockmgr(lkp, LK_RELEASE, NULL);
			vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bwrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks.
 * Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;	/* device vnode the snapshots hang off */
	ufs2_daddr_t bno;	/* fragment address of block being freed */
	long size;		/* size of the free (may be a fragment) */
	ino_t inum;		/* inode the block is being freed from */
{
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL, *saved_vp = NULL;
	caddr_t saved_data = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int s, indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			/*
			 * Take the shared snapshot lock before BALLOC;
			 * LK_SLEEPFAIL means a failed sleep restarts the
			 * whole scan from retry.
			 */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			    VI_MTX(devvp)) != 0)
				goto retry;
			snapshot_locked = 1;
			s = cow_enter();
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			cow_leave(s);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			/*
			 * LK_NOWAIT here: on contention, release the
			 * indirect buffer, wait for the lock the slow
			 * way, and rescan from the top.
			 */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			    VI_MTX(devvp)) != 0) {
				if (lbn >= NDADDR)
					brelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				bwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		    VI_MTX(devvp)) != 0) {
			if (lbn >= NDADDR)
				brelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %" PRId64 " from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    lbn, inum);
#endif
			if (lbn < NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				bwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0);
			/* Non-zero: we claimed the block, caller must not free. */
			return (1);
		}
		if (lbn >= NDADDR)
			brelse(ibp);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %" PRId64 " %s %d size %ld\n",
			    "Copyonremove: snapino ", ip->i_number,
			    lbn, "for inum", inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (saved_data) {
			error = writevnblk(vp, saved_data, lbn);
			if (error)
				break;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
		saved_vp = vp;
		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
			free(saved_data, M_UFSMNT);
			saved_data = NULL;
			break;
		}
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (saved_data) {
		error = writevnblk(saved_vp, saved_data, lbn);
		free(saved_data, M_UFSMNT);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct proc *p = curproc;
	struct vnode *vp;
	struct inode *ip, *xp;
	ufs2_daddr_t snaplistsize, *snapblklist;
	int error, ns, snaploc, loc;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before VOP_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			/* Stale entry: drop it and compact the table. */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) {
			struct lock *lkp;

			lkp = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap);
		vp->v_flag |= VSYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* The list is stored just past the end of the filesystem image. */
	error = vn_rdwr(UIO_READ, vp,
	    (caddr_t)&snaplistsize, sizeof(snaplistsize),
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
	if (error) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0);
		return;
	}
	snaplistsize = ufs_rw64(snaplistsize, ns);
	MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t),
	    M_UFSMNT, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp,
	    (caddr_t)snapblklist, snaplistsize * sizeof(ufs2_daddr_t),
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
	if (error) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0);
	VI_LOCK(devvp);
	ump->um_snaplistsize = snaplistsize;
	ump->um_snapblklist = snapblklist;
	VI_UNLOCK(devvp);
	vn_cow_establish(devvp, ffs_copyonwrite, devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(&ump->um_snapshots)) != 0) {
		vp = ITOV(xp);
		/* Revert each snapshot from the shared lock to its own. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&ump->um_snapshots, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_ffs_effnlink > 0) {
			/* vrele() may sleep; drop the interlock around it. */
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (ump->um_snapblklist != NULL) {
		FREE(ump->um_snapblklist, M_UFSMNT);
		ump->um_snapblklist = NULL;
		ump->um_snaplistsize = 0;
	}
	VI_UNLOCK(devvp);
	if (lkp != NULL) {
		/* At least one snapshot existed: tear down COW hook. */
		vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
		FREE(lkp, M_UFSMNT);
	}
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(v, bp)
	void *v;		/* device vnode (registered cookie) */
	struct buf *bp;		/* buffer about to be written */
{
	struct buf *ibp;
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = 0, *saved_vp = NULL;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	caddr_t saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, s, ns, indiroff, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	VI_LOCK(devvp);
	ip = TAILQ_FIRST(&ump->um_snapshots);
	if (ip == NULL) {
		VI_UNLOCK(devvp);
		return 0;
	}
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = ump->um_snapblklist;
	/* Binary search; entry 0 is skipped (assumed header/size slot). */
	upper = ump->um_snaplistsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (ufs_rw64(snapblklist[mid], ns) == lbn)
			break;
		if (ufs_rw64(snapblklist[mid], ns) < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		/* Found: block already accounted for, no copy needed. */
		VI_UNLOCK(devvp);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in VOP_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			    VI_MTX(devvp)) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			s = cow_enter();
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			cow_leave(s);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			brelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
#ifdef DIAGNOSTIC
		if (curlwp->l_flag & L_COWINPROGRESS)
			printf("ffs_copyonwrite: recursive call\n");
#endif
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		    VI_MTX(devvp)) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %" PRId64 " for ",
			    ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (saved_data) {
			error = writevnblk(vp, saved_data, lbn);
			if (error)
				break;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
		saved_vp = vp;
		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
			free(saved_data, M_UFSMNT);
			saved_data = NULL;
			break;
		}
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (saved_data) {
		error = writevnblk(saved_vp, saved_data, lbn);
		free(saved_data, M_UFSMNT);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	else
		VI_UNLOCK(devvp);
	return error;
}

/*
 * Read the specified block from disk. Vp is usually a snapshot vnode.
 */
static int
readfsblk(vp, data, lbn)
	struct vnode *vp;
	caddr_t data;		/* caller-supplied buffer, fs_bsize bytes */
	ufs2_daddr_t lbn;	/* filesystem logical block to read */
{
	int s, error;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *nbp;

	/*
	 * Build a private buffer and go straight to the device driver,
	 * bypassing the buffer cache and UBC.
	 */
	s = splbio();
	nbp = pool_get(&bufpool, PR_WAITOK);
	splx(s);

	BUF_INIT(nbp);
	nbp->b_flags = B_READ;
	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
	nbp->b_error = 0;
	nbp->b_data = data;
	nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
	nbp->b_proc = NULL;
	nbp->b_dev = ip->i_devvp->v_rdev;
	nbp->b_vp = NULLVP;

	DEV_STRATEGY(nbp);

	error = biowait(nbp);

	s = splbio();
	pool_put(&bufpool, nbp);
	splx(s);

	return error;
}

/*
 * Read the specified block. Bypass UBC to prevent deadlocks.
 */
static int
readvnblk(vp, data, lbn)
	struct vnode *vp;
	caddr_t data;		/* caller-supplied buffer, fs_bsize bytes */
	ufs2_daddr_t lbn;	/* logical block within vp */
{
	int error;
	daddr_t bn;
	off_t offset;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;

	error = VOP_BMAP(vp, lbn, NULL, &bn, NULL);
	if (error)
		return error;

	if (bn != (daddr_t)-1) {
		/*
		 * Flush any cached pages for this range first so the
		 * raw device read below sees current data.
		 */
		offset = dbtob(bn);
		simple_lock(&vp->v_interlock);
		error = VOP_PUTPAGES(vp, trunc_page(offset),
		    round_page(offset+fs->fs_bsize),
		    PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
		if (error)
			return error;

		return readfsblk(vp, data, fragstoblks(fs, dbtofsb(fs, bn)));
	}

	/* Hole: a block with no backing store reads as zeroes. */
	bzero(data, fs->fs_bsize);

	return 0;
}

/*
 * Write the specified block. Bypass UBC to prevent deadlocks.
 */
static int
writevnblk(vp, data, lbn)
	struct vnode *vp;
	caddr_t data;		/* source buffer, fs_bsize bytes */
	ufs2_daddr_t lbn;	/* logical block within vp */
{
	int s, error;
	off_t offset;
	struct buf *bp;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;

	offset = lblktosize(fs, (off_t)lbn);
	/*
	 * Flush cached pages, then allocate/look up the block
	 * synchronously; cow_enter() suppresses recursive COW.
	 */
	s = cow_enter();
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, trunc_page(offset),
	    round_page(offset+fs->fs_bsize), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
	if (error == 0)
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_SYNC, &bp);
	cow_leave(s);
	if (error)
		return error;

	bcopy(data, bp->b_data, fs->fs_bsize);
	/* B_NOCACHE: do not keep the block around after the write. */
	bp->b_flags |= B_NOCACHE;

	return bwrite(bp);
}

/*
 * Set/reset lwp's L_COWINPROGRESS flag.
 * May be called recursively.
2085 */ 2086static inline int 2087cow_enter(void) 2088{ 2089 struct lwp *l = curlwp; 2090 2091 if (l->l_flag & L_COWINPROGRESS) { 2092 return 0; 2093 } else { 2094 l->l_flag |= L_COWINPROGRESS; 2095 return L_COWINPROGRESS; 2096 } 2097} 2098 2099static inline void 2100cow_leave(int flag) 2101{ 2102 struct lwp *l = curlwp; 2103 2104 l->l_flag &= ~flag; 2105} 2106 2107/* 2108 * Get/Put direct block from inode or buffer containing disk addresses. Take 2109 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2110 * into a global include. 2111 */ 2112static inline ufs2_daddr_t 2113db_get(struct inode *ip, int loc) 2114{ 2115 if (ip->i_ump->um_fstype == UFS1) 2116 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2117 else 2118 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2119} 2120 2121static inline void 2122db_assign(struct inode *ip, int loc, ufs2_daddr_t val) 2123{ 2124 if (ip->i_ump->um_fstype == UFS1) 2125 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2126 else 2127 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2128} 2129 2130static inline ufs2_daddr_t 2131idb_get(struct inode *ip, caddr_t buf, int loc) 2132{ 2133 if (ip->i_ump->um_fstype == UFS1) 2134 return ufs_rw32(((ufs1_daddr_t *)(buf))[loc], 2135 UFS_IPNEEDSWAP(ip)); 2136 else 2137 return ufs_rw64(((ufs2_daddr_t *)(buf))[loc], 2138 UFS_IPNEEDSWAP(ip)); 2139} 2140 2141static inline void 2142idb_assign(struct inode *ip, caddr_t buf, int loc, ufs2_daddr_t val) 2143{ 2144 if (ip->i_ump->um_fstype == UFS1) 2145 ((ufs1_daddr_t *)(buf))[loc] = 2146 ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2147 else 2148 ((ufs2_daddr_t *)(buf))[loc] = 2149 ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2150} 2151