/* ffs_softdep.c, revision 167737 */
1/*- 2 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * The soft updates code is derived from the appendix of a University 5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 6 * "Soft Updates: A Solution to the Metadata Update Problem in File 7 * Systems", CSE-TR-254-95, August 1995). 8 * 9 * Further information about soft updates can be obtained from: 10 * 11 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 12 * 1614 Oxford Street mckusick@mckusick.com 13 * Berkeley, CA 94709-1608 +1-510-843-9542 14 * USA 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 26 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 27 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 28 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 29 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 30 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
 *
 * from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 167737 2007-03-20 10:51:45Z kib $");

/*
 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
 */
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kdb.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

#include <vm/vm.h>

#include "opt_ffs.h"
#include "opt_quota.h"

#ifndef SOFTUPDATES

/*
 * Stub versions of the softdep entry points, used when the kernel is
 * built without SOFTUPDATES.  Entry points that can legitimately be
 * reached on such a kernel return a benign success value; the ones
 * that could only be reached if dependency state had been created
 * earlier simply panic.
 */
int
softdep_flushfiles(oldmnt, flags, td)
	struct mount *oldmnt;
	int flags;
	struct thread *td;
{

	panic("softdep_flushfiles called");
}

int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{

	return (0);
}

void
softdep_initialize()
{

	return;
}

void
softdep_uninitialize()
{

	return;
}

void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;
	struct inode *ip;
	ino_t newinum;
{

	panic("softdep_setup_inomapdep called");
}

void
softdep_setup_blkmapdep(bp, mp, newblkno)
	struct buf *bp;
	struct mount *mp;
	ufs2_daddr_t newblkno;
{

	panic("softdep_setup_blkmapdep called");
}

void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{

	panic("softdep_setup_allocdirect called");
}

void
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{

	panic("softdep_setup_allocext called");
}

void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;
	ufs_lbn_t lbn;
	struct buf *bp;
	int ptrno;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	struct buf *nbp;
{

	panic("softdep_setup_allocindir_page called");
}

void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;
	struct inode *ip;
	struct buf *bp;
	int ptrno;
	ufs2_daddr_t newblkno;
{

	panic("softdep_setup_allocindir_meta called");
}

void
softdep_setup_freeblocks(ip, length, flags)
	struct inode *ip;
	off_t length;
	int flags;
{

	panic("softdep_setup_freeblocks called");
}

void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;
	ino_t ino;
	int mode;
{

	panic("softdep_freefile called");
}

int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
	struct buf *bp;
	struct inode *dp;
	off_t diroffset;
	ino_t newinum;
	struct buf *newdirbp;
	int isnewblk;
{

	panic("softdep_setup_directory_add called");
}

void
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
	struct inode *dp;
	caddr_t base;
	caddr_t oldloc;
	caddr_t newloc;
	int entrysize;
{

	panic("softdep_change_directoryentry_offset called");
}

void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;
	struct inode *dp;
	struct inode *ip;
	int isrmdir;
{

	panic("softdep_setup_remove called");
}

void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;
	struct inode *dp;
	struct inode *ip;
	ino_t newinum;
	int isrmdir;
{

	panic("softdep_setup_directory_change called");
}

void
softdep_change_linkcnt(ip)
	struct inode *ip;
{

	panic("softdep_change_linkcnt called");
}

void
softdep_load_inodeblock(ip)
	struct inode *ip;
{

	panic("softdep_load_inodeblock called");
}

void
softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;
	struct buf *bp;
	int waitfor;
{

	panic("softdep_update_inodeblock called");
}

int
softdep_fsync(vp)
	struct vnode *vp;	/* the "in_core" copy of the inode */
{

	return (0);
}

void
softdep_fsync_mountdev(vp)
	struct vnode *vp;
{

	return;
}

int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{

	*countp = 0;
	return (0);
}

int
softdep_sync_metadata(struct vnode *vp)
{

	return (0);
}

int
softdep_slowdown(vp)
	struct vnode *vp;
{

	panic("softdep_slowdown called");
}

void
softdep_releasefile(ip)
	struct inode *ip;	/* inode with the zero effective link count */
{

	panic("softdep_releasefile called");
}

int
softdep_request_cleanup(fs, vp)
	struct fs *fs;
	struct vnode *vp;
{

	return (0);
}

/*
 * Even without soft updates this routine must answer honestly: the
 * device still has to be quiescent before the filesystem is suspended.
 * Called with the device vnode interlock held; returns 0 when no more
 * writes are outstanding, or EAGAIN when the caller must retry.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	int error;

	/* Dependency counts are meaningless without SOFTUPDATES. */
	(void) softdep_deps,
	(void) softdep_accdeps;

	ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
	bo = &devvp->v_bufobj;

	/*
	 * Obtain the mount interlock without violating lock order with
	 * the vnode interlock: on trylock failure, drop the vnode
	 * interlock, take/release the mount interlock, and retry.
	 */
	for (;;) {
		if (!MNT_ITRYLOCK(mp)) {
			VI_UNLOCK(devvp);
			MNT_ILOCK(mp);
			MNT_IUNLOCK(mp);
			VI_LOCK(devvp);
			continue;
		}
		if (mp->mnt_secondary_writes != 0) {
			VI_UNLOCK(devvp);
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       (PUSER - 1) | PDROP, "secwr", 0);
			VI_LOCK(devvp);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	VI_UNLOCK(devvp);
	return (error);
}

/* Without softdep there are never any pending dependencies. */
void
softdep_get_depcounts(struct mount *mp,
		      int *softdepactivep,
		      int *softdepactiveaccp)
{
	(void) mp;
	*softdepactivep = 0;
	*softdepactiveaccp = 0;
}

#else
/*
 * These definitions need to be adapted to the system to which
 * this file is being ported.
 */
/*
 * malloc types defined for the softdep system.
 */
static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");

/* malloc(9) flags used for every softdep allocation. */
#define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)

/* Workitem type numbers; order MUST match the memtype[] table below. */
#define D_PAGEDEP	0
#define D_INODEDEP	1
#define D_NEWBLK	2
#define D_BMSAFEMAP	3
#define D_ALLOCDIRECT	4
#define D_INDIRDEP	5
#define D_ALLOCINDIR	6
#define D_FREEFRAG	7
#define D_FREEBLKS	8
#define D_FREEFILE	9
#define D_DIRADD	10
#define D_MKDIR		11
#define D_DIRREM	12
#define D_NEWDIRBLK	13
#define D_LAST		D_NEWDIRBLK

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_NEWBLK,
	M_BMSAFEMAP,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM,
	M_NEWDIRBLK
};
475#define DtoM(type) (memtype[type]) 476 477/* 478 * Names of malloc types. 479 */ 480#define TYPENAME(type) \ 481 ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") 482/* 483 * End system adaptation definitions. 484 */ 485 486/* 487 * Forward declarations. 488 */ 489struct inodedep_hashhead; 490struct newblk_hashhead; 491struct pagedep_hashhead; 492 493/* 494 * Internal function prototypes. 495 */ 496static void softdep_error(char *, int); 497static void drain_output(struct vnode *); 498static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 499static void clear_remove(struct thread *); 500static void clear_inodedeps(struct thread *); 501static int flush_pagedep_deps(struct vnode *, struct mount *, 502 struct diraddhd *); 503static int flush_inodedep_deps(struct mount *, ino_t); 504static int flush_deplist(struct allocdirectlst *, int, int *); 505static int handle_written_filepage(struct pagedep *, struct buf *); 506static void diradd_inode_written(struct diradd *, struct inodedep *); 507static int handle_written_inodeblock(struct inodedep *, struct buf *); 508static void handle_allocdirect_partdone(struct allocdirect *); 509static void handle_allocindir_partdone(struct allocindir *); 510static void initiate_write_filepage(struct pagedep *, struct buf *); 511static void handle_written_mkdir(struct mkdir *, int); 512static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 513static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 514static void handle_workitem_freefile(struct freefile *); 515static void handle_workitem_remove(struct dirrem *, struct vnode *); 516static struct dirrem *newdirrem(struct buf *, struct inode *, 517 struct inode *, int, struct dirrem **); 518static void free_diradd(struct diradd *); 519static void free_allocindir(struct allocindir *, struct inodedep *); 520static void free_newdirblk(struct newdirblk *); 521static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, 
ufs_lbn_t, 522 ufs2_daddr_t *); 523static void deallocate_dependencies(struct buf *, struct inodedep *); 524static void free_allocdirect(struct allocdirectlst *, 525 struct allocdirect *, int); 526static int check_inode_unwritten(struct inodedep *); 527static int free_inodedep(struct inodedep *); 528static void handle_workitem_freeblocks(struct freeblks *, int); 529static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 530static void setup_allocindir_phase2(struct buf *, struct inode *, 531 struct allocindir *); 532static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 533 ufs2_daddr_t); 534static void handle_workitem_freefrag(struct freefrag *); 535static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long); 536static void allocdirect_merge(struct allocdirectlst *, 537 struct allocdirect *, struct allocdirect *); 538static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *); 539static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t, 540 struct newblk **); 541static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **); 542static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 543 struct inodedep **); 544static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 545static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **); 546static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 547 struct mount *mp, int, struct pagedep **); 548static void pause_timer(void *); 549static int request_cleanup(struct mount *, int); 550static int process_worklist_item(struct mount *, int); 551static void add_to_worklist(struct worklist *); 552static void softdep_flush(void); 553static int softdep_speedup(void); 554 555/* 556 * Exported softdep operations. 
557 */ 558static void softdep_disk_io_initiation(struct buf *); 559static void softdep_disk_write_complete(struct buf *); 560static void softdep_deallocate_dependencies(struct buf *); 561static int softdep_count_dependencies(struct buf *bp, int); 562 563static struct mtx lk; 564MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); 565 566#define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) 567#define ACQUIRE_LOCK(lk) mtx_lock(lk) 568#define FREE_LOCK(lk) mtx_unlock(lk) 569 570/* 571 * Worklist queue management. 572 * These routines require that the lock be held. 573 */ 574#ifndef /* NOT */ DEBUG 575#define WORKLIST_INSERT(head, item) do { \ 576 (item)->wk_state |= ONWORKLIST; \ 577 LIST_INSERT_HEAD(head, item, wk_list); \ 578} while (0) 579#define WORKLIST_REMOVE(item) do { \ 580 (item)->wk_state &= ~ONWORKLIST; \ 581 LIST_REMOVE(item, wk_list); \ 582} while (0) 583#else /* DEBUG */ 584static void worklist_insert(struct workhead *, struct worklist *); 585static void worklist_remove(struct worklist *); 586 587#define WORKLIST_INSERT(head, item) worklist_insert(head, item) 588#define WORKLIST_REMOVE(item) worklist_remove(item) 589 590static void 591worklist_insert(head, item) 592 struct workhead *head; 593 struct worklist *item; 594{ 595 596 mtx_assert(&lk, MA_OWNED); 597 if (item->wk_state & ONWORKLIST) 598 panic("worklist_insert: already on list"); 599 item->wk_state |= ONWORKLIST; 600 LIST_INSERT_HEAD(head, item, wk_list); 601} 602 603static void 604worklist_remove(item) 605 struct worklist *item; 606{ 607 608 mtx_assert(&lk, MA_OWNED); 609 if ((item->wk_state & ONWORKLIST) == 0) 610 panic("worklist_remove: not on list"); 611 item->wk_state &= ~ONWORKLIST; 612 LIST_REMOVE(item, wk_list); 613} 614#endif /* DEBUG */ 615 616/* 617 * Routines for tracking and managing workitems. 
 */
static	void workitem_free(struct worklist *, int);
static	void workitem_alloc(struct worklist *, int, struct mount *);

#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))

/*
 * Release a workitem back to its malloc type and decrement the mount's
 * dependency count, waking any softdep_waitidle() sleeper when it
 * reaches zero.  Caller must hold lk.
 */
static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{
	struct ufsmount *ump;
	mtx_assert(&lk, MA_OWNED);

#ifdef DEBUG
	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: still on list");
	if (item->wk_type != type)
		panic("workitem_free: type mismatch");
#endif
	ump = VFSTOUFS(item->wk_mp);
	if (--ump->softdep_deps == 0 && ump->softdep_req)
		wakeup(&ump->softdep_deps);
	FREE(item, DtoM(type));
}

/*
 * Initialize a freshly allocated workitem and charge it against the
 * mount's current and accumulated dependency counters.
 */
static void
workitem_alloc(item, type, mp)
	struct worklist *item;
	int type;
	struct mount *mp;
{
	item->wk_type = type;
	item->wk_mp = mp;
	item->wk_state = 0;
	ACQUIRE_LOCK(&lk);
	VFSTOUFS(mp)->softdep_deps++;
	VFSTOUFS(mp)->softdep_accdeps++;
	FREE_LOCK(&lk);
}

/*
 * Workitem queue management
 */
static int max_softdeps;	/* maximum number of structs before slowdown */
static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static int *stat_countp;	/* statistic to count in proc_waiting timeout */
static struct callout_handle handle; /* handle on posted proc_waiting timeout */
static int req_pending;		/* wakeup request for the flush thread posted */
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
#define FLUSH_INODES		1
static int req_clear_remove;	/* syncer process flush some freeblks */
#define FLUSH_REMOVE		2
#define FLUSH_REMOVE_WAIT	3
/*
 * runtime statistics
 */
static int stat_worklist_push;	/* number of worklist cleanups */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */

SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */

SYSCTL_DECL(_vfs_ffs);

static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
	   &compute_summary_at_mount, 0, "Recompute summary at mount");

/* Dedicated kernel thread that drains softdep worklists in the background. */
static struct proc *softdepproc;
static struct kproc_desc softdep_kp = {
	"softdepflush",
	softdep_flush,
	&softdepproc
};
SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp)

/*
 * Main loop of the softdep flush thread.  Services the global
 * req_clear_inodedeps/req_clear_remove requests, then walks the
 * mountlist processing the worklist of every MNT_SOFTDEP mount.
 * Sleeps on req_pending (up to one second) once no work remains.
 * Never returns.
 */
static void
softdep_flush(void)
{
	struct mount *nmp;
	struct mount *mp;
	struct ufsmount *ump;
	struct thread *td;
	int remaining;

	td = curthread;
	td->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		kthread_suspend_check(softdepproc);
		ACQUIRE_LOCK(&lk);
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		FREE_LOCK(&lk);
		remaining = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
				continue;
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
				continue;
			/* vfs_busy() dropped mountlist_mtx; mp is held. */
			softdep_process_worklist(mp, 0);
			ump = VFSTOUFS(mp);
			remaining += ump->softdep_on_worklist -
				ump->softdep_on_worklist_inprogress;
			mtx_lock(&mountlist_mtx);
			/* Re-fetch the successor; the list may have changed. */
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		mtx_unlock(&mountlist_mtx);
		if (remaining)
			continue;
		ACQUIRE_LOCK(&lk);
		if (!req_pending)
			msleep(&req_pending, &lk, PVM, "sdflush", hz);
		req_pending = 0;
		FREE_LOCK(&lk);
	}
}

/*
 * Kick the flush thread (and the syncer) into action.  Caller must
 * hold lk.  Returns the value of speedup_syncer().
 */
static int
softdep_speedup(void)
{

	mtx_assert(&lk, MA_OWNED);
	if (req_pending == 0) {
		req_pending = 1;
		wakeup(&req_pending);
	}

	return speedup_syncer();
}

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	/* Append at the tail so items are processed in FIFO order. */
	if (LIST_FIRST(&ump->softdep_workitem_pending) == NULL)
		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
	ump->softdep_worklist_tail = wk;
	ump->softdep_on_worklist += 1;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that everything is done in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 *
 * Returns the number of items processed, or -1 when processing stopped
 * early (unsafe to write, or the one-second budget expired with !full).
 */
int
softdep_process_worklist(mp, full)
	struct mount *mp;
	int full;
{
	struct thread *td = curthread;
	int cnt, matchcnt, loopcount;
	struct ufsmount *ump;
	long starttime;

	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
	/*
	 * Record the process identifier of our caller so that we can give
	 * this process preferential treatment in request_cleanup below.
	 */
	matchcnt = 0;
	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	loopcount = 1;
	starttime = time_second;
	while (ump->softdep_on_worklist > 0) {
		if ((cnt = process_worklist_item(mp, 0)) == -1)
			break;
		else
			matchcnt += cnt;
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		/*
		 * We do not generally want to stop for buffer space, but if
		 * we are really being a buffer hog, we will stop and wait.
		 */
		if (loopcount++ % 128 == 0) {
			FREE_LOCK(&lk);
			bwillwrite();
			ACQUIRE_LOCK(&lk);
		}
		/*
		 * Never allow processing to run for more than one
		 * second. Otherwise the other mountpoints may get
		 * excessively backlogged.
		 */
		if (!full && starttime != time_second) {
			matchcnt = -1;
			break;
		}
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Process one item on the worklist.
 * Returns 1 when an item was dispatched, -1 when nothing could be done.
 * Called and returns with lk held; drops it around the actual handler.
 */
static int
process_worklist_item(mp, flags)
	struct mount *mp;
	int flags;
{
	struct worklist *wk, *wkend;
	struct ufsmount *ump;
	struct vnode *vp;
	int matchcnt = 0;

	mtx_assert(&lk, MA_OWNED);
	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to write as we may
	 * recurse into the copy-on-write routine.
	 */
	if (curthread->td_pflags & TDP_COWINPROGRESS)
		return (-1);
	/*
	 * Normally we just process each item on the worklist in order.
	 * However, if we are in a situation where we cannot lock any
	 * inodes, we have to skip over any dirrem requests whose
	 * vnodes are resident and locked.
	 */
	ump = VFSTOUFS(mp);
	vp = NULL;
	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
		if (wk->wk_state & INPROGRESS)
			continue;
		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
			break;
		/* Mark busy while we drop lk to try a non-blocking vget. */
		wk->wk_state |= INPROGRESS;
		ump->softdep_on_worklist_inprogress++;
		FREE_LOCK(&lk);
		ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum,
		    LK_NOWAIT | LK_EXCLUSIVE, &vp);
		ACQUIRE_LOCK(&lk);
		wk->wk_state &= ~INPROGRESS;
		ump->softdep_on_worklist_inprogress--;
		if (vp != NULL)
			break;
	}
	if (wk == 0)
		return (-1);
	/*
	 * Remove the item to be processed. If we are removing the last
	 * item on the list, we need to recalculate the tail pointer.
	 * As this happens rarely and usually when the list is short,
	 * we just run down the list to find it rather than tracking it
	 * in the above loop.
	 */
	WORKLIST_REMOVE(wk);
	if (wk == ump->softdep_worklist_tail) {
		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
			if (LIST_NEXT(wkend, wk_list) == NULL)
				break;
		ump->softdep_worklist_tail = wkend;
	}
	ump->softdep_on_worklist -= 1;
	FREE_LOCK(&lk);
	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
		panic("process_worklist_item: suspended filesystem");
	matchcnt++;
	switch (wk->wk_type) {

	case D_DIRREM:
		/* removal of a directory entry */
		handle_workitem_remove(WK_DIRREM(wk), vp);
		break;

	case D_FREEBLKS:
		/* releasing blocks and/or fragments from a file */
		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
		break;

	case D_FREEFRAG:
		/* releasing a fragment when replaced as a file grows */
		handle_workitem_freefrag(WK_FREEFRAG(wk));
		break;

	case D_FREEFILE:
		/* releasing an inode when its link count drops to 0 */
		handle_workitem_freefile(WK_FREEFILE(wk));
		break;

	default:
		panic("%s_process_worklist: Unknown type %s",
		    "softdep", TYPENAME(wk->wk_type));
		/* NOTREACHED */
	}
	vn_finished_secondary_write(mp);
	ACQUIRE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Move dependencies from one buffer to another, preserving their order.
 * Panics if the destination already carries dependencies (no merge code).
 */
void
softdep_move_dependencies(oldbp, newbp)
	struct buf *oldbp;
	struct buf *newbp;
{
	struct worklist *wk, *wktail;

	if (LIST_FIRST(&newbp->b_dep) != NULL)
		panic("softdep_move_dependencies: need merge code");
	wktail = 0;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
		LIST_REMOVE(wk, wk_list);
		if (wktail == 0)
			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
		else
			LIST_INSERT_AFTER(wktail, wk, wk_list);
		wktail = wk;
	}
	FREE_LOCK(&lk);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{
	struct vnode *devvp;
	int count, error = 0;
	struct ufsmount *ump;

	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. We continue until no more worklist dependencies
	 * are found.
1024 */ 1025 *countp = 0; 1026 ump = VFSTOUFS(oldmnt); 1027 devvp = ump->um_devvp; 1028 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 1029 *countp += count; 1030 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); 1031 error = VOP_FSYNC(devvp, MNT_WAIT, td); 1032 VOP_UNLOCK(devvp, 0, td); 1033 if (error) 1034 break; 1035 } 1036 return (error); 1037} 1038 1039int 1040softdep_waitidle(struct mount *mp) 1041{ 1042 struct ufsmount *ump; 1043 int error; 1044 int i; 1045 1046 ump = VFSTOUFS(mp); 1047 ACQUIRE_LOCK(&lk); 1048 for (i = 0; i < 10 && ump->softdep_deps; i++) { 1049 ump->softdep_req = 1; 1050 if (ump->softdep_on_worklist) 1051 panic("softdep_waitidle: work added after flush."); 1052 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); 1053 } 1054 ump->softdep_req = 0; 1055 FREE_LOCK(&lk); 1056 error = 0; 1057 if (i == 10) { 1058 error = EBUSY; 1059 printf("softdep_waitidle: Failed to flush worklist for %p", 1060 mp); 1061 } 1062 1063 return (error); 1064} 1065 1066/* 1067 * Flush all vnodes and worklist items associated with a specified mount point. 1068 */ 1069int 1070softdep_flushfiles(oldmnt, flags, td) 1071 struct mount *oldmnt; 1072 int flags; 1073 struct thread *td; 1074{ 1075 int error, count, loopcnt; 1076 1077 error = 0; 1078 1079 /* 1080 * Alternately flush the vnodes associated with the mount 1081 * point and process any dependencies that the flushing 1082 * creates. In theory, this loop can happen at most twice, 1083 * but we give it a few extra just to be sure. 1084 */ 1085 for (loopcnt = 10; loopcnt > 0; loopcnt--) { 1086 /* 1087 * Do another flush in case any vnodes were brought in 1088 * as part of the cleanup operations. 1089 */ 1090 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 1091 break; 1092 if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 || 1093 count == 0) 1094 break; 1095 } 1096 /* 1097 * If we are unmounting then it is an error to fail. 
If we 1098 * are simply trying to downgrade to read-only, then filesystem 1099 * activity can keep us busy forever, so we just fail with EBUSY. 1100 */ 1101 if (loopcnt == 0) { 1102 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 1103 panic("softdep_flushfiles: looping"); 1104 error = EBUSY; 1105 } 1106 if (!error) 1107 error = softdep_waitidle(oldmnt); 1108 return (error); 1109} 1110 1111/* 1112 * Structure hashing. 1113 * 1114 * There are three types of structures that can be looked up: 1115 * 1) pagedep structures identified by mount point, inode number, 1116 * and logical block. 1117 * 2) inodedep structures identified by mount point and inode number. 1118 * 3) newblk structures identified by mount point and 1119 * physical block number. 1120 * 1121 * The "pagedep" and "inodedep" dependency structures are hashed 1122 * separately from the file blocks and inodes to which they correspond. 1123 * This separation helps when the in-memory copy of an inode or 1124 * file block must be replaced. It also obviates the need to access 1125 * an inode or file page when simply updating (or de-allocating) 1126 * dependency structures. Lookup of newblk structures is needed to 1127 * find newly allocated blocks when trying to associate them with 1128 * their allocdirect or allocindir structure. 1129 * 1130 * The lookup routines optionally create and hash a new instance when 1131 * an existing entry is not found. 1132 */ 1133#define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 1134#define NODELAY 0x0002 /* cannot do background work */ 1135 1136/* 1137 * Structures and routines associated with pagedep caching. 
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])

/*
 * Search one hash chain for a pagedep matching <mp, ino, lbn>.
 * On a hit, *pagedeppp is set to the entry.  Returns 1 for a hit,
 * except that when the caller asked to allocate (DEPALLOC) and the
 * entry is not yet attached to any buffer (ONWORKLIST clear), 0 is
 * returned with *pagedeppp still set so the caller can attach it.
 * A miss returns 0 with *pagedeppp set to NULL.
 */
static int
pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
	struct pagedep_hashhead *pagedephd;
	ino_t ino;
	ufs_lbn_t lbn;
	struct mount *mp;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;

	LIST_FOREACH(pagedep, pagedephd, pd_hash)
		if (ino == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_list.wk_mp)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		if ((flags & DEPALLOC) != 0 &&
		    (pagedep->pd_state & ONWORKLIST) == 0)
			return (0);
		return (1);
	}
	*pagedeppp = NULL;
	return (0);
}
/*
 * Look up a pagedep. Return 1 if found, 0 if not found or found
 * when asked to allocate but not associated with any buffer.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(ip, lbn, flags, pagedeppp)
	struct inode *ip;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	struct mount *mp;
	int ret;
	int i;

	mtx_assert(&lk, MA_OWNED);
	mp = ITOV(ip)->v_mount;
	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);

	ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
	if (*pagedeppp || (flags & DEPALLOC) == 0)
		return (ret);
	/*
	 * The softdep lock must be dropped across the (possibly
	 * sleeping) allocation, so another thread may have created
	 * the entry in the meantime; re-run the lookup after the
	 * lock is reacquired and discard our copy if it did.
	 */
	FREE_LOCK(&lk);
	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
	ACQUIRE_LOCK(&lk);
	ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
	if (*pagedeppp) {
		WORKITEM_FREE(pagedep, D_PAGEDEP);
		return (ret);
	}
	pagedep->pd_ino = ip->i_number;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
static long	num_inodedep;	/* number of inodedep allocated */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])

/*
 * Search one hash chain for an inodedep matching <fs, inum>.
 * Returns 1 with *inodedeppp set on a hit, 0 with *inodedeppp
 * set to NULL on a miss.
 */
static int
inodedep_find(inodedephd, fs, inum, inodedeppp)
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;
	ino_t inum;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;

	LIST_FOREACH(inodedep, inodedephd, id_hash)
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	*inodedeppp = NULL;

	return (0);
}
/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(mp, inum, flags, inodedeppp)
	struct mount *mp;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;

	mtx_assert(&lk, MA_OWNED);
	fs = VFSTOUFS(mp)->um_fs;
	inodedephd = INODEDEP_HASH(fs, inum);

	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * If we are over our limit, try to improve the situation.
	 * Callers that cannot tolerate the potential sleep pass
	 * NODELAY to skip the cleanup request.
	 */
	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
		request_cleanup(mp, FLUSH_INODES);
	/*
	 * Drop the softdep lock for the allocation and re-run the
	 * lookup afterwards, discarding our copy if another thread
	 * raced us and created the entry first.
	 */
	FREE_LOCK(&lk);
	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
	    M_INODEDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
	ACQUIRE_LOCK(&lk);
	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
		WORKITEM_FREE(inodedep, D_INODEDEP);
		return (1);
	}
	num_inodedep += 1;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino1 = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_savedextsize = -1;
	inodedep->id_buf = NULL;
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	TAILQ_INIT(&inodedep->id_extupdt);
	TAILQ_INIT(&inodedep->id_newextupdt);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])

/*
 * Search one hash chain for a newblk matching <fs, newblkno>.
 * Returns 1 with *newblkpp set on a hit, 0 with *newblkpp set
 * to NULL on a miss.
 */
static int
newblk_find(newblkhd, fs, newblkno, newblkpp)
	struct newblk_hashhead *newblkhd;
	struct fs *fs;
	ufs2_daddr_t newblkno;
	struct newblk **newblkpp;
{
	struct newblk *newblk;

	LIST_FOREACH(newblk, newblkhd, nb_hash)
		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
			break;
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	*newblkpp = NULL;
	return (0);
}

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
1338 * If not found, allocate if DEPALLOC flag is passed. 1339 * Found or allocated entry is returned in newblkpp. 1340 */ 1341static int 1342newblk_lookup(fs, newblkno, flags, newblkpp) 1343 struct fs *fs; 1344 ufs2_daddr_t newblkno; 1345 int flags; 1346 struct newblk **newblkpp; 1347{ 1348 struct newblk *newblk; 1349 struct newblk_hashhead *newblkhd; 1350 1351 newblkhd = NEWBLK_HASH(fs, newblkno); 1352 if (newblk_find(newblkhd, fs, newblkno, newblkpp)) 1353 return (1); 1354 if ((flags & DEPALLOC) == 0) 1355 return (0); 1356 FREE_LOCK(&lk); 1357 MALLOC(newblk, struct newblk *, sizeof(struct newblk), 1358 M_NEWBLK, M_SOFTDEP_FLAGS); 1359 ACQUIRE_LOCK(&lk); 1360 if (newblk_find(newblkhd, fs, newblkno, newblkpp)) { 1361 FREE(newblk, M_NEWBLK); 1362 return (1); 1363 } 1364 newblk->nb_state = 0; 1365 newblk->nb_fs = fs; 1366 newblk->nb_newblkno = newblkno; 1367 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 1368 *newblkpp = newblk; 1369 return (0); 1370} 1371 1372/* 1373 * Executed during filesystem system initialization before 1374 * mounting any filesystems. 1375 */ 1376void 1377softdep_initialize() 1378{ 1379 1380 LIST_INIT(&mkdirlisthd); 1381 max_softdeps = desiredvnodes * 4; 1382 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, 1383 &pagedep_hash); 1384 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 1385 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); 1386 1387 /* initialise bioops hack */ 1388 bioops.io_start = softdep_disk_io_initiation; 1389 bioops.io_complete = softdep_disk_write_complete; 1390 bioops.io_deallocate = softdep_deallocate_dependencies; 1391 bioops.io_countdeps = softdep_count_dependencies; 1392} 1393 1394/* 1395 * Executed after all filesystems have been unmounted during 1396 * filesystem module unload. 
 */
void
softdep_uninitialize()
{

	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
}

/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.  Marks the mount as soft-updates
 * enabled (clearing async mode), initializes the per-mount worklist
 * state, and optionally recomputes the superblock summary counts
 * from the cylinder group bitmaps.  Returns 0 on success or an
 * error from bread().
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum_total cstotal;
	struct ufsmount *ump;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	MNT_ILOCK(mp);
	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
			MNTK_SOFTDEP;
		mp->mnt_noasync++;
	}
	MNT_IUNLOCK(mp);
	ump = VFSTOUFS(mp);
	LIST_INIT(&ump->softdep_workitem_pending);
	ump->softdep_worklist_tail = NULL;
	ump->softdep_on_worklist = 0;
	ump->softdep_deps = 0;
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync. Recomputation
	 * can take a long time and can be deferred for background
	 * fsck. However, the old behavior of scanning the cylinder
	 * groups and recalculating them at mount time is available
	 * by setting vfs.ffs.compute_summary_at_mount to one.
	 */
	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a filesystem
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free. So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers. When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset. The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation. The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs filesystem maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector. If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not. (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */

/*
 * Called just after updating the cylinder group block to allocate an inode.
 */
void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;		/* buffer for cylgroup block with inode map */
	struct inode *ip;	/* inode related to allocation */
	ino_t newinum;		/* new inode number being allocated */
{
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated inode.
	 * Panic if it already exists as something is seriously wrong.
	 * Otherwise add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
	    &inodedep)))
		panic("softdep_setup_inomapdep: dependency for new inode "
		    "already exists");
	/*
	 * Clearing DEPCOMPLETE records that the inode's bitmap has not
	 * yet been written; id_buf remembers the cylinder group buffer
	 * whose write will complete the dependency.
	 */
	inodedep->id_buf = bp;
	inodedep->id_state &= ~DEPCOMPLETE;
	bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

/*
 * Called just after updating the cylinder group block to
 * allocate block or fragment.
 */
void
softdep_setup_blkmapdep(bp, mp, newblkno)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct mount *mp;	/* filesystem doing allocation */
	ufs2_daddr_t newblkno;	/* number of newly allocated block */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;
	struct fs *fs;

	fs = VFSTOUFS(mp)->um_fs;
	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
		panic("softdep_setup_blkmapdep: found block");
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	FREE_LOCK(&lk);
}

/*
 * Find the bmsafemap associated with a cylinder group buffer.
 * If none exists, create one. The buffer must be locked when
 * this routine is called and this routine must be called with
 * splbio interrupts blocked.
1565 */ 1566static struct bmsafemap * 1567bmsafemap_lookup(mp, bp) 1568 struct mount *mp; 1569 struct buf *bp; 1570{ 1571 struct bmsafemap *bmsafemap; 1572 struct worklist *wk; 1573 1574 mtx_assert(&lk, MA_OWNED); 1575 LIST_FOREACH(wk, &bp->b_dep, wk_list) 1576 if (wk->wk_type == D_BMSAFEMAP) 1577 return (WK_BMSAFEMAP(wk)); 1578 FREE_LOCK(&lk); 1579 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), 1580 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 1581 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 1582 bmsafemap->sm_buf = bp; 1583 LIST_INIT(&bmsafemap->sm_allocdirecthd); 1584 LIST_INIT(&bmsafemap->sm_allocindirhd); 1585 LIST_INIT(&bmsafemap->sm_inodedephd); 1586 LIST_INIT(&bmsafemap->sm_newblkhd); 1587 ACQUIRE_LOCK(&lk); 1588 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 1589 return (bmsafemap); 1590} 1591 1592/* 1593 * Direct block allocation dependencies. 1594 * 1595 * When a new block is allocated, the corresponding disk locations must be 1596 * initialized (with zeros or new data) before the on-disk inode points to 1597 * them. Also, the freemap from which the block was allocated must be 1598 * updated (on disk) before the inode's pointer. These two dependencies are 1599 * independent of each other and are needed for all file blocks and indirect 1600 * blocks that are pointed to directly by the inode. Just before the 1601 * "in-core" version of the inode is updated with a newly allocated block 1602 * number, a procedure (below) is called to setup allocation dependency 1603 * structures. These structures are removed when the corresponding 1604 * dependencies are satisfied or when the block allocation becomes obsolete 1605 * (i.e., the file is deleted, the block is de-allocated, or the block is a 1606 * fragment that gets upgraded). All of these cases are handled in 1607 * procedures described later. 
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete).  The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains.  This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of new block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;
	struct mount *mp;

	mp = UFSTOVFS(ip->i_ump);
	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	LIST_INIT(&adp->ad_newdirblk);
	/*
	 * A replaced fragment (oldblkno differs from newblkno) must be
	 * freed once the inode pointer update commits; newfreefrag
	 * records that obligation.
	 */
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	ACQUIRE_LOCK(&lk);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/*
	 * The newblk recorded at bitmap-update time tells us whether the
	 * cylinder group map has been written yet; inherit its state and
	 * then retire it.
	 */
	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");
	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	adp->ad_inodedep = inodedep;
	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct worklist *wk;
	struct freefrag *freefrag;
	struct newdirblk *newdirblk;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * The new dependency must supersede the old one: its old block
	 * and size must be exactly what the old dependency allocated,
	 * and merging only applies to direct blocks.
	 */
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("%s %jd != new %jd || old size %ld != new %ld",
		    "allocdirect_merge: old blkno",
		    (intmax_t)newadp->ad_oldblkno,
		    (intmax_t)oldadp->ad_newblkno,
		    newadp->ad_oldsize, oldadp->ad_newsize);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_allocdirect. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		freefrag = newadp->ad_freefrag;
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	/*
	 * If we are tracking a new directory-block allocation,
	 * move it from the old allocdirect to the new allocdirect.
	 */
	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
		newdirblk = WK_NEWDIRBLK(wk);
		WORKLIST_REMOVE(&newdirblk->db_list);
		if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
			panic("allocdirect_merge: extra newdirblk");
		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
	}
	free_allocdirect(adphead, oldadp, 0);
}

/*
 * Allocate a new freefrag structure if needed.
1786 */ 1787static struct freefrag * 1788newfreefrag(ip, blkno, size) 1789 struct inode *ip; 1790 ufs2_daddr_t blkno; 1791 long size; 1792{ 1793 struct freefrag *freefrag; 1794 struct fs *fs; 1795 1796 if (blkno == 0) 1797 return (NULL); 1798 fs = ip->i_fs; 1799 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 1800 panic("newfreefrag: frag size"); 1801 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), 1802 M_FREEFRAG, M_SOFTDEP_FLAGS); 1803 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 1804 freefrag->ff_inum = ip->i_number; 1805 freefrag->ff_blkno = blkno; 1806 freefrag->ff_fragsize = size; 1807 return (freefrag); 1808} 1809 1810/* 1811 * This workitem de-allocates fragments that were replaced during 1812 * file block allocation. 1813 */ 1814static void 1815handle_workitem_freefrag(freefrag) 1816 struct freefrag *freefrag; 1817{ 1818 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 1819 1820 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 1821 freefrag->ff_fragsize, freefrag->ff_inum); 1822 ACQUIRE_LOCK(&lk); 1823 WORKITEM_FREE(freefrag, D_FREEFRAG); 1824 FREE_LOCK(&lk); 1825} 1826 1827/* 1828 * Set up a dependency structure for an external attributes data block. 1829 * This routine follows much of the structure of softdep_setup_allocdirect. 1830 * See the description of softdep_setup_allocdirect above for details. 
 */
void
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct newblk *newblk;
	struct mount *mp;

	mp = UFSTOVFS(ip->i_ump);
	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	/* EXTDATA marks this allocdirect as covering the extattr area. */
	adp->ad_state = ATTACHED | EXTDATA;
	LIST_INIT(&adp->ad_newdirblk);
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocext: lost block");

	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	adp->ad_inodedep = inodedep;

	/*
	 * Inherit the bitmap-write state from the newblk recorded at
	 * allocation time, then retire it.
	 */
	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	if (lbn >= NXADDR)
		panic("softdep_setup_allocext: lbn %lld > NXADDR",
		    (long long)lbn);
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newextupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocext: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here. Because an indirect block contains many pointers that
 * may have dependencies, a second copy of the entire in-memory indirect
 * block is kept. The buffer cache copy is always completely up-to-date.
 * The second copy, which is used only as a source for disk writes,
 * contains only the safe pointers (i.e., those that have no remaining
 * update dependencies). The second copy is freed when all pointers
 * are safe. The cache is not allowed to replace indirect blocks with
 * pending update dependencies. If a buffer containing an indirect
 * block with dependencies is written, these routines will mark it
 * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine in conjunction with
 * softdep_sync_metadata work together to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
 * procedures are used when setting up indirect block pointer
 * dependencies. The division is necessary because of the organization
 * of the "balloc" routine and because of the distinction between file
 * pages and file metadata blocks.
 */

/*
 * Allocate a new allocindir structure.
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
{
	struct allocindir *aip;

	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
		M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
	aip->ai_state = ATTACHED;
	aip->ai_offset = ptrno;
	aip->ai_newblkno = newblkno;
	aip->ai_oldblkno = oldblkno;
	/*
	 * Blocks referenced by indirects are always full-sized, so any
	 * replaced block is freed as a whole fs_bsize block.
	 */
	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
1974 */ 1975void 1976softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 1977 struct inode *ip; /* inode for file being extended */ 1978 ufs_lbn_t lbn; /* allocated block number within file */ 1979 struct buf *bp; /* buffer with indirect blk referencing page */ 1980 int ptrno; /* offset of pointer in indirect block */ 1981 ufs2_daddr_t newblkno; /* disk block number being added */ 1982 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 1983 struct buf *nbp; /* buffer holding allocated page */ 1984{ 1985 struct allocindir *aip; 1986 struct pagedep *pagedep; 1987 1988 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 1989 aip = newallocindir(ip, ptrno, newblkno, oldblkno); 1990 ACQUIRE_LOCK(&lk); 1991 /* 1992 * If we are allocating a directory page, then we must 1993 * allocate an associated pagedep to track additions and 1994 * deletions. 1995 */ 1996 if ((ip->i_mode & IFMT) == IFDIR && 1997 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) 1998 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); 1999 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); 2000 setup_allocindir_phase2(bp, ip, aip); 2001 FREE_LOCK(&lk); 2002} 2003 2004/* 2005 * Called just before setting an indirect block pointer to a 2006 * newly allocated indirect block. 
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs2_daddr_t newblkno;	/* disk block number being added */
{
	struct allocindir *aip;

	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
	/* oldblkno is always 0: a new indirect block never replaces one. */
	aip = newallocindir(ip, ptrno, newblkno, 0);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	setup_allocindir_phase2(bp, ip, aip);
	FREE_LOCK(&lk);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.  Finds (or creates) the indirdep
 * for the indirect block "bp", merges any existing dependency for the
 * same pointer slot, and rolls the pointer in the saved copy of the
 * indirect block back to its old value so that an early write of the
 * block does not expose the not-yet-safe new pointer.
 *
 * Must be entered with the softdep lock held; the lock is dropped and
 * reacquired internally (around blocking allocations and buffer-cache
 * calls) but is held again on return.
 */
static void
setup_allocindir_phase2(bp, ip, aip)
	struct buf *bp;		/* in-memory copy of the indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct allocindir *aip;	/* allocindir allocated by the above routines */
{
	struct worklist *wk;
	struct indirdep *indirdep, *newindirdep;
	struct bmsafemap *bmsafemap;
	struct allocindir *oldaip;
	struct freefrag *freefrag;
	struct newblk *newblk;
	ufs2_daddr_t blkno;

	mtx_assert(&lk, MA_OWNED);
	/* Indirect blocks live at negative logical block numbers. */
	if (bp->b_lblkno >= 0)
		panic("setup_allocindir_phase2: not indir blk");
	/*
	 * This loop normally runs once.  If the buffer has no indirdep
	 * yet, we must drop the lock to allocate one (and its save
	 * buffer), then loop back to re-check for a race with another
	 * thread that may have attached an indirdep meanwhile; the
	 * losing copy is discarded.
	 */
	for (indirdep = NULL, newindirdep = NULL; ; ) {
		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
			if (wk->wk_type != D_INDIRDEP)
				continue;
			indirdep = WK_INDIRDEP(wk);
			break;
		}
		/* No existing indirdep; install the one we allocated. */
		if (indirdep == NULL && newindirdep) {
			indirdep = newindirdep;
			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
			newindirdep = NULL;
		}
		if (indirdep) {
			/*
			 * Transfer the bitmap dependency (if any) from the
			 * transient newblk structure to the allocindir.
			 */
			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
			    &newblk) == 0)
				panic("setup_allocindir: lost block");
			if (newblk->nb_state == DEPCOMPLETE) {
				aip->ai_state |= DEPCOMPLETE;
				aip->ai_buf = NULL;
			} else {
				bmsafemap = newblk->nb_bmsafemap;
				aip->ai_buf = bmsafemap->sm_buf;
				LIST_REMOVE(newblk, nb_deps);
				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
				    aip, ai_deps);
			}
			LIST_REMOVE(newblk, nb_hash);
			FREE(newblk, M_NEWBLK);
			aip->ai_indirdep = indirdep;
			/*
			 * Check to see if there is an existing dependency
			 * for this block. If there is, merge the old
			 * dependency into the new one.
			 */
			if (aip->ai_oldblkno == 0)
				oldaip = NULL;
			else

				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
					if (oldaip->ai_offset == aip->ai_offset)
						break;
			freefrag = NULL;
			if (oldaip != NULL) {
				if (oldaip->ai_newblkno != aip->ai_oldblkno)
					panic("setup_allocindir_phase2: blkno");
				aip->ai_oldblkno = oldaip->ai_oldblkno;
				/*
				 * Swap freefrags so the duplicate (ours) can
				 * be released immediately below.
				 */
				freefrag = aip->ai_freefrag;
				aip->ai_freefrag = oldaip->ai_freefrag;
				oldaip->ai_freefrag = NULL;
				free_allocindir(oldaip, NULL);
			}
			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
			/*
			 * Roll back the pointer in the saved copy so a
			 * premature write of the indirect block shows the
			 * old (safe) value.
			 */
			if (ip->i_ump->um_fstype == UFS1)
				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
				    [aip->ai_offset] = aip->ai_oldblkno;
			else
				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
				    [aip->ai_offset] = aip->ai_oldblkno;
			FREE_LOCK(&lk);
			if (freefrag != NULL)
				handle_workitem_freefrag(freefrag);
		} else
			FREE_LOCK(&lk);
		/* Discard an unused indirdep allocated on a prior pass. */
		if (newindirdep) {
			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
			brelse(newindirdep->ir_savebp);
			ACQUIRE_LOCK(&lk);
			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
			if (indirdep)
				break;
			FREE_LOCK(&lk);
		}
		if (indirdep) {
			ACQUIRE_LOCK(&lk);
			break;
		}
		/*
		 * Allocate a new indirdep and a save buffer holding a
		 * pristine copy of the indirect block (lock is dropped
		 * here, hence the re-check at the top of the loop).
		 */
		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
		    M_INDIRDEP, M_SOFTDEP_FLAGS);
		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
		    UFSTOVFS(ip->i_ump));
		newindirdep->ir_state = ATTACHED;
		if (ip->i_ump->um_fstype == UFS1)
			newindirdep->ir_state |= UFS1FMT;
		LIST_INIT(&newindirdep->ir_deplisthd);
		LIST_INIT(&newindirdep->ir_donehd);
		/* Resolve the physical block number if not yet mapped. */
		if (bp->b_blkno == bp->b_lblkno) {
			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
			    NULL, NULL);
			bp->b_blkno = blkno;
		}
		/*
		 * The save buffer is keyed by physical address on the
		 * device vnode so indir_trunc() can find it after the
		 * inode's logical mapping is gone.
		 */
		newindirdep->ir_savebp =
		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
		BUF_KERNPROC(newindirdep->ir_savebp);
		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Block de-allocation dependencies.
 *
 * When blocks are de-allocated, the on-disk pointers must be nullified before
 * the blocks are made available for use by other files. (The true
 * requirement is that old pointers must be nullified before new on-disk
 * pointers are set. We chose this slightly more stringent requirement to
 * reduce complexity.) Our implementation handles this dependency by updating
 * the inode (or indirect block) appropriately but delaying the actual block
 * de-allocation (i.e., freemap and free space count manipulation) until
 * after the updated versions reach stable storage. After the disk is
 * updated, the blocks can be safely de-allocated whenever it is convenient.
 * This implementation handles only the common case of reducing a file's
 * length to zero. Other cases are handled by the conventional synchronous
 * write approach.
 *
 * The ffs implementation with which we worked double-checks
 * the state of the block pointers and file size as it reduces
 * a file's length. Some of this code is replicated here in our
 * soft updates implementation. The freeblks->fb_chkcnt field is
 * used to transfer a part of this information to the procedure
 * that eventually de-allocates the blocks.
2167 * 2168 * This routine should be called from the routine that shortens 2169 * a file's length, before the inode's size or block pointers 2170 * are modified. It will save the block pointer information for 2171 * later release and zero the inode so that the calling routine 2172 * can release it. 2173 */ 2174void 2175softdep_setup_freeblocks(ip, length, flags) 2176 struct inode *ip; /* The inode whose length is to be reduced */ 2177 off_t length; /* The new length for the file */ 2178 int flags; /* IO_EXT and/or IO_NORMAL */ 2179{ 2180 struct freeblks *freeblks; 2181 struct inodedep *inodedep; 2182 struct allocdirect *adp; 2183 struct vnode *vp; 2184 struct buf *bp; 2185 struct fs *fs; 2186 ufs2_daddr_t extblocks, datablocks; 2187 struct mount *mp; 2188 int i, delay, error; 2189 2190 fs = ip->i_fs; 2191 mp = UFSTOVFS(ip->i_ump); 2192 if (length != 0) 2193 panic("softdep_setup_freeblocks: non-zero length"); 2194 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), 2195 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 2196 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 2197 freeblks->fb_state = ATTACHED; 2198 freeblks->fb_uid = ip->i_uid; 2199 freeblks->fb_previousinum = ip->i_number; 2200 freeblks->fb_devvp = ip->i_devvp; 2201 extblocks = 0; 2202 if (fs->fs_magic == FS_UFS2_MAGIC) 2203 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 2204 datablocks = DIP(ip, i_blocks) - extblocks; 2205 if ((flags & IO_NORMAL) == 0) { 2206 freeblks->fb_oldsize = 0; 2207 freeblks->fb_chkcnt = 0; 2208 } else { 2209 freeblks->fb_oldsize = ip->i_size; 2210 ip->i_size = 0; 2211 DIP_SET(ip, i_size, 0); 2212 freeblks->fb_chkcnt = datablocks; 2213 for (i = 0; i < NDADDR; i++) { 2214 freeblks->fb_dblks[i] = DIP(ip, i_db[i]); 2215 DIP_SET(ip, i_db[i], 0); 2216 } 2217 for (i = 0; i < NIADDR; i++) { 2218 freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); 2219 DIP_SET(ip, i_ib[i], 0); 2220 } 2221 /* 2222 * If the file was removed, then the space being freed was 2223 * accounted for then 
(see softdep_releasefile()). If the 2224 * file is merely being truncated, then we account for it now. 2225 */ 2226 if ((ip->i_flag & IN_SPACECOUNTED) == 0) { 2227 UFS_LOCK(ip->i_ump); 2228 fs->fs_pendingblocks += datablocks; 2229 UFS_UNLOCK(ip->i_ump); 2230 } 2231 } 2232 if ((flags & IO_EXT) == 0) { 2233 freeblks->fb_oldextsize = 0; 2234 } else { 2235 freeblks->fb_oldextsize = ip->i_din2->di_extsize; 2236 ip->i_din2->di_extsize = 0; 2237 freeblks->fb_chkcnt += extblocks; 2238 for (i = 0; i < NXADDR; i++) { 2239 freeblks->fb_eblks[i] = ip->i_din2->di_extb[i]; 2240 ip->i_din2->di_extb[i] = 0; 2241 } 2242 } 2243 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); 2244 /* 2245 * Push the zero'ed inode to to its disk buffer so that we are free 2246 * to delete its dependencies below. Once the dependencies are gone 2247 * the buffer can be safely released. 2248 */ 2249 if ((error = bread(ip->i_devvp, 2250 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 2251 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 2252 brelse(bp); 2253 softdep_error("softdep_setup_freeblocks", error); 2254 } 2255 if (ip->i_ump->um_fstype == UFS1) 2256 *((struct ufs1_dinode *)bp->b_data + 2257 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; 2258 else 2259 *((struct ufs2_dinode *)bp->b_data + 2260 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; 2261 /* 2262 * Find and eliminate any inode dependencies. 2263 */ 2264 ACQUIRE_LOCK(&lk); 2265 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 2266 if ((inodedep->id_state & IOSTARTED) != 0) 2267 panic("softdep_setup_freeblocks: inode busy"); 2268 /* 2269 * Add the freeblks structure to the list of operations that 2270 * must await the zero'ed inode being written to disk. If we 2271 * still have a bitmap dependency (delay == 0), then the inode 2272 * has never been written to disk, so we can process the 2273 * freeblks below once we have deleted the dependencies. 
2274 */ 2275 delay = (inodedep->id_state & DEPCOMPLETE); 2276 if (delay) 2277 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); 2278 /* 2279 * Because the file length has been truncated to zero, any 2280 * pending block allocation dependency structures associated 2281 * with this inode are obsolete and can simply be de-allocated. 2282 * We must first merge the two dependency lists to get rid of 2283 * any duplicate freefrag structures, then purge the merged list. 2284 * If we still have a bitmap dependency, then the inode has never 2285 * been written to disk, so we can free any fragments without delay. 2286 */ 2287 if (flags & IO_NORMAL) { 2288 merge_inode_lists(&inodedep->id_newinoupdt, 2289 &inodedep->id_inoupdt); 2290 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 2291 free_allocdirect(&inodedep->id_inoupdt, adp, delay); 2292 } 2293 if (flags & IO_EXT) { 2294 merge_inode_lists(&inodedep->id_newextupdt, 2295 &inodedep->id_extupdt); 2296 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 2297 free_allocdirect(&inodedep->id_extupdt, adp, delay); 2298 } 2299 FREE_LOCK(&lk); 2300 bdwrite(bp); 2301 /* 2302 * We must wait for any I/O in progress to finish so that 2303 * all potential buffers on the dirty list will be visible. 2304 * Once they are all there, walk the list and get rid of 2305 * any dependencies. 
2306 */ 2307 vp = ITOV(ip); 2308 VI_LOCK(vp); 2309 drain_output(vp); 2310restart: 2311 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { 2312 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 2313 ((flags & IO_NORMAL) == 0 && 2314 (bp->b_xflags & BX_ALTDATA) == 0)) 2315 continue; 2316 if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL) 2317 goto restart; 2318 VI_UNLOCK(vp); 2319 ACQUIRE_LOCK(&lk); 2320 (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep); 2321 deallocate_dependencies(bp, inodedep); 2322 FREE_LOCK(&lk); 2323 bp->b_flags |= B_INVAL | B_NOCACHE; 2324 brelse(bp); 2325 VI_LOCK(vp); 2326 goto restart; 2327 } 2328 VI_UNLOCK(vp); 2329 ACQUIRE_LOCK(&lk); 2330 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 2331 (void) free_inodedep(inodedep); 2332 2333 if(delay) { 2334 freeblks->fb_state |= DEPCOMPLETE; 2335 /* 2336 * If the inode with zeroed block pointers is now on disk 2337 * we can start freeing blocks. Add freeblks to the worklist 2338 * instead of calling handle_workitem_freeblocks directly as 2339 * it is more likely that additional IO is needed to complete 2340 * the request here than in the !delay case. 2341 */ 2342 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 2343 add_to_worklist(&freeblks->fb_list); 2344 } 2345 2346 FREE_LOCK(&lk); 2347 /* 2348 * If the inode has never been written to disk (delay == 0), 2349 * then we can process the freeblks now that we have deleted 2350 * the dependencies. 2351 */ 2352 if (!delay) 2353 handle_workitem_freeblocks(freeblks, 0); 2354} 2355 2356/* 2357 * Reclaim any dependency structures from a buffer that is about to 2358 * be reallocated to a new vnode. The buffer must be locked, thus, 2359 * no I/O completion operations can occur while we are manipulating 2360 * its associated dependencies. The mutex is held so that other I/O's 2361 * associated with related dependencies do not occur. 
2362 */ 2363static void 2364deallocate_dependencies(bp, inodedep) 2365 struct buf *bp; 2366 struct inodedep *inodedep; 2367{ 2368 struct worklist *wk; 2369 struct indirdep *indirdep; 2370 struct allocindir *aip; 2371 struct pagedep *pagedep; 2372 struct dirrem *dirrem; 2373 struct diradd *dap; 2374 int i; 2375 2376 mtx_assert(&lk, MA_OWNED); 2377 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 2378 switch (wk->wk_type) { 2379 2380 case D_INDIRDEP: 2381 indirdep = WK_INDIRDEP(wk); 2382 /* 2383 * None of the indirect pointers will ever be visible, 2384 * so they can simply be tossed. GOINGAWAY ensures 2385 * that allocated pointers will be saved in the buffer 2386 * cache until they are freed. Note that they will 2387 * only be able to be found by their physical address 2388 * since the inode mapping the logical address will 2389 * be gone. The save buffer used for the safe copy 2390 * was allocated in setup_allocindir_phase2 using 2391 * the physical address so it could be used for this 2392 * purpose. Hence we swap the safe copy with the real 2393 * copy, allowing the safe copy to be freed and holding 2394 * on to the real copy for later use in indir_trunc. 2395 */ 2396 if (indirdep->ir_state & GOINGAWAY) 2397 panic("deallocate_dependencies: already gone"); 2398 indirdep->ir_state |= GOINGAWAY; 2399 VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1; 2400 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 2401 free_allocindir(aip, inodedep); 2402 if (bp->b_lblkno >= 0 || 2403 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 2404 panic("deallocate_dependencies: not indir"); 2405 bcopy(bp->b_data, indirdep->ir_savebp->b_data, 2406 bp->b_bcount); 2407 WORKLIST_REMOVE(wk); 2408 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); 2409 continue; 2410 2411 case D_PAGEDEP: 2412 pagedep = WK_PAGEDEP(wk); 2413 /* 2414 * None of the directory additions will ever be 2415 * visible, so they can simply be tossed. 
2416 */ 2417 for (i = 0; i < DAHASHSZ; i++) 2418 while ((dap = 2419 LIST_FIRST(&pagedep->pd_diraddhd[i]))) 2420 free_diradd(dap); 2421 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) 2422 free_diradd(dap); 2423 /* 2424 * Copy any directory remove dependencies to the list 2425 * to be processed after the zero'ed inode is written. 2426 * If the inode has already been written, then they 2427 * can be dumped directly onto the work list. 2428 */ 2429 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 2430 LIST_REMOVE(dirrem, dm_next); 2431 dirrem->dm_dirinum = pagedep->pd_ino; 2432 if (inodedep == NULL || 2433 (inodedep->id_state & ALLCOMPLETE) == 2434 ALLCOMPLETE) 2435 add_to_worklist(&dirrem->dm_list); 2436 else 2437 WORKLIST_INSERT(&inodedep->id_bufwait, 2438 &dirrem->dm_list); 2439 } 2440 if ((pagedep->pd_state & NEWBLOCK) != 0) { 2441 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) 2442 if (wk->wk_type == D_NEWDIRBLK && 2443 WK_NEWDIRBLK(wk)->db_pagedep == 2444 pagedep) 2445 break; 2446 if (wk != NULL) { 2447 WORKLIST_REMOVE(wk); 2448 free_newdirblk(WK_NEWDIRBLK(wk)); 2449 } else 2450 panic("deallocate_dependencies: " 2451 "lost pagedep"); 2452 } 2453 WORKLIST_REMOVE(&pagedep->pd_list); 2454 LIST_REMOVE(pagedep, pd_hash); 2455 WORKITEM_FREE(pagedep, D_PAGEDEP); 2456 continue; 2457 2458 case D_ALLOCINDIR: 2459 free_allocindir(WK_ALLOCINDIR(wk), inodedep); 2460 continue; 2461 2462 case D_ALLOCDIRECT: 2463 case D_INODEDEP: 2464 panic("deallocate_dependencies: Unexpected type %s", 2465 TYPENAME(wk->wk_type)); 2466 /* NOTREACHED */ 2467 2468 default: 2469 panic("deallocate_dependencies: Unknown type %s", 2470 TYPENAME(wk->wk_type)); 2471 /* NOTREACHED */ 2472 } 2473 } 2474} 2475 2476/* 2477 * Free an allocdirect. Generate a new freefrag work request if appropriate. 2478 * This routine must be called with splbio interrupts blocked. 
 */
static void
free_allocdirect(adphead, adp, delay)
	struct allocdirectlst *adphead;	/* list the allocdirect is linked on */
	struct allocdirect *adp;	/* allocdirect to be freed */
	int delay;			/* nonzero: defer freefrag/newdirblk
					   until the zero'ed inode is written */
{
	struct newdirblk *newdirblk;
	struct worklist *wk;

	mtx_assert(&lk, MA_OWNED);
	/* Unhook from the bitmap dependency list if still on it. */
	if ((adp->ad_state & DEPCOMPLETE) == 0)
		LIST_REMOVE(adp, ad_deps);
	TAILQ_REMOVE(adphead, adp, ad_next);
	if ((adp->ad_state & COMPLETE) == 0)
		WORKLIST_REMOVE(&adp->ad_list);
	if (adp->ad_freefrag != NULL) {
		if (delay)
			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
			    &adp->ad_freefrag->ff_list);
		else
			add_to_worklist(&adp->ad_freefrag->ff_list);
	}
	/* At most one newdirblk may be attached to an allocdirect. */
	if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
		newdirblk = WK_NEWDIRBLK(wk);
		WORKLIST_REMOVE(&newdirblk->db_list);
		if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
			panic("free_allocdirect: extra newdirblk");
		if (delay)
			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
			    &newdirblk->db_list);
		else
			free_newdirblk(newdirblk);
	}
	WORKITEM_FREE(adp, D_ALLOCDIRECT);
}

/*
 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_newdirblk(newdirblk)
	struct newdirblk *newdirblk;
{
	struct pagedep *pagedep;
	struct diradd *dap;
	int i;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * If the pagedep is still linked onto the directory buffer
	 * dependency chain, then some of the entries on the
	 * pd_pendinghd list may not be committed to disk yet. In
	 * this case, we will simply clear the NEWBLOCK flag and
	 * let the pd_pendinghd list be processed when the pagedep
	 * is next written. If the pagedep is no longer on the buffer
	 * dependency chain, then all the entries on the pd_pending
	 * list are committed to disk and we can free them here.
	 */
	pagedep = newdirblk->db_pagedep;
	pagedep->pd_state &= ~NEWBLOCK;
	if ((pagedep->pd_state & ONWORKLIST) == 0)
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
			free_diradd(dap);
	/*
	 * If no dependencies remain, the pagedep will be freed.
	 */
	for (i = 0; i < DAHASHSZ; i++)
		if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
			break;
	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
		LIST_REMOVE(pagedep, pd_hash);
		WORKITEM_FREE(pagedep, D_PAGEDEP);
	}
	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}

/*
 * Prepare an inode to be freed. The actual free operation is not
 * done until the zero'ed inode has been written to disk.
 */
void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;	/* parent directory vnode */
	ino_t ino;		/* inode number being freed */
	int mode;		/* mode of the inode being freed */
{
	struct inode *ip = VTOI(pvp);
	struct inodedep *inodedep;
	struct freefile *freefile;

	/*
	 * This sets up the inode de-allocation dependency.
	 */
	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
	    M_FREEFILE, M_SOFTDEP_FLAGS);
	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
	freefile->fx_mode = mode;
	freefile->fx_oldinum = ino;
	freefile->fx_devvp = ip->i_devvp;
	if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
		UFS_LOCK(ip->i_ump);
		ip->i_fs->fs_pendinginodes += 1;
		UFS_UNLOCK(ip->i_ump);
	}

	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk. If the allocated inode has never been
	 * written to disk, then the on-disk inode is zero'ed. In either
	 * case we can free the file immediately.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
	    check_inode_unwritten(inodedep)) {
		FREE_LOCK(&lk);
		handle_workitem_freefile(freefile);
		return;
	}
	/* Otherwise defer until the inode block is written. */
	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	FREE_LOCK(&lk);
}

/*
 * Check to see if an inode has never been written to disk. If
 * so free the inodedep and return success, otherwise return failure.
 * This routine must be called with splbio interrupts blocked.
 *
 * If we still have a bitmap dependency, then the inode has never
 * been written to disk. Drop the dependency as it is no longer
 * necessary since the inode is being deallocated. We set the
 * ALLCOMPLETE flags since the bitmap now properly shows that the
 * inode is not allocated. Even if the inode is actively being
 * written, it has been rolled back to its zero'ed state, so we
 * are ensured that a zero inode is what is on the disk. For short
 * lived files, this change will usually result in removing all the
 * dependencies from the inode so that it can be freed immediately.
 */
static int
check_inode_unwritten(inodedep)
	struct inodedep *inodedep;
{

	mtx_assert(&lk, MA_OWNED);
	/* Any of these conditions means the inode has reached the disk
	 * or still has dependencies outstanding; report failure. */
	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
	    inodedep->id_nlinkdelta != 0)
		return (0);

	/*
	 * Another process might be in initiate_write_inodeblock_ufs[12]
	 * trying to allocate memory without holding "Softdep Lock".
	 */
	if ((inodedep->id_state & IOSTARTED) != 0 &&
	    inodedep->id_savedino1 == NULL)
		return (0);

	inodedep->id_state |= ALLCOMPLETE;
	LIST_REMOVE(inodedep, id_deps);
	inodedep->id_buf = NULL;
	if (inodedep->id_state & ONWORKLIST)
		WORKLIST_REMOVE(&inodedep->id_list);
	if (inodedep->id_savedino1 != NULL) {
		FREE(inodedep->id_savedino1, M_SAVEDINO);
		inodedep->id_savedino1 = NULL;
	}
	if (free_inodedep(inodedep) == 0)
		panic("check_inode_unwritten: busy inode");
	return (1);
}

/*
 * Try to free an inodedep structure. Return 1 if it could be freed.
 */
static int
free_inodedep(inodedep)
	struct inodedep *inodedep;
{

	mtx_assert(&lk, MA_OWNED);
	/* Refuse while any dependency list or counter is still live. */
	if ((inodedep->id_state & ONWORKLIST) != 0 ||
	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
		return (0);
	LIST_REMOVE(inodedep, id_hash);
	WORKITEM_FREE(inodedep, D_INODEDEP);
	num_inodedep -= 1;
	return (1);
}

/*
 * This workitem routine performs the block de-allocation.
 * The workitem is added to the pending list after the updated
 * inode block has been written to disk. As mentioned above,
 * checks regarding the number of blocks de-allocated (compared
 * to the number of blocks allocated for the file) are also
 * performed in this function.
 */
static void
handle_workitem_freeblocks(freeblks, flags)
	struct freeblks *freeblks;	/* saved block pointers to release */
	int flags;			/* LK_NOWAIT honored when revisiting
					   the inode for count adjustment */
{
	struct inode *ip;
	struct vnode *vp;
	struct fs *fs;
	struct ufsmount *ump;
	int i, nblocks, level, bsize;
	ufs2_daddr_t bn, blocksreleased = 0;
	int error, allerror = 0;
	ufs_lbn_t baselbns[NIADDR], tmpval;
	int fs_pendingblocks;

	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	fs_pendingblocks = 0;
	/*
	 * Precompute the first logical block number covered by each
	 * level of indirection, for the recursive indir_trunc() calls.
	 */
	tmpval = 1;
	baselbns[0] = NDADDR;
	for (i = 1; i < NIADDR; i++) {
		tmpval *= NINDIR(fs);
		baselbns[i] = baselbns[i - 1] + tmpval;
	}
	nblocks = btodb(fs->fs_bsize);
	blocksreleased = 0;
	/*
	 * Release all extended attribute blocks or frags.
	 */
	if (freeblks->fb_oldextsize > 0) {
		for (i = (NXADDR - 1); i >= 0; i--) {
			if ((bn = freeblks->fb_eblks[i]) == 0)
				continue;
			bsize = sblksize(fs, freeblks->fb_oldextsize, i);
			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
			    freeblks->fb_previousinum);
			blocksreleased += btodb(bsize);
		}
	}
	/*
	 * Release all data blocks or frags.
	 */
	if (freeblks->fb_oldsize > 0) {
		/*
		 * Indirect blocks first.
		 */
		for (level = (NIADDR - 1); level >= 0; level--) {
			if ((bn = freeblks->fb_iblks[level]) == 0)
				continue;
			if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
			    level, baselbns[level], &blocksreleased)) != 0)
				allerror = error;
			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
			    fs->fs_bsize, freeblks->fb_previousinum);
			fs_pendingblocks += nblocks;
			blocksreleased += nblocks;
		}
		/*
		 * All direct blocks or frags.
		 */
		for (i = (NDADDR - 1); i >= 0; i--) {
			if ((bn = freeblks->fb_dblks[i]) == 0)
				continue;
			bsize = sblksize(fs, freeblks->fb_oldsize, i);
			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
			    freeblks->fb_previousinum);
			fs_pendingblocks += btodb(bsize);
			blocksreleased += btodb(bsize);
		}
	}
	/* The freed blocks are no longer "pending" in the superblock. */
	UFS_LOCK(ump);
	fs->fs_pendingblocks -= fs_pendingblocks;
	UFS_UNLOCK(ump);
	/*
	 * If we still have not finished background cleanup, then check
	 * to see if the block count needs to be adjusted.
	 */
	if (freeblks->fb_chkcnt != blocksreleased &&
	    (fs->fs_flags & FS_UNCLEAN) != 0 &&
	    ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
		ip = VTOI(vp);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) +
		    freeblks->fb_chkcnt - blocksreleased);
		ip->i_flag |= IN_CHANGE;
		vput(vp);
	}

#ifdef DIAGNOSTIC
	if (freeblks->fb_chkcnt != blocksreleased &&
	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
		printf("handle_workitem_freeblocks: block count\n");
	if (allerror)
		softdep_error("handle_workitem_freeblks", allerror);
#endif /* DIAGNOSTIC */

	ACQUIRE_LOCK(&lk);
	WORKITEM_FREE(freeblks, D_FREEBLKS);
	FREE_LOCK(&lk);
}

/*
 * Release blocks associated with the inode ip and stored in the indirect
 * block dbn. If level is greater than SINGLE, the block is an indirect block
 * and recursive calls to indirtrunc must be used to cleanse other indirect
 * blocks.
 */
static int
indir_trunc(freeblks, dbn, level, lbn, countp)
	struct freeblks *freeblks;	/* freeblks being processed */
	ufs2_daddr_t dbn;		/* device block number of indir blk */
	int level;			/* indirection level (0 == SINGLE) */
	ufs_lbn_t lbn;			/* first logical block covered */
	ufs2_daddr_t *countp;		/* running count of blocks released */
{
	struct buf *bp;
	struct fs *fs;
	struct worklist *wk;
	struct indirdep *indirdep;
	struct ufsmount *ump;
	ufs1_daddr_t *bap1 = 0;
	ufs2_daddr_t nb, *bap2 = 0;
	ufs_lbn_t lbnadd;
	int i, nblocks, ufs1fmt;
	int error, allerror = 0;
	int fs_pendingblocks;

	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	fs_pendingblocks = 0;
	/* Number of logical blocks covered by each pointer at this level. */
	lbnadd = 1;
	for (i = level; i > 0; i--)
		lbnadd *= NINDIR(fs);
	/*
	 * Get buffer of block pointers to be freed. This routine is not
	 * called until the zero'ed inode has been written, so it is safe
	 * to free blocks as they are encountered. Because the inode has
	 * been zero'ed, calls to bmap on these blocks will fail. So, we
	 * have to use the on-disk address and the block device for the
	 * filesystem to look them up. If the file was deleted before its
	 * indirect blocks were all written to disk, the routine that set
	 * us up (deallocate_dependencies) will have arranged to leave
	 * a complete copy of the indirect block in memory for our use.
	 * Otherwise we have to read the blocks in from the disk.
	 */
#ifdef notyet
	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
	    GB_NOCREAT);
#else
	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
#endif
	ACQUIRE_LOCK(&lk);
	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		/*
		 * Found the saved copy left by deallocate_dependencies();
		 * its sole dependency must be the GOINGAWAY indirdep.
		 */
		if (wk->wk_type != D_INDIRDEP ||
		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
		    (indirdep->ir_state & GOINGAWAY) == 0)
			panic("indir_trunc: lost indirdep");
		WORKLIST_REMOVE(wk);
		WORKITEM_FREE(indirdep, D_INDIRDEP);
		if (LIST_FIRST(&bp->b_dep) != NULL)
			panic("indir_trunc: dangling dep");
		ump->um_numindirdeps -= 1;
		FREE_LOCK(&lk);
	} else {
#ifdef notyet
		if (bp)
			brelse(bp);
#endif
		FREE_LOCK(&lk);
		/* No in-core copy; read the indirect block from disk. */
		error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
		    NOCRED, &bp);
		if (error) {
			brelse(bp);
			return (error);
		}
	}
	/*
	 * Recursively free indirect blocks.
	 */
	if (ump->um_fstype == UFS1) {
		ufs1fmt = 1;
		bap1 = (ufs1_daddr_t *)bp->b_data;
	} else {
		ufs1fmt = 0;
		bap2 = (ufs2_daddr_t *)bp->b_data;
	}
	nblocks = btodb(fs->fs_bsize);
	for (i = NINDIR(fs) - 1; i >= 0; i--) {
		if (ufs1fmt)
			nb = bap1[i];
		else
			nb = bap2[i];
		if (nb == 0)
			continue;
		if (level != 0) {
			if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
			    level - 1, lbn + (i * lbnadd), countp)) != 0)
				allerror = error;
		}
		ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
		    freeblks->fb_previousinum);
		fs_pendingblocks += nblocks;
		*countp += nblocks;
	}
	UFS_LOCK(ump);
	fs->fs_pendingblocks -= fs_pendingblocks;
	UFS_UNLOCK(ump);
	/* The saved copy is no longer needed; toss it from the cache. */
	bp->b_flags |= B_INVAL | B_NOCACHE;
	brelse(bp);
	return (allerror);
}

/*
 * Free an allocindir.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_allocindir(aip, inodedep)
	struct allocindir *aip;		/* allocindir to release */
	struct inodedep *inodedep;	/* if non-NULL, defer the freefrag
					   until this inode reaches disk */
{
	struct freefrag *freefrag;

	mtx_assert(&lk, MA_OWNED);
	/* Unhook from the bitmap dependency list if still on it. */
	if ((aip->ai_state & DEPCOMPLETE) == 0)
		LIST_REMOVE(aip, ai_deps);
	if (aip->ai_state & ONWORKLIST)
		WORKLIST_REMOVE(&aip->ai_list);
	LIST_REMOVE(aip, ai_next);
	if ((freefrag = aip->ai_freefrag) != NULL) {
		if (inodedep == NULL)
			add_to_worklist(&freefrag->ff_list);
		else
			WORKLIST_INSERT(&inodedep->id_bufwait,
			    &freefrag->ff_list);
	}
	WORKITEM_FREE(aip, D_ALLOCINDIR);
}

/*
 * Directory entry addition dependencies.
 *
 * When adding a new directory entry, the inode (with its incremented link
 * count) must be written to disk before the directory entry's pointer to it.
 * Also, if the inode is newly allocated, the corresponding freemap must be
 * updated (on disk) before the directory entry's pointer. These requirements
 * are met via undo/redo on the directory entry's pointer, which consists
 * simply of the inode number.
 *
 * As directory entries are added and deleted, the free space within a
 * directory block can become fragmented. The ufs filesystem will compact
 * a fragmented directory block to make space for a new entry. When this
 * occurs, the offsets of previously added entries change. Any "diradd"
 * dependency structures corresponding to these entries must be updated with
 * the new offsets.
 */

/*
 * This routine is called after the in-memory inode's link
 * count has been incremented, but before the directory entry's
 * pointer to the inode has been set.
 */
/*
 * Returns 1 when the caller must synchronously write the directory block
 * (a new allocation in the indirect range, which we choose not to track);
 * returns 0 otherwise.
 */
int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for directory */
	off_t diroffset;	/* offset of new entry in directory */
	ino_t newinum;		/* inode referenced by new directory entry */
	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
	int isnewblk;		/* entry is in a newly allocated block */
{
	int offset;		/* offset of new entry within directory block */
	ufs_lbn_t lbn;		/* block in directory containing new entry */
	struct fs *fs;
	struct diradd *dap;
	struct allocdirect *adp;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct newdirblk *newdirblk = 0;
	struct mkdir *mkdir1, *mkdir2;
	struct mount *mp;

	/*
	 * Whiteouts have no dependencies.
	 */
	if (newinum == WINO) {
		if (newdirbp != NULL)
			bdwrite(newdirbp);
		return (0);
	}
	mp = UFSTOVFS(dp->i_ump);
	fs = dp->i_fs;
	lbn = lblkno(fs, diroffset);
	offset = blkoff(fs, diroffset);
	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
		M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&dap->da_list, D_DIRADD, mp);
	dap->da_offset = offset;
	dap->da_newinum = newinum;
	dap->da_state = ATTACHED;
	/*
	 * First entry at the start of a direct-range fragment: allocate a
	 * newdirblk now (before taking the lock) so the block can be
	 * tracked by its allocdirect below.
	 */
	if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
		MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
	}
	if (newdirbp == NULL) {
		dap->da_state |= DEPCOMPLETE;
		ACQUIRE_LOCK(&lk);
	} else {
		/*
		 * mkdir: the entry cannot go to disk until both the new
		 * directory's "." / ".." block (MKDIR_BODY) and the parent's
		 * increased link count (MKDIR_PARENT) are on disk.
		 */
		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
		    M_SOFTDEP_FLAGS);
		workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
		mkdir1->md_state = MKDIR_BODY;
		mkdir1->md_diradd = dap;
		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
		    M_SOFTDEP_FLAGS);
		workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
		mkdir2->md_state = MKDIR_PARENT;
		mkdir2->md_diradd = dap;
		/*
		 * Dependency on "." and ".." being written to disk.
		 */
		mkdir1->md_buf = newdirbp;
		ACQUIRE_LOCK(&lk);
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
		FREE_LOCK(&lk);
		bdwrite(newdirbp);
		/*
		 * Dependency on link count increase for parent directory
		 */
		ACQUIRE_LOCK(&lk);
		if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
			/* Parent inode already safe on disk: drop mkdir2. */
			dap->da_state &= ~MKDIR_PARENT;
			WORKITEM_FREE(mkdir2, D_MKDIR);
		} else {
			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
		}
	}
	/*
	 * Link into parent directory pagedep to await its being written.
	 */
	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	dap->da_pagedep = pagedep;
	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	    da_pdlist);
	/*
	 * Link into its inodedep. Put it on the id_bufwait list if the inode
	 * is not yet written. If it is written, do the post-inode write
	 * processing to put it on the id_pendinghd list.
	 */
	(void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
		diradd_inode_written(dap, inodedep);
	else
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	if (isnewblk) {
		/*
		 * Directories growing into indirect blocks are rare
		 * enough and the frequency of new block allocation
		 * in those cases even more rare, that we choose not
		 * to bother tracking them. Rather we simply force the
		 * new directory entry to disk.
		 */
		if (lbn >= NDADDR) {
			FREE_LOCK(&lk);
			/*
			 * We only have a new allocation when at the
			 * beginning of a new block, not when we are
			 * expanding into an existing block.
			 */
			if (blkoff(fs, diroffset) == 0)
				return (1);
			return (0);
		}
		/*
		 * We only have a new allocation when at the beginning
		 * of a new fragment, not when we are expanding into an
		 * existing fragment. Also, there is nothing to do if we
		 * are already tracking this block.
		 */
		if (fragoff(fs, diroffset) != 0) {
			FREE_LOCK(&lk);
			return (0);
		}
		if ((pagedep->pd_state & NEWBLOCK) != 0) {
			/* Block already tracked; unused newdirblk freed. */
			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
			FREE_LOCK(&lk);
			return (0);
		}
		/*
		 * Find our associated allocdirect and have it track us.
		 */
		if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
			panic("softdep_setup_directory_add: lost inodedep");
		adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
		if (adp == NULL || adp->ad_lbn != lbn)
			panic("softdep_setup_directory_add: lost entry");
		pagedep->pd_state |= NEWBLOCK;
		newdirblk->db_pagedep = pagedep;
		WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
	}
	FREE_LOCK(&lk);
	return (0);
}

/*
 * This procedure is called to change the offset of a directory
 * entry when compacting a directory block which must be owned
 * exclusively by the caller. Note that the actual entry movement
 * must be done in this procedure to ensure that no I/O completions
 * occur while the move is in progress.
 */
void
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
	struct inode *dp;	/* inode for directory */
	caddr_t base;		/* address of dp->i_offset */
	caddr_t oldloc;		/* address of old directory location */
	caddr_t newloc;		/* address of new directory location */
	int entrysize;		/* size of directory entry */
{
	int offset, oldoffset, newoffset;
	struct pagedep *pagedep;
	struct diradd *dap;
	ufs_lbn_t lbn;

	/* Hold the softdep lock across the move so that no dependency
	 * processing can observe a half-moved entry. */
	ACQUIRE_LOCK(&lk);
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	/* No pagedep for this block => no diradds to retarget. */
	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
		goto done;
	oldoffset = offset + (oldloc - base);
	newoffset = offset + (newloc - base);

	/* Search the hash bucket of pending (unwritten) diradds first. */
	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
		if (dap->da_offset != oldoffset)
			continue;
		dap->da_offset = newoffset;
		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
			break;
		/* New offset hashes elsewhere: move to the right bucket. */
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
		    dap, da_pdlist);
		break;
	}
	if (dap == NULL) {

		/* Not in the hash buckets; check the pending-write list. */
		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
			if (dap->da_offset == oldoffset) {
				dap->da_offset = newoffset;
				break;
			}
		}
	}
done:
	bcopy(oldloc, newloc, entrysize);
	FREE_LOCK(&lk);
}

/*
 * Free a diradd dependency structure. This routine must be called
 * with splbio interrupts blocked.
 */
static void
free_diradd(dap)
	struct diradd *dap;
{
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mkdir *mkdir, *nextmd;

	mtx_assert(&lk, MA_OWNED);
	WORKLIST_REMOVE(&dap->da_list);
	LIST_REMOVE(dap, da_pdlist);
	if ((dap->da_state & DIRCHG) == 0) {
		pagedep = dap->da_pagedep;
	} else {
		/*
		 * The diradd carried a pending removal of the previous
		 * inode; now that the add is gone, queue that dirrem.
		 */
		dirrem = dap->da_previous;
		pagedep = dirrem->dm_pagedep;
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	/* Drop the inodedep if this diradd was its last reason to exist. */
	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
	    0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		/*
		 * Tear down any mkdir items (MKDIR_BODY and/or MKDIR_PARENT)
		 * still referencing this diradd; both must be found or the
		 * state bits are inconsistent.
		 */
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != dap)
				continue;
			dap->da_state &= ~mkdir->md_state;
			WORKLIST_REMOVE(&mkdir->md_list);
			LIST_REMOVE(mkdir, md_mkdirs);
			WORKITEM_FREE(mkdir, D_MKDIR);
		}
		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("free_diradd: unfound ref");
	}
	WORKITEM_FREE(dap, D_DIRADD);
}

/*
 * Directory entry removal dependencies.
 *
 * When removing a directory entry, the entry's inode pointer must be
 * zero'ed on disk before the corresponding inode's link count is decremented
 * (possibly freeing the inode for re-use). This dependency is handled by
 * updating the directory entry but delaying the inode count reduction until
 * after the directory block has been written to disk. After this point, the
 * inode count can be decremented whenever it is convenient.
 */

/*
 * This routine should be called immediately after removing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will do this task when it is safe.
 */
void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
{
	struct dirrem *dirrem, *prevdirrem;

	/*
	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
	 * (newdirrem returns with the softdep lock held.)
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to a zeroed entry until
	 * the new inode is committed to disk. If the COMPLETE flag is
	 * set then we have deleted an entry that never made it to
	 * disk. If the entry we deleted resulted from a name change,
	 * then the old name still resides on disk. We cannot delete
	 * its inode (returned to us in prevdirrem) until the zeroed
	 * directory entry gets to disk. The new inode has never been
	 * referenced on the disk, so can be deleted immediately.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
		    dm_next);
		FREE_LOCK(&lk);
	} else {
		if (prevdirrem != NULL)
			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
			    prevdirrem, dm_next);
		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
		FREE_LOCK(&lk);
		/* Never reached disk: the removal can proceed right away. */
		handle_workitem_remove(dirrem, NULL);
	}
}

/*
 * Allocate a new dirrem if appropriate and return it along with
 * its associated pagedep. Called without a lock, returns with lock.
 */
static long num_dirrem;		/* number of dirrem allocated */
static struct dirrem *
newdirrem(bp, dp, ip, isrmdir, prevdirremp)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
	struct dirrem **prevdirremp; /* previously referenced inode, if any */
{
	int offset;
	ufs_lbn_t lbn;
	struct diradd *dap;
	struct dirrem *dirrem;
	struct pagedep *pagedep;

	/*
	 * Whiteouts have no deletion dependencies.
	 */
	if (ip == NULL)
		panic("newdirrem: whiteout");
	/*
	 * If we are over our limit, try to improve the situation.
	 * Limiting the number of dirrem structures will also limit
	 * the number of freefile and freeblks structures.
	 */
	ACQUIRE_LOCK(&lk);
	if (num_dirrem > max_softdeps / 2)
		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
	num_dirrem += 1;
	FREE_LOCK(&lk);
	/* Allocate outside the lock; MALLOC may sleep. */
	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
	dirrem->dm_state = isrmdir ? RMDIR : 0;
	dirrem->dm_oldinum = ip->i_number;
	*prevdirremp = NULL;

	/* Reacquired here; all return paths leave the lock held. */
	ACQUIRE_LOCK(&lk);
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	dirrem->dm_pagedep = pagedep;
	/*
	 * Check for a diradd dependency for the same directory entry.
	 * If present, then both dependencies become obsolete and can
	 * be de-allocated. Check for an entry on both the pd_dirraddhd
	 * list and the pd_pendinghd list.
	 */

	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
		if (dap->da_offset == offset)
			break;
	if (dap == NULL) {

		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
			if (dap->da_offset == offset)
				break;
		if (dap == NULL)
			return (dirrem);
	}
	/*
	 * Must be ATTACHED at this point.
	 */
	if ((dap->da_state & ATTACHED) == 0)
		panic("newdirrem: not ATTACHED");
	/* NOTE(review): %d with ino_t arguments — relies on ino_t being
	 * int-sized; %ju with (uintmax_t) casts would be cleaner. */
	if (dap->da_newinum != ip->i_number)
		panic("newdirrem: inum %d should be %d",
		    ip->i_number, dap->da_newinum);
	/*
	 * If we are deleting a changed name that never made it to disk,
	 * then return the dirrem describing the previous inode (which
	 * represents the inode currently referenced from this entry on disk).
	 */
	if ((dap->da_state & DIRCHG) != 0) {
		*prevdirremp = dap->da_previous;
		dap->da_state &= ~DIRCHG;
		dap->da_pagedep = pagedep;
	}
	/*
	 * We are deleting an entry that never made it to disk.
	 * Mark it COMPLETE so we can delete its inode immediately.
	 */
	dirrem->dm_state |= COMPLETE;
	free_diradd(dap);
	return (dirrem);
}

/*
 * Directory entry change dependencies.
 *
 * Changing an existing directory entry requires that an add operation
 * be completed first followed by a deletion. The semantics for the addition
 * are identical to the description of adding a new entry above except
 * that the rollback is to the old inode number rather than zero. Once
 * the addition dependency is completed, the removal is done as described
 * in the removal routine above.
 */

/*
 * This routine should be called immediately after changing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will perform this task when it is safe.
 */
void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	ino_t newinum;		/* new inode number for changed entry */
	int isrmdir;		/* indicates if doing RMDIR */
{
	int offset;
	struct diradd *dap = NULL;
	struct dirrem *dirrem, *prevdirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mount *mp;

	offset = blkoff(dp->i_fs, dp->i_offset);
	mp = UFSTOVFS(dp->i_ump);

	/*
	 * Whiteouts do not need diradd dependencies.
	 */
	if (newinum != WINO) {
		MALLOC(dap, struct diradd *, sizeof(struct diradd),
		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
		workitem_alloc(&dap->da_list, D_DIRADD, mp);
		/* DIRCHG: the on-disk rollback target is the old inode
		 * number, not zero. */
		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
		dap->da_offset = offset;
		dap->da_newinum = newinum;
	}

	/*
	 * Allocate a new dirrem and ACQUIRE_LOCK.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	pagedep = dirrem->dm_pagedep;
	/*
	 * The possible values for isrmdir:
	 *	0 - non-directory file rename
	 *	1 - directory rename within same directory
	 *   inum - directory rename to new directory of given inode number
	 * When renaming to a new directory, we are both deleting and
	 * creating a new directory entry, so the link count on the new
	 * directory should not change. Thus we do not need the followup
	 * dirrem which is usually done in handle_workitem_remove. We set
	 * the DIRCHG flag to tell handle_workitem_remove to skip the
	 * followup dirrem.
	 */
	if (isrmdir > 1)
		dirrem->dm_state |= DIRCHG;

	/*
	 * Whiteouts have no additional dependencies,
	 * so just put the dirrem on the correct list.
	 */
	if (newinum == WINO) {
		if ((dirrem->dm_state & COMPLETE) == 0) {
			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
			    dm_next);
		} else {
			dirrem->dm_dirinum = pagedep->pd_ino;
			add_to_worklist(&dirrem->dm_list);
		}
		FREE_LOCK(&lk);
		return;
	}

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to the previous inode until
	 * the new inode is committed to disk. If the COMPLETE flag is
	 * set, then we have deleted an entry that never made it to disk.
	 * If the entry we deleted resulted from a name change, then the old
	 * inode reference still resides on disk. Any rollback that we do
	 * needs to be to that old inode (returned to us in prevdirrem). If
	 * the entry we deleted resulted from a create, then there is
	 * no entry on the disk, so we want to roll back to zero rather
	 * than the uncommitted inode. In either of the COMPLETE cases we
	 * want to immediately free the unwritten and unreferenced inode.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		dap->da_previous = dirrem;
	} else {
		if (prevdirrem != NULL) {
			dap->da_previous = prevdirrem;
		} else {
			dap->da_state &= ~DIRCHG;
			dap->da_pagedep = pagedep;
		}
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	/*
	 * Link into its inodedep. Put it on the id_bufwait list if the inode
	 * is not yet written. If it is written, do the post-inode write
	 * processing to put it on the id_pendinghd list.
	 */
	if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		dap->da_state |= COMPLETE;
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	} else {
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	}
	FREE_LOCK(&lk);
}

/*
 * Called whenever the link count on an inode is changed.
 * It creates an inode dependency so that the new reference(s)
 * to the inode cannot be committed to disk until the updated
 * inode has been written.
 */
void
softdep_change_linkcnt(ip)
	struct inode *ip;	/* the inode with the increased link count */
{
	struct inodedep *inodedep;

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
	    DEPALLOC, &inodedep);
	/* i_nlink (on-disk view) must never fall below i_effnlink
	 * (in-memory effective count). */
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_change_linkcnt: bad delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
}

/*
 * Called when the effective link count and the reference count
 * on an inode drops to zero. At this point there are no names
 * referencing the file in the filesystem and no active file
 * references. The space associated with the file will be freed
 * as soon as the necessary soft dependencies are cleared.
 */
void
softdep_releasefile(ip)
	struct inode *ip;	/* inode with the zero effective link count */
{
	struct inodedep *inodedep;
	struct fs *fs;
	int extblocks;

	if (ip->i_effnlink > 0)
		panic("softdep_releasefile: file still referenced");
	/*
	 * We may be called several times as the on-disk link count
	 * drops to zero. We only want to account for the space once.
	 */
	if (ip->i_flag & IN_SPACECOUNTED)
		return;
	/*
	 * We have to deactivate a snapshot otherwise copyonwrites may
	 * add blocks and the cleanup may remove blocks after we have
	 * tried to account for them.
	 */
	if ((ip->i_flags & SF_SNAPSHOT) != 0)
		ffs_snapremove(ITOV(ip));
	/*
	 * If we are tracking an nlinkdelta, we have to also remember
	 * whether we accounted for the freed space yet.
	 */
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
		inodedep->id_state |= SPACECOUNTED;
	FREE_LOCK(&lk);
	fs = ip->i_fs;
	extblocks = 0;
	/* UFS2 extended-attribute blocks are not part of the pending
	 * data-block count. */
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	UFS_LOCK(ip->i_ump);
	ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
	ip->i_fs->fs_pendinginodes += 1;
	UFS_UNLOCK(ip->i_ump);
	ip->i_flag |= IN_SPACECOUNTED;
}

/*
 * This workitem decrements the inode's link count.
 * If the link count reaches zero, the file is removed.
 */
static void
handle_workitem_remove(dirrem, xp)
	struct dirrem *dirrem;
	struct vnode *xp;
{
	struct thread *td = curthread;
	struct inodedep *inodedep;
	struct vnode *vp;
	struct inode *ip;
	ino_t oldinum;
	int error;

	/* xp, when non-NULL, supplies an already-locked vnode. */
	if ((vp = xp) == NULL &&
	    (error = ffs_vget(dirrem->dm_list.wk_mp,
	    dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) {
		softdep_error("handle_workitem_remove: vget", error);
		return;
	}
	ip = VTOI(vp);
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(dirrem->dm_list.wk_mp,
	    dirrem->dm_oldinum, 0, &inodedep)) == 0)
		panic("handle_workitem_remove: lost inodedep");
	/*
	 * Normal file deletion.
	 */
	if ((dirrem->dm_state & RMDIR) == 0) {
		ip->i_nlink--;
		DIP_SET(ip, i_nlink, ip->i_nlink);
		ip->i_flag |= IN_CHANGE;
		if (ip->i_nlink < ip->i_effnlink)
			panic("handle_workitem_remove: bad file delta");
		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
		num_dirrem -= 1;
		WORKITEM_FREE(dirrem, D_DIRREM);
		FREE_LOCK(&lk);
		vput(vp);
		return;
	}
	/*
	 * Directory deletion. Decrement reference count for both the
	 * just deleted parent directory entry and the reference for ".".
	 * Next truncate the directory to length zero. When the
	 * truncation completes, arrange to have the reference count on
	 * the parent decremented to account for the loss of "..".
	 */
	ip->i_nlink -= 2;
	DIP_SET(ip, i_nlink, ip->i_nlink);
	ip->i_flag |= IN_CHANGE;
	if (ip->i_nlink < ip->i_effnlink)
		panic("handle_workitem_remove: bad dir delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
	if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
		softdep_error("handle_workitem_remove: truncate", error);
	ACQUIRE_LOCK(&lk);
	/*
	 * Rename a directory to a new parent. Since, we are both deleting
	 * and creating a new directory entry, the link count on the new
	 * directory should not change. Thus we skip the followup dirrem.
	 */
	if (dirrem->dm_state & DIRCHG) {
		num_dirrem -= 1;
		WORKITEM_FREE(dirrem, D_DIRREM);
		FREE_LOCK(&lk);
		vput(vp);
		return;
	}
	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk. If the allocated inode has never been
	 * written to disk, then the on-disk inode is zero'ed. In either
	 * case we can remove the file immediately.
	 *
	 * The dirrem is retargeted at the parent directory (dm_dirinum)
	 * so that the recursive call below performs the ".." link-count
	 * decrement as a normal (non-RMDIR) removal.
	 */
	dirrem->dm_state = 0;
	oldinum = dirrem->dm_oldinum;
	dirrem->dm_oldinum = dirrem->dm_dirinum;
	if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
	    0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
		FREE_LOCK(&lk);
		vput(vp);
		handle_workitem_remove(dirrem, NULL);
		return;
	}
	/* Otherwise defer the parent decrement until this inode's block
	 * write completes. */
	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	FREE_LOCK(&lk);
	ip->i_flag |= IN_CHANGE;
	ffs_update(vp, 0);
	vput(vp);
}

/*
 * Inode de-allocation dependencies.
 *
 * When an inode's link count is reduced to zero, it can be de-allocated. We
 * found it convenient to postpone de-allocation until after the inode is
 * written to disk with its new link count (zero). At this point, all of the
 * on-disk inode's block pointers are nullified and, with careful dependency
 * list ordering, all dependencies related to the inode will be satisfied and
 * the corresponding dependency structures de-allocated. So, if/when the
 * inode is reused, there will be no mixing of old dependencies with new
 * ones. This artificial dependency is set up by the block de-allocation
 * procedure above (softdep_setup_freeblocks) and completed by the
 * following procedure.
3649 */ 3650static void 3651handle_workitem_freefile(freefile) 3652 struct freefile *freefile; 3653{ 3654 struct fs *fs; 3655 struct inodedep *idp; 3656 struct ufsmount *ump; 3657 int error; 3658 3659 ump = VFSTOUFS(freefile->fx_list.wk_mp); 3660 fs = ump->um_fs; 3661#ifdef DEBUG 3662 ACQUIRE_LOCK(&lk); 3663 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 3664 FREE_LOCK(&lk); 3665 if (error) 3666 panic("handle_workitem_freefile: inodedep survived"); 3667#endif 3668 UFS_LOCK(ump); 3669 fs->fs_pendinginodes -= 1; 3670 UFS_UNLOCK(ump); 3671 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 3672 freefile->fx_oldinum, freefile->fx_mode)) != 0) 3673 softdep_error("handle_workitem_freefile", error); 3674 ACQUIRE_LOCK(&lk); 3675 WORKITEM_FREE(freefile, D_FREEFILE); 3676 FREE_LOCK(&lk); 3677} 3678 3679 3680/* 3681 * Helper function which unlinks marker element from work list and returns 3682 * the next element on the list. 3683 */ 3684static __inline struct worklist * 3685markernext(struct worklist *marker) 3686{ 3687 struct worklist *next; 3688 3689 next = LIST_NEXT(marker, wk_list); 3690 LIST_REMOVE(marker, wk_list); 3691 return next; 3692} 3693 3694/* 3695 * Disk writes. 3696 * 3697 * The dependency structures constructed above are most actively used when file 3698 * system blocks are written to disk. No constraints are placed on when a 3699 * block can be written, but unsatisfied update dependencies are made safe by 3700 * modifying (or replacing) the source memory for the duration of the disk 3701 * write. When the disk write completes, the memory block is again brought 3702 * up-to-date. 3703 * 3704 * In-core inode structure reclamation. 3705 * 3706 * Because there are a finite number of "in-core" inode structures, they are 3707 * reused regularly. 
By transferring all inode-related dependencies to the 3708 * in-memory inode block and indexing them separately (via "inodedep"s), we 3709 * can allow "in-core" inode structures to be reused at any time and avoid 3710 * any increase in contention. 3711 * 3712 * Called just before entering the device driver to initiate a new disk I/O. 3713 * The buffer must be locked, thus, no I/O completion operations can occur 3714 * while we are manipulating its associated dependencies. 3715 */ 3716static void 3717softdep_disk_io_initiation(bp) 3718 struct buf *bp; /* structure describing disk write to occur */ 3719{ 3720 struct worklist *wk; 3721 struct worklist marker; 3722 struct indirdep *indirdep; 3723 struct inodedep *inodedep; 3724 3725 /* 3726 * We only care about write operations. There should never 3727 * be dependencies for reads. 3728 */ 3729 if (bp->b_iocmd != BIO_WRITE) 3730 panic("softdep_disk_io_initiation: not write"); 3731 3732 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 3733 PHOLD(curproc); /* Don't swap out kernel stack */ 3734 3735 ACQUIRE_LOCK(&lk); 3736 /* 3737 * Do any necessary pre-I/O processing. 3738 */ 3739 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 3740 wk = markernext(&marker)) { 3741 LIST_INSERT_AFTER(wk, &marker, wk_list); 3742 switch (wk->wk_type) { 3743 3744 case D_PAGEDEP: 3745 initiate_write_filepage(WK_PAGEDEP(wk), bp); 3746 continue; 3747 3748 case D_INODEDEP: 3749 inodedep = WK_INODEDEP(wk); 3750 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 3751 initiate_write_inodeblock_ufs1(inodedep, bp); 3752 else 3753 initiate_write_inodeblock_ufs2(inodedep, bp); 3754 continue; 3755 3756 case D_INDIRDEP: 3757 indirdep = WK_INDIRDEP(wk); 3758 if (indirdep->ir_state & GOINGAWAY) 3759 panic("disk_io_initiation: indirdep gone"); 3760 /* 3761 * If there are no remaining dependencies, this 3762 * will be writing the real pointers, so the 3763 * dependency can be freed. 
3764 */ 3765 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { 3766 struct buf *bp; 3767 3768 bp = indirdep->ir_savebp; 3769 bp->b_flags |= B_INVAL | B_NOCACHE; 3770 /* inline expand WORKLIST_REMOVE(wk); */ 3771 wk->wk_state &= ~ONWORKLIST; 3772 LIST_REMOVE(wk, wk_list); 3773 WORKITEM_FREE(indirdep, D_INDIRDEP); 3774 FREE_LOCK(&lk); 3775 brelse(bp); 3776 ACQUIRE_LOCK(&lk); 3777 continue; 3778 } 3779 /* 3780 * Replace up-to-date version with safe version. 3781 */ 3782 FREE_LOCK(&lk); 3783 MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, 3784 M_INDIRDEP, M_SOFTDEP_FLAGS); 3785 ACQUIRE_LOCK(&lk); 3786 indirdep->ir_state &= ~ATTACHED; 3787 indirdep->ir_state |= UNDONE; 3788 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 3789 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 3790 bp->b_bcount); 3791 continue; 3792 3793 case D_MKDIR: 3794 case D_BMSAFEMAP: 3795 case D_ALLOCDIRECT: 3796 case D_ALLOCINDIR: 3797 continue; 3798 3799 default: 3800 panic("handle_disk_io_initiation: Unexpected type %s", 3801 TYPENAME(wk->wk_type)); 3802 /* NOTREACHED */ 3803 } 3804 } 3805 FREE_LOCK(&lk); 3806 PRELE(curproc); /* Allow swapout of kernel stack */ 3807} 3808 3809/* 3810 * Called from within the procedure above to deal with unsatisfied 3811 * allocation dependencies in a directory. The buffer must be locked, 3812 * thus, no I/O completion operations can occur while we are 3813 * manipulating its associated dependencies. 3814 */ 3815static void 3816initiate_write_filepage(pagedep, bp) 3817 struct pagedep *pagedep; 3818 struct buf *bp; 3819{ 3820 struct diradd *dap; 3821 struct direct *ep; 3822 int i; 3823 3824 if (pagedep->pd_state & IOSTARTED) { 3825 /* 3826 * This can only happen if there is a driver that does not 3827 * understand chaining. Here biodone will reissue the call 3828 * to strategy for the incomplete buffers. 
		 */
		printf("initiate_write_filepage: already started\n");
		return;
	}
	pagedep->pd_state |= IOSTARTED;
	/*
	 * Roll back any uncommitted directory additions on this page:
	 * restore the previous inode number (or zero) so the on-disk
	 * directory never references an inode before it is written.
	 */
	for (i = 0; i < DAHASHSZ; i++) {
		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			if (ep->d_ino != dap->da_newinum)
				panic("%s: dir inum %d != new %d",
				    "initiate_write_filepage",
				    ep->d_ino, dap->da_newinum);
			if (dap->da_state & DIRCHG)
				ep->d_ino = dap->da_previous->dm_oldinum;
			else
				ep->d_ino = 0;
			dap->da_state &= ~ATTACHED;
			dap->da_state |= UNDONE;
		}
	}
}

/*
 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
 * Note that any bug fixes made to this routine must be done in the
 * version found below.
 *
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock. The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 */
static void
initiate_write_inodeblock_ufs1(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;			/* The inode block */
{
	struct allocdirect *adp, *lastadp;
	struct ufs1_dinode *dp;
	struct ufs1_dinode *sip;
	struct fs *fs;
	ufs_lbn_t i, prevlbn = 0;
	int deplist;

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock_ufs1: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	dp = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);
	/*
	 * If the bitmap is not yet written, then the allocated
	 * inode cannot be written to disk.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		if (inodedep->id_savedino1 != NULL)
			panic("initiate_write_inodeblock_ufs1: I/O underway");
		/* MALLOC may sleep; drop the softdep lock around it. */
		FREE_LOCK(&lk);
		MALLOC(sip, struct ufs1_dinode *,
		    sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
		ACQUIRE_LOCK(&lk);
		/*
		 * Save the real dinode and write a zeroed one (keeping
		 * only the generation number) in its place.
		 */
		inodedep->id_savedino1 = sip;
		*inodedep->id_savedino1 = *dp;
		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
		dp->di_gen = inodedep->id_savedino1->di_gen;
		return;
	}
	/*
	 * If no dependencies, then there is nothing to roll back.
	 */
	inodedep->id_savedsize = dp->di_size;
	inodedep->id_savedextsize = 0;
	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
		return;
	/*
	 * Set the dependencies to busy.
	 * NB: deplist is an int used as a bitmask indexed by lbn
	 * (bit lbn for direct blocks, bit NDADDR+i for indirect i) —
	 * presumably NDADDR + NIADDR fits in the int; DIAGNOSTIC only.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
		if (deplist != 0 && prevlbn >= adp->ad_lbn)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_lbn;
		if (adp->ad_lbn < NDADDR &&
		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %d != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_lbn,
			    dp->di_db[adp->ad_lbn],
			    (intmax_t)adp->ad_newblkno);
		if (adp->ad_lbn >= NDADDR &&
		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
			panic("%s: indirect pointer #%jd mismatch %d != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_lbn - NDADDR,
			    dp->di_ib[adp->ad_lbn - NDADDR],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_lbn;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* DIAGNOSTIC */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		if (adp->ad_lbn >= NDADDR)
			break;
		/* Roll the block pointer back to its previous value. */
		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		/*
		 * Rolled back to a fragment: truncate the on-disk size to
		 * end at that fragment and clear all later pointers.
		 */
		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
#ifdef DIAGNOSTIC
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep1");
#endif /* DIAGNOSTIC */
			dp->di_db[i] = 0;
		}
		for (i = 0; i < NIADDR; i++) {
#ifdef DIAGNOSTIC
			if (dp->di_ib[i] != 0 &&
			    (deplist & ((1 << NDADDR) << i)) == 0)
				panic("softdep_write_inodeblock: lost dep2");
#endif /* DIAGNOSTIC */
			dp->di_ib[i] = 0;
		}
		return;
	}
	/*
	 * If we have zero'ed out the last allocated block of the file,
	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_lbn; i >= 0; i--)
			if (dp->di_db[i] != 0)
				break;
		dp->di_size = (i + 1) * fs->fs_bsize;
	}
	/*
	 * The only dependencies are for indirect blocks.
	 *
	 * The file size for indirect block additions is not guaranteed.
	 * Such a guarantee would be non-trivial to achieve. The conventional
	 * synchronous write implementation also does not make this guarantee.
	 * Fsck should catch and fix discrepancies. Arguably, the file size
	 * can be over-estimated without destroying integrity when the file
	 * moves into the indirect blocks (i.e., is large). If we want to
	 * postpone fsck, we are stuck with this argument.
	 */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
}

/*
 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
 * Note that any bug fixes made to this routine must be done in the
 * version found above.
 *
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock. The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 */
static void
initiate_write_inodeblock_ufs2(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;			/* The inode block */
{
	struct allocdirect *adp, *lastadp;
	struct ufs2_dinode *dp;
	struct ufs2_dinode *sip;
	struct fs *fs;
	ufs_lbn_t i, prevlbn = 0;
	int deplist;

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock_ufs2: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	dp = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);
	/*
	 * If the bitmap is not yet written, then the allocated
	 * inode cannot be written to disk.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		if (inodedep->id_savedino2 != NULL)
			panic("initiate_write_inodeblock_ufs2: I/O underway");
		/* MALLOC may sleep; drop the softdep lock around it. */
		FREE_LOCK(&lk);
		MALLOC(sip, struct ufs2_dinode *,
		    sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
		ACQUIRE_LOCK(&lk);
		/*
		 * Save the real dinode and write a zeroed one (keeping
		 * only the generation number) in its place.
		 */
		inodedep->id_savedino2 = sip;
		*inodedep->id_savedino2 = *dp;
		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
		dp->di_gen = inodedep->id_savedino2->di_gen;
		return;
	}
	/*
	 * If no dependencies, then there is nothing to roll back.
	 */
	inodedep->id_savedsize = dp->di_size;
	inodedep->id_savedextsize = dp->di_extsize;
	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
	    TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
		return;
	/*
	 * Set the ext data dependencies to busy.
	 * (deplist is a DIAGNOSTIC-only bitmask, one bit per lbn.)
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
		if (deplist != 0 && prevlbn >= adp->ad_lbn)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_lbn;
		if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_lbn,
			    (intmax_t)dp->di_extb[adp->ad_lbn],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_lbn;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* DIAGNOSTIC */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the ext
	 * data which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		/* Truncate ext size at the fragment; clear later pointers. */
		dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
		for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
#ifdef DIAGNOSTIC
			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep1");
#endif /* DIAGNOSTIC */
			dp->di_extb[i] = 0;
		}
		/* lastadp = NULL skips the size rollback below. */
		lastadp = NULL;
		break;
	}
	/*
	 * If we have zero'ed out the last allocated block of the ext
	 * data, roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_lbn; i >= 0; i--)
			if (dp->di_extb[i] != 0)
				break;
		dp->di_extsize = (i + 1) * fs->fs_bsize;
	}
	/*
	 * Set the file data dependencies to busy.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
		if (deplist != 0 && prevlbn >= adp->ad_lbn)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_lbn;
		if (adp->ad_lbn < NDADDR &&
		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_lbn,
			    (intmax_t)dp->di_db[adp->ad_lbn],
			    (intmax_t)adp->ad_newblkno);
		if (adp->ad_lbn >= NDADDR &&
		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
			panic("%s indirect pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock:",
			    (intmax_t)adp->ad_lbn - NDADDR,
			    (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_lbn;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* DIAGNOSTIC */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		if (adp->ad_lbn >= NDADDR)
			break;
		/* Roll the block pointer back to its previous value. */
		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		/* Truncate size at the fragment; clear later pointers. */
		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
#ifdef DIAGNOSTIC
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep2");
#endif /* DIAGNOSTIC */
			dp->di_db[i] = 0;
		}
		for (i = 0; i < NIADDR; i++) {
#ifdef DIAGNOSTIC
			if (dp->di_ib[i] != 0 &&
			    (deplist & ((1 << NDADDR) << i)) == 0)
				panic("softdep_write_inodeblock: lost dep3");
#endif /* DIAGNOSTIC */
			dp->di_ib[i] = 0;
		}
		return;
	}
	/*
	 * If we have zero'ed out the last allocated block of the file,
	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_lbn; i >= 0; i--)
			if (dp->di_db[i] != 0)
				break;
		dp->di_size = (i + 1) * fs->fs_bsize;
	}
	/*
	 * The only dependencies are for indirect blocks.
	 *
	 * The file size for indirect block additions is not guaranteed.
	 * Such a guarantee would be non-trivial to achieve. The conventional
	 * synchronous write implementation also does not make this guarantee.
	 * Fsck should catch and fix discrepancies. Arguably, the file size
	 * can be over-estimated without destroying integrity when the file
	 * moves into the indirect blocks (i.e., is large). If we want to
	 * postpone fsck, we are stuck with this argument.
	 */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
}

/*
 * This routine is called during the completion interrupt
 * service routine for a disk write (from the procedure called
 * by the device driver to inform the filesystem caches of
 * a request completion). It should be called early in this
 * procedure, before the block is made available to other
 * processes or other routines are called.
 */
static void
softdep_disk_write_complete(bp)
	struct buf *bp;		/* describes the completed disk write */
{
	struct worklist *wk;
	struct worklist *owk;
	struct workhead reattach;
	struct newblk *newblk;
	struct allocindir *aip;
	struct allocdirect *adp;
	struct indirdep *indirdep;
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * If an error occurred while doing the write, then the data
	 * has not hit the disk and the dependencies cannot be unrolled.
	 */
	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
		return;
	LIST_INIT(&reattach);
	/*
	 * This lock must not be released anywhere in this code segment.
	 */
	ACQUIRE_LOCK(&lk);
	owk = NULL;
	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		WORKLIST_REMOVE(wk);
		/*
		 * Seeing the same item twice in a row means it was
		 * re-inserted onto b_dep while being processed — loop.
		 */
		if (wk == owk)
			panic("duplicate worklist: %p\n", wk);
		owk = wk;
		switch (wk->wk_type) {

		case D_PAGEDEP:
			/* Non-zero return => more work; reattach to bp. */
			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_INODEDEP:
			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_BMSAFEMAP:
			/*
			 * The cylinder-group bitmap is on disk; mark every
			 * item that was waiting on it DEPCOMPLETE.
			 */
			bmsafemap = WK_BMSAFEMAP(wk);
			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
				newblk->nb_state |= DEPCOMPLETE;
				newblk->nb_bmsafemap = NULL;
				LIST_REMOVE(newblk, nb_deps);
			}
			while ((adp =
			    LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
				adp->ad_state |= DEPCOMPLETE;
				adp->ad_buf = NULL;
				LIST_REMOVE(adp, ad_deps);
				handle_allocdirect_partdone(adp);
			}
			while ((aip =
			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
				aip->ai_state |= DEPCOMPLETE;
				aip->ai_buf = NULL;
				LIST_REMOVE(aip, ai_deps);
				handle_allocindir_partdone(aip);
			}
			while ((inodedep =
			    LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
				inodedep->id_state |= DEPCOMPLETE;
				LIST_REMOVE(inodedep, id_deps);
				inodedep->id_buf = NULL;
			}
			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
			continue;

		case D_ALLOCDIRECT:
			adp = WK_ALLOCDIRECT(wk);
			adp->ad_state |= COMPLETE;
			handle_allocdirect_partdone(adp);
			continue;

		case D_ALLOCINDIR:
			aip = WK_ALLOCINDIR(wk);
			aip->ai_state |= COMPLETE;
			handle_allocindir_partdone(aip);
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			if (indirdep->ir_state & GOINGAWAY)
				panic("disk_write_complete: indirdep gone");
			/*
			 * The safe copy made it to disk; restore the
			 * up-to-date contents saved at I/O initiation.
			 */
			bcopy(indirdep->ir_saveddata, bp->b_data,
			    bp->b_bcount);
			FREE(indirdep->ir_saveddata, M_INDIRDEP);
			indirdep->ir_saveddata = 0;
			indirdep->ir_state &= ~UNDONE;
			indirdep->ir_state |= ATTACHED;
			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
				handle_allocindir_partdone(aip);
				if (aip == LIST_FIRST(&indirdep->ir_donehd))
					panic("disk_write_complete: not gone");
			}
			WORKLIST_INSERT(&reattach, wk);
			if ((bp->b_flags & B_DELWRI) == 0)
				stat_indir_blk_ptrs++;
			/* Redirty so the restored contents are written. */
			bdirty(bp);
			continue;

		default:
			panic("handle_disk_write_complete: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/*
	 * Reattach any requests that must be redone.
	 */
	while ((wk = LIST_FIRST(&reattach)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&bp->b_dep, wk);
	}
	FREE_LOCK(&lk);
}

/*
 * Called from within softdep_disk_write_complete above. Note that
 * this routine is always called from interrupt level with further
 * splbio interrupts blocked.
 */
static void
handle_allocdirect_partdone(adp)
	struct allocdirect *adp;	/* the completed allocdirect */
{
	struct allocdirectlst *listhead;
	struct allocdirect *listadp;
	struct inodedep *inodedep;
	long bsize, delay;

	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	if (adp->ad_buf != NULL)
		panic("handle_allocdirect_partdone: dangling dep");
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem. Thus, we cannot free any
	 * allocdirects after one whose ad_oldblkno claims a fragment as
	 * these blocks must be rolled back to zero before writing the inode.
	 * We check the currently active set of allocdirects in id_inoupdt
	 * or id_extupdt as appropriate.
	 */
	inodedep = adp->ad_inodedep;
	bsize = inodedep->id_fs->fs_bsize;
	if (adp->ad_state & EXTDATA)
		listhead = &inodedep->id_extupdt;
	else
		listhead = &inodedep->id_inoupdt;
	TAILQ_FOREACH(listadp, listhead, ad_next) {
		/* found our block */
		if (listadp == adp)
			break;
		/* continue if ad_oldlbn is not a fragment */
		if (listadp->ad_oldsize == 0 ||
		    listadp->ad_oldsize == bsize)
			continue;
		/* hit a fragment */
		return;
	}
	/*
	 * If we have reached the end of the current list without
	 * finding the just finished dependency, then it must be
	 * on the future dependency list. Future dependencies cannot
	 * be freed until they are moved to the current list.
	 */
	if (listadp == NULL) {
#ifdef DEBUG
		/* Sanity check: it must be on the "new" list. */
		if (adp->ad_state & EXTDATA)
			listhead = &inodedep->id_newextupdt;
		else
			listhead = &inodedep->id_newinoupdt;
		TAILQ_FOREACH(listadp, listhead, ad_next)
			/* found our block */
			if (listadp == adp)
				break;
		if (listadp == NULL)
			panic("handle_allocdirect_partdone: lost dep");
#endif /* DEBUG */
		return;
	}
	/*
	 * If we have found the just finished dependency, then free
	 * it along with anything that follows it that is complete.
	 * If the inode still has a bitmap dependency, then it has
	 * never been written to disk, hence the on-disk inode cannot
	 * reference the old fragment so we can free it without delay.
	 */
	delay = (inodedep->id_state & DEPCOMPLETE);
	for (; adp; adp = listadp) {
		listadp = TAILQ_NEXT(adp, ad_next);
		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
			return;
		free_allocdirect(listhead, adp, delay);
	}
}

/*
 * Called from within softdep_disk_write_complete above.
 * Note that
 * this routine is always called from interrupt level with further
 * splbio interrupts blocked.
 */
static void
handle_allocindir_partdone(aip)
	struct allocindir *aip;		/* the completed allocindir */
{
	struct indirdep *indirdep;

	/* Nothing to do until both the data and the bitmap are safe. */
	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	if (aip->ai_buf != NULL)
		panic("handle_allocindir_partdone: dangling dependency");
	indirdep = aip->ai_indirdep;
	if (indirdep->ir_state & UNDONE) {
		/*
		 * The indirect block is currently rolled back; park the
		 * dependency on ir_donehd — it is processed when the
		 * indirect block write completes (see D_INDIRDEP case in
		 * softdep_disk_write_complete).
		 */
		LIST_REMOVE(aip, ai_next);
		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
		return;
	}
	/* Commit the new block pointer into the saved indirect block. */
	if (indirdep->ir_state & UFS1FMT)
		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
		    aip->ai_newblkno;
	else
		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
		    aip->ai_newblkno;
	LIST_REMOVE(aip, ai_next);
	/* Any displaced fragment can now be scheduled for freeing. */
	if (aip->ai_freefrag != NULL)
		add_to_worklist(&aip->ai_freefrag->ff_list);
	WORKITEM_FREE(aip, D_ALLOCINDIR);
}

/*
 * Called from within softdep_disk_write_complete above to restore
 * in-memory inode block contents to their most up-to-date state. Note
 * that this routine is always called from interrupt level with further
 * splbio interrupts blocked.
 *
 * Returns non-zero if the buffer had to be re-dirtied; the caller then
 * reattaches the work item to the buffer's dependency list.
 */
static int
handle_written_inodeblock(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;		/* buffer containing the inode block */
{
	struct worklist *wk, *filefree;
	struct allocdirect *adp, *nextadp;
	struct ufs1_dinode *dp1 = NULL;
	struct ufs2_dinode *dp2 = NULL;
	int hadchanges, fstype;

	if ((inodedep->id_state & IOSTARTED) == 0)
		panic("handle_written_inodeblock: not started");
	inodedep->id_state &= ~IOSTARTED;
	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
		fstype = UFS1;
		dp1 = (struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	} else {
		fstype = UFS2;
		dp2 = (struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	}
	/*
	 * If we had to rollback the inode allocation because of
	 * bitmaps being incomplete, then simply restore it.
	 * Keep the block dirty so that it will not be reclaimed until
	 * all associated dependencies have been cleared and the
	 * corresponding updates written to disk.
	 */
	if (inodedep->id_savedino1 != NULL) {
		if (fstype == UFS1)
			*dp1 = *inodedep->id_savedino1;
		else
			*dp2 = *inodedep->id_savedino2;
		FREE(inodedep->id_savedino1, M_SAVEDINO);
		inodedep->id_savedino1 = NULL;
		if ((bp->b_flags & B_DELWRI) == 0)
			stat_inode_bitmap++;
		bdirty(bp);
		return (1);
	}
	inodedep->id_state |= COMPLETE;
	/*
	 * Roll forward anything that had to be rolled back before
	 * the inode could be updated.
	 */
	hadchanges = 0;
	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
		nextadp = TAILQ_NEXT(adp, ad_next);
		if (adp->ad_state & ATTACHED)
			panic("handle_written_inodeblock: new entry");
		if (fstype == UFS1) {
			if (adp->ad_lbn < NDADDR) {
				if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
					panic("%s %s #%jd mismatch %d != %jd",
					    "handle_written_inodeblock:",
					    "direct pointer",
					    (intmax_t)adp->ad_lbn,
					    dp1->di_db[adp->ad_lbn],
					    (intmax_t)adp->ad_oldblkno);
				dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
			} else {
				if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
					panic("%s: %s #%jd allocated as %d",
					    "handle_written_inodeblock",
					    "indirect pointer",
					    (intmax_t)adp->ad_lbn - NDADDR,
					    dp1->di_ib[adp->ad_lbn - NDADDR]);
				dp1->di_ib[adp->ad_lbn - NDADDR] =
				    adp->ad_newblkno;
			}
		} else {
			if (adp->ad_lbn < NDADDR) {
				if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
					panic("%s: %s #%jd %s %jd != %jd",
					    "handle_written_inodeblock",
					    "direct pointer",
					    (intmax_t)adp->ad_lbn, "mismatch",
					    (intmax_t)dp2->di_db[adp->ad_lbn],
					    (intmax_t)adp->ad_oldblkno);
				dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
			} else {
				if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
					panic("%s: %s #%jd allocated as %jd",
					    "handle_written_inodeblock",
					    "indirect pointer",
					    (intmax_t)adp->ad_lbn - NDADDR,
					    (intmax_t)
					    dp2->di_ib[adp->ad_lbn - NDADDR]);
				dp2->di_ib[adp->ad_lbn - NDADDR] =
				    adp->ad_newblkno;
			}
		}
		adp->ad_state &= ~UNDONE;
		adp->ad_state |= ATTACHED;
		hadchanges = 1;
	}
	/*
	 * Roll forward the ext data pointers. NOTE(review): this loop
	 * dereferences dp2 unconditionally — presumably id_extupdt is
	 * always empty for UFS1 filesystems (where dp2 is NULL); confirm
	 * against initiate_write_inodeblock_ufs1, which never populates
	 * ext data state.
	 */
	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
		nextadp = TAILQ_NEXT(adp, ad_next);
		if (adp->ad_state & ATTACHED)
			panic("handle_written_inodeblock: new entry");
		if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
			panic("%s: direct pointers #%jd %s %jd != %jd",
			    "handle_written_inodeblock",
			    (intmax_t)adp->ad_lbn, "mismatch",
			    (intmax_t)dp2->di_extb[adp->ad_lbn],
			    (intmax_t)adp->ad_oldblkno);
		dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
		adp->ad_state &= ~UNDONE;
		adp->ad_state |= ATTACHED;
		hadchanges = 1;
	}
	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
		stat_direct_blk_ptrs++;
	/*
	 * Reset the file size to its most up-to-date value.
	 */
	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
		panic("handle_written_inodeblock: bad size");
	if (fstype == UFS1) {
		if (dp1->di_size != inodedep->id_savedsize) {
			dp1->di_size = inodedep->id_savedsize;
			hadchanges = 1;
		}
	} else {
		if (dp2->di_size != inodedep->id_savedsize) {
			dp2->di_size = inodedep->id_savedsize;
			hadchanges = 1;
		}
		if (dp2->di_extsize != inodedep->id_savedextsize) {
			dp2->di_extsize = inodedep->id_savedextsize;
			hadchanges = 1;
		}
	}
	inodedep->id_savedsize = -1;
	inodedep->id_savedextsize = -1;
	/*
	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that its will eventually get written back in
	 * its correct form.
	 */
	if (hadchanges)
		bdirty(bp);
	/*
	 * Process any allocdirects that completed during the update.
	 */
	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
		handle_allocdirect_partdone(adp);
	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
		handle_allocdirect_partdone(adp);
	/*
	 * Process deallocations that were held pending until the
	 * inode had been written to disk. Freeing of the inode
	 * is delayed until after all blocks have been freed to
	 * avoid creation of new <vfsid, inum, lbn> triples
	 * before the old ones have been deleted.
	 */
	filefree = NULL;
	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {

		case D_FREEFILE:
			/*
			 * We defer adding filefree to the worklist until
			 * all other additions have been made to ensure
			 * that it will be done after all the old blocks
			 * have been freed.
			 */
			if (filefree != NULL)
				panic("handle_written_inodeblock: filefree");
			filefree = wk;
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
			continue;

		case D_DIRADD:
			diradd_inode_written(WK_DIRADD(wk), inodedep);
			continue;

		case D_FREEBLKS:
			wk->wk_state |= COMPLETE;
			if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
				continue;
			/* -- fall through -- */
		case D_FREEFRAG:
		case D_DIRREM:
			add_to_worklist(wk);
			continue;

		case D_NEWDIRBLK:
			free_newdirblk(WK_NEWDIRBLK(wk));
			continue;

		default:
			panic("handle_written_inodeblock: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	if (filefree != NULL) {
		if (free_inodedep(inodedep) == 0)
			panic("handle_written_inodeblock: live inodedep");
		add_to_worklist(filefree);
		return (0);
	}

	/*
	 * If no outstanding dependencies, free it.
	 */
	if (free_inodedep(inodedep) ||
	    (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
	     TAILQ_FIRST(&inodedep->id_extupdt) == 0))
		return (0);
	return (hadchanges);
}

/*
 * Process a diradd entry after its dependent inode has been written.
 * This routine must be called with splbio interrupts blocked.
 */
static void
diradd_inode_written(dap, inodedep)
	struct diradd *dap;
	struct inodedep *inodedep;
{
	struct pagedep *pagedep;

	dap->da_state |= COMPLETE;
	/*
	 * Once fully complete, move the diradd to its pagedep's
	 * pending list so it can be freed when the page is written.
	 */
	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	}
	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}

/*
 * Handle the completion of a mkdir dependency. "type" identifies which
 * of the two mkdir writes finished: MKDIR_PARENT or MKDIR_BODY.
 */
static void
handle_written_mkdir(mkdir, type)
	struct mkdir *mkdir;
	int type;
{
	struct diradd *dap;
	struct pagedep *pagedep;

	if (mkdir->md_state != type)
		panic("handle_written_mkdir: bad type");
	dap = mkdir->md_diradd;
	dap->da_state &= ~type;
	/* Both mkdir writes done => the diradd's dependency is complete. */
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
		dap->da_state |= DEPCOMPLETE;
	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	}
	LIST_REMOVE(mkdir, md_mkdirs);
	WORKITEM_FREE(mkdir, D_MKDIR);
}

/*
 * Called from within softdep_disk_write_complete above.
 * A write operation was just completed. Removed inodes can
 * now be freed and associated block pointers may be committed.
 * Note that this routine is always called from interrupt level
 * with further splbio interrupts blocked.
 *
 * Returns non-zero if the buffer had to be re-dirtied; the caller then
 * reattaches the work item to the buffer's dependency list.
 */
static int
handle_written_filepage(pagedep, bp)
	struct pagedep *pagedep;
	struct buf *bp;		/* buffer containing the written page */
{
	struct dirrem *dirrem;
	struct diradd *dap, *nextdap;
	struct direct *ep;
	int i, chgs;

	if ((pagedep->pd_state & IOSTARTED) == 0)
		panic("handle_written_filepage: not started");
	pagedep->pd_state &= ~IOSTARTED;
	/*
	 * Process any directory removals that have been committed.
	 */
	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
		LIST_REMOVE(dirrem, dm_next);
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	/*
	 * Free any directory additions that have been committed.
	 * If it is a newly allocated block, we have to wait until
	 * the on-disk directory inode claims the new block.
	 */
	if ((pagedep->pd_state & NEWBLOCK) == 0)
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
			free_diradd(dap);
	/*
	 * Uncommitted directory entries must be restored.
	 * (These were rolled back in initiate_write_filepage; put the
	 * real inode numbers back now that the safe copy is on disk.)
	 */
	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
		     dap = nextdap) {
			nextdap = LIST_NEXT(dap, da_pdlist);
			if (dap->da_state & ATTACHED)
				panic("handle_written_filepage: attached");
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			ep->d_ino = dap->da_newinum;
			dap->da_state &= ~UNDONE;
			dap->da_state |= ATTACHED;
			chgs = 1;
			/*
			 * If the inode referenced by the directory has
			 * been written out, then the dependency can be
			 * moved to the pending list.
			 */
			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
				LIST_REMOVE(dap, da_pdlist);
				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
				    da_pdlist);
			}
		}
	}
	/*
	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that its will eventually get written back in
	 * its correct form.
	 */
	if (chgs) {
		if ((bp->b_flags & B_DELWRI) == 0)
			stat_dir_entry++;
		bdirty(bp);
		return (1);
	}
	/*
	 * If we are not waiting for a new directory block to be
	 * claimed by its inode, then the pagedep will be freed.
	 * Otherwise it will remain to track any new entries on
	 * the page in case they are fsync'ed.
	 */
	if ((pagedep->pd_state & NEWBLOCK) == 0) {
		LIST_REMOVE(pagedep, pd_hash);
		WORKITEM_FREE(pagedep, D_PAGEDEP);
	}
	return (0);
}

/*
 * Writing back in-core inode structures.
 *
 * The filesystem only accesses an inode's contents when it occupies an
 * "in-core" inode structure. These "in-core" structures are separate from
 * the page frames used to cache inode blocks. Only the latter are
 * transferred to/from the disk. So, when the updated contents of the
 * "in-core" inode structure are copied to the corresponding in-memory inode
 * block, the dependencies are also transferred. The following procedure is
 * called when copying a dirty "in-core" inode to a cached inode block.
 */

/*
 * Called when an inode is loaded from disk. If the effective link count
 * differed from the actual link count when it was last flushed, then we
 * need to ensure that the correct effective link count is put back.
 */
void
softdep_load_inodeblock(ip)
	struct inode *ip;	/* the "in_core" copy of the inode */
{
	struct inodedep *inodedep;

	/*
	 * Check for alternate nlink count.
4836 */ 4837 ip->i_effnlink = ip->i_nlink; 4838 ACQUIRE_LOCK(&lk); 4839 if (inodedep_lookup(UFSTOVFS(ip->i_ump), 4840 ip->i_number, 0, &inodedep) == 0) { 4841 FREE_LOCK(&lk); 4842 return; 4843 } 4844 ip->i_effnlink -= inodedep->id_nlinkdelta; 4845 if (inodedep->id_state & SPACECOUNTED) 4846 ip->i_flag |= IN_SPACECOUNTED; 4847 FREE_LOCK(&lk); 4848} 4849 4850/* 4851 * This routine is called just before the "in-core" inode 4852 * information is to be copied to the in-memory inode block. 4853 * Recall that an inode block contains several inodes. If 4854 * the force flag is set, then the dependencies will be 4855 * cleared so that the update can always be made. Note that 4856 * the buffer is locked when this routine is called, so we 4857 * will never be in the middle of writing the inode block 4858 * to disk. 4859 */ 4860void 4861softdep_update_inodeblock(ip, bp, waitfor) 4862 struct inode *ip; /* the "in_core" copy of the inode */ 4863 struct buf *bp; /* the buffer containing the inode block */ 4864 int waitfor; /* nonzero => update must be allowed */ 4865{ 4866 struct inodedep *inodedep; 4867 struct worklist *wk; 4868 struct mount *mp; 4869 struct buf *ibp; 4870 int error; 4871 4872 /* 4873 * If the effective link count is not equal to the actual link 4874 * count, then we must track the difference in an inodedep while 4875 * the inode is (potentially) tossed out of the cache. Otherwise, 4876 * if there is no existing inodedep, then there are no dependencies 4877 * to track. 4878 */ 4879 mp = UFSTOVFS(ip->i_ump); 4880 ACQUIRE_LOCK(&lk); 4881 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 4882 FREE_LOCK(&lk); 4883 if (ip->i_effnlink != ip->i_nlink) 4884 panic("softdep_update_inodeblock: bad link count"); 4885 return; 4886 } 4887 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 4888 panic("softdep_update_inodeblock: bad delta"); 4889 /* 4890 * Changes have been initiated. 
Anything depending on these 4891 * changes cannot occur until this inode has been written. 4892 */ 4893 inodedep->id_state &= ~COMPLETE; 4894 if ((inodedep->id_state & ONWORKLIST) == 0) 4895 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 4896 /* 4897 * Any new dependencies associated with the incore inode must 4898 * now be moved to the list associated with the buffer holding 4899 * the in-memory copy of the inode. Once merged process any 4900 * allocdirects that are completed by the merger. 4901 */ 4902 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); 4903 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) 4904 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); 4905 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); 4906 if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL) 4907 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); 4908 /* 4909 * Now that the inode has been pushed into the buffer, the 4910 * operations dependent on the inode being written to disk 4911 * can be moved to the id_bufwait so that they will be 4912 * processed when the buffer I/O completes. 4913 */ 4914 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 4915 WORKLIST_REMOVE(wk); 4916 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 4917 } 4918 /* 4919 * Newly allocated inodes cannot be written until the bitmap 4920 * that allocates them have been written (indicated by 4921 * DEPCOMPLETE being set in id_state). If we are doing a 4922 * forced sync (e.g., an fsync on a file), we force the bitmap 4923 * to be written so that the update can be done. 4924 */ 4925 if (waitfor == 0) { 4926 FREE_LOCK(&lk); 4927 return; 4928 } 4929retry: 4930 if ((inodedep->id_state & DEPCOMPLETE) != 0) { 4931 FREE_LOCK(&lk); 4932 return; 4933 } 4934 ibp = inodedep->id_buf; 4935 ibp = getdirtybuf(ibp, &lk, MNT_WAIT); 4936 if (ibp == NULL) { 4937 /* 4938 * If ibp came back as NULL, the dependency could have been 4939 * freed while we slept. 
Look it up again, and check to see 4940 * that it has completed. 4941 */ 4942 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 4943 goto retry; 4944 FREE_LOCK(&lk); 4945 return; 4946 } 4947 FREE_LOCK(&lk); 4948 if ((error = bwrite(ibp)) != 0) 4949 softdep_error("softdep_update_inodeblock: bwrite", error); 4950} 4951 4952/* 4953 * Merge the a new inode dependency list (such as id_newinoupdt) into an 4954 * old inode dependency list (such as id_inoupdt). This routine must be 4955 * called with splbio interrupts blocked. 4956 */ 4957static void 4958merge_inode_lists(newlisthead, oldlisthead) 4959 struct allocdirectlst *newlisthead; 4960 struct allocdirectlst *oldlisthead; 4961{ 4962 struct allocdirect *listadp, *newadp; 4963 4964 newadp = TAILQ_FIRST(newlisthead); 4965 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 4966 if (listadp->ad_lbn < newadp->ad_lbn) { 4967 listadp = TAILQ_NEXT(listadp, ad_next); 4968 continue; 4969 } 4970 TAILQ_REMOVE(newlisthead, newadp, ad_next); 4971 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 4972 if (listadp->ad_lbn == newadp->ad_lbn) { 4973 allocdirect_merge(oldlisthead, newadp, 4974 listadp); 4975 listadp = newadp; 4976 } 4977 newadp = TAILQ_FIRST(newlisthead); 4978 } 4979 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 4980 TAILQ_REMOVE(newlisthead, newadp, ad_next); 4981 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 4982 } 4983} 4984 4985/* 4986 * If we are doing an fsync, then we must ensure that any directory 4987 * entries for the inode have been written after the inode gets to disk. 
 */
int
softdep_fsync(vp)
	struct vnode *vp;	/* the "in_core" copy of the inode */
{
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct worklist *wk;
	struct diradd *dap;
	struct mount *mp;
	struct vnode *pvp;
	struct inode *ip;
	struct buf *bp;
	struct fs *fs;
	struct thread *td = curthread;
	int error, flushparent;
	ino_t parentino;
	ufs_lbn_t lbn;

	ip = VTOI(vp);
	fs = ip->i_fs;
	mp = vp->v_mount;
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		return (0);
	}
	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
		panic("softdep_fsync: pending ops");
	/*
	 * Walk the diradd dependencies recorded against this inode,
	 * pushing out each parent directory page that names it.
	 */
	for (error = 0, flushparent = 0; ; ) {
		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
			break;
		if (wk->wk_type != D_DIRADD)
			panic("softdep_fsync: Unexpected type %s",
			    TYPENAME(wk->wk_type));
		dap = WK_DIRADD(wk);
		/*
		 * Flush our parent if this directory entry has a MKDIR_PARENT
		 * dependency or is contained in a newly allocated block.
		 */
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		parentino = pagedep->pd_ino;
		lbn = pagedep->pd_lbn;
		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
			panic("softdep_fsync: dirty");
		if ((dap->da_state & MKDIR_PARENT) ||
		    (pagedep->pd_state & NEWBLOCK))
			flushparent = 1;
		else
			flushparent = 0;
		/*
		 * If we are being fsync'ed as part of vgone'ing this vnode,
		 * then we will not be able to release and recover the
		 * vnode below, so we just have to give up on writing its
		 * directory entry out. It will eventually be written, just
		 * not now, but then the user was not asking to have it
		 * written, so we are not breaking any promises.
		 */
		if (vp->v_iflag & VI_DOOMED)
			break;
		/*
		 * We prevent deadlock by always fetching inodes from the
		 * root, moving down the directory tree. Thus, when fetching
		 * our parent directory, we first try to get the lock. If
		 * that fails, we must unlock ourselves before requesting
		 * the lock on our parent. See the comment in ufs_lookup
		 * for details on possible races.
		 */
		FREE_LOCK(&lk);
		if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
			VOP_UNLOCK(vp, 0, td);
			error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (error != 0)
				return (error);
		}
		/*
		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
		 * that are contained in direct blocks will be resolved by
		 * doing a ffs_update. Pagedeps contained in indirect blocks
		 * may require a complete sync'ing of the directory. So, we
		 * try the cheap and fast ffs_update first, and if that fails,
		 * then we do the slower ffs_syncvnode of the directory.
		 */
		if (flushparent) {
			if ((error = ffs_update(pvp, 1)) != 0) {
				vput(pvp);
				return (error);
			}
			if ((pagedep->pd_state & NEWBLOCK) &&
			    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
				vput(pvp);
				return (error);
			}
		}
		/*
		 * Flush directory page containing the inode's name.
		 */
		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
		    &bp);
		if (error == 0)
			error = bwrite(bp);
		else
			brelse(bp);
		vput(pvp);
		if (error != 0)
			return (error);
		ACQUIRE_LOCK(&lk);
		/* Dependencies may all be gone now; if so, we are done. */
		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
			break;
	}
	FREE_LOCK(&lk);
	return (0);
}

/*
 * Flush all the dirty bitmaps associated with the block device
 * before flushing the rest of the dirty blocks so as to reduce
 * the number of dependencies that will have to be rolled back.
 */
void
softdep_fsync_mountdev(vp)
	struct vnode *vp;
{
	struct buf *bp, *nbp;
	struct worklist *wk;

	if (!vn_isdisk(vp, NULL))
		panic("softdep_fsync_mountdev: vnode not a disk");
restart:
	ACQUIRE_LOCK(&lk);
	VI_LOCK(vp);
	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * If it is already scheduled, skip to the next buffer.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;

		if ((bp->b_flags & B_DELWRI) == 0)
			panic("softdep_fsync_mountdev: not dirty");
		/*
		 * We are only interested in bitmaps with outstanding
		 * dependencies.
		 */
		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
		    wk->wk_type != D_BMSAFEMAP ||
		    (bp->b_vflags & BV_BKGRDINPROG)) {
			BUF_UNLOCK(bp);
			continue;
		}
		VI_UNLOCK(vp);
		FREE_LOCK(&lk);
		bremfree(bp);
		(void) bawrite(bp);
		/* bawrite dropped our locks; rescan the list from the top. */
		goto restart;
	}
	FREE_LOCK(&lk);
	drain_output(vp);
	VI_UNLOCK(vp);
}

/*
 * This routine is called when we are trying to synchronously flush a
 * file. This routine must eliminate any filesystem metadata dependencies
 * so that the syncing routine can succeed by pushing the dirty blocks
 * associated with the file. If any I/O errors occur, they are returned.
 */
int
softdep_sync_metadata(struct vnode *vp)
{
	struct pagedep *pagedep;
	struct allocdirect *adp;
	struct allocindir *aip;
	struct buf *bp, *nbp;
	struct worklist *wk;
	int i, error, waitfor;

	if (!DOINGSOFTDEP(vp))
		return (0);
	/*
	 * Ensure that any direct block dependencies have been cleared.
	 */
	ACQUIRE_LOCK(&lk);
	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
		FREE_LOCK(&lk);
		return (error);
	}
	FREE_LOCK(&lk);
	/*
	 * For most files, the only metadata dependencies are the
	 * cylinder group maps that allocate their inode or blocks.
	 * The block allocation dependencies can be found by traversing
	 * the dependency lists for any buffers that remain on their
	 * dirty buffer list. The inode allocation dependency will
	 * be resolved when the inode is updated with MNT_WAIT.
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 */
	waitfor = MNT_NOWAIT;

top:
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 */
	VI_LOCK(vp);
	drain_output(vp);
	while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
		bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
		if (bp)
			break;
	}
	VI_UNLOCK(vp);
	if (bp == NULL)
		return (0);
loop:
	/* While syncing snapshots, we must allow recursive lookups */
	bp->b_lock.lk_flags |= LK_CANRECURSE;
	ACQUIRE_LOCK(&lk);
	/*
	 * As we hold the buffer locked, none of its dependencies
	 * will disappear.
	 */
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_ALLOCDIRECT:
			adp = WK_ALLOCDIRECT(wk);
			if (adp->ad_state & DEPCOMPLETE)
				continue;
			nbp = adp->ad_buf;
			nbp = getdirtybuf(nbp, &lk, waitfor);
			if (nbp == NULL)
				continue;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				/* break out of the switch to loop_end */
				break;
			}
			ACQUIRE_LOCK(&lk);
			continue;

		case D_ALLOCINDIR:
			aip = WK_ALLOCINDIR(wk);
			if (aip->ai_state & DEPCOMPLETE)
				continue;
			nbp = aip->ai_buf;
			nbp = getdirtybuf(nbp, &lk, waitfor);
			if (nbp == NULL)
				continue;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				break;
			}
			ACQUIRE_LOCK(&lk);
			continue;

		case D_INDIRDEP:
		restart:

			LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
				if (aip->ai_state & DEPCOMPLETE)
					continue;
				nbp = aip->ai_buf;
				/*
				 * MNT_WAIT may sleep, so the list can
				 * change under us; rescan from the head.
				 */
				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
				if (nbp == NULL)
					goto restart;
				FREE_LOCK(&lk);
				if ((error = bwrite(nbp)) != 0) {
					goto loop_end;
				}
				ACQUIRE_LOCK(&lk);
				goto restart;
			}
			continue;

		case D_INODEDEP:
			if ((error = flush_inodedep_deps(wk->wk_mp,
			    WK_INODEDEP(wk)->id_ino)) != 0) {
				FREE_LOCK(&lk);
				break;
			}
			continue;

		case D_PAGEDEP:
			/*
			 * We are trying to sync a directory that may
			 * have dependencies on both its own metadata
			 * and/or dependencies on the inodes of any
			 * recently allocated files. We walk its diradd
			 * lists pushing out the associated inode.
			 */
			pagedep = WK_PAGEDEP(wk);
			for (i = 0; i < DAHASHSZ; i++) {
				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
					continue;
				if ((error =
				    flush_pagedep_deps(vp, wk->wk_mp,
						&pagedep->pd_diraddhd[i]))) {
					FREE_LOCK(&lk);
					goto loop_end;
				}
			}
			continue;

		case D_MKDIR:
			/*
			 * This case should never happen if the vnode has
			 * been properly sync'ed. However, if this function
			 * is used at a place where the vnode has not yet
			 * been sync'ed, this dependency can show up. So,
			 * rather than panic, just flush it.
			 */
			nbp = WK_MKDIR(wk)->md_buf;
			nbp = getdirtybuf(nbp, &lk, waitfor);
			if (nbp == NULL)
				continue;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				break;
			}
			ACQUIRE_LOCK(&lk);
			continue;

		case D_BMSAFEMAP:
			/*
			 * This case should never happen if the vnode has
			 * been properly sync'ed. However, if this function
			 * is used at a place where the vnode has not yet
			 * been sync'ed, this dependency can show up. So,
			 * rather than panic, just flush it.
			 */
			nbp = WK_BMSAFEMAP(wk)->sm_buf;
			nbp = getdirtybuf(nbp, &lk, waitfor);
			if (nbp == NULL)
				continue;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				break;
			}
			ACQUIRE_LOCK(&lk);
			continue;

		default:
			panic("softdep_sync_metadata: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	loop_end:
		/* We reach here only in error and unlocked */
		if (error == 0)
			panic("softdep_sync_metadata: zero error");
		bp->b_lock.lk_flags &= ~LK_CANRECURSE;
		bawrite(bp);
		return (error);
	}
	FREE_LOCK(&lk);
	VI_LOCK(vp);
	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
		nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
		if (nbp)
			break;
	}
	VI_UNLOCK(vp);
	bp->b_lock.lk_flags &= ~LK_CANRECURSE;
	bawrite(bp);
	if (nbp != NULL) {
		bp = nbp;
		goto loop;
	}
	/*
	 * The brief unlock is to allow any pent up dependency
	 * processing to be done. Then proceed with the second pass.
	 */
	if (waitfor == MNT_NOWAIT) {
		waitfor = MNT_WAIT;
		goto top;
	}

	/*
	 * If we have managed to get rid of all the dirty buffers,
	 * then we are done. For certain directories and block
	 * devices, we may need to do further work.
	 *
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 */
	VI_LOCK(vp);
	drain_output(vp);
	VI_UNLOCK(vp);
	return (0);
}

/*
 * Flush the dependencies associated with an inodedep.
 * Called with splbio blocked.
 */
static int
flush_inodedep_deps(mp, ino)
	struct mount *mp;
	ino_t ino;
{
	struct inodedep *inodedep;
	int error, waitfor;

	/*
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 * We give a brief window at the top of the loop to allow
	 * any pending I/O to complete.
	 */
	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
		if (error)
			return (error);
		/* Brief unlock window to let pending completions run. */
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			return (0);
		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
			continue;
		/*
		 * If this was the blocking (MNT_WAIT) pass, we are done;
		 * otherwise switch to pass 2.
		 */
		if (waitfor == MNT_WAIT)
			break;
		waitfor = MNT_WAIT;
	}
	/*
	 * Try freeing inodedep in case all dependencies have been removed.
	 */
	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	return (0);
}

/*
 * Flush an inode dependency list.
 * Called with splbio blocked.
 */
/*
 * Returns nonzero as soon as it initiates (or fails to initiate) a write,
 * so the caller rescans the list; returns 0 only when every entry on the
 * list already has DEPCOMPLETE set.  On a bwrite error, *errorp carries
 * the error and the return is still nonzero.
 */
static int
flush_deplist(listhead, waitfor, errorp)
	struct allocdirectlst *listhead;
	int waitfor;
	int *errorp;
{
	struct allocdirect *adp;
	struct buf *bp;

	mtx_assert(&lk, MA_OWNED);
	TAILQ_FOREACH(adp, listhead, ad_next) {
		if (adp->ad_state & DEPCOMPLETE)
			continue;
		bp = adp->ad_buf;
		bp = getdirtybuf(bp, &lk, waitfor);
		if (bp == NULL) {
			if (waitfor == MNT_NOWAIT)
				continue;
			return (1);
		}
		FREE_LOCK(&lk);
		if (waitfor == MNT_NOWAIT) {
			bawrite(bp);
		} else if ((*errorp = bwrite(bp)) != 0) {
			ACQUIRE_LOCK(&lk);
			return (1);
		}
		ACQUIRE_LOCK(&lk);
		return (1);
	}
	return (0);
}

/*
 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 * Called with splbio blocked.
 */
static int
flush_pagedep_deps(pvp, mp, diraddhdp)
	struct vnode *pvp;
	struct mount *mp;
	struct diraddhd *diraddhdp;
{
	struct inodedep *inodedep;
	struct ufsmount *ump;
	struct diradd *dap;
	struct vnode *vp;
	int error = 0;
	struct buf *bp;
	ino_t inum;
	struct worklist *wk;

	ump = VFSTOUFS(mp);
	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
		/*
		 * Flush ourselves if this directory entry
		 * has a MKDIR_PARENT dependency.
		 */
		if (dap->da_state & MKDIR_PARENT) {
			FREE_LOCK(&lk);
			if ((error = ffs_update(pvp, 1)) != 0)
				break;
			ACQUIRE_LOCK(&lk);
			/*
			 * If that cleared dependencies, go on to next.
			 */
			if (dap != LIST_FIRST(diraddhdp))
				continue;
			if (dap->da_state & MKDIR_PARENT)
				panic("flush_pagedep_deps: MKDIR_PARENT");
		}
		/*
		 * A newly allocated directory must have its "." and
		 * ".." entries written out before its name can be
		 * committed in its parent. We do not want or need
		 * the full semantics of a synchronous ffs_syncvnode as
		 * that may end up here again, once for each directory
		 * level in the filesystem. Instead, we push the blocks
		 * and wait for them to clear. We have to fsync twice
		 * because the first call may choose to defer blocks
		 * that still have dependencies, but deferral will
		 * happen at most once.
		 */
		inum = dap->da_newinum;
		if (dap->da_state & MKDIR_BODY) {
			FREE_LOCK(&lk);
			if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
				break;
			if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
			    (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
				vput(vp);
				break;
			}
			VI_LOCK(vp);
			drain_output(vp);
			/*
			 * If first block is still dirty with a D_MKDIR
			 * dependency then it needs to be written now.
			 */
			for (;;) {
				error = 0;
				bp = gbincore(&vp->v_bufobj, 0);
				if (bp == NULL)
					break;	/* First block not present */
				error = BUF_LOCK(bp,
						 LK_EXCLUSIVE |
						 LK_SLEEPFAIL |
						 LK_INTERLOCK,
						 VI_MTX(vp));
				VI_LOCK(vp);
				if (error == ENOLCK)
					continue;	/* Slept, retry */
				if (error != 0)
					break;		/* Failed */
				if ((bp->b_flags & B_DELWRI) == 0) {
					BUF_UNLOCK(bp);
					break;	/* Buffer not dirty */
				}
				for (wk = LIST_FIRST(&bp->b_dep);
				     wk != NULL;
				     wk = LIST_NEXT(wk, wk_list))
					if (wk->wk_type == D_MKDIR)
						break;
				if (wk == NULL)
					BUF_UNLOCK(bp);	/* Dependency gone */
				else {
					/*
					 * D_MKDIR dependency remains,
					 * must write buffer to stable
					 * storage.
					 */
					VI_UNLOCK(vp);
					bremfree(bp);
					error = bwrite(bp);
					VI_LOCK(vp);
				}
				break;
			}
			VI_UNLOCK(vp);
			vput(vp);
			if (error != 0)
				break;	/* Flushing of first block failed */
			ACQUIRE_LOCK(&lk);
			/*
			 * If that cleared dependencies, go on to next.
			 */
			if (dap != LIST_FIRST(diraddhdp))
				continue;
			if (dap->da_state & MKDIR_BODY)
				panic("flush_pagedep_deps: MKDIR_BODY");
		}
		/*
		 * Flush the inode on which the directory entry depends.
		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
		 * the only remaining dependency is that the updated inode
		 * count must get pushed to disk. The inode has already
		 * been pushed into its inode buffer (via VOP_UPDATE) at
		 * the time of the reference count change. So we need only
		 * locate that buffer, ensure that there will be no rollback
		 * caused by a bitmap dependency, then write the inode buffer.
		 */
retry:
		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
			panic("flush_pagedep_deps: lost inode");
		/*
		 * If the inode still has bitmap dependencies,
		 * push them to disk.
		 */
		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
			bp = inodedep->id_buf;
			bp = getdirtybuf(bp, &lk, MNT_WAIT);
			if (bp == NULL)
				goto retry;
			FREE_LOCK(&lk);
			if ((error = bwrite(bp)) != 0)
				break;
			ACQUIRE_LOCK(&lk);
			if (dap != LIST_FIRST(diraddhdp))
				continue;
		}
		/*
		 * If the inode is still sitting in a buffer waiting
		 * to be written, push it to disk.
		 */
		FREE_LOCK(&lk);
		if ((error = bread(ump->um_devvp,
		    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
		    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
			brelse(bp);
			break;
		}
		if ((error = bwrite(bp)) != 0)
			break;
		ACQUIRE_LOCK(&lk);
		/*
		 * If we have failed to get rid of all the dependencies
		 * then something is seriously wrong.
		 */
		if (dap == LIST_FIRST(diraddhdp))
			panic("flush_pagedep_deps: flush failed");
	}
	/*
	 * Every error "break" above leaves lk dropped; the caller expects
	 * it held on return, so reacquire it here.
	 */
	if (error)
		ACQUIRE_LOCK(&lk);
	return (error);
}

/*
 * A large burst of file addition or deletion activity can drive the
 * memory load excessively high.
 * First attempt to slow things down
 * using the techniques below. If that fails, this routine requests
 * the offending operations to fall back to running synchronously
 * until the memory load returns to a reasonable level.
 */
int
softdep_slowdown(vp)
	struct vnode *vp;
{
	int max_softdeps_hard;

	ACQUIRE_LOCK(&lk);
	/* Hard limit is 10% above the soft limit (max_softdeps). */
	max_softdeps_hard = max_softdeps * 11 / 10;
	if (num_dirrem < max_softdeps_hard / 2 &&
	    num_inodedep < max_softdeps_hard &&
	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps) {
		FREE_LOCK(&lk);
		return (0);
	}
	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
		softdep_speedup();
	stat_sync_limit_hit += 1;
	FREE_LOCK(&lk);
	return (1);
}

/*
 * Called by the allocation routines when they are about to fail
 * in the hope that we can free up some disk space.
 *
 * First check to see if the work list has anything on it. If it has,
 * clean up entries until we successfully free some space. Because this
 * process holds inodes locked, we cannot handle any remove requests
 * that might block on a locked inode as that could lead to deadlock.
 * If the worklist yields no free space, encourage the syncer daemon
 * to help us. In no event will we try for longer than tickdelay seconds.
 */
int
softdep_request_cleanup(fs, vp)
	struct fs *fs;
	struct vnode *vp;
{
	struct ufsmount *ump;
	long starttime;
	ufs2_daddr_t needed;
	int error;

	ump = VTOI(vp)->i_ump;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
	/* Despite the name, starttime is a deadline: now + tickdelay secs. */
	starttime = time_second + tickdelay;
	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to update the vnode
	 * as we may recurse into the copy-on-write routine.
	 */
	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
		UFS_UNLOCK(ump);
		error = ffs_update(vp, 1);
		UFS_LOCK(ump);
		if (error != 0)
			return (0);
	}
	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
		if (time_second > starttime)
			return (0);
		UFS_UNLOCK(ump);
		ACQUIRE_LOCK(&lk);
		if (ump->softdep_on_worklist > 0 &&
		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
			stat_worklist_push += 1;
			FREE_LOCK(&lk);
			UFS_LOCK(ump);
			continue;
		}
		request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
		FREE_LOCK(&lk);
		UFS_LOCK(ump);
	}
	return (1);
}

/*
 * If memory utilization has gotten too high, deliberately slow things
 * down and speed up the I/O processing.
 */
extern struct thread *syncertd;
static int
request_cleanup(mp, resource)
	struct mount *mp;
	int resource;
{
	struct thread *td = curthread;
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * We never hold up the filesystem syncer or buf daemon.
	 */
	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
		return (0);
	ump = VFSTOUFS(mp);
	/*
	 * First check to see if the work list has gotten backlogged.
	 * If it has, co-opt this process to help clean up two entries.
	 * Because this process may hold inodes locked, we cannot
	 * handle any remove requests that might block on a locked
	 * inode as that could lead to deadlock. We set TDP_SOFTDEP
	 * to avoid recursively processing the worklist.
	 */
	if (ump->softdep_on_worklist > max_softdeps / 10) {
		td->td_pflags |= TDP_SOFTDEP;
		process_worklist_item(mp, LK_NOWAIT);
		process_worklist_item(mp, LK_NOWAIT);
		td->td_pflags &= ~TDP_SOFTDEP;
		stat_worklist_push += 2;
		return(1);
	}
	/*
	 * Next, we attempt to speed up the syncer process. If that
	 * is successful, then we allow the process to continue.
	 */
	if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
		return(0);
	/*
	 * If we are resource constrained on inode dependencies, try
	 * flushing some dirty inodes. Otherwise, we are constrained
	 * by file deletions, so try accelerating flushes of directories
	 * with removal dependencies. We would like to do the cleanup
	 * here, but we probably hold an inode locked at this point and
	 * that might deadlock against one that we try to clean. So,
	 * the best that we can do is request the syncer daemon to do
	 * the cleanup for us.
	 */
	switch (resource) {

	case FLUSH_INODES:
		stat_ino_limit_push += 1;
		req_clear_inodedeps += 1;
		stat_countp = &stat_ino_limit_hit;
		break;

	case FLUSH_REMOVE:
	case FLUSH_REMOVE_WAIT:
		stat_blk_limit_push += 1;
		req_clear_remove += 1;
		stat_countp = &stat_blk_limit_hit;
		break;

	default:
		panic("request_cleanup: unknown type");
	}
	/*
	 * Hopefully the syncer daemon will catch up and awaken us.
	 * We wait at most tickdelay before proceeding in any case.
	 */
	proc_waiting += 1;
	/* Arm the timeout only if no pause_timer callout is pending. */
	if (handle.callout == NULL)
		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
	proc_waiting -= 1;
	return (1);
}

/*
 * Awaken processes pausing in request_cleanup and clear proc_waiting
 * to indicate that there is no longer a timer running.
 */
static void
pause_timer(arg)
	void *arg;
{

	ACQUIRE_LOCK(&lk);
	*stat_countp += 1;
	wakeup_one(&proc_waiting);
	/* Re-arm while any waiters remain; otherwise mark no timer running. */
	if (proc_waiting > 0)
		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
	else
		handle.callout = NULL;
	FREE_LOCK(&lk);
}

/*
 * Flush out a directory with at least one removal dependency in an effort to
 * reduce the number of dirrem, freefile, and freeblks dependency structures.
 */
static void
clear_remove(td)
	struct thread *td;
{
	struct pagedep_hashhead *pagedephd;
	struct pagedep *pagedep;
	static int next = 0;	/* rotates the starting hash bucket */
	struct mount *mp;
	struct vnode *vp;
	int error, cnt;
	ino_t ino;

	mtx_assert(&lk, MA_OWNED);

	for (cnt = 0; cnt < pagedep_hash; cnt++) {
		pagedephd = &pagedep_hashtbl[next++];
		if (next >= pagedep_hash)
			next = 0;
		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
				continue;
			mp = pagedep->pd_list.wk_mp;
			ino = pagedep->pd_ino;
			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
				continue;
			FREE_LOCK(&lk);
			if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
				softdep_error("clear_remove: vget", error);
				vn_finished_write(mp);
				ACQUIRE_LOCK(&lk);
				return;
			}
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_remove: fsync", error);
			VI_LOCK(vp);
			drain_output(vp);
			VI_UNLOCK(vp);
			vput(vp);
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
	}
}

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 */
static void
clear_inodedeps(td)
	struct thread *td;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	static int next = 0;	/* rotates the starting hash bucket */
	struct mount *mp;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Pick a random inode dependency to be cleared.
	 * We will then gather up all the inodes in its block
	 * that have dependencies and flush them out.
	 */
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[next++];
		if (next >= inodedep_hash)
			next = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL)
		return;
	fs = inodedep->id_fs;
	mp = inodedep->id_list.wk_mp;
	/*
	 * Find the last inode in the block with dependencies.
	 * NOTE(review): the mask assumes INOPB(fs) is a power of
	 * two -- confirm against fs layout invariants.
	 */
	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			continue;
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
			continue;
		FREE_LOCK(&lk);
		if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		if (ino == lastino) {
			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_inodedeps: fsync2", error);
			VI_LOCK(vp);
			drain_output(vp);
			VI_UNLOCK(vp);
		}
		vput(vp);
		vn_finished_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written. If wantcount
 * is set, return number of dependencies, otherwise just yes or no.
 */
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;
{
	struct worklist *wk;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct diradd *dap;
	int i, retval;

	retval = 0;
	ACQUIRE_LOCK(&lk);
	/* Walk every dependency hung off this buffer. */
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
				/* direct block pointer dependency
				   (ext-attr update list) */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);

			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			for (i = 0; i < DAHASHSZ; i++) {

				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
		case D_MKDIR:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_check_for_rollback: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(&lk);
	return retval;
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with a locked mtx parameter.
 * Return acquired buffer or NULL on failure.
 *
 * On success the buffer is locked, removed from its free queue
 * (bremfree), and known to be delayed-write (B_DELWRI); mtx is still
 * held.  On every NULL return, mtx is reacquired before returning,
 * though it may have been dropped in the interim.
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;
	struct mtx *mtx;
	int waitfor;
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * Sleep for the lock, releasing mtx (LK_INTERLOCK);
		 * LK_SLEEPFAIL makes the acquisition fail once the
		 * sleep completes.
		 */
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have
		 * dropped mtx, which may violate our guarantee, so
		 * give the lock back and report failure.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		/* A background write is in progress on this buffer. */
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			/* Re-test under the bufobj lock before sleeping. */
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		/* Caller only wants dirty (delayed-write) buffers. */
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);
	return (bp);
}


/*
 * Check if it is safe to suspend the file system now. On entry,
 * the vnode interlock for devvp should be held. Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;

	/*
	 * Try-lock loop: we must end up holding the devvp interlock,
	 * the softdep lock and the mount interlock simultaneously, but
	 * cannot block on the latter two while holding the interlock.
	 * On any trylock failure, back off (drop what we hold), block
	 * for the contended lock once to let its holder finish, and
	 * retry from the top.
	 */
	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			VI_UNLOCK(devvp);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			VI_LOCK(devvp);
			continue;
		}
		if (!MNT_ITRYLOCK(mp)) {
			FREE_LOCK(&lk);
			VI_UNLOCK(devvp);
			MNT_ILOCK(mp);
			MNT_IUNLOCK(mp);
			VI_LOCK(devvp);
			continue;
		}
		if (mp->mnt_secondary_writes != 0) {
			/* Wait for in-progress secondary writes to drain. */
			FREE_LOCK(&lk);
			VI_UNLOCK(devvp);
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       (PUSER - 1) | PDROP, "secwr", 0);
			VI_LOCK(devvp);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_deps != 0 ||
	    ump->softdep_deps != 0 ||
	    softdep_accdeps != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	/* Return with only the mount interlock held, per the contract. */
	FREE_LOCK(&lk);
	VI_UNLOCK(devvp);
	return (error);
}


/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated. These will
 * later be used to detect that softdep processing has occurred.
 */
void
softdep_get_depcounts(struct mount *mp,
		      int *softdep_depsp,
		      int *softdep_accdepsp)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	/* Snapshot both counters atomically under the softdep lock. */
	ACQUIRE_LOCK(&lk);
	*softdep_depsp = ump->softdep_deps;
	*softdep_accdepsp = ump->softdep_accdeps;
	FREE_LOCK(&lk);
}

/*
 * Wait for pending output on a vnode to complete.
 * Must be called with vnode lock and interlock locked.
 *
 * XXX: Should just be a call to bufobj_wwait().
 */
static void
drain_output(vp)
	struct vnode *vp;
{
	ASSERT_VOP_LOCKED(vp, "drain_output");
	ASSERT_VI_LOCKED(vp, "drain_output");

	/* Sleep until all writes in progress on the vnode have finished;
	   the interlock is dropped while asleep and reacquired by msleep. */
	while (vp->v_bufobj.bo_numoutput) {
		vp->v_bufobj.bo_flag |= BO_WWAIT;
		msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
		    VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
	}
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies. This should only happen if an I/O error has
 * occurred. The routine is called with the buffer locked.
 */
static void
softdep_deallocate_dependencies(bp)
	struct buf *bp;
{

	if ((bp->b_ioflags & BIO_ERROR) == 0)
		panic("softdep_deallocate_dependencies: dangling deps");
	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	/* Losing dependencies is unrecoverable; give up loudly. */
	panic("softdep_deallocate_dependencies: unrecovered I/O error");
}

/*
 * Function to handle asynchronous write errors in the filesystem.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}

#endif /* SOFTUPDATES */