ffs_softdep.c revision 46609
/*
 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * The following are the copyrights and redistribution conditions that
 * apply to this copy of the soft update software. For a license
 * to use, redistribute or sell the soft update software under
 * conditions other than those described here, please contact the
 * author at one of the following addresses:
 *
 *	Marshall Kirk McKusick		mckusick@mckusick.com
 *	1614 Oxford Street		+1-510-843-9542
 *	Berkeley, CA 94709-1608
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. None of the names of McKusick, Ganger, Patt, or the University of
 *    Michigan may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 * 4. Redistributions in any form must be accompanied by information on
 *    how to obtain complete source code for any accompanying software
 *    that uses this software. This source code must either be included
 *    in the distribution or be available for no more than the cost of
 *    distribution plus a nominal fee, and must be freely redistributable
 *    under reasonable conditions. For an executable file, complete
 *    source code means the source code for all modules it contains.
 *    It does not mean source code for modules or files that typically
 *    accompany the operating system on which the executable file runs,
 *    e.g., standard library modules or system header files.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
541539Srgrimes * 551539Srgrimes * from: @(#)ffs_softdep.c 9.35 (McKusick) 5/6/99 563070Spst * $Id: ffs_softdep.c,v 1.24 1999/03/02 06:38:07 mckusick Exp $ 571539Srgrimes */ 583070Spst 5955163Sshin/* 603070Spst * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide. 61156960Sume */ 623070Spst#ifndef DIAGNOSTIC 63170244Sume#define DIAGNOSTIC 6421055Speter#endif 6521055Speter#ifndef DEBUG 6621055Speter#define DEBUG 673070Spst#endif 683070Spst 693070Spst#include <sys/param.h> 703070Spst#include <sys/buf.h> 71269867Sume#include <sys/kernel.h> 723070Spst#include <sys/malloc.h> 73170244Sume#include <sys/mount.h> 74156960Sume#include <sys/proc.h> 75156960Sume#include <sys/syslog.h> 76156960Sume#include <sys/systm.h> 77156960Sume#include <sys/vnode.h> 78156960Sume#include <miscfs/specfs/specdev.h> 79156960Sume#include <ufs/ufs/dir.h> 80156960Sume#include <ufs/ufs/quota.h> 81156960Sume#include <ufs/ufs/inode.h> 82156960Sume#include <ufs/ufs/ufsmount.h> 83156960Sume#include <ufs/ffs/fs.h> 84156960Sume#include <ufs/ffs/softdep.h> 85156960Sume#include <ufs/ffs/ffs_extern.h> 86156960Sume#include <ufs/ufs/ufs_extern.h> 87156960Sume 88156960Sume/* 89156960Sume * These definitions need to be adapted to the system to which 90156960Sume * this file is being ported. 91156960Sume */ 92156960Sume/* 93170244Sume * malloc types defined for the softdep system. 
94156960Sume */ 95156960SumeMALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); 96156960SumeMALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); 97156960SumeMALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); 98170244SumeMALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); 991539SrgrimesMALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); 1001539SrgrimesMALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); 101156960SumeMALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); 1021539SrgrimesMALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); 1031539SrgrimesMALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); 1041539SrgrimesMALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); 1053070SpstMALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); 1061539SrgrimesMALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); 1071539SrgrimesMALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); 108156960Sume 109156960Sume#define D_PAGEDEP 0 110156960Sume#define D_INODEDEP 1 111156960Sume#define D_NEWBLK 2 112156960Sume#define D_BMSAFEMAP 3 113156960Sume#define D_ALLOCDIRECT 4 114156960Sume#define D_INDIRDEP 5 115156960Sume#define D_ALLOCINDIR 6 116156960Sume#define D_FREEFRAG 7 117156960Sume#define D_FREEBLKS 8 118156960Sume#define D_FREEFILE 9 119156960Sume#define D_DIRADD 10 120170244Sume#define D_MKDIR 11 121170244Sume#define D_DIRREM 12 122170244Sume#define D_LAST D_DIRREM 123156960Sume 124156960Sume/* 125170244Sume * translate from workitem type to memory type 1261539Srgrimes * MUST match the defines above, such that memtype[D_XXX] == M_XXX 1271539Srgrimes */ 128170244Sumestatic struct malloc_type *memtype[] = { 129170244Sume M_PAGEDEP, 130170244Sume M_INODEDEP, 131170244Sume M_NEWBLK, 132170244Sume M_BMSAFEMAP, 133170244Sume M_ALLOCDIRECT, 134170244Sume M_INDIRDEP, 135170244Sume M_ALLOCINDIR, 
136170244Sume M_FREEFRAG, 137170244Sume M_FREEBLKS, 138170244Sume M_FREEFILE, 139156960Sume M_DIRADD, 140156960Sume M_MKDIR, 1411539Srgrimes M_DIRREM 142170244Sume}; 143170244Sume 144156960Sume#define DtoM(type) (memtype[type]) 145156960Sume 146156960Sume/* 147156960Sume * Names of malloc types. 148156960Sume */ 149156960Sume#define TYPENAME(type) \ 150156960Sume ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") 151170244Sume#define CURPROC curproc 152156960Sume/* 153170244Sume * End system adaptaion definitions. 154156960Sume */ 155170244Sume 1563070Spst/* 157170244Sume * Internal function prototypes. 158170244Sume */ 159170244Sumestatic void softdep_error __P((char *, int)); 160170244Sumestatic void drain_output __P((struct vnode *, int)); 161170244Sumestatic int getdirtybuf __P((struct buf **, int)); 162156960Sumestatic void clear_remove __P((struct proc *)); 163170244Sumestatic void clear_inodedeps __P((struct proc *)); 164156960Sumestatic int flush_pagedep_deps __P((struct vnode *, struct mount *, 165170244Sume struct diraddhd *)); 166156960Sumestatic int flush_inodedep_deps __P((struct fs *, ino_t)); 167170244Sumestatic int handle_written_filepage __P((struct pagedep *, struct buf *)); 168170244Sumestatic void diradd_inode_written __P((struct diradd *, struct inodedep *)); 1693070Spststatic int handle_written_inodeblock __P((struct inodedep *, struct buf *)); 1703070Spststatic void handle_allocdirect_partdone __P((struct allocdirect *)); 1713070Spststatic void handle_allocindir_partdone __P((struct allocindir *)); 1723070Spststatic void initiate_write_filepage __P((struct pagedep *, struct buf *)); 1733070Spststatic void handle_written_mkdir __P((struct mkdir *, int)); 174170244Sumestatic void initiate_write_inodeblock __P((struct inodedep *, struct buf *)); 175170244Sumestatic void handle_workitem_freefile __P((struct freefile *)); 176170244Sumestatic void handle_workitem_remove __P((struct dirrem *)); 177170244Sumestatic struct dirrem 
*newdirrem __P((struct buf *, struct inode *, 178170244Sume struct inode *, int)); 179292216Svangyzenstatic void free_diradd __P((struct diradd *)); 180156960Sumestatic void free_allocindir __P((struct allocindir *, struct inodedep *)); 181156960Sumestatic int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t, 182269867Sume long *)); 183156960Sumestatic void deallocate_dependencies __P((struct buf *, struct inodedep *)); 184156960Sumestatic void free_allocdirect __P((struct allocdirectlst *, 185170244Sume struct allocdirect *, int)); 186156960Sumestatic int free_inodedep __P((struct inodedep *)); 187229781Suqsstatic void handle_workitem_freeblocks __P((struct freeblks *)); 188156960Sumestatic void merge_inode_lists __P((struct inodedep *)); 189156960Sumestatic void setup_allocindir_phase2 __P((struct buf *, struct inode *, 190269867Sume struct allocindir *)); 1911539Srgrimesstatic struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t, 1921539Srgrimes ufs_daddr_t)); 193156960Sumestatic void handle_workitem_freefrag __P((struct freefrag *)); 194156960Sumestatic struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long)); 195156960Sumestatic void allocdirect_merge __P((struct allocdirectlst *, 196156960Sume struct allocdirect *, struct allocdirect *)); 197156960Sumestatic struct bmsafemap *bmsafemap_lookup __P((struct buf *)); 198156960Sumestatic int newblk_lookup __P((struct fs *, ufs_daddr_t, int, 199156960Sume struct newblk **)); 200156960Sumestatic int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **)); 201170244Sumestatic int pagedep_lookup __P((struct inode *, ufs_lbn_t, int, 202156960Sume struct pagedep **)); 203170244Sumestatic void pause_timer __P((void *)); 204156960Sumestatic int checklimit __P((long *, int)); 205170244Sumestatic void add_to_worklist __P((struct worklist *)); 206156960Sume 207156960Sume/* 208170244Sume * Exported softdep operations. 
209156960Sume */ 21055163Sshinstruct bio_ops bioops = { 211170244Sume softdep_disk_io_initiation, /* io_start */ 212170244Sume softdep_disk_write_complete, /* io_complete */ 213170244Sume softdep_deallocate_dependencies, /* io_deallocate */ 214170244Sume softdep_fsync, /* io_fsync */ 215170244Sume softdep_process_worklist, /* io_sync */ 216170244Sume}; 217156960Sume 21855163Sshin/* 219156960Sume * Locking primitives. 220170244Sume * 221170244Sume * For a uniprocessor, all we need to do is protect against disk 222170244Sume * interrupts. For a multiprocessor, this lock would have to be 223156960Sume * a mutex. A single mutex is used throughout this file, though 224170244Sume * finer grain locking could be used if contention warranted it. 2251539Srgrimes * 2261539Srgrimes * For a multiprocessor, the sleep call would accept a lock and 227170244Sume * release it after the sleep processing was complete. In a uniprocessor 228170244Sume * implementation there is no such interlock, so we simple mark 229170244Sume * the places where it needs to be done with the `interlocked' form 230170244Sume * of the lock calls. Since the uniprocessor sleep already interlocks 231170244Sume * the spl, there is nothing that really needs to be done. 
232170244Sume */ 233170244Sume#ifndef /* NOT */ DEBUG 234170244Sumestatic struct lockit { 235170244Sume int lkt_spl; 236170244Sume} lk = { 0 }; 237170244Sume#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio() 238170244Sume#define FREE_LOCK(lk) splx((lk)->lkt_spl) 239170244Sume#define ACQUIRE_LOCK_INTERLOCKED(lk) 240170244Sume#define FREE_LOCK_INTERLOCKED(lk) 241170244Sume 242170244Sume#else /* DEBUG */ 243170244Sumestatic struct lockit { 244170244Sume int lkt_spl; 245186090Sume pid_t lkt_held; 246170244Sume} lk = { 0, -1 }; 247170244Sumestatic int lockcnt; 248156960Sume 24978012Sumestatic void acquire_lock __P((struct lockit *)); 250170244Sumestatic void free_lock __P((struct lockit *)); 251170244Sumestatic void acquire_lock_interlocked __P((struct lockit *)); 252170244Sumestatic void free_lock_interlocked __P((struct lockit *)); 2531539Srgrimes 254156960Sume#define ACQUIRE_LOCK(lk) acquire_lock(lk) 255156960Sume#define FREE_LOCK(lk) free_lock(lk) 2561539Srgrimes#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk) 257170244Sume#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk) 2581539Srgrimes 2591539Srgrimesstatic void 2603070Spstacquire_lock(lk) 26136888Speter struct lockit *lk; 2623070Spst{ 2633070Spst 2643070Spst if (lk->lkt_held != -1) 2653070Spst if (lk->lkt_held == CURPROC->p_pid) 2663070Spst panic("softdep_lock: locking against myself"); 2673070Spst else 2683070Spst panic("softdep_lock: lock held by %d", lk->lkt_held); 2693070Spst lk->lkt_spl = splbio(); 2703070Spst lk->lkt_held = CURPROC->p_pid; 2713070Spst lockcnt++; 2723070Spst} 2733070Spst 274156960Sumestatic void 275156960Sumefree_lock(lk) 276156960Sume struct lockit *lk; 2771539Srgrimes{ 278156960Sume 279156960Sume if (lk->lkt_held == -1) 280156960Sume panic("softdep_unlock: lock not held"); 281156960Sume lk->lkt_held = -1; 282156960Sume splx(lk->lkt_spl); 2831539Srgrimes} 284156960Sume 285156960Sumestatic void 286156960Sumeacquire_lock_interlocked(lk) 287156960Sume struct lockit 
*lk; 288156960Sume{ 289156960Sume 290156960Sume if (lk->lkt_held != -1) 291156960Sume if (lk->lkt_held == CURPROC->p_pid) 292156960Sume panic("softdep_lock_interlocked: locking against self"); 293156960Sume else 294156960Sume panic("softdep_lock_interlocked: lock held by %d", 295156960Sume lk->lkt_held); 296156960Sume lk->lkt_held = CURPROC->p_pid; 297156960Sume lockcnt++; 298156960Sume} 2993070Spst 300126243Sgreenstatic void 301156960Sumefree_lock_interlocked(lk) 302156960Sume struct lockit *lk; 303156960Sume{ 304156960Sume 305156960Sume if (lk->lkt_held == -1) 306156960Sume panic("softdep_unlock_interlocked: lock not held"); 307156960Sume lk->lkt_held = -1; 308156960Sume} 309156960Sume#endif /* DEBUG */ 310156960Sume 311156960Sume/* 312156960Sume * Place holder for real semaphores. 313156960Sume */ 314156960Sumestruct sema { 315156960Sume int value; 316156960Sume pid_t holder; 317156960Sume char *name; 318126243Sgreen int prio; 319156960Sume int timo; 32055163Sshin}; 321156960Sumestatic void sema_init __P((struct sema *, char *, int, int)); 322156960Sumestatic int sema_get __P((struct sema *, struct lockit *)); 323156960Sumestatic void sema_release __P((struct sema *)); 324269867Sume 325156960Sumestatic void 326156960Sumesema_init(semap, name, prio, timo) 327156960Sume struct sema *semap; 328156960Sume char *name; 329156960Sume int prio, timo; 330156960Sume{ 33121055Speter 33221055Speter semap->holder = -1; 333156960Sume semap->value = 0; 334156960Sume semap->name = name; 3351539Srgrimes semap->prio = prio; 336156960Sume semap->timo = timo; 337156960Sume} 338156960Sume 339156960Sumestatic int 340156960Sumesema_get(semap, interlock) 341156960Sume struct sema *semap; 342156960Sume struct lockit *interlock; 343156960Sume{ 344156960Sume 345156960Sume if (semap->value++ > 0) { 346156960Sume if (interlock != NULL) 347156960Sume FREE_LOCK_INTERLOCKED(interlock); 348156960Sume tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo); 349156960Sume if (interlock != 
NULL) { 350156960Sume ACQUIRE_LOCK_INTERLOCKED(interlock); 351156960Sume FREE_LOCK(interlock); 352156960Sume } 353156960Sume return (0); 354156960Sume } 355156960Sume semap->holder = CURPROC->p_pid; 356156960Sume if (interlock != NULL) 357156960Sume FREE_LOCK(interlock); 358156960Sume return (1); 359156960Sume} 360156960Sume 361156960Sumestatic void 362158787Sumesema_release(semap) 363156960Sume struct sema *semap; 364156960Sume{ 365156960Sume 366156960Sume if (semap->value <= 0 || semap->holder != CURPROC->p_pid) 367156960Sume panic("sema_release: not held"); 368156960Sume if (--semap->value > 0) { 369156960Sume semap->value = 0; 370156960Sume wakeup(semap); 371156960Sume } 372156960Sume semap->holder = -1; 373156960Sume} 374156960Sume 375156960Sume/* 376156960Sume * Worklist queue management. 377156960Sume * These routines require that the lock be held. 378156960Sume */ 379156960Sume#ifndef /* NOT */ DEBUG 380156960Sume#define WORKLIST_INSERT(head, item) do { \ 381156960Sume (item)->wk_state |= ONWORKLIST; \ 382269867Sume LIST_INSERT_HEAD(head, item, wk_list); \ 383156960Sume} while (0) 384269867Sume#define WORKLIST_REMOVE(item) do { \ 385156960Sume (item)->wk_state &= ~ONWORKLIST; \ 386156960Sume LIST_REMOVE(item, wk_list); \ 387156960Sume} while (0) 388156960Sume#define WORKITEM_FREE(item, type) FREE(item, DtoM(type)) 389186090Sume 390156960Sume#else /* DEBUG */ 391156960Sumestatic void worklist_insert __P((struct workhead *, struct worklist *)); 392156960Sumestatic void worklist_remove __P((struct worklist *)); 393156960Sumestatic void workitem_free __P((struct worklist *, int)); 394156960Sume 395158518Sume#define WORKLIST_INSERT(head, item) worklist_insert(head, item) 396156960Sume#define WORKLIST_REMOVE(item) worklist_remove(item) 397156960Sume#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) 398156960Sume 399156960Sumestatic void 400156960Sumeworklist_insert(head, item) 401156960Sume struct workhead *head; 402156960Sume struct 
worklist *item; 403158518Sume{ 404156960Sume 405156960Sume if (lk.lkt_held == -1) 406158518Sume panic("worklist_insert: lock not held"); 407156960Sume if (item->wk_state & ONWORKLIST) 408156960Sume panic("worklist_insert: already on list"); 409158518Sume item->wk_state |= ONWORKLIST; 410156960Sume LIST_INSERT_HEAD(head, item, wk_list); 411156960Sume} 412158518Sume 4131539Srgrimesstatic void 41493032Simpworklist_remove(item) 41593032Simp struct worklist *item; 41693032Simp{ 41793032Simp 41893032Simp if (lk.lkt_held == -1) 41993032Simp panic("worklist_remove: lock not held"); 42093032Simp if ((item->wk_state & ONWORKLIST) == 0) 42193032Simp panic("worklist_remove: not on list"); 42293032Simp item->wk_state &= ~ONWORKLIST; 42393032Simp LIST_REMOVE(item, wk_list); 42493032Simp} 42593032Simp 42693032Simpstatic void 42793032Simpworkitem_free(item, type) 428156960Sume struct worklist *item; 429156960Sume int type; 430156960Sume{ 431156960Sume 43293032Simp if (item->wk_state & ONWORKLIST) 43393032Simp panic("workitem_free: still on list"); 43493032Simp if (item->wk_type != type) 435156960Sume panic("workitem_free: type mismatch"); 436156960Sume FREE(item, DtoM(type)); 43793032Simp} 43893032Simp#endif /* DEBUG */ 439156960Sume 44093032Simp/* 44193032Simp * Workitem queue management 44293032Simp */ 44393032Simpstatic struct workhead softdep_workitem_pending; 44493032Simpstatic int softdep_worklist_busy; 44593032Simpstatic int max_softdeps; /* maximum number of structs before slowdown */ 44693032Simpstatic int tickdelay = 2; /* number of ticks to pause during slowdown */ 447269867Sumestatic int rush_requests; /* number of times I/O speeded up */ 44893032Simpstatic int blk_limit_push; /* number of times block limit neared */ 449269867Sumestatic int ino_limit_push; /* number of times inode limit neared */ 450156960Sumestatic int blk_limit_hit; /* number of times block slowdown imposed */ 451156960Sumestatic int ino_limit_hit; /* number of times inode slowdown imposed */ 
45293032Simpstatic int proc_waiting; /* tracks whether we have a timeout posted */ 45393032Simpstatic struct proc *filesys_syncer; /* proc of filesystem syncer process */ 45493032Simpstatic int req_clear_inodedeps; /* syncer process flush some inodedeps */ 455156960Sumestatic int req_clear_remove; /* syncer process flush some freeblks */ 456156960Sume#ifdef DEBUG 457156960Sume#include <vm/vm.h> 458156960Sume#include <sys/sysctl.h> 459156960Sume#if defined(__FreeBSD__) 460156960SumeSYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); 461156960SumeSYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); 462156960SumeSYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &blk_limit_push, 0,""); 463156960SumeSYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &ino_limit_push, 0,""); 464156960SumeSYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &blk_limit_hit, 0, ""); 465156960SumeSYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &ino_limit_hit, 0, ""); 466156960SumeSYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, ""); 467156960Sume#else /* !__FreeBSD__ */ 468156960Sumestruct ctldebug debug7 = { "max_softdeps", &max_softdeps }; 469156960Sumestruct ctldebug debug8 = { "tickdelay", &tickdelay }; 470156960Sumestruct ctldebug debug9 = { "rush_requests", &rush_requests }; 471156960Sumestruct ctldebug debug10 = { "blk_limit_push", &blk_limit_push }; 472156960Sumestruct ctldebug debug11 = { "ino_limit_push", &ino_limit_push }; 473156960Sumestruct ctldebug debug12 = { "blk_limit_hit", &blk_limit_hit }; 474158787Sumestruct ctldebug debug13 = { "ino_limit_hit", &ino_limit_hit }; 475156960Sume#endif /* !__FreeBSD__ */ 476156960Sume 477156960Sume#endif /* DEBUG */ 478156960Sume 479156960Sume/* 480186090Sume * Add an item to the end of the work queue. 481186090Sume * This routine requires that the lock be held. 482156960Sume * This is the only routine that adds items to the list. 
483156960Sume * The following routine is the only one that removes items 484156960Sume * and does so in order from first to last. 485158518Sume */ 486156960Sumestatic void 487156960Sumeadd_to_worklist(wk) 488156960Sume struct worklist *wk; 489156960Sume{ 490156960Sume static struct worklist *worklist_tail; 491156960Sume 492156960Sume if (wk->wk_state & ONWORKLIST) 493156960Sume panic("add_to_worklist: already on list"); 494156960Sume wk->wk_state |= ONWORKLIST; 495156960Sume if (LIST_FIRST(&softdep_workitem_pending) == NULL) 496156960Sume LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list); 497156960Sume else 498156960Sume LIST_INSERT_AFTER(worklist_tail, wk, wk_list); 499156960Sume worklist_tail = wk; 5001539Srgrimes} 5011539Srgrimes 5021539Srgrimes/* 503170244Sume * Process that runs once per second to handle items in the background queue. 504 * 505 * Note that we ensure that everything is done in the order in which they 506 * appear in the queue. The code below depends on this property to ensure 507 * that blocks of a file are freed before the inode itself is freed. This 508 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 509 * until all the old ones have been purged from the dependency lists. 510 */ 511int 512softdep_process_worklist(matchmnt) 513 struct mount *matchmnt; 514{ 515 struct proc *p = CURPROC; 516 struct worklist *wk; 517 struct fs *matchfs; 518 int matchcnt; 519 520 /* 521 * Record the process identifier of our caller so that we can 522 * give this process preferential treatment in checklimit below. 523 */ 524 filesys_syncer = p; 525 matchcnt = 0; 526 matchfs = NULL; 527 if (matchmnt != NULL) 528 matchfs = VFSTOUFS(matchmnt)->um_fs; 529 /* 530 * There is no danger of having multiple processes run this 531 * code. It is single threaded solely so that softdep_flushfiles 532 * (below) can get an accurate count of the number of items 533 * related to its mount point that are in the list. 
534 */ 535 if (softdep_worklist_busy && matchmnt == NULL) 536 return (-1); 537 /* 538 * If requested, try removing inode or removal dependencies. 539 */ 540 if (req_clear_inodedeps) { 541 clear_inodedeps(p); 542 req_clear_inodedeps = 0; 543 wakeup(&proc_waiting); 544 } 545 if (req_clear_remove) { 546 clear_remove(p); 547 req_clear_remove = 0; 548 wakeup(&proc_waiting); 549 } 550 ACQUIRE_LOCK(&lk); 551 while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) { 552 WORKLIST_REMOVE(wk); 553 FREE_LOCK(&lk); 554 switch (wk->wk_type) { 555 556 case D_DIRREM: 557 /* removal of a directory entry */ 558 if (WK_DIRREM(wk)->dm_mnt == matchmnt) 559 matchcnt += 1; 560 handle_workitem_remove(WK_DIRREM(wk)); 561 break; 562 563 case D_FREEBLKS: 564 /* releasing blocks and/or fragments from a file */ 565 if (WK_FREEBLKS(wk)->fb_fs == matchfs) 566 matchcnt += 1; 567 handle_workitem_freeblocks(WK_FREEBLKS(wk)); 568 break; 569 570 case D_FREEFRAG: 571 /* releasing a fragment when replaced as a file grows */ 572 if (WK_FREEFRAG(wk)->ff_fs == matchfs) 573 matchcnt += 1; 574 handle_workitem_freefrag(WK_FREEFRAG(wk)); 575 break; 576 577 case D_FREEFILE: 578 /* releasing an inode when its link count drops to 0 */ 579 if (WK_FREEFILE(wk)->fx_fs == matchfs) 580 matchcnt += 1; 581 handle_workitem_freefile(WK_FREEFILE(wk)); 582 break; 583 584 default: 585 panic("%s_process_worklist: Unknown type %s", 586 "softdep", TYPENAME(wk->wk_type)); 587 /* NOTREACHED */ 588 } 589 if (softdep_worklist_busy && matchmnt == NULL) 590 return (-1); 591 /* 592 * If requested, try removing inode or removal dependencies. 
593 */ 594 if (req_clear_inodedeps) { 595 clear_inodedeps(p); 596 req_clear_inodedeps = 0; 597 wakeup(&proc_waiting); 598 } 599 if (req_clear_remove) { 600 clear_remove(p); 601 req_clear_remove = 0; 602 wakeup(&proc_waiting); 603 } 604 ACQUIRE_LOCK(&lk); 605 } 606 FREE_LOCK(&lk); 607 return (matchcnt); 608} 609 610/* 611 * Purge the work list of all items associated with a particular mount point. 612 */ 613int 614softdep_flushfiles(oldmnt, flags, p) 615 struct mount *oldmnt; 616 int flags; 617 struct proc *p; 618{ 619 struct vnode *devvp; 620 int error, loopcnt; 621 622 /* 623 * Await our turn to clear out the queue. 624 */ 625 while (softdep_worklist_busy) 626 tsleep(&lbolt, PRIBIO, "softflush", 0); 627 softdep_worklist_busy = 1; 628 if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) { 629 softdep_worklist_busy = 0; 630 return (error); 631 } 632 /* 633 * Alternately flush the block device associated with the mount 634 * point and process any dependencies that the flushing 635 * creates. In theory, this loop can happen at most twice, 636 * but we give it a few extra just to be sure. 637 */ 638 devvp = VFSTOUFS(oldmnt)->um_devvp; 639 for (loopcnt = 10; loopcnt > 0; loopcnt--) { 640 if (softdep_process_worklist(oldmnt) == 0) { 641 /* 642 * Do another flush in case any vnodes were brought in 643 * as part of the cleanup operations. 644 */ 645 if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) 646 break; 647 /* 648 * If we still found nothing to do, we are really done. 649 */ 650 if (softdep_process_worklist(oldmnt) == 0) 651 break; 652 } 653 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); 654 error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p); 655 VOP_UNLOCK(devvp, 0, p); 656 if (error) 657 break; 658 } 659 softdep_worklist_busy = 0; 660 /* 661 * If we are unmounting then it is an error to fail. If we 662 * are simply trying to downgrade to read-only, then filesystem 663 * activity can keep us busy forever, so we just fail with EBUSY. 
664 */ 665 if (loopcnt == 0) { 666 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 667 panic("softdep_flushfiles: looping"); 668 error = EBUSY; 669 } 670 return (error); 671} 672 673/* 674 * Structure hashing. 675 * 676 * There are three types of structures that can be looked up: 677 * 1) pagedep structures identified by mount point, inode number, 678 * and logical block. 679 * 2) inodedep structures identified by mount point and inode number. 680 * 3) newblk structures identified by mount point and 681 * physical block number. 682 * 683 * The "pagedep" and "inodedep" dependency structures are hashed 684 * separately from the file blocks and inodes to which they correspond. 685 * This separation helps when the in-memory copy of an inode or 686 * file block must be replaced. It also obviates the need to access 687 * an inode or file page when simply updating (or de-allocating) 688 * dependency structures. Lookup of newblk structures is needed to 689 * find newly allocated blocks when trying to associate them with 690 * their allocdirect or allocindir structure. 691 * 692 * The lookup routines optionally create and hash a new instance when 693 * an existing entry is not found. 694 */ 695#define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 696 697/* 698 * Structures and routines associated with pagedep caching. 699 */ 700LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 701u_long pagedep_hash; /* size of hash table - 1 */ 702#define PAGEDEP_HASH(mp, inum, lbn) \ 703 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 704 pagedep_hash]) 705static struct sema pagedep_in_progress; 706 707/* 708 * Look up a pagedep. Return 1 if found, 0 if not found. 709 * If not found, allocate if DEPALLOC flag is passed. 710 * Found or allocated entry is returned in pagedeppp. 711 * This routine must be called with splbio interrupts blocked. 
712 */ 713static int 714pagedep_lookup(ip, lbn, flags, pagedeppp) 715 struct inode *ip; 716 ufs_lbn_t lbn; 717 int flags; 718 struct pagedep **pagedeppp; 719{ 720 struct pagedep *pagedep; 721 struct pagedep_hashhead *pagedephd; 722 struct mount *mp; 723 int i; 724 725#ifdef DEBUG 726 if (lk.lkt_held == -1) 727 panic("pagedep_lookup: lock not held"); 728#endif 729 mp = ITOV(ip)->v_mount; 730 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); 731top: 732 for (pagedep = LIST_FIRST(pagedephd); pagedep; 733 pagedep = LIST_NEXT(pagedep, pd_hash)) 734 if (ip->i_number == pagedep->pd_ino && 735 lbn == pagedep->pd_lbn && 736 mp == pagedep->pd_mnt) 737 break; 738 if (pagedep) { 739 *pagedeppp = pagedep; 740 return (1); 741 } 742 if ((flags & DEPALLOC) == 0) { 743 *pagedeppp = NULL; 744 return (0); 745 } 746 if (sema_get(&pagedep_in_progress, &lk) == 0) { 747 ACQUIRE_LOCK(&lk); 748 goto top; 749 } 750 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, 751 M_WAITOK); 752 bzero(pagedep, sizeof(struct pagedep)); 753 pagedep->pd_list.wk_type = D_PAGEDEP; 754 pagedep->pd_mnt = mp; 755 pagedep->pd_ino = ip->i_number; 756 pagedep->pd_lbn = lbn; 757 LIST_INIT(&pagedep->pd_dirremhd); 758 LIST_INIT(&pagedep->pd_pendinghd); 759 for (i = 0; i < DAHASHSZ; i++) 760 LIST_INIT(&pagedep->pd_diraddhd[i]); 761 ACQUIRE_LOCK(&lk); 762 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 763 sema_release(&pagedep_in_progress); 764 *pagedeppp = pagedep; 765 return (0); 766} 767 768/* 769 * Structures and routines associated with inodedep caching. 770 */ 771LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 772static u_long inodedep_hash; /* size of hash table - 1 */ 773static long num_inodedep; /* number of inodedep allocated */ 774#define INODEDEP_HASH(fs, inum) \ 775 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 776static struct sema inodedep_in_progress; 777 778/* 779 * Look up a inodedep. Return 1 if found, 0 if not found. 
780 * If not found, allocate if DEPALLOC flag is passed. 781 * Found or allocated entry is returned in inodedeppp. 782 * This routine must be called with splbio interrupts blocked. 783 */ 784static int 785inodedep_lookup(fs, inum, flags, inodedeppp) 786 struct fs *fs; 787 ino_t inum; 788 int flags; 789 struct inodedep **inodedeppp; 790{ 791 struct inodedep *inodedep; 792 struct inodedep_hashhead *inodedephd; 793 int firsttry; 794 795#ifdef DEBUG 796 if (lk.lkt_held == -1) 797 panic("inodedep_lookup: lock not held"); 798#endif 799 firsttry = 1; 800 inodedephd = INODEDEP_HASH(fs, inum); 801top: 802 for (inodedep = LIST_FIRST(inodedephd); inodedep; 803 inodedep = LIST_NEXT(inodedep, id_hash)) 804 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 805 break; 806 if (inodedep) { 807 *inodedeppp = inodedep; 808 return (1); 809 } 810 if ((flags & DEPALLOC) == 0) { 811 *inodedeppp = NULL; 812 return (0); 813 } 814 if (firsttry && checklimit(&num_inodedep, 1) == 1) { 815 firsttry = 0; 816 goto top; 817 } 818 if (sema_get(&inodedep_in_progress, &lk) == 0) { 819 ACQUIRE_LOCK(&lk); 820 goto top; 821 } 822 num_inodedep += 1; 823 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), 824 M_INODEDEP, M_WAITOK); 825 inodedep->id_list.wk_type = D_INODEDEP; 826 inodedep->id_fs = fs; 827 inodedep->id_ino = inum; 828 inodedep->id_state = ALLCOMPLETE; 829 inodedep->id_nlinkdelta = 0; 830 inodedep->id_savedino = NULL; 831 inodedep->id_savedsize = -1; 832 inodedep->id_buf = NULL; 833 LIST_INIT(&inodedep->id_pendinghd); 834 LIST_INIT(&inodedep->id_inowait); 835 LIST_INIT(&inodedep->id_bufwait); 836 TAILQ_INIT(&inodedep->id_inoupdt); 837 TAILQ_INIT(&inodedep->id_newinoupdt); 838 ACQUIRE_LOCK(&lk); 839 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 840 sema_release(&inodedep_in_progress); 841 *inodedeppp = inodedep; 842 return (0); 843} 844 845/* 846 * Structures and routines associated with newblk caching. 
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static struct sema newblk_in_progress;

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(fs, newblkno, flags, newblkpp)
	struct fs *fs;			/* filesystem containing the block */
	ufs_daddr_t newblkno;		/* disk block number being tracked */
	int flags;			/* DEPALLOC => allocate if not found */
	struct newblk **newblkpp;	/* returned found or created entry */
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(fs, newblkno);
top:
	/* Search the hash chain for a matching <fs, blkno> pair. */
	for (newblk = LIST_FIRST(newblkhd); newblk;
	     newblk = LIST_NEXT(newblk, nb_hash))
		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
			break;
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*newblkpp = NULL;
		return (0);
	}
	/*
	 * Serialize creation.  No interlock is passed (second argument
	 * is 0); after sleeping, rescan in case another process created
	 * the entry in the interim.
	 */
	if (sema_get(&newblk_in_progress, 0) == 0)
		goto top;
	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
	    M_NEWBLK, M_WAITOK);
	newblk->nb_state = 0;
	newblk->nb_fs = fs;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	sema_release(&newblk_in_progress);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during filesystem system initialization before
 * mounting any file systems.
899 */ 900void 901softdep_initialize() 902{ 903 904 LIST_INIT(&mkdirlisthd); 905 LIST_INIT(&softdep_workitem_pending); 906 max_softdeps = desiredvnodes * 8; 907 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, 908 &pagedep_hash); 909 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); 910 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 911 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0); 912 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); 913 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); 914} 915 916/* 917 * Called at mount time to notify the dependency code that a 918 * filesystem wishes to use it. 919 */ 920int 921softdep_mount(devvp, mp, fs, cred) 922 struct vnode *devvp; 923 struct mount *mp; 924 struct fs *fs; 925 struct ucred *cred; 926{ 927 struct csum cstotal; 928 struct cg *cgp; 929 struct buf *bp; 930 int error, cyl; 931 932 mp->mnt_flag &= ~MNT_ASYNC; 933 mp->mnt_flag |= MNT_SOFTDEP; 934 /* 935 * When doing soft updates, the counters in the 936 * superblock may have gotten out of sync, so we have 937 * to scan the cylinder groups and recalculate them. 938 */ 939 if (fs->fs_clean != 0) 940 return (0); 941 bzero(&cstotal, sizeof cstotal); 942 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 943 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 944 fs->fs_cgsize, cred, &bp)) != 0) { 945 brelse(bp); 946 return (error); 947 } 948 cgp = (struct cg *)bp->b_data; 949 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 950 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 951 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 952 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 953 fs->fs_cs(fs, cyl) = cgp->cg_cs; 954 brelse(bp); 955 } 956#ifdef DEBUG 957 if (!bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 958 printf("ffs_mountfs: superblock updated for soft updates\n"); 959#endif 960 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 961 return (0); 962} 963 964/* 965 * Protecting the freemaps (or bitmaps). 
966 * 967 * To eliminate the need to execute fsck before mounting a file system 968 * after a power failure, one must (conservatively) guarantee that the 969 * on-disk copy of the bitmaps never indicate that a live inode or block is 970 * free. So, when a block or inode is allocated, the bitmap should be 971 * updated (on disk) before any new pointers. When a block or inode is 972 * freed, the bitmap should not be updated until all pointers have been 973 * reset. The latter dependency is handled by the delayed de-allocation 974 * approach described below for block and inode de-allocation. The former 975 * dependency is handled by calling the following procedure when a block or 976 * inode is allocated. When an inode is allocated an "inodedep" is created 977 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 978 * Each "inodedep" is also inserted into the hash indexing structure so 979 * that any additional link additions can be made dependent on the inode 980 * allocation. 981 * 982 * The ufs file system maintains a number of free block counts (e.g., per 983 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 984 * in addition to the bitmaps. These counts are used to improve efficiency 985 * during allocation and therefore must be consistent with the bitmaps. 986 * There is no convenient way to guarantee post-crash consistency of these 987 * counts with simple update ordering, for two main reasons: (1) The counts 988 * and bitmaps for a single cylinder group block are not in the same disk 989 * sector. If a disk write is interrupted (e.g., by power failure), one may 990 * be written and the other not. (2) Some of the counts are located in the 991 * superblock rather than the cylinder group block. So, we focus our soft 992 * updates implementation on protecting the bitmaps. When mounting a 993 * filesystem, we recompute the auxiliary counts from the bitmaps. 
994 */ 995 996/* 997 * Called just after updating the cylinder group block to allocate an inode. 998 */ 999void 1000softdep_setup_inomapdep(bp, ip, newinum) 1001 struct buf *bp; /* buffer for cylgroup block with inode map */ 1002 struct inode *ip; /* inode related to allocation */ 1003 ino_t newinum; /* new inode number being allocated */ 1004{ 1005 struct inodedep *inodedep; 1006 struct bmsafemap *bmsafemap; 1007 1008 /* 1009 * Create a dependency for the newly allocated inode. 1010 * Panic if it already exists as something is seriously wrong. 1011 * Otherwise add it to the dependency list for the buffer holding 1012 * the cylinder group map from which it was allocated. 1013 */ 1014 ACQUIRE_LOCK(&lk); 1015 if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0) 1016 panic("softdep_setup_inomapdep: found inode"); 1017 inodedep->id_buf = bp; 1018 inodedep->id_state &= ~DEPCOMPLETE; 1019 bmsafemap = bmsafemap_lookup(bp); 1020 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 1021 FREE_LOCK(&lk); 1022} 1023 1024/* 1025 * Called just after updating the cylinder group block to 1026 * allocate block or fragment. 1027 */ 1028void 1029softdep_setup_blkmapdep(bp, fs, newblkno) 1030 struct buf *bp; /* buffer for cylgroup block with block map */ 1031 struct fs *fs; /* filesystem doing allocation */ 1032 ufs_daddr_t newblkno; /* number of newly allocated block */ 1033{ 1034 struct newblk *newblk; 1035 struct bmsafemap *bmsafemap; 1036 1037 /* 1038 * Create a dependency for the newly allocated block. 1039 * Add it to the dependency list for the buffer holding 1040 * the cylinder group map from which it was allocated. 
1041 */ 1042 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) 1043 panic("softdep_setup_blkmapdep: found block"); 1044 ACQUIRE_LOCK(&lk); 1045 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp); 1046 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 1047 FREE_LOCK(&lk); 1048} 1049 1050/* 1051 * Find the bmsafemap associated with a cylinder group buffer. 1052 * If none exists, create one. The buffer must be locked when 1053 * this routine is called and this routine must be called with 1054 * splbio interrupts blocked. 1055 */ 1056static struct bmsafemap * 1057bmsafemap_lookup(bp) 1058 struct buf *bp; 1059{ 1060 struct bmsafemap *bmsafemap; 1061 struct worklist *wk; 1062 1063#ifdef DEBUG 1064 if (lk.lkt_held == -1) 1065 panic("bmsafemap_lookup: lock not held"); 1066#endif 1067 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) 1068 if (wk->wk_type == D_BMSAFEMAP) 1069 return (WK_BMSAFEMAP(wk)); 1070 FREE_LOCK(&lk); 1071 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), 1072 M_BMSAFEMAP, M_WAITOK); 1073 bmsafemap->sm_list.wk_type = D_BMSAFEMAP; 1074 bmsafemap->sm_list.wk_state = 0; 1075 bmsafemap->sm_buf = bp; 1076 LIST_INIT(&bmsafemap->sm_allocdirecthd); 1077 LIST_INIT(&bmsafemap->sm_allocindirhd); 1078 LIST_INIT(&bmsafemap->sm_inodedephd); 1079 LIST_INIT(&bmsafemap->sm_newblkhd); 1080 ACQUIRE_LOCK(&lk); 1081 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 1082 return (bmsafemap); 1083} 1084 1085/* 1086 * Direct block allocation dependencies. 1087 * 1088 * When a new block is allocated, the corresponding disk locations must be 1089 * initialized (with zeros or new data) before the on-disk inode points to 1090 * them. Also, the freemap from which the block was allocated must be 1091 * updated (on disk) before the inode's pointer. These two dependencies are 1092 * independent of each other and are needed for all file blocks and indirect 1093 * blocks that are pointed to directly by the inode. 
Just before the 1094 * "in-core" version of the inode is updated with a newly allocated block 1095 * number, a procedure (below) is called to setup allocation dependency 1096 * structures. These structures are removed when the corresponding 1097 * dependencies are satisfied or when the block allocation becomes obsolete 1098 * (i.e., the file is deleted, the block is de-allocated, or the block is a 1099 * fragment that gets upgraded). All of these cases are handled in 1100 * procedures described later. 1101 * 1102 * When a file extension causes a fragment to be upgraded, either to a larger 1103 * fragment or to a full block, the on-disk location may change (if the 1104 * previous fragment could not simply be extended). In this case, the old 1105 * fragment must be de-allocated, but not until after the inode's pointer has 1106 * been updated. In most cases, this is handled by later procedures, which 1107 * will construct a "freefrag" structure to be added to the workitem queue 1108 * when the inode update is complete (or obsolete). The main exception to 1109 * this is when an allocation occurs while a pending allocation dependency 1110 * (for the same block pointer) remains. This case is handled in the main 1111 * allocation dependency setup procedure by immediately freeing the 1112 * unreferenced fragments. 
 */
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
	    M_ALLOCDIRECT, M_WAITOK);
	bzero(adp, sizeof(struct allocdirect));
	adp->ad_list.wk_type = D_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	/* A replaced fragment must be freed once the new pointer is safe. */
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	/* The bitmap dependency must have been set up by the allocator. */
	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	adp->ad_inodedep = inodedep;

	/*
	 * Inherit the bitmap state: if the cylinder group has already
	 * been written (DEPCOMPLETE) the allocdirect starts complete;
	 * otherwise hook it onto the bmsafemap tracking that cg buffer.
	 */
	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	/* The transient newblk structure has served its purpose. */
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	/* Walk forward to the first entry at or beyond this lbn. */
	for (oldadp = TAILQ_FIRST(adphead); oldadp;
	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("allocdirect_merge: lock not held");
#endif
	/*
	 * The new dependency must supersede the old one: its previous
	 * block must be exactly what the old dependency was installing,
	 * and only direct blocks (lbn < NDADDR) may be merged.
	 */
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("allocdirect_check: old %d != new %d || lbn %ld >= %d",
		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
		    NDADDR);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_allocdirect. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		freefrag = newadp->ad_freefrag;
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	free_allocdirect(adphead, oldadp, 0);
}

/*
 * Allocate a new freefrag structure if needed.
 */
static struct freefrag *
newfreefrag(ip, blkno, size)
	struct inode *ip;	/* inode that owned the fragment */
	ufs_daddr_t blkno;	/* block number of fragment, 0 if none */
	long size;		/* size of the fragment being replaced */
{
	struct freefrag *freefrag;
	struct fs *fs;

	/* Nothing to free if there was no previous block. */
	if (blkno == 0)
		return (NULL);
	fs = ip->i_fs;
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
	    M_FREEFRAG, M_WAITOK);
	freefrag->ff_list.wk_type = D_FREEFRAG;
	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_fs = fs;
	freefrag->ff_devvp = ip->i_devvp;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;
	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct inode tip;	/* throw-away inode passed to ffs_blkfree */

	/*
	 * Build a minimal stand-in inode from the saved freefrag fields;
	 * presumably these are the only fields ffs_blkfree consults --
	 * NOTE(review): confirm against ffs_blkfree.
	 */
	tip.i_fs = freefrag->ff_fs;
	tip.i_devvp = freefrag->ff_devvp;
	tip.i_dev = freefrag->ff_devvp->v_rdev;
	tip.i_number = freefrag->ff_inum;
	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	FREE(freefrag, M_FREEFRAG);
}

/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here.
Because an indirect block contains many pointers that 1321 * may have dependencies, a second copy of the entire in-memory indirect 1322 * block is kept. The buffer cache copy is always completely up-to-date. 1323 * The second copy, which is used only as a source for disk writes, 1324 * contains only the safe pointers (i.e., those that have no remaining 1325 * update dependencies). The second copy is freed when all pointers 1326 * are safe. The cache is not allowed to replace indirect blocks with 1327 * pending update dependencies. If a buffer containing an indirect 1328 * block with dependencies is written, these routines will mark it 1329 * dirty again. It can only be successfully written once all the 1330 * dependencies are removed. The ffs_fsync routine in conjunction with 1331 * softdep_sync_metadata work together to get all the dependencies 1332 * removed so that a file can be successfully written to disk. Three 1333 * procedures are used when setting up indirect block pointer 1334 * dependencies. The division is necessary because of the organization 1335 * of the "balloc" routine and because of the distinction between file 1336 * pages and file metadata blocks. 1337 */ 1338 1339/* 1340 * Allocate a new allocindir structure. 
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
{
	struct allocindir *aip;

	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
	    M_ALLOCINDIR, M_WAITOK);
	bzero(aip, sizeof(struct allocindir));
	aip->ai_list.wk_type = D_ALLOCINDIR;
	aip->ai_state = ATTACHED;
	aip->ai_offset = ptrno;
	aip->ai_newblkno = newblkno;
	aip->ai_oldblkno = oldblkno;
	/* A replaced block must be freed once the new pointer is safe. */
	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
 */
void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;	/* inode for file being extended */
	ufs_lbn_t lbn;		/* allocated block number within file */
	struct buf *bp;		/* buffer with indirect blk referencing page */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
	struct buf *nbp;	/* buffer holding allocated page */
{
	struct allocindir *aip;
	struct pagedep *pagedep;

	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	ACQUIRE_LOCK(&lk);
	/*
	 * If we are allocating a directory page, then we must
	 * allocate an associated pagedep to track additions and
	 * deletions.
	 */
	if ((ip->i_mode & IFMT) == IFDIR &&
	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called just before setting an indirect block pointer to a
 * newly allocated indirect block.
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
{
	struct allocindir *aip;

	/* oldblkno is 0: a new indirect block never replaces an old one. */
	aip = newallocindir(ip, ptrno, newblkno, 0);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
1419 */ 1420static void 1421setup_allocindir_phase2(bp, ip, aip) 1422 struct buf *bp; /* in-memory copy of the indirect block */ 1423 struct inode *ip; /* inode for file being extended */ 1424 struct allocindir *aip; /* allocindir allocated by the above routines */ 1425{ 1426 struct worklist *wk; 1427 struct indirdep *indirdep, *newindirdep; 1428 struct bmsafemap *bmsafemap; 1429 struct allocindir *oldaip; 1430 struct freefrag *freefrag; 1431 struct newblk *newblk; 1432 1433 if (bp->b_lblkno >= 0) 1434 panic("setup_allocindir_phase2: not indir blk"); 1435 for (indirdep = NULL, newindirdep = NULL; ; ) { 1436 ACQUIRE_LOCK(&lk); 1437 for (wk = LIST_FIRST(&bp->b_dep); wk; 1438 wk = LIST_NEXT(wk, wk_list)) { 1439 if (wk->wk_type != D_INDIRDEP) 1440 continue; 1441 indirdep = WK_INDIRDEP(wk); 1442 break; 1443 } 1444 if (indirdep == NULL && newindirdep) { 1445 indirdep = newindirdep; 1446 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 1447 newindirdep = NULL; 1448 } 1449 FREE_LOCK(&lk); 1450 if (indirdep) { 1451 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, 1452 &newblk) == 0) 1453 panic("setup_allocindir: lost block"); 1454 ACQUIRE_LOCK(&lk); 1455 if (newblk->nb_state == DEPCOMPLETE) { 1456 aip->ai_state |= DEPCOMPLETE; 1457 aip->ai_buf = NULL; 1458 } else { 1459 bmsafemap = newblk->nb_bmsafemap; 1460 aip->ai_buf = bmsafemap->sm_buf; 1461 LIST_REMOVE(newblk, nb_deps); 1462 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, 1463 aip, ai_deps); 1464 } 1465 LIST_REMOVE(newblk, nb_hash); 1466 FREE(newblk, M_NEWBLK); 1467 aip->ai_indirdep = indirdep; 1468 /* 1469 * Check to see if there is an existing dependency 1470 * for this block. If there is, merge the old 1471 * dependency into the new one. 
1472 */ 1473 if (aip->ai_oldblkno == 0) 1474 oldaip = NULL; 1475 else 1476 for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd); 1477 oldaip; oldaip = LIST_NEXT(oldaip, ai_next)) 1478 if (oldaip->ai_offset == aip->ai_offset) 1479 break; 1480 if (oldaip != NULL) { 1481 if (oldaip->ai_newblkno != aip->ai_oldblkno) 1482 panic("setup_allocindir_phase2: blkno"); 1483 aip->ai_oldblkno = oldaip->ai_oldblkno; 1484 freefrag = oldaip->ai_freefrag; 1485 oldaip->ai_freefrag = aip->ai_freefrag; 1486 aip->ai_freefrag = freefrag; 1487 free_allocindir(oldaip, NULL); 1488 } 1489 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 1490 ((ufs_daddr_t *)indirdep->ir_savebp->b_data) 1491 [aip->ai_offset] = aip->ai_oldblkno; 1492 FREE_LOCK(&lk); 1493 } 1494 if (newindirdep) { 1495 if (indirdep->ir_savebp != NULL) 1496 brelse(newindirdep->ir_savebp); 1497 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); 1498 } 1499 if (indirdep) 1500 break; 1501 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), 1502 M_INDIRDEP, M_WAITOK); 1503 newindirdep->ir_list.wk_type = D_INDIRDEP; 1504 newindirdep->ir_state = ATTACHED; 1505 LIST_INIT(&newindirdep->ir_deplisthd); 1506 LIST_INIT(&newindirdep->ir_donehd); 1507#ifdef __FreeBSD__ 1508 if (bp->b_blkno == bp->b_lblkno) { 1509#if 0 /* we know this happens.. research suggested.. */ 1510 printf("setup_allocindir_phase2: need bmap, blk %d\n", 1511 bp->b_lblkno); 1512#endif 1513 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 1514 NULL, NULL); 1515 } 1516#endif /* __FreeBSD__ */ 1517 newindirdep->ir_savebp = 1518 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); 1519 bp->b_flags |= B_XXX; 1520 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 1521 } 1522} 1523 1524/* 1525 * Block de-allocation dependencies. 1526 * 1527 * When blocks are de-allocated, the on-disk pointers must be nullified before 1528 * the blocks are made available for use by other files. 
(The true 1529 * requirement is that old pointers must be nullified before new on-disk 1530 * pointers are set. We chose this slightly more stringent requirement to 1531 * reduce complexity.) Our implementation handles this dependency by updating 1532 * the inode (or indirect block) appropriately but delaying the actual block 1533 * de-allocation (i.e., freemap and free space count manipulation) until 1534 * after the updated versions reach stable storage. After the disk is 1535 * updated, the blocks can be safely de-allocated whenever it is convenient. 1536 * This implementation handles only the common case of reducing a file's 1537 * length to zero. Other cases are handled by the conventional synchronous 1538 * write approach. 1539 * 1540 * The ffs implementation with which we worked double-checks 1541 * the state of the block pointers and file size as it reduces 1542 * a file's length. Some of this code is replicated here in our 1543 * soft updates implementation. The freeblks->fb_chkcnt field is 1544 * used to transfer a part of this information to the procedure 1545 * that eventually de-allocates the blocks. 1546 * 1547 * This routine should be called from the routine that shortens 1548 * a file's length, before the inode's size or block pointers 1549 * are modified. It will save the block pointer information for 1550 * later release and zero the inode so that the calling routine 1551 * can release it. 
1552 */ 1553static long num_freeblks; /* number of freeblks allocated */ 1554void 1555softdep_setup_freeblocks(ip, length) 1556 struct inode *ip; /* The inode whose length is to be reduced */ 1557 off_t length; /* The new length for the file */ 1558{ 1559 struct freeblks *freeblks; 1560 struct inodedep *inodedep; 1561 struct allocdirect *adp; 1562 struct vnode *vp; 1563 struct buf *bp; 1564 struct fs *fs; 1565 int i, error; 1566 1567 fs = ip->i_fs; 1568 if (length != 0) 1569 panic("softde_setup_freeblocks: non-zero length"); 1570 (void) checklimit(&num_freeblks, 0); 1571 num_freeblks += 1; 1572 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), 1573 M_FREEBLKS, M_WAITOK); 1574 bzero(freeblks, sizeof(struct freeblks)); 1575 freeblks->fb_list.wk_type = D_FREEBLKS; 1576 freeblks->fb_uid = ip->i_uid; 1577 freeblks->fb_previousinum = ip->i_number; 1578 freeblks->fb_devvp = ip->i_devvp; 1579 freeblks->fb_fs = fs; 1580 freeblks->fb_oldsize = ip->i_size; 1581 freeblks->fb_newsize = length; 1582 freeblks->fb_chkcnt = ip->i_blocks; 1583 for (i = 0; i < NDADDR; i++) { 1584 freeblks->fb_dblks[i] = ip->i_db[i]; 1585 ip->i_db[i] = 0; 1586 } 1587 for (i = 0; i < NIADDR; i++) { 1588 freeblks->fb_iblks[i] = ip->i_ib[i]; 1589 ip->i_ib[i] = 0; 1590 } 1591 ip->i_blocks = 0; 1592 ip->i_size = 0; 1593 /* 1594 * Push the zero'ed inode to to its disk buffer so that we are free 1595 * to delete its dependencies below. Once the dependencies are gone 1596 * the buffer can be safely released. 1597 */ 1598 if ((error = bread(ip->i_devvp, 1599 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 1600 (int)fs->fs_bsize, NOCRED, &bp)) != 0) 1601 softdep_error("softdep_setup_freeblocks", error); 1602 *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = 1603 ip->i_din; 1604 /* 1605 * Find and eliminate any inode dependencies. 
1606 */ 1607 ACQUIRE_LOCK(&lk); 1608 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep); 1609 if ((inodedep->id_state & IOSTARTED) != 0) 1610 panic("softdep_setup_freeblocks: inode busy"); 1611 /* 1612 * Add the freeblks structure to the list of operations that 1613 * must await the zero'ed inode being written to disk. 1614 */ 1615 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); 1616 /* 1617 * Because the file length has been truncated to zero, any 1618 * pending block allocation dependency structures associated 1619 * with this inode are obsolete and can simply be de-allocated. 1620 * We must first merge the two dependency lists to get rid of 1621 * any duplicate freefrag structures, then purge the merged list. 1622 */ 1623 merge_inode_lists(inodedep); 1624 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 1625 free_allocdirect(&inodedep->id_inoupdt, adp, 1); 1626 FREE_LOCK(&lk); 1627 bdwrite(bp); 1628 /* 1629 * We must wait for any I/O in progress to finish so that 1630 * all potential buffers on the dirty list will be visible. 1631 * Once they are all there, walk the list and get rid of 1632 * any dependencies. 1633 */ 1634 vp = ITOV(ip); 1635 ACQUIRE_LOCK(&lk); 1636 drain_output(vp, 1); 1637 while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { 1638 bp = TAILQ_FIRST(&vp->v_dirtyblkhd); 1639 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); 1640 deallocate_dependencies(bp, inodedep); 1641 bp->b_flags |= B_INVAL | B_NOCACHE; 1642 FREE_LOCK(&lk); 1643 brelse(bp); 1644 ACQUIRE_LOCK(&lk); 1645 } 1646 /* 1647 * Try freeing the inodedep in case that was the last dependency. 1648 */ 1649 if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0) 1650 (void) free_inodedep(inodedep); 1651 FREE_LOCK(&lk); 1652} 1653 1654/* 1655 * Reclaim any dependency structures from a buffer that is about to 1656 * be reallocated to a new vnode. 
The buffer must be locked, thus, 1657 * no I/O completion operations can occur while we are manipulating 1658 * its associated dependencies. The mutex is held so that other I/O's 1659 * associated with related dependencies do not occur. 1660 */ 1661static void 1662deallocate_dependencies(bp, inodedep) 1663 struct buf *bp; 1664 struct inodedep *inodedep; 1665{ 1666 struct worklist *wk; 1667 struct indirdep *indirdep; 1668 struct allocindir *aip; 1669 struct pagedep *pagedep; 1670 struct dirrem *dirrem; 1671 struct diradd *dap; 1672 int i; 1673 1674 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 1675 switch (wk->wk_type) { 1676 1677 case D_INDIRDEP: 1678 indirdep = WK_INDIRDEP(wk); 1679 /* 1680 * None of the indirect pointers will ever be visible, 1681 * so they can simply be tossed. GOINGAWAY ensures 1682 * that allocated pointers will be saved in the buffer 1683 * cache until they are freed. Note that they will 1684 * only be able to be found by their physical address 1685 * since the inode mapping the logical address will 1686 * be gone. The save buffer used for the safe copy 1687 * was allocated in setup_allocindir_phase2 using 1688 * the physical address so it could be used for this 1689 * purpose. Hence we swap the safe copy with the real 1690 * copy, allowing the safe copy to be freed and holding 1691 * on to the real copy for later use in indir_trunc. 
1692 */ 1693 if (indirdep->ir_state & GOINGAWAY) 1694 panic("deallocate_dependencies: already gone"); 1695 indirdep->ir_state |= GOINGAWAY; 1696 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 1697 free_allocindir(aip, inodedep); 1698 if (bp->b_lblkno >= 0 || 1699 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 1700 panic("deallocate_dependencies: not indir"); 1701 bcopy(bp->b_data, indirdep->ir_savebp->b_data, 1702 bp->b_bcount); 1703 WORKLIST_REMOVE(wk); 1704 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); 1705 continue; 1706 1707 case D_PAGEDEP: 1708 pagedep = WK_PAGEDEP(wk); 1709 /* 1710 * None of the directory additions will ever be 1711 * visible, so they can simply be tossed. 1712 */ 1713 for (i = 0; i < DAHASHSZ; i++) 1714 while (dap=LIST_FIRST(&pagedep->pd_diraddhd[i])) 1715 free_diradd(dap); 1716 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) 1717 free_diradd(dap); 1718 /* 1719 * Copy any directory remove dependencies to the list 1720 * to be processed after the zero'ed inode is written. 1721 * If the inode has already been written, then they 1722 * can be dumped directly onto the work list. 
1723 */ 1724 for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem; 1725 dirrem = LIST_NEXT(dirrem, dm_next)) { 1726 LIST_REMOVE(dirrem, dm_next); 1727 dirrem->dm_dirinum = pagedep->pd_ino; 1728 if (inodedep == NULL) 1729 add_to_worklist(&dirrem->dm_list); 1730 else 1731 WORKLIST_INSERT(&inodedep->id_bufwait, 1732 &dirrem->dm_list); 1733 } 1734 WORKLIST_REMOVE(&pagedep->pd_list); 1735 LIST_REMOVE(pagedep, pd_hash); 1736 WORKITEM_FREE(pagedep, D_PAGEDEP); 1737 continue; 1738 1739 case D_ALLOCINDIR: 1740 free_allocindir(WK_ALLOCINDIR(wk), inodedep); 1741 continue; 1742 1743 case D_ALLOCDIRECT: 1744 case D_INODEDEP: 1745 panic("deallocate_dependencies: Unexpected type %s", 1746 TYPENAME(wk->wk_type)); 1747 /* NOTREACHED */ 1748 1749 default: 1750 panic("deallocate_dependencies: Unknown type %s", 1751 TYPENAME(wk->wk_type)); 1752 /* NOTREACHED */ 1753 } 1754 } 1755} 1756 1757/* 1758 * Free an allocdirect. Generate a new freefrag work request if appropriate. 1759 * This routine must be called with splbio interrupts blocked. 1760 */ 1761static void 1762free_allocdirect(adphead, adp, delay) 1763 struct allocdirectlst *adphead; 1764 struct allocdirect *adp; 1765 int delay; 1766{ 1767 1768#ifdef DEBUG 1769 if (lk.lkt_held == -1) 1770 panic("free_allocdirect: lock not held"); 1771#endif 1772 if ((adp->ad_state & DEPCOMPLETE) == 0) 1773 LIST_REMOVE(adp, ad_deps); 1774 TAILQ_REMOVE(adphead, adp, ad_next); 1775 if ((adp->ad_state & COMPLETE) == 0) 1776 WORKLIST_REMOVE(&adp->ad_list); 1777 if (adp->ad_freefrag != NULL) { 1778 if (delay) 1779 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, 1780 &adp->ad_freefrag->ff_list); 1781 else 1782 add_to_worklist(&adp->ad_freefrag->ff_list); 1783 } 1784 WORKITEM_FREE(adp, D_ALLOCDIRECT); 1785} 1786 1787/* 1788 * Prepare an inode to be freed. The actual free operation is not 1789 * done until the zero'ed inode has been written to disk. 
 */
static long num_freefile; 	/* number of freefile allocated */

void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;	/* parent directory vnode */
	ino_t ino;		/* inode number being freed */
	int mode;		/* mode of the inode being freed */
{
	struct inode *ip = VTOI(pvp);
	struct inodedep *inodedep;
	struct freefile *freefile;

	/*
	 * This sets up the inode de-allocation dependency.
	 * checklimit() throttles callers when too many freefile
	 * work items are outstanding.
	 */
	(void) checklimit(&num_freefile, 0);
	num_freefile += 1;
	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
	    M_FREEFILE, M_WAITOK);
	freefile->fx_list.wk_type = D_FREEFILE;
	freefile->fx_list.wk_state = 0;
	freefile->fx_mode = mode;
	freefile->fx_oldinum = ino;
	freefile->fx_devvp = ip->i_devvp;
	freefile->fx_fs = ip->i_fs;

	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk and we can free the file immediately.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
		add_to_worklist(&freefile->fx_list);
		FREE_LOCK(&lk);
		return;
	}

	/*
	 * If we still have a bitmap dependency, then the inode has never
	 * been written to disk. Drop the dependency as it is no longer
	 * necessary since the inode is being deallocated. We could process
	 * the freefile immediately, but then we would have to clear the
	 * id_inowait dependencies here and it is easier just to let the
	 * zero'ed inode be written and let them be cleaned up in the
	 * normal followup actions that follow the inode write.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		inodedep->id_state |= DEPCOMPLETE;
		LIST_REMOVE(inodedep, id_deps);
		inodedep->id_buf = NULL;
	}
	/*
	 * If the inodedep has no dependencies associated with it,
	 * then we must free it here and free the file immediately.
	 * This case arises when an early allocation fails (for
	 * example, the user is over their file quota).
	 */
	if (free_inodedep(inodedep) == 0)
		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	else
		add_to_worklist(&freefile->fx_list);
	FREE_LOCK(&lk);
}

/*
 * Try to free an inodedep structure. Return 1 if it could be freed.
 * An inodedep can only be reclaimed once all of its work lists are
 * empty, it is fully written (ALLCOMPLETE), and it holds no deferred
 * link-count or saved-inode state.
 */
static int
free_inodedep(inodedep)
	struct inodedep *inodedep;
{

	if ((inodedep->id_state & ONWORKLIST) != 0 ||
	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
		return (0);
	LIST_REMOVE(inodedep, id_hash);
	WORKITEM_FREE(inodedep, D_INODEDEP);
	num_inodedep -= 1;
	return (1);
}

/*
 * This workitem routine performs the block de-allocation.
 * The workitem is added to the pending list after the updated
 * inode block has been written to disk.  As mentioned above,
 * checks regarding the number of blocks de-allocated (compared
 * to the number of blocks allocated for the file) are also
 * performed in this function.
1884 */ 1885static void 1886handle_workitem_freeblocks(freeblks) 1887 struct freeblks *freeblks; 1888{ 1889 struct inode tip; 1890 ufs_daddr_t bn; 1891 struct fs *fs; 1892 int i, level, bsize; 1893 long nblocks, blocksreleased = 0; 1894 int error, allerror = 0; 1895 ufs_lbn_t baselbns[NIADDR], tmpval; 1896 1897 tip.i_number = freeblks->fb_previousinum; 1898 tip.i_devvp = freeblks->fb_devvp; 1899 tip.i_dev = freeblks->fb_devvp->v_rdev; 1900 tip.i_fs = freeblks->fb_fs; 1901 tip.i_size = freeblks->fb_oldsize; 1902 tip.i_uid = freeblks->fb_uid; 1903 fs = freeblks->fb_fs; 1904 tmpval = 1; 1905 baselbns[0] = NDADDR; 1906 for (i = 1; i < NIADDR; i++) { 1907 tmpval *= NINDIR(fs); 1908 baselbns[i] = baselbns[i - 1] + tmpval; 1909 } 1910 nblocks = btodb(fs->fs_bsize); 1911 blocksreleased = 0; 1912 /* 1913 * Indirect blocks first. 1914 */ 1915 for (level = (NIADDR - 1); level >= 0; level--) { 1916 if ((bn = freeblks->fb_iblks[level]) == 0) 1917 continue; 1918 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level, 1919 baselbns[level], &blocksreleased)) == 0) 1920 allerror = error; 1921 ffs_blkfree(&tip, bn, fs->fs_bsize); 1922 blocksreleased += nblocks; 1923 } 1924 /* 1925 * All direct blocks or frags. 1926 */ 1927 for (i = (NDADDR - 1); i >= 0; i--) { 1928 if ((bn = freeblks->fb_dblks[i]) == 0) 1929 continue; 1930 bsize = blksize(fs, &tip, i); 1931 ffs_blkfree(&tip, bn, bsize); 1932 blocksreleased += btodb(bsize); 1933 } 1934 1935#ifdef DIAGNOSTIC 1936 if (freeblks->fb_chkcnt != blocksreleased) 1937 panic("handle_workitem_freeblocks: block count"); 1938 if (allerror) 1939 softdep_error("handle_workitem_freeblks", allerror); 1940#endif /* DIAGNOSTIC */ 1941 WORKITEM_FREE(freeblks, D_FREEBLKS); 1942 num_freeblks -= 1; 1943} 1944 1945/* 1946 * Release blocks associated with the inode ip and stored in the indirect 1947 * block dbn. 
 If level is greater than SINGLE, the block is an indirect block
 * and recursive calls to indirtrunc must be used to cleanse other indirect
 * blocks.
 *
 * Returns 0 on success or the first bread() error encountered; the
 * number of disk blocks released is accumulated into *countp.
 */
static int
indir_trunc(ip, dbn, level, lbn, countp)
	struct inode *ip;	/* stand-in inode (see handle_workitem_freeblocks) */
	ufs_daddr_t dbn;	/* disk (device) block number of indirect block */
	int level;		/* indirection level of this block */
	ufs_lbn_t lbn;		/* first logical block mapped by this block */
	long *countp;		/* accumulator for released disk blocks */
{
	struct buf *bp;
	ufs_daddr_t *bap;
	ufs_daddr_t nb;
	struct fs *fs;
	struct worklist *wk;
	struct indirdep *indirdep;
	int i, lbnadd, nblocks;
	int error, allerror = 0;

	fs = ip->i_fs;
	/* Logical-block span covered by each pointer at this level. */
	lbnadd = 1;
	for (i = level; i > 0; i--)
		lbnadd *= NINDIR(fs);
	/*
	 * Get buffer of block pointers to be freed. This routine is not
	 * called until the zero'ed inode has been written, so it is safe
	 * to free blocks as they are encountered. Because the inode has
	 * been zero'ed, calls to bmap on these blocks will fail. So, we
	 * have to use the on-disk address and the block device for the
	 * filesystem to look them up. If the file was deleted before its
	 * indirect blocks were all written to disk, the routine that set
	 * us up (deallocate_dependencies) will have arranged to leave
	 * a complete copy of the indirect block in memory for our use.
	 * Otherwise we have to read the blocks in from the disk.
	 */
	ACQUIRE_LOCK(&lk);
	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		/* The in-core save buffer left by deallocate_dependencies. */
		if (wk->wk_type != D_INDIRDEP ||
		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
		    (indirdep->ir_state & GOINGAWAY) == 0)
			panic("indir_trunc: lost indirdep");
		WORKLIST_REMOVE(wk);
		WORKITEM_FREE(indirdep, D_INDIRDEP);
		if (LIST_FIRST(&bp->b_dep) != NULL)
			panic("indir_trunc: dangling dep");
		FREE_LOCK(&lk);
	} else {
		FREE_LOCK(&lk);
		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error)
			return (error);
	}
	/*
	 * Recursively free indirect blocks.
	 */
	bap = (ufs_daddr_t *)bp->b_data;
	nblocks = btodb(fs->fs_bsize);
	for (i = NINDIR(fs) - 1; i >= 0; i--) {
		if ((nb = bap[i]) == 0)
			continue;
		if (level != 0) {
			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
			    level - 1, lbn + (i * lbnadd), countp)) != 0)
				allerror = error;
		}
		ffs_blkfree(ip, nb, fs->fs_bsize);
		*countp += nblocks;
	}
	/* The indirect block itself is gone; discard the cached copy. */
	bp->b_flags |= B_INVAL | B_NOCACHE;
	bp->b_flags &= ~B_XXX;
	brelse(bp);
	return (allerror);
}

/*
 * Free an allocindir.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_allocindir(aip, inodedep)
	struct allocindir *aip;
	struct inodedep *inodedep;	/* may be NULL; see freefrag handling */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("free_allocindir: lock not held");
#endif
	if ((aip->ai_state & DEPCOMPLETE) == 0)
		LIST_REMOVE(aip, ai_deps);
	if (aip->ai_state & ONWORKLIST)
		WORKLIST_REMOVE(&aip->ai_list);
	LIST_REMOVE(aip, ai_next);
	if ((freefrag = aip->ai_freefrag) != NULL) {
		/*
		 * With no inodedep the zero'ed inode is already on disk,
		 * so the freefrag can be processed immediately; otherwise
		 * it must wait on the inode write via id_bufwait.
		 */
		if (inodedep == NULL)
			add_to_worklist(&freefrag->ff_list);
		else
			WORKLIST_INSERT(&inodedep->id_bufwait,
			    &freefrag->ff_list);
	}
	WORKITEM_FREE(aip, D_ALLOCINDIR);
}

/*
 * Directory entry addition dependencies.
 *
 * When adding a new directory entry, the inode (with its incremented link
 * count) must be written to disk before the directory entry's pointer to it.
 * Also, if the inode is newly allocated, the corresponding freemap must be
 * updated (on disk) before the directory entry's pointer. These requirements
 * are met via undo/redo on the directory entry's pointer, which consists
 * simply of the inode number.
 *
 * As directory entries are added and deleted, the free space within a
 * directory block can become fragmented. The ufs file system will compact
 * a fragmented directory block to make space for a new entry. When this
 * occurs, the offsets of previously added entries change. Any "diradd"
 * dependency structures corresponding to these entries must be updated with
 * the new offsets.
 */

/*
 * This routine is called after the in-memory inode's link
 * count has been incremented, but before the directory entry's
 * pointer to the inode has been set.
 */
void
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for directory */
	off_t diroffset;	/* offset of new entry in directory */
	long newinum;		/* inode referenced by new directory entry */
	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
{
	int offset;		/* offset of new entry within directory block */
	ufs_lbn_t lbn;		/* block in directory containing new entry */
	struct fs *fs;
	struct diradd *dap;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mkdir *mkdir1, *mkdir2;

	/*
	 * Whiteouts have no dependencies.
	 */
	if (newinum == WINO) {
		if (newdirbp != NULL)
			bdwrite(newdirbp);
		return;
	}

	fs = dp->i_fs;
	lbn = lblkno(fs, diroffset);
	offset = blkoff(fs, diroffset);
	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
	bzero(dap, sizeof(struct diradd));
	dap->da_list.wk_type = D_DIRADD;
	dap->da_offset = offset;
	dap->da_newinum = newinum;
	dap->da_state = ATTACHED;
	if (newdirbp == NULL) {
		/* Plain entry: no "." / ".." block to wait for. */
		dap->da_state |= DEPCOMPLETE;
		ACQUIRE_LOCK(&lk);
	} else {
		/*
		 * mkdir: the entry also depends on the new directory's
		 * first block (mkdir1) and on the parent's link count
		 * increase reaching disk (mkdir2).
		 */
		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
		    M_WAITOK);
		mkdir1->md_list.wk_type = D_MKDIR;
		mkdir1->md_state = MKDIR_BODY;
		mkdir1->md_diradd = dap;
		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
		    M_WAITOK);
		mkdir2->md_list.wk_type = D_MKDIR;
		mkdir2->md_state = MKDIR_PARENT;
		mkdir2->md_diradd = dap;
		/*
		 * Dependency on "." and ".." being written to disk.
		 */
		mkdir1->md_buf = newdirbp;
		ACQUIRE_LOCK(&lk);
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
		FREE_LOCK(&lk);
		bdwrite(newdirbp);
		/*
		 * Dependency on link count increase for parent directory
		 */
		ACQUIRE_LOCK(&lk);
		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
			/* Parent inode already safely on disk. */
			dap->da_state &= ~MKDIR_PARENT;
			WORKITEM_FREE(mkdir2, D_MKDIR);
		} else {
			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
		}
	}
	/*
	 * Link into parent directory pagedep to await its being written.
	 */
	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	dap->da_pagedep = pagedep;
	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	    da_pdlist);
	/*
	 * Link into its inodedep. Put it on the id_bufwait list if the inode
	 * is not yet written. If it is written, do the post-inode write
	 * processing to put it on the id_pendinghd list.
	 */
	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
		diradd_inode_written(dap, inodedep);
	else
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	FREE_LOCK(&lk);
}

/*
 * This procedure is called to change the offset of a directory
 * entry when compacting a directory block which must be owned
 * exclusively by the caller. Note that the actual entry movement
 * must be done in this procedure to ensure that no I/O completions
 * occur while the move is in progress.
 */
void
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
	struct inode *dp;	/* inode for directory */
	caddr_t base;		/* address of dp->i_offset */
	caddr_t oldloc;		/* address of old directory location */
	caddr_t newloc;		/* address of new directory location */
	int entrysize;		/* size of directory entry */
{
	int offset, oldoffset, newoffset;
	struct pagedep *pagedep;
	struct diradd *dap;
	ufs_lbn_t lbn;

	ACQUIRE_LOCK(&lk);
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	/* No pagedep => no tracked additions; just move the entry bytes. */
	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
		goto done;
	oldoffset = offset + (oldloc - base);
	newoffset = offset + (newloc - base);
	/*
	 * Search the unwritten-additions hash chain first; rehash the
	 * diradd if the new offset lands in a different bucket.
	 */
	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
		if (dap->da_offset != oldoffset)
			continue;
		dap->da_offset = newoffset;
		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
			break;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
		    dap, da_pdlist);
		break;
	}
	if (dap == NULL) {
		/* Not pending; check the already-written (pending) list. */
		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
		     dap; dap = LIST_NEXT(dap, da_pdlist)) {
			if (dap->da_offset == oldoffset) {
				dap->da_offset = newoffset;
				break;
			}
		}
	}
done:
	/* The actual entry move, done under the lock (see header comment). */
	bcopy(oldloc, newloc, entrysize);
	FREE_LOCK(&lk);
}

/*
 * Free a diradd dependency structure. This routine must be called
 * with splbio interrupts blocked.
 */
static void
free_diradd(dap)
	struct diradd *dap;
{
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mkdir *mkdir, *nextmd;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("free_diradd: lock not held");
#endif
	WORKLIST_REMOVE(&dap->da_list);
	LIST_REMOVE(dap, da_pdlist);
	if ((dap->da_state & DIRCHG) == 0) {
		pagedep = dap->da_pagedep;
	} else {
		/*
		 * A directory-change diradd carries the dirrem for the
		 * entry it replaced; release that removal now.
		 */
		dirrem = dap->da_previous;
		pagedep = dirrem->dm_pagedep;
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	/* The referenced inodedep may now be reclaimable. */
	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
	    0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		/* Tear down any mkdir records still pointing at this diradd. */
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != dap)
				continue;
			dap->da_state &= ~mkdir->md_state;
			WORKLIST_REMOVE(&mkdir->md_list);
			LIST_REMOVE(mkdir, md_mkdirs);
			WORKITEM_FREE(mkdir, D_MKDIR);
		}
		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("free_diradd: unfound ref");
	}
	WORKITEM_FREE(dap, D_DIRADD);
}

/*
 * Directory entry removal dependencies.
 *
 * When removing a directory entry, the entry's inode pointer must be
 * zero'ed on disk before the corresponding inode's link count is decremented
 * (possibly freeing the inode for re-use). This dependency is handled by
 * updating the directory entry but delaying the inode count reduction until
 * after the directory block has been written to disk. After this point, the
 * inode count can be decremented whenever it is convenient.
 */

/*
 * This routine should be called immediately after removing
 * a directory entry.
 The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will do this task when it is safe.
 */
void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
{
	struct dirrem *dirrem;

	/*
	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
	 * (newdirrem returns with the softdep lock held.)
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir);
	if ((dirrem->dm_state & COMPLETE) == 0) {
		/* Must wait for the directory block to reach disk. */
		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
		    dm_next);
	} else {
		/* Entry was never written; process the removal now. */
		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	FREE_LOCK(&lk);
}

/*
 * Allocate a new dirrem if appropriate and return it along with
 * its associated pagedep. Called without a lock, returns with lock.
 */
static struct dirrem *
newdirrem(bp, dp, ip, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
{
	int offset;
	ufs_lbn_t lbn;
	struct diradd *dap;
	struct dirrem *dirrem;
	struct pagedep *pagedep;

	/*
	 * Whiteouts have no deletion dependencies.
	 */
	if (ip == NULL)
		panic("newdirrem: whiteout");
	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
	    M_DIRREM, M_WAITOK);
	bzero(dirrem, sizeof(struct dirrem));
	dirrem->dm_list.wk_type = D_DIRREM;
	dirrem->dm_state = isrmdir ? RMDIR : 0;
	dirrem->dm_mnt = ITOV(ip)->v_mount;
	dirrem->dm_oldinum = ip->i_number;

	ACQUIRE_LOCK(&lk);
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	dirrem->dm_pagedep = pagedep;
	/*
	 * Check for a diradd dependency for the same directory entry.
	 * If present, then both dependencies become obsolete and can
	 * be de-allocated. Check for an entry on both the pd_dirraddhd
	 * list and the pd_pendinghd list.
	 */
	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
	     dap; dap = LIST_NEXT(dap, da_pdlist))
		if (dap->da_offset == offset)
			break;
	if (dap == NULL) {
		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
		     dap; dap = LIST_NEXT(dap, da_pdlist))
			if (dap->da_offset == offset)
				break;
		/* No matching diradd: the removal must wait (lock held). */
		if (dap == NULL)
			return (dirrem);
	}
	/*
	 * Must be ATTACHED at this point, so just delete it.
	 */
	if ((dap->da_state & ATTACHED) == 0)
		panic("newdirrem: not ATTACHED");
	if (dap->da_newinum != ip->i_number)
		panic("newdirrem: inum %d should be %d",
		    ip->i_number, dap->da_newinum);
	free_diradd(dap);
	/* COMPLETE: the entry never reached disk, remove immediately. */
	dirrem->dm_state |= COMPLETE;
	return (dirrem);
}

/*
 * Directory entry change dependencies.
 *
 * Changing an existing directory entry requires that an add operation
 * be completed first followed by a deletion. The semantics for the addition
 * are identical to the description of adding a new entry above except
 * that the rollback is to the old inode number rather than zero. Once
 * the addition dependency is completed, the removal is done as described
 * in the removal routine above.
 */

/*
 * This routine should be called immediately after changing
 * a directory entry.
 The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will perform this task when it is safe.
 */
void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	long newinum;		/* new inode number for changed entry */
	int isrmdir;		/* indicates if doing RMDIR */
{
	int offset;
	struct diradd *dap = NULL;
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;

	offset = blkoff(dp->i_fs, dp->i_offset);

	/*
	 * Whiteouts do not need diradd dependencies.
	 */
	if (newinum != WINO) {
		MALLOC(dap, struct diradd *, sizeof(struct diradd),
		    M_DIRADD, M_WAITOK);
		bzero(dap, sizeof(struct diradd));
		dap->da_list.wk_type = D_DIRADD;
		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
		dap->da_offset = offset;
		dap->da_newinum = newinum;
	}

	/*
	 * Allocate a new dirrem and ACQUIRE_LOCK.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir);
	pagedep = dirrem->dm_pagedep;
	/*
	 * The possible values for isrmdir:
	 *	0 - non-directory file rename
	 *	1 - directory rename within same directory
	 *   inum - directory rename to new directory of given inode number
	 * When renaming to a new directory, we are both deleting and
	 * creating a new directory entry, so the link count on the new
	 * directory should not change. Thus we do not need the followup
	 * dirrem which is usually done in handle_workitem_remove. We set
	 * the DIRCHG flag to tell handle_workitem_remove to skip the
	 * followup dirrem.
	 */
	if (isrmdir > 1)
		dirrem->dm_state |= DIRCHG;

	/*
	 * Whiteouts have no additional dependencies,
	 * so just put the dirrem on the correct list.
	 */
	if (newinum == WINO) {
		if ((dirrem->dm_state & COMPLETE) == 0) {
			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
			    dm_next);
		} else {
			dirrem->dm_dirinum = pagedep->pd_ino;
			add_to_worklist(&dirrem->dm_list);
		}
		FREE_LOCK(&lk);
		return;
	}

	/*
	 * Link into its inodedep. Put it on the id_bufwait list if the inode
	 * is not yet written. If it is written, do the post-inode write
	 * processing to put it on the id_pendinghd list.
	 */
	dap->da_previous = dirrem;
	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		dap->da_state |= COMPLETE;
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	} else {
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	}
	/*
	 * If the previous inode was never written or its previous directory
	 * entry was never written, then we do not want to roll back to this
	 * previous value. Instead we want to roll back to zero and immediately
	 * free the unwritten or unreferenced inode.
	 */
	if (dirrem->dm_state & COMPLETE) {
		dap->da_state &= ~DIRCHG;
		dap->da_pagedep = pagedep;
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	FREE_LOCK(&lk);
}

/*
 * Called whenever the link count on an inode is increased.
 * It creates an inode dependency so that the new reference(s)
 * to the inode cannot be committed to disk until the updated
 * inode has been written.
 */
void
softdep_increase_linkcnt(ip)
	struct inode *ip;	/* the inode with the increased link count */
{
	struct inodedep *inodedep;

	/* Ensure an inodedep exists; its presence defers the references. */
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	FREE_LOCK(&lk);
}

/*
 * This workitem decrements the inode's link count.
 * If the link count reaches zero, the file is removed.
 */
static void
handle_workitem_remove(dirrem)
	struct dirrem *dirrem;
{
	struct proc *p = CURPROC;	/* XXX */
	struct inodedep *inodedep;
	struct vnode *vp;
	struct inode *ip;
	int error;

	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
		softdep_error("handle_workitem_remove: vget", error);
		return;
	}
	ip = VTOI(vp);
	/*
	 * Normal file deletion.
	 */
	if ((dirrem->dm_state & RMDIR) == 0) {
		ip->i_nlink--;
		if (ip->i_nlink < ip->i_effnlink)
			panic("handle_workitem_remove: bad file delta");
		ip->i_flag |= IN_CHANGE;
		vput(vp);
		WORKITEM_FREE(dirrem, D_DIRREM);
		return;
	}
	/*
	 * Directory deletion. Decrement reference count for both the
	 * just deleted parent directory entry and the reference for ".".
	 * Next truncate the directory to length zero. When the
	 * truncation completes, arrange to have the reference count on
	 * the parent decremented to account for the loss of "..".
	 */
	ip->i_nlink -= 2;
	if (ip->i_nlink < ip->i_effnlink)
		panic("handle_workitem_remove: bad dir delta");
	ip->i_flag |= IN_CHANGE;
	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
		softdep_error("handle_workitem_remove: truncate", error);
	/*
	 * Rename a directory to a new parent. Since, we are both deleting
	 * and creating a new directory entry, the link count on the new
	 * directory should not change. Thus we skip the followup dirrem.
	 */
	if (dirrem->dm_state & DIRCHG) {
		vput(vp);
		WORKITEM_FREE(dirrem, D_DIRREM);
		return;
	}
	/*
	 * Reuse this dirrem as the followup ".." removal, to be processed
	 * after the truncated inode reaches disk (id_inowait).
	 */
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC,
	    &inodedep);
	dirrem->dm_state = 0;
	dirrem->dm_oldinum = dirrem->dm_dirinum;
	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	FREE_LOCK(&lk);
	vput(vp);
}

/*
 * Inode de-allocation dependencies.
 *
 * When an inode's link count is reduced to zero, it can be de-allocated. We
 * found it convenient to postpone de-allocation until after the inode is
 * written to disk with its new link count (zero). At this point, all of the
 * on-disk inode's block pointers are nullified and, with careful dependency
 * list ordering, all dependencies related to the inode will be satisfied and
 * the corresponding dependency structures de-allocated. So, if/when the
 * inode is reused, there will be no mixing of old dependencies with new
 * ones. This artificial dependency is set up by the block de-allocation
 * procedure above (softdep_setup_freeblocks) and completed by the
 * following procedure.
 */
static void
handle_workitem_freefile(freefile)
	struct freefile *freefile;
{
	struct vnode vp;	/* stack-local vnode wrapping the fake inode */
	struct inode tip;
	struct inodedep *idp;
	int error;

#ifdef DEBUG
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
		panic("handle_workitem_freefile: inodedep survived");
	FREE_LOCK(&lk);
#endif
	tip.i_devvp = freefile->fx_devvp;
	tip.i_dev = freefile->fx_devvp->v_rdev;
	tip.i_fs = freefile->fx_fs;
	vp.v_data = &tip;
	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
		softdep_error("handle_workitem_freefile", error);
	WORKITEM_FREE(freefile, D_FREEFILE);
	num_freefile -= 1;
}

/*
 * Disk writes.
 *
 * The dependency structures constructed above are most actively used when file
 * system blocks are written to disk. No constraints are placed on when a
 * block can be written, but unsatisfied update dependencies are made safe by
 * modifying (or replacing) the source memory for the duration of the disk
 * write. When the disk write completes, the memory block is again brought
 * up-to-date.
 *
 * In-core inode structure reclamation.
 *
 * Because there are a finite number of "in-core" inode structures, they are
 * reused regularly. By transferring all inode-related dependencies to the
 * in-memory inode block and indexing them separately (via "inodedep"s), we
 * can allow "in-core" inode structures to be reused at any time and avoid
 * any increase in contention.
 *
 * Called just before entering the device driver to initiate a new disk I/O.
 * The buffer must be locked, thus, no I/O completion operations can occur
 * while we are manipulating its associated dependencies.
 */
void
softdep_disk_io_initiation(bp)
	struct buf *bp;		/* structure describing disk write to occur */
{
	struct worklist *wk, *nextwk;
	struct indirdep *indirdep;

	/*
	 * We only care about write operations. There should never
	 * be dependencies for reads.
	 */
	if (bp->b_flags & B_READ)
		panic("softdep_disk_io_initiation: read");
	/*
	 * Do any necessary pre-I/O processing.
	 */
	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
		/* nextwk is captured first: cases below may unlink wk. */
		nextwk = LIST_NEXT(wk, wk_list);
		switch (wk->wk_type) {

		case D_PAGEDEP:
			initiate_write_filepage(WK_PAGEDEP(wk), bp);
			continue;

		case D_INODEDEP:
			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			if (indirdep->ir_state & GOINGAWAY)
				panic("disk_io_initiation: indirdep gone");
			/*
			 * If there are no remaining dependencies, this
			 * will be writing the real pointers, so the
			 * dependency can be freed.
			 */
			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
				indirdep->ir_savebp->b_flags &= ~B_XXX;
				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
				brelse(indirdep->ir_savebp);
				/* inline expand WORKLIST_REMOVE(wk); */
				wk->wk_state &= ~ONWORKLIST;
				LIST_REMOVE(wk, wk_list);
				WORKITEM_FREE(indirdep, D_INDIRDEP);
				continue;
			}
			/*
			 * Replace up-to-date version with safe version.
			 */
			ACQUIRE_LOCK(&lk);
			indirdep->ir_state &= ~ATTACHED;
			indirdep->ir_state |= UNDONE;
			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
			    M_INDIRDEP, M_WAITOK);
			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
			    bp->b_bcount);
			FREE_LOCK(&lk);
			continue;

		case D_MKDIR:
		case D_BMSAFEMAP:
		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			/* These dependency types need no pre-I/O work. */
			continue;

		default:
			panic("handle_disk_io_initiation: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
}

/*
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in a directory. The buffer must be locked,
 * thus, no I/O completion operations can occur while we are
 * manipulating its associated dependencies.
2710 */ 2711static void 2712initiate_write_filepage(pagedep, bp) 2713 struct pagedep *pagedep; 2714 struct buf *bp; 2715{ 2716 struct diradd *dap; 2717 struct direct *ep; 2718 int i; 2719 2720 if (pagedep->pd_state & IOSTARTED) { 2721 /* 2722 * This can only happen if there is a driver that does not 2723 * understand chaining. Here biodone will reissue the call 2724 * to strategy for the incomplete buffers. 2725 */ 2726 printf("initiate_write_filepage: already started\n"); 2727 return; 2728 } 2729 pagedep->pd_state |= IOSTARTED; 2730 ACQUIRE_LOCK(&lk); 2731 for (i = 0; i < DAHASHSZ; i++) { 2732 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 2733 dap = LIST_NEXT(dap, da_pdlist)) { 2734 ep = (struct direct *) 2735 ((char *)bp->b_data + dap->da_offset); 2736 if (ep->d_ino != dap->da_newinum) 2737 panic("%s: dir inum %d != new %d", 2738 "initiate_write_filepage", 2739 ep->d_ino, dap->da_newinum); 2740 if (dap->da_state & DIRCHG) 2741 ep->d_ino = dap->da_previous->dm_oldinum; 2742 else 2743 ep->d_ino = 0; 2744 dap->da_state &= ~ATTACHED; 2745 dap->da_state |= UNDONE; 2746 } 2747 } 2748 FREE_LOCK(&lk); 2749} 2750 2751/* 2752 * Called from within the procedure above to deal with unsatisfied 2753 * allocation dependencies in an inodeblock. The buffer must be 2754 * locked, thus, no I/O completion operations can occur while we 2755 * are manipulating its associated dependencies. 
2756 */ 2757static void 2758initiate_write_inodeblock(inodedep, bp) 2759 struct inodedep *inodedep; 2760 struct buf *bp; /* The inode block */ 2761{ 2762 struct allocdirect *adp, *lastadp; 2763 struct dinode *dp; 2764 struct fs *fs; 2765 ufs_lbn_t prevlbn = 0; 2766 int i, deplist; 2767 2768 if (inodedep->id_state & IOSTARTED) 2769 panic("initiate_write_inodeblock: already started"); 2770 inodedep->id_state |= IOSTARTED; 2771 fs = inodedep->id_fs; 2772 dp = (struct dinode *)bp->b_data + 2773 ino_to_fsbo(fs, inodedep->id_ino); 2774 /* 2775 * If the bitmap is not yet written, then the allocated 2776 * inode cannot be written to disk. 2777 */ 2778 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 2779 if (inodedep->id_savedino != NULL) 2780 panic("initiate_write_inodeblock: already doing I/O"); 2781 MALLOC(inodedep->id_savedino, struct dinode *, 2782 sizeof(struct dinode), M_INODEDEP, M_WAITOK); 2783 *inodedep->id_savedino = *dp; 2784 bzero((caddr_t)dp, sizeof(struct dinode)); 2785 return; 2786 } 2787 /* 2788 * If no dependencies, then there is nothing to roll back. 2789 */ 2790 inodedep->id_savedsize = dp->di_size; 2791 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) 2792 return; 2793 /* 2794 * Set the dependencies to busy. 
2795 */ 2796 ACQUIRE_LOCK(&lk); 2797 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 2798 adp = TAILQ_NEXT(adp, ad_next)) { 2799#ifdef DIAGNOSTIC 2800 if (deplist != 0 && prevlbn >= adp->ad_lbn) 2801 panic("softdep_write_inodeblock: lbn order"); 2802 prevlbn = adp->ad_lbn; 2803 if (adp->ad_lbn < NDADDR && 2804 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) 2805 panic("%s: direct pointer #%ld mismatch %d != %d", 2806 "softdep_write_inodeblock", adp->ad_lbn, 2807 dp->di_db[adp->ad_lbn], adp->ad_newblkno); 2808 if (adp->ad_lbn >= NDADDR && 2809 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) 2810 panic("%s: indirect pointer #%ld mismatch %d != %d", 2811 "softdep_write_inodeblock", adp->ad_lbn - NDADDR, 2812 dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno); 2813 deplist |= 1 << adp->ad_lbn; 2814 if ((adp->ad_state & ATTACHED) == 0) 2815 panic("softdep_write_inodeblock: Unknown state 0x%x", 2816 adp->ad_state); 2817#endif /* DIAGNOSTIC */ 2818 adp->ad_state &= ~ATTACHED; 2819 adp->ad_state |= UNDONE; 2820 } 2821 /* 2822 * The on-disk inode cannot claim to be any larger than the last 2823 * fragment that has been written. Otherwise, the on-disk inode 2824 * might have fragments that were not the last block in the file 2825 * which would corrupt the filesystem. 
2826 */ 2827 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 2828 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 2829 if (adp->ad_lbn >= NDADDR) 2830 break; 2831 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; 2832 /* keep going until hitting a rollback to a frag */ 2833 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 2834 continue; 2835 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; 2836 for (i = adp->ad_lbn + 1; i < NDADDR; i++) { 2837#ifdef DIAGNOSTIC 2838 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 2839 panic("softdep_write_inodeblock: lost dep1"); 2840#endif /* DIAGNOSTIC */ 2841 dp->di_db[i] = 0; 2842 } 2843 for (i = 0; i < NIADDR; i++) { 2844#ifdef DIAGNOSTIC 2845 if (dp->di_ib[i] != 0 && 2846 (deplist & ((1 << NDADDR) << i)) == 0) 2847 panic("softdep_write_inodeblock: lost dep2"); 2848#endif /* DIAGNOSTIC */ 2849 dp->di_ib[i] = 0; 2850 } 2851 FREE_LOCK(&lk); 2852 return; 2853 } 2854 /* 2855 * If we have zero'ed out the last allocated block of the file, 2856 * roll back the size to the last currently allocated block. 2857 * We know that this last allocated block is a full-sized as 2858 * we already checked for fragments in the loop above. 2859 */ 2860 if (lastadp != NULL && 2861 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { 2862 for (i = lastadp->ad_lbn; i >= 0; i--) 2863 if (dp->di_db[i] != 0) 2864 break; 2865 dp->di_size = (i + 1) * fs->fs_bsize; 2866 } 2867 /* 2868 * The only dependencies are for indirect blocks. 2869 * 2870 * The file size for indirect block additions is not guaranteed. 2871 * Such a guarantee would be non-trivial to achieve. The conventional 2872 * synchronous write implementation also does not make this guarantee. 2873 * Fsck should catch and fix discrepancies. Arguably, the file size 2874 * can be over-estimated without destroying integrity when the file 2875 * moves into the indirect blocks (i.e., is large). 
If we want to 2876 * postpone fsck, we are stuck with this argument. 2877 */ 2878 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 2879 dp->di_ib[adp->ad_lbn - NDADDR] = 0; 2880 FREE_LOCK(&lk); 2881} 2882 2883/* 2884 * This routine is called during the completion interrupt 2885 * service routine for a disk write (from the procedure called 2886 * by the device driver to inform the file system caches of 2887 * a request completion). It should be called early in this 2888 * procedure, before the block is made available to other 2889 * processes or other routines are called. 2890 */ 2891void 2892softdep_disk_write_complete(bp) 2893 struct buf *bp; /* describes the completed disk write */ 2894{ 2895 struct worklist *wk; 2896 struct workhead reattach; 2897 struct newblk *newblk; 2898 struct allocindir *aip; 2899 struct allocdirect *adp; 2900 struct indirdep *indirdep; 2901 struct inodedep *inodedep; 2902 struct bmsafemap *bmsafemap; 2903 2904#ifdef DEBUG 2905 if (lk.lkt_held != -1) 2906 panic("softdep_disk_write_complete: lock is held"); 2907 lk.lkt_held = -2; 2908#endif 2909 LIST_INIT(&reattach); 2910 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 2911 WORKLIST_REMOVE(wk); 2912 switch (wk->wk_type) { 2913 2914 case D_PAGEDEP: 2915 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 2916 WORKLIST_INSERT(&reattach, wk); 2917 continue; 2918 2919 case D_INODEDEP: 2920 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 2921 WORKLIST_INSERT(&reattach, wk); 2922 continue; 2923 2924 case D_BMSAFEMAP: 2925 bmsafemap = WK_BMSAFEMAP(wk); 2926 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { 2927 newblk->nb_state |= DEPCOMPLETE; 2928 newblk->nb_bmsafemap = NULL; 2929 LIST_REMOVE(newblk, nb_deps); 2930 } 2931 while (adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd)) { 2932 adp->ad_state |= DEPCOMPLETE; 2933 adp->ad_buf = NULL; 2934 LIST_REMOVE(adp, ad_deps); 2935 handle_allocdirect_partdone(adp); 2936 } 2937 while (aip = LIST_FIRST(&bmsafemap->sm_allocindirhd)) { 2938 aip->ai_state 
|= DEPCOMPLETE; 2939 aip->ai_buf = NULL; 2940 LIST_REMOVE(aip, ai_deps); 2941 handle_allocindir_partdone(aip); 2942 } 2943 while ((inodedep = 2944 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { 2945 inodedep->id_state |= DEPCOMPLETE; 2946 LIST_REMOVE(inodedep, id_deps); 2947 inodedep->id_buf = NULL; 2948 } 2949 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 2950 continue; 2951 2952 case D_MKDIR: 2953 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 2954 continue; 2955 2956 case D_ALLOCDIRECT: 2957 adp = WK_ALLOCDIRECT(wk); 2958 adp->ad_state |= COMPLETE; 2959 handle_allocdirect_partdone(adp); 2960 continue; 2961 2962 case D_ALLOCINDIR: 2963 aip = WK_ALLOCINDIR(wk); 2964 aip->ai_state |= COMPLETE; 2965 handle_allocindir_partdone(aip); 2966 continue; 2967 2968 case D_INDIRDEP: 2969 indirdep = WK_INDIRDEP(wk); 2970 if (indirdep->ir_state & GOINGAWAY) 2971 panic("disk_write_complete: indirdep gone"); 2972 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 2973 FREE(indirdep->ir_saveddata, M_INDIRDEP); 2974 indirdep->ir_saveddata = 0; 2975 indirdep->ir_state &= ~UNDONE; 2976 indirdep->ir_state |= ATTACHED; 2977 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 2978 handle_allocindir_partdone(aip); 2979 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 2980 panic("disk_write_complete: not gone"); 2981 } 2982 WORKLIST_INSERT(&reattach, wk); 2983 bdirty(bp); 2984 continue; 2985 2986 default: 2987 panic("handle_disk_write_complete: Unknown type %s", 2988 TYPENAME(wk->wk_type)); 2989 /* NOTREACHED */ 2990 } 2991 } 2992 /* 2993 * Reattach any requests that must be redone. 2994 */ 2995 while ((wk = LIST_FIRST(&reattach)) != NULL) { 2996 WORKLIST_REMOVE(wk); 2997 WORKLIST_INSERT(&bp->b_dep, wk); 2998 } 2999#ifdef DEBUG 3000 if (lk.lkt_held != -2) 3001 panic("softdep_disk_write_complete: lock lost"); 3002 lk.lkt_held = -1; 3003#endif 3004} 3005 3006/* 3007 * Called from within softdep_disk_write_complete above. 
Note that 3008 * this routine is always called from interrupt level with further 3009 * splbio interrupts blocked. 3010 */ 3011static void 3012handle_allocdirect_partdone(adp) 3013 struct allocdirect *adp; /* the completed allocdirect */ 3014{ 3015 struct allocdirect *listadp; 3016 struct inodedep *inodedep; 3017 long bsize; 3018 3019 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 3020 return; 3021 if (adp->ad_buf != NULL) 3022 panic("handle_allocdirect_partdone: dangling dep"); 3023 /* 3024 * The on-disk inode cannot claim to be any larger than the last 3025 * fragment that has been written. Otherwise, the on-disk inode 3026 * might have fragments that were not the last block in the file 3027 * which would corrupt the filesystem. Thus, we cannot free any 3028 * allocdirects after one whose ad_oldblkno claims a fragment as 3029 * these blocks must be rolled back to zero before writing the inode. 3030 * We check the currently active set of allocdirects in id_inoupdt. 3031 */ 3032 inodedep = adp->ad_inodedep; 3033 bsize = inodedep->id_fs->fs_bsize; 3034 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp; 3035 listadp = TAILQ_NEXT(listadp, ad_next)) { 3036 /* found our block */ 3037 if (listadp == adp) 3038 break; 3039 /* continue if ad_oldlbn is not a fragment */ 3040 if (listadp->ad_oldsize == 0 || 3041 listadp->ad_oldsize == bsize) 3042 continue; 3043 /* hit a fragment */ 3044 return; 3045 } 3046 /* 3047 * If we have reached the end of the current list without 3048 * finding the just finished dependency, then it must be 3049 * on the future dependency list. Future dependencies cannot 3050 * be freed until they are moved to the current list. 
3051 */ 3052 if (listadp == NULL) { 3053#ifdef DEBUG 3054 for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp; 3055 listadp = TAILQ_NEXT(listadp, ad_next)) 3056 /* found our block */ 3057 if (listadp == adp) 3058 break; 3059 if (listadp == NULL) 3060 panic("handle_allocdirect_partdone: lost dep"); 3061#endif /* DEBUG */ 3062 return; 3063 } 3064 /* 3065 * If we have found the just finished dependency, then free 3066 * it along with anything that follows it that is complete. 3067 */ 3068 for (; adp; adp = listadp) { 3069 listadp = TAILQ_NEXT(adp, ad_next); 3070 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 3071 return; 3072 free_allocdirect(&inodedep->id_inoupdt, adp, 1); 3073 } 3074} 3075 3076/* 3077 * Called from within softdep_disk_write_complete above. Note that 3078 * this routine is always called from interrupt level with further 3079 * splbio interrupts blocked. 3080 */ 3081static void 3082handle_allocindir_partdone(aip) 3083 struct allocindir *aip; /* the completed allocindir */ 3084{ 3085 struct indirdep *indirdep; 3086 3087 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 3088 return; 3089 if (aip->ai_buf != NULL) 3090 panic("handle_allocindir_partdone: dangling dependency"); 3091 indirdep = aip->ai_indirdep; 3092 if (indirdep->ir_state & UNDONE) { 3093 LIST_REMOVE(aip, ai_next); 3094 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 3095 return; 3096 } 3097 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 3098 aip->ai_newblkno; 3099 LIST_REMOVE(aip, ai_next); 3100 if (aip->ai_freefrag != NULL) 3101 add_to_worklist(&aip->ai_freefrag->ff_list); 3102 WORKITEM_FREE(aip, D_ALLOCINDIR); 3103} 3104 3105/* 3106 * Called from within softdep_disk_write_complete above to restore 3107 * in-memory inode block contents to their most up-to-date state. Note 3108 * that this routine is always called from interrupt level with further 3109 * splbio interrupts blocked. 
3110 */ 3111static int 3112handle_written_inodeblock(inodedep, bp) 3113 struct inodedep *inodedep; 3114 struct buf *bp; /* buffer containing the inode block */ 3115{ 3116 struct worklist *wk, *filefree; 3117 struct allocdirect *adp, *nextadp; 3118 struct dinode *dp; 3119 int hadchanges; 3120 3121 if ((inodedep->id_state & IOSTARTED) == 0) 3122 panic("handle_written_inodeblock: not started"); 3123 inodedep->id_state &= ~IOSTARTED; 3124 inodedep->id_state |= COMPLETE; 3125 dp = (struct dinode *)bp->b_data + 3126 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 3127 /* 3128 * If we had to rollback the inode allocation because of 3129 * bitmaps being incomplete, then simply restore it. 3130 * Keep the block dirty so that it will not be reclaimed until 3131 * all associated dependencies have been cleared and the 3132 * corresponding updates written to disk. 3133 */ 3134 if (inodedep->id_savedino != NULL) { 3135 *dp = *inodedep->id_savedino; 3136 FREE(inodedep->id_savedino, M_INODEDEP); 3137 inodedep->id_savedino = NULL; 3138 bdirty(bp); 3139 return (1); 3140 } 3141 /* 3142 * Roll forward anything that had to be rolled back before 3143 * the inode could be updated. 
3144 */ 3145 hadchanges = 0; 3146 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 3147 nextadp = TAILQ_NEXT(adp, ad_next); 3148 if (adp->ad_state & ATTACHED) 3149 panic("handle_written_inodeblock: new entry"); 3150 if (adp->ad_lbn < NDADDR) { 3151 if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) 3152 panic("%s: %s #%ld mismatch %d != %d", 3153 "handle_written_inodeblock", 3154 "direct pointer", adp->ad_lbn, 3155 dp->di_db[adp->ad_lbn], adp->ad_oldblkno); 3156 dp->di_db[adp->ad_lbn] = adp->ad_newblkno; 3157 } else { 3158 if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) 3159 panic("%s: %s #%ld allocated as %d", 3160 "handle_written_inodeblock", 3161 "indirect pointer", adp->ad_lbn - NDADDR, 3162 dp->di_ib[adp->ad_lbn - NDADDR]); 3163 dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; 3164 } 3165 adp->ad_state &= ~UNDONE; 3166 adp->ad_state |= ATTACHED; 3167 hadchanges = 1; 3168 } 3169 /* 3170 * Reset the file size to its most up-to-date value. 3171 */ 3172 if (inodedep->id_savedsize == -1) 3173 panic("handle_written_inodeblock: bad size"); 3174 if (dp->di_size != inodedep->id_savedsize) { 3175 dp->di_size = inodedep->id_savedsize; 3176 hadchanges = 1; 3177 } 3178 inodedep->id_savedsize = -1; 3179 /* 3180 * If there were any rollbacks in the inode block, then it must be 3181 * marked dirty so that its will eventually get written back in 3182 * its correct form. 3183 */ 3184 if (hadchanges) 3185 bdirty(bp); 3186 /* 3187 * Process any allocdirects that completed during the update. 3188 */ 3189 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 3190 handle_allocdirect_partdone(adp); 3191 /* 3192 * Process deallocations that were held pending until the 3193 * inode had been written to disk. Freeing of the inode 3194 * is delayed until after all blocks have been freed to 3195 * avoid creation of new <vfsid, inum, lbn> triples 3196 * before the old ones have been deleted. 
3197 */ 3198 filefree = NULL; 3199 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 3200 WORKLIST_REMOVE(wk); 3201 switch (wk->wk_type) { 3202 3203 case D_FREEFILE: 3204 /* 3205 * We defer adding filefree to the worklist until 3206 * all other additions have been made to ensure 3207 * that it will be done after all the old blocks 3208 * have been freed. 3209 */ 3210 if (filefree != NULL) 3211 panic("handle_written_inodeblock: filefree"); 3212 filefree = wk; 3213 continue; 3214 3215 case D_MKDIR: 3216 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 3217 continue; 3218 3219 case D_DIRADD: 3220 diradd_inode_written(WK_DIRADD(wk), inodedep); 3221 continue; 3222 3223 case D_FREEBLKS: 3224 case D_FREEFRAG: 3225 case D_DIRREM: 3226 add_to_worklist(wk); 3227 continue; 3228 3229 default: 3230 panic("handle_written_inodeblock: Unknown type %s", 3231 TYPENAME(wk->wk_type)); 3232 /* NOTREACHED */ 3233 } 3234 } 3235 if (filefree != NULL) { 3236 if (free_inodedep(inodedep) == 0) 3237 panic("handle_written_inodeblock: live inodedep"); 3238 add_to_worklist(filefree); 3239 return (0); 3240 } 3241 3242 /* 3243 * If no outstanding dependencies, free it. 3244 */ 3245 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0) 3246 return (0); 3247 return (hadchanges); 3248} 3249 3250/* 3251 * Process a diradd entry after its dependent inode has been written. 3252 * This routine must be called with splbio interrupts blocked. 
3253 */ 3254static void 3255diradd_inode_written(dap, inodedep) 3256 struct diradd *dap; 3257 struct inodedep *inodedep; 3258{ 3259 struct pagedep *pagedep; 3260 3261 dap->da_state |= COMPLETE; 3262 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3263 if (dap->da_state & DIRCHG) 3264 pagedep = dap->da_previous->dm_pagedep; 3265 else 3266 pagedep = dap->da_pagedep; 3267 LIST_REMOVE(dap, da_pdlist); 3268 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 3269 } 3270 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 3271} 3272 3273/* 3274 * Handle the completion of a mkdir dependency. 3275 */ 3276static void 3277handle_written_mkdir(mkdir, type) 3278 struct mkdir *mkdir; 3279 int type; 3280{ 3281 struct diradd *dap; 3282 struct pagedep *pagedep; 3283 3284 if (mkdir->md_state != type) 3285 panic("handle_written_mkdir: bad type"); 3286 dap = mkdir->md_diradd; 3287 dap->da_state &= ~type; 3288 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 3289 dap->da_state |= DEPCOMPLETE; 3290 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3291 if (dap->da_state & DIRCHG) 3292 pagedep = dap->da_previous->dm_pagedep; 3293 else 3294 pagedep = dap->da_pagedep; 3295 LIST_REMOVE(dap, da_pdlist); 3296 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 3297 } 3298 LIST_REMOVE(mkdir, md_mkdirs); 3299 WORKITEM_FREE(mkdir, D_MKDIR); 3300} 3301 3302/* 3303 * Called from within softdep_disk_write_complete above. 3304 * A write operation was just completed. Removed inodes can 3305 * now be freed and associated block pointers may be committed. 3306 * Note that this routine is always called from interrupt level 3307 * with further splbio interrupts blocked. 
3308 */ 3309static int 3310handle_written_filepage(pagedep, bp) 3311 struct pagedep *pagedep; 3312 struct buf *bp; /* buffer containing the written page */ 3313{ 3314 struct dirrem *dirrem; 3315 struct diradd *dap, *nextdap; 3316 struct direct *ep; 3317 int i, chgs; 3318 3319 if ((pagedep->pd_state & IOSTARTED) == 0) 3320 panic("handle_written_filepage: not started"); 3321 pagedep->pd_state &= ~IOSTARTED; 3322 /* 3323 * Process any directory removals that have been committed. 3324 */ 3325 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 3326 LIST_REMOVE(dirrem, dm_next); 3327 dirrem->dm_dirinum = pagedep->pd_ino; 3328 add_to_worklist(&dirrem->dm_list); 3329 } 3330 /* 3331 * Free any directory additions that have been committed. 3332 */ 3333 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 3334 free_diradd(dap); 3335 /* 3336 * Uncommitted directory entries must be restored. 3337 */ 3338 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 3339 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 3340 dap = nextdap) { 3341 nextdap = LIST_NEXT(dap, da_pdlist); 3342 if (dap->da_state & ATTACHED) 3343 panic("handle_written_filepage: attached"); 3344 ep = (struct direct *) 3345 ((char *)bp->b_data + dap->da_offset); 3346 ep->d_ino = dap->da_newinum; 3347 dap->da_state &= ~UNDONE; 3348 dap->da_state |= ATTACHED; 3349 chgs = 1; 3350 /* 3351 * If the inode referenced by the directory has 3352 * been written out, then the dependency can be 3353 * moved to the pending list. 3354 */ 3355 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3356 LIST_REMOVE(dap, da_pdlist); 3357 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 3358 da_pdlist); 3359 } 3360 } 3361 } 3362 /* 3363 * If there were any rollbacks in the directory, then it must be 3364 * marked dirty so that its will eventually get written back in 3365 * its correct form. 3366 */ 3367 if (chgs) 3368 bdirty(bp); 3369 /* 3370 * If no dependencies remain, the pagedep will be freed. 
3371 * Otherwise it will remain to update the page before it 3372 * is written back to disk. 3373 */ 3374 if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) { 3375 for (i = 0; i < DAHASHSZ; i++) 3376 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL) 3377 break; 3378 if (i == DAHASHSZ) { 3379 LIST_REMOVE(pagedep, pd_hash); 3380 WORKITEM_FREE(pagedep, D_PAGEDEP); 3381 return (0); 3382 } 3383 } 3384 return (1); 3385} 3386 3387/* 3388 * Writing back in-core inode structures. 3389 * 3390 * The file system only accesses an inode's contents when it occupies an 3391 * "in-core" inode structure. These "in-core" structures are separate from 3392 * the page frames used to cache inode blocks. Only the latter are 3393 * transferred to/from the disk. So, when the updated contents of the 3394 * "in-core" inode structure are copied to the corresponding in-memory inode 3395 * block, the dependencies are also transferred. The following procedure is 3396 * called when copying a dirty "in-core" inode to a cached inode block. 3397 */ 3398 3399/* 3400 * Called when an inode is loaded from disk. If the effective link count 3401 * differed from the actual link count when it was last flushed, then we 3402 * need to ensure that the correct effective link count is put back. 3403 */ 3404void 3405softdep_load_inodeblock(ip) 3406 struct inode *ip; /* the "in_core" copy of the inode */ 3407{ 3408 struct inodedep *inodedep; 3409 3410 /* 3411 * Check for alternate nlink count. 
3412 */ 3413 ip->i_effnlink = ip->i_nlink; 3414 ACQUIRE_LOCK(&lk); 3415 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { 3416 FREE_LOCK(&lk); 3417 return; 3418 } 3419 if (inodedep->id_nlinkdelta != 0) { 3420 ip->i_effnlink -= inodedep->id_nlinkdelta; 3421 ip->i_flag |= IN_MODIFIED; 3422 inodedep->id_nlinkdelta = 0; 3423 (void) free_inodedep(inodedep); 3424 } 3425 FREE_LOCK(&lk); 3426} 3427 3428/* 3429 * This routine is called just before the "in-core" inode 3430 * information is to be copied to the in-memory inode block. 3431 * Recall that an inode block contains several inodes. If 3432 * the force flag is set, then the dependencies will be 3433 * cleared so that the update can always be made. Note that 3434 * the buffer is locked when this routine is called, so we 3435 * will never be in the middle of writing the inode block 3436 * to disk. 3437 */ 3438void 3439softdep_update_inodeblock(ip, bp, waitfor) 3440 struct inode *ip; /* the "in_core" copy of the inode */ 3441 struct buf *bp; /* the buffer containing the inode block */ 3442 int waitfor; /* nonzero => update must be allowed */ 3443{ 3444 struct inodedep *inodedep; 3445 struct worklist *wk; 3446 int error, gotit; 3447 3448 /* 3449 * If the effective link count is not equal to the actual link 3450 * count, then we must track the difference in an inodedep while 3451 * the inode is (potentially) tossed out of the cache. Otherwise, 3452 * if there is no existing inodedep, then there are no dependencies 3453 * to track. 3454 */ 3455 ACQUIRE_LOCK(&lk); 3456 if (ip->i_effnlink != ip->i_nlink) { 3457 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, 3458 &inodedep); 3459 } else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { 3460 FREE_LOCK(&lk); 3461 return; 3462 } 3463 if (ip->i_nlink < ip->i_effnlink) 3464 panic("softdep_update_inodeblock: bad delta"); 3465 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3466 /* 3467 * Changes have been initiated. 
Anything depending on these 3468 * changes cannot occur until this inode has been written. 3469 */ 3470 inodedep->id_state &= ~COMPLETE; 3471 if ((inodedep->id_state & ONWORKLIST) == 0) 3472 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 3473 /* 3474 * Any new dependencies associated with the incore inode must 3475 * now be moved to the list associated with the buffer holding 3476 * the in-memory copy of the inode. Once merged process any 3477 * allocdirects that are completed by the merger. 3478 */ 3479 merge_inode_lists(inodedep); 3480 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) 3481 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); 3482 /* 3483 * Now that the inode has been pushed into the buffer, the 3484 * operations dependent on the inode being written to disk 3485 * can be moved to the id_bufwait so that they will be 3486 * processed when the buffer I/O completes. 3487 */ 3488 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 3489 WORKLIST_REMOVE(wk); 3490 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 3491 } 3492 /* 3493 * Newly allocated inodes cannot be written until the bitmap 3494 * that allocates them have been written (indicated by 3495 * DEPCOMPLETE being set in id_state). If we are doing a 3496 * forced sync (e.g., an fsync on a file), we force the bitmap 3497 * to be written so that the update can be done. 3498 */ 3499 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) { 3500 FREE_LOCK(&lk); 3501 return; 3502 } 3503 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); 3504 FREE_LOCK(&lk); 3505 if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0) 3506 softdep_error("softdep_update_inodeblock: bwrite", error); 3507 if ((inodedep->id_state & DEPCOMPLETE) == 0) 3508 panic("softdep_update_inodeblock: update failed"); 3509} 3510 3511/* 3512 * Merge the new inode dependency list (id_newinoupdt) into the old 3513 * inode dependency list (id_inoupdt). This routine must be called 3514 * with splbio interrupts blocked. 
3515 */ 3516static void 3517merge_inode_lists(inodedep) 3518 struct inodedep *inodedep; 3519{ 3520 struct allocdirect *listadp, *newadp; 3521 3522 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); 3523 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) { 3524 if (listadp->ad_lbn < newadp->ad_lbn) { 3525 listadp = TAILQ_NEXT(listadp, ad_next); 3526 continue; 3527 } 3528 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); 3529 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 3530 if (listadp->ad_lbn == newadp->ad_lbn) { 3531 allocdirect_merge(&inodedep->id_inoupdt, newadp, 3532 listadp); 3533 listadp = newadp; 3534 } 3535 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); 3536 } 3537 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) { 3538 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); 3539 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next); 3540 } 3541} 3542 3543/* 3544 * If we are doing an fsync, then we must ensure that any directory 3545 * entries for the inode have been written after the inode gets to disk. 
3546 */ 3547int 3548softdep_fsync(vp) 3549 struct vnode *vp; /* the "in_core" copy of the inode */ 3550{ 3551 struct diradd *dap, *olddap; 3552 struct inodedep *inodedep; 3553 struct pagedep *pagedep; 3554 struct worklist *wk; 3555 struct mount *mnt; 3556 struct vnode *pvp; 3557 struct inode *ip; 3558 struct buf *bp; 3559 struct fs *fs; 3560 struct proc *p = CURPROC; /* XXX */ 3561 int error, ret, flushparent; 3562#ifndef __FreeBSD__ 3563 struct timeval tv; 3564#endif 3565 ino_t parentino; 3566 ufs_lbn_t lbn; 3567 3568 ip = VTOI(vp); 3569 fs = ip->i_fs; 3570 for (error = 0, flushparent = 0, olddap = NULL; ; ) { 3571 ACQUIRE_LOCK(&lk); 3572 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) 3573 break; 3574 if (LIST_FIRST(&inodedep->id_inowait) != NULL || 3575 LIST_FIRST(&inodedep->id_bufwait) != NULL || 3576 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || 3577 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) 3578 panic("softdep_fsync: pending ops"); 3579 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 3580 break; 3581 if (wk->wk_type != D_DIRADD) 3582 panic("softdep_fsync: Unexpected type %s", 3583 TYPENAME(wk->wk_type)); 3584 dap = WK_DIRADD(wk); 3585 /* 3586 * If we have failed to get rid of all the dependencies 3587 * then something is seriously wrong. 3588 */ 3589 if (dap == olddap) 3590 panic("softdep_fsync: flush failed"); 3591 olddap = dap; 3592 /* 3593 * Flush our parent if this directory entry 3594 * has a MKDIR_PARENT dependency. 
3595 */ 3596 if (dap->da_state & DIRCHG) 3597 pagedep = dap->da_previous->dm_pagedep; 3598 else 3599 pagedep = dap->da_pagedep; 3600 mnt = pagedep->pd_mnt; 3601 parentino = pagedep->pd_ino; 3602 lbn = pagedep->pd_lbn; 3603 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 3604 panic("softdep_fsync: dirty"); 3605 flushparent = dap->da_state & MKDIR_PARENT; 3606 /* 3607 * If we are being fsync'ed as part of vgone'ing this vnode, 3608 * then we will not be able to release and recover the 3609 * vnode below, so we just have to give up on writing its 3610 * directory entry out. It will eventually be written, just 3611 * not now, but then the user was not asking to have it 3612 * written, so we are not breaking any promises. 3613 */ 3614 if (vp->v_flag & VXLOCK) 3615 break; 3616 /* 3617 * We prevent deadlock by always fetching inodes from the 3618 * root, moving down the directory tree. Thus, when fetching 3619 * our parent directory, we must unlock ourselves before 3620 * requesting the lock on our parent. See the comment in 3621 * ufs_lookup for details on possible races. 3622 */ 3623 FREE_LOCK(&lk); 3624 VOP_UNLOCK(vp, 0, p); 3625 if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) { 3626 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 3627 return (error); 3628 } 3629 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 3630 if (flushparent) { 3631#ifdef __FreeBSD__ 3632 error = UFS_UPDATE(pvp, 1); 3633#else 3634 tv = time; 3635 error = UFS_UPDATE(pvp, &tv, &tv, 1); 3636#endif 3637 if (error) { 3638 vput(pvp); 3639 return (error); 3640 } 3641 } 3642 /* 3643 * Flush directory page containing the inode's name. 3644 */ 3645 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, 3646 &bp); 3647 ret = VOP_BWRITE(bp); 3648 vput(pvp); 3649 if (error != 0) 3650 return (error); 3651 if (ret != 0) 3652 return (ret); 3653 } 3654 FREE_LOCK(&lk); 3655 return (0); 3656} 3657 3658/* 3659 * This routine is called when we are trying to synchronously flush a 3660 * file. 
 * This routine must eliminate any filesystem metadata dependencies
 * so that the syncing routine can succeed by pushing the dirty blocks
 * associated with the file. If any I/O errors occur, they are returned.
 */
int
softdep_sync_metadata(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct pagedep *pagedep;
	struct allocdirect *adp;
	struct allocindir *aip;
	struct buf *bp, *nbp;
	struct worklist *wk;
	int i, error, waitfor;

	/*
	 * Check whether this vnode is involved in a filesystem
	 * that is doing soft dependency processing.
	 */
	if (vp->v_type != VBLK) {
		if (!DOINGSOFTDEP(vp))
			return (0);
	} else
		if (vp->v_specmountpoint == NULL ||
		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
			return (0);
	/*
	 * Ensure that any direct block dependencies have been cleared.
	 */
	ACQUIRE_LOCK(&lk);
	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
		FREE_LOCK(&lk);
		return (error);
	}
	/*
	 * For most files, the only metadata dependencies are the
	 * cylinder group maps that allocate their inode or blocks.
	 * The block allocation dependencies can be found by traversing
	 * the dependency lists for any buffers that remain on their
	 * dirty buffer list. The inode allocation dependency will
	 * be resolved when the inode is updated with MNT_WAIT.
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 */
	waitfor = MNT_NOWAIT;
top:
	/*
	 * Lock the first dirty buffer; if there is none we are done.
	 */
	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
		FREE_LOCK(&lk);
		return (0);
	}
	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
loop:
	/*
	 * As we hold the buffer locked, none of its dependencies
	 * will disappear.
	 */
	for (wk = LIST_FIRST(&bp->b_dep); wk;
	     wk = LIST_NEXT(wk, wk_list)) {
		switch (wk->wk_type) {

		case D_ALLOCDIRECT:
			/*
			 * Push the bitmap buffer holding this direct
			 * block allocation, async on pass 1, sync on
			 * pass 2.
			 */
			adp = WK_ALLOCDIRECT(wk);
			if (adp->ad_state & DEPCOMPLETE)
				break;
			nbp = adp->ad_buf;
			if (getdirtybuf(&nbp, waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = VOP_BWRITE(nbp)) != 0) {
				bawrite(bp);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		case D_ALLOCINDIR:
			/* Same treatment for an indirect block allocation. */
			aip = WK_ALLOCINDIR(wk);
			if (aip->ai_state & DEPCOMPLETE)
				break;
			nbp = aip->ai_buf;
			if (getdirtybuf(&nbp, waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = VOP_BWRITE(nbp)) != 0) {
				bawrite(bp);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		case D_INDIRDEP:
			/*
			 * Synchronously write every incomplete allocindir
			 * hanging off this indirect block. The list can
			 * change while the lock is dropped, so restart
			 * the scan from the head after every write or
			 * failed buffer grab.
			 */
		restart:
			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
			     aip; aip = LIST_NEXT(aip, ai_next)) {
				if (aip->ai_state & DEPCOMPLETE)
					continue;
				nbp = aip->ai_buf;
				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
					goto restart;
				FREE_LOCK(&lk);
				if ((error = VOP_BWRITE(nbp)) != 0) {
					bawrite(bp);
					return (error);
				}
				ACQUIRE_LOCK(&lk);
				goto restart;
			}
			break;

		case D_INODEDEP:
			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
			    WK_INODEDEP(wk)->id_ino)) != 0) {
				FREE_LOCK(&lk);
				bawrite(bp);
				return (error);
			}
			break;

		case D_PAGEDEP:
			/*
			 * We are trying to sync a directory that may
			 * have dependencies on both its own metadata
			 * and/or dependencies on the inodes of any
			 * recently allocated files. We walk its diradd
			 * lists pushing out the associated inode.
			 */
			pagedep = WK_PAGEDEP(wk);
			for (i = 0; i < DAHASHSZ; i++) {
				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
					continue;
				/*
				 * NOTE(review): assignment used as the
				 * condition without extra parentheses;
				 * intentional here but draws compiler
				 * warnings on modern toolchains.
				 */
				if (error = flush_pagedep_deps(vp,
				    pagedep->pd_mnt, &pagedep->pd_diraddhd[i])) {
					FREE_LOCK(&lk);
					bawrite(bp);
					return (error);
				}
			}
			break;

		case D_MKDIR:
			/*
			 * This case should never happen if the vnode has
			 * been properly sync'ed. However, if this function
			 * is used at a place where the vnode has not yet
			 * been sync'ed, this dependency can show up. So,
			 * rather than panic, just flush it.
			 */
			nbp = WK_MKDIR(wk)->md_buf;
			if (getdirtybuf(&nbp, waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = VOP_BWRITE(nbp)) != 0) {
				bawrite(bp);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		case D_BMSAFEMAP:
			/*
			 * This case should never happen if the vnode has
			 * been properly sync'ed. However, if this function
			 * is used at a place where the vnode has not yet
			 * been sync'ed, this dependency can show up. So,
			 * rather than panic, just flush it.
			 */
			nbp = WK_BMSAFEMAP(wk)->sm_buf;
			if (getdirtybuf(&nbp, waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = VOP_BWRITE(nbp)) != 0) {
				bawrite(bp);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		default:
			panic("softdep_sync_metadata: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/*
	 * Lock the successor before releasing this buffer so it cannot
	 * be reclaimed out from under us, then start this one's write.
	 */
	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
	nbp = TAILQ_NEXT(bp, b_vnbufs);
	FREE_LOCK(&lk);
	bawrite(bp);
	ACQUIRE_LOCK(&lk);
	if (nbp != NULL) {
		bp = nbp;
		goto loop;
	}
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 * Once they are all there, proceed with the second pass
	 * which will wait for the I/O as per above.
	 */
	drain_output(vp, 1);
	/*
	 * The brief unlock is to allow any pent up dependency
	 * processing to be done.
	 */
	if (waitfor == MNT_NOWAIT) {
		waitfor = MNT_WAIT;
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
		goto top;
	}

	/*
	 * If we have managed to get rid of all the dirty buffers,
	 * then we are done. For certain directories and block
	 * devices, we may need to do further work.
	 */
	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
		FREE_LOCK(&lk);
		return (0);
	}

	FREE_LOCK(&lk);
	/*
	 * If we are trying to sync a block device, some of its buffers may
	 * contain metadata that cannot be written until the contents of some
	 * partially written files have been written to disk. The only easy
	 * way to accomplish this is to sync the entire filesystem (luckily
	 * this happens rarely).
	 */
	if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
	     ap->a_p)) != 0)
		return (error);
	return (0);
}

/*
 * Flush the dependencies associated with an inodedep.
 * Called with splbio blocked.
 * Returns 0 on success or the first write error encountered.
 */
static int
flush_inodedep_deps(fs, ino)
	struct fs *fs;
	ino_t ino;
{
	struct inodedep *inodedep;
	struct allocdirect *adp;
	int error, waitfor;
	struct buf *bp;

	/*
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 * We give a brief window at the top of the loop to allow
	 * any pending I/O to complete.
	 */
	for (waitfor = MNT_NOWAIT; ; ) {
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
			return (0);
		/*
		 * Push the bitmap buffers for the committed direct
		 * block allocations. adp is non-NULL on exit when the
		 * list changed under us (or on a sync write), in which
		 * case we restart the whole scan.
		 */
		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
		     adp = TAILQ_NEXT(adp, ad_next)) {
			if (adp->ad_state & DEPCOMPLETE)
				continue;
			bp = adp->ad_buf;
			if (getdirtybuf(&bp, waitfor) == 0) {
				if (waitfor == MNT_NOWAIT)
					continue;
				break;
			}
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(bp);
			} else if ((error = VOP_BWRITE(bp)) != 0) {
				ACQUIRE_LOCK(&lk);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;
		}
		if (adp != NULL)
			continue;
		/* Same scan over the allocations made since the last write. */
		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
		     adp = TAILQ_NEXT(adp, ad_next)) {
			if (adp->ad_state & DEPCOMPLETE)
				continue;
			bp = adp->ad_buf;
			if (getdirtybuf(&bp, waitfor) == 0) {
				if (waitfor == MNT_NOWAIT)
					continue;
				break;
			}
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(bp);
			} else if ((error = VOP_BWRITE(bp)) != 0) {
				ACQUIRE_LOCK(&lk);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;
		}
		if (adp != NULL)
			continue;
		/*
		 * If pass2, we are done, otherwise do pass 2.
		 */
		if (waitfor == MNT_WAIT)
			break;
		waitfor = MNT_WAIT;
	}
	/*
	 * Try freeing inodedep in case all dependencies have been removed.
	 */
	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	return (0);
}

/*
 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 * Called with splbio blocked.
 */
static int
flush_pagedep_deps(pvp, mp, diraddhdp)
	struct vnode *pvp;	/* directory vnode whose entries we are flushing */
	struct mount *mp;	/* mount holding the new inodes */
	struct diraddhd *diraddhdp;	/* one hash chain of diradd deps */
{
	struct proc *p = CURPROC;	/* XXX */
	struct inodedep *inodedep;
	struct ufsmount *ump;
	struct diradd *dap;
#ifndef __FreeBSD__
	struct timeval tv;
#endif
	struct vnode *vp;
	int gotit, error = 0;
	struct buf *bp;
	ino_t inum;

	ump = VFSTOUFS(mp);
	/*
	 * Keep flushing until the list empties. Each iteration must
	 * remove the head diradd (checked via the "flush N failed"
	 * panics), otherwise we would loop forever. Any break below
	 * exits the loop with the soft updates lock released; the
	 * error path re-takes it so that all exits hold the lock.
	 */
	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
		/*
		 * Flush ourselves if this directory entry
		 * has a MKDIR_PARENT dependency.
		 */
		if (dap->da_state & MKDIR_PARENT) {
			FREE_LOCK(&lk);
#ifdef __FreeBSD__
			error = UFS_UPDATE(pvp, 1);
#else
			tv = time;
			error = UFS_UPDATE(pvp, &tv, &tv, 1);
#endif
			if (error)
				break;
			ACQUIRE_LOCK(&lk);
			/*
			 * If that cleared dependencies, go on to next.
			 */
			if (dap != LIST_FIRST(diraddhdp))
				continue;
			if (dap->da_state & MKDIR_PARENT)
				panic("flush_pagedep_deps: MKDIR");
		}
		/*
		 * Flush the file on which the directory entry depends.
		 * If the inode has already been pushed out of the cache,
		 * then all the block dependencies will have been flushed
		 * leaving only inode dependencies (e.g., bitmaps). Thus,
		 * we do a ufs_ihashget to check for the vnode in the cache.
		 * If it is there, we do a full flush. If it is no longer
		 * there we need only dispose of any remaining bitmap
		 * dependencies and write the inode to disk.
		 */
		inum = dap->da_newinum;
		FREE_LOCK(&lk);
		if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) {
			ACQUIRE_LOCK(&lk);
			if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0
			    && dap == LIST_FIRST(diraddhdp))
				panic("flush_pagedep_deps: flush 1 failed");
			/*
			 * If the inode still has bitmap dependencies,
			 * push them to disk.
			 * NOTE(review): if inodedep_lookup failed but dap
			 * was no longer the list head, inodedep is used
			 * here without having been found -- presumably
			 * lookup leaves a usable pointer or the case
			 * cannot arise; verify against inodedep_lookup.
			 */
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT);
				FREE_LOCK(&lk);
				if (gotit &&
				    (error = VOP_BWRITE(inodedep->id_buf)) != 0)
					break;
				ACQUIRE_LOCK(&lk);
			}
			if (dap != LIST_FIRST(diraddhdp))
				continue;
			/*
			 * If the inode is still sitting in a buffer waiting
			 * to be written, push it to disk.
			 */
			FREE_LOCK(&lk);
			if ((error = bread(ump->um_devvp,
			    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
			    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
				break;
			if ((error = VOP_BWRITE(bp)) != 0)
				break;
			ACQUIRE_LOCK(&lk);
			if (dap == LIST_FIRST(diraddhdp))
				panic("flush_pagedep_deps: flush 2 failed");
			continue;
		}
		if (vp->v_type == VDIR) {
			/*
			 * A newly allocated directory must have its "." and
			 * ".." entries written out before its name can be
			 * committed in its parent. We do not want or need
			 * the full semantics of a synchronous VOP_FSYNC as
			 * that may end up here again, once for each directory
			 * level in the filesystem. Instead, we push the blocks
			 * and wait for them to clear.
			 */
			if (error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) {
				vput(vp);
				break;
			}
			drain_output(vp, 0);
		}
		/* Synchronously write the dependent file's inode. */
#ifdef __FreeBSD__
		error = UFS_UPDATE(vp, 1);
#else
		tv = time;
		error = UFS_UPDATE(vp, &tv, &tv, 1);
#endif
		vput(vp);
		if (error)
			break;
		/*
		 * If we have failed to get rid of all the dependencies
		 * then something is seriously wrong.
		 */
		if (dap == LIST_FIRST(diraddhdp))
			panic("flush_pagedep_deps: flush 3 failed");
		ACQUIRE_LOCK(&lk);
	}
	/* Error breaks left the lock released; restore the invariant. */
	if (error)
		ACQUIRE_LOCK(&lk);
	return (error);
}

/*
 * A large burst of file addition or deletion activity can drive the
 * memory load excessively high.
Therefore we deliberately slow things 4142 * down and speed up the I/O processing if we find ourselves with too 4143 * many dependencies in progress. 4144 */ 4145static int 4146checklimit(resource, islocked) 4147 long *resource; 4148 int islocked; 4149{ 4150 struct callout_handle handle; 4151 struct proc *p = CURPROC; 4152 int s; 4153 4154 /* 4155 * If we are under our limit, just proceed. 4156 */ 4157 if (*resource < max_softdeps) 4158 return (0); 4159 /* 4160 * We never hold up the filesystem syncer process. 4161 */ 4162 if (p == filesys_syncer) 4163 return (0); 4164 /* 4165 * Our first approach is to speed up the syncer process. 4166 * We never push it to speed up more than half of its 4167 * normal turn time, otherwise it could take over the cpu. 4168 */ 4169 s = splhigh(); 4170 if (filesys_syncer->p_wchan == &lbolt) 4171 setrunnable(filesys_syncer); 4172 splx(s); 4173 if (rushjob < syncdelay / 2) { 4174 rushjob += 1; 4175 rush_requests += 1; 4176 return (0); 4177 } 4178 /* 4179 * If we are resource constrained on inode dependencies, try 4180 * flushing some dirty inodes. Otherwise, we are constrained 4181 * by file deletions, so try accelerating flushes of directories 4182 * with removal dependencies. We would like to do the cleanup 4183 * here, but we probably hold an inode locked at this point and 4184 * that might deadlock against one that we try to clean. So, 4185 * the best that we can do is request the syncer daemon (kick 4186 * started above) to do the cleanup for us. 4187 */ 4188 if (resource == &num_inodedep) { 4189 ino_limit_push += 1; 4190 req_clear_inodedeps = 1; 4191 } else { 4192 blk_limit_push += 1; 4193 req_clear_remove = 1; 4194 } 4195 /* 4196 * Hopefully the syncer daemon will catch up and awaken us. 4197 * We wait at most tickdelay before proceeding in any case. 4198 */ 4199 if (islocked == 0) 4200 ACQUIRE_LOCK(&lk); 4201 if (proc_waiting == 0) { 4202 proc_waiting = 1; 4203 handle = timeout(pause_timer, NULL, 4204 tickdelay > 2 ? 
tickdelay : 2); 4205 } 4206 FREE_LOCK_INTERLOCKED(&lk); 4207 (void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0); 4208 ACQUIRE_LOCK_INTERLOCKED(&lk); 4209 if (proc_waiting) { 4210 untimeout(pause_timer, NULL, handle); 4211 proc_waiting = 0; 4212 } else { 4213 if (resource == &num_inodedep) 4214 ino_limit_hit += 1; 4215 else 4216 blk_limit_hit += 1; 4217 } 4218 if (islocked == 0) 4219 FREE_LOCK(&lk); 4220 return (1); 4221} 4222 4223/* 4224 * Awaken processes pausing in checklimit and clear proc_waiting 4225 * to indicate that there is no longer a timer running. 4226 */ 4227void 4228pause_timer(arg) 4229 void *arg; 4230{ 4231 4232 proc_waiting = 0; 4233 wakeup(&proc_waiting); 4234} 4235 4236/* 4237 * Flush out a directory with at least one removal dependency in an effort 4238 * to reduce the number of freefile and freeblks dependency structures. 4239 */ 4240static void 4241clear_remove(p) 4242 struct proc *p; 4243{ 4244 struct pagedep_hashhead *pagedephd; 4245 struct pagedep *pagedep; 4246 static int next = 0; 4247 struct mount *mp; 4248 struct vnode *vp; 4249 int error, cnt; 4250 ino_t ino; 4251 4252 ACQUIRE_LOCK(&lk); 4253 for (cnt = 0; cnt < pagedep_hash; cnt++) { 4254 pagedephd = &pagedep_hashtbl[next++]; 4255 if (next >= pagedep_hash) 4256 next = 0; 4257 for (pagedep = LIST_FIRST(pagedephd); pagedep; 4258 pagedep = LIST_NEXT(pagedep, pd_hash)) { 4259 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL) 4260 continue; 4261 mp = pagedep->pd_mnt; 4262 ino = pagedep->pd_ino; 4263 FREE_LOCK(&lk); 4264 if ((error = VFS_VGET(mp, ino, &vp)) != 0) { 4265 softdep_error("clear_remove: vget", error); 4266 return; 4267 } 4268 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) 4269 softdep_error("clear_remove: fsync", error); 4270 drain_output(vp, 0); 4271 vput(vp); 4272 return; 4273 } 4274 } 4275 FREE_LOCK(&lk); 4276} 4277 4278/* 4279 * Clear out a block of dirty inodes in an effort to reduce 4280 * the number of inodedep dependency structures. 
4281 */ 4282static void 4283clear_inodedeps(p) 4284 struct proc *p; 4285{ 4286 struct inodedep_hashhead *inodedephd; 4287 struct inodedep *inodedep; 4288 static int next = 0; 4289 struct mount *mp; 4290 struct vnode *vp; 4291 struct fs *fs; 4292 int error, cnt; 4293 ino_t firstino, lastino, ino; 4294 4295 ACQUIRE_LOCK(&lk); 4296 /* 4297 * Pick a random inode dependency to be cleared. 4298 * We will then gather up all the inodes in its block 4299 * that have dependencies and flush them out. 4300 */ 4301 for (cnt = 0; cnt < inodedep_hash; cnt++) { 4302 inodedephd = &inodedep_hashtbl[next++]; 4303 if (next >= inodedep_hash) 4304 next = 0; 4305 if ((inodedep = LIST_FIRST(inodedephd)) != NULL) 4306 break; 4307 } 4308 /* 4309 * Ugly code to find mount point given pointer to superblock. 4310 */ 4311 fs = inodedep->id_fs; 4312 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 4313 mp = CIRCLEQ_NEXT(mp, mnt_list)) 4314 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs) 4315 break; 4316 /* 4317 * Find the last inode in the block with dependencies. 4318 */ 4319 firstino = inodedep->id_ino & ~(INOPB(fs) - 1); 4320 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) 4321 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0) 4322 break; 4323 /* 4324 * Asynchronously push all but the last inode with dependencies. 4325 * Synchronously push the last inode with dependencies to ensure 4326 * that the inode block gets written to free up the inodedeps. 
4327 */ 4328 for (ino = firstino; ino <= lastino; ino++) { 4329 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) 4330 continue; 4331 FREE_LOCK(&lk); 4332 if ((error = VFS_VGET(mp, ino, &vp)) != 0) { 4333 softdep_error("clear_inodedeps: vget", error); 4334 return; 4335 } 4336 if (ino == lastino) { 4337 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p))) 4338 softdep_error("clear_inodedeps: fsync1", error); 4339 } else { 4340 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) 4341 softdep_error("clear_inodedeps: fsync2", error); 4342 drain_output(vp, 0); 4343 } 4344 vput(vp); 4345 ACQUIRE_LOCK(&lk); 4346 } 4347 FREE_LOCK(&lk); 4348} 4349 4350/* 4351 * Acquire exclusive access to a buffer. 4352 * Must be called with splbio blocked. 4353 * Return 1 if buffer was acquired. 4354 */ 4355static int 4356getdirtybuf(bpp, waitfor) 4357 struct buf **bpp; 4358 int waitfor; 4359{ 4360 struct buf *bp; 4361 4362 for (;;) { 4363 if ((bp = *bpp) == NULL) 4364 return (0); 4365 if ((bp->b_flags & B_BUSY) == 0) 4366 break; 4367 if (waitfor != MNT_WAIT) 4368 return (0); 4369 bp->b_flags |= B_WANTED; 4370 FREE_LOCK_INTERLOCKED(&lk); 4371 tsleep((caddr_t)bp, PRIBIO + 1, "sdsdty", 0); 4372 ACQUIRE_LOCK_INTERLOCKED(&lk); 4373 } 4374 if ((bp->b_flags & B_DELWRI) == 0) 4375 return (0); 4376 bremfree(bp); 4377 bp->b_flags |= B_BUSY; 4378 return (1); 4379} 4380 4381/* 4382 * Wait for pending output on a vnode to complete. 4383 * Must be called with vnode locked. 4384 */ 4385static void 4386drain_output(vp, islocked) 4387 struct vnode *vp; 4388 int islocked; 4389{ 4390 4391 if (!islocked) 4392 ACQUIRE_LOCK(&lk); 4393 while (vp->v_numoutput) { 4394 vp->v_flag |= VBWAIT; 4395 FREE_LOCK_INTERLOCKED(&lk); 4396 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0); 4397 ACQUIRE_LOCK_INTERLOCKED(&lk); 4398 } 4399 if (!islocked) 4400 FREE_LOCK(&lk); 4401} 4402 4403/* 4404 * Called whenever a buffer that is being invalidated or reallocated 4405 * contains dependencies. 
This should only happen if an I/O error has 4406 * occurred. The routine is called with the buffer locked. 4407 */ 4408void 4409softdep_deallocate_dependencies(bp) 4410 struct buf *bp; 4411{ 4412 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 4413 panic("softdep_deallocate_dependencies: dangling deps"); 4414} 4415 4416/* 4417 * Function to handle asynchronous write errors in the filesystem. 4418 */ 4419void 4420softdep_error(func, error) 4421 char *func; 4422 int error; 4423{ 4424 /* XXX should do something better! */ 4425 printf("%s: got error %d while accessing filesystem\n", func, error); 4426} 4427