ffs_softdep.c: revision 76354 (deleted lines) vs. revision 76357 (added lines)
1/*
2 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * The soft updates code is derived from the appendix of a University
5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6 * "Soft Updates: A Solution to the Metadata Update Problem in File
7 * Systems", CSE-TR-254-95, August 1995).
8 *
9 * Further information about soft updates can be obtained from:
10 *
11 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
12 * 1614 Oxford Street mckusick@mckusick.com
13 * Berkeley, CA 94709-1608 +1-510-843-9542
14 * USA
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 *
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 *
26 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
27 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
30 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
39 * $FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 76354 2001-05-08 07:13:00Z mckusick $
39 * $FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 76357 2001-05-08 07:42:20Z mckusick $
40 */
41
42/*
43 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
44 */
45#ifndef DIAGNOSTIC
46#define DIAGNOSTIC
47#endif
48#ifndef DEBUG
49#define DEBUG
50#endif
51
52#include <sys/param.h>
53#include <sys/kernel.h>
54#include <sys/systm.h>
55#include <sys/bio.h>
56#include <sys/buf.h>
57#include <sys/malloc.h>
58#include <sys/mount.h>
59#include <sys/proc.h>
60#include <sys/stat.h>
60#include <sys/syslog.h>
61#include <sys/vnode.h>
62#include <sys/conf.h>
63#include <ufs/ufs/dir.h>
64#include <ufs/ufs/extattr.h>
65#include <ufs/ufs/quota.h>
66#include <ufs/ufs/inode.h>
67#include <ufs/ufs/ufsmount.h>
68#include <ufs/ffs/fs.h>
69#include <ufs/ffs/softdep.h>
70#include <ufs/ffs/ffs_extern.h>
71#include <ufs/ufs/ufs_extern.h>
72
73/*
74 * These definitions need to be adapted to the system to which
75 * this file is being ported.
76 */
77/*
78 * malloc types defined for the softdep system.
79 */
80static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
81static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
82static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
83static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
84static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
85static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
86static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
87static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
88static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
89static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
90static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
91static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
92static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
93
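/*
 * All softdep allocations use M_WAITOK (they may sleep) and M_USE_RESERVE
 * (they may dip into the reserve pool), so that dependency bookkeeping can
 * proceed even under memory pressure.
 */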
94#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
95
96#define D_PAGEDEP 0
97#define D_INODEDEP 1
98#define D_NEWBLK 2
99#define D_BMSAFEMAP 3
100#define D_ALLOCDIRECT 4
101#define D_INDIRDEP 5
102#define D_ALLOCINDIR 6
103#define D_FREEFRAG 7
104#define D_FREEBLKS 8
105#define D_FREEFILE 9
106#define D_DIRADD 10
107#define D_MKDIR 11
108#define D_DIRREM 12
109#define D_LAST D_DIRREM
110
111/*
112 * translate from workitem type to memory type
113 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
114 */
115static struct malloc_type *memtype[] = {
116 M_PAGEDEP,
117 M_INODEDEP,
118 M_NEWBLK,
119 M_BMSAFEMAP,
120 M_ALLOCDIRECT,
121 M_INDIRDEP,
122 M_ALLOCINDIR,
123 M_FREEFRAG,
124 M_FREEBLKS,
125 M_FREEFILE,
126 M_DIRADD,
127 M_MKDIR,
128 M_DIRREM
129};
130
131#define DtoM(type) (memtype[type])
132
133/*
134 * Names of malloc types.
135 */
136#define TYPENAME(type) \
137 ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
138/*
 139 * End system adaptation definitions.
140 */
141
142/*
143 * Internal function prototypes.
144 */
145static void softdep_error __P((char *, int));
146static void drain_output __P((struct vnode *, int));
147static int getdirtybuf __P((struct buf **, int));
148static void clear_remove __P((struct proc *));
149static void clear_inodedeps __P((struct proc *));
150static int flush_pagedep_deps __P((struct vnode *, struct mount *,
151 struct diraddhd *));
152static int flush_inodedep_deps __P((struct fs *, ino_t));
153static int handle_written_filepage __P((struct pagedep *, struct buf *));
154static void diradd_inode_written __P((struct diradd *, struct inodedep *));
155static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
156static void handle_allocdirect_partdone __P((struct allocdirect *));
157static void handle_allocindir_partdone __P((struct allocindir *));
158static void initiate_write_filepage __P((struct pagedep *, struct buf *));
159static void handle_written_mkdir __P((struct mkdir *, int));
160static void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
161static void handle_workitem_freefile __P((struct freefile *));
162static void handle_workitem_remove __P((struct dirrem *));
163static struct dirrem *newdirrem __P((struct buf *, struct inode *,
164 struct inode *, int, struct dirrem **));
165static void free_diradd __P((struct diradd *));
166static void free_allocindir __P((struct allocindir *, struct inodedep *));
167static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
168 long *));
169static void deallocate_dependencies __P((struct buf *, struct inodedep *));
170static void free_allocdirect __P((struct allocdirectlst *,
171 struct allocdirect *, int));
172static int check_inode_unwritten __P((struct inodedep *));
173static int free_inodedep __P((struct inodedep *));
174static void handle_workitem_freeblocks __P((struct freeblks *, int));
175static void merge_inode_lists __P((struct inodedep *));
176static void setup_allocindir_phase2 __P((struct buf *, struct inode *,
177 struct allocindir *));
178static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
179 ufs_daddr_t));
180static void handle_workitem_freefrag __P((struct freefrag *));
181static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
182static void allocdirect_merge __P((struct allocdirectlst *,
183 struct allocdirect *, struct allocdirect *));
184static struct bmsafemap *bmsafemap_lookup __P((struct buf *));
185static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
186 struct newblk **));
187static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
188static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
189 struct pagedep **));
190static void pause_timer __P((void *));
191static int request_cleanup __P((int, int));
192static int process_worklist_item __P((struct mount *, int));
193static void add_to_worklist __P((struct worklist *));
194
195/*
196 * Exported softdep operations.
197 */
198static void softdep_disk_io_initiation __P((struct buf *));
199static void softdep_disk_write_complete __P((struct buf *));
200static void softdep_deallocate_dependencies __P((struct buf *));
201static void softdep_move_dependencies __P((struct buf *, struct buf *));
202static int softdep_count_dependencies __P((struct buf *bp, int));
203
204struct bio_ops bioops = {
205 softdep_disk_io_initiation, /* io_start */
206 softdep_disk_write_complete, /* io_complete */
207 softdep_deallocate_dependencies, /* io_deallocate */
208 softdep_move_dependencies, /* io_movedeps */
209 softdep_count_dependencies, /* io_countdeps */
210};
211
212/*
213 * Locking primitives.
214 *
215 * For a uniprocessor, all we need to do is protect against disk
216 * interrupts. For a multiprocessor, this lock would have to be
217 * a mutex. A single mutex is used throughout this file, though
218 * finer grain locking could be used if contention warranted it.
219 *
220 * For a multiprocessor, the sleep call would accept a lock and
221 * release it after the sleep processing was complete. In a uniprocessor
 222 * implementation there is no such interlock, so we simply mark
223 * the places where it needs to be done with the `interlocked' form
224 * of the lock calls. Since the uniprocessor sleep already interlocks
225 * the spl, there is nothing that really needs to be done.
226 */
227#ifndef /* NOT */ DEBUG
228static struct lockit {
229 int lkt_spl;
230} lk = { 0 };
231#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
232#define FREE_LOCK(lk) splx((lk)->lkt_spl)
233#define ACQUIRE_LOCK_INTERLOCKED(lk)
234#define FREE_LOCK_INTERLOCKED(lk)
235
236#else /* DEBUG */
237static struct lockit {
238 int lkt_spl;
239 pid_t lkt_held;
240} lk = { 0, -1 };
241static int lockcnt;
242
243static void acquire_lock __P((struct lockit *));
244static void free_lock __P((struct lockit *));
245static void acquire_lock_interlocked __P((struct lockit *));
246static void free_lock_interlocked __P((struct lockit *));
247
248#define ACQUIRE_LOCK(lk) acquire_lock(lk)
249#define FREE_LOCK(lk) free_lock(lk)
250#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk)
251#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk)
252
253static void
254acquire_lock(lk)
255 struct lockit *lk;
256{
257 pid_t holder;
258
259 if (lk->lkt_held != -1) {
260 holder = lk->lkt_held;
261 FREE_LOCK(lk);
262 if (holder == CURPROC->p_pid)
263 panic("softdep_lock: locking against myself");
264 else
265 panic("softdep_lock: lock held by %d", holder);
266 }
267 lk->lkt_spl = splbio();
268 lk->lkt_held = CURPROC->p_pid;
269 lockcnt++;
270}
271
272static void
273free_lock(lk)
274 struct lockit *lk;
275{
276
277 if (lk->lkt_held == -1)
278 panic("softdep_unlock: lock not held");
279 lk->lkt_held = -1;
280 splx(lk->lkt_spl);
281}
282
283static void
284acquire_lock_interlocked(lk)
285 struct lockit *lk;
286{
287 pid_t holder;
288
289 if (lk->lkt_held != -1) {
290 holder = lk->lkt_held;
291 FREE_LOCK(lk);
292 if (holder == CURPROC->p_pid)
293 panic("softdep_lock_interlocked: locking against self");
294 else
295 panic("softdep_lock_interlocked: lock held by %d",
296 holder);
297 }
298 lk->lkt_held = CURPROC->p_pid;
299 lockcnt++;
300}
301
302static void
303free_lock_interlocked(lk)
304 struct lockit *lk;
305{
306
307 if (lk->lkt_held == -1)
308 panic("softdep_unlock_interlocked: lock not held");
309 lk->lkt_held = -1;
310}
311#endif /* DEBUG */
312
313/*
314 * Place holder for real semaphores.
315 */
316struct sema {
317 int value;
318 pid_t holder;
319 char *name;
320 int prio;
321 int timo;
322};
323static void sema_init __P((struct sema *, char *, int, int));
324static int sema_get __P((struct sema *, struct lockit *));
325static void sema_release __P((struct sema *));
326
327static void
328sema_init(semap, name, prio, timo)
329 struct sema *semap;
330 char *name;
331 int prio, timo;
332{
333
334 semap->holder = -1;
335 semap->value = 0;
336 semap->name = name;
337 semap->prio = prio;
338 semap->timo = timo;
339}
340
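/*
 * Acquire the semaphore. Returns 1 if the caller now holds it, or 0 if it
 * had to sleep, in which case the caller is expected to retry its lookup.
 * The value field counts contenders; sema_release wakes any sleepers.
 */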
341static int
342sema_get(semap, interlock)
343 struct sema *semap;
344 struct lockit *interlock;
345{
346
347 if (semap->value++ > 0) {
348 if (interlock != NULL)
349 FREE_LOCK_INTERLOCKED(interlock);
350 tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
351 if (interlock != NULL) {
352 ACQUIRE_LOCK_INTERLOCKED(interlock);
353 FREE_LOCK(interlock);
354 }
355 return (0);
356 }
357 semap->holder = CURPROC->p_pid;
358 if (interlock != NULL)
359 FREE_LOCK(interlock);
360 return (1);
361}
362
363static void
364sema_release(semap)
365 struct sema *semap;
366{
367
368 if (semap->value <= 0 || semap->holder != CURPROC->p_pid) {
369 if (lk.lkt_held != -1)
370 FREE_LOCK(&lk);
371 panic("sema_release: not held");
372 }
373 if (--semap->value > 0) {
374 semap->value = 0;
375 wakeup(semap);
376 }
377 semap->holder = -1;
378}
379
380/*
381 * Worklist queue management.
382 * These routines require that the lock be held.
383 */
384#ifndef /* NOT */ DEBUG
385#define WORKLIST_INSERT(head, item) do { \
386 (item)->wk_state |= ONWORKLIST; \
387 LIST_INSERT_HEAD(head, item, wk_list); \
388} while (0)
389#define WORKLIST_REMOVE(item) do { \
390 (item)->wk_state &= ~ONWORKLIST; \
391 LIST_REMOVE(item, wk_list); \
392} while (0)
393#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
394
395#else /* DEBUG */
396static void worklist_insert __P((struct workhead *, struct worklist *));
397static void worklist_remove __P((struct worklist *));
398static void workitem_free __P((struct worklist *, int));
399
400#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
401#define WORKLIST_REMOVE(item) worklist_remove(item)
402#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
403
404static void
405worklist_insert(head, item)
406 struct workhead *head;
407 struct worklist *item;
408{
409
410 if (lk.lkt_held == -1)
411 panic("worklist_insert: lock not held");
412 if (item->wk_state & ONWORKLIST) {
413 FREE_LOCK(&lk);
414 panic("worklist_insert: already on list");
415 }
416 item->wk_state |= ONWORKLIST;
417 LIST_INSERT_HEAD(head, item, wk_list);
418}
419
420static void
421worklist_remove(item)
422 struct worklist *item;
423{
424
425 if (lk.lkt_held == -1)
426 panic("worklist_remove: lock not held");
427 if ((item->wk_state & ONWORKLIST) == 0) {
428 FREE_LOCK(&lk);
429 panic("worklist_remove: not on list");
430 }
431 item->wk_state &= ~ONWORKLIST;
432 LIST_REMOVE(item, wk_list);
433}
434
435static void
436workitem_free(item, type)
437 struct worklist *item;
438 int type;
439{
440
441 if (item->wk_state & ONWORKLIST) {
442 if (lk.lkt_held != -1)
443 FREE_LOCK(&lk);
444 panic("workitem_free: still on list");
445 }
446 if (item->wk_type != type) {
447 if (lk.lkt_held != -1)
448 FREE_LOCK(&lk);
449 panic("workitem_free: type mismatch");
450 }
451 FREE(item, DtoM(type));
452}
453#endif /* DEBUG */
454
455/*
456 * Workitem queue management
457 */
458static struct workhead softdep_workitem_pending;
459static int num_on_worklist; /* number of worklist items to be processed */
 460static int softdep_worklist_busy; /* > 0 => syncer at work, -1 => being flushed */
461static int softdep_worklist_req; /* serialized waiters */
462static int max_softdeps; /* maximum number of structs before slowdown */
463static int tickdelay = 2; /* number of ticks to pause during slowdown */
464static int proc_waiting; /* tracks whether we have a timeout posted */
465static int *stat_countp; /* statistic to count in proc_waiting timeout */
466static struct callout_handle handle; /* handle on posted proc_waiting timeout */
467static struct proc *filesys_syncer; /* proc of filesystem syncer process */
468static int req_clear_inodedeps; /* syncer process flush some inodedeps */
469#define FLUSH_INODES 1
470static int req_clear_remove; /* syncer process flush some freeblks */
471#define FLUSH_REMOVE 2
472/*
473 * runtime statistics
474 */
475static int stat_worklist_push; /* number of worklist cleanups */
476static int stat_blk_limit_push; /* number of times block limit neared */
477static int stat_ino_limit_push; /* number of times inode limit neared */
478static int stat_blk_limit_hit; /* number of times block slowdown imposed */
479static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
480static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
481static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
482static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
483static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
484static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
485#ifdef DEBUG
486#include <vm/vm.h>
487#include <sys/sysctl.h>
488SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
489SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
490SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
491SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
492SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
493SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
494SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
495SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
496SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
497SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
498SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
499SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
500#endif /* DEBUG */
501
502/*
503 * Add an item to the end of the work queue.
504 * This routine requires that the lock be held.
505 * This is the only routine that adds items to the list.
506 * The following routine is the only one that removes items
507 * and does so in order from first to last.
508 */
509static void
510add_to_worklist(wk)
511 struct worklist *wk;
512{
513 static struct worklist *worklist_tail;
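	/*
	 * worklist_tail caches the most recently appended item so that new
	 * work can be appended without walking the list.
	 */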
514
515 if (wk->wk_state & ONWORKLIST) {
516 if (lk.lkt_held != -1)
517 FREE_LOCK(&lk);
518 panic("add_to_worklist: already on list");
519 }
520 wk->wk_state |= ONWORKLIST;
521 if (LIST_FIRST(&softdep_workitem_pending) == NULL)
522 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
523 else
524 LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
525 worklist_tail = wk;
526 num_on_worklist += 1;
527}
528
529/*
530 * Process that runs once per second to handle items in the background queue.
531 *
 532 * Note that we ensure that everything is done in the order in which it
 533 * appears in the queue. The code below depends on this property to ensure
534 * that blocks of a file are freed before the inode itself is freed. This
535 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
536 * until all the old ones have been purged from the dependency lists.
537 */
538int
539softdep_process_worklist(matchmnt)
540 struct mount *matchmnt;
541{
542 struct proc *p = CURPROC;
543 int matchcnt, loopcount;
544 long starttime;
545
546 /*
547 * Record the process identifier of our caller so that we can give
548 * this process preferential treatment in request_cleanup below.
549 */
550 filesys_syncer = p;
551 matchcnt = 0;
552
553 /*
554 * There is no danger of having multiple processes run this
555 * code, but we have to single-thread it when softdep_flushfiles()
556 * is in operation to get an accurate count of the number of items
557 * related to its mount point that are in the list.
558 */
559 if (matchmnt == NULL) {
560 if (softdep_worklist_busy < 0)
561 return(-1);
562 softdep_worklist_busy += 1;
563 }
564
565 /*
566 * If requested, try removing inode or removal dependencies.
567 */
568 if (req_clear_inodedeps) {
569 clear_inodedeps(p);
570 req_clear_inodedeps -= 1;
571 wakeup_one(&proc_waiting);
572 }
573 if (req_clear_remove) {
574 clear_remove(p);
575 req_clear_remove -= 1;
576 wakeup_one(&proc_waiting);
577 }
578 loopcount = 1;
579 starttime = time_second;
580 while (num_on_worklist > 0) {
581 matchcnt += process_worklist_item(matchmnt, 0);
582
583 /*
584 * If a umount operation wants to run the worklist
585 * accurately, abort.
586 */
587 if (softdep_worklist_req && matchmnt == NULL) {
588 matchcnt = -1;
589 break;
590 }
591
592 /*
593 * If requested, try removing inode or removal dependencies.
594 */
595 if (req_clear_inodedeps) {
596 clear_inodedeps(p);
597 req_clear_inodedeps -= 1;
598 wakeup_one(&proc_waiting);
599 }
600 if (req_clear_remove) {
601 clear_remove(p);
602 req_clear_remove -= 1;
603 wakeup_one(&proc_waiting);
604 }
605 /*
606 * We do not generally want to stop for buffer space, but if
607 * we are really being a buffer hog, we will stop and wait.
608 */
609 if (loopcount++ % 128 == 0)
610 bwillwrite();
611 /*
612 * Never allow processing to run for more than one
613 * second. Otherwise the other syncer tasks may get
614 * excessively backlogged.
615 */
616 if (starttime != time_second && matchmnt == NULL) {
617 matchcnt = -1;
618 break;
619 }
620 }
621 if (matchmnt == NULL) {
622 softdep_worklist_busy -= 1;
623 if (softdep_worklist_req && softdep_worklist_busy == 0)
624 wakeup(&softdep_worklist_req);
625 }
626 return (matchcnt);
627}
628
629/*
630 * Process one item on the worklist.
631 */
632static int
633process_worklist_item(matchmnt, flags)
634 struct mount *matchmnt;
635 int flags;
636{
637 struct worklist *wk;
638 struct dirrem *dirrem;
639 struct mount *mp;
640 struct vnode *vp;
641 int matchcnt = 0;
642
643 ACQUIRE_LOCK(&lk);
644 /*
645 * Normally we just process each item on the worklist in order.
646 * However, if we are in a situation where we cannot lock any
647 * inodes, we have to skip over any dirrem requests whose
648 * vnodes are resident and locked.
649 */
650 LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
651 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
652 break;
653 dirrem = WK_DIRREM(wk);
654 vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
655 dirrem->dm_oldinum);
656 if (vp == NULL || !VOP_ISLOCKED(vp, CURPROC))
657 break;
658 }
659 if (wk == 0) {
660 FREE_LOCK(&lk);
661 return (0);
662 }
663 WORKLIST_REMOVE(wk);
664 num_on_worklist -= 1;
665 FREE_LOCK(&lk);
666 switch (wk->wk_type) {
667
668 case D_DIRREM:
669 /* removal of a directory entry */
670 mp = WK_DIRREM(wk)->dm_mnt;
671 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
672 panic("%s: dirrem on suspended filesystem",
673 "process_worklist_item");
674 if (mp == matchmnt)
675 matchcnt += 1;
676 handle_workitem_remove(WK_DIRREM(wk));
677 break;
678
679 case D_FREEBLKS:
680 /* releasing blocks and/or fragments from a file */
681 mp = WK_FREEBLKS(wk)->fb_mnt;
682 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
683 panic("%s: freeblks on suspended filesystem",
684 "process_worklist_item");
685 if (mp == matchmnt)
686 matchcnt += 1;
687 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
688 break;
689
690 case D_FREEFRAG:
691 /* releasing a fragment when replaced as a file grows */
692 mp = WK_FREEFRAG(wk)->ff_mnt;
693 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
694 panic("%s: freefrag on suspended filesystem",
695 "process_worklist_item");
696 if (mp == matchmnt)
697 matchcnt += 1;
698 handle_workitem_freefrag(WK_FREEFRAG(wk));
699 break;
700
701 case D_FREEFILE:
702 /* releasing an inode when its link count drops to 0 */
703 mp = WK_FREEFILE(wk)->fx_mnt;
704 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
705 panic("%s: freefile on suspended filesystem",
706 "process_worklist_item");
707 if (mp == matchmnt)
708 matchcnt += 1;
709 handle_workitem_freefile(WK_FREEFILE(wk));
710 break;
711
712 default:
713 panic("%s_process_worklist: Unknown type %s",
714 "softdep", TYPENAME(wk->wk_type));
715 /* NOTREACHED */
716 }
717 return (matchcnt);
718}
719
720/*
721 * Move dependencies from one buffer to another.
722 */
723static void
724softdep_move_dependencies(oldbp, newbp)
725 struct buf *oldbp;
726 struct buf *newbp;
727{
728 struct worklist *wk, *wktail;
729
730 if (LIST_FIRST(&newbp->b_dep) != NULL)
731 panic("softdep_move_dependencies: need merge code");
732 wktail = 0;
733 ACQUIRE_LOCK(&lk);
734 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
735 LIST_REMOVE(wk, wk_list);
736 if (wktail == 0)
737 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
738 else
739 LIST_INSERT_AFTER(wktail, wk, wk_list);
740 wktail = wk;
741 }
742 FREE_LOCK(&lk);
743}
744
745/*
746 * Purge the work list of all items associated with a particular mount point.
747 */
748int
749softdep_flushworklist(oldmnt, countp, p)
750 struct mount *oldmnt;
751 int *countp;
752 struct proc *p;
753{
754 struct vnode *devvp;
755 int count, error = 0;
756
757 /*
758 * Await our turn to clear out the queue, then serialize access.
759 */
760 while (softdep_worklist_busy) {
761 softdep_worklist_req += 1;
762 tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
763 softdep_worklist_req -= 1;
764 }
765 softdep_worklist_busy = -1;
766 /*
767 * Alternately flush the block device associated with the mount
768 * point and process any dependencies that the flushing
769 * creates. We continue until no more worklist dependencies
770 * are found.
771 */
772 *countp = 0;
773 devvp = VFSTOUFS(oldmnt)->um_devvp;
774 while ((count = softdep_process_worklist(oldmnt)) > 0) {
775 *countp += count;
776 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
777 error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
778 VOP_UNLOCK(devvp, 0, p);
779 if (error)
780 break;
781 }
782 softdep_worklist_busy = 0;
783 if (softdep_worklist_req)
784 wakeup(&softdep_worklist_req);
785 return (error);
786}
787
788/*
789 * Flush all vnodes and worklist items associated with a specified mount point.
790 */
791int
792softdep_flushfiles(oldmnt, flags, p)
793 struct mount *oldmnt;
794 int flags;
795 struct proc *p;
796{
797 int error, count, loopcnt;
798
799 /*
800 * Alternately flush the vnodes associated with the mount
801 * point and process any dependencies that the flushing
802 * creates. In theory, this loop can happen at most twice,
803 * but we give it a few extra just to be sure.
804 */
805 for (loopcnt = 10; loopcnt > 0; loopcnt--) {
806 /*
807 * Do another flush in case any vnodes were brought in
808 * as part of the cleanup operations.
809 */
810 if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
811 break;
812 if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
813 count == 0)
814 break;
815 }
816 /*
817 * If we are unmounting then it is an error to fail. If we
818 * are simply trying to downgrade to read-only, then filesystem
819 * activity can keep us busy forever, so we just fail with EBUSY.
820 */
821 if (loopcnt == 0) {
822 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
823 panic("softdep_flushfiles: looping");
824 error = EBUSY;
825 }
826 return (error);
827}
828
829/*
830 * Structure hashing.
831 *
832 * There are three types of structures that can be looked up:
833 * 1) pagedep structures identified by mount point, inode number,
834 * and logical block.
835 * 2) inodedep structures identified by mount point and inode number.
836 * 3) newblk structures identified by mount point and
837 * physical block number.
838 *
839 * The "pagedep" and "inodedep" dependency structures are hashed
840 * separately from the file blocks and inodes to which they correspond.
841 * This separation helps when the in-memory copy of an inode or
842 * file block must be replaced. It also obviates the need to access
843 * an inode or file page when simply updating (or de-allocating)
844 * dependency structures. Lookup of newblk structures is needed to
845 * find newly allocated blocks when trying to associate them with
846 * their allocdirect or allocindir structure.
847 *
848 * The lookup routines optionally create and hash a new instance when
849 * an existing entry is not found.
850 */
851#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
852#define NODELAY 0x0002 /* cannot do background work */
853
854/*
855 * Structures and routines associated with pagedep caching.
856 */
857LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
858u_long pagedep_hash; /* size of hash table - 1 */
859#define PAGEDEP_HASH(mp, inum, lbn) \
860 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
861 pagedep_hash])
862static struct sema pagedep_in_progress;
863
864/*
865 * Look up a pagedep. Return 1 if found, 0 if not found.
866 * If not found, allocate if DEPALLOC flag is passed.
867 * Found or allocated entry is returned in pagedeppp.
868 * This routine must be called with splbio interrupts blocked.
869 */
870static int
871pagedep_lookup(ip, lbn, flags, pagedeppp)
872 struct inode *ip;
873 ufs_lbn_t lbn;
874 int flags;
875 struct pagedep **pagedeppp;
876{
877 struct pagedep *pagedep;
878 struct pagedep_hashhead *pagedephd;
879 struct mount *mp;
880 int i;
881
882#ifdef DEBUG
883 if (lk.lkt_held == -1)
884 panic("pagedep_lookup: lock not held");
885#endif
886 mp = ITOV(ip)->v_mount;
887 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
888top:
889 LIST_FOREACH(pagedep, pagedephd, pd_hash)
890 if (ip->i_number == pagedep->pd_ino &&
891 lbn == pagedep->pd_lbn &&
892 mp == pagedep->pd_mnt)
893 break;
894 if (pagedep) {
895 *pagedeppp = pagedep;
896 return (1);
897 }
898 if ((flags & DEPALLOC) == 0) {
899 *pagedeppp = NULL;
900 return (0);
901 }
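	/*
	 * Serialize creation of a new pagedep. If sema_get() returns 0 we
	 * slept, and another process may have entered the pagedep in the
	 * meantime, so retry the lookup from the top.
	 */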
902 if (sema_get(&pagedep_in_progress, &lk) == 0) {
903 ACQUIRE_LOCK(&lk);
904 goto top;
905 }
906 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
907 M_SOFTDEP_FLAGS|M_ZERO);
908 pagedep->pd_list.wk_type = D_PAGEDEP;
909 pagedep->pd_mnt = mp;
910 pagedep->pd_ino = ip->i_number;
911 pagedep->pd_lbn = lbn;
912 LIST_INIT(&pagedep->pd_dirremhd);
913 LIST_INIT(&pagedep->pd_pendinghd);
914 for (i = 0; i < DAHASHSZ; i++)
915 LIST_INIT(&pagedep->pd_diraddhd[i]);
916 ACQUIRE_LOCK(&lk);
917 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
918 sema_release(&pagedep_in_progress);
919 *pagedeppp = pagedep;
920 return (0);
921}
922
923/*
924 * Structures and routines associated with inodedep caching.
925 */
926LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
927static u_long inodedep_hash; /* size of hash table - 1 */
928static long num_inodedep; /* number of inodedep allocated */
929#define INODEDEP_HASH(fs, inum) \
930 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
931static struct sema inodedep_in_progress;
932
933/*
 934 * Look up an inodedep. Return 1 if found, 0 if not found.
935 * If not found, allocate if DEPALLOC flag is passed.
936 * Found or allocated entry is returned in inodedeppp.
937 * This routine must be called with splbio interrupts blocked.
938 */
939static int
940inodedep_lookup(fs, inum, flags, inodedeppp)
941 struct fs *fs;
942 ino_t inum;
943 int flags;
944 struct inodedep **inodedeppp;
945{
946 struct inodedep *inodedep;
947 struct inodedep_hashhead *inodedephd;
948 int firsttry;
949
950#ifdef DEBUG
951 if (lk.lkt_held == -1)
952 panic("inodedep_lookup: lock not held");
953#endif
954 firsttry = 1;
955 inodedephd = INODEDEP_HASH(fs, inum);
956top:
957 LIST_FOREACH(inodedep, inodedephd, id_hash)
958 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
959 break;
960 if (inodedep) {
961 *inodedeppp = inodedep;
962 return (1);
963 }
964 if ((flags & DEPALLOC) == 0) {
965 *inodedeppp = NULL;
966 return (0);
967 }
968 /*
969 * If we are over our limit, try to improve the situation.
970 */
971 if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
972 request_cleanup(FLUSH_INODES, 1)) {
973 firsttry = 0;
974 goto top;
975 }
976 if (sema_get(&inodedep_in_progress, &lk) == 0) {
977 ACQUIRE_LOCK(&lk);
978 goto top;
979 }
980 num_inodedep += 1;
981 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
982 M_INODEDEP, M_SOFTDEP_FLAGS);
983 inodedep->id_list.wk_type = D_INODEDEP;
984 inodedep->id_fs = fs;
985 inodedep->id_ino = inum;
986 inodedep->id_state = ALLCOMPLETE;
987 inodedep->id_nlinkdelta = 0;
988 inodedep->id_savedino = NULL;
989 inodedep->id_savedsize = -1;
990 inodedep->id_buf = NULL;
991 LIST_INIT(&inodedep->id_pendinghd);
992 LIST_INIT(&inodedep->id_inowait);
993 LIST_INIT(&inodedep->id_bufwait);
994 TAILQ_INIT(&inodedep->id_inoupdt);
995 TAILQ_INIT(&inodedep->id_newinoupdt);
996 ACQUIRE_LOCK(&lk);
997 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
998 sema_release(&inodedep_in_progress);
999 *inodedeppp = inodedep;
1000 return (0);
1001}
1002
1003/*
1004 * Structures and routines associated with newblk caching.
1005 */
1006LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1007u_long newblk_hash; /* size of hash table - 1 */
1008#define NEWBLK_HASH(fs, inum) \
1009 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1010static struct sema newblk_in_progress;
1011
1012/*
1013 * Look up a newblk. Return 1 if found, 0 if not found.
1014 * If not found, allocate if DEPALLOC flag is passed.
1015 * Found or allocated entry is returned in newblkpp.
1016 */
1017static int
1018newblk_lookup(fs, newblkno, flags, newblkpp)
1019 struct fs *fs;
1020 ufs_daddr_t newblkno;
1021 int flags;
1022 struct newblk **newblkpp;
1023{
1024 struct newblk *newblk;
1025 struct newblk_hashhead *newblkhd;
1026
1027 newblkhd = NEWBLK_HASH(fs, newblkno);
1028top:
1029 LIST_FOREACH(newblk, newblkhd, nb_hash)
1030 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1031 break;
1032 if (newblk) {
1033 *newblkpp = newblk;
1034 return (1);
1035 }
1036 if ((flags & DEPALLOC) == 0) {
1037 *newblkpp = NULL;
1038 return (0);
1039 }
1040 if (sema_get(&newblk_in_progress, 0) == 0)
1041 goto top;
1042 MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1043 M_NEWBLK, M_SOFTDEP_FLAGS);
1044 newblk->nb_state = 0;
1045 newblk->nb_fs = fs;
1046 newblk->nb_newblkno = newblkno;
1047 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1048 sema_release(&newblk_in_progress);
1049 *newblkpp = newblk;
1050 return (0);
1051}
1052
1053/*
 1054 * Executed during filesystem initialization before
1055 * mounting any file systems.
1056 */
1057void
1058softdep_initialize()
1059{
1060
1061 LIST_INIT(&mkdirlisthd);
1062 LIST_INIT(&softdep_workitem_pending);
1063 max_softdeps = min(desiredvnodes * 8,
1064 M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
1065 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1066 &pagedep_hash);
1067 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1068 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1069 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1070 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1071 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1072}
1073
1074/*
1075 * Called at mount time to notify the dependency code that a
1076 * filesystem wishes to use it.
1077 */
1078int
1079softdep_mount(devvp, mp, fs, cred)
1080 struct vnode *devvp;
1081 struct mount *mp;
1082 struct fs *fs;
1083 struct ucred *cred;
1084{
1085 struct csum cstotal;
1086 struct cg *cgp;
1087 struct buf *bp;
1088 int error, cyl;
1089
1090 mp->mnt_flag &= ~MNT_ASYNC;
1091 mp->mnt_flag |= MNT_SOFTDEP;
1092 /*
1093 * When doing soft updates, the counters in the
1094 * superblock may have gotten out of sync, so we have
1095 * to scan the cylinder groups and recalculate them.
1096 */
1097 if (fs->fs_clean != 0)
1098 return (0);
1099 bzero(&cstotal, sizeof cstotal);
1100 for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1101 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1102 fs->fs_cgsize, cred, &bp)) != 0) {
1103 brelse(bp);
1104 return (error);
1105 }
1106 cgp = (struct cg *)bp->b_data;
1107 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1108 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1109 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1110 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1111 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1112 brelse(bp);
1113 }
1114#ifdef DEBUG
1115 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1116 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1117#endif
1118 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1119 return (0);
1120}
1121
1122/*
1123 * Protecting the freemaps (or bitmaps).
1124 *
1125 * To eliminate the need to execute fsck before mounting a file system
1126 * after a power failure, one must (conservatively) guarantee that the
1127 * on-disk copy of the bitmaps never indicate that a live inode or block is
1128 * free. So, when a block or inode is allocated, the bitmap should be
1129 * updated (on disk) before any new pointers. When a block or inode is
1130 * freed, the bitmap should not be updated until all pointers have been
1131 * reset. The latter dependency is handled by the delayed de-allocation
1132 * approach described below for block and inode de-allocation. The former
1133 * dependency is handled by calling the following procedure when a block or
1134 * inode is allocated. When an inode is allocated an "inodedep" is created
1135 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1136 * Each "inodedep" is also inserted into the hash indexing structure so
1137 * that any additional link additions can be made dependent on the inode
1138 * allocation.
1139 *
1140 * The ufs file system maintains a number of free block counts (e.g., per
1141 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1142 * in addition to the bitmaps. These counts are used to improve efficiency
1143 * during allocation and therefore must be consistent with the bitmaps.
1144 * There is no convenient way to guarantee post-crash consistency of these
1145 * counts with simple update ordering, for two main reasons: (1) The counts
1146 * and bitmaps for a single cylinder group block are not in the same disk
1147 * sector. If a disk write is interrupted (e.g., by power failure), one may
1148 * be written and the other not. (2) Some of the counts are located in the
1149 * superblock rather than the cylinder group block. So, we focus our soft
1150 * updates implementation on protecting the bitmaps. When mounting a
1151 * filesystem, we recompute the auxiliary counts from the bitmaps.
1152 */
1153
1154/*
1155 * Called just after updating the cylinder group block to allocate an inode.
1156 */
1157void
1158softdep_setup_inomapdep(bp, ip, newinum)
1159 struct buf *bp; /* buffer for cylgroup block with inode map */
1160 struct inode *ip; /* inode related to allocation */
1161 ino_t newinum; /* new inode number being allocated */
1162{
1163 struct inodedep *inodedep;
1164 struct bmsafemap *bmsafemap;
1165
1166 /*
1167 * Create a dependency for the newly allocated inode.
1168 * Panic if it already exists as something is seriously wrong.
1169 * Otherwise add it to the dependency list for the buffer holding
1170 * the cylinder group map from which it was allocated.
1171 */
1172 ACQUIRE_LOCK(&lk);
1173 if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
1174 FREE_LOCK(&lk);
1175 panic("softdep_setup_inomapdep: found inode");
1176 }
1177 inodedep->id_buf = bp;
1178 inodedep->id_state &= ~DEPCOMPLETE;
1179 bmsafemap = bmsafemap_lookup(bp);
1180 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1181 FREE_LOCK(&lk);
1182}
1183
1184/*
1185 * Called just after updating the cylinder group block to
1186 * allocate block or fragment.
1187 */
1188void
1189softdep_setup_blkmapdep(bp, fs, newblkno)
1190 struct buf *bp; /* buffer for cylgroup block with block map */
1191 struct fs *fs; /* filesystem doing allocation */
1192 ufs_daddr_t newblkno; /* number of newly allocated block */
1193{
1194 struct newblk *newblk;
1195 struct bmsafemap *bmsafemap;
1196
1197 /*
1198 * Create a dependency for the newly allocated block.
1199 * Add it to the dependency list for the buffer holding
1200 * the cylinder group map from which it was allocated.
1201 */
1202 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1203 panic("softdep_setup_blkmapdep: found block");
1204 ACQUIRE_LOCK(&lk);
1205 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1206 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1207 FREE_LOCK(&lk);
1208}
1209
1210/*
1211 * Find the bmsafemap associated with a cylinder group buffer.
1212 * If none exists, create one. The buffer must be locked when
1213 * this routine is called and this routine must be called with
1214 * splbio interrupts blocked.
1215 */
1216static struct bmsafemap *
1217bmsafemap_lookup(bp)
1218 struct buf *bp;
1219{
1220 struct bmsafemap *bmsafemap;
1221 struct worklist *wk;
1222
1223#ifdef DEBUG
1224 if (lk.lkt_held == -1)
1225 panic("bmsafemap_lookup: lock not held");
1226#endif
1227 LIST_FOREACH(wk, &bp->b_dep, wk_list)
1228 if (wk->wk_type == D_BMSAFEMAP)
1229 return (WK_BMSAFEMAP(wk));
1230 FREE_LOCK(&lk);
1231 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1232 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1233 bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1234 bmsafemap->sm_list.wk_state = 0;
1235 bmsafemap->sm_buf = bp;
1236 LIST_INIT(&bmsafemap->sm_allocdirecthd);
1237 LIST_INIT(&bmsafemap->sm_allocindirhd);
1238 LIST_INIT(&bmsafemap->sm_inodedephd);
1239 LIST_INIT(&bmsafemap->sm_newblkhd);
1240 ACQUIRE_LOCK(&lk);
1241 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1242 return (bmsafemap);
1243}
1244
1245/*
1246 * Direct block allocation dependencies.
1247 *
1248 * When a new block is allocated, the corresponding disk locations must be
1249 * initialized (with zeros or new data) before the on-disk inode points to
1250 * them. Also, the freemap from which the block was allocated must be
1251 * updated (on disk) before the inode's pointer. These two dependencies are
1252 * independent of each other and are needed for all file blocks and indirect
1253 * blocks that are pointed to directly by the inode. Just before the
1254 * "in-core" version of the inode is updated with a newly allocated block
1255 * number, a procedure (below) is called to setup allocation dependency
1256 * structures. These structures are removed when the corresponding
1257 * dependencies are satisfied or when the block allocation becomes obsolete
1258 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1259 * fragment that gets upgraded). All of these cases are handled in
1260 * procedures described later.
1261 *
1262 * When a file extension causes a fragment to be upgraded, either to a larger
1263 * fragment or to a full block, the on-disk location may change (if the
1264 * previous fragment could not simply be extended). In this case, the old
1265 * fragment must be de-allocated, but not until after the inode's pointer has
1266 * been updated. In most cases, this is handled by later procedures, which
1267 * will construct a "freefrag" structure to be added to the workitem queue
1268 * when the inode update is complete (or obsolete). The main exception to
1269 * this is when an allocation occurs while a pending allocation dependency
1270 * (for the same block pointer) remains. This case is handled in the main
1271 * allocation dependency setup procedure by immediately freeing the
1272 * unreferenced fragments.
1273 */
1274void
1275softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1276 struct inode *ip; /* inode to which block is being added */
1277 ufs_lbn_t lbn; /* block pointer within inode */
1278 ufs_daddr_t newblkno; /* disk block number being added */
1279 ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */
1280 long newsize; /* size of new block */
 1281	long oldsize;		/* size of old block */
1282 struct buf *bp; /* bp for allocated block */
1283{
1284 struct allocdirect *adp, *oldadp;
1285 struct allocdirectlst *adphead;
1286 struct bmsafemap *bmsafemap;
1287 struct inodedep *inodedep;
1288 struct pagedep *pagedep;
1289 struct newblk *newblk;
1290
1291 MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1292 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1293 adp->ad_list.wk_type = D_ALLOCDIRECT;
1294 adp->ad_lbn = lbn;
1295 adp->ad_newblkno = newblkno;
1296 adp->ad_oldblkno = oldblkno;
1297 adp->ad_newsize = newsize;
1298 adp->ad_oldsize = oldsize;
1299 adp->ad_state = ATTACHED;
1300 if (newblkno == oldblkno)
1301 adp->ad_freefrag = NULL;
1302 else
1303 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1304
1305 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1306 panic("softdep_setup_allocdirect: lost block");
1307
1308 ACQUIRE_LOCK(&lk);
1309 inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1310 adp->ad_inodedep = inodedep;
1311
1312 if (newblk->nb_state == DEPCOMPLETE) {
1313 adp->ad_state |= DEPCOMPLETE;
1314 adp->ad_buf = NULL;
1315 } else {
1316 bmsafemap = newblk->nb_bmsafemap;
1317 adp->ad_buf = bmsafemap->sm_buf;
1318 LIST_REMOVE(newblk, nb_deps);
1319 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1320 }
1321 LIST_REMOVE(newblk, nb_hash);
1322 FREE(newblk, M_NEWBLK);
1323
1324 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1325 if (lbn >= NDADDR) {
1326 /* allocating an indirect block */
1327 if (oldblkno != 0) {
1328 FREE_LOCK(&lk);
1329 panic("softdep_setup_allocdirect: non-zero indir");
1330 }
1331 } else {
1332 /*
1333 * Allocating a direct block.
1334 *
1335 * If we are allocating a directory block, then we must
1336 * allocate an associated pagedep to track additions and
1337 * deletions.
1338 */
1339 if ((ip->i_mode & IFMT) == IFDIR &&
1340 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1341 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1342 }
1343 /*
1344 * The list of allocdirects must be kept in sorted and ascending
1345 * order so that the rollback routines can quickly determine the
1346 * first uncommitted block (the size of the file stored on disk
1347 * ends at the end of the lowest committed fragment, or if there
1348 * are no fragments, at the end of the highest committed block).
1349 * Since files generally grow, the typical case is that the new
1350 * block is to be added at the end of the list. We speed this
1351 * special case by checking against the last allocdirect in the
1352 * list before laboriously traversing the list looking for the
1353 * insertion point.
1354 */
1355 adphead = &inodedep->id_newinoupdt;
1356 oldadp = TAILQ_LAST(adphead, allocdirectlst);
1357 if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1358 /* insert at end of list */
1359 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1360 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1361 allocdirect_merge(adphead, adp, oldadp);
1362 FREE_LOCK(&lk);
1363 return;
1364 }
1365 TAILQ_FOREACH(oldadp, adphead, ad_next) {
1366 if (oldadp->ad_lbn >= lbn)
1367 break;
1368 }
1369 if (oldadp == NULL) {
1370 FREE_LOCK(&lk);
1371 panic("softdep_setup_allocdirect: lost entry");
1372 }
1373 /* insert in middle of list */
1374 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1375 if (oldadp->ad_lbn == lbn)
1376 allocdirect_merge(adphead, adp, oldadp);
1377 FREE_LOCK(&lk);
1378}
1379
1380/*
1381 * Replace an old allocdirect dependency with a newer one.
1382 * This routine must be called with splbio interrupts blocked.
1383 */
1384static void
1385allocdirect_merge(adphead, newadp, oldadp)
1386 struct allocdirectlst *adphead; /* head of list holding allocdirects */
1387 struct allocdirect *newadp; /* allocdirect being added */
1388 struct allocdirect *oldadp; /* existing allocdirect being checked */
1389{
1390 struct freefrag *freefrag;
1391
1392#ifdef DEBUG
1393 if (lk.lkt_held == -1)
1394 panic("allocdirect_merge: lock not held");
1395#endif
1396 if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1397 newadp->ad_oldsize != oldadp->ad_newsize ||
1398 newadp->ad_lbn >= NDADDR) {
1399 FREE_LOCK(&lk);
 1400		panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
1401 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1402 NDADDR);
1403 }
1404 newadp->ad_oldblkno = oldadp->ad_oldblkno;
1405 newadp->ad_oldsize = oldadp->ad_oldsize;
1406 /*
1407 * If the old dependency had a fragment to free or had never
1408 * previously had a block allocated, then the new dependency
1409 * can immediately post its freefrag and adopt the old freefrag.
1410 * This action is done by swapping the freefrag dependencies.
1411 * The new dependency gains the old one's freefrag, and the
1412 * old one gets the new one and then immediately puts it on
1413 * the worklist when it is freed by free_allocdirect. It is
1414 * not possible to do this swap when the old dependency had a
1415 * non-zero size but no previous fragment to free. This condition
1416 * arises when the new block is an extension of the old block.
1417 * Here, the first part of the fragment allocated to the new
1418 * dependency is part of the block currently claimed on disk by
1419 * the old dependency, so cannot legitimately be freed until the
1420 * conditions for the new dependency are fulfilled.
1421 */
1422 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1423 freefrag = newadp->ad_freefrag;
1424 newadp->ad_freefrag = oldadp->ad_freefrag;
1425 oldadp->ad_freefrag = freefrag;
1426 }
1427 free_allocdirect(adphead, oldadp, 0);
1428}
1429
1430/*
1431 * Allocate a new freefrag structure if needed.
1432 */
1433static struct freefrag *
1434newfreefrag(ip, blkno, size)
1435 struct inode *ip;
1436 ufs_daddr_t blkno;
1437 long size;
1438{
1439 struct freefrag *freefrag;
1440 struct fs *fs;
1441
1442 if (blkno == 0)
1443 return (NULL);
1444 fs = ip->i_fs;
1445 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1446 panic("newfreefrag: frag size");
1447 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1448 M_FREEFRAG, M_SOFTDEP_FLAGS);
1449 freefrag->ff_list.wk_type = D_FREEFRAG;
1450 freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
1451 freefrag->ff_inum = ip->i_number;
1452 freefrag->ff_mnt = ITOV(ip)->v_mount;
1453 freefrag->ff_devvp = ip->i_devvp;
1454 freefrag->ff_blkno = blkno;
1455 freefrag->ff_fragsize = size;
1456 return (freefrag);
1457}
1458
1459/*
1460 * This workitem de-allocates fragments that were replaced during
1461 * file block allocation.
1462 */
1463static void
1464handle_workitem_freefrag(freefrag)
1465 struct freefrag *freefrag;
1466{
1467 struct inode tip;
1468
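	/*
	 * Build a throwaway in-core inode with just enough state for
	 * ffs_blkfree() to locate and free the fragment.
	 */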
1469 tip.i_vnode = NULL;
1470 tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
1471 tip.i_devvp = freefrag->ff_devvp;
1472 tip.i_dev = freefrag->ff_devvp->v_rdev;
1473 tip.i_number = freefrag->ff_inum;
1474 tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
1475 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1476 FREE(freefrag, M_FREEFRAG);
1477}
1478
1479/*
1480 * Indirect block allocation dependencies.
1481 *
1482 * The same dependencies that exist for a direct block also exist when
1483 * a new block is allocated and pointed to by an entry in a block of
1484 * indirect pointers. The undo/redo states described above are also
1485 * used here. Because an indirect block contains many pointers that
1486 * may have dependencies, a second copy of the entire in-memory indirect
1487 * block is kept. The buffer cache copy is always completely up-to-date.
1488 * The second copy, which is used only as a source for disk writes,
1489 * contains only the safe pointers (i.e., those that have no remaining
1490 * update dependencies). The second copy is freed when all pointers
1491 * are safe. The cache is not allowed to replace indirect blocks with
1492 * pending update dependencies. If a buffer containing an indirect
1493 * block with dependencies is written, these routines will mark it
1494 * dirty again. It can only be successfully written once all the
 1495 * dependencies are removed. The ffs_fsync routine and
1496 * softdep_sync_metadata work together to get all the dependencies
1497 * removed so that a file can be successfully written to disk. Three
1498 * procedures are used when setting up indirect block pointer
1499 * dependencies. The division is necessary because of the organization
1500 * of the "balloc" routine and because of the distinction between file
1501 * pages and file metadata blocks.
1502 */
1503
1504/*
1505 * Allocate a new allocindir structure.
1506 */
1507static struct allocindir *
1508newallocindir(ip, ptrno, newblkno, oldblkno)
1509 struct inode *ip; /* inode for file being extended */
1510 int ptrno; /* offset of pointer in indirect block */
1511 ufs_daddr_t newblkno; /* disk block number being added */
1512 ufs_daddr_t oldblkno; /* previous block number, 0 if none */
1513{
1514 struct allocindir *aip;
1515
1516 MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1517 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1518 aip->ai_list.wk_type = D_ALLOCINDIR;
1519 aip->ai_state = ATTACHED;
1520 aip->ai_offset = ptrno;
1521 aip->ai_newblkno = newblkno;
1522 aip->ai_oldblkno = oldblkno;
1523 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1524 return (aip);
1525}
1526
1527/*
1528 * Called just before setting an indirect block pointer
1529 * to a newly allocated file page.
1530 */
1531void
1532softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1533 struct inode *ip; /* inode for file being extended */
1534 ufs_lbn_t lbn; /* allocated block number within file */
1535 struct buf *bp; /* buffer with indirect blk referencing page */
1536 int ptrno; /* offset of pointer in indirect block */
1537 ufs_daddr_t newblkno; /* disk block number being added */
1538 ufs_daddr_t oldblkno; /* previous block number, 0 if none */
1539 struct buf *nbp; /* buffer holding allocated page */
1540{
1541 struct allocindir *aip;
1542 struct pagedep *pagedep;
1543
1544 aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1545 ACQUIRE_LOCK(&lk);
1546 /*
1547 * If we are allocating a directory page, then we must
1548 * allocate an associated pagedep to track additions and
1549 * deletions.
1550 */
1551 if ((ip->i_mode & IFMT) == IFDIR &&
1552 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1553 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1554 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1555 FREE_LOCK(&lk);
1556 setup_allocindir_phase2(bp, ip, aip);
1557}
1558
1559/*
1560 * Called just before setting an indirect block pointer to a
1561 * newly allocated indirect block.
1562 */
1563void
1564softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1565 struct buf *nbp; /* newly allocated indirect block */
1566 struct inode *ip; /* inode for file being extended */
1567 struct buf *bp; /* indirect block referencing allocated block */
1568 int ptrno; /* offset of pointer in indirect block */
1569 ufs_daddr_t newblkno; /* disk block number being added */
1570{
1571 struct allocindir *aip;
1572
1573 aip = newallocindir(ip, ptrno, newblkno, 0);
1574 ACQUIRE_LOCK(&lk);
1575 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1576 FREE_LOCK(&lk);
1577 setup_allocindir_phase2(bp, ip, aip);
1578}
1579
1580/*
1581 * Called to finish the allocation of the "aip" allocated
1582 * by one of the two routines above.
1583 */
1584static void
1585setup_allocindir_phase2(bp, ip, aip)
1586 struct buf *bp; /* in-memory copy of the indirect block */
1587 struct inode *ip; /* inode for file being extended */
1588 struct allocindir *aip; /* allocindir allocated by the above routines */
1589{
1590 struct worklist *wk;
1591 struct indirdep *indirdep, *newindirdep;
1592 struct bmsafemap *bmsafemap;
1593 struct allocindir *oldaip;
1594 struct freefrag *freefrag;
1595 struct newblk *newblk;
1596
1597 if (bp->b_lblkno >= 0)
1598 panic("setup_allocindir_phase2: not indir blk");
1599 for (indirdep = NULL, newindirdep = NULL; ; ) {
1600 ACQUIRE_LOCK(&lk);
1601 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1602 if (wk->wk_type != D_INDIRDEP)
1603 continue;
1604 indirdep = WK_INDIRDEP(wk);
1605 break;
1606 }
1607 if (indirdep == NULL && newindirdep) {
1608 indirdep = newindirdep;
1609 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1610 newindirdep = NULL;
1611 }
1612 FREE_LOCK(&lk);
1613 if (indirdep) {
1614 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1615 &newblk) == 0)
1616 panic("setup_allocindir: lost block");
1617 ACQUIRE_LOCK(&lk);
1618 if (newblk->nb_state == DEPCOMPLETE) {
1619 aip->ai_state |= DEPCOMPLETE;
1620 aip->ai_buf = NULL;
1621 } else {
1622 bmsafemap = newblk->nb_bmsafemap;
1623 aip->ai_buf = bmsafemap->sm_buf;
1624 LIST_REMOVE(newblk, nb_deps);
1625 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1626 aip, ai_deps);
1627 }
1628 LIST_REMOVE(newblk, nb_hash);
1629 FREE(newblk, M_NEWBLK);
1630 aip->ai_indirdep = indirdep;
1631 /*
1632 * Check to see if there is an existing dependency
1633 * for this block. If there is, merge the old
1634 * dependency into the new one.
1635 */
1636 if (aip->ai_oldblkno == 0)
1637 oldaip = NULL;
1638 else
1640 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1641 if (oldaip->ai_offset == aip->ai_offset)
1642 break;
1643 freefrag = NULL;
1644 if (oldaip != NULL) {
1645 if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1646 FREE_LOCK(&lk);
1647 panic("setup_allocindir_phase2: blkno");
1648 }
1649 aip->ai_oldblkno = oldaip->ai_oldblkno;
1650 freefrag = aip->ai_freefrag;
1651 aip->ai_freefrag = oldaip->ai_freefrag;
1652 oldaip->ai_freefrag = NULL;
1653 free_allocindir(oldaip, NULL);
1654 }
1655 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1656 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1657 [aip->ai_offset] = aip->ai_oldblkno;
1658 FREE_LOCK(&lk);
1659 if (freefrag != NULL)
1660 handle_workitem_freefrag(freefrag);
1661 }
1662 if (newindirdep) {
1663 			if (newindirdep->ir_savebp != NULL)
1664 brelse(newindirdep->ir_savebp);
1665 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1666 }
1667 if (indirdep)
1668 break;
1669 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1670 M_INDIRDEP, M_SOFTDEP_FLAGS);
1671 newindirdep->ir_list.wk_type = D_INDIRDEP;
1672 newindirdep->ir_state = ATTACHED;
1673 LIST_INIT(&newindirdep->ir_deplisthd);
1674 LIST_INIT(&newindirdep->ir_donehd);
1675 if (bp->b_blkno == bp->b_lblkno)
1676 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &bp->b_blkno, NULL, NULL);
1677 newindirdep->ir_savebp =
1678 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1679 BUF_KERNPROC(newindirdep->ir_savebp);
1680 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1681 }
1682}
1683
1684/*
1685 * Block de-allocation dependencies.
1686 *
1687 * When blocks are de-allocated, the on-disk pointers must be nullified before
1688 * the blocks are made available for use by other files. (The true
1689 * requirement is that old pointers must be nullified before new on-disk
1690 * pointers are set. We chose this slightly more stringent requirement to
1691 * reduce complexity.) Our implementation handles this dependency by updating
1692 * the inode (or indirect block) appropriately but delaying the actual block
1693 * de-allocation (i.e., freemap and free space count manipulation) until
1694 * after the updated versions reach stable storage. After the disk is
1695 * updated, the blocks can be safely de-allocated whenever it is convenient.
1696 * This implementation handles only the common case of reducing a file's
1697 * length to zero. Other cases are handled by the conventional synchronous
1698 * write approach.
1699 *
1700 * The ffs implementation with which we worked double-checks
1701 * the state of the block pointers and file size as it reduces
1702 * a file's length. Some of this code is replicated here in our
1703 * soft updates implementation. The freeblks->fb_chkcnt field is
1704 * used to transfer a part of this information to the procedure
1705 * that eventually de-allocates the blocks.
1706 *
1707 * This routine should be called from the routine that shortens
1708 * a file's length, before the inode's size or block pointers
1709 * are modified. It will save the block pointer information for
1710 * later release and zero the inode so that the calling routine
1711 * can release it.
1712 */
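/*
 * Illustrative sketch (not part of the original code): the ordering the
 * freeblks machinery enforces when a file is truncated to length zero.
 * The pointers are saved and zeroed before the inode is written; the
 * saved blocks are released only after that write has completed.  All
 * names here (NPTRS_SKETCH, struct freeblks_sketch, the two helpers)
 * are hypothetical.
 */
#if 0
#define NPTRS_SKETCH	12	/* direct block pointers, for illustration */

struct freeblks_sketch {
	int	saved[NPTRS_SKETCH];	/* block numbers to free later */
};

/* Step 1: save and clear the pointers before the inode is written. */
static void
setup_freeblocks_sketch(int *di_db, struct freeblks_sketch *fb)
{
	int i;

	for (i = 0; i < NPTRS_SKETCH; i++) {
		fb->saved[i] = di_db[i];
		di_db[i] = 0;	/* the on-disk inode will reference nothing */
	}
}

/* Step 2: run only after the zeroed inode has reached stable storage. */
static void
handle_freeblocks_sketch(struct freeblks_sketch *fb, void (*blkfree)(int))
{
	int i;

	for (i = 0; i < NPTRS_SKETCH; i++)
		if (fb->saved[i] != 0)
			(*blkfree)(fb->saved[i]);
}
#endif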
1713void
1714softdep_setup_freeblocks(ip, length)
1715 struct inode *ip; /* The inode whose length is to be reduced */
1716 off_t length; /* The new length for the file */
1717{
1718 struct freeblks *freeblks;
1719 struct inodedep *inodedep;
1720 struct allocdirect *adp;
1721 struct vnode *vp;
1722 struct buf *bp;
1723 struct fs *fs;
1724 int i, delay, error;
1725
1726 fs = ip->i_fs;
1727 if (length != 0)
61#include <sys/syslog.h>
62#include <sys/vnode.h>
63#include <sys/conf.h>
64#include <ufs/ufs/dir.h>
65#include <ufs/ufs/extattr.h>
66#include <ufs/ufs/quota.h>
67#include <ufs/ufs/inode.h>
68#include <ufs/ufs/ufsmount.h>
69#include <ufs/ffs/fs.h>
70#include <ufs/ffs/softdep.h>
71#include <ufs/ffs/ffs_extern.h>
72#include <ufs/ufs/ufs_extern.h>
73
74/*
75 * These definitions need to be adapted to the system to which
76 * this file is being ported.
77 */
78/*
79 * malloc types defined for the softdep system.
80 */
81static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
82static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
83static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
84static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
85static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
86static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
87static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
88static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
89static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
90static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
91static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
92static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
93static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
94
95#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
96
97#define D_PAGEDEP 0
98#define D_INODEDEP 1
99#define D_NEWBLK 2
100#define D_BMSAFEMAP 3
101#define D_ALLOCDIRECT 4
102#define D_INDIRDEP 5
103#define D_ALLOCINDIR 6
104#define D_FREEFRAG 7
105#define D_FREEBLKS 8
106#define D_FREEFILE 9
107#define D_DIRADD 10
108#define D_MKDIR 11
109#define D_DIRREM 12
110#define D_LAST D_DIRREM
111
112/*
113 * translate from workitem type to memory type
114 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
115 */
116static struct malloc_type *memtype[] = {
117 M_PAGEDEP,
118 M_INODEDEP,
119 M_NEWBLK,
120 M_BMSAFEMAP,
121 M_ALLOCDIRECT,
122 M_INDIRDEP,
123 M_ALLOCINDIR,
124 M_FREEFRAG,
125 M_FREEBLKS,
126 M_FREEFILE,
127 M_DIRADD,
128 M_MKDIR,
129 M_DIRREM
130};
131
132#define DtoM(type) (memtype[type])
133
134/*
135 * Names of malloc types.
136 */
137#define TYPENAME(type) \
138	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
139/*
140 * End system adaptation definitions.
141 */
142
143/*
144 * Internal function prototypes.
145 */
146static void softdep_error __P((char *, int));
147static void drain_output __P((struct vnode *, int));
148static int getdirtybuf __P((struct buf **, int));
149static void clear_remove __P((struct proc *));
150static void clear_inodedeps __P((struct proc *));
151static int flush_pagedep_deps __P((struct vnode *, struct mount *,
152 struct diraddhd *));
153static int flush_inodedep_deps __P((struct fs *, ino_t));
154static int handle_written_filepage __P((struct pagedep *, struct buf *));
155static void diradd_inode_written __P((struct diradd *, struct inodedep *));
156static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
157static void handle_allocdirect_partdone __P((struct allocdirect *));
158static void handle_allocindir_partdone __P((struct allocindir *));
159static void initiate_write_filepage __P((struct pagedep *, struct buf *));
160static void handle_written_mkdir __P((struct mkdir *, int));
161static void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
162static void handle_workitem_freefile __P((struct freefile *));
163static void handle_workitem_remove __P((struct dirrem *));
164static struct dirrem *newdirrem __P((struct buf *, struct inode *,
165 struct inode *, int, struct dirrem **));
166static void free_diradd __P((struct diradd *));
167static void free_allocindir __P((struct allocindir *, struct inodedep *));
168static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
169 long *));
170static void deallocate_dependencies __P((struct buf *, struct inodedep *));
171static void free_allocdirect __P((struct allocdirectlst *,
172 struct allocdirect *, int));
173static int check_inode_unwritten __P((struct inodedep *));
174static int free_inodedep __P((struct inodedep *));
175static void handle_workitem_freeblocks __P((struct freeblks *, int));
176static void merge_inode_lists __P((struct inodedep *));
177static void setup_allocindir_phase2 __P((struct buf *, struct inode *,
178 struct allocindir *));
179static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
180 ufs_daddr_t));
181static void handle_workitem_freefrag __P((struct freefrag *));
182static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
183static void allocdirect_merge __P((struct allocdirectlst *,
184 struct allocdirect *, struct allocdirect *));
185static struct bmsafemap *bmsafemap_lookup __P((struct buf *));
186static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
187 struct newblk **));
188static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
189static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
190 struct pagedep **));
191static void pause_timer __P((void *));
192static int request_cleanup __P((int, int));
193static int process_worklist_item __P((struct mount *, int));
194static void add_to_worklist __P((struct worklist *));
195
196/*
197 * Exported softdep operations.
198 */
199static void softdep_disk_io_initiation __P((struct buf *));
200static void softdep_disk_write_complete __P((struct buf *));
201static void softdep_deallocate_dependencies __P((struct buf *));
202static void softdep_move_dependencies __P((struct buf *, struct buf *));
203static int softdep_count_dependencies __P((struct buf *bp, int));
204
205struct bio_ops bioops = {
206 softdep_disk_io_initiation, /* io_start */
207 softdep_disk_write_complete, /* io_complete */
208 softdep_deallocate_dependencies, /* io_deallocate */
209 softdep_move_dependencies, /* io_movedeps */
210 softdep_count_dependencies, /* io_countdeps */
211};
212
213/*
214 * Locking primitives.
215 *
216 * For a uniprocessor, all we need to do is protect against disk
217 * interrupts. For a multiprocessor, this lock would have to be
218 * a mutex. A single mutex is used throughout this file, though
219 * finer grain locking could be used if contention warranted it.
220 *
221 * For a multiprocessor, the sleep call would accept a lock and
222 * release it after the sleep processing was complete. In a uniprocessor
223 * implementation there is no such interlock, so we simply mark
224 * the places where it needs to be done with the `interlocked' form
225 * of the lock calls. Since the uniprocessor sleep already interlocks
226 * the spl, there is nothing that really needs to be done.
227 */
228#ifndef /* NOT */ DEBUG
229static struct lockit {
230 int lkt_spl;
231} lk = { 0 };
232#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
233#define FREE_LOCK(lk) splx((lk)->lkt_spl)
234#define ACQUIRE_LOCK_INTERLOCKED(lk)
235#define FREE_LOCK_INTERLOCKED(lk)
236
237#else /* DEBUG */
238static struct lockit {
239 int lkt_spl;
240 pid_t lkt_held;
241} lk = { 0, -1 };
242static int lockcnt;
243
244static void acquire_lock __P((struct lockit *));
245static void free_lock __P((struct lockit *));
246static void acquire_lock_interlocked __P((struct lockit *));
247static void free_lock_interlocked __P((struct lockit *));
248
249#define ACQUIRE_LOCK(lk) acquire_lock(lk)
250#define FREE_LOCK(lk) free_lock(lk)
251#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk)
252#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk)
253
254static void
255acquire_lock(lk)
256 struct lockit *lk;
257{
258 pid_t holder;
259
260 if (lk->lkt_held != -1) {
261 holder = lk->lkt_held;
262 FREE_LOCK(lk);
263 if (holder == CURPROC->p_pid)
264 panic("softdep_lock: locking against myself");
265 else
266 panic("softdep_lock: lock held by %d", holder);
267 }
268 lk->lkt_spl = splbio();
269 lk->lkt_held = CURPROC->p_pid;
270 lockcnt++;
271}
272
273static void
274free_lock(lk)
275 struct lockit *lk;
276{
277
278 if (lk->lkt_held == -1)
279 panic("softdep_unlock: lock not held");
280 lk->lkt_held = -1;
281 splx(lk->lkt_spl);
282}
283
284static void
285acquire_lock_interlocked(lk)
286 struct lockit *lk;
287{
288 pid_t holder;
289
290 if (lk->lkt_held != -1) {
291 holder = lk->lkt_held;
292 FREE_LOCK(lk);
293 if (holder == CURPROC->p_pid)
294 panic("softdep_lock_interlocked: locking against self");
295 else
296 panic("softdep_lock_interlocked: lock held by %d",
297 holder);
298 }
299 lk->lkt_held = CURPROC->p_pid;
300 lockcnt++;
301}
302
303static void
304free_lock_interlocked(lk)
305 struct lockit *lk;
306{
307
308 if (lk->lkt_held == -1)
309 panic("softdep_unlock_interlocked: lock not held");
310 lk->lkt_held = -1;
311}
312#endif /* DEBUG */
313
314/*
315 * Placeholder for real semaphores.
316 */
317struct sema {
318 int value;
319 pid_t holder;
320 char *name;
321 int prio;
322 int timo;
323};
324static void sema_init __P((struct sema *, char *, int, int));
325static int sema_get __P((struct sema *, struct lockit *));
326static void sema_release __P((struct sema *));
327
328static void
329sema_init(semap, name, prio, timo)
330 struct sema *semap;
331 char *name;
332 int prio, timo;
333{
334
335 semap->holder = -1;
336 semap->value = 0;
337 semap->name = name;
338 semap->prio = prio;
339 semap->timo = timo;
340}
341
342static int
343sema_get(semap, interlock)
344 struct sema *semap;
345 struct lockit *interlock;
346{
347
348 if (semap->value++ > 0) {
349 if (interlock != NULL)
350 FREE_LOCK_INTERLOCKED(interlock);
351 tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
352 if (interlock != NULL) {
353 ACQUIRE_LOCK_INTERLOCKED(interlock);
354 FREE_LOCK(interlock);
355 }
356 return (0);
357 }
358 semap->holder = CURPROC->p_pid;
359 if (interlock != NULL)
360 FREE_LOCK(interlock);
361 return (1);
362}
363
364static void
365sema_release(semap)
366 struct sema *semap;
367{
368
369 if (semap->value <= 0 || semap->holder != CURPROC->p_pid) {
370 if (lk.lkt_held != -1)
371 FREE_LOCK(&lk);
372 panic("sema_release: not held");
373 }
374 if (--semap->value > 0) {
375 semap->value = 0;
376 wakeup(semap);
377 }
378 semap->holder = -1;
379}
380
381/*
382 * Worklist queue management.
383 * These routines require that the lock be held.
384 */
385#ifndef /* NOT */ DEBUG
386#define WORKLIST_INSERT(head, item) do { \
387 (item)->wk_state |= ONWORKLIST; \
388 LIST_INSERT_HEAD(head, item, wk_list); \
389} while (0)
390#define WORKLIST_REMOVE(item) do { \
391 (item)->wk_state &= ~ONWORKLIST; \
392 LIST_REMOVE(item, wk_list); \
393} while (0)
394#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
395
396#else /* DEBUG */
397static void worklist_insert __P((struct workhead *, struct worklist *));
398static void worklist_remove __P((struct worklist *));
399static void workitem_free __P((struct worklist *, int));
400
401#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
402#define WORKLIST_REMOVE(item) worklist_remove(item)
403#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
404
405static void
406worklist_insert(head, item)
407 struct workhead *head;
408 struct worklist *item;
409{
410
411 if (lk.lkt_held == -1)
412 panic("worklist_insert: lock not held");
413 if (item->wk_state & ONWORKLIST) {
414 FREE_LOCK(&lk);
415 panic("worklist_insert: already on list");
416 }
417 item->wk_state |= ONWORKLIST;
418 LIST_INSERT_HEAD(head, item, wk_list);
419}
420
421static void
422worklist_remove(item)
423 struct worklist *item;
424{
425
426 if (lk.lkt_held == -1)
427 panic("worklist_remove: lock not held");
428 if ((item->wk_state & ONWORKLIST) == 0) {
429 FREE_LOCK(&lk);
430 panic("worklist_remove: not on list");
431 }
432 item->wk_state &= ~ONWORKLIST;
433 LIST_REMOVE(item, wk_list);
434}
435
436static void
437workitem_free(item, type)
438 struct worklist *item;
439 int type;
440{
441
442 if (item->wk_state & ONWORKLIST) {
443 if (lk.lkt_held != -1)
444 FREE_LOCK(&lk);
445 panic("workitem_free: still on list");
446 }
447 if (item->wk_type != type) {
448 if (lk.lkt_held != -1)
449 FREE_LOCK(&lk);
450 panic("workitem_free: type mismatch");
451 }
452 FREE(item, DtoM(type));
453}
454#endif /* DEBUG */
455
456/*
457 * Workitem queue management
458 */
459static struct workhead softdep_workitem_pending;
460static int num_on_worklist; /* number of worklist items to be processed */
461static int softdep_worklist_busy; /* 1 => trying to do unmount */
462static int softdep_worklist_req; /* serialized waiters */
463static int max_softdeps; /* maximum number of structs before slowdown */
464static int tickdelay = 2; /* number of ticks to pause during slowdown */
465static int proc_waiting; /* tracks whether we have a timeout posted */
466static int *stat_countp; /* statistic to count in proc_waiting timeout */
467static struct callout_handle handle; /* handle on posted proc_waiting timeout */
468static struct proc *filesys_syncer; /* proc of filesystem syncer process */
469static int req_clear_inodedeps; /* syncer process flush some inodedeps */
470#define FLUSH_INODES 1
471static int req_clear_remove; /* syncer process flush some freeblks */
472#define FLUSH_REMOVE 2
473/*
474 * runtime statistics
475 */
476static int stat_worklist_push; /* number of worklist cleanups */
477static int stat_blk_limit_push; /* number of times block limit neared */
478static int stat_ino_limit_push; /* number of times inode limit neared */
479static int stat_blk_limit_hit; /* number of times block slowdown imposed */
480static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
481static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
482static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
483static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
484static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
485static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
486#ifdef DEBUG
487#include <vm/vm.h>
488#include <sys/sysctl.h>
489SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
490SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
491SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
492SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
493SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
494SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
495SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
496SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
497SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
498SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
499SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
500SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
501#endif /* DEBUG */
502
503/*
504 * Add an item to the end of the work queue.
505 * This routine requires that the lock be held.
506 * This is the only routine that adds items to the list.
507 * The following routine is the only one that removes items
508 * and does so in order from first to last.
509 */
510static void
511add_to_worklist(wk)
512 struct worklist *wk;
513{
514 static struct worklist *worklist_tail;
515
516 if (wk->wk_state & ONWORKLIST) {
517 if (lk.lkt_held != -1)
518 FREE_LOCK(&lk);
519 panic("add_to_worklist: already on list");
520 }
521 wk->wk_state |= ONWORKLIST;
522 if (LIST_FIRST(&softdep_workitem_pending) == NULL)
523 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
524 else
525 LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
526 worklist_tail = wk;
527 num_on_worklist += 1;
528}
529
530/*
531 * Process that runs once per second to handle items in the background queue.
532 *
533 * Note that we ensure that items are processed in the order in which they
534 * appear in the queue. The code below depends on this property to ensure
535 * that blocks of a file are freed before the inode itself is freed. This
536 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
537 * until all the old ones have been purged from the dependency lists.
538 */
539int
540softdep_process_worklist(matchmnt)
541 struct mount *matchmnt;
542{
543 struct proc *p = CURPROC;
544 int matchcnt, loopcount;
545 long starttime;
546
547 /*
548 * Record the process identifier of our caller so that we can give
549 * this process preferential treatment in request_cleanup below.
550 */
551 filesys_syncer = p;
552 matchcnt = 0;
553
554 /*
555 * There is no danger of having multiple processes run this
556 * code, but we have to single-thread it when softdep_flushfiles()
557 * is in operation to get an accurate count of the number of items
558 * related to its mount point that are in the list.
559 */
560 if (matchmnt == NULL) {
561 if (softdep_worklist_busy < 0)
562 return(-1);
563 softdep_worklist_busy += 1;
564 }
565
566 /*
567 * If requested, try removing inode or removal dependencies.
568 */
569 if (req_clear_inodedeps) {
570 clear_inodedeps(p);
571 req_clear_inodedeps -= 1;
572 wakeup_one(&proc_waiting);
573 }
574 if (req_clear_remove) {
575 clear_remove(p);
576 req_clear_remove -= 1;
577 wakeup_one(&proc_waiting);
578 }
579 loopcount = 1;
580 starttime = time_second;
581 while (num_on_worklist > 0) {
582 matchcnt += process_worklist_item(matchmnt, 0);
583
584 /*
585 * If a umount operation wants to run the worklist
586 * accurately, abort.
587 */
588 if (softdep_worklist_req && matchmnt == NULL) {
589 matchcnt = -1;
590 break;
591 }
592
593 /*
594 * If requested, try removing inode or removal dependencies.
595 */
596 if (req_clear_inodedeps) {
597 clear_inodedeps(p);
598 req_clear_inodedeps -= 1;
599 wakeup_one(&proc_waiting);
600 }
601 if (req_clear_remove) {
602 clear_remove(p);
603 req_clear_remove -= 1;
604 wakeup_one(&proc_waiting);
605 }
606 /*
607 * We do not generally want to stop for buffer space, but if
608 * we are really being a buffer hog, we will stop and wait.
609 */
610 if (loopcount++ % 128 == 0)
611 bwillwrite();
612 /*
613 * Never allow processing to run for more than one
614 * second. Otherwise the other syncer tasks may get
615 * excessively backlogged.
616 */
617 if (starttime != time_second && matchmnt == NULL) {
618 matchcnt = -1;
619 break;
620 }
621 }
622 if (matchmnt == NULL) {
623 softdep_worklist_busy -= 1;
624 if (softdep_worklist_req && softdep_worklist_busy == 0)
625 wakeup(&softdep_worklist_req);
626 }
627 return (matchcnt);
628}
629
630/*
631 * Process one item on the worklist.
632 */
633static int
634process_worklist_item(matchmnt, flags)
635 struct mount *matchmnt;
636 int flags;
637{
638 struct worklist *wk;
639 struct dirrem *dirrem;
640 struct mount *mp;
641 struct vnode *vp;
642 int matchcnt = 0;
643
644 ACQUIRE_LOCK(&lk);
645 /*
646 * Normally we just process each item on the worklist in order.
647 * However, if we are in a situation where we cannot lock any
648 * inodes, we have to skip over any dirrem requests whose
649 * vnodes are resident and locked.
650 */
651 LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
652 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
653 break;
654 dirrem = WK_DIRREM(wk);
655 vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
656 dirrem->dm_oldinum);
657 if (vp == NULL || !VOP_ISLOCKED(vp, CURPROC))
658 break;
659 }
660 if (wk == 0) {
661 FREE_LOCK(&lk);
662 return (0);
663 }
664 WORKLIST_REMOVE(wk);
665 num_on_worklist -= 1;
666 FREE_LOCK(&lk);
667 switch (wk->wk_type) {
668
669 case D_DIRREM:
670 /* removal of a directory entry */
671 mp = WK_DIRREM(wk)->dm_mnt;
672 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
673 panic("%s: dirrem on suspended filesystem",
674 "process_worklist_item");
675 if (mp == matchmnt)
676 matchcnt += 1;
677 handle_workitem_remove(WK_DIRREM(wk));
678 break;
679
680 case D_FREEBLKS:
681 /* releasing blocks and/or fragments from a file */
682 mp = WK_FREEBLKS(wk)->fb_mnt;
683 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
684 panic("%s: freeblks on suspended filesystem",
685 "process_worklist_item");
686 if (mp == matchmnt)
687 matchcnt += 1;
688 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
689 break;
690
691 case D_FREEFRAG:
692 /* releasing a fragment when replaced as a file grows */
693 mp = WK_FREEFRAG(wk)->ff_mnt;
694 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
695 panic("%s: freefrag on suspended filesystem",
696 "process_worklist_item");
697 if (mp == matchmnt)
698 matchcnt += 1;
699 handle_workitem_freefrag(WK_FREEFRAG(wk));
700 break;
701
702 case D_FREEFILE:
703 /* releasing an inode when its link count drops to 0 */
704 mp = WK_FREEFILE(wk)->fx_mnt;
705 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
706 panic("%s: freefile on suspended filesystem",
707 "process_worklist_item");
708 if (mp == matchmnt)
709 matchcnt += 1;
710 handle_workitem_freefile(WK_FREEFILE(wk));
711 break;
712
713 default:
714 panic("%s_process_worklist: Unknown type %s",
715 "softdep", TYPENAME(wk->wk_type));
716 /* NOTREACHED */
717 }
718 return (matchcnt);
719}
720
721/*
722 * Move dependencies from one buffer to another.
723 */
724static void
725softdep_move_dependencies(oldbp, newbp)
726 struct buf *oldbp;
727 struct buf *newbp;
728{
729 struct worklist *wk, *wktail;
730
731 if (LIST_FIRST(&newbp->b_dep) != NULL)
732 panic("softdep_move_dependencies: need merge code");
733 wktail = 0;
734 ACQUIRE_LOCK(&lk);
735 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
736 LIST_REMOVE(wk, wk_list);
737 if (wktail == 0)
738 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
739 else
740 LIST_INSERT_AFTER(wktail, wk, wk_list);
741 wktail = wk;
742 }
743 FREE_LOCK(&lk);
744}
745
746/*
747 * Purge the work list of all items associated with a particular mount point.
748 */
749int
750softdep_flushworklist(oldmnt, countp, p)
751 struct mount *oldmnt;
752 int *countp;
753 struct proc *p;
754{
755 struct vnode *devvp;
756 int count, error = 0;
757
758 /*
759 * Await our turn to clear out the queue, then serialize access.
760 */
761 while (softdep_worklist_busy) {
762 softdep_worklist_req += 1;
763 tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
764 softdep_worklist_req -= 1;
765 }
766 softdep_worklist_busy = -1;
767 /*
768 * Alternately flush the block device associated with the mount
769 * point and process any dependencies that the flushing
770 * creates. We continue until no more worklist dependencies
771 * are found.
772 */
773 *countp = 0;
774 devvp = VFSTOUFS(oldmnt)->um_devvp;
775 while ((count = softdep_process_worklist(oldmnt)) > 0) {
776 *countp += count;
777 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
778 error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
779 VOP_UNLOCK(devvp, 0, p);
780 if (error)
781 break;
782 }
783 softdep_worklist_busy = 0;
784 if (softdep_worklist_req)
785 wakeup(&softdep_worklist_req);
786 return (error);
787}
788
789/*
790 * Flush all vnodes and worklist items associated with a specified mount point.
791 */
792int
793softdep_flushfiles(oldmnt, flags, p)
794 struct mount *oldmnt;
795 int flags;
796 struct proc *p;
797{
798 int error, count, loopcnt;
799
800 /*
801 * Alternately flush the vnodes associated with the mount
802 * point and process any dependencies that the flushing
803 * creates. In theory, this loop should iterate at most twice,
804 * but we allow a few extra passes just to be sure.
805 */
806 for (loopcnt = 10; loopcnt > 0; loopcnt--) {
807 /*
808 * Do another flush in case any vnodes were brought in
809 * as part of the cleanup operations.
810 */
811 if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
812 break;
813 if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
814 count == 0)
815 break;
816 }
817 /*
818 * If we are unmounting then it is an error to fail. If we
819 * are simply trying to downgrade to read-only, then filesystem
820 * activity can keep us busy forever, so we just fail with EBUSY.
821 */
822 if (loopcnt == 0) {
823 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
824 panic("softdep_flushfiles: looping");
825 error = EBUSY;
826 }
827 return (error);
828}
829
830/*
831 * Structure hashing.
832 *
833 * There are three types of structures that can be looked up:
834 * 1) pagedep structures identified by mount point, inode number,
835 * and logical block.
836 * 2) inodedep structures identified by mount point and inode number.
837 * 3) newblk structures identified by mount point and
838 * physical block number.
839 *
840 * The "pagedep" and "inodedep" dependency structures are hashed
841 * separately from the file blocks and inodes to which they correspond.
842 * This separation helps when the in-memory copy of an inode or
843 * file block must be replaced. It also obviates the need to access
844 * an inode or file page when simply updating (or de-allocating)
845 * dependency structures. Lookup of newblk structures is needed to
846 * find newly allocated blocks when trying to associate them with
847 * their allocdirect or allocindir structure.
848 *
849 * The lookup routines optionally create and hash a new instance when
850 * an existing entry is not found.
851 */
852#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
853#define NODELAY 0x0002 /* cannot do background work */
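/*
 * Illustrative sketch (not part of the original code): the
 * lookup-or-create pattern shared by pagedep_lookup(), inodedep_lookup()
 * and newblk_lookup() below.  The helpers used here (find_sketch,
 * alloc_sketch, insert_sketch, the in_progress semaphore and struct
 * entry_sketch) are hypothetical; the point is the retry after a
 * contested sema_get(), which covers the window in which the lock is
 * dropped so that the allocation may sleep.
 */
#if 0
static int
lookup_or_create_sketch(int key, int flags, struct entry_sketch **epp)
{
top:
	if ((*epp = find_sketch(key)) != NULL)
		return (1);		/* existing entry found */
	if ((flags & DEPALLOC) == 0) {
		*epp = NULL;
		return (0);		/* caller did not ask for creation */
	}
	/*
	 * If another thread was already creating this entry, it has
	 * finished by the time sema_get() returns, so search again
	 * rather than allocating a duplicate.
	 */
	if (sema_get(&in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	*epp = alloc_sketch(key);	/* may sleep; lock is not held */
	ACQUIRE_LOCK(&lk);
	insert_sketch(key, *epp);
	sema_release(&in_progress);
	return (0);			/* newly created entry */
}
#endif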
854
855/*
856 * Structures and routines associated with pagedep caching.
857 */
858LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
859u_long pagedep_hash; /* size of hash table - 1 */
860#define PAGEDEP_HASH(mp, inum, lbn) \
861 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
862 pagedep_hash])
863static struct sema pagedep_in_progress;
864
865/*
866 * Look up a pagedep. Return 1 if found, 0 if not found.
867 * If not found, allocate if DEPALLOC flag is passed.
868 * Found or allocated entry is returned in pagedeppp.
869 * This routine must be called with splbio interrupts blocked.
870 */
871static int
872pagedep_lookup(ip, lbn, flags, pagedeppp)
873 struct inode *ip;
874 ufs_lbn_t lbn;
875 int flags;
876 struct pagedep **pagedeppp;
877{
878 struct pagedep *pagedep;
879 struct pagedep_hashhead *pagedephd;
880 struct mount *mp;
881 int i;
882
883#ifdef DEBUG
884 if (lk.lkt_held == -1)
885 panic("pagedep_lookup: lock not held");
886#endif
887 mp = ITOV(ip)->v_mount;
888 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
889top:
890 LIST_FOREACH(pagedep, pagedephd, pd_hash)
891 if (ip->i_number == pagedep->pd_ino &&
892 lbn == pagedep->pd_lbn &&
893 mp == pagedep->pd_mnt)
894 break;
895 if (pagedep) {
896 *pagedeppp = pagedep;
897 return (1);
898 }
899 if ((flags & DEPALLOC) == 0) {
900 *pagedeppp = NULL;
901 return (0);
902 }
903 if (sema_get(&pagedep_in_progress, &lk) == 0) {
904 ACQUIRE_LOCK(&lk);
905 goto top;
906 }
907 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
908 M_SOFTDEP_FLAGS|M_ZERO);
909 pagedep->pd_list.wk_type = D_PAGEDEP;
910 pagedep->pd_mnt = mp;
911 pagedep->pd_ino = ip->i_number;
912 pagedep->pd_lbn = lbn;
913 LIST_INIT(&pagedep->pd_dirremhd);
914 LIST_INIT(&pagedep->pd_pendinghd);
915 for (i = 0; i < DAHASHSZ; i++)
916 LIST_INIT(&pagedep->pd_diraddhd[i]);
917 ACQUIRE_LOCK(&lk);
918 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
919 sema_release(&pagedep_in_progress);
920 *pagedeppp = pagedep;
921 return (0);
922}
923
924/*
925 * Structures and routines associated with inodedep caching.
926 */
927LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
928static u_long inodedep_hash; /* size of hash table - 1 */
929static long num_inodedep; /* number of inodedep allocated */
930#define INODEDEP_HASH(fs, inum) \
931 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
932static struct sema inodedep_in_progress;
933
934/*
935 * Look up an inodedep. Return 1 if found, 0 if not found.
936 * If not found, allocate if DEPALLOC flag is passed.
937 * Found or allocated entry is returned in inodedeppp.
938 * This routine must be called with splbio interrupts blocked.
939 */
940static int
941inodedep_lookup(fs, inum, flags, inodedeppp)
942 struct fs *fs;
943 ino_t inum;
944 int flags;
945 struct inodedep **inodedeppp;
946{
947 struct inodedep *inodedep;
948 struct inodedep_hashhead *inodedephd;
949 int firsttry;
950
951#ifdef DEBUG
952 if (lk.lkt_held == -1)
953 panic("inodedep_lookup: lock not held");
954#endif
955 firsttry = 1;
956 inodedephd = INODEDEP_HASH(fs, inum);
957top:
958 LIST_FOREACH(inodedep, inodedephd, id_hash)
959 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
960 break;
961 if (inodedep) {
962 *inodedeppp = inodedep;
963 return (1);
964 }
965 if ((flags & DEPALLOC) == 0) {
966 *inodedeppp = NULL;
967 return (0);
968 }
969 /*
970 * If we are over our limit, try to improve the situation.
971 */
972 if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
973 request_cleanup(FLUSH_INODES, 1)) {
974 firsttry = 0;
975 goto top;
976 }
977 if (sema_get(&inodedep_in_progress, &lk) == 0) {
978 ACQUIRE_LOCK(&lk);
979 goto top;
980 }
981 num_inodedep += 1;
982 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
983 M_INODEDEP, M_SOFTDEP_FLAGS);
984 inodedep->id_list.wk_type = D_INODEDEP;
985 inodedep->id_fs = fs;
986 inodedep->id_ino = inum;
987 inodedep->id_state = ALLCOMPLETE;
988 inodedep->id_nlinkdelta = 0;
989 inodedep->id_savedino = NULL;
990 inodedep->id_savedsize = -1;
991 inodedep->id_buf = NULL;
992 LIST_INIT(&inodedep->id_pendinghd);
993 LIST_INIT(&inodedep->id_inowait);
994 LIST_INIT(&inodedep->id_bufwait);
995 TAILQ_INIT(&inodedep->id_inoupdt);
996 TAILQ_INIT(&inodedep->id_newinoupdt);
997 ACQUIRE_LOCK(&lk);
998 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
999 sema_release(&inodedep_in_progress);
1000 *inodedeppp = inodedep;
1001 return (0);
1002}
1003
1004/*
1005 * Structures and routines associated with newblk caching.
1006 */
1007LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1008u_long newblk_hash; /* size of hash table - 1 */
1009#define NEWBLK_HASH(fs, inum) \
1010 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1011static struct sema newblk_in_progress;
1012
1013/*
1014 * Look up a newblk. Return 1 if found, 0 if not found.
1015 * If not found, allocate if DEPALLOC flag is passed.
1016 * Found or allocated entry is returned in newblkpp.
1017 */
1018static int
1019newblk_lookup(fs, newblkno, flags, newblkpp)
1020 struct fs *fs;
1021 ufs_daddr_t newblkno;
1022 int flags;
1023 struct newblk **newblkpp;
1024{
1025 struct newblk *newblk;
1026 struct newblk_hashhead *newblkhd;
1027
1028 newblkhd = NEWBLK_HASH(fs, newblkno);
1029top:
1030 LIST_FOREACH(newblk, newblkhd, nb_hash)
1031 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1032 break;
1033 if (newblk) {
1034 *newblkpp = newblk;
1035 return (1);
1036 }
1037 if ((flags & DEPALLOC) == 0) {
1038 *newblkpp = NULL;
1039 return (0);
1040 }
1041 if (sema_get(&newblk_in_progress, 0) == 0)
1042 goto top;
1043 MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1044 M_NEWBLK, M_SOFTDEP_FLAGS);
1045 newblk->nb_state = 0;
1046 newblk->nb_fs = fs;
1047 newblk->nb_newblkno = newblkno;
1048 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1049 sema_release(&newblk_in_progress);
1050 *newblkpp = newblk;
1051 return (0);
1052}
1053
1054/*
1055 * Executed during filesystem initialization before
1056 * mounting any file systems.
1057 */
1058void
1059softdep_initialize()
1060{
1061
1062 LIST_INIT(&mkdirlisthd);
1063 LIST_INIT(&softdep_workitem_pending);
1064 max_softdeps = min(desiredvnodes * 8,
1065 M_INODEDEP->ks_limit / (2 * sizeof(struct inodedep)));
1066 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1067 &pagedep_hash);
1068 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1069 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1070 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1071 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1072 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1073}
1074
1075/*
1076 * Called at mount time to notify the dependency code that a
1077 * filesystem wishes to use it.
1078 */
1079int
1080softdep_mount(devvp, mp, fs, cred)
1081 struct vnode *devvp;
1082 struct mount *mp;
1083 struct fs *fs;
1084 struct ucred *cred;
1085{
1086 struct csum cstotal;
1087 struct cg *cgp;
1088 struct buf *bp;
1089 int error, cyl;
1090
1091 mp->mnt_flag &= ~MNT_ASYNC;
1092 mp->mnt_flag |= MNT_SOFTDEP;
1093 /*
1094 * When doing soft updates, the counters in the
1095 * superblock may have gotten out of sync, so we have
1096 * to scan the cylinder groups and recalculate them.
1097 */
1098 if (fs->fs_clean != 0)
1099 return (0);
1100 bzero(&cstotal, sizeof cstotal);
1101 for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1102 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1103 fs->fs_cgsize, cred, &bp)) != 0) {
1104 brelse(bp);
1105 return (error);
1106 }
1107 cgp = (struct cg *)bp->b_data;
1108 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1109 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1110 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1111 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1112 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1113 brelse(bp);
1114 }
1115#ifdef DEBUG
1116 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1117 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1118#endif
1119 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1120 return (0);
1121}
1122
1123/*
1124 * Protecting the freemaps (or bitmaps).
1125 *
1126 * To eliminate the need to execute fsck before mounting a file system
1127 * after a power failure, one must (conservatively) guarantee that the
1128 * on-disk copy of the bitmaps never indicates that a live inode or block is
1129 * free. So, when a block or inode is allocated, the bitmap should be
1130 * updated (on disk) before any new pointers. When a block or inode is
1131 * freed, the bitmap should not be updated until all pointers have been
1132 * reset. The latter dependency is handled by the delayed de-allocation
1133 * approach described below for block and inode de-allocation. The former
1134 * dependency is handled by calling the following procedure when a block or
1135 * inode is allocated. When an inode is allocated an "inodedep" is created
1136 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1137 * Each "inodedep" is also inserted into the hash indexing structure so
1138 * that any additional link additions can be made dependent on the inode
1139 * allocation.
1140 *
1141 * The ufs file system maintains a number of free block counts (e.g., per
1142 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1143 * in addition to the bitmaps. These counts are used to improve efficiency
1144 * during allocation and therefore must be consistent with the bitmaps.
1145 * There is no convenient way to guarantee post-crash consistency of these
1146 * counts with simple update ordering, for two main reasons: (1) The counts
1147 * and bitmaps for a single cylinder group block are not in the same disk
1148 * sector. If a disk write is interrupted (e.g., by power failure), one may
1149 * be written and the other not. (2) Some of the counts are located in the
1150 * superblock rather than the cylinder group block. So, we focus our soft
1151 * updates implementation on protecting the bitmaps. When mounting a
1152 * filesystem, we recompute the auxiliary counts from the bitmaps.
1153 */
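/*
 * Illustrative sketch (not part of the original code): the life of the
 * DEPCOMPLETE flag for a bitmap dependency.  The flag starts cleared
 * when the allocation is recorded and is set once the cylinder group
 * buffer has been written.  The struct and helper names here are
 * hypothetical; the real work is done by softdep_setup_inomapdep(),
 * softdep_setup_blkmapdep() and the buffer write-completion handlers.
 */
#if 0
struct bitmapdep_sketch {
	int		state;	/* dependency state flags */
	struct buf	*buf;	/* cylinder group buffer being waited on */
};

/* At allocation time: the bitmap naming the new inode/block is only in memory. */
static void
allocation_side_sketch(struct bitmapdep_sketch *dep, struct buf *cgbp)
{
	dep->state &= ~DEPCOMPLETE;	/* bitmap not yet on disk */
	dep->buf = cgbp;		/* remember the buffer to wait on */
}

/* After the cylinder group buffer reaches stable storage. */
static void
bitmap_written_sketch(struct bitmapdep_sketch *dep)
{
	dep->state |= DEPCOMPLETE;	/* pointers to the allocation may now be written */
	dep->buf = NULL;
}
#endif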
1154
1155/*
1156 * Called just after updating the cylinder group block to allocate an inode.
1157 */
1158void
1159softdep_setup_inomapdep(bp, ip, newinum)
1160 struct buf *bp; /* buffer for cylgroup block with inode map */
1161 struct inode *ip; /* inode related to allocation */
1162 ino_t newinum; /* new inode number being allocated */
1163{
1164 struct inodedep *inodedep;
1165 struct bmsafemap *bmsafemap;
1166
1167 /*
1168 * Create a dependency for the newly allocated inode.
1169 * Panic if it already exists, as something is seriously wrong.
1170 * Otherwise add it to the dependency list for the buffer holding
1171 * the cylinder group map from which it was allocated.
1172 */
1173 ACQUIRE_LOCK(&lk);
1174 if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
1175 FREE_LOCK(&lk);
1176 panic("softdep_setup_inomapdep: found inode");
1177 }
1178 inodedep->id_buf = bp;
1179 inodedep->id_state &= ~DEPCOMPLETE;
1180 bmsafemap = bmsafemap_lookup(bp);
1181 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1182 FREE_LOCK(&lk);
1183}
1184
1185/*
1186 * Called just after updating the cylinder group block to
1187 * allocate a block or fragment.
1188 */
1189void
1190softdep_setup_blkmapdep(bp, fs, newblkno)
1191 struct buf *bp; /* buffer for cylgroup block with block map */
1192 struct fs *fs; /* filesystem doing allocation */
1193 ufs_daddr_t newblkno; /* number of newly allocated block */
1194{
1195 struct newblk *newblk;
1196 struct bmsafemap *bmsafemap;
1197
1198 /*
1199 * Create a dependency for the newly allocated block.
1200 * Add it to the dependency list for the buffer holding
1201 * the cylinder group map from which it was allocated.
1202 */
1203 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1204 panic("softdep_setup_blkmapdep: found block");
1205 ACQUIRE_LOCK(&lk);
1206 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1207 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1208 FREE_LOCK(&lk);
1209}
1210
1211/*
1212 * Find the bmsafemap associated with a cylinder group buffer.
1213 * If none exists, create one. The buffer must be locked when
1214 * this routine is called and this routine must be called with
1215 * splbio interrupts blocked.
1216 */
1217static struct bmsafemap *
1218bmsafemap_lookup(bp)
1219 struct buf *bp;
1220{
1221 struct bmsafemap *bmsafemap;
1222 struct worklist *wk;
1223
1224#ifdef DEBUG
1225 if (lk.lkt_held == -1)
1226 panic("bmsafemap_lookup: lock not held");
1227#endif
1228 LIST_FOREACH(wk, &bp->b_dep, wk_list)
1229 if (wk->wk_type == D_BMSAFEMAP)
1230 return (WK_BMSAFEMAP(wk));
1231 FREE_LOCK(&lk);
1232 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1233 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1234 bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1235 bmsafemap->sm_list.wk_state = 0;
1236 bmsafemap->sm_buf = bp;
1237 LIST_INIT(&bmsafemap->sm_allocdirecthd);
1238 LIST_INIT(&bmsafemap->sm_allocindirhd);
1239 LIST_INIT(&bmsafemap->sm_inodedephd);
1240 LIST_INIT(&bmsafemap->sm_newblkhd);
1241 ACQUIRE_LOCK(&lk);
1242 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1243 return (bmsafemap);
1244}
1245
1246/*
1247 * Direct block allocation dependencies.
1248 *
1249 * When a new block is allocated, the corresponding disk locations must be
1250 * initialized (with zeros or new data) before the on-disk inode points to
1251 * them. Also, the freemap from which the block was allocated must be
1252 * updated (on disk) before the inode's pointer. These two dependencies are
1253 * independent of each other and are needed for all file blocks and indirect
1254 * blocks that are pointed to directly by the inode. Just before the
1255 * "in-core" version of the inode is updated with a newly allocated block
1256 * number, a procedure (below) is called to set up allocation dependency
1257 * structures. These structures are removed when the corresponding
1258 * dependencies are satisfied or when the block allocation becomes obsolete
1259 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1260 * fragment that gets upgraded). All of these cases are handled in
1261 * procedures described later.
1262 *
1263 * When a file extension causes a fragment to be upgraded, either to a larger
1264 * fragment or to a full block, the on-disk location may change (if the
1265 * previous fragment could not simply be extended). In this case, the old
1266 * fragment must be de-allocated, but not until after the inode's pointer has
1267 * been updated. In most cases, this is handled by later procedures, which
1268 * will construct a "freefrag" structure to be added to the workitem queue
1269 * when the inode update is complete (or obsolete). The main exception to
1270 * this is when an allocation occurs while a pending allocation dependency
1271 * (for the same block pointer) remains. This case is handled in the main
1272 * allocation dependency setup procedure by immediately freeing the
1273 * unreferenced fragments.
1274 */
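/*
 * Illustrative sketch (not part of the original code): the two-phase
 * release of a displaced fragment described above.  The old location is
 * only recorded while the inode update is pending; it is handed to the
 * worklist for actual release once the updated inode is safely on disk.
 * All names here are hypothetical; the real work is done by
 * newfreefrag(), handle_workitem_freefrag() and the routines below.
 */
#if 0
struct freefrag_sketch {
	int	blkno;		/* old location to release */
	long	size;		/* size of the old fragment */
	int	queued;		/* has been handed to the worklist */
};

/*
 * Phase 1: while the inode update is pending, the displaced fragment is
 * only recorded; nothing is returned to the freemap yet.
 */
static void
record_displaced_fragment_sketch(struct freefrag_sketch *ff,
    int oldblkno, long oldsize)
{
	ff->blkno = oldblkno;
	ff->size = oldsize;
	ff->queued = 0;
}

/*
 * Phase 2: once the inode (with its new pointer) has been written, the
 * fragment can be released in the background by the worklist.
 */
static void
inode_update_complete_sketch(struct freefrag_sketch *ff,
    void (*add_to_worklist_fn)(struct freefrag_sketch *))
{
	if (!ff->queued) {
		ff->queued = 1;
		(*add_to_worklist_fn)(ff);	/* released later, as in handle_workitem_freefrag() */
	}
}
#endif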
1275void
1276softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1277 struct inode *ip; /* inode to which block is being added */
1278 ufs_lbn_t lbn; /* block pointer within inode */
1279 ufs_daddr_t newblkno; /* disk block number being added */
1280 ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */
1281 long newsize; /* size of new block */
1282	long oldsize;		/* size of old block */
1283 struct buf *bp; /* bp for allocated block */
1284{
1285 struct allocdirect *adp, *oldadp;
1286 struct allocdirectlst *adphead;
1287 struct bmsafemap *bmsafemap;
1288 struct inodedep *inodedep;
1289 struct pagedep *pagedep;
1290 struct newblk *newblk;
1291
1292 MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1293 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1294 adp->ad_list.wk_type = D_ALLOCDIRECT;
1295 adp->ad_lbn = lbn;
1296 adp->ad_newblkno = newblkno;
1297 adp->ad_oldblkno = oldblkno;
1298 adp->ad_newsize = newsize;
1299 adp->ad_oldsize = oldsize;
1300 adp->ad_state = ATTACHED;
1301 if (newblkno == oldblkno)
1302 adp->ad_freefrag = NULL;
1303 else
1304 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1305
1306 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1307 panic("softdep_setup_allocdirect: lost block");
1308
1309 ACQUIRE_LOCK(&lk);
1310 inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1311 adp->ad_inodedep = inodedep;
1312
1313 if (newblk->nb_state == DEPCOMPLETE) {
1314 adp->ad_state |= DEPCOMPLETE;
1315 adp->ad_buf = NULL;
1316 } else {
1317 bmsafemap = newblk->nb_bmsafemap;
1318 adp->ad_buf = bmsafemap->sm_buf;
1319 LIST_REMOVE(newblk, nb_deps);
1320 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1321 }
1322 LIST_REMOVE(newblk, nb_hash);
1323 FREE(newblk, M_NEWBLK);
1324
1325 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1326 if (lbn >= NDADDR) {
1327 /* allocating an indirect block */
1328 if (oldblkno != 0) {
1329 FREE_LOCK(&lk);
1330 panic("softdep_setup_allocdirect: non-zero indir");
1331 }
1332 } else {
1333 /*
1334 * Allocating a direct block.
1335 *
1336 * If we are allocating a directory block, then we must
1337 * allocate an associated pagedep to track additions and
1338 * deletions.
1339 */
1340 if ((ip->i_mode & IFMT) == IFDIR &&
1341 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1342 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1343 }
1344 /*
1345 * The list of allocdirects must be kept sorted in ascending
1346 * order so that the rollback routines can quickly determine the
1347 * first uncommitted block (the size of the file stored on disk
1348 * ends at the end of the lowest committed fragment, or if there
1349 * are no fragments, at the end of the highest committed block).
1350 * Since files generally grow, the typical case is that the new
1351 * block is to be added at the end of the list. We speed up this
1352 * common case by checking against the last allocdirect in the
1353 * list before laboriously traversing the list looking for the
1354 * insertion point.
1355 */
1356 adphead = &inodedep->id_newinoupdt;
1357 oldadp = TAILQ_LAST(adphead, allocdirectlst);
1358 if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1359 /* insert at end of list */
1360 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1361 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1362 allocdirect_merge(adphead, adp, oldadp);
1363 FREE_LOCK(&lk);
1364 return;
1365 }
1366 TAILQ_FOREACH(oldadp, adphead, ad_next) {
1367 if (oldadp->ad_lbn >= lbn)
1368 break;
1369 }
1370 if (oldadp == NULL) {
1371 FREE_LOCK(&lk);
1372 panic("softdep_setup_allocdirect: lost entry");
1373 }
1374 /* insert in middle of list */
1375 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1376 if (oldadp->ad_lbn == lbn)
1377 allocdirect_merge(adphead, adp, oldadp);
1378 FREE_LOCK(&lk);
1379}
1380
1381/*
1382 * Replace an old allocdirect dependency with a newer one.
1383 * This routine must be called with splbio interrupts blocked.
1384 */
1385static void
1386allocdirect_merge(adphead, newadp, oldadp)
1387 struct allocdirectlst *adphead; /* head of list holding allocdirects */
1388 struct allocdirect *newadp; /* allocdirect being added */
1389 struct allocdirect *oldadp; /* existing allocdirect being checked */
1390{
1391 struct freefrag *freefrag;
1392
1393#ifdef DEBUG
1394 if (lk.lkt_held == -1)
1395 panic("allocdirect_merge: lock not held");
1396#endif
1397 if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1398 newadp->ad_oldsize != oldadp->ad_newsize ||
1399 newadp->ad_lbn >= NDADDR) {
1400 FREE_LOCK(&lk);
1401		panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
1402 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1403 NDADDR);
1404 }
1405 newadp->ad_oldblkno = oldadp->ad_oldblkno;
1406 newadp->ad_oldsize = oldadp->ad_oldsize;
1407 /*
1408 * If the old dependency had a fragment to free or had never
1409 * previously had a block allocated, then the new dependency
1410 * can immediately post its freefrag and adopt the old freefrag.
1411 * This action is done by swapping the freefrag dependencies.
1412 * The new dependency gains the old one's freefrag, and the
1413 * old one gets the new one and then immediately puts it on
1414 * the worklist when it is freed by free_allocdirect. It is
1415 * not possible to do this swap when the old dependency had a
1416 * non-zero size but no previous fragment to free. This condition
1417 * arises when the new block is an extension of the old block.
1418 * Here, the first part of the fragment allocated to the new
1419 * dependency is part of the block currently claimed on disk by
1420 * the old dependency, so it cannot legitimately be freed until the
1421 * conditions for the new dependency are fulfilled.
1422 */
1423 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1424 freefrag = newadp->ad_freefrag;
1425 newadp->ad_freefrag = oldadp->ad_freefrag;
1426 oldadp->ad_freefrag = freefrag;
1427 }
1428 free_allocdirect(adphead, oldadp, 0);
1429}
1430
1431/*
1432 * Allocate a new freefrag structure if needed.
1433 */
1434static struct freefrag *
1435newfreefrag(ip, blkno, size)
1436 struct inode *ip;
1437 ufs_daddr_t blkno;
1438 long size;
1439{
1440 struct freefrag *freefrag;
1441 struct fs *fs;
1442
1443 if (blkno == 0)
1444 return (NULL);
1445 fs = ip->i_fs;
1446 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1447 panic("newfreefrag: frag size");
1448 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1449 M_FREEFRAG, M_SOFTDEP_FLAGS);
1450 freefrag->ff_list.wk_type = D_FREEFRAG;
1451 freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
1452 freefrag->ff_inum = ip->i_number;
1453 freefrag->ff_mnt = ITOV(ip)->v_mount;
1454 freefrag->ff_devvp = ip->i_devvp;
1455 freefrag->ff_blkno = blkno;
1456 freefrag->ff_fragsize = size;
1457 return (freefrag);
1458}
1459
1460/*
1461 * This workitem de-allocates fragments that were replaced during
1462 * file block allocation.
1463 */
1464static void
1465handle_workitem_freefrag(freefrag)
1466 struct freefrag *freefrag;
1467{
1468 struct inode tip;
1469
1470 tip.i_vnode = NULL;
1471 tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
1472 tip.i_devvp = freefrag->ff_devvp;
1473 tip.i_dev = freefrag->ff_devvp->v_rdev;
1474 tip.i_number = freefrag->ff_inum;
1475 tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */
1476 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1477 FREE(freefrag, M_FREEFRAG);
1478}
1479
1480/*
1481 * Indirect block allocation dependencies.
1482 *
1483 * The same dependencies that exist for a direct block also exist when
1484 * a new block is allocated and pointed to by an entry in a block of
1485 * indirect pointers. The undo/redo states described above are also
1486 * used here. Because an indirect block contains many pointers that
1487 * may have dependencies, a second copy of the entire in-memory indirect
1488 * block is kept. The buffer cache copy is always completely up-to-date.
1489 * The second copy, which is used only as a source for disk writes,
1490 * contains only the safe pointers (i.e., those that have no remaining
1491 * update dependencies). The second copy is freed when all pointers
1492 * are safe. The cache is not allowed to replace indirect blocks with
1493 * pending update dependencies. If a buffer containing an indirect
1494 * block with dependencies is written, these routines will mark it
1495 * dirty again. It can only be successfully written once all the
1496 * dependencies are removed. The ffs_fsync routine in conjunction with
1497 * softdep_sync_metadata work together to get all the dependencies
1498 * removed so that a file can be successfully written to disk. Three
1499 * procedures are used when setting up indirect block pointer
1500 * dependencies. The division is necessary because of the organization
1501 * of the "balloc" routine and because of the distinction between file
1502 * pages and file metadata blocks.
1503 */
1504
1505/*
1506 * Allocate a new allocindir structure.
1507 */
1508static struct allocindir *
1509newallocindir(ip, ptrno, newblkno, oldblkno)
1510 struct inode *ip; /* inode for file being extended */
1511 int ptrno; /* offset of pointer in indirect block */
1512 ufs_daddr_t newblkno; /* disk block number being added */
1513 ufs_daddr_t oldblkno; /* previous block number, 0 if none */
1514{
1515 struct allocindir *aip;
1516
1517 MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1518 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1519 aip->ai_list.wk_type = D_ALLOCINDIR;
1520 aip->ai_state = ATTACHED;
1521 aip->ai_offset = ptrno;
1522 aip->ai_newblkno = newblkno;
1523 aip->ai_oldblkno = oldblkno;
1524 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1525 return (aip);
1526}
1527
1528/*
1529 * Called just before setting an indirect block pointer
1530 * to a newly allocated file page.
1531 */
1532void
1533softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1534 struct inode *ip; /* inode for file being extended */
1535 ufs_lbn_t lbn; /* allocated block number within file */
1536 struct buf *bp; /* buffer with indirect blk referencing page */
1537 int ptrno; /* offset of pointer in indirect block */
1538 ufs_daddr_t newblkno; /* disk block number being added */
1539 ufs_daddr_t oldblkno; /* previous block number, 0 if none */
1540 struct buf *nbp; /* buffer holding allocated page */
1541{
1542 struct allocindir *aip;
1543 struct pagedep *pagedep;
1544
1545 aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1546 ACQUIRE_LOCK(&lk);
1547 /*
1548 * If we are allocating a directory page, then we must
1549 * allocate an associated pagedep to track additions and
1550 * deletions.
1551 */
1552 if ((ip->i_mode & IFMT) == IFDIR &&
1553 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1554 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1555 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1556 FREE_LOCK(&lk);
1557 setup_allocindir_phase2(bp, ip, aip);
1558}
1559
1560/*
1561 * Called just before setting an indirect block pointer to a
1562 * newly allocated indirect block.
1563 */
1564void
1565softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1566 struct buf *nbp; /* newly allocated indirect block */
1567 struct inode *ip; /* inode for file being extended */
1568 struct buf *bp; /* indirect block referencing allocated block */
1569 int ptrno; /* offset of pointer in indirect block */
1570 ufs_daddr_t newblkno; /* disk block number being added */
1571{
1572 struct allocindir *aip;
1573
1574 aip = newallocindir(ip, ptrno, newblkno, 0);
1575 ACQUIRE_LOCK(&lk);
1576 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1577 FREE_LOCK(&lk);
1578 setup_allocindir_phase2(bp, ip, aip);
1579}
1580
1581/*
1582 * Called to finish the allocation of the "aip" allocated
1583 * by one of the two routines above.
1584 */
1585static void
1586setup_allocindir_phase2(bp, ip, aip)
1587 struct buf *bp; /* in-memory copy of the indirect block */
1588 struct inode *ip; /* inode for file being extended */
1589 struct allocindir *aip; /* allocindir allocated by the above routines */
1590{
1591 struct worklist *wk;
1592 struct indirdep *indirdep, *newindirdep;
1593 struct bmsafemap *bmsafemap;
1594 struct allocindir *oldaip;
1595 struct freefrag *freefrag;
1596 struct newblk *newblk;
1597
1598 if (bp->b_lblkno >= 0)
1599 panic("setup_allocindir_phase2: not indir blk");
1600 for (indirdep = NULL, newindirdep = NULL; ; ) {
1601 ACQUIRE_LOCK(&lk);
1602 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1603 if (wk->wk_type != D_INDIRDEP)
1604 continue;
1605 indirdep = WK_INDIRDEP(wk);
1606 break;
1607 }
1608 if (indirdep == NULL && newindirdep) {
1609 indirdep = newindirdep;
1610 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1611 newindirdep = NULL;
1612 }
1613 FREE_LOCK(&lk);
1614 if (indirdep) {
1615 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1616 &newblk) == 0)
1617 panic("setup_allocindir: lost block");
1618 ACQUIRE_LOCK(&lk);
1619 if (newblk->nb_state == DEPCOMPLETE) {
1620 aip->ai_state |= DEPCOMPLETE;
1621 aip->ai_buf = NULL;
1622 } else {
1623 bmsafemap = newblk->nb_bmsafemap;
1624 aip->ai_buf = bmsafemap->sm_buf;
1625 LIST_REMOVE(newblk, nb_deps);
1626 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1627 aip, ai_deps);
1628 }
1629 LIST_REMOVE(newblk, nb_hash);
1630 FREE(newblk, M_NEWBLK);
1631 aip->ai_indirdep = indirdep;
1632 /*
1633 * Check to see if there is an existing dependency
1634 * for this block. If there is, merge the old
1635 * dependency into the new one.
1636 */
1637 if (aip->ai_oldblkno == 0)
1638 oldaip = NULL;
1639 else
1640
1641 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1642 if (oldaip->ai_offset == aip->ai_offset)
1643 break;
1644 freefrag = NULL;
1645 if (oldaip != NULL) {
1646 if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1647 FREE_LOCK(&lk);
1648 panic("setup_allocindir_phase2: blkno");
1649 }
1650 aip->ai_oldblkno = oldaip->ai_oldblkno;
1651 freefrag = aip->ai_freefrag;
1652 aip->ai_freefrag = oldaip->ai_freefrag;
1653 oldaip->ai_freefrag = NULL;
1654 free_allocindir(oldaip, NULL);
1655 }
1656 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1657 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1658 [aip->ai_offset] = aip->ai_oldblkno;
1659 FREE_LOCK(&lk);
1660 if (freefrag != NULL)
1661 handle_workitem_freefrag(freefrag);
1662 }
1663 if (newindirdep) {
1664 if (indirdep->ir_savebp != NULL)
1665 brelse(newindirdep->ir_savebp);
1666 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1667 }
1668 if (indirdep)
1669 break;
1670 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1671 M_INDIRDEP, M_SOFTDEP_FLAGS);
1672 newindirdep->ir_list.wk_type = D_INDIRDEP;
1673 newindirdep->ir_state = ATTACHED;
1674 LIST_INIT(&newindirdep->ir_deplisthd);
1675 LIST_INIT(&newindirdep->ir_donehd);
1676 if (bp->b_blkno == bp->b_lblkno)
1677 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &bp->b_blkno, NULL, NULL);
1678 newindirdep->ir_savebp =
1679 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1680 BUF_KERNPROC(newindirdep->ir_savebp);
1681 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1682 }
1683}
1684
1685/*
1686 * Block de-allocation dependencies.
1687 *
1688 * When blocks are de-allocated, the on-disk pointers must be nullified before
1689 * the blocks are made available for use by other files. (The true
1690 * requirement is that old pointers must be nullified before new on-disk
1691 * pointers are set. We chose this slightly more stringent requirement to
1692 * reduce complexity.) Our implementation handles this dependency by updating
1693 * the inode (or indirect block) appropriately but delaying the actual block
1694 * de-allocation (i.e., freemap and free space count manipulation) until
1695 * after the updated versions reach stable storage. After the disk is
1696 * updated, the blocks can be safely de-allocated whenever it is convenient.
1697 * This implementation handles only the common case of reducing a file's
1698 * length to zero. Other cases are handled by the conventional synchronous
1699 * write approach.
1700 *
1701 * The ffs implementation with which we worked double-checks
1702 * the state of the block pointers and file size as it reduces
1703 * a file's length. Some of this code is replicated here in our
1704 * soft updates implementation. The freeblks->fb_chkcnt field is
1705 * used to transfer a part of this information to the procedure
1706 * that eventually de-allocates the blocks.
1707 *
1708 * This routine should be called from the routine that shortens
1709 * a file's length, before the inode's size or block pointers
1710 * are modified. It will save the block pointer information for
1711 * later release and zero the inode so that the calling routine
1712 * can release it.
1713 */
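/*
 * A minimal sketch of the ordering rule stated at the top of this
 * comment (not part of ffs_softdep.c; write_inode() and
 * freemap_release() are hypothetical stand-ins for the real buffer
 * and cylinder-group bitmap code):
 *
 *	saved = ip->ptr;		remember the block being released
 *	ip->ptr = 0;			nullify the pointer in memory
 *	write_inode(ip);		the zeroed pointer must reach stable storage
 *	freemap_release(saved);		only now may the block be reallocated
 *
 * Freeing the block before the zeroed pointer is on disk could leave a
 * crashed filesystem with an inode pointing at a block already handed
 * to another file; the routine below instead defers the free by
 * queueing a freeblks work item until the inode write completes.
 */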
1714void
1715softdep_setup_freeblocks(ip, length)
1716 struct inode *ip; /* The inode whose length is to be reduced */
1717 off_t length; /* The new length for the file */
1718{
1719 struct freeblks *freeblks;
1720 struct inodedep *inodedep;
1721 struct allocdirect *adp;
1722 struct vnode *vp;
1723 struct buf *bp;
1724 struct fs *fs;
1725 int i, delay, error;
1726
1727 fs = ip->i_fs;
1728 if (length != 0)
1728 panic("softde_setup_freeblocks: non-zero length");
1729 panic("softdep_setup_freeblocks: non-zero length");
1729 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1730 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
1731 freeblks->fb_list.wk_type = D_FREEBLKS;
1732 freeblks->fb_uid = ip->i_uid;
1733 freeblks->fb_previousinum = ip->i_number;
1734 freeblks->fb_devvp = ip->i_devvp;
1735 freeblks->fb_mnt = ITOV(ip)->v_mount;
1736 freeblks->fb_oldsize = ip->i_size;
1737 freeblks->fb_newsize = length;
1738 freeblks->fb_chkcnt = ip->i_blocks;
1739 for (i = 0; i < NDADDR; i++) {
1740 freeblks->fb_dblks[i] = ip->i_db[i];
1741 ip->i_db[i] = 0;
1742 }
1743 for (i = 0; i < NIADDR; i++) {
1744 freeblks->fb_iblks[i] = ip->i_ib[i];
1745 ip->i_ib[i] = 0;
1746 }
1747 ip->i_blocks = 0;
1748 ip->i_size = 0;
1749 /*
1730 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1731 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
1732 freeblks->fb_list.wk_type = D_FREEBLKS;
1733 freeblks->fb_uid = ip->i_uid;
1734 freeblks->fb_previousinum = ip->i_number;
1735 freeblks->fb_devvp = ip->i_devvp;
1736 freeblks->fb_mnt = ITOV(ip)->v_mount;
1737 freeblks->fb_oldsize = ip->i_size;
1738 freeblks->fb_newsize = length;
1739 freeblks->fb_chkcnt = ip->i_blocks;
1740 for (i = 0; i < NDADDR; i++) {
1741 freeblks->fb_dblks[i] = ip->i_db[i];
1742 ip->i_db[i] = 0;
1743 }
1744 for (i = 0; i < NIADDR; i++) {
1745 freeblks->fb_iblks[i] = ip->i_ib[i];
1746 ip->i_ib[i] = 0;
1747 }
1748 ip->i_blocks = 0;
1749 ip->i_size = 0;
1750 /*
1751 * If the file was removed, then the space being freed was
1752 * accounted for then (see softdep_filereleased()). If the
1753 * file is merely being truncated, then we account for it now.
1754 */
1755 if ((ip->i_flag & IN_SPACECOUNTED) == 0)
1756 fs->fs_pendingblocks += freeblks->fb_chkcnt;
1757 /*
1750 * Push the zero'ed inode to its disk buffer so that we are free
1751 * to delete its dependencies below. Once the dependencies are gone
1752 * the buffer can be safely released.
1753 */
1754 if ((error = bread(ip->i_devvp,
1755 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1756 (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1757 softdep_error("softdep_setup_freeblocks", error);
1758 *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1759 ip->i_din;
1760 /*
1761 * Find and eliminate any inode dependencies.
1762 */
1763 ACQUIRE_LOCK(&lk);
1764 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1765 if ((inodedep->id_state & IOSTARTED) != 0) {
1766 FREE_LOCK(&lk);
1767 panic("softdep_setup_freeblocks: inode busy");
1768 }
1769 /*
1770 * Add the freeblks structure to the list of operations that
1771 * must await the zero'ed inode being written to disk. If we
1772 * still have a bitmap dependency (delay == 0), then the inode
1773 * has never been written to disk, so we can process the
1774 * freeblks below once we have deleted the dependencies.
1775 */
1776 delay = (inodedep->id_state & DEPCOMPLETE);
1777 if (delay)
1778 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1779 /*
1780 * Because the file length has been truncated to zero, any
1781 * pending block allocation dependency structures associated
1782 * with this inode are obsolete and can simply be de-allocated.
1783 * We must first merge the two dependency lists to get rid of
1784 * any duplicate freefrag structures, then purge the merged list.
1785 * If we still have a bitmap dependency, then the inode has never
1786 * been written to disk, so we can free any fragments without delay.
1787 */
1788 merge_inode_lists(inodedep);
1789 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1790 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
1791 FREE_LOCK(&lk);
1792 bdwrite(bp);
1793 /*
1794 * We must wait for any I/O in progress to finish so that
1795 * all potential buffers on the dirty list will be visible.
1796 * Once they are all there, walk the list and get rid of
1797 * any dependencies.
1798 */
1799 vp = ITOV(ip);
1800 ACQUIRE_LOCK(&lk);
1801 drain_output(vp, 1);
1802 while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1803 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1804 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1805 deallocate_dependencies(bp, inodedep);
1806 bp->b_flags |= B_INVAL | B_NOCACHE;
1807 FREE_LOCK(&lk);
1808 brelse(bp);
1809 ACQUIRE_LOCK(&lk);
1810 }
1811 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
1812 (void) free_inodedep(inodedep);
1813 FREE_LOCK(&lk);
1814 /*
1815 * If the inode has never been written to disk (delay == 0),
1816 * then we can process the freeblks now that we have deleted
1817 * the dependencies.
1818 */
1819 if (!delay)
1820 handle_workitem_freeblocks(freeblks, 0);
1821}
1822
1823/*
1824 * Reclaim any dependency structures from a buffer that is about to
1825 * be reallocated to a new vnode. The buffer must be locked, thus,
1826 * no I/O completion operations can occur while we are manipulating
1827 * its associated dependencies. The mutex is held so that other I/O's
1828 * associated with related dependencies do not occur.
1829 */
1830static void
1831deallocate_dependencies(bp, inodedep)
1832 struct buf *bp;
1833 struct inodedep *inodedep;
1834{
1835 struct worklist *wk;
1836 struct indirdep *indirdep;
1837 struct allocindir *aip;
1838 struct pagedep *pagedep;
1839 struct dirrem *dirrem;
1840 struct diradd *dap;
1841 int i;
1842
1843 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1844 switch (wk->wk_type) {
1845
1846 case D_INDIRDEP:
1847 indirdep = WK_INDIRDEP(wk);
1848 /*
1849 * None of the indirect pointers will ever be visible,
1850 * so they can simply be tossed. GOINGAWAY ensures
1851 * that allocated pointers will be saved in the buffer
1852 * cache until they are freed. Note that they will
1853 * only be able to be found by their physical address
1854 * since the inode mapping the logical address will
1855 * be gone. The save buffer used for the safe copy
1856 * was allocated in setup_allocindir_phase2 using
1857 * the physical address so it could be used for this
1858 * purpose. Hence we swap the safe copy with the real
1859 * copy, allowing the safe copy to be freed and holding
1860 * on to the real copy for later use in indir_trunc.
1861 */
1862 if (indirdep->ir_state & GOINGAWAY) {
1863 FREE_LOCK(&lk);
1864 panic("deallocate_dependencies: already gone");
1865 }
1866 indirdep->ir_state |= GOINGAWAY;
1867 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1868 free_allocindir(aip, inodedep);
1869 if (bp->b_lblkno >= 0 ||
1870 bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
1871 FREE_LOCK(&lk);
1872 panic("deallocate_dependencies: not indir");
1873 }
1874 bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1875 bp->b_bcount);
1876 WORKLIST_REMOVE(wk);
1877 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1878 continue;
1879
1880 case D_PAGEDEP:
1881 pagedep = WK_PAGEDEP(wk);
1882 /*
1883 * None of the directory additions will ever be
1884 * visible, so they can simply be tossed.
1885 */
1886 for (i = 0; i < DAHASHSZ; i++)
1887 while ((dap =
1888 LIST_FIRST(&pagedep->pd_diraddhd[i])))
1889 free_diradd(dap);
1890 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1891 free_diradd(dap);
1892 /*
1893 * Copy any directory remove dependencies to the list
1894 * to be processed after the zero'ed inode is written.
1895 * If the inode has already been written, then they
1896 * can be dumped directly onto the work list.
1897 */
1898 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
1899 LIST_REMOVE(dirrem, dm_next);
1900 dirrem->dm_dirinum = pagedep->pd_ino;
1901 if (inodedep == NULL ||
1902 (inodedep->id_state & ALLCOMPLETE) ==
1903 ALLCOMPLETE)
1904 add_to_worklist(&dirrem->dm_list);
1905 else
1906 WORKLIST_INSERT(&inodedep->id_bufwait,
1907 &dirrem->dm_list);
1908 }
1909 WORKLIST_REMOVE(&pagedep->pd_list);
1910 LIST_REMOVE(pagedep, pd_hash);
1911 WORKITEM_FREE(pagedep, D_PAGEDEP);
1912 continue;
1913
1914 case D_ALLOCINDIR:
1915 free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1916 continue;
1917
1918 case D_ALLOCDIRECT:
1919 case D_INODEDEP:
1920 FREE_LOCK(&lk);
1921 panic("deallocate_dependencies: Unexpected type %s",
1922 TYPENAME(wk->wk_type));
1923 /* NOTREACHED */
1924
1925 default:
1926 FREE_LOCK(&lk);
1927 panic("deallocate_dependencies: Unknown type %s",
1928 TYPENAME(wk->wk_type));
1929 /* NOTREACHED */
1930 }
1931 }
1932}
1933
1934/*
1935 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1936 * This routine must be called with splbio interrupts blocked.
1937 */
1938static void
1939free_allocdirect(adphead, adp, delay)
1940 struct allocdirectlst *adphead;
1941 struct allocdirect *adp;
1942 int delay;
1943{
1944
1945#ifdef DEBUG
1946 if (lk.lkt_held == -1)
1947 panic("free_allocdirect: lock not held");
1948#endif
1949 if ((adp->ad_state & DEPCOMPLETE) == 0)
1950 LIST_REMOVE(adp, ad_deps);
1951 TAILQ_REMOVE(adphead, adp, ad_next);
1952 if ((adp->ad_state & COMPLETE) == 0)
1953 WORKLIST_REMOVE(&adp->ad_list);
1954 if (adp->ad_freefrag != NULL) {
1955 if (delay)
1956 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1957 &adp->ad_freefrag->ff_list);
1958 else
1959 add_to_worklist(&adp->ad_freefrag->ff_list);
1960 }
1961 WORKITEM_FREE(adp, D_ALLOCDIRECT);
1962}
1963
1964/*
1965 * Prepare an inode to be freed. The actual free operation is not
1966 * done until the zero'ed inode has been written to disk.
1967 */
1968void
1969softdep_freefile(pvp, ino, mode)
1970 struct vnode *pvp;
1971 ino_t ino;
1972 int mode;
1973{
1974 struct inode *ip = VTOI(pvp);
1975 struct inodedep *inodedep;
1976 struct freefile *freefile;
1977
1978 /*
1979 * This sets up the inode de-allocation dependency.
1980 */
1981 MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1982 M_FREEFILE, M_SOFTDEP_FLAGS);
1983 freefile->fx_list.wk_type = D_FREEFILE;
1984 freefile->fx_list.wk_state = 0;
1985 freefile->fx_mode = mode;
1986 freefile->fx_oldinum = ino;
1987 freefile->fx_devvp = ip->i_devvp;
1988 freefile->fx_mnt = ITOV(ip)->v_mount;
1758 * Push the zero'ed inode to its disk buffer so that we are free
1759 * to delete its dependencies below. Once the dependencies are gone
1760 * the buffer can be safely released.
1761 */
1762 if ((error = bread(ip->i_devvp,
1763 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1764 (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1765 softdep_error("softdep_setup_freeblocks", error);
1766 *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1767 ip->i_din;
1768 /*
1769 * Find and eliminate any inode dependencies.
1770 */
1771 ACQUIRE_LOCK(&lk);
1772 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1773 if ((inodedep->id_state & IOSTARTED) != 0) {
1774 FREE_LOCK(&lk);
1775 panic("softdep_setup_freeblocks: inode busy");
1776 }
1777 /*
1778 * Add the freeblks structure to the list of operations that
1779 * must await the zero'ed inode being written to disk. If we
1780 * still have a bitmap dependency (delay == 0), then the inode
1781 * has never been written to disk, so we can process the
1782 * freeblks below once we have deleted the dependencies.
1783 */
1784 delay = (inodedep->id_state & DEPCOMPLETE);
1785 if (delay)
1786 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1787 /*
1788 * Because the file length has been truncated to zero, any
1789 * pending block allocation dependency structures associated
1790 * with this inode are obsolete and can simply be de-allocated.
1791 * We must first merge the two dependency lists to get rid of
1792 * any duplicate freefrag structures, then purge the merged list.
1793 * If we still have a bitmap dependency, then the inode has never
1794 * been written to disk, so we can free any fragments without delay.
1795 */
1796 merge_inode_lists(inodedep);
1797 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1798 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
1799 FREE_LOCK(&lk);
1800 bdwrite(bp);
1801 /*
1802 * We must wait for any I/O in progress to finish so that
1803 * all potential buffers on the dirty list will be visible.
1804 * Once they are all there, walk the list and get rid of
1805 * any dependencies.
1806 */
1807 vp = ITOV(ip);
1808 ACQUIRE_LOCK(&lk);
1809 drain_output(vp, 1);
1810 while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1811 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1812 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1813 deallocate_dependencies(bp, inodedep);
1814 bp->b_flags |= B_INVAL | B_NOCACHE;
1815 FREE_LOCK(&lk);
1816 brelse(bp);
1817 ACQUIRE_LOCK(&lk);
1818 }
1819 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
1820 (void) free_inodedep(inodedep);
1821 FREE_LOCK(&lk);
1822 /*
1823 * If the inode has never been written to disk (delay == 0),
1824 * then we can process the freeblks now that we have deleted
1825 * the dependencies.
1826 */
1827 if (!delay)
1828 handle_workitem_freeblocks(freeblks, 0);
1829}
1830
1831/*
1832 * Reclaim any dependency structures from a buffer that is about to
1833 * be reallocated to a new vnode. The buffer must be locked, thus,
1834 * no I/O completion operations can occur while we are manipulating
1835 * its associated dependencies. The mutex is held so that other I/O's
1836 * associated with related dependencies do not occur.
1837 */
1838static void
1839deallocate_dependencies(bp, inodedep)
1840 struct buf *bp;
1841 struct inodedep *inodedep;
1842{
1843 struct worklist *wk;
1844 struct indirdep *indirdep;
1845 struct allocindir *aip;
1846 struct pagedep *pagedep;
1847 struct dirrem *dirrem;
1848 struct diradd *dap;
1849 int i;
1850
1851 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1852 switch (wk->wk_type) {
1853
1854 case D_INDIRDEP:
1855 indirdep = WK_INDIRDEP(wk);
1856 /*
1857 * None of the indirect pointers will ever be visible,
1858 * so they can simply be tossed. GOINGAWAY ensures
1859 * that allocated pointers will be saved in the buffer
1860 * cache until they are freed. Note that they will
1861 * only be able to be found by their physical address
1862 * since the inode mapping the logical address will
1863 * be gone. The save buffer used for the safe copy
1864 * was allocated in setup_allocindir_phase2 using
1865 * the physical address so it could be used for this
1866 * purpose. Hence we swap the safe copy with the real
1867 * copy, allowing the safe copy to be freed and holding
1868 * on to the real copy for later use in indir_trunc.
1869 */
1870 if (indirdep->ir_state & GOINGAWAY) {
1871 FREE_LOCK(&lk);
1872 panic("deallocate_dependencies: already gone");
1873 }
1874 indirdep->ir_state |= GOINGAWAY;
1875 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1876 free_allocindir(aip, inodedep);
1877 if (bp->b_lblkno >= 0 ||
1878 bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
1879 FREE_LOCK(&lk);
1880 panic("deallocate_dependencies: not indir");
1881 }
1882 bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1883 bp->b_bcount);
1884 WORKLIST_REMOVE(wk);
1885 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1886 continue;
1887
1888 case D_PAGEDEP:
1889 pagedep = WK_PAGEDEP(wk);
1890 /*
1891 * None of the directory additions will ever be
1892 * visible, so they can simply be tossed.
1893 */
1894 for (i = 0; i < DAHASHSZ; i++)
1895 while ((dap =
1896 LIST_FIRST(&pagedep->pd_diraddhd[i])))
1897 free_diradd(dap);
1898 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1899 free_diradd(dap);
1900 /*
1901 * Copy any directory remove dependencies to the list
1902 * to be processed after the zero'ed inode is written.
1903 * If the inode has already been written, then they
1904 * can be dumped directly onto the work list.
1905 */
1906 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
1907 LIST_REMOVE(dirrem, dm_next);
1908 dirrem->dm_dirinum = pagedep->pd_ino;
1909 if (inodedep == NULL ||
1910 (inodedep->id_state & ALLCOMPLETE) ==
1911 ALLCOMPLETE)
1912 add_to_worklist(&dirrem->dm_list);
1913 else
1914 WORKLIST_INSERT(&inodedep->id_bufwait,
1915 &dirrem->dm_list);
1916 }
1917 WORKLIST_REMOVE(&pagedep->pd_list);
1918 LIST_REMOVE(pagedep, pd_hash);
1919 WORKITEM_FREE(pagedep, D_PAGEDEP);
1920 continue;
1921
1922 case D_ALLOCINDIR:
1923 free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1924 continue;
1925
1926 case D_ALLOCDIRECT:
1927 case D_INODEDEP:
1928 FREE_LOCK(&lk);
1929 panic("deallocate_dependencies: Unexpected type %s",
1930 TYPENAME(wk->wk_type));
1931 /* NOTREACHED */
1932
1933 default:
1934 FREE_LOCK(&lk);
1935 panic("deallocate_dependencies: Unknown type %s",
1936 TYPENAME(wk->wk_type));
1937 /* NOTREACHED */
1938 }
1939 }
1940}
1941
1942/*
1943 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1944 * This routine must be called with splbio interrupts blocked.
1945 */
1946static void
1947free_allocdirect(adphead, adp, delay)
1948 struct allocdirectlst *adphead;
1949 struct allocdirect *adp;
1950 int delay;
1951{
1952
1953#ifdef DEBUG
1954 if (lk.lkt_held == -1)
1955 panic("free_allocdirect: lock not held");
1956#endif
1957 if ((adp->ad_state & DEPCOMPLETE) == 0)
1958 LIST_REMOVE(adp, ad_deps);
1959 TAILQ_REMOVE(adphead, adp, ad_next);
1960 if ((adp->ad_state & COMPLETE) == 0)
1961 WORKLIST_REMOVE(&adp->ad_list);
1962 if (adp->ad_freefrag != NULL) {
1963 if (delay)
1964 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1965 &adp->ad_freefrag->ff_list);
1966 else
1967 add_to_worklist(&adp->ad_freefrag->ff_list);
1968 }
1969 WORKITEM_FREE(adp, D_ALLOCDIRECT);
1970}
1971
1972/*
1973 * Prepare an inode to be freed. The actual free operation is not
1974 * done until the zero'ed inode has been written to disk.
1975 */
1976void
1977softdep_freefile(pvp, ino, mode)
1978 struct vnode *pvp;
1979 ino_t ino;
1980 int mode;
1981{
1982 struct inode *ip = VTOI(pvp);
1983 struct inodedep *inodedep;
1984 struct freefile *freefile;
1985
1986 /*
1987 * This sets up the inode de-allocation dependency.
1988 */
1989 MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1990 M_FREEFILE, M_SOFTDEP_FLAGS);
1991 freefile->fx_list.wk_type = D_FREEFILE;
1992 freefile->fx_list.wk_state = 0;
1993 freefile->fx_mode = mode;
1994 freefile->fx_oldinum = ino;
1995 freefile->fx_devvp = ip->i_devvp;
1996 freefile->fx_mnt = ITOV(ip)->v_mount;
1997 if ((ip->i_flag & IN_SPACECOUNTED) == 0)
1998 ip->i_fs->fs_pendinginodes += 1;
1989
1990 /*
1991 * If the inodedep does not exist, then the zero'ed inode has
1992 * been written to disk. If the allocated inode has never been
1993 * written to disk, then the on-disk inode is zero'ed. In either
1994 * case we can free the file immediately.
1995 */
1996 ACQUIRE_LOCK(&lk);
1997 if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
1998 check_inode_unwritten(inodedep)) {
1999 FREE_LOCK(&lk);
2000 handle_workitem_freefile(freefile);
2001 return;
2002 }
2003 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2004 FREE_LOCK(&lk);
2005}
2006
2007/*
2008 * Check to see if an inode has never been written to disk. If
2009 * so free the inodedep and return success, otherwise return failure.
2010 * This routine must be called with splbio interrupts blocked.
2011 *
2012 * If we still have a bitmap dependency, then the inode has never
2013 * been written to disk. Drop the dependency as it is no longer
2014 * necessary since the inode is being deallocated. We set the
2015 * ALLCOMPLETE flags since the bitmap now properly shows that the
2016 * inode is not allocated. Even if the inode is actively being
2017 * written, it has been rolled back to its zero'ed state, so we
2018 * are ensured that a zero inode is what is on the disk. For short
2019 * lived files, this change will usually result in removing all the
2020 * dependencies from the inode so that it can be freed immediately.
2021 */
2022static int
2023check_inode_unwritten(inodedep)
2024 struct inodedep *inodedep;
2025{
2026
2027 if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2028 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2029 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2030 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2031 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2032 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2033 inodedep->id_nlinkdelta != 0)
2034 return (0);
2035 inodedep->id_state |= ALLCOMPLETE;
2036 LIST_REMOVE(inodedep, id_deps);
2037 inodedep->id_buf = NULL;
2038 if (inodedep->id_state & ONWORKLIST)
2039 WORKLIST_REMOVE(&inodedep->id_list);
2040 if (inodedep->id_savedino != NULL) {
2041 FREE(inodedep->id_savedino, M_INODEDEP);
2042 inodedep->id_savedino = NULL;
2043 }
2044 if (free_inodedep(inodedep) == 0) {
2045 FREE_LOCK(&lk);
2046 panic("check_inode_unwritten: busy inode");
2047 }
2048 return (1);
2049}
2050
2051/*
2052 * Try to free an inodedep structure. Return 1 if it could be freed.
2053 */
2054static int
2055free_inodedep(inodedep)
2056 struct inodedep *inodedep;
2057{
2058
2059 if ((inodedep->id_state & ONWORKLIST) != 0 ||
2060 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2061 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2062 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2063 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2064 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2065 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2066 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
2067 return (0);
2068 LIST_REMOVE(inodedep, id_hash);
2069 WORKITEM_FREE(inodedep, D_INODEDEP);
2070 num_inodedep -= 1;
2071 return (1);
2072}
2073
2074/*
2075 * This workitem routine performs the block de-allocation.
2076 * The workitem is added to the pending list after the updated
2077 * inode block has been written to disk. As mentioned above,
2078 * checks regarding the number of blocks de-allocated (compared
2079 * to the number of blocks allocated for the file) are also
2080 * performed in this function.
2081 */
2082static void
2083handle_workitem_freeblocks(freeblks, flags)
2084 struct freeblks *freeblks;
2085 int flags;
2086{
2087 struct inode tip, *ip;
2088 struct vnode *vp;
2089 ufs_daddr_t bn;
2090 struct fs *fs;
2091 int i, level, bsize;
2092 long nblocks, blocksreleased = 0;
2093 int error, allerror = 0;
2094 ufs_lbn_t baselbns[NIADDR], tmpval;
2095
2096 tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2097 tip.i_number = freeblks->fb_previousinum;
2098 tip.i_devvp = freeblks->fb_devvp;
2099 tip.i_dev = freeblks->fb_devvp->v_rdev;
2100 tip.i_size = freeblks->fb_oldsize;
2101 tip.i_uid = freeblks->fb_uid;
2102 tip.i_vnode = NULL;
2103 tmpval = 1;
2104 baselbns[0] = NDADDR;
2105 for (i = 1; i < NIADDR; i++) {
2106 tmpval *= NINDIR(fs);
2107 baselbns[i] = baselbns[i - 1] + tmpval;
2108 }
2109 nblocks = btodb(fs->fs_bsize);
2110 blocksreleased = 0;
2111 /*
2112 * Indirect blocks first.
2113 */
2114 for (level = (NIADDR - 1); level >= 0; level--) {
2115 if ((bn = freeblks->fb_iblks[level]) == 0)
2116 continue;
2117 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2118 baselbns[level], &blocksreleased)) == 0)
2119 allerror = error;
2120 ffs_blkfree(&tip, bn, fs->fs_bsize);
1999
2000 /*
2001 * If the inodedep does not exist, then the zero'ed inode has
2002 * been written to disk. If the allocated inode has never been
2003 * written to disk, then the on-disk inode is zero'ed. In either
2004 * case we can free the file immediately.
2005 */
2006 ACQUIRE_LOCK(&lk);
2007 if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2008 check_inode_unwritten(inodedep)) {
2009 FREE_LOCK(&lk);
2010 handle_workitem_freefile(freefile);
2011 return;
2012 }
2013 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2014 FREE_LOCK(&lk);
2015}
2016
2017/*
2018 * Check to see if an inode has never been written to disk. If
2019 * so free the inodedep and return success, otherwise return failure.
2020 * This routine must be called with splbio interrupts blocked.
2021 *
2022 * If we still have a bitmap dependency, then the inode has never
2023 * been written to disk. Drop the dependency as it is no longer
2024 * necessary since the inode is being deallocated. We set the
2025 * ALLCOMPLETE flags since the bitmap now properly shows that the
2026 * inode is not allocated. Even if the inode is actively being
2027 * written, it has been rolled back to its zero'ed state, so we
2028 * are ensured that a zero inode is what is on the disk. For short
2029 * lived files, this change will usually result in removing all the
2030 * dependencies from the inode so that it can be freed immediately.
2031 */
2032static int
2033check_inode_unwritten(inodedep)
2034 struct inodedep *inodedep;
2035{
2036
2037 if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2038 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2039 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2040 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2041 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2042 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2043 inodedep->id_nlinkdelta != 0)
2044 return (0);
2045 inodedep->id_state |= ALLCOMPLETE;
2046 LIST_REMOVE(inodedep, id_deps);
2047 inodedep->id_buf = NULL;
2048 if (inodedep->id_state & ONWORKLIST)
2049 WORKLIST_REMOVE(&inodedep->id_list);
2050 if (inodedep->id_savedino != NULL) {
2051 FREE(inodedep->id_savedino, M_INODEDEP);
2052 inodedep->id_savedino = NULL;
2053 }
2054 if (free_inodedep(inodedep) == 0) {
2055 FREE_LOCK(&lk);
2056 panic("check_inode_unwritten: busy inode");
2057 }
2058 return (1);
2059}
2060
2061/*
2062 * Try to free an inodedep structure. Return 1 if it could be freed.
2063 */
2064static int
2065free_inodedep(inodedep)
2066 struct inodedep *inodedep;
2067{
2068
2069 if ((inodedep->id_state & ONWORKLIST) != 0 ||
2070 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2071 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2072 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2073 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2074 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2075 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2076 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
2077 return (0);
2078 LIST_REMOVE(inodedep, id_hash);
2079 WORKITEM_FREE(inodedep, D_INODEDEP);
2080 num_inodedep -= 1;
2081 return (1);
2082}
2083
2084/*
2085 * This workitem routine performs the block de-allocation.
2086 * The workitem is added to the pending list after the updated
2087 * inode block has been written to disk. As mentioned above,
2088 * checks regarding the number of blocks de-allocated (compared
2089 * to the number of blocks allocated for the file) are also
2090 * performed in this function.
2091 */
2092static void
2093handle_workitem_freeblocks(freeblks, flags)
2094 struct freeblks *freeblks;
2095 int flags;
2096{
2097 struct inode tip, *ip;
2098 struct vnode *vp;
2099 ufs_daddr_t bn;
2100 struct fs *fs;
2101 int i, level, bsize;
2102 long nblocks, blocksreleased = 0;
2103 int error, allerror = 0;
2104 ufs_lbn_t baselbns[NIADDR], tmpval;
2105
2106 tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2107 tip.i_number = freeblks->fb_previousinum;
2108 tip.i_devvp = freeblks->fb_devvp;
2109 tip.i_dev = freeblks->fb_devvp->v_rdev;
2110 tip.i_size = freeblks->fb_oldsize;
2111 tip.i_uid = freeblks->fb_uid;
2112 tip.i_vnode = NULL;
2113 tmpval = 1;
2114 baselbns[0] = NDADDR;
2115 for (i = 1; i < NIADDR; i++) {
2116 tmpval *= NINDIR(fs);
2117 baselbns[i] = baselbns[i - 1] + tmpval;
2118 }
2119 nblocks = btodb(fs->fs_bsize);
2120 blocksreleased = 0;
2121 /*
2122 * Indirect blocks first.
2123 */
2124 for (level = (NIADDR - 1); level >= 0; level--) {
2125 if ((bn = freeblks->fb_iblks[level]) == 0)
2126 continue;
2127 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2128 baselbns[level], &blocksreleased)) == 0)
2129 allerror = error;
2130 ffs_blkfree(&tip, bn, fs->fs_bsize);
2131 fs->fs_pendingblocks -= nblocks;
2121 blocksreleased += nblocks;
2122 }
2123 /*
2124 * All direct blocks or frags.
2125 */
2126 for (i = (NDADDR - 1); i >= 0; i--) {
2127 if ((bn = freeblks->fb_dblks[i]) == 0)
2128 continue;
2129 bsize = blksize(fs, &tip, i);
2130 ffs_blkfree(&tip, bn, bsize);
2132 blocksreleased += nblocks;
2133 }
2134 /*
2135 * All direct blocks or frags.
2136 */
2137 for (i = (NDADDR - 1); i >= 0; i--) {
2138 if ((bn = freeblks->fb_dblks[i]) == 0)
2139 continue;
2140 bsize = blksize(fs, &tip, i);
2141 ffs_blkfree(&tip, bn, bsize);
2142 fs->fs_pendingblocks -= btodb(bsize);
2131 blocksreleased += btodb(bsize);
2132 }
2133 /*
2134 * If we still have not finished background cleanup, then check
2135 * to see if the block count needs to be adjusted.
2136 */
2137 if (freeblks->fb_chkcnt != blocksreleased &&
2138 (fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0 &&
2139 VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum, &vp) == 0) {
2140 ip = VTOI(vp);
2141 ip->i_blocks += freeblks->fb_chkcnt - blocksreleased;
2142 ip->i_flag |= IN_CHANGE;
2143 vput(vp);
2144 }
2145
2146#ifdef DIAGNOSTIC
2147 if (freeblks->fb_chkcnt != blocksreleased &&
2148 ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2149 printf("handle_workitem_freeblocks: block count");
2150 if (allerror)
2151 softdep_error("handle_workitem_freeblks", allerror);
2152#endif /* DIAGNOSTIC */
2153
2154 WORKITEM_FREE(freeblks, D_FREEBLKS);
2155}
2156
2157/*
2158 * Release blocks associated with the inode ip and stored in the indirect
2159 * block dbn. If level is greater than SINGLE, the block is an indirect block
2160 * and recursive calls to indir_trunc must be used to cleanse other indirect
2161 * blocks.
2162 */
2163static int
2164indir_trunc(ip, dbn, level, lbn, countp)
2165 struct inode *ip;
2166 ufs_daddr_t dbn;
2167 int level;
2168 ufs_lbn_t lbn;
2169 long *countp;
2170{
2171 struct buf *bp;
2172 ufs_daddr_t *bap;
2173 ufs_daddr_t nb;
2174 struct fs *fs;
2175 struct worklist *wk;
2176 struct indirdep *indirdep;
2177 int i, lbnadd, nblocks;
2178 int error, allerror = 0;
2179
2180 fs = ip->i_fs;
2181 lbnadd = 1;
2182 for (i = level; i > 0; i--)
2183 lbnadd *= NINDIR(fs);
2184 /*
2185 * Get buffer of block pointers to be freed. This routine is not
2186 * called until the zero'ed inode has been written, so it is safe
2187 * to free blocks as they are encountered. Because the inode has
2188 * been zero'ed, calls to bmap on these blocks will fail. So, we
2189 * have to use the on-disk address and the block device for the
2190 * filesystem to look them up. If the file was deleted before its
2191 * indirect blocks were all written to disk, the routine that set
2192 * us up (deallocate_dependencies) will have arranged to leave
2193 * a complete copy of the indirect block in memory for our use.
2194 * Otherwise we have to read the blocks in from the disk.
2195 */
2196 ACQUIRE_LOCK(&lk);
2197 if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2198 (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2199 if (wk->wk_type != D_INDIRDEP ||
2200 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2201 (indirdep->ir_state & GOINGAWAY) == 0) {
2202 FREE_LOCK(&lk);
2203 panic("indir_trunc: lost indirdep");
2204 }
2205 WORKLIST_REMOVE(wk);
2206 WORKITEM_FREE(indirdep, D_INDIRDEP);
2207 if (LIST_FIRST(&bp->b_dep) != NULL) {
2208 FREE_LOCK(&lk);
2209 panic("indir_trunc: dangling dep");
2210 }
2211 FREE_LOCK(&lk);
2212 } else {
2213 FREE_LOCK(&lk);
2214 error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2215 if (error)
2216 return (error);
2217 }
2218 /*
2219 * Recursively free indirect blocks.
2220 */
2221 bap = (ufs_daddr_t *)bp->b_data;
2222 nblocks = btodb(fs->fs_bsize);
2223 for (i = NINDIR(fs) - 1; i >= 0; i--) {
2224 if ((nb = bap[i]) == 0)
2225 continue;
2226 if (level != 0) {
2227 if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2228 level - 1, lbn + (i * lbnadd), countp)) != 0)
2229 allerror = error;
2230 }
2231 ffs_blkfree(ip, nb, fs->fs_bsize);
2143 blocksreleased += btodb(bsize);
2144 }
2145 /*
2146 * If we still have not finished background cleanup, then check
2147 * to see if the block count needs to be adjusted.
2148 */
2149 if (freeblks->fb_chkcnt != blocksreleased &&
2150 (fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0 &&
2151 VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum, &vp) == 0) {
2152 ip = VTOI(vp);
2153 ip->i_blocks += freeblks->fb_chkcnt - blocksreleased;
2154 ip->i_flag |= IN_CHANGE;
2155 vput(vp);
2156 }
2157
2158#ifdef DIAGNOSTIC
2159 if (freeblks->fb_chkcnt != blocksreleased &&
2160 ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2161 printf("handle_workitem_freeblocks: block count");
2162 if (allerror)
2163 softdep_error("handle_workitem_freeblks", allerror);
2164#endif /* DIAGNOSTIC */
2165
2166 WORKITEM_FREE(freeblks, D_FREEBLKS);
2167}
2168
2169/*
2170 * Release blocks associated with the inode ip and stored in the indirect
2171 * block dbn. If level is greater than SINGLE, the block is an indirect block
2172 * and recursive calls to indir_trunc must be used to cleanse other indirect
2173 * blocks.
2174 */
2175static int
2176indir_trunc(ip, dbn, level, lbn, countp)
2177 struct inode *ip;
2178 ufs_daddr_t dbn;
2179 int level;
2180 ufs_lbn_t lbn;
2181 long *countp;
2182{
2183 struct buf *bp;
2184 ufs_daddr_t *bap;
2185 ufs_daddr_t nb;
2186 struct fs *fs;
2187 struct worklist *wk;
2188 struct indirdep *indirdep;
2189 int i, lbnadd, nblocks;
2190 int error, allerror = 0;
2191
2192 fs = ip->i_fs;
2193 lbnadd = 1;
2194 for (i = level; i > 0; i--)
2195 lbnadd *= NINDIR(fs);
2196 /*
2197 * Get buffer of block pointers to be freed. This routine is not
2198 * called until the zero'ed inode has been written, so it is safe
2199 * to free blocks as they are encountered. Because the inode has
2200 * been zero'ed, calls to bmap on these blocks will fail. So, we
2201 * have to use the on-disk address and the block device for the
2202 * filesystem to look them up. If the file was deleted before its
2203 * indirect blocks were all written to disk, the routine that set
2204 * us up (deallocate_dependencies) will have arranged to leave
2205 * a complete copy of the indirect block in memory for our use.
2206 * Otherwise we have to read the blocks in from the disk.
2207 */
2208 ACQUIRE_LOCK(&lk);
2209 if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2210 (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2211 if (wk->wk_type != D_INDIRDEP ||
2212 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2213 (indirdep->ir_state & GOINGAWAY) == 0) {
2214 FREE_LOCK(&lk);
2215 panic("indir_trunc: lost indirdep");
2216 }
2217 WORKLIST_REMOVE(wk);
2218 WORKITEM_FREE(indirdep, D_INDIRDEP);
2219 if (LIST_FIRST(&bp->b_dep) != NULL) {
2220 FREE_LOCK(&lk);
2221 panic("indir_trunc: dangling dep");
2222 }
2223 FREE_LOCK(&lk);
2224 } else {
2225 FREE_LOCK(&lk);
2226 error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2227 if (error)
2228 return (error);
2229 }
2230 /*
2231 * Recursively free indirect blocks.
2232 */
2233 bap = (ufs_daddr_t *)bp->b_data;
2234 nblocks = btodb(fs->fs_bsize);
2235 for (i = NINDIR(fs) - 1; i >= 0; i--) {
2236 if ((nb = bap[i]) == 0)
2237 continue;
2238 if (level != 0) {
2239 if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2240 level - 1, lbn + (i * lbnadd), countp)) != 0)
2241 allerror = error;
2242 }
2243 ffs_blkfree(ip, nb, fs->fs_bsize);
2244 fs->fs_pendingblocks -= nblocks;
2232 *countp += nblocks;
2233 }
2234 bp->b_flags |= B_INVAL | B_NOCACHE;
2235 brelse(bp);
2236 return (allerror);
2237}
2238
2239/*
2240 * Free an allocindir.
2241 * This routine must be called with splbio interrupts blocked.
2242 */
2243static void
2244free_allocindir(aip, inodedep)
2245 struct allocindir *aip;
2246 struct inodedep *inodedep;
2247{
2248 struct freefrag *freefrag;
2249
2250#ifdef DEBUG
2251 if (lk.lkt_held == -1)
2252 panic("free_allocindir: lock not held");
2253#endif
2254 if ((aip->ai_state & DEPCOMPLETE) == 0)
2255 LIST_REMOVE(aip, ai_deps);
2256 if (aip->ai_state & ONWORKLIST)
2257 WORKLIST_REMOVE(&aip->ai_list);
2258 LIST_REMOVE(aip, ai_next);
2259 if ((freefrag = aip->ai_freefrag) != NULL) {
2260 if (inodedep == NULL)
2261 add_to_worklist(&freefrag->ff_list);
2262 else
2263 WORKLIST_INSERT(&inodedep->id_bufwait,
2264 &freefrag->ff_list);
2265 }
2266 WORKITEM_FREE(aip, D_ALLOCINDIR);
2267}
2268
2269/*
2270 * Directory entry addition dependencies.
2271 *
2272 * When adding a new directory entry, the inode (with its incremented link
2273 * count) must be written to disk before the directory entry's pointer to it.
2274 * Also, if the inode is newly allocated, the corresponding freemap must be
2275 * updated (on disk) before the directory entry's pointer. These requirements
2276 * are met via undo/redo on the directory entry's pointer, which consists
2277 * simply of the inode number.
2278 *
2279 * As directory entries are added and deleted, the free space within a
2280 * directory block can become fragmented. The ufs file system will compact
2281 * a fragmented directory block to make space for a new entry. When this
2282 * occurs, the offsets of previously added entries change. Any "diradd"
2283 * dependency structures corresponding to these entries must be updated with
2284 * the new offsets.
2285 */
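/*
 * A minimal sketch of the undo/redo described above (not part of
 * ffs_softdep.c; inode_is_on_disk() and write_dirblock() are
 * hypothetical helpers):
 *
 *	ep->d_ino = newinum;		in-memory entry names the new inode
 *	...
 *	if (!inode_is_on_disk(newinum)) {
 *		ep->d_ino = 0;		undo before the block goes to disk
 *		write_dirblock(bp);
 *		ep->d_ino = newinum;	redo once the write has been issued
 *	} else
 *		write_dirblock(bp);
 *
 * The directory block therefore never reaches the disk naming an inode
 * whose incremented link count (and, for a new inode, freemap update)
 * has not been written first.
 */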
2286
2287/*
2288 * This routine is called after the in-memory inode's link
2289 * count has been incremented, but before the directory entry's
2290 * pointer to the inode has been set.
2291 */
2292void
2293softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2294 struct buf *bp; /* buffer containing directory block */
2295 struct inode *dp; /* inode for directory */
2296 off_t diroffset; /* offset of new entry in directory */
2297 long newinum; /* inode referenced by new directory entry */
2298 struct buf *newdirbp; /* non-NULL => contents of new mkdir */
2299{
2300 int offset; /* offset of new entry within directory block */
2301 ufs_lbn_t lbn; /* block in directory containing new entry */
2302 struct fs *fs;
2303 struct diradd *dap;
2304 struct pagedep *pagedep;
2305 struct inodedep *inodedep;
2306 struct mkdir *mkdir1, *mkdir2;
2307
2308 /*
2309 * Whiteouts have no dependencies.
2310 */
2311 if (newinum == WINO) {
2312 if (newdirbp != NULL)
2313 bdwrite(newdirbp);
2314 return;
2315 }
2316
2317 fs = dp->i_fs;
2318 lbn = lblkno(fs, diroffset);
2319 offset = blkoff(fs, diroffset);
2320 MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2321 M_SOFTDEP_FLAGS|M_ZERO);
2322 dap->da_list.wk_type = D_DIRADD;
2323 dap->da_offset = offset;
2324 dap->da_newinum = newinum;
2325 dap->da_state = ATTACHED;
2326 if (newdirbp == NULL) {
2327 dap->da_state |= DEPCOMPLETE;
2328 ACQUIRE_LOCK(&lk);
2329 } else {
2330 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2331 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2332 M_SOFTDEP_FLAGS);
2333 mkdir1->md_list.wk_type = D_MKDIR;
2334 mkdir1->md_state = MKDIR_BODY;
2335 mkdir1->md_diradd = dap;
2336 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2337 M_SOFTDEP_FLAGS);
2338 mkdir2->md_list.wk_type = D_MKDIR;
2339 mkdir2->md_state = MKDIR_PARENT;
2340 mkdir2->md_diradd = dap;
2341 /*
2342 * Dependency on "." and ".." being written to disk.
2343 */
2344 mkdir1->md_buf = newdirbp;
2345 ACQUIRE_LOCK(&lk);
2346 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2347 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2348 FREE_LOCK(&lk);
2349 bdwrite(newdirbp);
2350 /*
2351 * Dependency on link count increase for parent directory
2352 */
2353 ACQUIRE_LOCK(&lk);
2354 if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2355 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2356 dap->da_state &= ~MKDIR_PARENT;
2357 WORKITEM_FREE(mkdir2, D_MKDIR);
2358 } else {
2359 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2360 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2361 }
2362 }
2363 /*
2364 * Link into parent directory pagedep to await its being written.
2365 */
2366 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2367 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2368 dap->da_pagedep = pagedep;
2369 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2370 da_pdlist);
2371 /*
2372 * Link into its inodedep. Put it on the id_bufwait list if the inode
2373 * is not yet written. If it is written, do the post-inode write
2374 * processing to put it on the id_pendinghd list.
2375 */
2376 (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2377 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2378 diradd_inode_written(dap, inodedep);
2379 else
2380 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2381 FREE_LOCK(&lk);
2382}
2383
2384/*
2385 * This procedure is called to change the offset of a directory
2386 * entry when compacting a directory block which must be owned
2387 * exclusively by the caller. Note that the actual entry movement
2388 * must be done in this procedure to ensure that no I/O completions
2389 * occur while the move is in progress.
2390 */
2391void
2392softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2393 struct inode *dp; /* inode for directory */
2394 caddr_t base; /* address of dp->i_offset */
2395 caddr_t oldloc; /* address of old directory location */
2396 caddr_t newloc; /* address of new directory location */
2397 int entrysize; /* size of directory entry */
2398{
2399 int offset, oldoffset, newoffset;
2400 struct pagedep *pagedep;
2401 struct diradd *dap;
2402 ufs_lbn_t lbn;
2403
2404 ACQUIRE_LOCK(&lk);
2405 lbn = lblkno(dp->i_fs, dp->i_offset);
2406 offset = blkoff(dp->i_fs, dp->i_offset);
2407 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2408 goto done;
2409 oldoffset = offset + (oldloc - base);
2410 newoffset = offset + (newloc - base);
2411
2412 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2413 if (dap->da_offset != oldoffset)
2414 continue;
2415 dap->da_offset = newoffset;
2416 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2417 break;
2418 LIST_REMOVE(dap, da_pdlist);
2419 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2420 dap, da_pdlist);
2421 break;
2422 }
2423 if (dap == NULL) {
2424
2425 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2426 if (dap->da_offset == oldoffset) {
2427 dap->da_offset = newoffset;
2428 break;
2429 }
2430 }
2431 }
2432done:
2433 bcopy(oldloc, newloc, entrysize);
2434 FREE_LOCK(&lk);
2435}
2436
2437/*
2438 * Free a diradd dependency structure. This routine must be called
2439 * with splbio interrupts blocked.
2440 */
2441static void
2442free_diradd(dap)
2443 struct diradd *dap;
2444{
2445 struct dirrem *dirrem;
2446 struct pagedep *pagedep;
2447 struct inodedep *inodedep;
2448 struct mkdir *mkdir, *nextmd;
2449
2450#ifdef DEBUG
2451 if (lk.lkt_held == -1)
2452 panic("free_diradd: lock not held");
2453#endif
2454 WORKLIST_REMOVE(&dap->da_list);
2455 LIST_REMOVE(dap, da_pdlist);
2456 if ((dap->da_state & DIRCHG) == 0) {
2457 pagedep = dap->da_pagedep;
2458 } else {
2459 dirrem = dap->da_previous;
2460 pagedep = dirrem->dm_pagedep;
2461 dirrem->dm_dirinum = pagedep->pd_ino;
2462 add_to_worklist(&dirrem->dm_list);
2463 }
2464 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2465 0, &inodedep) != 0)
2466 (void) free_inodedep(inodedep);
2467 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2468 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2469 nextmd = LIST_NEXT(mkdir, md_mkdirs);
2470 if (mkdir->md_diradd != dap)
2471 continue;
2472 dap->da_state &= ~mkdir->md_state;
2473 WORKLIST_REMOVE(&mkdir->md_list);
2474 LIST_REMOVE(mkdir, md_mkdirs);
2475 WORKITEM_FREE(mkdir, D_MKDIR);
2476 }
2477 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2478 FREE_LOCK(&lk);
2479 panic("free_diradd: unfound ref");
2480 }
2481 }
2482 WORKITEM_FREE(dap, D_DIRADD);
2483}
2484
2485/*
2486 * Directory entry removal dependencies.
2487 *
2488 * When removing a directory entry, the entry's inode pointer must be
2489 * zero'ed on disk before the corresponding inode's link count is decremented
2490 * (possibly freeing the inode for re-use). This dependency is handled by
2491 * updating the directory entry but delaying the inode count reduction until
2492 * after the directory block has been written to disk. After this point, the
2493 * inode count can be decremented whenever it is convenient.
2494 */
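/*
 * A minimal sketch of the ordering rule above (not part of
 * ffs_softdep.c; write_dirblock() and drop_link() are hypothetical):
 *
 *	ep->d_ino = 0;		clear the entry in memory
 *	write_dirblock(bp);	the zeroed entry must reach the disk first
 *	drop_link(ip);		only then decrement (and possibly free) the inode
 *
 * Decrementing first could free and reuse the inode while an on-disk
 * directory entry still points at it; the routines below instead queue
 * a dirrem work item that runs after the directory block is written.
 */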
2495
2496/*
2497 * This routine should be called immediately after removing
2498 * a directory entry. The inode's link count should not be
2499 * decremented by the calling procedure -- the soft updates
2500 * code will do this task when it is safe.
2501 */
2502void
2503softdep_setup_remove(bp, dp, ip, isrmdir)
2504 struct buf *bp; /* buffer containing directory block */
2505 struct inode *dp; /* inode for the directory being modified */
2506 struct inode *ip; /* inode for directory entry being removed */
2507 int isrmdir; /* indicates if doing RMDIR */
2508{
2509 struct dirrem *dirrem, *prevdirrem;
2510
2511 /*
2512 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2513 */
2514 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2515
2516 /*
2517 * If the COMPLETE flag is clear, then there were no active
2518 * entries and we want to roll back to a zeroed entry until
2519 * the new inode is committed to disk. If the COMPLETE flag is
2520 * set then we have deleted an entry that never made it to
2521 * disk. If the entry we deleted resulted from a name change,
2522 * then the old name still resides on disk. We cannot delete
2523 * its inode (returned to us in prevdirrem) until the zeroed
2524 * directory entry gets to disk. The new inode has never been
2525 * referenced on the disk, so can be deleted immediately.
2526 */
2527 if ((dirrem->dm_state & COMPLETE) == 0) {
2528 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2529 dm_next);
2530 FREE_LOCK(&lk);
2531 } else {
2532 if (prevdirrem != NULL)
2533 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2534 prevdirrem, dm_next);
2535 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2536 FREE_LOCK(&lk);
2537 handle_workitem_remove(dirrem);
2538 }
2539}
2540
2541/*
2542 * Allocate a new dirrem if appropriate and return it along with
2543 * its associated pagedep. Called without a lock, returns with lock.
2544 */
2545static long num_dirrem; /* number of dirrem allocated */
2546static struct dirrem *
2547newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2548 struct buf *bp; /* buffer containing directory block */
2549 struct inode *dp; /* inode for the directory being modified */
2550 struct inode *ip; /* inode for directory entry being removed */
2551 int isrmdir; /* indicates if doing RMDIR */
2552 struct dirrem **prevdirremp; /* previously referenced inode, if any */
2553{
2554 int offset;
2555 ufs_lbn_t lbn;
2556 struct diradd *dap;
2557 struct dirrem *dirrem;
2558 struct pagedep *pagedep;
2559
2560 /*
2561 * Whiteouts have no deletion dependencies.
2562 */
2563 if (ip == NULL)
2564 panic("newdirrem: whiteout");
2565 /*
2566 * If we are over our limit, try to improve the situation.
2567 * Limiting the number of dirrem structures will also limit
2568 * the number of freefile and freeblks structures.
2569 */
2570 if (num_dirrem > max_softdeps / 2)
2571 (void) request_cleanup(FLUSH_REMOVE, 0);
2572 num_dirrem += 1;
2573 MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2574 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
2575 dirrem->dm_list.wk_type = D_DIRREM;
2576 dirrem->dm_state = isrmdir ? RMDIR : 0;
2577 dirrem->dm_mnt = ITOV(ip)->v_mount;
2578 dirrem->dm_oldinum = ip->i_number;
2579 *prevdirremp = NULL;
2580
2581 ACQUIRE_LOCK(&lk);
2582 lbn = lblkno(dp->i_fs, dp->i_offset);
2583 offset = blkoff(dp->i_fs, dp->i_offset);
2584 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2585 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2586 dirrem->dm_pagedep = pagedep;
2587 /*
2588 * Check for a diradd dependency for the same directory entry.
2589 * If present, then both dependencies become obsolete and can
2590 * be de-allocated. Check for an entry on both the pd_diraddhd
2591 * list and the pd_pendinghd list.
2592 */
2593
2594 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2595 if (dap->da_offset == offset)
2596 break;
2597 if (dap == NULL) {
2598
2599 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2600 if (dap->da_offset == offset)
2601 break;
2602 if (dap == NULL)
2603 return (dirrem);
2604 }
2605 /*
2606 * Must be ATTACHED at this point.
2607 */
2608 if ((dap->da_state & ATTACHED) == 0) {
2609 FREE_LOCK(&lk);
2610 panic("newdirrem: not ATTACHED");
2611 }
2612 if (dap->da_newinum != ip->i_number) {
2613 FREE_LOCK(&lk);
2614 panic("newdirrem: inum %d should be %d",
2615 ip->i_number, dap->da_newinum);
2616 }
2617 /*
2618 * If we are deleting a changed name that never made it to disk,
2619 * then return the dirrem describing the previous inode (which
2620 * represents the inode currently referenced from this entry on disk).
2621 */
2622 if ((dap->da_state & DIRCHG) != 0) {
2623 *prevdirremp = dap->da_previous;
2624 dap->da_state &= ~DIRCHG;
2625 dap->da_pagedep = pagedep;
2626 }
2627 /*
2628 * We are deleting an entry that never made it to disk.
2629 * Mark it COMPLETE so we can delete its inode immediately.
2630 */
2631 dirrem->dm_state |= COMPLETE;
2632 free_diradd(dap);
2633 return (dirrem);
2634}
2635
2636/*
2637 * Directory entry change dependencies.
2638 *
2639 * Changing an existing directory entry requires that an add operation
2640 * be completed first followed by a deletion. The semantics for the addition
2641 * are identical to the description of adding a new entry above except
2642 * that the rollback is to the old inode number rather than zero. Once
2643 * the addition dependency is completed, the removal is done as described
2644 * in the removal routine above.
2645 */
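/*
 * A minimal sketch of how this differs from the addition case (not
 * part of ffs_softdep.c; names are hypothetical): the rollback value
 * for an unwritten new inode is the old inode number rather than zero,
 *
 *	oldinum = ep->d_ino;		entry currently names the old inode
 *	ep->d_ino = newinum;		in-memory change
 *	if (!inode_is_on_disk(newinum))
 *		rollback_value = oldinum;	not 0, as it would be for a create
 *
 * so a crash can only leave the entry naming the old, still-valid
 * inode. Once the new inode is safe on disk, removal of the old one
 * proceeds as described in the removal routine above.
 */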
2646
2647/*
2648 * This routine should be called immediately after changing
2649 * a directory entry. The inode's link count should not be
2650 * decremented by the calling procedure -- the soft updates
2651 * code will perform this task when it is safe.
2652 */
2653void
2654softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2655 struct buf *bp; /* buffer containing directory block */
2656 struct inode *dp; /* inode for the directory being modified */
2657 struct inode *ip; /* inode for directory entry being removed */
2658 long newinum; /* new inode number for changed entry */
2659 int isrmdir; /* indicates if doing RMDIR */
2660{
2661 int offset;
2662 struct diradd *dap = NULL;
2663 struct dirrem *dirrem, *prevdirrem;
2664 struct pagedep *pagedep;
2665 struct inodedep *inodedep;
2666
2667 offset = blkoff(dp->i_fs, dp->i_offset);
2668
2669 /*
2670 * Whiteouts do not need diradd dependencies.
2671 */
2672 if (newinum != WINO) {
2673 MALLOC(dap, struct diradd *, sizeof(struct diradd),
2674 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
2675 dap->da_list.wk_type = D_DIRADD;
2676 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2677 dap->da_offset = offset;
2678 dap->da_newinum = newinum;
2679 }
2680
2681 /*
2682 * Allocate a new dirrem and ACQUIRE_LOCK.
2683 */
2684 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2685 pagedep = dirrem->dm_pagedep;
2686 /*
2687 * The possible values for isrmdir:
2688 * 0 - non-directory file rename
2689 * 1 - directory rename within same directory
2690 * inum - directory rename to new directory of given inode number
2691 * When renaming to a new directory, we are both deleting and
2692 * creating a new directory entry, so the link count on the new
2693 * directory should not change. Thus we do not need the followup
2694 * dirrem which is usually done in handle_workitem_remove. We set
2695 * the DIRCHG flag to tell handle_workitem_remove to skip the
2696 * followup dirrem.
2697 */
2698 if (isrmdir > 1)
2699 dirrem->dm_state |= DIRCHG;
2700
2701 /*
2702 * Whiteouts have no additional dependencies,
2703 * so just put the dirrem on the correct list.
2704 */
2705 if (newinum == WINO) {
2706 if ((dirrem->dm_state & COMPLETE) == 0) {
2707 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2708 dm_next);
2709 } else {
2710 dirrem->dm_dirinum = pagedep->pd_ino;
2711 add_to_worklist(&dirrem->dm_list);
2712 }
2713 FREE_LOCK(&lk);
2714 return;
2715 }
2716
2717 /*
2718 * If the COMPLETE flag is clear, then there were no active
2719 * entries and we want to roll back to the previous inode until
2720 * the new inode is committed to disk. If the COMPLETE flag is
2721 * set, then we have deleted an entry that never made it to disk.
2722 * If the entry we deleted resulted from a name change, then the old
2723 * inode reference still resides on disk. Any rollback that we do
2724 * needs to be to that old inode (returned to us in prevdirrem). If
2725 * the entry we deleted resulted from a create, then there is
2726 * no entry on the disk, so we want to roll back to zero rather
2727 * than the uncommitted inode. In either of the COMPLETE cases we
2728 * want to immediately free the unwritten and unreferenced inode.
2729 */
2730 if ((dirrem->dm_state & COMPLETE) == 0) {
2731 dap->da_previous = dirrem;
2732 } else {
2733 if (prevdirrem != NULL) {
2734 dap->da_previous = prevdirrem;
2735 } else {
2736 dap->da_state &= ~DIRCHG;
2737 dap->da_pagedep = pagedep;
2738 }
2739 dirrem->dm_dirinum = pagedep->pd_ino;
2740 add_to_worklist(&dirrem->dm_list);
2741 }
2742 /*
2743 * Link into its inodedep. Put it on the id_bufwait list if the inode
2744 * is not yet written. If it is written, do the post-inode write
2745 * processing to put it on the id_pendinghd list.
2746 */
2747 if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2748 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2749 dap->da_state |= COMPLETE;
2750 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2751 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2752 } else {
2753 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2754 dap, da_pdlist);
2755 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2756 }
2757 FREE_LOCK(&lk);
2758}
2759
2760/*
2761 * Called whenever the link count on an inode is changed.
2762 * It creates an inode dependency so that the new reference(s)
2763 * to the inode cannot be committed to disk until the updated
2764 * inode has been written.
2765 */
2766void
2767softdep_change_linkcnt(ip)
2768 struct inode *ip; /* the inode with the increased link count */
2769{
2770 struct inodedep *inodedep;
2771
2772 ACQUIRE_LOCK(&lk);
2773 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2774 if (ip->i_nlink < ip->i_effnlink) {
2775 FREE_LOCK(&lk);
2776 panic("softdep_change_linkcnt: bad delta");
2777 }
2778 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2779 FREE_LOCK(&lk);
2780}
2781
2782/*
2245 *countp += nblocks;
2246 }
2247 bp->b_flags |= B_INVAL | B_NOCACHE;
2248 brelse(bp);
2249 return (allerror);
2250}
2251
2252/*
2253 * Free an allocindir.
2254 * This routine must be called with splbio interrupts blocked.
2255 */
2256static void
2257free_allocindir(aip, inodedep)
2258 struct allocindir *aip;
2259 struct inodedep *inodedep;
2260{
2261 struct freefrag *freefrag;
2262
2263#ifdef DEBUG
2264 if (lk.lkt_held == -1)
2265 panic("free_allocindir: lock not held");
2266#endif
2267 if ((aip->ai_state & DEPCOMPLETE) == 0)
2268 LIST_REMOVE(aip, ai_deps);
2269 if (aip->ai_state & ONWORKLIST)
2270 WORKLIST_REMOVE(&aip->ai_list);
2271 LIST_REMOVE(aip, ai_next);
2272 if ((freefrag = aip->ai_freefrag) != NULL) {
2273 if (inodedep == NULL)
2274 add_to_worklist(&freefrag->ff_list);
2275 else
2276 WORKLIST_INSERT(&inodedep->id_bufwait,
2277 &freefrag->ff_list);
2278 }
2279 WORKITEM_FREE(aip, D_ALLOCINDIR);
2280}
2281
2282/*
2283 * Directory entry addition dependencies.
2284 *
2285 * When adding a new directory entry, the inode (with its incremented link
2286 * count) must be written to disk before the directory entry's pointer to it.
2287 * Also, if the inode is newly allocated, the corresponding freemap must be
2288 * updated (on disk) before the directory entry's pointer. These requirements
2289 * are met via undo/redo on the directory entry's pointer, which consists
2290 * simply of the inode number.
2291 *
2292 * As directory entries are added and deleted, the free space within a
2293 * directory block can become fragmented. The ufs file system will compact
2294 * a fragmented directory block to make space for a new entry. When this
2295 * occurs, the offsets of previously added entries change. Any "diradd"
2296 * dependency structures corresponding to these entries must be updated with
2297 * the new offsets.
2298 */
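
The undo/redo on the entry's inode number can be modeled outside the kernel. The sketch below is a simplified user-space illustration; the structure layouts and helper names are invented for the example and are not the kernel's. Before a directory block is handed to the driver, any entry whose inode is not yet safely on disk is rolled back; once the write completes, it is rolled forward.

#include <stdio.h>

struct dir_entry { unsigned d_ino; char d_name[16]; };
struct dep	 { int slot; unsigned new_ino; unsigned old_ino; int safe; };

/* Undo: before the block goes to the driver, roll unsafe entries back. */
static void
initiate_write(struct dir_entry *blk, struct dep *deps, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (!deps[i].safe)
			blk[deps[i].slot].d_ino = deps[i].old_ino; /* 0 for a create */
}

/* Redo: once the write has completed, roll the entries forward again. */
static void
write_complete(struct dir_entry *blk, struct dep *deps, int n)
{
	int i;

	for (i = 0; i < n; i++)
		blk[deps[i].slot].d_ino = deps[i].new_ino;
}

int
main(void)
{
	struct dir_entry blk[2] = { { 0, "newfile" }, { 17, "oldfile" } };
	struct dep deps[1] = { { 0, 42, 0, 0 } }; /* slot 0 names uncommitted inode 42 */

	initiate_write(blk, deps, 1);
	printf("on disk:  %s -> %u\n", blk[0].d_name, blk[0].d_ino); /* 0, safe */
	write_complete(blk, deps, 1);
	printf("in core:  %s -> %u\n", blk[0].d_name, blk[0].d_ino); /* 42, current */
	return (0);
}
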
2299
2300/*
2301 * This routine is called after the in-memory inode's link
2302 * count has been incremented, but before the directory entry's
2303 * pointer to the inode has been set.
2304 */
2305void
2306softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2307 struct buf *bp; /* buffer containing directory block */
2308 struct inode *dp; /* inode for directory */
2309 off_t diroffset; /* offset of new entry in directory */
2310 long newinum; /* inode referenced by new directory entry */
2311 struct buf *newdirbp; /* non-NULL => contents of new mkdir */
2312{
2313 int offset; /* offset of new entry within directory block */
2314 ufs_lbn_t lbn; /* block in directory containing new entry */
2315 struct fs *fs;
2316 struct diradd *dap;
2317 struct pagedep *pagedep;
2318 struct inodedep *inodedep;
2319 struct mkdir *mkdir1, *mkdir2;
2320
2321 /*
2322 * Whiteouts have no dependencies.
2323 */
2324 if (newinum == WINO) {
2325 if (newdirbp != NULL)
2326 bdwrite(newdirbp);
2327 return;
2328 }
2329
2330 fs = dp->i_fs;
2331 lbn = lblkno(fs, diroffset);
2332 offset = blkoff(fs, diroffset);
2333 MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2334 M_SOFTDEP_FLAGS|M_ZERO);
2335 dap->da_list.wk_type = D_DIRADD;
2336 dap->da_offset = offset;
2337 dap->da_newinum = newinum;
2338 dap->da_state = ATTACHED;
2339 if (newdirbp == NULL) {
2340 dap->da_state |= DEPCOMPLETE;
2341 ACQUIRE_LOCK(&lk);
2342 } else {
2343 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2344 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2345 M_SOFTDEP_FLAGS);
2346 mkdir1->md_list.wk_type = D_MKDIR;
2347 mkdir1->md_state = MKDIR_BODY;
2348 mkdir1->md_diradd = dap;
2349 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2350 M_SOFTDEP_FLAGS);
2351 mkdir2->md_list.wk_type = D_MKDIR;
2352 mkdir2->md_state = MKDIR_PARENT;
2353 mkdir2->md_diradd = dap;
2354 /*
2355 * Dependency on "." and ".." being written to disk.
2356 */
2357 mkdir1->md_buf = newdirbp;
2358 ACQUIRE_LOCK(&lk);
2359 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2360 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2361 FREE_LOCK(&lk);
2362 bdwrite(newdirbp);
2363 /*
2364 * Dependency on link count increase for parent directory
2365 */
2366 ACQUIRE_LOCK(&lk);
2367 if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2368 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2369 dap->da_state &= ~MKDIR_PARENT;
2370 WORKITEM_FREE(mkdir2, D_MKDIR);
2371 } else {
2372 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2373 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2374 }
2375 }
2376 /*
2377 * Link into parent directory pagedep to await its being written.
2378 */
2379 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2380 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2381 dap->da_pagedep = pagedep;
2382 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2383 da_pdlist);
2384 /*
2385 * Link into its inodedep. Put it on the id_bufwait list if the inode
2386 * is not yet written. If it is written, do the post-inode write
2387 * processing to put it on the id_pendinghd list.
2388 */
2389 (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2390 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2391 diradd_inode_written(dap, inodedep);
2392 else
2393 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2394 FREE_LOCK(&lk);
2395}
2396
2397/*
2398 * This procedure is called to change the offset of a directory
2399 * entry when compacting a directory block which must be owned
2400 * exclusively by the caller. Note that the actual entry movement
2401 * must be done in this procedure to ensure that no I/O completions
2402 * occur while the move is in progress.
2403 */
2404void
2405softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2406 struct inode *dp; /* inode for directory */
2407 caddr_t base; /* address of dp->i_offset */
2408 caddr_t oldloc; /* address of old directory location */
2409 caddr_t newloc; /* address of new directory location */
2410 int entrysize; /* size of directory entry */
2411{
2412 int offset, oldoffset, newoffset;
2413 struct pagedep *pagedep;
2414 struct diradd *dap;
2415 ufs_lbn_t lbn;
2416
2417 ACQUIRE_LOCK(&lk);
2418 lbn = lblkno(dp->i_fs, dp->i_offset);
2419 offset = blkoff(dp->i_fs, dp->i_offset);
2420 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2421 goto done;
2422 oldoffset = offset + (oldloc - base);
2423 newoffset = offset + (newloc - base);
2424
2425 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2426 if (dap->da_offset != oldoffset)
2427 continue;
2428 dap->da_offset = newoffset;
2429 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2430 break;
2431 LIST_REMOVE(dap, da_pdlist);
2432 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2433 dap, da_pdlist);
2434 break;
2435 }
2436 if (dap == NULL) {
2437
2438 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2439 if (dap->da_offset == oldoffset) {
2440 dap->da_offset = newoffset;
2441 break;
2442 }
2443 }
2444 }
2445done:
2446 bcopy(oldloc, newloc, entrysize);
2447 FREE_LOCK(&lk);
2448}
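
A small illustration of why the routine above compares the two hash values: a diradd lives in a bucket chosen from its offset, so a move within the same bucket only needs da_offset updated, while a move to a different bucket also requires relinking. The bucket function and DAHASHSZ value below are assumptions made for the example; the real definitions live in softdep.h.

#include <stdio.h>

#define DAHASHSZ	6				/* illustrative */
#define DIRADDHASH(off)	(((off) >> 2) % DAHASHSZ)	/* assumed form */

int
main(void)
{
	int oldoffset = 12, newoffset = 500;	/* entry moved by compaction */

	if (DIRADDHASH(oldoffset) != DIRADDHASH(newoffset))
		printf("relink from bucket %d to bucket %d\n",
		    DIRADDHASH(oldoffset), DIRADDHASH(newoffset));
	else
		printf("same bucket %d, only da_offset changes\n",
		    DIRADDHASH(oldoffset));
	return (0);
}
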
2449
2450/*
2451 * Free a diradd dependency structure. This routine must be called
2452 * with splbio interrupts blocked.
2453 */
2454static void
2455free_diradd(dap)
2456 struct diradd *dap;
2457{
2458 struct dirrem *dirrem;
2459 struct pagedep *pagedep;
2460 struct inodedep *inodedep;
2461 struct mkdir *mkdir, *nextmd;
2462
2463#ifdef DEBUG
2464 if (lk.lkt_held == -1)
2465 panic("free_diradd: lock not held");
2466#endif
2467 WORKLIST_REMOVE(&dap->da_list);
2468 LIST_REMOVE(dap, da_pdlist);
2469 if ((dap->da_state & DIRCHG) == 0) {
2470 pagedep = dap->da_pagedep;
2471 } else {
2472 dirrem = dap->da_previous;
2473 pagedep = dirrem->dm_pagedep;
2474 dirrem->dm_dirinum = pagedep->pd_ino;
2475 add_to_worklist(&dirrem->dm_list);
2476 }
2477 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2478 0, &inodedep) != 0)
2479 (void) free_inodedep(inodedep);
2480 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2481 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2482 nextmd = LIST_NEXT(mkdir, md_mkdirs);
2483 if (mkdir->md_diradd != dap)
2484 continue;
2485 dap->da_state &= ~mkdir->md_state;
2486 WORKLIST_REMOVE(&mkdir->md_list);
2487 LIST_REMOVE(mkdir, md_mkdirs);
2488 WORKITEM_FREE(mkdir, D_MKDIR);
2489 }
2490 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2491 FREE_LOCK(&lk);
2492 panic("free_diradd: unfound ref");
2493 }
2494 }
2495 WORKITEM_FREE(dap, D_DIRADD);
2496}
2497
2498/*
2499 * Directory entry removal dependencies.
2500 *
2501 * When removing a directory entry, the entry's inode pointer must be
2502 * zero'ed on disk before the corresponding inode's link count is decremented
2503 * (possibly freeing the inode for re-use). This dependency is handled by
2504 * updating the directory entry but delaying the inode count reduction until
2505 * after the directory block has been written to disk. After this point, the
2506 * inode count can be decremented whenever it is convenient.
2507 */
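
The ordering constraint can be modeled in user space as a two-step sequence, with the link-count decrement deferred until the zeroed directory block is known to be on disk, mirroring what handle_workitem_remove does later. The structures and helper below are invented for illustration and are not the kernel's.

#include <stdio.h>

struct inode_model	{ int number; int nlink; };
struct dirent_model	{ int d_ino; };

/* Runs only after the zeroed directory block is known to be on disk. */
static void
dir_block_written(struct inode_model *ip)
{

	ip->nlink--;		/* safe: no on-disk name references the inode */
	printf("inode %d link count now %d\n", ip->number, ip->nlink);
}

int
main(void)
{
	struct inode_model ino = { 42, 1 };
	struct dirent_model entry = { 42 };

	entry.d_ino = 0;	/* step 1: zero the entry; this goes to disk first */
	printf("entry now points at inode %d\n", entry.d_ino);
	/* ... directory block write completes here ... */
	dir_block_written(&ino);	/* step 2: deferred link-count decrement */
	return (0);
}
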
2508
2509/*
2510 * This routine should be called immediately after removing
2511 * a directory entry. The inode's link count should not be
2512 * decremented by the calling procedure -- the soft updates
2513 * code will do this task when it is safe.
2514 */
2515void
2516softdep_setup_remove(bp, dp, ip, isrmdir)
2517 struct buf *bp; /* buffer containing directory block */
2518 struct inode *dp; /* inode for the directory being modified */
2519 struct inode *ip; /* inode for directory entry being removed */
2520 int isrmdir; /* indicates if doing RMDIR */
2521{
2522 struct dirrem *dirrem, *prevdirrem;
2523
2524 /*
2525 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2526 */
2527 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2528
2529 /*
2530 * If the COMPLETE flag is clear, then there were no active
2531 * entries and we want to roll back to a zeroed entry until
2532 * the new inode is committed to disk. If the COMPLETE flag is
2533 * set then we have deleted an entry that never made it to
2534 * disk. If the entry we deleted resulted from a name change,
2535 * then the old name still resides on disk. We cannot delete
2536 * its inode (returned to us in prevdirrem) until the zeroed
2537 * directory entry gets to disk. The new inode has never been
2538 * referenced on the disk, so can be deleted immediately.
2539 */
2540 if ((dirrem->dm_state & COMPLETE) == 0) {
2541 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2542 dm_next);
2543 FREE_LOCK(&lk);
2544 } else {
2545 if (prevdirrem != NULL)
2546 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2547 prevdirrem, dm_next);
2548 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2549 FREE_LOCK(&lk);
2550 handle_workitem_remove(dirrem);
2551 }
2552}
2553
2554/*
2555 * Allocate a new dirrem if appropriate and return it along with
2556 * its associated pagedep. Called without a lock, returns with lock.
2557 */
2558static long num_dirrem; /* number of dirrem allocated */
2559static struct dirrem *
2560newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2561 struct buf *bp; /* buffer containing directory block */
2562 struct inode *dp; /* inode for the directory being modified */
2563 struct inode *ip; /* inode for directory entry being removed */
2564 int isrmdir; /* indicates if doing RMDIR */
2565 struct dirrem **prevdirremp; /* previously referenced inode, if any */
2566{
2567 int offset;
2568 ufs_lbn_t lbn;
2569 struct diradd *dap;
2570 struct dirrem *dirrem;
2571 struct pagedep *pagedep;
2572
2573 /*
2574 * Whiteouts have no deletion dependencies.
2575 */
2576 if (ip == NULL)
2577 panic("newdirrem: whiteout");
2578 /*
2579 * If we are over our limit, try to improve the situation.
2580 * Limiting the number of dirrem structures will also limit
2581 * the number of freefile and freeblks structures.
2582 */
2583 if (num_dirrem > max_softdeps / 2)
2584 (void) request_cleanup(FLUSH_REMOVE, 0);
2585 num_dirrem += 1;
2586 MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2587 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
2588 dirrem->dm_list.wk_type = D_DIRREM;
2589 dirrem->dm_state = isrmdir ? RMDIR : 0;
2590 dirrem->dm_mnt = ITOV(ip)->v_mount;
2591 dirrem->dm_oldinum = ip->i_number;
2592 *prevdirremp = NULL;
2593
2594 ACQUIRE_LOCK(&lk);
2595 lbn = lblkno(dp->i_fs, dp->i_offset);
2596 offset = blkoff(dp->i_fs, dp->i_offset);
2597 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2598 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2599 dirrem->dm_pagedep = pagedep;
2600 /*
2601 * Check for a diradd dependency for the same directory entry.
2602 * If present, then both dependencies become obsolete and can
2603	 * be de-allocated. Check for an entry on both the pd_diraddhd
2604 * list and the pd_pendinghd list.
2605 */
2606
2607 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2608 if (dap->da_offset == offset)
2609 break;
2610 if (dap == NULL) {
2611
2612 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2613 if (dap->da_offset == offset)
2614 break;
2615 if (dap == NULL)
2616 return (dirrem);
2617 }
2618 /*
2619 * Must be ATTACHED at this point.
2620 */
2621 if ((dap->da_state & ATTACHED) == 0) {
2622 FREE_LOCK(&lk);
2623 panic("newdirrem: not ATTACHED");
2624 }
2625 if (dap->da_newinum != ip->i_number) {
2626 FREE_LOCK(&lk);
2627 panic("newdirrem: inum %d should be %d",
2628 ip->i_number, dap->da_newinum);
2629 }
2630 /*
2631 * If we are deleting a changed name that never made it to disk,
2632 * then return the dirrem describing the previous inode (which
2633 * represents the inode currently referenced from this entry on disk).
2634 */
2635 if ((dap->da_state & DIRCHG) != 0) {
2636 *prevdirremp = dap->da_previous;
2637 dap->da_state &= ~DIRCHG;
2638 dap->da_pagedep = pagedep;
2639 }
2640 /*
2641 * We are deleting an entry that never made it to disk.
2642 * Mark it COMPLETE so we can delete its inode immediately.
2643 */
2644 dirrem->dm_state |= COMPLETE;
2645 free_diradd(dap);
2646 return (dirrem);
2647}
2648
2649/*
2650 * Directory entry change dependencies.
2651 *
2652 * Changing an existing directory entry requires that an add operation
2653	 * be completed first, followed by a deletion. The semantics for the addition
2654 * are identical to the description of adding a new entry above except
2655 * that the rollback is to the old inode number rather than zero. Once
2656 * the addition dependency is completed, the removal is done as described
2657 * in the removal routine above.
2658 */
2659
2660/*
2661 * This routine should be called immediately after changing
2662 * a directory entry. The inode's link count should not be
2663 * decremented by the calling procedure -- the soft updates
2664 * code will perform this task when it is safe.
2665 */
2666void
2667softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2668 struct buf *bp; /* buffer containing directory block */
2669 struct inode *dp; /* inode for the directory being modified */
2670 struct inode *ip; /* inode for directory entry being removed */
2671 long newinum; /* new inode number for changed entry */
2672 int isrmdir; /* indicates if doing RMDIR */
2673{
2674 int offset;
2675 struct diradd *dap = NULL;
2676 struct dirrem *dirrem, *prevdirrem;
2677 struct pagedep *pagedep;
2678 struct inodedep *inodedep;
2679
2680 offset = blkoff(dp->i_fs, dp->i_offset);
2681
2682 /*
2683 * Whiteouts do not need diradd dependencies.
2684 */
2685 if (newinum != WINO) {
2686 MALLOC(dap, struct diradd *, sizeof(struct diradd),
2687 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
2688 dap->da_list.wk_type = D_DIRADD;
2689 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2690 dap->da_offset = offset;
2691 dap->da_newinum = newinum;
2692 }
2693
2694 /*
2695 * Allocate a new dirrem and ACQUIRE_LOCK.
2696 */
2697 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2698 pagedep = dirrem->dm_pagedep;
2699 /*
2700 * The possible values for isrmdir:
2701 * 0 - non-directory file rename
2702 * 1 - directory rename within same directory
2703 * inum - directory rename to new directory of given inode number
2704 * When renaming to a new directory, we are both deleting and
2705 * creating a new directory entry, so the link count on the new
2706 * directory should not change. Thus we do not need the followup
2707 * dirrem which is usually done in handle_workitem_remove. We set
2708 * the DIRCHG flag to tell handle_workitem_remove to skip the
2709 * followup dirrem.
2710 */
2711 if (isrmdir > 1)
2712 dirrem->dm_state |= DIRCHG;
2713
2714 /*
2715 * Whiteouts have no additional dependencies,
2716 * so just put the dirrem on the correct list.
2717 */
2718 if (newinum == WINO) {
2719 if ((dirrem->dm_state & COMPLETE) == 0) {
2720 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2721 dm_next);
2722 } else {
2723 dirrem->dm_dirinum = pagedep->pd_ino;
2724 add_to_worklist(&dirrem->dm_list);
2725 }
2726 FREE_LOCK(&lk);
2727 return;
2728 }
2729
2730 /*
2731 * If the COMPLETE flag is clear, then there were no active
2732 * entries and we want to roll back to the previous inode until
2733 * the new inode is committed to disk. If the COMPLETE flag is
2734 * set, then we have deleted an entry that never made it to disk.
2735 * If the entry we deleted resulted from a name change, then the old
2736 * inode reference still resides on disk. Any rollback that we do
2737 * needs to be to that old inode (returned to us in prevdirrem). If
2738 * the entry we deleted resulted from a create, then there is
2739 * no entry on the disk, so we want to roll back to zero rather
2740 * than the uncommitted inode. In either of the COMPLETE cases we
2741 * want to immediately free the unwritten and unreferenced inode.
2742 */
2743 if ((dirrem->dm_state & COMPLETE) == 0) {
2744 dap->da_previous = dirrem;
2745 } else {
2746 if (prevdirrem != NULL) {
2747 dap->da_previous = prevdirrem;
2748 } else {
2749 dap->da_state &= ~DIRCHG;
2750 dap->da_pagedep = pagedep;
2751 }
2752 dirrem->dm_dirinum = pagedep->pd_ino;
2753 add_to_worklist(&dirrem->dm_list);
2754 }
2755 /*
2756 * Link into its inodedep. Put it on the id_bufwait list if the inode
2757 * is not yet written. If it is written, do the post-inode write
2758 * processing to put it on the id_pendinghd list.
2759 */
2760 if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2761 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2762 dap->da_state |= COMPLETE;
2763 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2764 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2765 } else {
2766 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2767 dap, da_pdlist);
2768 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2769 }
2770 FREE_LOCK(&lk);
2771}
2772
2773/*
2774 * Called whenever the link count on an inode is changed.
2775 * It creates an inode dependency so that the new reference(s)
2776 * to the inode cannot be committed to disk until the updated
2777 * inode has been written.
2778 */
2779void
2780softdep_change_linkcnt(ip)
2781 struct inode *ip; /* the inode with the increased link count */
2782{
2783 struct inodedep *inodedep;
2784
2785 ACQUIRE_LOCK(&lk);
2786 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2787 if (ip->i_nlink < ip->i_effnlink) {
2788 FREE_LOCK(&lk);
2789 panic("softdep_change_linkcnt: bad delta");
2790 }
2791 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2792 FREE_LOCK(&lk);
2793}
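
A worked example of the delta recorded above, under the usual soft updates convention that i_effnlink reflects uncommitted changes while i_nlink tracks what the next inode write will put on disk; the values are hypothetical.

#include <stdio.h>

int
main(void)
{
	/* Hypothetical file with two names, one of which was just unlinked. */
	int i_nlink = 2;	/* what the next inode write will put on disk */
	int i_effnlink = 1;	/* effective count after the uncommitted unlink */

	if (i_nlink < i_effnlink)
		return (1);	/* the "bad delta" case panicked on above */
	printf("id_nlinkdelta = %d\n", i_nlink - i_effnlink);	/* 1 */
	return (0);
}
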
2794
2795/*
2796 * Called when the effective link count and the reference count
2797 * on an inode drops to zero. At this point there are no names
2798 * referencing the file in the filesystem and no active file
2799 * references. The space associated with the file will be freed
2800 * as soon as the necessary soft dependencies are cleared.
2801 */
2802void
2803softdep_releasefile(ip)
2804 struct inode *ip; /* inode with the zero effective link count */
2805{
2806 struct inodedep *inodedep;
2807
2808 if (ip->i_effnlink > 0)
2809	 panic("softdep_releasefile: file still referenced");
2810 /*
2811 * We may be called several times as the real reference count
2812 * drops to zero. We only want to account for the space once.
2813 */
2814 if (ip->i_flag & IN_SPACECOUNTED)
2815 return;
2816 /*
2817 * We have to deactivate a snapshot otherwise copyonwrites may
2818 * add blocks and the cleanup may remove blocks after we have
2819 * tried to account for them.
2820 */
2821 if ((ip->i_flags & SF_SNAPSHOT) != 0)
2822 ffs_snapremove(ITOV(ip));
2823 /*
2824 * If we are tracking an nlinkdelta, we have to also remember
2825 * whether we accounted for the freed space yet.
2826 */
2827 ACQUIRE_LOCK(&lk);
2828 if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep)))
2829 inodedep->id_state |= SPACECOUNTED;
2830 FREE_LOCK(&lk);
2831 ip->i_fs->fs_pendingblocks += ip->i_blocks;
2832 ip->i_fs->fs_pendinginodes += 1;
2833 ip->i_flag |= IN_SPACECOUNTED;
2834}
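
The count-once pattern used above can be sketched in user space; the flag value and structure layouts below are illustrative stand-ins rather than the kernel's definitions.

#include <stdio.h>

#define IN_SPACECOUNTED	0x0080	/* illustrative value */

struct fs_model		{ long pendingblocks; long pendinginodes; };
struct inode_model	{ int flag; long blocks; struct fs_model *fs; };

static void
release_file(struct inode_model *ip)
{

	if (ip->flag & IN_SPACECOUNTED)		/* already accounted for */
		return;
	ip->fs->pendingblocks += ip->blocks;
	ip->fs->pendinginodes += 1;
	ip->flag |= IN_SPACECOUNTED;
}

int
main(void)
{
	struct fs_model fs = { 0, 0 };
	struct inode_model ip = { 0, 16, &fs };

	release_file(&ip);
	release_file(&ip);			/* second call is a no-op */
	printf("pending: %ld blocks, %ld inodes\n",
	    fs.pendingblocks, fs.pendinginodes);
	return (0);
}
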
2835
2836/*
2783 * This workitem decrements the inode's link count.
2784 * If the link count reaches zero, the file is removed.
2785 */
2786static void
2787handle_workitem_remove(dirrem)
2788 struct dirrem *dirrem;
2789{
2790 struct proc *p = CURPROC; /* XXX */
2791 struct inodedep *inodedep;
2792 struct vnode *vp;
2793 struct inode *ip;
2794 ino_t oldinum;
2795 int error;
2796
2797 if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2798 softdep_error("handle_workitem_remove: vget", error);
2799 return;
2800 }
2801 ip = VTOI(vp);
2802 ACQUIRE_LOCK(&lk);
2803 if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
2804 FREE_LOCK(&lk);
2805 panic("handle_workitem_remove: lost inodedep");
2806 }
2807 /*
2808 * Normal file deletion.
2809 */
2810 if ((dirrem->dm_state & RMDIR) == 0) {
2811 ip->i_nlink--;
2812 ip->i_flag |= IN_CHANGE;
2813 if (ip->i_nlink < ip->i_effnlink) {
2814 FREE_LOCK(&lk);
2815 panic("handle_workitem_remove: bad file delta");
2816 }
2817 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2818 FREE_LOCK(&lk);
2819 vput(vp);
2820 num_dirrem -= 1;
2821 WORKITEM_FREE(dirrem, D_DIRREM);
2822 return;
2823 }
2824 /*
2825 * Directory deletion. Decrement reference count for both the
2826 * just deleted parent directory entry and the reference for ".".
2827 * Next truncate the directory to length zero. When the
2828 * truncation completes, arrange to have the reference count on
2829 * the parent decremented to account for the loss of "..".
2830 */
2831 ip->i_nlink -= 2;
2832 ip->i_flag |= IN_CHANGE;
2833 if (ip->i_nlink < ip->i_effnlink) {
2834 FREE_LOCK(&lk);
2835 panic("handle_workitem_remove: bad dir delta");
2836 }
2837 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2838 FREE_LOCK(&lk);
2839 if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2840 softdep_error("handle_workitem_remove: truncate", error);
2841 /*
2842	 * Rename a directory to a new parent. Since we are both deleting
2843 * and creating a new directory entry, the link count on the new
2844 * directory should not change. Thus we skip the followup dirrem.
2845 */
2846 if (dirrem->dm_state & DIRCHG) {
2847 vput(vp);
2848 num_dirrem -= 1;
2849 WORKITEM_FREE(dirrem, D_DIRREM);
2850 return;
2851 }
2852 /*
2853 * If the inodedep does not exist, then the zero'ed inode has
2854 * been written to disk. If the allocated inode has never been
2855 * written to disk, then the on-disk inode is zero'ed. In either
2856 * case we can remove the file immediately.
2857 */
2858 ACQUIRE_LOCK(&lk);
2859 dirrem->dm_state = 0;
2860 oldinum = dirrem->dm_oldinum;
2861 dirrem->dm_oldinum = dirrem->dm_dirinum;
2862 if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
2863 check_inode_unwritten(inodedep)) {
2864 FREE_LOCK(&lk);
2865 vput(vp);
2866 handle_workitem_remove(dirrem);
2867 return;
2868 }
2869 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2870 FREE_LOCK(&lk);
2871 vput(vp);
2872}
2873
2874/*
2875 * Inode de-allocation dependencies.
2876 *
2877 * When an inode's link count is reduced to zero, it can be de-allocated. We
2878 * found it convenient to postpone de-allocation until after the inode is
2879 * written to disk with its new link count (zero). At this point, all of the
2880 * on-disk inode's block pointers are nullified and, with careful dependency
2881 * list ordering, all dependencies related to the inode will be satisfied and
2882 * the corresponding dependency structures de-allocated. So, if/when the
2883 * inode is reused, there will be no mixing of old dependencies with new
2884 * ones. This artificial dependency is set up by the block de-allocation
2885 * procedure above (softdep_setup_freeblocks) and completed by the
2886 * following procedure.
2887 */
2888static void
2889handle_workitem_freefile(freefile)
2890 struct freefile *freefile;
2891{
2892 struct fs *fs;
2893 struct inode tip;
2894 struct inodedep *idp;
2895 int error;
2896
2897 fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
2898#ifdef DEBUG
2899 ACQUIRE_LOCK(&lk);
2900 error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
2901 FREE_LOCK(&lk);
2902 if (error)
2903 panic("handle_workitem_freefile: inodedep survived");
2904#endif
2905 tip.i_devvp = freefile->fx_devvp;
2906 tip.i_dev = freefile->fx_devvp->v_rdev;
2907 tip.i_fs = fs;
2837 * This workitem decrements the inode's link count.
2838 * If the link count reaches zero, the file is removed.
2839 */
2840static void
2841handle_workitem_remove(dirrem)
2842 struct dirrem *dirrem;
2843{
2844 struct proc *p = CURPROC; /* XXX */
2845 struct inodedep *inodedep;
2846 struct vnode *vp;
2847 struct inode *ip;
2848 ino_t oldinum;
2849 int error;
2850
2851 if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2852 softdep_error("handle_workitem_remove: vget", error);
2853 return;
2854 }
2855 ip = VTOI(vp);
2856 ACQUIRE_LOCK(&lk);
2857 if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
2858 FREE_LOCK(&lk);
2859 panic("handle_workitem_remove: lost inodedep");
2860 }
2861 /*
2862 * Normal file deletion.
2863 */
2864 if ((dirrem->dm_state & RMDIR) == 0) {
2865 ip->i_nlink--;
2866 ip->i_flag |= IN_CHANGE;
2867 if (ip->i_nlink < ip->i_effnlink) {
2868 FREE_LOCK(&lk);
2869 panic("handle_workitem_remove: bad file delta");
2870 }
2871 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2872 FREE_LOCK(&lk);
2873 vput(vp);
2874 num_dirrem -= 1;
2875 WORKITEM_FREE(dirrem, D_DIRREM);
2876 return;
2877 }
2878 /*
2879 * Directory deletion. Decrement reference count for both the
2880 * just deleted parent directory entry and the reference for ".".
2881 * Next truncate the directory to length zero. When the
2882 * truncation completes, arrange to have the reference count on
2883 * the parent decremented to account for the loss of "..".
2884 */
2885 ip->i_nlink -= 2;
2886 ip->i_flag |= IN_CHANGE;
2887 if (ip->i_nlink < ip->i_effnlink) {
2888 FREE_LOCK(&lk);
2889 panic("handle_workitem_remove: bad dir delta");
2890 }
2891 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2892 FREE_LOCK(&lk);
2893 if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2894 softdep_error("handle_workitem_remove: truncate", error);
2895 /*
2896	 * Rename a directory to a new parent. Since we are both deleting
2897 * and creating a new directory entry, the link count on the new
2898 * directory should not change. Thus we skip the followup dirrem.
2899 */
2900 if (dirrem->dm_state & DIRCHG) {
2901 vput(vp);
2902 num_dirrem -= 1;
2903 WORKITEM_FREE(dirrem, D_DIRREM);
2904 return;
2905 }
2906 /*
2907 * If the inodedep does not exist, then the zero'ed inode has
2908 * been written to disk. If the allocated inode has never been
2909 * written to disk, then the on-disk inode is zero'ed. In either
2910 * case we can remove the file immediately.
2911 */
2912 ACQUIRE_LOCK(&lk);
2913 dirrem->dm_state = 0;
2914 oldinum = dirrem->dm_oldinum;
2915 dirrem->dm_oldinum = dirrem->dm_dirinum;
2916 if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
2917 check_inode_unwritten(inodedep)) {
2918 FREE_LOCK(&lk);
2919 vput(vp);
2920 handle_workitem_remove(dirrem);
2921 return;
2922 }
2923 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2924 FREE_LOCK(&lk);
2925 vput(vp);
2926}
2927
2928/*
2929 * Inode de-allocation dependencies.
2930 *
2931 * When an inode's link count is reduced to zero, it can be de-allocated. We
2932 * found it convenient to postpone de-allocation until after the inode is
2933 * written to disk with its new link count (zero). At this point, all of the
2934 * on-disk inode's block pointers are nullified and, with careful dependency
2935 * list ordering, all dependencies related to the inode will be satisfied and
2936 * the corresponding dependency structures de-allocated. So, if/when the
2937 * inode is reused, there will be no mixing of old dependencies with new
2938 * ones. This artificial dependency is set up by the block de-allocation
2939 * procedure above (softdep_setup_freeblocks) and completed by the
2940 * following procedure.
2941 */
2942static void
2943handle_workitem_freefile(freefile)
2944 struct freefile *freefile;
2945{
2946 struct fs *fs;
2947 struct inode tip;
2948 struct inodedep *idp;
2949 int error;
2950
2951 fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
2952#ifdef DEBUG
2953 ACQUIRE_LOCK(&lk);
2954 error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
2955 FREE_LOCK(&lk);
2956 if (error)
2957 panic("handle_workitem_freefile: inodedep survived");
2958#endif
2959 tip.i_devvp = freefile->fx_devvp;
2960 tip.i_dev = freefile->fx_devvp->v_rdev;
2961 tip.i_fs = fs;
2962 fs->fs_pendinginodes -= 1;
2908 if ((error = ffs_freefile(&tip, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2909 softdep_error("handle_workitem_freefile", error);
2910 WORKITEM_FREE(freefile, D_FREEFILE);
2911}
2912
2913/*
2914 * Disk writes.
2915 *
2916 * The dependency structures constructed above are most actively used when file
2917 * system blocks are written to disk. No constraints are placed on when a
2918 * block can be written, but unsatisfied update dependencies are made safe by
2919 * modifying (or replacing) the source memory for the duration of the disk
2920 * write. When the disk write completes, the memory block is again brought
2921 * up-to-date.
2922 *
2923 * In-core inode structure reclamation.
2924 *
2925 * Because there are a finite number of "in-core" inode structures, they are
2926 * reused regularly. By transferring all inode-related dependencies to the
2927 * in-memory inode block and indexing them separately (via "inodedep"s), we
2928 * can allow "in-core" inode structures to be reused at any time and avoid
2929 * any increase in contention.
2930 *
2931 * Called just before entering the device driver to initiate a new disk I/O.
2932 * The buffer must be locked, thus, no I/O completion operations can occur
2933 * while we are manipulating its associated dependencies.
2934 */
2935static void
2936softdep_disk_io_initiation(bp)
2937 struct buf *bp; /* structure describing disk write to occur */
2938{
2939 struct worklist *wk, *nextwk;
2940 struct indirdep *indirdep;
2941
2942 /*
2943 * We only care about write operations. There should never
2944 * be dependencies for reads.
2945 */
2946 if (bp->b_iocmd == BIO_READ)
2947 panic("softdep_disk_io_initiation: read");
2948 /*
2949 * Do any necessary pre-I/O processing.
2950 */
2951 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2952 nextwk = LIST_NEXT(wk, wk_list);
2953 switch (wk->wk_type) {
2954
2955 case D_PAGEDEP:
2956 initiate_write_filepage(WK_PAGEDEP(wk), bp);
2957 continue;
2958
2959 case D_INODEDEP:
2960 initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2961 continue;
2962
2963 case D_INDIRDEP:
2964 indirdep = WK_INDIRDEP(wk);
2965 if (indirdep->ir_state & GOINGAWAY)
2966 panic("disk_io_initiation: indirdep gone");
2967 /*
2968 * If there are no remaining dependencies, this
2969 * will be writing the real pointers, so the
2970 * dependency can be freed.
2971 */
2972 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2973 indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2974 brelse(indirdep->ir_savebp);
2975 /* inline expand WORKLIST_REMOVE(wk); */
2976 wk->wk_state &= ~ONWORKLIST;
2977 LIST_REMOVE(wk, wk_list);
2978 WORKITEM_FREE(indirdep, D_INDIRDEP);
2979 continue;
2980 }
2981 /*
2982 * Replace up-to-date version with safe version.
2983 */
2984 MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2985 M_INDIRDEP, M_SOFTDEP_FLAGS);
2986 ACQUIRE_LOCK(&lk);
2987 indirdep->ir_state &= ~ATTACHED;
2988 indirdep->ir_state |= UNDONE;
2989 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2990 bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2991 bp->b_bcount);
2992 FREE_LOCK(&lk);
2993 continue;
2994
2995 case D_MKDIR:
2996 case D_BMSAFEMAP:
2997 case D_ALLOCDIRECT:
2998 case D_ALLOCINDIR:
2999 continue;
3000
3001 default:
3002 panic("handle_disk_io_initiation: Unexpected type %s",
3003 TYPENAME(wk->wk_type));
3004 /* NOTREACHED */
3005 }
3006 }
3007}
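
The substitution performed for the D_INDIRDEP case above amounts to a save/swap/restore around the write, undone again in softdep_disk_write_complete below. A minimal user-space sketch follows, with plain arrays standing in for bp->b_data, ir_savebp->b_data and ir_saveddata.

#include <stdio.h>
#include <string.h>

#define BLKSZ	8

int
main(void)
{
	int data[BLKSZ]     = { 1, 2, 3 };	/* up-to-date block pointers */
	int safecopy[BLKSZ] = { 1 };		/* only the committed pointer */
	int saved[BLKSZ];

	/* io initiation: stash the real contents, substitute the safe ones. */
	memcpy(saved, data, sizeof(data));
	memcpy(data, safecopy, sizeof(data));
	printf("written to disk: %d %d %d\n", data[0], data[1], data[2]);

	/* write completion: bring the in-memory block up to date again. */
	memcpy(data, saved, sizeof(data));
	printf("back in memory:  %d %d %d\n", data[0], data[1], data[2]);
	return (0);
}
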
3008
3009/*
3010 * Called from within the procedure above to deal with unsatisfied
3011 * allocation dependencies in a directory. The buffer must be locked,
3012 * thus, no I/O completion operations can occur while we are
3013 * manipulating its associated dependencies.
3014 */
3015static void
3016initiate_write_filepage(pagedep, bp)
3017 struct pagedep *pagedep;
3018 struct buf *bp;
3019{
3020 struct diradd *dap;
3021 struct direct *ep;
3022 int i;
3023
3024 if (pagedep->pd_state & IOSTARTED) {
3025 /*
3026 * This can only happen if there is a driver that does not
3027 * understand chaining. Here biodone will reissue the call
3028 * to strategy for the incomplete buffers.
3029 */
3030 printf("initiate_write_filepage: already started\n");
3031 return;
3032 }
3033 pagedep->pd_state |= IOSTARTED;
3034 ACQUIRE_LOCK(&lk);
3035 for (i = 0; i < DAHASHSZ; i++) {
3036 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3037 ep = (struct direct *)
3038 ((char *)bp->b_data + dap->da_offset);
3039 if (ep->d_ino != dap->da_newinum) {
3040 FREE_LOCK(&lk);
3041 panic("%s: dir inum %d != new %d",
3042 "initiate_write_filepage",
3043 ep->d_ino, dap->da_newinum);
3044 }
3045 if (dap->da_state & DIRCHG)
3046 ep->d_ino = dap->da_previous->dm_oldinum;
3047 else
3048 ep->d_ino = 0;
3049 dap->da_state &= ~ATTACHED;
3050 dap->da_state |= UNDONE;
3051 }
3052 }
3053 FREE_LOCK(&lk);
3054}
3055
3056/*
3057 * Called from within the procedure above to deal with unsatisfied
3058 * allocation dependencies in an inodeblock. The buffer must be
3059 * locked, thus, no I/O completion operations can occur while we
3060 * are manipulating its associated dependencies.
3061 */
3062static void
3063initiate_write_inodeblock(inodedep, bp)
3064 struct inodedep *inodedep;
3065 struct buf *bp; /* The inode block */
3066{
3067 struct allocdirect *adp, *lastadp;
3068 struct dinode *dp;
3069 struct fs *fs;
3070 ufs_lbn_t prevlbn = 0;
3071 int i, deplist;
3072
3073 if (inodedep->id_state & IOSTARTED)
3074 panic("initiate_write_inodeblock: already started");
3075 inodedep->id_state |= IOSTARTED;
3076 fs = inodedep->id_fs;
3077 dp = (struct dinode *)bp->b_data +
3078 ino_to_fsbo(fs, inodedep->id_ino);
3079 /*
3080 * If the bitmap is not yet written, then the allocated
3081 * inode cannot be written to disk.
3082 */
3083 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3084 if (inodedep->id_savedino != NULL)
3085 panic("initiate_write_inodeblock: already doing I/O");
3086 MALLOC(inodedep->id_savedino, struct dinode *,
3087 sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3088 *inodedep->id_savedino = *dp;
3089 bzero((caddr_t)dp, sizeof(struct dinode));
3090 return;
3091 }
3092 /*
3093 * If no dependencies, then there is nothing to roll back.
3094 */
3095 inodedep->id_savedsize = dp->di_size;
3096 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3097 return;
3098 /*
3099 * Set the dependencies to busy.
3100 */
3101 ACQUIRE_LOCK(&lk);
3102 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3103 adp = TAILQ_NEXT(adp, ad_next)) {
3104#ifdef DIAGNOSTIC
3105 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3106 FREE_LOCK(&lk);
3107 panic("softdep_write_inodeblock: lbn order");
3108 }
3109 prevlbn = adp->ad_lbn;
3110 if (adp->ad_lbn < NDADDR &&
3111 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3112 FREE_LOCK(&lk);
3113 panic("%s: direct pointer #%ld mismatch %d != %d",
3114 "softdep_write_inodeblock", adp->ad_lbn,
3115 dp->di_db[adp->ad_lbn], adp->ad_newblkno);
3116 }
3117 if (adp->ad_lbn >= NDADDR &&
3118 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3119 FREE_LOCK(&lk);
3120 panic("%s: indirect pointer #%ld mismatch %d != %d",
3121 "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3122 dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
3123 }
3124 deplist |= 1 << adp->ad_lbn;
3125 if ((adp->ad_state & ATTACHED) == 0) {
3126 FREE_LOCK(&lk);
3127 panic("softdep_write_inodeblock: Unknown state 0x%x",
3128 adp->ad_state);
3129 }
3130#endif /* DIAGNOSTIC */
3131 adp->ad_state &= ~ATTACHED;
3132 adp->ad_state |= UNDONE;
3133 }
3134 /*
3135 * The on-disk inode cannot claim to be any larger than the last
3136 * fragment that has been written. Otherwise, the on-disk inode
3137 * might have fragments that were not the last block in the file
3138 * which would corrupt the filesystem.
3139 */
3140 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3141 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3142 if (adp->ad_lbn >= NDADDR)
3143 break;
3144 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3145 /* keep going until hitting a rollback to a frag */
3146 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3147 continue;
3148 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3149 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3150#ifdef DIAGNOSTIC
3151 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3152 FREE_LOCK(&lk);
3153 panic("softdep_write_inodeblock: lost dep1");
3154 }
3155#endif /* DIAGNOSTIC */
3156 dp->di_db[i] = 0;
3157 }
3158 for (i = 0; i < NIADDR; i++) {
3159#ifdef DIAGNOSTIC
3160 if (dp->di_ib[i] != 0 &&
3161 (deplist & ((1 << NDADDR) << i)) == 0) {
3162 FREE_LOCK(&lk);
3163 panic("softdep_write_inodeblock: lost dep2");
3164 }
3165#endif /* DIAGNOSTIC */
3166 dp->di_ib[i] = 0;
3167 }
3168 FREE_LOCK(&lk);
3169 return;
3170 }
3171 /*
3172 * If we have zero'ed out the last allocated block of the file,
3173 * roll back the size to the last currently allocated block.
3174	 * We know that this last allocated block is full-sized, as
3175 * we already checked for fragments in the loop above.
3176 */
3177 if (lastadp != NULL &&
3178 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3179 for (i = lastadp->ad_lbn; i >= 0; i--)
3180 if (dp->di_db[i] != 0)
3181 break;
3182 dp->di_size = (i + 1) * fs->fs_bsize;
3183 }
3184 /*
3185 * The only dependencies are for indirect blocks.
3186 *
3187 * The file size for indirect block additions is not guaranteed.
3188 * Such a guarantee would be non-trivial to achieve. The conventional
3189 * synchronous write implementation also does not make this guarantee.
3190 * Fsck should catch and fix discrepancies. Arguably, the file size
3191 * can be over-estimated without destroying integrity when the file
3192 * moves into the indirect blocks (i.e., is large). If we want to
3193 * postpone fsck, we are stuck with this argument.
3194 */
3195 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3196 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3197 FREE_LOCK(&lk);
3198}
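
A worked example of the size rollback performed above (dp->di_size = fs_bsize * ad_lbn + ad_oldsize), using arbitrary example values.

#include <stdio.h>

int
main(void)
{
	long fs_bsize = 8192;	/* example block size */
	long ad_lbn = 3;	/* first rolled-back direct block */
	long ad_oldsize = 2048;	/* the old allocation there was a fragment */

	/* The inode may not claim anything beyond the last safe fragment. */
	printf("rolled-back di_size = %ld\n",
	    fs_bsize * ad_lbn + ad_oldsize);	/* 26624 */
	return (0);
}
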
3199
3200/*
3201 * This routine is called during the completion interrupt
3202 * service routine for a disk write (from the procedure called
3203 * by the device driver to inform the file system caches of
3204 * a request completion). It should be called early in this
3205 * procedure, before the block is made available to other
3206 * processes or other routines are called.
3207 */
3208static void
3209softdep_disk_write_complete(bp)
3210 struct buf *bp; /* describes the completed disk write */
3211{
3212 struct worklist *wk;
3213 struct workhead reattach;
3214 struct newblk *newblk;
3215 struct allocindir *aip;
3216 struct allocdirect *adp;
3217 struct indirdep *indirdep;
3218 struct inodedep *inodedep;
3219 struct bmsafemap *bmsafemap;
3220
3221#ifdef DEBUG
3222 if (lk.lkt_held != -1)
3223 panic("softdep_disk_write_complete: lock is held");
3224 lk.lkt_held = -2;
3225#endif
3226 LIST_INIT(&reattach);
3227 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3228 WORKLIST_REMOVE(wk);
3229 switch (wk->wk_type) {
3230
3231 case D_PAGEDEP:
3232 if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3233 WORKLIST_INSERT(&reattach, wk);
3234 continue;
3235
3236 case D_INODEDEP:
3237 if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3238 WORKLIST_INSERT(&reattach, wk);
3239 continue;
3240
3241 case D_BMSAFEMAP:
3242 bmsafemap = WK_BMSAFEMAP(wk);
3243 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3244 newblk->nb_state |= DEPCOMPLETE;
3245 newblk->nb_bmsafemap = NULL;
3246 LIST_REMOVE(newblk, nb_deps);
3247 }
3248 while ((adp =
3249 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3250 adp->ad_state |= DEPCOMPLETE;
3251 adp->ad_buf = NULL;
3252 LIST_REMOVE(adp, ad_deps);
3253 handle_allocdirect_partdone(adp);
3254 }
3255 while ((aip =
3256 LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3257 aip->ai_state |= DEPCOMPLETE;
3258 aip->ai_buf = NULL;
3259 LIST_REMOVE(aip, ai_deps);
3260 handle_allocindir_partdone(aip);
3261 }
3262 while ((inodedep =
3263 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3264 inodedep->id_state |= DEPCOMPLETE;
3265 LIST_REMOVE(inodedep, id_deps);
3266 inodedep->id_buf = NULL;
3267 }
3268 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3269 continue;
3270
3271 case D_MKDIR:
3272 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3273 continue;
3274
3275 case D_ALLOCDIRECT:
3276 adp = WK_ALLOCDIRECT(wk);
3277 adp->ad_state |= COMPLETE;
3278 handle_allocdirect_partdone(adp);
3279 continue;
3280
3281 case D_ALLOCINDIR:
3282 aip = WK_ALLOCINDIR(wk);
3283 aip->ai_state |= COMPLETE;
3284 handle_allocindir_partdone(aip);
3285 continue;
3286
3287 case D_INDIRDEP:
3288 indirdep = WK_INDIRDEP(wk);
3289 if (indirdep->ir_state & GOINGAWAY) {
3290 lk.lkt_held = -1;
3291 panic("disk_write_complete: indirdep gone");
3292 }
3293 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3294 FREE(indirdep->ir_saveddata, M_INDIRDEP);
3295 indirdep->ir_saveddata = 0;
3296 indirdep->ir_state &= ~UNDONE;
3297 indirdep->ir_state |= ATTACHED;
3298 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3299 handle_allocindir_partdone(aip);
3300 if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
3301 lk.lkt_held = -1;
3302 panic("disk_write_complete: not gone");
3303 }
3304 }
3305 WORKLIST_INSERT(&reattach, wk);
3306 if ((bp->b_flags & B_DELWRI) == 0)
3307 stat_indir_blk_ptrs++;
3308 bdirty(bp);
3309 continue;
3310
3311 default:
3312 lk.lkt_held = -1;
3313 panic("handle_disk_write_complete: Unknown type %s",
3314 TYPENAME(wk->wk_type));
3315 /* NOTREACHED */
3316 }
3317 }
3318 /*
3319 * Reattach any requests that must be redone.
3320 */
3321 while ((wk = LIST_FIRST(&reattach)) != NULL) {
3322 WORKLIST_REMOVE(wk);
3323 WORKLIST_INSERT(&bp->b_dep, wk);
3324 }
3325#ifdef DEBUG
3326 if (lk.lkt_held != -2)
3327 panic("softdep_disk_write_complete: lock lost");
3328 lk.lkt_held = -1;
3329#endif
3330}
3331
3332/*
3333 * Called from within softdep_disk_write_complete above. Note that
3334 * this routine is always called from interrupt level with further
3335 * splbio interrupts blocked.
3336 */
3337static void
3338handle_allocdirect_partdone(adp)
3339 struct allocdirect *adp; /* the completed allocdirect */
3340{
3341 struct allocdirect *listadp;
3342 struct inodedep *inodedep;
3343 long bsize, delay;
3344
3345 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3346 return;
3347 if (adp->ad_buf != NULL) {
3348 lk.lkt_held = -1;
3349 panic("handle_allocdirect_partdone: dangling dep");
3350 }
3351 /*
3352 * The on-disk inode cannot claim to be any larger than the last
3353 * fragment that has been written. Otherwise, the on-disk inode
3354 * might have fragments that were not the last block in the file
3355 * which would corrupt the filesystem. Thus, we cannot free any
3356 * allocdirects after one whose ad_oldblkno claims a fragment as
3357 * these blocks must be rolled back to zero before writing the inode.
3358 * We check the currently active set of allocdirects in id_inoupdt.
3359 */
3360 inodedep = adp->ad_inodedep;
3361 bsize = inodedep->id_fs->fs_bsize;
3362 TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3363 /* found our block */
3364 if (listadp == adp)
3365 break;
3366 /* continue if ad_oldlbn is not a fragment */
3367 if (listadp->ad_oldsize == 0 ||
3368 listadp->ad_oldsize == bsize)
3369 continue;
3370 /* hit a fragment */
3371 return;
3372 }
3373 /*
3374 * If we have reached the end of the current list without
3375 * finding the just finished dependency, then it must be
3376 * on the future dependency list. Future dependencies cannot
3377 * be freed until they are moved to the current list.
3378 */
3379 if (listadp == NULL) {
3380#ifdef DEBUG
3381 TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3382 /* found our block */
3383 if (listadp == adp)
3384 break;
3385 if (listadp == NULL) {
3386 lk.lkt_held = -1;
3387 panic("handle_allocdirect_partdone: lost dep");
3388 }
3389#endif /* DEBUG */
3390 return;
3391 }
3392 /*
3393 * If we have found the just finished dependency, then free
3394 * it along with anything that follows it that is complete.
3395 * If the inode still has a bitmap dependency, then it has
3396 * never been written to disk, hence the on-disk inode cannot
3397 * reference the old fragment so we can free it without delay.
3398 */
3399 delay = (inodedep->id_state & DEPCOMPLETE);
3400 for (; adp; adp = listadp) {
3401 listadp = TAILQ_NEXT(adp, ad_next);
3402 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3403 return;
3404 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
3405 }
3406}
3407
3408/*
3409 * Called from within softdep_disk_write_complete above. Note that
3410 * this routine is always called from interrupt level with further
3411 * splbio interrupts blocked.
3412 */
3413static void
3414handle_allocindir_partdone(aip)
3415 struct allocindir *aip; /* the completed allocindir */
3416{
3417 struct indirdep *indirdep;
3418
3419 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3420 return;
3421 if (aip->ai_buf != NULL) {
3422 lk.lkt_held = -1;
3423 panic("handle_allocindir_partdone: dangling dependency");
3424 }
3425 indirdep = aip->ai_indirdep;
3426 if (indirdep->ir_state & UNDONE) {
3427 LIST_REMOVE(aip, ai_next);
3428 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3429 return;
3430 }
3431 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3432 aip->ai_newblkno;
3433 LIST_REMOVE(aip, ai_next);
3434 if (aip->ai_freefrag != NULL)
3435 add_to_worklist(&aip->ai_freefrag->ff_list);
3436 WORKITEM_FREE(aip, D_ALLOCINDIR);
3437}
3438
3439/*
3440 * Called from within softdep_disk_write_complete above to restore
3441 * in-memory inode block contents to their most up-to-date state. Note
3442 * that this routine is always called from interrupt level with further
3443 * splbio interrupts blocked.
3444 */
3445static int
3446handle_written_inodeblock(inodedep, bp)
3447 struct inodedep *inodedep;
3448 struct buf *bp; /* buffer containing the inode block */
3449{
3450 struct worklist *wk, *filefree;
3451 struct allocdirect *adp, *nextadp;
3452 struct dinode *dp;
3453 int hadchanges;
3454
3455 if ((inodedep->id_state & IOSTARTED) == 0) {
3456 lk.lkt_held = -1;
3457 panic("handle_written_inodeblock: not started");
3458 }
3459 inodedep->id_state &= ~IOSTARTED;
3460 inodedep->id_state |= COMPLETE;
3461 dp = (struct dinode *)bp->b_data +
3462 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3463 /*
3464 * If we had to rollback the inode allocation because of
3465 * bitmaps being incomplete, then simply restore it.
3466 * Keep the block dirty so that it will not be reclaimed until
3467 * all associated dependencies have been cleared and the
3468 * corresponding updates written to disk.
3469 */
3470 if (inodedep->id_savedino != NULL) {
3471 *dp = *inodedep->id_savedino;
3472 FREE(inodedep->id_savedino, M_INODEDEP);
3473 inodedep->id_savedino = NULL;
3474 if ((bp->b_flags & B_DELWRI) == 0)
3475 stat_inode_bitmap++;
3476 bdirty(bp);
3477 return (1);
3478 }
3479 /*
3480 * Roll forward anything that had to be rolled back before
3481 * the inode could be updated.
3482 */
3483 hadchanges = 0;
3484 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3485 nextadp = TAILQ_NEXT(adp, ad_next);
3486 if (adp->ad_state & ATTACHED) {
3487 lk.lkt_held = -1;
3488 panic("handle_written_inodeblock: new entry");
3489 }
3490 if (adp->ad_lbn < NDADDR) {
3491 if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
3492 lk.lkt_held = -1;
3493 panic("%s: %s #%ld mismatch %d != %d",
3494 "handle_written_inodeblock",
3495 "direct pointer", adp->ad_lbn,
3496 dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3497 }
3498 dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3499 } else {
3500 if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
3501 lk.lkt_held = -1;
3502 panic("%s: %s #%ld allocated as %d",
3503 "handle_written_inodeblock",
3504 "indirect pointer", adp->ad_lbn - NDADDR,
3505 dp->di_ib[adp->ad_lbn - NDADDR]);
3506 }
3507 dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3508 }
3509 adp->ad_state &= ~UNDONE;
3510 adp->ad_state |= ATTACHED;
3511 hadchanges = 1;
3512 }
3513 if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3514 stat_direct_blk_ptrs++;
3515 /*
3516 * Reset the file size to its most up-to-date value.
3517 */
3518 if (inodedep->id_savedsize == -1) {
3519 lk.lkt_held = -1;
3520 panic("handle_written_inodeblock: bad size");
3521 }
3522 if (dp->di_size != inodedep->id_savedsize) {
3523 dp->di_size = inodedep->id_savedsize;
3524 hadchanges = 1;
3525 }
3526 inodedep->id_savedsize = -1;
3527 /*
3528 * If there were any rollbacks in the inode block, then it must be
3529	 * marked dirty so that it will eventually get written back in
3530 * its correct form.
3531 */
3532 if (hadchanges)
3533 bdirty(bp);
3534 /*
3535 * Process any allocdirects that completed during the update.
3536 */
3537 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3538 handle_allocdirect_partdone(adp);
3539 /*
3540 * Process deallocations that were held pending until the
3541 * inode had been written to disk. Freeing of the inode
3542 * is delayed until after all blocks have been freed to
3543 * avoid creation of new <vfsid, inum, lbn> triples
3544 * before the old ones have been deleted.
3545 */
3546 filefree = NULL;
3547 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3548 WORKLIST_REMOVE(wk);
3549 switch (wk->wk_type) {
3550
3551 case D_FREEFILE:
3552 /*
3553 * We defer adding filefree to the worklist until
3554 * all other additions have been made to ensure
3555 * that it will be done after all the old blocks
3556 * have been freed.
3557 */
3558 if (filefree != NULL) {
3559 lk.lkt_held = -1;
3560 panic("handle_written_inodeblock: filefree");
3561 }
3562 filefree = wk;
3563 continue;
3564
3565 case D_MKDIR:
3566 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3567 continue;
3568
3569 case D_DIRADD:
3570 diradd_inode_written(WK_DIRADD(wk), inodedep);
3571 continue;
3572
3573 case D_FREEBLKS:
3574 case D_FREEFRAG:
3575 case D_DIRREM:
3576 add_to_worklist(wk);
3577 continue;
3578
3579 default:
3580 lk.lkt_held = -1;
3581 panic("handle_written_inodeblock: Unknown type %s",
3582 TYPENAME(wk->wk_type));
3583 /* NOTREACHED */
3584 }
3585 }
3586 if (filefree != NULL) {
3587 if (free_inodedep(inodedep) == 0) {
3588 lk.lkt_held = -1;
3589 panic("handle_written_inodeblock: live inodedep");
3590 }
3591 add_to_worklist(filefree);
3592 return (0);
3593 }
3594
3595 /*
3596 * If no outstanding dependencies, free it.
3597 */
3598 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3599 return (0);
3600 return (hadchanges);
3601}
3602
3603/*
3604 * Process a diradd entry after its dependent inode has been written.
3605 * This routine must be called with splbio interrupts blocked.
3606 */
3607static void
3608diradd_inode_written(dap, inodedep)
3609 struct diradd *dap;
3610 struct inodedep *inodedep;
3611{
3612 struct pagedep *pagedep;
3613
3614 dap->da_state |= COMPLETE;
3615 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3616 if (dap->da_state & DIRCHG)
3617 pagedep = dap->da_previous->dm_pagedep;
3618 else
3619 pagedep = dap->da_pagedep;
3620 LIST_REMOVE(dap, da_pdlist);
3621 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3622 }
3623 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3624}
3625
3626/*
3627 * Handle the completion of a mkdir dependency.
3628 */
3629static void
3630handle_written_mkdir(mkdir, type)
3631 struct mkdir *mkdir;
3632 int type;
3633{
3634 struct diradd *dap;
3635 struct pagedep *pagedep;
3636
3637 if (mkdir->md_state != type) {
3638 lk.lkt_held = -1;
3639 panic("handle_written_mkdir: bad type");
3640 }
3641 dap = mkdir->md_diradd;
3642 dap->da_state &= ~type;
3643 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3644 dap->da_state |= DEPCOMPLETE;
3645 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3646 if (dap->da_state & DIRCHG)
3647 pagedep = dap->da_previous->dm_pagedep;
3648 else
3649 pagedep = dap->da_pagedep;
3650 LIST_REMOVE(dap, da_pdlist);
3651 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3652 }
3653 LIST_REMOVE(mkdir, md_mkdirs);
3654 WORKITEM_FREE(mkdir, D_MKDIR);
3655}
3656
3657/*
3658 * Called from within softdep_disk_write_complete above.
3659 * A write operation was just completed. Removed inodes can
3660 * now be freed and associated block pointers may be committed.
3661 * Note that this routine is always called from interrupt level
3662 * with further splbio interrupts blocked.
3663 */
3664static int
3665handle_written_filepage(pagedep, bp)
3666 struct pagedep *pagedep;
3667 struct buf *bp; /* buffer containing the written page */
3668{
3669 struct dirrem *dirrem;
3670 struct diradd *dap, *nextdap;
3671 struct direct *ep;
3672 int i, chgs;
3673
3674 if ((pagedep->pd_state & IOSTARTED) == 0) {
3675 lk.lkt_held = -1;
3676 panic("handle_written_filepage: not started");
3677 }
3678 pagedep->pd_state &= ~IOSTARTED;
3679 /*
3680 * Process any directory removals that have been committed.
3681 */
3682 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3683 LIST_REMOVE(dirrem, dm_next);
3684 dirrem->dm_dirinum = pagedep->pd_ino;
3685 add_to_worklist(&dirrem->dm_list);
3686 }
3687 /*
3688 * Free any directory additions that have been committed.
3689 */
3690 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3691 free_diradd(dap);
3692 /*
3693 * Uncommitted directory entries must be restored.
3694 */
3695 for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3696 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3697 dap = nextdap) {
3698 nextdap = LIST_NEXT(dap, da_pdlist);
3699 if (dap->da_state & ATTACHED) {
3700 lk.lkt_held = -1;
3701 panic("handle_written_filepage: attached");
3702 }
3703 ep = (struct direct *)
3704 ((char *)bp->b_data + dap->da_offset);
3705 ep->d_ino = dap->da_newinum;
3706 dap->da_state &= ~UNDONE;
3707 dap->da_state |= ATTACHED;
3708 chgs = 1;
3709 /*
3710 * If the inode referenced by the directory has
3711 * been written out, then the dependency can be
3712 * moved to the pending list.
3713 */
3714 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3715 LIST_REMOVE(dap, da_pdlist);
3716 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3717 da_pdlist);
3718 }
3719 }
3720 }
3721 /*
3722 * If there were any rollbacks in the directory, then it must be
3723 * marked dirty so that it will eventually get written back in
3724 * its correct form.
3725 */
3726 if (chgs) {
3727 if ((bp->b_flags & B_DELWRI) == 0)
3728 stat_dir_entry++;
3729 bdirty(bp);
3730 }
3731 /*
3732 * If no dependencies remain, the pagedep will be freed.
3733 * Otherwise it will remain to update the page before it
3734 * is written back to disk.
3735 */
3736 if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3737 for (i = 0; i < DAHASHSZ; i++)
3738 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3739 break;
3740 if (i == DAHASHSZ) {
3741 LIST_REMOVE(pagedep, pd_hash);
3742 WORKITEM_FREE(pagedep, D_PAGEDEP);
3743 return (0);
3744 }
3745 }
3746 return (1);
3747}
3748
3749/*
3750 * Writing back in-core inode structures.
3751 *
3752 * The file system only accesses an inode's contents when it occupies an
3753 * "in-core" inode structure. These "in-core" structures are separate from
3754 * the page frames used to cache inode blocks. Only the latter are
3755 * transferred to/from the disk. So, when the updated contents of the
3756 * "in-core" inode structure are copied to the corresponding in-memory inode
3757 * block, the dependencies are also transferred. The following procedure is
3758 * called when copying a dirty "in-core" inode to a cached inode block.
3759 */
3760
3761/*
3762 * Called when an inode is loaded from disk. If the effective link count
3763 * differed from the actual link count when it was last flushed, then we
3764 * need to ensure that the correct effective link count is put back.
3765 */
3766void
3767softdep_load_inodeblock(ip)
3768 struct inode *ip; /* the "in_core" copy of the inode */
3769{
3770 struct inodedep *inodedep;
3771
3772 /*
3773 * Check for alternate nlink count.
3774 */
3775 ip->i_effnlink = ip->i_nlink;
3776 ACQUIRE_LOCK(&lk);
3777 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3778 FREE_LOCK(&lk);
3779 return;
3780 }
3781 ip->i_effnlink -= inodedep->id_nlinkdelta;
2963 if ((error = ffs_freefile(&tip, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2964 softdep_error("handle_workitem_freefile", error);
2965 WORKITEM_FREE(freefile, D_FREEFILE);
2966}
2967
2968/*
2969 * Disk writes.
2970 *
2971 * The dependency structures constructed above are most actively used when file
2972 * system blocks are written to disk. No constraints are placed on when a
2973 * block can be written, but unsatisfied update dependencies are made safe by
2974 * modifying (or replacing) the source memory for the duration of the disk
2975 * write. When the disk write completes, the memory block is again brought
2976 * up-to-date.
2977 *
2978 * In-core inode structure reclamation.
2979 *
2980 * Because there are a finite number of "in-core" inode structures, they are
2981 * reused regularly. By transferring all inode-related dependencies to the
2982 * in-memory inode block and indexing them separately (via "inodedep"s), we
2983 * can allow "in-core" inode structures to be reused at any time and avoid
2984 * any increase in contention.
2985 *
2986 * Called just before entering the device driver to initiate a new disk I/O.
2987 * The buffer must be locked; thus, no I/O completion operations can occur
2988 * while we are manipulating its associated dependencies.
2989 */
2990static void
2991softdep_disk_io_initiation(bp)
2992 struct buf *bp; /* structure describing disk write to occur */
2993{
2994 struct worklist *wk, *nextwk;
2995 struct indirdep *indirdep;
2996
2997 /*
2998 * We only care about write operations. There should never
2999 * be dependencies for reads.
3000 */
3001 if (bp->b_iocmd == BIO_READ)
3002 panic("softdep_disk_io_initiation: read");
3003 /*
3004 * Do any necessary pre-I/O processing.
3005 */
3006 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
3007 nextwk = LIST_NEXT(wk, wk_list);
3008 switch (wk->wk_type) {
3009
3010 case D_PAGEDEP:
3011 initiate_write_filepage(WK_PAGEDEP(wk), bp);
3012 continue;
3013
3014 case D_INODEDEP:
3015 initiate_write_inodeblock(WK_INODEDEP(wk), bp);
3016 continue;
3017
3018 case D_INDIRDEP:
3019 indirdep = WK_INDIRDEP(wk);
3020 if (indirdep->ir_state & GOINGAWAY)
3021 panic("disk_io_initiation: indirdep gone");
3022 /*
3023 * If there are no remaining dependencies, this
3024 * will be writing the real pointers, so the
3025 * dependency can be freed.
3026 */
3027 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3028 indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
3029 brelse(indirdep->ir_savebp);
3030 /* inline expand WORKLIST_REMOVE(wk); */
3031 wk->wk_state &= ~ONWORKLIST;
3032 LIST_REMOVE(wk, wk_list);
3033 WORKITEM_FREE(indirdep, D_INDIRDEP);
3034 continue;
3035 }
3036 /*
3037 * Replace up-to-date version with safe version.
3038 */
3039 MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3040 M_INDIRDEP, M_SOFTDEP_FLAGS);
3041 ACQUIRE_LOCK(&lk);
3042 indirdep->ir_state &= ~ATTACHED;
3043 indirdep->ir_state |= UNDONE;
3044 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3045 bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3046 bp->b_bcount);
3047 FREE_LOCK(&lk);
3048 continue;
3049
3050 case D_MKDIR:
3051 case D_BMSAFEMAP:
3052 case D_ALLOCDIRECT:
3053 case D_ALLOCINDIR:
3054 continue;
3055
3056 default:
3057 panic("handle_disk_io_initiation: Unexpected type %s",
3058 TYPENAME(wk->wk_type));
3059 /* NOTREACHED */
3060 }
3061 }
3062}
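/*
 * Illustrative sketch (hypothetical names, not kernel code): the D_INDIRDEP
 * case above makes an indirect block safe to write by saving its up-to-date
 * contents and substituting the last-written copy for the duration of the
 * I/O; softdep_disk_write_complete() restores the saved data afterwards.
 * Reduced to plain libc calls, the swap looks roughly like this:
 */
#include <stdlib.h>
#include <string.h>

struct blockdep {		/* stand-in for struct indirdep */
	char	*saveddata;	/* up-to-date contents, parked during the write */
	char	*safecopy;	/* contents as of the last completed write */
	size_t	 size;
};

/* Before the write: park the current data and expose the safe copy. */
static void
example_write_initiate(struct blockdep *dep, char *bufdata)
{
	dep->saveddata = malloc(dep->size);
	memcpy(dep->saveddata, bufdata, dep->size);
	memcpy(bufdata, dep->safecopy, dep->size);
}

/* After the write: bring the buffer back up to date; caller redirties it. */
static void
example_write_complete(struct blockdep *dep, char *bufdata)
{
	memcpy(bufdata, dep->saveddata, dep->size);
	free(dep->saveddata);
	dep->saveddata = NULL;
}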
3063
3064/*
3065 * Called from within the procedure above to deal with unsatisfied
3066 * allocation dependencies in a directory. The buffer must be locked,
3067 * thus, no I/O completion operations can occur while we are
3068 * manipulating its associated dependencies.
3069 */
3070static void
3071initiate_write_filepage(pagedep, bp)
3072 struct pagedep *pagedep;
3073 struct buf *bp;
3074{
3075 struct diradd *dap;
3076 struct direct *ep;
3077 int i;
3078
3079 if (pagedep->pd_state & IOSTARTED) {
3080 /*
3081 * This can only happen if there is a driver that does not
3082 * understand chaining. Here biodone will reissue the call
3083 * to strategy for the incomplete buffers.
3084 */
3085 printf("initiate_write_filepage: already started\n");
3086 return;
3087 }
3088 pagedep->pd_state |= IOSTARTED;
3089 ACQUIRE_LOCK(&lk);
3090 for (i = 0; i < DAHASHSZ; i++) {
3091 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3092 ep = (struct direct *)
3093 ((char *)bp->b_data + dap->da_offset);
3094 if (ep->d_ino != dap->da_newinum) {
3095 FREE_LOCK(&lk);
3096 panic("%s: dir inum %d != new %d",
3097 "initiate_write_filepage",
3098 ep->d_ino, dap->da_newinum);
3099 }
3100 if (dap->da_state & DIRCHG)
3101 ep->d_ino = dap->da_previous->dm_oldinum;
3102 else
3103 ep->d_ino = 0;
3104 dap->da_state &= ~ATTACHED;
3105 dap->da_state |= UNDONE;
3106 }
3107 }
3108 FREE_LOCK(&lk);
3109}
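/*
 * The effect of the loop above is that, for the duration of the write, each
 * uncommitted entry is reverted to a safe value: a DIRCHG entry goes back to
 * the inode number it is replacing, and any other new entry is zeroed, so the
 * on-disk directory never names an inode that has not itself reached the
 * disk.  handle_written_filepage() rolls the entries forward again when the
 * write completes.
 */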
3110
3111/*
3112 * Called from within the procedure above to deal with unsatisfied
3113 * allocation dependencies in an inodeblock. The buffer must be
3114 * locked, thus, no I/O completion operations can occur while we
3115 * are manipulating its associated dependencies.
3116 */
3117static void
3118initiate_write_inodeblock(inodedep, bp)
3119 struct inodedep *inodedep;
3120 struct buf *bp; /* The inode block */
3121{
3122 struct allocdirect *adp, *lastadp;
3123 struct dinode *dp;
3124 struct fs *fs;
3125 ufs_lbn_t prevlbn = 0;
3126 int i, deplist;
3127
3128 if (inodedep->id_state & IOSTARTED)
3129 panic("initiate_write_inodeblock: already started");
3130 inodedep->id_state |= IOSTARTED;
3131 fs = inodedep->id_fs;
3132 dp = (struct dinode *)bp->b_data +
3133 ino_to_fsbo(fs, inodedep->id_ino);
3134 /*
3135 * If the bitmap is not yet written, then the allocated
3136 * inode cannot be written to disk.
3137 */
3138 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3139 if (inodedep->id_savedino != NULL)
3140 panic("initiate_write_inodeblock: already doing I/O");
3141 MALLOC(inodedep->id_savedino, struct dinode *,
3142 sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3143 *inodedep->id_savedino = *dp;
3144 bzero((caddr_t)dp, sizeof(struct dinode));
3145 return;
3146 }
3147 /*
3148 * If no dependencies, then there is nothing to roll back.
3149 */
3150 inodedep->id_savedsize = dp->di_size;
3151 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3152 return;
3153 /*
3154 * Set the dependencies to busy.
3155 */
3156 ACQUIRE_LOCK(&lk);
3157 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3158 adp = TAILQ_NEXT(adp, ad_next)) {
3159#ifdef DIAGNOSTIC
3160 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3161 FREE_LOCK(&lk);
3162 panic("softdep_write_inodeblock: lbn order");
3163 }
3164 prevlbn = adp->ad_lbn;
3165 if (adp->ad_lbn < NDADDR &&
3166 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3167 FREE_LOCK(&lk);
3168 panic("%s: direct pointer #%ld mismatch %d != %d",
3169 "softdep_write_inodeblock", adp->ad_lbn,
3170 dp->di_db[adp->ad_lbn], adp->ad_newblkno);
3171 }
3172 if (adp->ad_lbn >= NDADDR &&
3173 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3174 FREE_LOCK(&lk);
3175 panic("%s: indirect pointer #%ld mismatch %d != %d",
3176 "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3177 dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
3178 }
3179 deplist |= 1 << adp->ad_lbn;
3180 if ((adp->ad_state & ATTACHED) == 0) {
3181 FREE_LOCK(&lk);
3182 panic("softdep_write_inodeblock: Unknown state 0x%x",
3183 adp->ad_state);
3184 }
3185#endif /* DIAGNOSTIC */
3186 adp->ad_state &= ~ATTACHED;
3187 adp->ad_state |= UNDONE;
3188 }
3189 /*
3190 * The on-disk inode cannot claim to be any larger than the last
3191 * fragment that has been written. Otherwise, the on-disk inode
3192 * might have fragments that were not the last block in the file,
3193 * which would corrupt the filesystem.
3194 */
3195 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3196 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3197 if (adp->ad_lbn >= NDADDR)
3198 break;
3199 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3200 /* keep going until hitting a rollback to a frag */
3201 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3202 continue;
3203 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3204 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3205#ifdef DIAGNOSTIC
3206 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3207 FREE_LOCK(&lk);
3208 panic("softdep_write_inodeblock: lost dep1");
3209 }
3210#endif /* DIAGNOSTIC */
3211 dp->di_db[i] = 0;
3212 }
3213 for (i = 0; i < NIADDR; i++) {
3214#ifdef DIAGNOSTIC
3215 if (dp->di_ib[i] != 0 &&
3216 (deplist & ((1 << NDADDR) << i)) == 0) {
3217 FREE_LOCK(&lk);
3218 panic("softdep_write_inodeblock: lost dep2");
3219 }
3220#endif /* DIAGNOSTIC */
3221 dp->di_ib[i] = 0;
3222 }
3223 FREE_LOCK(&lk);
3224 return;
3225 }
3226 /*
3227 * If we have zero'ed out the last allocated block of the file,
3228 * roll back the size to the last currently allocated block.
3229 * We know that this last allocated block is full-sized, as
3230 * we already checked for fragments in the loop above.
3231 */
3232 if (lastadp != NULL &&
3233 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3234 for (i = lastadp->ad_lbn; i >= 0; i--)
3235 if (dp->di_db[i] != 0)
3236 break;
3237 dp->di_size = (i + 1) * fs->fs_bsize;
3238 }
3239 /*
3240 * The only dependencies are for indirect blocks.
3241 *
3242 * The file size for indirect block additions is not guaranteed.
3243 * Such a guarantee would be non-trivial to achieve. The conventional
3244 * synchronous write implementation also does not make this guarantee.
3245 * Fsck should catch and fix discrepancies. Arguably, the file size
3246 * can be over-estimated without destroying integrity when the file
3247 * moves into the indirect blocks (i.e., is large). If we want to
3248 * postpone fsck, we are stuck with this argument.
3249 */
3250 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3251 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3252 FREE_LOCK(&lk);
3253}
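/*
 * Two details of the rollback above are easy to miss.  The DIAGNOSTIC
 * "deplist" bitmask records which pointers have pending dependencies:
 * direct block lbn n sets bit n, while indirect pointer i (ad_lbn =
 * NDADDR + i) sets bit NDADDR + i, hence the ((1 << NDADDR) << i) test.
 * The size rollback at a fragment sets di_size to
 * fs_bsize * ad_lbn + ad_oldsize, i.e. all full blocks preceding the
 * fragment plus the old fragment itself, so the on-disk inode never claims
 * space whose blocks have not yet been written.
 */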
3254
3255/*
3256 * This routine is called during the completion interrupt
3257 * service routine for a disk write (from the procedure called
3258 * by the device driver to inform the file system caches of
3259 * a request completion). It should be called early in this
3260 * procedure, before the block is made available to other
3261 * processes or other routines are called.
3262 */
3263static void
3264softdep_disk_write_complete(bp)
3265 struct buf *bp; /* describes the completed disk write */
3266{
3267 struct worklist *wk;
3268 struct workhead reattach;
3269 struct newblk *newblk;
3270 struct allocindir *aip;
3271 struct allocdirect *adp;
3272 struct indirdep *indirdep;
3273 struct inodedep *inodedep;
3274 struct bmsafemap *bmsafemap;
3275
3276#ifdef DEBUG
3277 if (lk.lkt_held != -1)
3278 panic("softdep_disk_write_complete: lock is held");
3279 lk.lkt_held = -2;
3280#endif
3281 LIST_INIT(&reattach);
3282 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3283 WORKLIST_REMOVE(wk);
3284 switch (wk->wk_type) {
3285
3286 case D_PAGEDEP:
3287 if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3288 WORKLIST_INSERT(&reattach, wk);
3289 continue;
3290
3291 case D_INODEDEP:
3292 if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3293 WORKLIST_INSERT(&reattach, wk);
3294 continue;
3295
3296 case D_BMSAFEMAP:
3297 bmsafemap = WK_BMSAFEMAP(wk);
3298 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3299 newblk->nb_state |= DEPCOMPLETE;
3300 newblk->nb_bmsafemap = NULL;
3301 LIST_REMOVE(newblk, nb_deps);
3302 }
3303 while ((adp =
3304 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3305 adp->ad_state |= DEPCOMPLETE;
3306 adp->ad_buf = NULL;
3307 LIST_REMOVE(adp, ad_deps);
3308 handle_allocdirect_partdone(adp);
3309 }
3310 while ((aip =
3311 LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3312 aip->ai_state |= DEPCOMPLETE;
3313 aip->ai_buf = NULL;
3314 LIST_REMOVE(aip, ai_deps);
3315 handle_allocindir_partdone(aip);
3316 }
3317 while ((inodedep =
3318 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3319 inodedep->id_state |= DEPCOMPLETE;
3320 LIST_REMOVE(inodedep, id_deps);
3321 inodedep->id_buf = NULL;
3322 }
3323 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3324 continue;
3325
3326 case D_MKDIR:
3327 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3328 continue;
3329
3330 case D_ALLOCDIRECT:
3331 adp = WK_ALLOCDIRECT(wk);
3332 adp->ad_state |= COMPLETE;
3333 handle_allocdirect_partdone(adp);
3334 continue;
3335
3336 case D_ALLOCINDIR:
3337 aip = WK_ALLOCINDIR(wk);
3338 aip->ai_state |= COMPLETE;
3339 handle_allocindir_partdone(aip);
3340 continue;
3341
3342 case D_INDIRDEP:
3343 indirdep = WK_INDIRDEP(wk);
3344 if (indirdep->ir_state & GOINGAWAY) {
3345 lk.lkt_held = -1;
3346 panic("disk_write_complete: indirdep gone");
3347 }
3348 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3349 FREE(indirdep->ir_saveddata, M_INDIRDEP);
3350 indirdep->ir_saveddata = 0;
3351 indirdep->ir_state &= ~UNDONE;
3352 indirdep->ir_state |= ATTACHED;
3353 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3354 handle_allocindir_partdone(aip);
3355 if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
3356 lk.lkt_held = -1;
3357 panic("disk_write_complete: not gone");
3358 }
3359 }
3360 WORKLIST_INSERT(&reattach, wk);
3361 if ((bp->b_flags & B_DELWRI) == 0)
3362 stat_indir_blk_ptrs++;
3363 bdirty(bp);
3364 continue;
3365
3366 default:
3367 lk.lkt_held = -1;
3368 panic("handle_disk_write_complete: Unknown type %s",
3369 TYPENAME(wk->wk_type));
3370 /* NOTREACHED */
3371 }
3372 }
3373 /*
3374 * Reattach any requests that must be redone.
3375 */
3376 while ((wk = LIST_FIRST(&reattach)) != NULL) {
3377 WORKLIST_REMOVE(wk);
3378 WORKLIST_INSERT(&bp->b_dep, wk);
3379 }
3380#ifdef DEBUG
3381 if (lk.lkt_held != -2)
3382 panic("softdep_disk_write_complete: lock lost");
3383 lk.lkt_held = -1;
3384#endif
3385}
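/*
 * The "reattach" list above collects every worklist entry whose handler
 * reported outstanding work (a nonzero return from the filepage or
 * inodeblock handlers, or an indirdep that was rolled forward); hanging
 * them back on bp->b_dep ensures they are processed again the next time
 * the buffer is written.  Entries whose work is complete have already been
 * freed by their handlers and simply drop off the list.
 */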
3386
3387/*
3388 * Called from within softdep_disk_write_complete above. Note that
3389 * this routine is always called from interrupt level with further
3390 * splbio interrupts blocked.
3391 */
3392static void
3393handle_allocdirect_partdone(adp)
3394 struct allocdirect *adp; /* the completed allocdirect */
3395{
3396 struct allocdirect *listadp;
3397 struct inodedep *inodedep;
3398 long bsize, delay;
3399
3400 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3401 return;
3402 if (adp->ad_buf != NULL) {
3403 lk.lkt_held = -1;
3404 panic("handle_allocdirect_partdone: dangling dep");
3405 }
3406 /*
3407 * The on-disk inode cannot claim to be any larger than the last
3408 * fragment that has been written. Otherwise, the on-disk inode
3409 * might have fragments that were not the last block in the file,
3410 * which would corrupt the filesystem. Thus, we cannot free any
3411 * allocdirects after one whose ad_oldblkno claims a fragment as
3412 * these blocks must be rolled back to zero before writing the inode.
3413 * We check the currently active set of allocdirects in id_inoupdt.
3414 */
3415 inodedep = adp->ad_inodedep;
3416 bsize = inodedep->id_fs->fs_bsize;
3417 TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3418 /* found our block */
3419 if (listadp == adp)
3420 break;
3421 /* continue if ad_oldlbn is not a fragment */
3422 if (listadp->ad_oldsize == 0 ||
3423 listadp->ad_oldsize == bsize)
3424 continue;
3425 /* hit a fragment */
3426 return;
3427 }
3428 /*
3429 * If we have reached the end of the current list without
3430 * finding the just finished dependency, then it must be
3431 * on the future dependency list. Future dependencies cannot
3432 * be freed until they are moved to the current list.
3433 */
3434 if (listadp == NULL) {
3435#ifdef DEBUG
3436 TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3437 /* found our block */
3438 if (listadp == adp)
3439 break;
3440 if (listadp == NULL) {
3441 lk.lkt_held = -1;
3442 panic("handle_allocdirect_partdone: lost dep");
3443 }
3444#endif /* DEBUG */
3445 return;
3446 }
3447 /*
3448 * If we have found the just finished dependency, then free
3449 * it along with anything that follows it that is complete.
3450 * If the inode still has a bitmap dependency, then it has
3451 * never been written to disk, hence the on-disk inode cannot
3452 * reference the old fragment so we can free it without delay.
3453 */
3454 delay = (inodedep->id_state & DEPCOMPLETE);
3455 for (; adp; adp = listadp) {
3456 listadp = TAILQ_NEXT(adp, ad_next);
3457 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3458 return;
3459 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
3460 }
3461}
3462
3463/*
3464 * Called from within softdep_disk_write_complete above. Note that
3465 * this routine is always called from interrupt level with further
3466 * splbio interrupts blocked.
3467 */
3468static void
3469handle_allocindir_partdone(aip)
3470 struct allocindir *aip; /* the completed allocindir */
3471{
3472 struct indirdep *indirdep;
3473
3474 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3475 return;
3476 if (aip->ai_buf != NULL) {
3477 lk.lkt_held = -1;
3478 panic("handle_allocindir_partdone: dangling dependency");
3479 }
3480 indirdep = aip->ai_indirdep;
3481 if (indirdep->ir_state & UNDONE) {
3482 LIST_REMOVE(aip, ai_next);
3483 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3484 return;
3485 }
3486 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3487 aip->ai_newblkno;
3488 LIST_REMOVE(aip, ai_next);
3489 if (aip->ai_freefrag != NULL)
3490 add_to_worklist(&aip->ai_freefrag->ff_list);
3491 WORKITEM_FREE(aip, D_ALLOCINDIR);
3492}
3493
3494/*
3495 * Called from within softdep_disk_write_complete above to restore
3496 * in-memory inode block contents to their most up-to-date state. Note
3497 * that this routine is always called from interrupt level with further
3498 * splbio interrupts blocked.
3499 */
3500static int
3501handle_written_inodeblock(inodedep, bp)
3502 struct inodedep *inodedep;
3503 struct buf *bp; /* buffer containing the inode block */
3504{
3505 struct worklist *wk, *filefree;
3506 struct allocdirect *adp, *nextadp;
3507 struct dinode *dp;
3508 int hadchanges;
3509
3510 if ((inodedep->id_state & IOSTARTED) == 0) {
3511 lk.lkt_held = -1;
3512 panic("handle_written_inodeblock: not started");
3513 }
3514 inodedep->id_state &= ~IOSTARTED;
3515 inodedep->id_state |= COMPLETE;
3516 dp = (struct dinode *)bp->b_data +
3517 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3518 /*
3519 * If we had to roll back the inode allocation because of
3520 * bitmaps being incomplete, then simply restore it.
3521 * Keep the block dirty so that it will not be reclaimed until
3522 * all associated dependencies have been cleared and the
3523 * corresponding updates written to disk.
3524 */
3525 if (inodedep->id_savedino != NULL) {
3526 *dp = *inodedep->id_savedino;
3527 FREE(inodedep->id_savedino, M_INODEDEP);
3528 inodedep->id_savedino = NULL;
3529 if ((bp->b_flags & B_DELWRI) == 0)
3530 stat_inode_bitmap++;
3531 bdirty(bp);
3532 return (1);
3533 }
3534 /*
3535 * Roll forward anything that had to be rolled back before
3536 * the inode could be updated.
3537 */
3538 hadchanges = 0;
3539 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3540 nextadp = TAILQ_NEXT(adp, ad_next);
3541 if (adp->ad_state & ATTACHED) {
3542 lk.lkt_held = -1;
3543 panic("handle_written_inodeblock: new entry");
3544 }
3545 if (adp->ad_lbn < NDADDR) {
3546 if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) {
3547 lk.lkt_held = -1;
3548 panic("%s: %s #%ld mismatch %d != %d",
3549 "handle_written_inodeblock",
3550 "direct pointer", adp->ad_lbn,
3551 dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3552 }
3553 dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3554 } else {
3555 if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) {
3556 lk.lkt_held = -1;
3557 panic("%s: %s #%ld allocated as %d",
3558 "handle_written_inodeblock",
3559 "indirect pointer", adp->ad_lbn - NDADDR,
3560 dp->di_ib[adp->ad_lbn - NDADDR]);
3561 }
3562 dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3563 }
3564 adp->ad_state &= ~UNDONE;
3565 adp->ad_state |= ATTACHED;
3566 hadchanges = 1;
3567 }
3568 if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3569 stat_direct_blk_ptrs++;
3570 /*
3571 * Reset the file size to its most up-to-date value.
3572 */
3573 if (inodedep->id_savedsize == -1) {
3574 lk.lkt_held = -1;
3575 panic("handle_written_inodeblock: bad size");
3576 }
3577 if (dp->di_size != inodedep->id_savedsize) {
3578 dp->di_size = inodedep->id_savedsize;
3579 hadchanges = 1;
3580 }
3581 inodedep->id_savedsize = -1;
3582 /*
3583 * If there were any rollbacks in the inode block, then it must be
3584 * marked dirty so that it will eventually get written back in
3585 * its correct form.
3586 */
3587 if (hadchanges)
3588 bdirty(bp);
3589 /*
3590 * Process any allocdirects that completed during the update.
3591 */
3592 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3593 handle_allocdirect_partdone(adp);
3594 /*
3595 * Process deallocations that were held pending until the
3596 * inode had been written to disk. Freeing of the inode
3597 * is delayed until after all blocks have been freed to
3598 * avoid creation of new <vfsid, inum, lbn> triples
3599 * before the old ones have been deleted.
3600 */
3601 filefree = NULL;
3602 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3603 WORKLIST_REMOVE(wk);
3604 switch (wk->wk_type) {
3605
3606 case D_FREEFILE:
3607 /*
3608 * We defer adding filefree to the worklist until
3609 * all other additions have been made to ensure
3610 * that it will be done after all the old blocks
3611 * have been freed.
3612 */
3613 if (filefree != NULL) {
3614 lk.lkt_held = -1;
3615 panic("handle_written_inodeblock: filefree");
3616 }
3617 filefree = wk;
3618 continue;
3619
3620 case D_MKDIR:
3621 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3622 continue;
3623
3624 case D_DIRADD:
3625 diradd_inode_written(WK_DIRADD(wk), inodedep);
3626 continue;
3627
3628 case D_FREEBLKS:
3629 case D_FREEFRAG:
3630 case D_DIRREM:
3631 add_to_worklist(wk);
3632 continue;
3633
3634 default:
3635 lk.lkt_held = -1;
3636 panic("handle_written_inodeblock: Unknown type %s",
3637 TYPENAME(wk->wk_type));
3638 /* NOTREACHED */
3639 }
3640 }
3641 if (filefree != NULL) {
3642 if (free_inodedep(inodedep) == 0) {
3643 lk.lkt_held = -1;
3644 panic("handle_written_inodeblock: live inodedep");
3645 }
3646 add_to_worklist(filefree);
3647 return (0);
3648 }
3649
3650 /*
3651 * If no outstanding dependencies, free it.
3652 */
3653 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3654 return (0);
3655 return (hadchanges);
3656}
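/*
 * Taken together, initiate_write_inodeblock() and the routine above form a
 * rollback/roll-forward pair: block pointers and the file size are reverted
 * to their safe (old) values just before the inode block is written, then
 * restored to their new values here once the write has completed, with the
 * buffer redirtied so the up-to-date copy reaches the disk on a later write.
 */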
3657
3658/*
3659 * Process a diradd entry after its dependent inode has been written.
3660 * This routine must be called with splbio interrupts blocked.
3661 */
3662static void
3663diradd_inode_written(dap, inodedep)
3664 struct diradd *dap;
3665 struct inodedep *inodedep;
3666{
3667 struct pagedep *pagedep;
3668
3669 dap->da_state |= COMPLETE;
3670 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3671 if (dap->da_state & DIRCHG)
3672 pagedep = dap->da_previous->dm_pagedep;
3673 else
3674 pagedep = dap->da_pagedep;
3675 LIST_REMOVE(dap, da_pdlist);
3676 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3677 }
3678 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3679}
3680
3681/*
3682 * Handle the completion of a mkdir dependency.
3683 */
3684static void
3685handle_written_mkdir(mkdir, type)
3686 struct mkdir *mkdir;
3687 int type;
3688{
3689 struct diradd *dap;
3690 struct pagedep *pagedep;
3691
3692 if (mkdir->md_state != type) {
3693 lk.lkt_held = -1;
3694 panic("handle_written_mkdir: bad type");
3695 }
3696 dap = mkdir->md_diradd;
3697 dap->da_state &= ~type;
3698 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3699 dap->da_state |= DEPCOMPLETE;
3700 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3701 if (dap->da_state & DIRCHG)
3702 pagedep = dap->da_previous->dm_pagedep;
3703 else
3704 pagedep = dap->da_pagedep;
3705 LIST_REMOVE(dap, da_pdlist);
3706 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3707 }
3708 LIST_REMOVE(mkdir, md_mkdirs);
3709 WORKITEM_FREE(mkdir, D_MKDIR);
3710}
3711
3712/*
3713 * Called from within softdep_disk_write_complete above.
3714 * A write operation was just completed. Removed inodes can
3715 * now be freed and associated block pointers may be committed.
3716 * Note that this routine is always called from interrupt level
3717 * with further splbio interrupts blocked.
3718 */
3719static int
3720handle_written_filepage(pagedep, bp)
3721 struct pagedep *pagedep;
3722 struct buf *bp; /* buffer containing the written page */
3723{
3724 struct dirrem *dirrem;
3725 struct diradd *dap, *nextdap;
3726 struct direct *ep;
3727 int i, chgs;
3728
3729 if ((pagedep->pd_state & IOSTARTED) == 0) {
3730 lk.lkt_held = -1;
3731 panic("handle_written_filepage: not started");
3732 }
3733 pagedep->pd_state &= ~IOSTARTED;
3734 /*
3735 * Process any directory removals that have been committed.
3736 */
3737 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3738 LIST_REMOVE(dirrem, dm_next);
3739 dirrem->dm_dirinum = pagedep->pd_ino;
3740 add_to_worklist(&dirrem->dm_list);
3741 }
3742 /*
3743 * Free any directory additions that have been committed.
3744 */
3745 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3746 free_diradd(dap);
3747 /*
3748 * Uncommitted directory entries must be restored.
3749 */
3750 for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3751 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3752 dap = nextdap) {
3753 nextdap = LIST_NEXT(dap, da_pdlist);
3754 if (dap->da_state & ATTACHED) {
3755 lk.lkt_held = -1;
3756 panic("handle_written_filepage: attached");
3757 }
3758 ep = (struct direct *)
3759 ((char *)bp->b_data + dap->da_offset);
3760 ep->d_ino = dap->da_newinum;
3761 dap->da_state &= ~UNDONE;
3762 dap->da_state |= ATTACHED;
3763 chgs = 1;
3764 /*
3765 * If the inode referenced by the directory has
3766 * been written out, then the dependency can be
3767 * moved to the pending list.
3768 */
3769 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3770 LIST_REMOVE(dap, da_pdlist);
3771 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3772 da_pdlist);
3773 }
3774 }
3775 }
3776 /*
3777 * If there were any rollbacks in the directory, then it must be
3778 * marked dirty so that it will eventually get written back in
3779 * its correct form.
3780 */
3781 if (chgs) {
3782 if ((bp->b_flags & B_DELWRI) == 0)
3783 stat_dir_entry++;
3784 bdirty(bp);
3785 }
3786 /*
3787 * If no dependencies remain, the pagedep will be freed.
3788 * Otherwise it will remain to update the page before it
3789 * is written back to disk.
3790 */
3791 if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3792 for (i = 0; i < DAHASHSZ; i++)
3793 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3794 break;
3795 if (i == DAHASHSZ) {
3796 LIST_REMOVE(pagedep, pd_hash);
3797 WORKITEM_FREE(pagedep, D_PAGEDEP);
3798 return (0);
3799 }
3800 }
3801 return (1);
3802}
3803
3804/*
3805 * Writing back in-core inode structures.
3806 *
3807 * The file system only accesses an inode's contents when it occupies an
3808 * "in-core" inode structure. These "in-core" structures are separate from
3809 * the page frames used to cache inode blocks. Only the latter are
3810 * transferred to/from the disk. So, when the updated contents of the
3811 * "in-core" inode structure are copied to the corresponding in-memory inode
3812 * block, the dependencies are also transferred. The following procedure is
3813 * called when copying a dirty "in-core" inode to a cached inode block.
3814 */
3815
3816/*
3817 * Called when an inode is loaded from disk. If the effective link count
3818 * differed from the actual link count when it was last flushed, then we
3819 * need to ensure that the correct effective link count is put back.
3820 */
3821void
3822softdep_load_inodeblock(ip)
3823 struct inode *ip; /* the "in_core" copy of the inode */
3824{
3825 struct inodedep *inodedep;
3826
3827 /*
3828 * Check for alternate nlink count.
3829 */
3830 ip->i_effnlink = ip->i_nlink;
3831 ACQUIRE_LOCK(&lk);
3832 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3833 FREE_LOCK(&lk);
3834 return;
3835 }
3836 ip->i_effnlink -= inodedep->id_nlinkdelta;
3837 if (inodedep->id_state & SPACECOUNTED)
3838 ip->i_flag |= IN_SPACECOUNTED;
3782 FREE_LOCK(&lk);
3783}
3784
3785/*
3786 * This routine is called just before the "in-core" inode
3787 * information is to be copied to the in-memory inode block.
3788 * Recall that an inode block contains several inodes. If
3789 * the force flag is set, then the dependencies will be
3790 * cleared so that the update can always be made. Note that
3791 * the buffer is locked when this routine is called, so we
3792 * will never be in the middle of writing the inode block
3793 * to disk.
3794 */
3795void
3796softdep_update_inodeblock(ip, bp, waitfor)
3797 struct inode *ip; /* the "in_core" copy of the inode */
3798 struct buf *bp; /* the buffer containing the inode block */
3799 int waitfor; /* nonzero => update must be allowed */
3800{
3801 struct inodedep *inodedep;
3802 struct worklist *wk;
3803 int error, gotit;
3804
3805 /*
3806 * If the effective link count is not equal to the actual link
3807 * count, then we must track the difference in an inodedep while
3808 * the inode is (potentially) tossed out of the cache. Otherwise,
3809 * if there is no existing inodedep, then there are no dependencies
3810 * to track.
3811 */
3812 ACQUIRE_LOCK(&lk);
3813 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3814 FREE_LOCK(&lk);
3815 if (ip->i_effnlink != ip->i_nlink)
3816 panic("softdep_update_inodeblock: bad link count");
3817 return;
3818 }
3819 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
3820 FREE_LOCK(&lk);
3821 panic("softdep_update_inodeblock: bad delta");
3822 }
3823 /*
3824 * Changes have been initiated. Anything depending on these
3825 * changes cannot occur until this inode has been written.
3826 */
3827 inodedep->id_state &= ~COMPLETE;
3828 if ((inodedep->id_state & ONWORKLIST) == 0)
3829 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3830 /*
3831 * Any new dependencies associated with the incore inode must
3832 * now be moved to the list associated with the buffer holding
3833 * the in-memory copy of the inode. Once merged, process any
3834 * allocdirects that are completed by the merger.
3835 */
3836 merge_inode_lists(inodedep);
3837 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3838 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3839 /*
3840 * Now that the inode has been pushed into the buffer, the
3841 * operations dependent on the inode being written to disk
3842 * can be moved to the id_bufwait so that they will be
3843 * processed when the buffer I/O completes.
3844 */
3845 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3846 WORKLIST_REMOVE(wk);
3847 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3848 }
3849 /*
3850 * Newly allocated inodes cannot be written until the bitmap
3851 * that allocates them has been written (indicated by
3852 * DEPCOMPLETE being set in id_state). If we are doing a
3853 * forced sync (e.g., an fsync on a file), we force the bitmap
3854 * to be written so that the update can be done.
3855 */
3856 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3857 FREE_LOCK(&lk);
3858 return;
3859 }
3860 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3861 FREE_LOCK(&lk);
3862 if (gotit &&
3863 (error = BUF_WRITE(inodedep->id_buf)) != 0)
3864 softdep_error("softdep_update_inodeblock: bwrite", error);
3865 if ((inodedep->id_state & DEPCOMPLETE) == 0)
3866 panic("softdep_update_inodeblock: update failed");
3867}
3868
3869/*
3870 * Merge the new inode dependency list (id_newinoupdt) into the old
3871 * inode dependency list (id_inoupdt). This routine must be called
3872 * with splbio interrupts blocked.
3873 */
3874static void
3875merge_inode_lists(inodedep)
3876 struct inodedep *inodedep;
3877{
3878 struct allocdirect *listadp, *newadp;
3879
3880 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3881 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3882 if (listadp->ad_lbn < newadp->ad_lbn) {
3883 listadp = TAILQ_NEXT(listadp, ad_next);
3884 continue;
3885 }
3886 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3887 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3888 if (listadp->ad_lbn == newadp->ad_lbn) {
3889 allocdirect_merge(&inodedep->id_inoupdt, newadp,
3890 listadp);
3891 listadp = newadp;
3892 }
3893 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3894 }
3895 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3896 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3897 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3898 }
3899}
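/*
 * Illustrative sketch (hypothetical types, not kernel code): the routine
 * above is a standard merge of two lbn-sorted lists, with the extra step
 * that entries for the same lbn are collapsed via allocdirect_merge().
 * Omitting that collapse step, the shape of the merge is:
 */
struct lbnode {
	int		 lbn;
	struct lbnode	*next;
};

/* Merge "newlist" into "oldlist"; both are assumed sorted by lbn. */
static struct lbnode *
example_merge_sorted(struct lbnode *oldlist, struct lbnode *newlist)
{
	struct lbnode head, *tail;

	head.next = NULL;
	tail = &head;
	while (oldlist != NULL && newlist != NULL) {
		if (oldlist->lbn <= newlist->lbn) {
			tail->next = oldlist;
			oldlist = oldlist->next;
		} else {
			tail->next = newlist;
			newlist = newlist->next;
		}
		tail = tail->next;
	}
	tail->next = (oldlist != NULL) ? oldlist : newlist;
	return (head.next);
}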
3900
3901/*
3902 * If we are doing an fsync, then we must ensure that any directory
3903 * entries for the inode have been written after the inode gets to disk.
3904 */
3905int
3906softdep_fsync(vp)
3907 struct vnode *vp; /* the "in_core" copy of the inode */
3908{
3909 struct inodedep *inodedep;
3910 struct pagedep *pagedep;
3911 struct worklist *wk;
3912 struct diradd *dap;
3913 struct mount *mnt;
3914 struct vnode *pvp;
3915 struct inode *ip;
3916 struct buf *bp;
3917 struct fs *fs;
3918 struct proc *p = CURPROC; /* XXX */
3919 int error, flushparent;
3920 ino_t parentino;
3921 ufs_lbn_t lbn;
3922
3923 ip = VTOI(vp);
3924 fs = ip->i_fs;
3925 ACQUIRE_LOCK(&lk);
3926 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3927 FREE_LOCK(&lk);
3928 return (0);
3929 }
3930 if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3931 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3932 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3933 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
3934 FREE_LOCK(&lk);
3935 panic("softdep_fsync: pending ops");
3936 }
3937 for (error = 0, flushparent = 0; ; ) {
3938 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3939 break;
3940 if (wk->wk_type != D_DIRADD) {
3941 FREE_LOCK(&lk);
3942 panic("softdep_fsync: Unexpected type %s",
3943 TYPENAME(wk->wk_type));
3944 }
3945 dap = WK_DIRADD(wk);
3946 /*
3947 * Flush our parent if this directory entry
3948 * has a MKDIR_PARENT dependency.
3949 */
3950 if (dap->da_state & DIRCHG)
3951 pagedep = dap->da_previous->dm_pagedep;
3952 else
3953 pagedep = dap->da_pagedep;
3954 mnt = pagedep->pd_mnt;
3955 parentino = pagedep->pd_ino;
3956 lbn = pagedep->pd_lbn;
3957 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
3958 FREE_LOCK(&lk);
3959 panic("softdep_fsync: dirty");
3960 }
3961 flushparent = dap->da_state & MKDIR_PARENT;
3962 /*
3963 * If we are being fsync'ed as part of vgone'ing this vnode,
3964 * then we will not be able to release and recover the
3965 * vnode below, so we just have to give up on writing its
3966 * directory entry out. It will eventually be written, just
3967 * not now, but then the user was not asking to have it
3968 * written, so we are not breaking any promises.
3969 */
3970 if (vp->v_flag & VXLOCK)
3971 break;
3972 /*
3973 * We prevent deadlock by always fetching inodes from the
3974 * root, moving down the directory tree. Thus, when fetching
3975 * our parent directory, we must unlock ourselves before
3976 * requesting the lock on our parent. See the comment in
3977 * ufs_lookup for details on possible races.
3978 */
3979 FREE_LOCK(&lk);
3980 VOP_UNLOCK(vp, 0, p);
3981 error = VFS_VGET(mnt, parentino, &pvp);
3982 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3983 if (error != 0)
3984 return (error);
3985 if (flushparent) {
3986 if ((error = UFS_UPDATE(pvp, 1)) != 0) {
3987 vput(pvp);
3988 return (error);
3989 }
3990 }
3991 /*
3992 * Flush directory page containing the inode's name.
3993 */
3994 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3995 &bp);
3996 if (error == 0)
3997 error = BUF_WRITE(bp);
3998 vput(pvp);
3999 if (error != 0)
4000 return (error);
4001 ACQUIRE_LOCK(&lk);
4002 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4003 break;
4004 }
4005 FREE_LOCK(&lk);
4006 return (0);
4007}
4008
4009/*
4010 * Flush all the dirty bitmaps associated with the block device
4011 * before flushing the rest of the dirty blocks so as to reduce
4012 * the number of dependencies that will have to be rolled back.
4013 */
4014void
4015softdep_fsync_mountdev(vp)
4016 struct vnode *vp;
4017{
4018 struct buf *bp, *nbp;
4019 struct worklist *wk;
4020
4021 if (!vn_isdisk(vp, NULL))
4022 panic("softdep_fsync_mountdev: vnode not a disk");
4023 ACQUIRE_LOCK(&lk);
4024 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4025 nbp = TAILQ_NEXT(bp, b_vnbufs);
4026 /*
4027 * If it is already scheduled, skip to the next buffer.
4028 */
4029 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
4030 continue;
4031 if ((bp->b_flags & B_DELWRI) == 0) {
4032 FREE_LOCK(&lk);
4033 panic("softdep_fsync_mountdev: not dirty");
4034 }
4035 /*
4036 * We are only interested in bitmaps with outstanding
4037 * dependencies.
4038 */
4039 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4040 wk->wk_type != D_BMSAFEMAP ||
4041 (bp->b_xflags & BX_BKGRDINPROG)) {
4042 BUF_UNLOCK(bp);
4043 continue;
4044 }
4045 bremfree(bp);
4046 FREE_LOCK(&lk);
4047 (void) bawrite(bp);
4048 ACQUIRE_LOCK(&lk);
4049 /*
4050 * Since we may have slept during the I/O, we need
4051 * to start from a known point.
4052 */
4053 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4054 }
4055 drain_output(vp, 1);
4056 FREE_LOCK(&lk);
4057}
4058
4059/*
4060 * This routine is called when we are trying to synchronously flush a
4061 * file. This routine must eliminate any filesystem metadata dependencies
4062 * so that the syncing routine can succeed by pushing the dirty blocks
4063 * associated with the file. If any I/O errors occur, they are returned.
4064 */
4065int
4066softdep_sync_metadata(ap)
4067 struct vop_fsync_args /* {
4068 struct vnode *a_vp;
4069 struct ucred *a_cred;
4070 int a_waitfor;
4071 struct proc *a_p;
4072 } */ *ap;
4073{
4074 struct vnode *vp = ap->a_vp;
4075 struct pagedep *pagedep;
4076 struct allocdirect *adp;
4077 struct allocindir *aip;
4078 struct buf *bp, *nbp;
4079 struct worklist *wk;
4080 int i, error, waitfor;
4081
4082 /*
4083 * Check whether this vnode is involved in a filesystem
4084 * that is doing soft dependency processing.
4085 */
4086 if (!vn_isdisk(vp, NULL)) {
4087 if (!DOINGSOFTDEP(vp))
4088 return (0);
4089 } else
4090 if (vp->v_rdev->si_mountpoint == NULL ||
4091 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4092 return (0);
4093 /*
4094 * Ensure that any direct block dependencies have been cleared.
4095 */
4096 ACQUIRE_LOCK(&lk);
4097 if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4098 FREE_LOCK(&lk);
4099 return (error);
4100 }
4101 /*
4102 * For most files, the only metadata dependencies are the
4103 * cylinder group maps that allocate their inode or blocks.
4104 * The block allocation dependencies can be found by traversing
4105 * the dependency lists for any buffers that remain on their
4106 * dirty buffer list. The inode allocation dependency will
4107 * be resolved when the inode is updated with MNT_WAIT.
4108 * This work is done in two passes. The first pass grabs most
4109 * of the buffers and begins asynchronously writing them. The
4110 * only way to wait for these asynchronous writes is to sleep
4111 * on the filesystem vnode which may stay busy for a long time
4112 * if the filesystem is active. So, instead, we make a second
4113 * pass over the dependencies blocking on each write. In the
4114 * usual case we will be blocking against a write that we
4115 * initiated, so when it is done the dependency will have been
4116 * resolved. Thus the second pass is expected to end quickly.
4117 */
4118 waitfor = MNT_NOWAIT;
4119top:
4120 if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
4121 FREE_LOCK(&lk);
4122 return (0);
4123 }
4124 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4125 /* While syncing snapshots, we must allow recursive lookups */
4126 bp->b_lock.lk_flags |= LK_CANRECURSE;
4127loop:
4128 /*
4129 * As we hold the buffer locked, none of its dependencies
4130 * will disappear.
4131 */
4132 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4133 switch (wk->wk_type) {
4134
4135 case D_ALLOCDIRECT:
4136 adp = WK_ALLOCDIRECT(wk);
4137 if (adp->ad_state & DEPCOMPLETE)
4138 continue;
4139 nbp = adp->ad_buf;
4140 if (getdirtybuf(&nbp, waitfor) == 0)
4141 continue;
4142 FREE_LOCK(&lk);
4143 if (waitfor == MNT_NOWAIT) {
4144 bawrite(nbp);
4145 } else if ((error = BUF_WRITE(nbp)) != 0) {
4146 break;
4147 }
4148 ACQUIRE_LOCK(&lk);
4149 continue;
4150
4151 case D_ALLOCINDIR:
4152 aip = WK_ALLOCINDIR(wk);
4153 if (aip->ai_state & DEPCOMPLETE)
4154 continue;
4155 nbp = aip->ai_buf;
4156 if (getdirtybuf(&nbp, waitfor) == 0)
4157 continue;
4158 FREE_LOCK(&lk);
4159 if (waitfor == MNT_NOWAIT) {
4160 bawrite(nbp);
4161 } else if ((error = BUF_WRITE(nbp)) != 0) {
4162 break;
4163 }
4164 ACQUIRE_LOCK(&lk);
4165 continue;
4166
4167 case D_INDIRDEP:
4168 restart:
4169
4170 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4171 if (aip->ai_state & DEPCOMPLETE)
4172 continue;
4173 nbp = aip->ai_buf;
4174 if (getdirtybuf(&nbp, MNT_WAIT) == 0)
4175 goto restart;
4176 FREE_LOCK(&lk);
4177 if ((error = BUF_WRITE(nbp)) != 0) {
4178 break;
4179 }
4180 ACQUIRE_LOCK(&lk);
4181 goto restart;
4182 }
4183 continue;
4184
4185 case D_INODEDEP:
4186 if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4187 WK_INODEDEP(wk)->id_ino)) != 0) {
4188 FREE_LOCK(&lk);
4189 break;
4190 }
4191 continue;
4192
4193 case D_PAGEDEP:
4194 /*
4195 * We are trying to sync a directory that may
4196 * have dependencies on both its own metadata
4197 * and on the inodes of any
4198 * recently allocated files. We walk its diradd
4199 * lists pushing out the associated inode.
4200 */
4201 pagedep = WK_PAGEDEP(wk);
4202 for (i = 0; i < DAHASHSZ; i++) {
4203 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4204 continue;
4205 if ((error =
4206 flush_pagedep_deps(vp, pagedep->pd_mnt,
4207 &pagedep->pd_diraddhd[i]))) {
4208 FREE_LOCK(&lk);
4209 break;
4210 }
4211 }
4212 continue;
4213
4214 case D_MKDIR:
4215 /*
4216 * This case should never happen if the vnode has
4217 * been properly sync'ed. However, if this function
4218 * is used at a place where the vnode has not yet
4219 * been sync'ed, this dependency can show up. So,
4220 * rather than panic, just flush it.
4221 */
4222 nbp = WK_MKDIR(wk)->md_buf;
4223 if (getdirtybuf(&nbp, waitfor) == 0)
4224 continue;
4225 FREE_LOCK(&lk);
4226 if (waitfor == MNT_NOWAIT) {
4227 bawrite(nbp);
4228 } else if ((error = BUF_WRITE(nbp)) != 0) {
4229 break;
4230 }
4231 ACQUIRE_LOCK(&lk);
4232 continue;
4233
4234 case D_BMSAFEMAP:
4235 /*
4236 * This case should never happen if the vnode has
4237 * been properly sync'ed. However, if this function
4238 * is used at a place where the vnode has not yet
4239 * been sync'ed, this dependency can show up. So,
4240 * rather than panic, just flush it.
4241 */
4242 nbp = WK_BMSAFEMAP(wk)->sm_buf;
4243 if (getdirtybuf(&nbp, waitfor) == 0)
4244 continue;
4245 FREE_LOCK(&lk);
4246 if (waitfor == MNT_NOWAIT) {
4247 bawrite(nbp);
4248 } else if ((error = BUF_WRITE(nbp)) != 0) {
4249 break;
4250 }
4251 ACQUIRE_LOCK(&lk);
4252 continue;
4253
4254 default:
4255 FREE_LOCK(&lk);
4256 panic("softdep_sync_metadata: Unknown type %s",
4257 TYPENAME(wk->wk_type));
4258 /* NOTREACHED */
4259 }
4260 /* We reach here only on error, with the lock already released */
4261 if (error == 0)
4262 panic("softdep_sync_metadata: zero error");
4263 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4264 bawrite(bp);
4265 return (error);
4266 }
4267 (void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4268 nbp = TAILQ_NEXT(bp, b_vnbufs);
4269 FREE_LOCK(&lk);
4270 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4271 bawrite(bp);
4272 ACQUIRE_LOCK(&lk);
4273 if (nbp != NULL) {
4274 bp = nbp;
4275 goto loop;
4276 }
4277 /*
4278 * We must wait for any I/O in progress to finish so that
4279 * all potential buffers on the dirty list will be visible.
4280 * Once they are all there, proceed with the second pass
4281 * which will wait for the I/O as per above.
4282 */
4283 drain_output(vp, 1);
4284 /*
4285 * The brief unlock is to allow any pent up dependency
4286 * processing to be done.
4287 */
4288 if (waitfor == MNT_NOWAIT) {
4289 waitfor = MNT_WAIT;
4290 FREE_LOCK(&lk);
4291 ACQUIRE_LOCK(&lk);
4292 goto top;
4293 }
4294
4295 /*
4296 * If we have managed to get rid of all the dirty buffers,
4297 * then we are done. For certain directories and block
4298 * devices, we may need to do further work.
4299 */
4300 if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4301 FREE_LOCK(&lk);
4302 return (0);
4303 }
4304
4305 FREE_LOCK(&lk);
4306 /*
4307 * If we are trying to sync a block device, some of its buffers may
4308 * contain metadata that cannot be written until the contents of some
4309 * partially written files have been written to disk. The only easy
4310 * way to accomplish this is to sync the entire filesystem (luckily
4311 * this happens rarely).
4312 */
4313 if (vn_isdisk(vp, NULL) &&
4314 vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
4315 (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
4316 ap->a_p)) != 0)
4317 return (error);
4318 return (0);
4319}
4320
4321/*
4322 * Flush the dependencies associated with an inodedep.
4323 * Called with splbio blocked.
4324 */
4325static int
4326flush_inodedep_deps(fs, ino)
4327 struct fs *fs;
4328 ino_t ino;
4329{
4330 struct inodedep *inodedep;
4331 struct allocdirect *adp;
4332 int error, waitfor;
4333 struct buf *bp;
4334
4335 /*
4336 * This work is done in two passes. The first pass grabs most
4337 * of the buffers and begins asynchronously writing them. The
4338 * only way to wait for these asynchronous writes is to sleep
4339 * on the filesystem vnode which may stay busy for a long time
4340 * if the filesystem is active. So, instead, we make a second
4341 * pass over the dependencies blocking on each write. In the
4342 * usual case we will be blocking against a write that we
4343 * initiated, so when it is done the dependency will have been
4344 * resolved. Thus the second pass is expected to end quickly.
4345 * We give a brief window at the top of the loop to allow
4346 * any pending I/O to complete.
4347 */
4348 for (waitfor = MNT_NOWAIT; ; ) {
4349 FREE_LOCK(&lk);
4350 ACQUIRE_LOCK(&lk);
4351 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4352 return (0);
4353 TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
4354 if (adp->ad_state & DEPCOMPLETE)
4355 continue;
4356 bp = adp->ad_buf;
4357 if (getdirtybuf(&bp, waitfor) == 0) {
4358 if (waitfor == MNT_NOWAIT)
4359 continue;
4360 break;
4361 }
4362 FREE_LOCK(&lk);
4363 if (waitfor == MNT_NOWAIT) {
4364 bawrite(bp);
4365 } else if ((error = BUF_WRITE(bp)) != 0) {
4366 ACQUIRE_LOCK(&lk);
4367 return (error);
4368 }
4369 ACQUIRE_LOCK(&lk);
4370 break;
4371 }
4372 if (adp != NULL)
4373 continue;
4374 TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
4375 if (adp->ad_state & DEPCOMPLETE)
4376 continue;
4377 bp = adp->ad_buf;
4378 if (getdirtybuf(&bp, waitfor) == 0) {
4379 if (waitfor == MNT_NOWAIT)
4380 continue;
4381 break;
4382 }
4383 FREE_LOCK(&lk);
4384 if (waitfor == MNT_NOWAIT) {
4385 bawrite(bp);
4386 } else if ((error = BUF_WRITE(bp)) != 0) {
4387 ACQUIRE_LOCK(&lk);
4388 return (error);
4389 }
4390 ACQUIRE_LOCK(&lk);
4391 break;
4392 }
4393 if (adp != NULL)
4394 continue;
4395 /*
4396 * If we just completed pass 2, we are done; otherwise, start pass 2.
4397 */
4398 if (waitfor == MNT_WAIT)
4399 break;
4400 waitfor = MNT_WAIT;
4401 }
4402 /*
4403 * Try freeing inodedep in case all dependencies have been removed.
4404 */
4405 if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4406 (void) free_inodedep(inodedep);
4407 return (0);
4408}
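/*
 * Illustrative sketch (hypothetical primitives, not kernel code): the
 * two-pass scheme used above and in softdep_sync_metadata() issues
 * asynchronous writes on the first pass and then repeats the scan blocking
 * on each remaining write.  Its skeleton, with stand-ins for
 * getdirtybuf()/bawrite()/BUF_WRITE(), is roughly:
 */
struct depbuf;					/* hypothetical buffer handle */
/* Assumed to yield each buffer with an unresolved dependency once per pass. */
extern struct depbuf	*example_next_dirty(void);
extern void		 example_start_async(struct depbuf *);
extern int		 example_write_wait(struct depbuf *);

static int
example_flush_two_pass(void)
{
	struct depbuf *bp;
	int blocking, error;

	for (blocking = 0;; blocking = 1) {
		while ((bp = example_next_dirty()) != NULL) {
			if (blocking == 0)
				example_start_async(bp);	/* pass 1 */
			else if ((error = example_write_wait(bp)) != 0)
				return (error);			/* pass 2 */
		}
		if (blocking)
			return (0);		/* both passes completed */
	}
}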
4409
4410/*
4411 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4412 * Called with splbio blocked.
4413 */
4414static int
4415flush_pagedep_deps(pvp, mp, diraddhdp)
4416 struct vnode *pvp;
4417 struct mount *mp;
4418 struct diraddhd *diraddhdp;
4419{
4420 struct proc *p = CURPROC; /* XXX */
4421 struct inodedep *inodedep;
4422 struct ufsmount *ump;
4423 struct diradd *dap;
4424 struct vnode *vp;
4425 int gotit, error = 0;
4426 struct buf *bp;
4427 ino_t inum;
4428
4429 ump = VFSTOUFS(mp);
4430 while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4431 /*
4432 * Flush ourselves if this directory entry
4433 * has a MKDIR_PARENT dependency.
4434 */
4435 if (dap->da_state & MKDIR_PARENT) {
4436 FREE_LOCK(&lk);
4437 if ((error = UFS_UPDATE(pvp, 1)) != 0)
4438 break;
4439 ACQUIRE_LOCK(&lk);
4440 /*
4441 * If that cleared dependencies, go on to next.
4442 */
4443 if (dap != LIST_FIRST(diraddhdp))
4444 continue;
4445 if (dap->da_state & MKDIR_PARENT) {
4446 FREE_LOCK(&lk);
4447 panic("flush_pagedep_deps: MKDIR_PARENT");
4448 }
4449 }
4450 /*
4451 * A newly allocated directory must have its "." and
4452 * ".." entries written out before its name can be
4453 * committed in its parent. We do not want or need
4454 * the full semantics of a synchronous VOP_FSYNC as
4455 * that may end up here again, once for each directory
4456 * level in the filesystem. Instead, we push the blocks
4457 * and wait for them to clear. We have to fsync twice
4458 * because the first call may choose to defer blocks
4459 * that still have dependencies, but deferral will
4460 * happen at most once.
4461 */
4462 inum = dap->da_newinum;
4463 if (dap->da_state & MKDIR_BODY) {
4464 FREE_LOCK(&lk);
4465 if ((error = VFS_VGET(mp, inum, &vp)) != 0)
4466 break;
4467 if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
4468 (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
4469 vput(vp);
4470 break;
4471 }
4472 drain_output(vp, 0);
4473 vput(vp);
4474 ACQUIRE_LOCK(&lk);
4475 /*
4476 * If that cleared dependencies, go on to next.
4477 */
4478 if (dap != LIST_FIRST(diraddhdp))
4479 continue;
4480 if (dap->da_state & MKDIR_BODY) {
4481 FREE_LOCK(&lk);
4482 panic("flush_pagedep_deps: MKDIR_BODY");
4483 }
4484 }
4485 /*
4486 * Flush the inode on which the directory entry depends.
4487 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
4488 * the only remaining dependency is that the updated inode
4489 * count must get pushed to disk. The inode has already
4490 * been pushed into its inode buffer (via VOP_UPDATE) at
4491 * the time of the reference count change. So we need only
4492 * locate that buffer, ensure that there will be no rollback
4493 * caused by a bitmap dependency, then write the inode buffer.
4494 */
4495 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
4496 FREE_LOCK(&lk);
4497 panic("flush_pagedep_deps: lost inode");
4498 }
4499 /*
4500 * If the inode still has bitmap dependencies,
4501 * push them to disk.
4502 */
4503 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4504 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4505 FREE_LOCK(&lk);
4506 if (gotit &&
4507 (error = BUF_WRITE(inodedep->id_buf)) != 0)
4508 break;
4509 ACQUIRE_LOCK(&lk);
4510 if (dap != LIST_FIRST(diraddhdp))
4511 continue;
4512 }
4513 /*
4514 * If the inode is still sitting in a buffer waiting
4515 * to be written, push it to disk.
4516 */
4517 FREE_LOCK(&lk);
4518 if ((error = bread(ump->um_devvp,
4519 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4520 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4521 break;
4522 if ((error = BUF_WRITE(bp)) != 0)
4523 break;
4524 ACQUIRE_LOCK(&lk);
4525 /*
4526 * If we have failed to get rid of all the dependencies
4527 * then something is seriously wrong.
4528 */
4529 if (dap == LIST_FIRST(diraddhdp)) {
4530 FREE_LOCK(&lk);
4531 panic("flush_pagedep_deps: flush failed");
4532 }
4533 }
4534 if (error)
4535 ACQUIRE_LOCK(&lk);
4536 return (error);
4537}
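/*
 * Note the pattern used repeatedly above: after each blocking flush the
 * routine re-checks whether dap is still at the head of diraddhdp.  The
 * lock is dropped around every write, so the flush itself (or concurrent
 * activity) may already have retired the entry; only if the same entry is
 * still pending is the corresponding state bit re-examined or, at the end,
 * a "flush failed" panic raised.
 */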
4538
4539/*
4540 * A large burst of file addition or deletion activity can drive the
4541 * memory load excessively high. First attempt to slow things down
4542 * using the techniques below. If that fails, this routine requests
4543 * the offending operations to fall back to running synchronously
4544 * until the memory load returns to a reasonable level.
4545 */
4546int
4547softdep_slowdown(vp)
4548 struct vnode *vp;
4549{
4550 int max_softdeps_hard;
4551
4552 max_softdeps_hard = max_softdeps * 11 / 10;
4553 if (num_dirrem < max_softdeps_hard / 2 &&
4554 num_inodedep < max_softdeps_hard)
4555 return (0);
4556 stat_sync_limit_hit += 1;
4557 return (1);
4558}
4559
4560/*
4561 * If memory utilization has gotten too high, deliberately slow things
4562 * down and speed up the I/O processing.
4563 */
4564static int
4565request_cleanup(resource, islocked)
4566 int resource;
4567 int islocked;
4568{
4569 struct proc *p = CURPROC;
4570
4571 /*
4572 * We never hold up the filesystem syncer process.
4573 */
4574 if (p == filesys_syncer)
4575 return (0);
4576 /*
4577 * First check to see if the work list has gotten backlogged.
4578 * If it has, co-opt this process to help clean up two entries.
4579 * Because this process may hold inodes locked, we cannot
4580 * handle any remove requests that might block on a locked
4581 * inode as that could lead to deadlock.
4582 */
4583 if (num_on_worklist > max_softdeps / 10) {
4584 if (islocked)
4585 FREE_LOCK(&lk);
4586 process_worklist_item(NULL, LK_NOWAIT);
4587 process_worklist_item(NULL, LK_NOWAIT);
4588 stat_worklist_push += 2;
4589 if (islocked)
4590 ACQUIRE_LOCK(&lk);
4591 return(1);
4592 }
4593 /*
4594 * Next, we attempt to speed up the syncer process. If that
4595 * is successful, then we allow the process to continue.
4596 */
4597 if (speedup_syncer())
4598 return(0);
4599 /*
4600 * If we are resource constrained on inode dependencies, try
4601 * flushing some dirty inodes. Otherwise, we are constrained
4602 * by file deletions, so try accelerating flushes of directories
4603 * with removal dependencies. We would like to do the cleanup
4604 * here, but we probably hold an inode locked at this point and
4605 * that might deadlock against one that we try to clean. So,
4606 * the best that we can do is request the syncer daemon to do
4607 * the cleanup for us.
4608 */
4609 switch (resource) {
4610
4611 case FLUSH_INODES:
4612 stat_ino_limit_push += 1;
4613 req_clear_inodedeps += 1;
4614 stat_countp = &stat_ino_limit_hit;
4615 break;
4616
4617 case FLUSH_REMOVE:
4618 stat_blk_limit_push += 1;
4619 req_clear_remove += 1;
4620 stat_countp = &stat_blk_limit_hit;
4621 break;
4622
4623 default:
4624 if (islocked)
4625 FREE_LOCK(&lk);
4626 panic("request_cleanup: unknown type");
4627 }
4628 /*
4629 * Hopefully the syncer daemon will catch up and awaken us.
4630 * We wait at most tickdelay before proceeding in any case.
4631 */
4632 if (islocked == 0)
4633 ACQUIRE_LOCK(&lk);
4634 proc_waiting += 1;
4635 if (handle.callout == NULL)
4636 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4637 FREE_LOCK_INTERLOCKED(&lk);
4638 (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
4639 ACQUIRE_LOCK_INTERLOCKED(&lk);
4640 proc_waiting -= 1;
4641 if (islocked == 0)
4642 FREE_LOCK(&lk);
4643 return (1);
4644}
4645
4646/*
4647 * Awaken processes pausing in request_cleanup and clear proc_waiting
4648 * to indicate that there is no longer a timer running.
4649 */
4650void
4651pause_timer(arg)
4652 void *arg;
4653{
4654
4655 *stat_countp += 1;
4656 wakeup_one(&proc_waiting);
4657 if (proc_waiting > 0)
4658 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4659 else
4660 handle.callout = NULL;
4661}
4662
4663/*
4664 * Flush out a directory with at least one removal dependency in an effort to
4665 * reduce the number of dirrem, freefile, and freeblks dependency structures.
4666 */
4667static void
4668clear_remove(p)
4669 struct proc *p;
4670{
4671 struct pagedep_hashhead *pagedephd;
4672 struct pagedep *pagedep;
4673 static int next = 0;
4674 struct mount *mp;
4675 struct vnode *vp;
4676 int error, cnt;
4677 ino_t ino;
4678
4679 ACQUIRE_LOCK(&lk);
4680 for (cnt = 0; cnt < pagedep_hash; cnt++) {
4681 pagedephd = &pagedep_hashtbl[next++];
4682 if (next >= pagedep_hash)
4683 next = 0;
4684 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
4685 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4686 continue;
4687 mp = pagedep->pd_mnt;
4688 ino = pagedep->pd_ino;
4689 FREE_LOCK(&lk);
4690 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
4691 continue;
4692 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4693 softdep_error("clear_remove: vget", error);
4694 vn_finished_write(mp);
4695 return;
4696 }
4697 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4698 softdep_error("clear_remove: fsync", error);
4699 drain_output(vp, 0);
4700 vput(vp);
4701 vn_finished_write(mp);
4702 return;
4703 }
4704 }
4705 FREE_LOCK(&lk);
4706}
4707
4708/*
4709 * Clear out a block of dirty inodes in an effort to reduce
4710 * the number of inodedep dependency structures.
4711 */
4712static void
4713clear_inodedeps(p)
4714 struct proc *p;
4715{
4716 struct inodedep_hashhead *inodedephd;
4717 struct inodedep *inodedep;
4718 static int next = 0;
4719 struct mount *mp;
4720 struct vnode *vp;
4721 struct fs *fs;
4722 int error, cnt;
4723 ino_t firstino, lastino, ino;
4724
4725 ACQUIRE_LOCK(&lk);
4726 /*
4727 * Pick a random inode dependency to be cleared.
4728 * We will then gather up all the inodes in its block
4729 * that have dependencies and flush them out.
4730 */
4731 for (cnt = 0; cnt < inodedep_hash; cnt++) {
4732 inodedephd = &inodedep_hashtbl[next++];
4733 if (next >= inodedep_hash)
4734 next = 0;
4735 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4736 break;
4737 }
4738 if (inodedep == NULL)
4739 return;
4740 /*
4741 * Ugly code to find mount point given pointer to superblock.
4742 */
4743 fs = inodedep->id_fs;
4744 TAILQ_FOREACH(mp, &mountlist, mnt_list)
4745 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
4746 break;
4747 /*
4748 * Find the last inode in the block with dependencies.
4749 */
4750 firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4751 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4752 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4753 break;
4754 /*
4755 * Asynchronously push all but the last inode with dependencies.
4756 * Synchronously push the last inode with dependencies to ensure
4757 * that the inode block gets written to free up the inodedeps.
4758 */
4759 for (ino = firstino; ino <= lastino; ino++) {
4760 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4761 continue;
4762 FREE_LOCK(&lk);
4763 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
4764 continue;
4765 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4766 softdep_error("clear_inodedeps: vget", error);
4767 vn_finished_write(mp);
4768 return;
4769 }
4770 if (ino == lastino) {
4771 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
4772 softdep_error("clear_inodedeps: fsync1", error);
4773 } else {
4774 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4775 softdep_error("clear_inodedeps: fsync2", error);
4776 drain_output(vp, 0);
4777 }
4778 vput(vp);
4779 vn_finished_write(mp);
4780 ACQUIRE_LOCK(&lk);
4781 }
4782 FREE_LOCK(&lk);
4783}
4784
4785/*
4786 * Function to determine if the buffer has outstanding dependencies
4787 * that will cause a roll-back if the buffer is written. If wantcount
4788 * is set, return number of dependencies, otherwise just yes or no.
4789 */
4790static int
4791softdep_count_dependencies(bp, wantcount)
4792 struct buf *bp;
4793 int wantcount;
4794{
4795 struct worklist *wk;
4796 struct inodedep *inodedep;
4797 struct indirdep *indirdep;
4798 struct allocindir *aip;
4799 struct pagedep *pagedep;
4800 struct diradd *dap;
4801 int i, retval;
4802
4803 retval = 0;
4804 ACQUIRE_LOCK(&lk);
4805 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4806 switch (wk->wk_type) {
4807
4808 case D_INODEDEP:
4809 inodedep = WK_INODEDEP(wk);
4810 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4811 /* bitmap allocation dependency */
4812 retval += 1;
4813 if (!wantcount)
4814 goto out;
4815 }
4816 if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4817 /* direct block pointer dependency */
4818 retval += 1;
4819 if (!wantcount)
4820 goto out;
4821 }
4822 continue;
4823
4824 case D_INDIRDEP:
4825 indirdep = WK_INDIRDEP(wk);
4826
4827 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
4828 /* indirect block pointer dependency */
4829 retval += 1;
4830 if (!wantcount)
4831 goto out;
4832 }
4833 continue;
4834
4835 case D_PAGEDEP:
4836 pagedep = WK_PAGEDEP(wk);
4837 for (i = 0; i < DAHASHSZ; i++) {
4838
4839 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
4840 /* directory entry dependency */
4841 retval += 1;
4842 if (!wantcount)
4843 goto out;
4844 }
4845 }
4846 continue;
4847
4848 case D_BMSAFEMAP:
4849 case D_ALLOCDIRECT:
4850 case D_ALLOCINDIR:
4851 case D_MKDIR:
4852 /* never a dependency on these blocks */
4853 continue;
4854
4855 default:
4856 FREE_LOCK(&lk);
4857 panic("softdep_check_for_rollback: Unexpected type %s",
4858 TYPENAME(wk->wk_type));
4859 /* NOTREACHED */
4860 }
4861 }
4862out:
4863 FREE_LOCK(&lk);
4864 return retval;
4865}
4866
4867/*
4868 * Acquire exclusive access to a buffer.
4869 * Must be called with splbio blocked.
4870 * Return 1 if buffer was acquired.
4871 */
4872static int
4873getdirtybuf(bpp, waitfor)
4874 struct buf **bpp;
4875 int waitfor;
4876{
4877 struct buf *bp;
4878
4879 for (;;) {
4880 if ((bp = *bpp) == NULL)
4881 return (0);
4882 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
4883 if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
4884 break;
4885 BUF_UNLOCK(bp);
4886 if (waitfor != MNT_WAIT)
4887 return (0);
4888 bp->b_xflags |= BX_BKGRDWAIT;
4889 FREE_LOCK_INTERLOCKED(&lk);
4890 tsleep(&bp->b_xflags, PRIBIO, "getbuf", 0);
4891 ACQUIRE_LOCK_INTERLOCKED(&lk);
4892 continue;
4893 }
4894 if (waitfor != MNT_WAIT)
4895 return (0);
4896 FREE_LOCK_INTERLOCKED(&lk);
4897 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK)
4898 panic("getdirtybuf: inconsistent lock");
4899 ACQUIRE_LOCK_INTERLOCKED(&lk);
4900 }
4901 if ((bp->b_flags & B_DELWRI) == 0) {
4902 BUF_UNLOCK(bp);
4903 return (0);
4904 }
4905 bremfree(bp);
4906 return (1);
4907}
4908
4909/*
4910 * Wait for pending output on a vnode to complete.
4911 * Must be called with vnode locked.
4912 */
4913static void
4914drain_output(vp, islocked)
4915 struct vnode *vp;
4916 int islocked;
4917{
4918
4919 if (!islocked)
4920 ACQUIRE_LOCK(&lk);
4921 while (vp->v_numoutput) {
4922 vp->v_flag |= VBWAIT;
4923 FREE_LOCK_INTERLOCKED(&lk);
4924 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0);
4925 ACQUIRE_LOCK_INTERLOCKED(&lk);
4926 }
4927 if (!islocked)
4928 FREE_LOCK(&lk);
4929}
4930
4931/*
4932 * Called whenever a buffer that is being invalidated or reallocated
4933 * contains dependencies. This should only happen if an I/O error has
4934 * occurred. The routine is called with the buffer locked.
4935 */
4936static void
4937softdep_deallocate_dependencies(bp)
4938 struct buf *bp;
4939{
4940
4941 if ((bp->b_ioflags & BIO_ERROR) == 0)
4942 panic("softdep_deallocate_dependencies: dangling deps");
4943 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4944 panic("softdep_deallocate_dependencies: unrecovered I/O error");
4945}
4946
4947/*
4948 * Function to handle asynchronous write errors in the filesystem.
4949 */
4950void
4951softdep_error(func, error)
4952 char *func;
4953 int error;
4954{
4955
4956 /* XXX should do something better! */
4957 printf("%s: got error %d while accessing filesystem\n", func, error);
4958}
3839 FREE_LOCK(&lk);
3840}
3841
3842/*
3843 * This routine is called just before the "in-core" inode
3844 * information is to be copied to the in-memory inode block.
3845 * Recall that an inode block contains several inodes. If
3846 * the force flag is set, then the dependencies will be
3847 * cleared so that the update can always be made. Note that
3848 * the buffer is locked when this routine is called, so we
3849 * will never be in the middle of writing the inode block
3850 * to disk.
3851 */
3852void
3853softdep_update_inodeblock(ip, bp, waitfor)
3854 struct inode *ip; /* the "in_core" copy of the inode */
3855 struct buf *bp; /* the buffer containing the inode block */
3856 int waitfor; /* nonzero => update must be allowed */
3857{
3858 struct inodedep *inodedep;
3859 struct worklist *wk;
3860 int error, gotit;
3861
3862 /*
3863 * If the effective link count is not equal to the actual link
3864 * count, then we must track the difference in an inodedep while
3865 * the inode is (potentially) tossed out of the cache. Otherwise,
3866 * if there is no existing inodedep, then there are no dependencies
3867 * to track.
3868 */
3869 ACQUIRE_LOCK(&lk);
3870 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3871 FREE_LOCK(&lk);
3872 if (ip->i_effnlink != ip->i_nlink)
3873 panic("softdep_update_inodeblock: bad link count");
3874 return;
3875 }
3876 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
3877 FREE_LOCK(&lk);
3878 panic("softdep_update_inodeblock: bad delta");
3879 }
3880 /*
3881 * Changes have been initiated. Anything depending on these
3882 * changes cannot occur until this inode has been written.
3883 */
3884 inodedep->id_state &= ~COMPLETE;
3885 if ((inodedep->id_state & ONWORKLIST) == 0)
3886 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3887 /*
3888 * Any new dependencies associated with the incore inode must
3889 * now be moved to the list associated with the buffer holding
3890 * the in-memory copy of the inode. Once merged, process any
3891 * allocdirects that are completed by the merge.
3892 */
3893 merge_inode_lists(inodedep);
3894 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3895 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3896 /*
3897 * Now that the inode has been pushed into the buffer, the
3898 * operations dependent on the inode being written to disk
3899 * can be moved to the id_bufwait so that they will be
3900 * processed when the buffer I/O completes.
3901 */
3902 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3903 WORKLIST_REMOVE(wk);
3904 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3905 }
3906 /*
3907 * Newly allocated inodes cannot be written until the bitmap
3908 * that allocates them has been written (indicated by
3909 * DEPCOMPLETE being set in id_state). If we are doing a
3910 * forced sync (e.g., an fsync on a file), we force the bitmap
3911 * to be written so that the update can be done.
3912 */
3913 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3914 FREE_LOCK(&lk);
3915 return;
3916 }
3917 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3918 FREE_LOCK(&lk);
3919 if (gotit &&
3920 (error = BUF_WRITE(inodedep->id_buf)) != 0)
3921 softdep_error("softdep_update_inodeblock: bwrite", error);
3922 if ((inodedep->id_state & DEPCOMPLETE) == 0)
3923 panic("softdep_update_inodeblock: update failed");
3924}
3925
3926/*
3927 * Merge the new inode dependency list (id_newinoupdt) into the old
3928 * inode dependency list (id_inoupdt). This routine must be called
3929 * with splbio interrupts blocked.
3930 */
3931static void
3932merge_inode_lists(inodedep)
3933 struct inodedep *inodedep;
3934{
3935 struct allocdirect *listadp, *newadp;
3936
3937 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3938 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3939 if (listadp->ad_lbn < newadp->ad_lbn) {
3940 listadp = TAILQ_NEXT(listadp, ad_next);
3941 continue;
3942 }
3943 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3944 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3945 if (listadp->ad_lbn == newadp->ad_lbn) {
3946 allocdirect_merge(&inodedep->id_inoupdt, newadp,
3947 listadp);
3948 listadp = newadp;
3949 }
3950 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3951 }
3952 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3953 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3954 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3955 }
3956}
3957
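/*
 * A minimal, self-contained sketch (illustrative only; the example_dep type
 * and function name are invented for this sketch and are not part of the
 * soft updates code) of the merge performed by merge_inode_lists() above:
 * two lists kept sorted by logical block number are combined, and when both
 * lists carry an entry for the same block the entry from the new list
 * supersedes the old one.  The code above calls allocdirect_merge() at that
 * point to combine the two entries rather than simply dropping one.
 */
#if 0
#include <stddef.h>

struct example_dep {
	long lbn;			/* logical block number, the sort key */
	struct example_dep *next;
};

static struct example_dep *
example_merge_sorted(struct example_dep *oldlist, struct example_dep *newlist)
{
	struct example_dep head, *tail;

	head.next = NULL;
	tail = &head;
	while (oldlist != NULL && newlist != NULL) {
		if (oldlist->lbn < newlist->lbn) {
			tail->next = oldlist;
			oldlist = oldlist->next;
		} else if (newlist->lbn < oldlist->lbn) {
			tail->next = newlist;
			newlist = newlist->next;
		} else {
			/* Same block in both lists: keep the newer entry. */
			oldlist = oldlist->next;
			tail->next = newlist;
			newlist = newlist->next;
		}
		tail = tail->next;
	}
	tail->next = (oldlist != NULL) ? oldlist : newlist;
	return (head.next);
}
#endif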
3958/*
3959 * If we are doing an fsync, then we must ensure that any directory
3960 * entries for the inode have been written after the inode gets to disk.
3961 */
3962int
3963softdep_fsync(vp)
3964 struct vnode *vp; /* the "in_core" copy of the inode */
3965{
3966 struct inodedep *inodedep;
3967 struct pagedep *pagedep;
3968 struct worklist *wk;
3969 struct diradd *dap;
3970 struct mount *mnt;
3971 struct vnode *pvp;
3972 struct inode *ip;
3973 struct buf *bp;
3974 struct fs *fs;
3975 struct proc *p = CURPROC; /* XXX */
3976 int error, flushparent;
3977 ino_t parentino;
3978 ufs_lbn_t lbn;
3979
3980 ip = VTOI(vp);
3981 fs = ip->i_fs;
3982 ACQUIRE_LOCK(&lk);
3983 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3984 FREE_LOCK(&lk);
3985 return (0);
3986 }
3987 if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3988 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3989 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3990 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
3991 FREE_LOCK(&lk);
3992 panic("softdep_fsync: pending ops");
3993 }
3994 for (error = 0, flushparent = 0; ; ) {
3995 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3996 break;
3997 if (wk->wk_type != D_DIRADD) {
3998 FREE_LOCK(&lk);
3999 panic("softdep_fsync: Unexpected type %s",
4000 TYPENAME(wk->wk_type));
4001 }
4002 dap = WK_DIRADD(wk);
4003 /*
4004 * Flush our parent if this directory entry
4005 * has a MKDIR_PARENT dependency.
4006 */
4007 if (dap->da_state & DIRCHG)
4008 pagedep = dap->da_previous->dm_pagedep;
4009 else
4010 pagedep = dap->da_pagedep;
4011 mnt = pagedep->pd_mnt;
4012 parentino = pagedep->pd_ino;
4013 lbn = pagedep->pd_lbn;
4014 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4015 FREE_LOCK(&lk);
4016 panic("softdep_fsync: dirty");
4017 }
4018 flushparent = dap->da_state & MKDIR_PARENT;
4019 /*
4020 * If we are being fsync'ed as part of vgone'ing this vnode,
4021 * then we will not be able to release and recover the
4022 * vnode below, so we just have to give up on writing its
4023 * directory entry out. It will eventually be written, just
4024 * not now, but then the user was not asking to have it
4025 * written, so we are not breaking any promises.
4026 */
4027 if (vp->v_flag & VXLOCK)
4028 break;
4029 /*
4030 * We prevent deadlock by always fetching inodes from the
4031 * root, moving down the directory tree. Thus, when fetching
4032 * our parent directory, we must unlock ourselves before
4033 * requesting the lock on our parent. See the comment in
4034 * ufs_lookup for details on possible races.
4035 */
4036 FREE_LOCK(&lk);
4037 VOP_UNLOCK(vp, 0, p);
4038 error = VFS_VGET(mnt, parentino, &pvp);
4039 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
4040 if (error != 0)
4041 return (error);
4042 if (flushparent) {
4043 if ((error = UFS_UPDATE(pvp, 1)) != 0) {
4044 vput(pvp);
4045 return (error);
4046 }
4047 }
4048 /*
4049 * Flush directory page containing the inode's name.
4050 */
4051 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
4052 &bp);
4053 if (error == 0)
4054 error = BUF_WRITE(bp);
4055 vput(pvp);
4056 if (error != 0)
4057 return (error);
4058 ACQUIRE_LOCK(&lk);
4059 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4060 break;
4061 }
4062 FREE_LOCK(&lk);
4063 return (0);
4064}
4065
4066/*
4067 * Flush all the dirty bitmaps associated with the block device
4068 * before flushing the rest of the dirty blocks so as to reduce
4069 * the number of dependencies that will have to be rolled back.
4070 */
4071void
4072softdep_fsync_mountdev(vp)
4073 struct vnode *vp;
4074{
4075 struct buf *bp, *nbp;
4076 struct worklist *wk;
4077
4078 if (!vn_isdisk(vp, NULL))
4079 panic("softdep_fsync_mountdev: vnode not a disk");
4080 ACQUIRE_LOCK(&lk);
4081 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4082 nbp = TAILQ_NEXT(bp, b_vnbufs);
4083 /*
4084 * If it is already scheduled, skip to the next buffer.
4085 */
4086 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
4087 continue;
4088 if ((bp->b_flags & B_DELWRI) == 0) {
4089 FREE_LOCK(&lk);
4090 panic("softdep_fsync_mountdev: not dirty");
4091 }
4092 /*
4093 * We are only interested in bitmaps with outstanding
4094 * dependencies.
4095 */
4096 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4097 wk->wk_type != D_BMSAFEMAP ||
4098 (bp->b_xflags & BX_BKGRDINPROG)) {
4099 BUF_UNLOCK(bp);
4100 continue;
4101 }
4102 bremfree(bp);
4103 FREE_LOCK(&lk);
4104 (void) bawrite(bp);
4105 ACQUIRE_LOCK(&lk);
4106 /*
4107 * Since we may have slept during the I/O, we need
4108 * to start from a known point.
4109 */
4110 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4111 }
4112 drain_output(vp, 1);
4113 FREE_LOCK(&lk);
4114}
4115
4116/*
4117 * This routine is called when we are trying to synchronously flush a
4118 * file. It must eliminate any filesystem metadata dependencies
4119 * so that the syncing routine can succeed by pushing the dirty blocks
4120 * associated with the file. If any I/O errors occur, they are returned.
4121 */
4122int
4123softdep_sync_metadata(ap)
4124 struct vop_fsync_args /* {
4125 struct vnode *a_vp;
4126 struct ucred *a_cred;
4127 int a_waitfor;
4128 struct proc *a_p;
4129 } */ *ap;
4130{
4131 struct vnode *vp = ap->a_vp;
4132 struct pagedep *pagedep;
4133 struct allocdirect *adp;
4134 struct allocindir *aip;
4135 struct buf *bp, *nbp;
4136 struct worklist *wk;
4137 int i, error, waitfor;
4138
4139 /*
4140 * Check whether this vnode is involved in a filesystem
4141 * that is doing soft dependency processing.
4142 */
4143 if (!vn_isdisk(vp, NULL)) {
4144 if (!DOINGSOFTDEP(vp))
4145 return (0);
4146 } else
4147 if (vp->v_rdev->si_mountpoint == NULL ||
4148 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4149 return (0);
4150 /*
4151 * Ensure that any direct block dependencies have been cleared.
4152 */
4153 ACQUIRE_LOCK(&lk);
4154 if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4155 FREE_LOCK(&lk);
4156 return (error);
4157 }
4158 /*
4159 * For most files, the only metadata dependencies are the
4160 * cylinder group maps that allocate their inode or blocks.
4161 * The block allocation dependencies can be found by traversing
4162 * the dependency lists for any buffers that remain on their
4163 * dirty buffer list. The inode allocation dependency will
4164 * be resolved when the inode is updated with MNT_WAIT.
4165 * This work is done in two passes. The first pass grabs most
4166 * of the buffers and begins asynchronously writing them. The
4167 * only way to wait for these asynchronous writes is to sleep
4168 * on the filesystem vnode which may stay busy for a long time
4169 * if the filesystem is active. So, instead, we make a second
4170 * pass over the dependencies blocking on each write. In the
4171 * usual case we will be blocking against a write that we
4172 * initiated, so when it is done the dependency will have been
4173 * resolved. Thus the second pass is expected to end quickly.
4174 */
4175 waitfor = MNT_NOWAIT;
4176top:
4177 if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
4178 FREE_LOCK(&lk);
4179 return (0);
4180 }
4181 bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4182 /* While syncing snapshots, we must allow recursive lookups */
4183 bp->b_lock.lk_flags |= LK_CANRECURSE;
4184loop:
4185 /*
4186 * As we hold the buffer locked, none of its dependencies
4187 * will disappear.
4188 */
4189 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4190 switch (wk->wk_type) {
4191
4192 case D_ALLOCDIRECT:
4193 adp = WK_ALLOCDIRECT(wk);
4194 if (adp->ad_state & DEPCOMPLETE)
4195 continue;
4196 nbp = adp->ad_buf;
4197 if (getdirtybuf(&nbp, waitfor) == 0)
4198 continue;
4199 FREE_LOCK(&lk);
4200 if (waitfor == MNT_NOWAIT) {
4201 bawrite(nbp);
4202 } else if ((error = BUF_WRITE(nbp)) != 0) {
4203 break;
4204 }
4205 ACQUIRE_LOCK(&lk);
4206 continue;
4207
4208 case D_ALLOCINDIR:
4209 aip = WK_ALLOCINDIR(wk);
4210 if (aip->ai_state & DEPCOMPLETE)
4211 continue;
4212 nbp = aip->ai_buf;
4213 if (getdirtybuf(&nbp, waitfor) == 0)
4214 continue;
4215 FREE_LOCK(&lk);
4216 if (waitfor == MNT_NOWAIT) {
4217 bawrite(nbp);
4218 } else if ((error = BUF_WRITE(nbp)) != 0) {
4219 break;
4220 }
4221 ACQUIRE_LOCK(&lk);
4222 continue;
4223
4224 case D_INDIRDEP:
4225 restart:
4226
4227 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4228 if (aip->ai_state & DEPCOMPLETE)
4229 continue;
4230 nbp = aip->ai_buf;
4231 if (getdirtybuf(&nbp, MNT_WAIT) == 0)
4232 goto restart;
4233 FREE_LOCK(&lk);
4234 if ((error = BUF_WRITE(nbp)) != 0) {
4235 break;
4236 }
4237 ACQUIRE_LOCK(&lk);
4238 goto restart;
4239 }
4240 continue;
4241
4242 case D_INODEDEP:
4243 if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4244 WK_INODEDEP(wk)->id_ino)) != 0) {
4245 FREE_LOCK(&lk);
4246 break;
4247 }
4248 continue;
4249
4250 case D_PAGEDEP:
4251 /*
4252 * We are trying to sync a directory that may
4253 * have dependencies on its own metadata and/or
4254 * on the inodes of any recently allocated files.
4255 * We walk its diradd lists, pushing out the
4256 * associated inode.
4257 */
4258 pagedep = WK_PAGEDEP(wk);
4259 for (i = 0; i < DAHASHSZ; i++) {
4260 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4261 continue;
4262 if ((error =
4263 flush_pagedep_deps(vp, pagedep->pd_mnt,
4264 &pagedep->pd_diraddhd[i]))) {
4265 FREE_LOCK(&lk);
4266 break;
4267 }
4268 }
4269 continue;
4270
4271 case D_MKDIR:
4272 /*
4273 * This case should never happen if the vnode has
4274 * been properly sync'ed. However, if this function
4275 * is used at a place where the vnode has not yet
4276 * been sync'ed, this dependency can show up. So,
4277 * rather than panic, just flush it.
4278 */
4279 nbp = WK_MKDIR(wk)->md_buf;
4280 if (getdirtybuf(&nbp, waitfor) == 0)
4281 continue;
4282 FREE_LOCK(&lk);
4283 if (waitfor == MNT_NOWAIT) {
4284 bawrite(nbp);
4285 } else if ((error = BUF_WRITE(nbp)) != 0) {
4286 break;
4287 }
4288 ACQUIRE_LOCK(&lk);
4289 continue;
4290
4291 case D_BMSAFEMAP:
4292 /*
4293 * This case should never happen if the vnode has
4294 * been properly sync'ed. However, if this function
4295 * is used at a place where the vnode has not yet
4296 * been sync'ed, this dependency can show up. So,
4297 * rather than panic, just flush it.
4298 */
4299 nbp = WK_BMSAFEMAP(wk)->sm_buf;
4300 if (getdirtybuf(&nbp, waitfor) == 0)
4301 continue;
4302 FREE_LOCK(&lk);
4303 if (waitfor == MNT_NOWAIT) {
4304 bawrite(nbp);
4305 } else if ((error = BUF_WRITE(nbp)) != 0) {
4306 break;
4307 }
4308 ACQUIRE_LOCK(&lk);
4309 continue;
4310
4311 default:
4312 FREE_LOCK(&lk);
4313 panic("softdep_sync_metadata: Unknown type %s",
4314 TYPENAME(wk->wk_type));
4315 /* NOTREACHED */
4316 }
4317 /* We reach here only on error, with the lock released */
4318 if (error == 0)
4319 panic("softdep_sync_metadata: zero error");
4320 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4321 bawrite(bp);
4322 return (error);
4323 }
4324 (void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4325 nbp = TAILQ_NEXT(bp, b_vnbufs);
4326 FREE_LOCK(&lk);
4327 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4328 bawrite(bp);
4329 ACQUIRE_LOCK(&lk);
4330 if (nbp != NULL) {
4331 bp = nbp;
4332 goto loop;
4333 }
4334 /*
4335 * We must wait for any I/O in progress to finish so that
4336 * all potential buffers on the dirty list will be visible.
4337 * Once they are all there, proceed with the second pass
4338 * which will wait for the I/O as per above.
4339 */
4340 drain_output(vp, 1);
4341 /*
4342 * The brief unlock is to allow any pent-up dependency
4343 * processing to be done.
4344 */
4345 if (waitfor == MNT_NOWAIT) {
4346 waitfor = MNT_WAIT;
4347 FREE_LOCK(&lk);
4348 ACQUIRE_LOCK(&lk);
4349 goto top;
4350 }
4351
4352 /*
4353 * If we have managed to get rid of all the dirty buffers,
4354 * then we are done. For certain directories and block
4355 * devices, we may need to do further work.
4356 */
4357 if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4358 FREE_LOCK(&lk);
4359 return (0);
4360 }
4361
4362 FREE_LOCK(&lk);
4363 /*
4364 * If we are trying to sync a block device, some of its buffers may
4365 * contain metadata that cannot be written until the contents of some
4366 * partially written files have been written to disk. The only easy
4367 * way to accomplish this is to sync the entire filesystem (luckily
4368 * this happens rarely).
4369 */
4370 if (vn_isdisk(vp, NULL) &&
4371 vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
4372 (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
4373 ap->a_p)) != 0)
4374 return (error);
4375 return (0);
4376}
4377
4378/*
4379 * Flush the dependencies associated with an inodedep.
4380 * Called with splbio blocked.
4381 */
4382static int
4383flush_inodedep_deps(fs, ino)
4384 struct fs *fs;
4385 ino_t ino;
4386{
4387 struct inodedep *inodedep;
4388 struct allocdirect *adp;
4389 int error, waitfor;
4390 struct buf *bp;
4391
4392 /*
4393 * This work is done in two passes. The first pass grabs most
4394 * of the buffers and begins asynchronously writing them. The
4395 * only way to wait for these asynchronous writes is to sleep
4396 * on the filesystem vnode which may stay busy for a long time
4397 * if the filesystem is active. So, instead, we make a second
4398 * pass over the dependencies blocking on each write. In the
4399 * usual case we will be blocking against a write that we
4400 * initiated, so when it is done the dependency will have been
4401 * resolved. Thus the second pass is expected to end quickly.
4402 * We give a brief window at the top of the loop to allow
4403 * any pending I/O to complete.
4404 */
4405 for (waitfor = MNT_NOWAIT; ; ) {
4406 FREE_LOCK(&lk);
4407 ACQUIRE_LOCK(&lk);
4408 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4409 return (0);
4410 TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
4411 if (adp->ad_state & DEPCOMPLETE)
4412 continue;
4413 bp = adp->ad_buf;
4414 if (getdirtybuf(&bp, waitfor) == 0) {
4415 if (waitfor == MNT_NOWAIT)
4416 continue;
4417 break;
4418 }
4419 FREE_LOCK(&lk);
4420 if (waitfor == MNT_NOWAIT) {
4421 bawrite(bp);
4422 } else if ((error = BUF_WRITE(bp)) != 0) {
4423 ACQUIRE_LOCK(&lk);
4424 return (error);
4425 }
4426 ACQUIRE_LOCK(&lk);
4427 break;
4428 }
4429 if (adp != NULL)
4430 continue;
4431 TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
4432 if (adp->ad_state & DEPCOMPLETE)
4433 continue;
4434 bp = adp->ad_buf;
4435 if (getdirtybuf(&bp, waitfor) == 0) {
4436 if (waitfor == MNT_NOWAIT)
4437 continue;
4438 break;
4439 }
4440 FREE_LOCK(&lk);
4441 if (waitfor == MNT_NOWAIT) {
4442 bawrite(bp);
4443 } else if ((error = BUF_WRITE(bp)) != 0) {
4444 ACQUIRE_LOCK(&lk);
4445 return (error);
4446 }
4447 ACQUIRE_LOCK(&lk);
4448 break;
4449 }
4450 if (adp != NULL)
4451 continue;
4452 /*
4453 * If this was the second pass, we are done; otherwise, do pass 2.
4454 */
4455 if (waitfor == MNT_WAIT)
4456 break;
4457 waitfor = MNT_WAIT;
4458 }
4459 /*
4460 * Try freeing inodedep in case all dependencies have been removed.
4461 */
4462 if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4463 (void) free_inodedep(inodedep);
4464 return (0);
4465}
4466
4467/*
4468 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4469 * Called with splbio blocked.
4470 */
4471static int
4472flush_pagedep_deps(pvp, mp, diraddhdp)
4473 struct vnode *pvp;
4474 struct mount *mp;
4475 struct diraddhd *diraddhdp;
4476{
4477 struct proc *p = CURPROC; /* XXX */
4478 struct inodedep *inodedep;
4479 struct ufsmount *ump;
4480 struct diradd *dap;
4481 struct vnode *vp;
4482 int gotit, error = 0;
4483 struct buf *bp;
4484 ino_t inum;
4485
4486 ump = VFSTOUFS(mp);
4487 while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4488 /*
4489 * Flush ourselves if this directory entry
4490 * has a MKDIR_PARENT dependency.
4491 */
4492 if (dap->da_state & MKDIR_PARENT) {
4493 FREE_LOCK(&lk);
4494 if ((error = UFS_UPDATE(pvp, 1)) != 0)
4495 break;
4496 ACQUIRE_LOCK(&lk);
4497 /*
4498 * If that cleared dependencies, go on to next.
4499 */
4500 if (dap != LIST_FIRST(diraddhdp))
4501 continue;
4502 if (dap->da_state & MKDIR_PARENT) {
4503 FREE_LOCK(&lk);
4504 panic("flush_pagedep_deps: MKDIR_PARENT");
4505 }
4506 }
4507 /*
4508 * A newly allocated directory must have its "." and
4509 * ".." entries written out before its name can be
4510 * committed in its parent. We do not want or need
4511 * the full semantics of a synchronous VOP_FSYNC as
4512 * that may end up here again, once for each directory
4513 * level in the filesystem. Instead, we push the blocks
4514 * and wait for them to clear. We have to fsync twice
4515 * because the first call may choose to defer blocks
4516 * that still have dependencies, but deferral will
4517 * happen at most once.
4518 */
4519 inum = dap->da_newinum;
4520 if (dap->da_state & MKDIR_BODY) {
4521 FREE_LOCK(&lk);
4522 if ((error = VFS_VGET(mp, inum, &vp)) != 0)
4523 break;
4524 if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
4525 (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
4526 vput(vp);
4527 break;
4528 }
4529 drain_output(vp, 0);
4530 vput(vp);
4531 ACQUIRE_LOCK(&lk);
4532 /*
4533 * If that cleared dependencies, go on to next.
4534 */
4535 if (dap != LIST_FIRST(diraddhdp))
4536 continue;
4537 if (dap->da_state & MKDIR_BODY) {
4538 FREE_LOCK(&lk);
4539 panic("flush_pagedep_deps: MKDIR_BODY");
4540 }
4541 }
4542 /*
4543 * Flush the inode on which the directory entry depends.
4544 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
4545 * the only remaining dependency is that the updated inode
4546 * count must get pushed to disk. The inode has already
4547 * been pushed into its inode buffer (via VOP_UPDATE) at
4548 * the time of the reference count change. So we need only
4549 * locate that buffer, ensure that there will be no rollback
4550 * caused by a bitmap dependency, then write the inode buffer.
4551 */
4552 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
4553 FREE_LOCK(&lk);
4554 panic("flush_pagedep_deps: lost inode");
4555 }
4556 /*
4557 * If the inode still has bitmap dependencies,
4558 * push them to disk.
4559 */
4560 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4561 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4562 FREE_LOCK(&lk);
4563 if (gotit &&
4564 (error = BUF_WRITE(inodedep->id_buf)) != 0)
4565 break;
4566 ACQUIRE_LOCK(&lk);
4567 if (dap != LIST_FIRST(diraddhdp))
4568 continue;
4569 }
4570 /*
4571 * If the inode is still sitting in a buffer waiting
4572 * to be written, push it to disk.
4573 */
4574 FREE_LOCK(&lk);
4575 if ((error = bread(ump->um_devvp,
4576 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4577 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4578 break;
4579 if ((error = BUF_WRITE(bp)) != 0)
4580 break;
4581 ACQUIRE_LOCK(&lk);
4582 /*
4583 * If we have failed to get rid of all the dependencies
4584 * then something is seriously wrong.
4585 */
4586 if (dap == LIST_FIRST(diraddhdp)) {
4587 FREE_LOCK(&lk);
4588 panic("flush_pagedep_deps: flush failed");
4589 }
4590 }
4591 if (error)
4592 ACQUIRE_LOCK(&lk);
4593 return (error);
4594}
4595
4596/*
4597 * A large burst of file addition or deletion activity can drive the
4598 * memory load excessively high. First attempt to slow things down
4599 * using the techniques below. If that fails, this routine requests
4600 * the offending operations to fall back to running synchronously
4601 * until the memory load returns to a reasonable level.
4602 */
4603int
4604softdep_slowdown(vp)
4605 struct vnode *vp;
4606{
4607 int max_softdeps_hard;
4608
4609 max_softdeps_hard = max_softdeps * 11 / 10;
4610 if (num_dirrem < max_softdeps_hard / 2 &&
4611 num_inodedep < max_softdeps_hard)
4612 return (0);
4613 stat_sync_limit_hit += 1;
4614 return (1);
4615}
4616
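/*
 * Worked example of the thresholds in softdep_slowdown() above, using a
 * purely illustrative max_softdeps of 8000: max_softdeps_hard becomes
 * 8000 * 11 / 10 = 8800, so the routine starts returning 1 (asking its
 * caller to fall back to synchronous operation) once num_dirrem reaches
 * 4400 or num_inodedep reaches 8800.
 */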
4617/*
4618 * If memory utilization has gotten too high, deliberately slow things
4619 * down and speed up the I/O processing.
4620 */
4621static int
4622request_cleanup(resource, islocked)
4623 int resource;
4624 int islocked;
4625{
4626 struct proc *p = CURPROC;
4627
4628 /*
4629 * We never hold up the filesystem syncer process.
4630 */
4631 if (p == filesys_syncer)
4632 return (0);
4633 /*
4634 * First check to see if the work list has gotten backlogged.
4635 * If it has, co-opt this process to help clean up two entries.
4636 * Because this process may hold inodes locked, we cannot
4637 * handle any remove requests that might block on a locked
4638 * inode as that could lead to deadlock.
4639 */
4640 if (num_on_worklist > max_softdeps / 10) {
4641 if (islocked)
4642 FREE_LOCK(&lk);
4643 process_worklist_item(NULL, LK_NOWAIT);
4644 process_worklist_item(NULL, LK_NOWAIT);
4645 stat_worklist_push += 2;
4646 if (islocked)
4647 ACQUIRE_LOCK(&lk);
4648 return(1);
4649 }
4650 /*
4651 * Next, we attempt to speed up the syncer process. If that
4652 * is successful, then we allow the process to continue.
4653 */
4654 if (speedup_syncer())
4655 return(0);
4656 /*
4657 * If we are resource constrained on inode dependencies, try
4658 * flushing some dirty inodes. Otherwise, we are constrained
4659 * by file deletions, so try accelerating flushes of directories
4660 * with removal dependencies. We would like to do the cleanup
4661 * here, but we probably hold an inode locked at this point and
4662 * that might deadlock against one that we try to clean. So,
4663 * the best that we can do is request the syncer daemon to do
4664 * the cleanup for us.
4665 */
4666 switch (resource) {
4667
4668 case FLUSH_INODES:
4669 stat_ino_limit_push += 1;
4670 req_clear_inodedeps += 1;
4671 stat_countp = &stat_ino_limit_hit;
4672 break;
4673
4674 case FLUSH_REMOVE:
4675 stat_blk_limit_push += 1;
4676 req_clear_remove += 1;
4677 stat_countp = &stat_blk_limit_hit;
4678 break;
4679
4680 default:
4681 if (islocked)
4682 FREE_LOCK(&lk);
4683 panic("request_cleanup: unknown type");
4684 }
4685 /*
4686 * Hopefully the syncer daemon will catch up and awaken us.
4687 * We wait at most tickdelay before proceeding in any case.
4688 */
4689 if (islocked == 0)
4690 ACQUIRE_LOCK(&lk);
4691 proc_waiting += 1;
4692 if (handle.callout == NULL)
4693 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4694 FREE_LOCK_INTERLOCKED(&lk);
4695 (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
4696 ACQUIRE_LOCK_INTERLOCKED(&lk);
4697 proc_waiting -= 1;
4698 if (islocked == 0)
4699 FREE_LOCK(&lk);
4700 return (1);
4701}
4702
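/*
 * A small worked note on the pause in request_cleanup() above: the callout
 * is always scheduled at least two ticks out ("tickdelay > 2 ? tickdelay :
 * 2"), so even with tickdelay tuned down to 0 or 1 the sleeping process
 * still gives the syncer a brief window before it is awakened, at the
 * latest by pause_timer() below.
 */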
4703/*
4704 * Awaken processes pausing in request_cleanup and clear the callout
4705 * handle to indicate that there is no longer a timer running.
4706 */
4707void
4708pause_timer(arg)
4709 void *arg;
4710{
4711
4712 *stat_countp += 1;
4713 wakeup_one(&proc_waiting);
4714 if (proc_waiting > 0)
4715 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
4716 else
4717 handle.callout = NULL;
4718}
4719
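/*
 * Note on pause_timer() above: wakeup_one() releases a single sleeper per
 * expiry, and the callout re-arms itself while proc_waiting is still
 * positive, so a burst of paused processes is released one per timeout
 * interval rather than all at once.
 */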
4720/*
4721 * Flush out a directory with at least one removal dependency in an effort to
4722 * reduce the number of dirrem, freefile, and freeblks dependency structures.
4723 */
4724static void
4725clear_remove(p)
4726 struct proc *p;
4727{
4728 struct pagedep_hashhead *pagedephd;
4729 struct pagedep *pagedep;
4730 static int next = 0;
4731 struct mount *mp;
4732 struct vnode *vp;
4733 int error, cnt;
4734 ino_t ino;
4735
4736 ACQUIRE_LOCK(&lk);
4737 for (cnt = 0; cnt < pagedep_hash; cnt++) {
4738 pagedephd = &pagedep_hashtbl[next++];
4739 if (next >= pagedep_hash)
4740 next = 0;
4741 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
4742 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4743 continue;
4744 mp = pagedep->pd_mnt;
4745 ino = pagedep->pd_ino;
4746 FREE_LOCK(&lk);
4747 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
4748 continue;
4749 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4750 softdep_error("clear_remove: vget", error);
4751 vn_finished_write(mp);
4752 return;
4753 }
4754 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4755 softdep_error("clear_remove: fsync", error);
4756 drain_output(vp, 0);
4757 vput(vp);
4758 vn_finished_write(mp);
4759 return;
4760 }
4761 }
4762 FREE_LOCK(&lk);
4763}
4764
4765/*
4766 * Clear out a block of dirty inodes in an effort to reduce
4767 * the number of inodedep dependency structures.
4768 */
4769static void
4770clear_inodedeps(p)
4771 struct proc *p;
4772{
4773 struct inodedep_hashhead *inodedephd;
4774 struct inodedep *inodedep;
4775 static int next = 0;
4776 struct mount *mp;
4777 struct vnode *vp;
4778 struct fs *fs;
4779 int error, cnt;
4780 ino_t firstino, lastino, ino;
4781
4782 ACQUIRE_LOCK(&lk);
4783 /*
4784 * Pick an arbitrary inode dependency to be cleared.
4785 * We will then gather up all the inodes in its block
4786 * that have dependencies and flush them out.
4787 */
4788 for (cnt = 0; cnt < inodedep_hash; cnt++) {
4789 inodedephd = &inodedep_hashtbl[next++];
4790 if (next >= inodedep_hash)
4791 next = 0;
4792 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4793 break;
4794 }
4795 if (inodedep == NULL)
4796 return;
4797 /*
4798 * Ugly code to find mount point given pointer to superblock.
4799 */
4800 fs = inodedep->id_fs;
4801 TAILQ_FOREACH(mp, &mountlist, mnt_list)
4802 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
4803 break;
4804 /*
4805 * Find the last inode in the block with dependencies.
4806 */
4807 firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4808 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4809 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4810 break;
4811 /*
4812 * Asynchronously push all but the last inode with dependencies.
4813 * Synchronously push the last inode with dependencies to ensure
4814 * that the inode block gets written to free up the inodedeps.
4815 */
4816 for (ino = firstino; ino <= lastino; ino++) {
4817 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4818 continue;
4819 FREE_LOCK(&lk);
4820 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
4821 continue;
4822 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4823 softdep_error("clear_inodedeps: vget", error);
4824 vn_finished_write(mp);
4825 return;
4826 }
4827 if (ino == lastino) {
4828 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
4829 softdep_error("clear_inodedeps: fsync1", error);
4830 } else {
4831 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4832 softdep_error("clear_inodedeps: fsync2", error);
4833 drain_output(vp, 0);
4834 }
4835 vput(vp);
4836 vn_finished_write(mp);
4837 ACQUIRE_LOCK(&lk);
4838 }
4839 FREE_LOCK(&lk);
4840}
4841
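/*
 * Worked example of the block-of-inodes arithmetic in clear_inodedeps()
 * above, assuming an illustrative INOPB(fs) of 64 (a power of two, as the
 * mask trick requires): for id_ino = 1234, firstino = 1234 & ~63 = 1216,
 * the scan for lastino starts at 1216 + 63 = 1279, and inodes 1216..1279,
 * which share one inode block, are flushed together.
 */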
4842/*
4843 * Function to determine if the buffer has outstanding dependencies
4844 * that will cause a roll-back if the buffer is written. If wantcount
4845 * is set, return number of dependencies, otherwise just yes or no.
4846 */
4847static int
4848softdep_count_dependencies(bp, wantcount)
4849 struct buf *bp;
4850 int wantcount;
4851{
4852 struct worklist *wk;
4853 struct inodedep *inodedep;
4854 struct indirdep *indirdep;
4855 struct allocindir *aip;
4856 struct pagedep *pagedep;
4857 struct diradd *dap;
4858 int i, retval;
4859
4860 retval = 0;
4861 ACQUIRE_LOCK(&lk);
4862 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4863 switch (wk->wk_type) {
4864
4865 case D_INODEDEP:
4866 inodedep = WK_INODEDEP(wk);
4867 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4868 /* bitmap allocation dependency */
4869 retval += 1;
4870 if (!wantcount)
4871 goto out;
4872 }
4873 if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4874 /* direct block pointer dependency */
4875 retval += 1;
4876 if (!wantcount)
4877 goto out;
4878 }
4879 continue;
4880
4881 case D_INDIRDEP:
4882 indirdep = WK_INDIRDEP(wk);
4883
4884 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
4885 /* indirect block pointer dependency */
4886 retval += 1;
4887 if (!wantcount)
4888 goto out;
4889 }
4890 continue;
4891
4892 case D_PAGEDEP:
4893 pagedep = WK_PAGEDEP(wk);
4894 for (i = 0; i < DAHASHSZ; i++) {
4895
4896 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
4897 /* directory entry dependency */
4898 retval += 1;
4899 if (!wantcount)
4900 goto out;
4901 }
4902 }
4903 continue;
4904
4905 case D_BMSAFEMAP:
4906 case D_ALLOCDIRECT:
4907 case D_ALLOCINDIR:
4908 case D_MKDIR:
4909 /* never a dependency on these blocks */
4910 continue;
4911
4912 default:
4913 FREE_LOCK(&lk);
4914 panic("softdep_count_dependencies: Unexpected type %s",
4915 TYPENAME(wk->wk_type));
4916 /* NOTREACHED */
4917 }
4918 }
4919out:
4920 FREE_LOCK(&lk);
4921 return (retval);
4922}
4923
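/*
 * Usage note for softdep_count_dependencies() above: with wantcount == 0
 * the scan stops at the first dependency found, so the return value is
 * simply zero or nonzero; with wantcount != 0 every dependency hanging off
 * the buffer is walked and the total number of would-be rollbacks is
 * returned.
 */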
4924/*
4925 * Acquire exclusive access to a buffer.
4926 * Must be called with splbio blocked.
4927 * Return 1 if buffer was acquired.
4928 */
4929static int
4930getdirtybuf(bpp, waitfor)
4931 struct buf **bpp;
4932 int waitfor;
4933{
4934 struct buf *bp;
4935
4936 for (;;) {
4937 if ((bp = *bpp) == NULL)
4938 return (0);
4939 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
4940 if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
4941 break;
4942 BUF_UNLOCK(bp);
4943 if (waitfor != MNT_WAIT)
4944 return (0);
4945 bp->b_xflags |= BX_BKGRDWAIT;
4946 FREE_LOCK_INTERLOCKED(&lk);
4947 tsleep(&bp->b_xflags, PRIBIO, "getbuf", 0);
4948 ACQUIRE_LOCK_INTERLOCKED(&lk);
4949 continue;
4950 }
4951 if (waitfor != MNT_WAIT)
4952 return (0);
4953 FREE_LOCK_INTERLOCKED(&lk);
4954 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK)
4955 panic("getdirtybuf: inconsistent lock");
4956 ACQUIRE_LOCK_INTERLOCKED(&lk);
4957 }
4958 if ((bp->b_flags & B_DELWRI) == 0) {
4959 BUF_UNLOCK(bp);
4960 return (0);
4961 }
4962 bremfree(bp);
4963 return (1);
4964}
4965
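/*
 * A condensed sketch of how getdirtybuf() is used elsewhere in this file
 * (see softdep_update_inodeblock() and flush_pagedep_deps() above): the
 * soft updates lock is held across the lookup and dropped around the
 * actual write of the acquired buffer.
 */
#if 0
	ACQUIRE_LOCK(&lk);
	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	FREE_LOCK(&lk);
	if (gotit && (error = BUF_WRITE(inodedep->id_buf)) != 0)
		softdep_error("example caller: bwrite", error);
#endif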
4966/*
4967 * Wait for pending output on a vnode to complete.
4968 * Must be called with vnode locked.
4969 */
4970static void
4971drain_output(vp, islocked)
4972 struct vnode *vp;
4973 int islocked;
4974{
4975
4976 if (!islocked)
4977 ACQUIRE_LOCK(&lk);
4978 while (vp->v_numoutput) {
4979 vp->v_flag |= VBWAIT;
4980 FREE_LOCK_INTERLOCKED(&lk);
4981 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0);
4982 ACQUIRE_LOCK_INTERLOCKED(&lk);
4983 }
4984 if (!islocked)
4985 FREE_LOCK(&lk);
4986}
4987
4988/*
4989 * Called whenever a buffer that is being invalidated or reallocated
4990 * contains dependencies. This should only happen if an I/O error has
4991 * occurred. The routine is called with the buffer locked.
4992 */
4993static void
4994softdep_deallocate_dependencies(bp)
4995 struct buf *bp;
4996{
4997
4998 if ((bp->b_ioflags & BIO_ERROR) == 0)
4999 panic("softdep_deallocate_dependencies: dangling deps");
5000 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5001 panic("softdep_deallocate_dependencies: unrecovered I/O error");
5002}
5003
5004/*
5005 * Function to handle asynchronous write errors in the filesystem.
5006 */
5007void
5008softdep_error(func, error)
5009 char *func;
5010 int error;
5011{
5012
5013 /* XXX should do something better! */
5014 printf("%s: got error %d while accessing filesystem\n", func, error);
5015}