ffs_softdep.c revision 36201
1/* 2 * Copyright 1997 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * The soft dependency code is derived from work done by Greg Ganger 5 * at the University of Michigan. 6 * 7 * The following are the copyrights and redistribution conditions that 8 * apply to this copy of the soft dependency software. For a license 9 * to use, redistribute or sell the soft dependency software under 10 * conditions other than those described here, please contact the 11 * author at one of the following addresses: 12 * 13 * Marshall Kirk McKusick mckusick@mckusick.com 14 * 1614 Oxford Street +1-510-843-9542 15 * Berkeley, CA 94709-1608 16 * USA 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 3. None of the names of McKusick, Ganger, or the University of Michigan 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 4. Redistributions in any form must be accompanied by information on 31 * how to obtain complete source code for any accompanying software 32 * that uses the this software. This source code must either be included 33 * in the distribution or be available for no more than the cost of 34 * distribution plus a nominal fee, and must be freely redistributable 35 * under reasonable conditions. For an executable file, complete 36 * source code means the source code for all modules it contains. 
37 * It does not mean source code for modules or files that typically 38 * accompany the operating system on which the executable file runs, 39 * e.g., standard library modules or system header files. 40 * 41 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 42 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 43 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 44 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 45 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 51 * SUCH DAMAGE. 52 * 53 * @(#)ffs_softdep.c 9.1 (McKusick) 7/9/97 54 */ 55 56#include <sys/param.h> 57#include <sys/buf.h> 58#include <sys/kernel.h> 59#include <sys/malloc.h> 60#include <sys/mount.h> 61#include <sys/syslog.h> 62#include <sys/systm.h> 63#include <sys/vnode.h> 64#include <miscfs/specfs/specdev.h> 65#include <ufs/ufs/dir.h> 66#include <ufs/ufs/quota.h> 67#include <ufs/ufs/inode.h> 68#include <ufs/ufs/ufsmount.h> 69#include <ufs/ffs/fs.h> 70#include <ufs/ffs/softdep.h> 71#include <ufs/ffs/ffs_extern.h> 72#include <ufs/ufs/ufs_extern.h> 73 74/* 75 * Internal function prototypes. 
 */
static void softdep_error __P((char *, int));
static int getdirtybuf __P((struct buf **, int));
static int flush_pagedep_deps __P((struct vnode *, struct pagedep *));
static int flush_inodedep_deps __P((struct fs *, ino_t));
static int handle_written_filepage __P((struct pagedep *, struct buf *));
static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static void handle_allocdirect_partdone __P((struct allocdirect *));
static void handle_allocindir_partdone __P((struct allocindir *));
static void initiate_write_filepage __P((struct pagedep *, struct buf *));
static void handle_written_mkdir __P((struct mkdir *, int));
static void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
static void handle_workitem_freefile __P((struct freefile *));
static void handle_workitem_remove __P((struct dirrem *));
static struct dirrem *newdirrem __P((struct buf *, struct inode *,
	    struct inode *, int));
static void free_diradd __P((struct diradd *));
static void free_allocindir __P((struct allocindir *, struct inodedep *));
static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	    long *));
static void deallocate_dependencies __P((struct buf *, struct inodedep *));
static void free_allocdirect __P((struct allocdirectlst *,
	    struct allocdirect *, int));
static int free_inodedep __P((struct inodedep *));
static void handle_workitem_freeblocks __P((struct freeblks *));
static void merge_inode_lists __P((struct inodedep *));
static void setup_allocindir_phase2 __P((struct buf *, struct inode *,
	    struct allocindir *));
static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
	    ufs_daddr_t));
static void handle_workitem_freefrag __P((struct freefrag *));
static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
static void allocdirect_merge __P((struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *));
static struct bmsafemap *bmsafemap_lookup __P((struct buf *));
static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	    struct newblk **));
static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	    struct pagedep **));
static void add_to_worklist __P((struct worklist *));

/*
 * Exported softdep operations.
 *
 * This is the hook table through which the buffer cache invokes the
 * soft update handlers named in the field comments below; it is the
 * only interface the rest of the kernel sees into this file's
 * dependency machinery.
 */
struct bio_ops bioops = {
	softdep_disk_io_initiation,		/* io_start */
	softdep_disk_write_complete,		/* io_complete */
	softdep_deallocate_dependencies,	/* io_deallocate */
	softdep_process_worklist,		/* io_sync */
};

/*
 * Names of malloc types.
 *
 * TYPENAME() maps a malloc/workitem type number to its printable name
 * for diagnostics; out-of-range values yield "???" rather than indexing
 * off the end of memname[].
 */
extern char *memname[];
#define TYPENAME(type) ((unsigned)(type) < M_LAST ? memname[type] : "???")

/*
 * Locking primitives.
 *
 * For a uniprocessor, all we need to do is protect against disk
 * interrupts. For a multiprocessor, this lock would have to be
 * a mutex. A single mutex is used throughout this file, though
 * finer grain locking could be used if contention warranted it.
 *
 * For a multiprocessor, the sleep call would accept a lock and
 * release it after the sleep processing was complete. In a uniprocessor
 * implementation there is no such interlock, so we simply mark
 * the places where it needs to be done with the `interlocked' form
 * of the lock calls. Since the uniprocessor sleep already interlocks
 * the spl, there is nothing that really needs to be done.
148 */ 149#ifndef /* NOT */ DEBUG 150static int lk; 151#define ACQUIRE_LOCK(lk) *lk = splbio() 152#define FREE_LOCK(lk) splx(*lk) 153#define ACQUIRE_LOCK_INTERLOCKED(lk) 154#define FREE_LOCK_INTERLOCKED(lk) 155 156#else /* DEBUG */ 157#include <sys/proc.h> 158static struct lockit { 159 int lkt_spl; 160 pid_t lkt_held; 161} lk = { 0, -1 }; 162static int lockcnt; 163 164static void acquire_lock __P((struct lockit *)); 165static void free_lock __P((struct lockit *)); 166static void acquire_lock_interlocked __P((struct lockit *)); 167static void free_lock_interlocked __P((struct lockit *)); 168 169#define ACQUIRE_LOCK(lk) acquire_lock(lk) 170#define FREE_LOCK(lk) free_lock(lk) 171#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk) 172#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk) 173 174static void 175acquire_lock(lk) 176 struct lockit *lk; 177{ 178 179 if (lk->lkt_held != -1) 180 if (lk->lkt_held == curproc->p_pid) 181 panic("softdep_lock: locking against myself"); 182 else 183 panic("softdep_lock: lock held by %d", lk->lkt_held); 184 lk->lkt_spl = splbio(); 185 lk->lkt_held = curproc->p_pid; 186 lockcnt++; 187} 188 189static void 190free_lock(lk) 191 struct lockit *lk; 192{ 193 194 if (lk->lkt_held == -1) 195 panic("softdep_unlock: lock not held"); 196 lk->lkt_held = -1; 197 splx(lk->lkt_spl); 198} 199 200static void 201acquire_lock_interlocked(lk) 202 struct lockit *lk; 203{ 204 205 if (lk->lkt_held != -1) 206 if (lk->lkt_held == curproc->p_pid) 207 panic("softdep_lock_interlocked: locking against self"); 208 else 209 panic("softdep_lock_interlocked: lock held by %d", 210 lk->lkt_held); 211 lk->lkt_held = curproc->p_pid; 212 lockcnt++; 213} 214 215static void 216free_lock_interlocked(lk) 217 struct lockit *lk; 218{ 219 220 if (lk->lkt_held == -1) 221 panic("softdep_unlock_interlocked: lock not held"); 222 lk->lkt_held = -1; 223} 224#endif /* DEBUG */ 225 226/* 227 * Place holder for real semaphores. 
228 */ 229struct sema { 230 int value; 231 pid_t holder; 232 char *name; 233 int prio; 234 int timo; 235}; 236static void sema_init __P((struct sema *, char *, int, int)); 237static int sema_get __P((struct sema *, struct lockit *)); 238static void sema_release __P((struct sema *)); 239 240static void 241sema_init(semap, name, prio, timo) 242 struct sema *semap; 243 char *name; 244 int prio, timo; 245{ 246 247 semap->holder = -1; 248 semap->value = 0; 249 semap->name = name; 250 semap->prio = prio; 251 semap->timo = timo; 252} 253 254static int 255sema_get(semap, interlock) 256 struct sema *semap; 257 struct lockit *interlock; 258{ 259 260 if (semap->value++ > 0) { 261 if (interlock != NULL) 262 FREE_LOCK_INTERLOCKED(interlock); 263 tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo); 264 if (interlock != NULL) { 265 ACQUIRE_LOCK_INTERLOCKED(interlock); 266 FREE_LOCK(interlock); 267 } 268 return (0); 269 } 270 semap->holder = curproc->p_pid; 271 if (interlock != NULL) 272 FREE_LOCK(interlock); 273 return (1); 274} 275 276static void 277sema_release(semap) 278 struct sema *semap; 279{ 280 281 if (semap->value <= 0 || semap->holder != curproc->p_pid) 282 panic("sema_release: not held"); 283 if (--semap->value > 0) { 284 semap->value = 0; 285 wakeup(semap); 286 } 287 semap->holder = -1; 288} 289 290/* 291 * Worklist queue management. 292 * These routines require that the lock be held. 
293 */ 294#ifndef /* NOT */ DEBUG 295#define WORKLIST_INSERT(head, item) do { \ 296 item->wk_state |= ONWORKLIST; \ 297 LIST_INSERT_HEAD(head, item, wk_list); \ 298} while (0) 299#define WORKLIST_REMOVE(item) do { \ 300 item->wk_state &= ~ONWORKLIST; \ 301 LIST_REMOVE(item, wk_list); \ 302} while (0) 303#define WORKITEM_FREE(item, type) FREE(item, type) 304 305#else /* DEBUG */ 306static void worklist_insert __P((struct workhead *, struct worklist *)); 307static void worklist_remove __P((struct worklist *)); 308static void workitem_free __P((struct worklist *, int)); 309 310#define WORKLIST_INSERT(head, item) worklist_insert(head, item) 311#define WORKLIST_REMOVE(item) worklist_remove(item) 312#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) 313 314static void 315worklist_insert(head, item) 316 struct workhead *head; 317 struct worklist *item; 318{ 319 320 if (lk.lkt_held == -1) 321 panic("worklist_insert: lock not held"); 322 if (item->wk_state & ONWORKLIST) 323 panic("worklist_insert: already on list"); 324 item->wk_state |= ONWORKLIST; 325 LIST_INSERT_HEAD(head, item, wk_list); 326} 327 328static void 329worklist_remove(item) 330 struct worklist *item; 331{ 332 333 if (lk.lkt_held == -1) 334 panic("worklist_remove: lock not held"); 335 if ((item->wk_state & ONWORKLIST) == 0) 336 panic("worklist_remove: not on list"); 337 item->wk_state &= ~ONWORKLIST; 338 LIST_REMOVE(item, wk_list); 339} 340 341static void 342workitem_free(item, type) 343 struct worklist *item; 344 int type; 345{ 346 347 if (item->wk_state & ONWORKLIST) 348 panic("workitem_free: still on list"); 349 if (item->wk_type != type) 350 panic("workitem_free: type mismatch"); 351 FREE(item, type); 352} 353#endif /* DEBUG */ 354 355/* 356 * Workitem queue management 357 */ 358static struct workhead softdep_workitem_pending; 359static int softdep_worklist_busy; 360 361/* 362 * Add an item to the end of the work queue. 363 * This routine requires that the lock be held. 
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{
	/*
	 * Cached pointer to the last element; valid because this routine
	 * is the sole inserter and always appends at the tail.
	 */
	static struct worklist *worklist_tail;

	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	worklist_tail = wk;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that everything is done in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 *
 * Returns the number of items processed that belong to matchmnt, or -1
 * when yielding to a softdep_flushfiles() in progress (matchmnt == NULL
 * callers only).
 */
int
softdep_process_worklist(matchmnt)
	struct mount *matchmnt;	/* mount point to count work for, or NULL */
{
	struct worklist *wk;
	struct fs *matchfs;
	int matchcnt;

	matchcnt = 0;
	matchfs = NULL;
	if (matchmnt != NULL)
		matchfs = VFSTOUFS(matchmnt)->um_fs;
	/*
	 * There is no danger of having multiple processes run this
	 * code. It is single threaded solely so that softdep_flushfiles
	 * (below) can get an accurate count of the number of items
	 * related to its mount point that are in the list.
	 */
	if (softdep_worklist_busy && matchmnt == NULL)
		return (-1);
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
		WORKLIST_REMOVE(wk);
		/*
		 * The handlers below may sleep or do I/O, so the lock is
		 * dropped around them and retaken at the bottom of the loop.
		 */
		FREE_LOCK(&lk);
		switch (wk->wk_type) {

		case M_DIRREM:
			/* removal of a directory entry */
			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
				matchcnt += 1;
			handle_workitem_remove(WK_DIRREM(wk));
			break;

		case M_FREEBLKS:
			/* releasing blocks and/or fragments from a file */
			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freeblocks(WK_FREEBLKS(wk));
			break;

		case M_FREEFRAG:
			/* releasing a fragment when replaced as a file grows */
			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefrag(WK_FREEFRAG(wk));
			break;

		case M_FREEFILE:
			/* releasing an inode when its link count drops to 0 */
			if (WK_FREEFILE(wk)->fx_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefile(WK_FREEFILE(wk));
			break;

		default:
			panic("%s_process_worklist: Unknown type %s",
			    "softdep", TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		/*
		 * A flushfiles may have started while the lock was dropped;
		 * yield to it (lock is not held here).
		 */
		if (softdep_worklist_busy && matchmnt == NULL)
			return (-1);
		ACQUIRE_LOCK(&lk);
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushfiles(oldmnt, flags, p)
	struct mount *oldmnt;	/* mount point being unmounted/flushed */
	int flags;		/* flush flags passed through to ffs_flushfiles */
	struct proc *p;		/* requesting process */
{
	struct vnode *devvp;
	int error, loopcnt;

	/*
	 * Await our turn to clear out the queue.
	 */
	while (softdep_worklist_busy)
		sleep(&lbolt, PRIBIO);
	softdep_worklist_busy = 1;
	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
		softdep_worklist_busy = 0;
		return (error);
	}
	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	devvp = VFSTOUFS(oldmnt)->um_devvp;
	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
		if (softdep_process_worklist(oldmnt) == 0) {
			/*
			 * Do another flush in case any vnodes were brought in
			 * as part of the cleanup operations.
			 */
			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
				break;
			/*
			 * If we still found nothing to do, we are really done.
			 */
			if (softdep_process_worklist(oldmnt) == 0)
				break;
		}
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_FSYNC(devvp, p->p_cred, MNT_WAIT, p);
		VOP_UNLOCK(devvp, 0, p);
		if (error)
			break;
	}
	softdep_worklist_busy = 0;
	if (loopcnt == 0)
		panic("softdep_flushfiles: looping");
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
536 */ 537#define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 538 539/* 540 * Structures and routines associated with pagedep caching. 541 */ 542LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 543u_long pagedep_hash; /* size of hash table - 1 */ 544#define PAGEDEP_HASH(mp, inum, lbn) \ 545 (&pagedep_hashtbl[((((int)(mp)) >> 13) + (inum) + (lbn)) & pagedep_hash]) 546static struct sema pagedep_in_progress; 547 548/* 549 * Look up a pagedep. Return 1 if found, 0 if not found. 550 * If not found, allocate if DEPALLOC flag is passed. 551 * Found or allocated entry is returned in pagedeppp. 552 * This routine must be called with splbio interrupts blocked. 553 */ 554static int 555pagedep_lookup(ip, lbn, flags, pagedeppp) 556 struct inode *ip; 557 ufs_lbn_t lbn; 558 int flags; 559 struct pagedep **pagedeppp; 560{ 561 struct pagedep *pagedep; 562 struct pagedep_hashhead *pagedephd; 563 struct mount *mp; 564 int i; 565 566#ifdef DEBUG 567 if (lk.lkt_held == -1) 568 panic("pagedep_lookup: lock not held"); 569#endif 570 mp = ITOV(ip)->v_mount; 571 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); 572top: 573 for (pagedep = LIST_FIRST(pagedephd); pagedep; 574 pagedep = LIST_NEXT(pagedep, pd_hash)) 575 if (ip->i_number == pagedep->pd_ino && 576 lbn == pagedep->pd_lbn && 577 mp == pagedep->pd_mnt) 578 break; 579 if (pagedep) { 580 *pagedeppp = pagedep; 581 return (1); 582 } 583 if ((flags & DEPALLOC) == 0) { 584 *pagedeppp = NULL; 585 return (0); 586 } 587 if (sema_get(&pagedep_in_progress, &lk) == 0) { 588 ACQUIRE_LOCK(&lk); 589 goto top; 590 } 591 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, 592 M_WAITOK); 593 bzero(pagedep, sizeof(struct pagedep)); 594 pagedep->pd_list.wk_type = M_PAGEDEP; 595 pagedep->pd_mnt = mp; 596 pagedep->pd_ino = ip->i_number; 597 pagedep->pd_lbn = lbn; 598 LIST_INIT(&pagedep->pd_dirremhd); 599 LIST_INIT(&pagedep->pd_pendinghd); 600 for (i = 0; i < DAHASHSZ; i++) 601 LIST_INIT(&pagedep->pd_diraddhd[i]); 602 
ACQUIRE_LOCK(&lk); 603 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 604 sema_release(&pagedep_in_progress); 605 *pagedeppp = pagedep; 606 return (0); 607} 608 609/* 610 * Structures and routines associated with inodedep caching. 611 */ 612LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 613u_long inodedep_hash; /* size of hash table - 1 */ 614#define INODEDEP_HASH(fs, inum) \ 615 (&inodedep_hashtbl[((((int)(fs)) >> 13) + (inum)) & inodedep_hash]) 616static struct sema inodedep_in_progress; 617 618/* 619 * Look up a inodedep. Return 1 if found, 0 if not found. 620 * If not found, allocate if DEPALLOC flag is passed. 621 * Found or allocated entry is returned in inodedeppp. 622 * This routine must be called with splbio interrupts blocked. 623 */ 624static int 625inodedep_lookup(fs, inum, flags, inodedeppp) 626 struct fs *fs; 627 ino_t inum; 628 int flags; 629 struct inodedep **inodedeppp; 630{ 631 struct inodedep *inodedep; 632 struct inodedep_hashhead *inodedephd; 633 634#ifdef DEBUG 635 if (lk.lkt_held == -1) 636 panic("inodedep_lookup: lock not held"); 637#endif 638 inodedephd = INODEDEP_HASH(fs, inum); 639top: 640 for (inodedep = LIST_FIRST(inodedephd); inodedep; 641 inodedep = LIST_NEXT(inodedep, id_hash)) 642 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 643 break; 644 if (inodedep) { 645 *inodedeppp = inodedep; 646 return (1); 647 } 648 if ((flags & DEPALLOC) == 0) { 649 *inodedeppp = NULL; 650 return (0); 651 } 652 if (sema_get(&inodedep_in_progress, &lk) == 0) { 653 ACQUIRE_LOCK(&lk); 654 goto top; 655 } 656 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), 657 M_INODEDEP, M_WAITOK); 658 inodedep->id_list.wk_type = M_INODEDEP; 659 inodedep->id_fs = fs; 660 inodedep->id_ino = inum; 661 inodedep->id_state = ALLCOMPLETE; 662 inodedep->id_nlinkdelta = 0; 663 inodedep->id_savedino = NULL; 664 inodedep->id_savedsize = -1; 665 inodedep->id_buf = NULL; 666 LIST_INIT(&inodedep->id_pendinghd); 667 LIST_INIT(&inodedep->id_inowait); 668 
TAILQ_INIT(&inodedep->id_inoupdt); 669 TAILQ_INIT(&inodedep->id_newinoupdt); 670 ACQUIRE_LOCK(&lk); 671 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 672 sema_release(&inodedep_in_progress); 673 *inodedeppp = inodedep; 674 return (0); 675} 676 677/* 678 * Structures and routines associated with newblk caching. 679 */ 680LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 681u_long newblk_hash; /* size of hash table - 1 */ 682#define NEWBLK_HASH(fs, inum) \ 683 (&newblk_hashtbl[((((int)(fs)) >> 13) + (inum)) & newblk_hash]) 684static struct sema newblk_in_progress; 685 686/* 687 * Look up a newblk. Return 1 if found, 0 if not found. 688 * If not found, allocate if DEPALLOC flag is passed. 689 * Found or allocated entry is returned in newblkpp. 690 */ 691static int 692newblk_lookup(fs, newblkno, flags, newblkpp) 693 struct fs *fs; 694 ufs_daddr_t newblkno; 695 int flags; 696 struct newblk **newblkpp; 697{ 698 struct newblk *newblk; 699 struct newblk_hashhead *newblkhd; 700 701 newblkhd = NEWBLK_HASH(fs, newblkno); 702top: 703 for (newblk = LIST_FIRST(newblkhd); newblk; 704 newblk = LIST_NEXT(newblk, nb_hash)) 705 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) 706 break; 707 if (newblk) { 708 *newblkpp = newblk; 709 return (1); 710 } 711 if ((flags & DEPALLOC) == 0) { 712 *newblkpp = NULL; 713 return (0); 714 } 715 if (sema_get(&newblk_in_progress, 0) == 0) 716 goto top; 717 MALLOC(newblk, struct newblk *, sizeof(struct newblk), 718 M_NEWBLK, M_WAITOK); 719 newblk->nb_state = 0; 720 newblk->nb_fs = fs; 721 newblk->nb_newblkno = newblkno; 722 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 723 sema_release(&newblk_in_progress); 724 *newblkpp = newblk; 725 return (0); 726} 727 728/* 729 * Executed during filesystem system initialization before 730 * mounting any file systems. 
731 */ 732void 733softdep_initialize() 734{ 735 736 LIST_INIT(&mkdirlisthd); 737 LIST_INIT(&softdep_workitem_pending); 738 pagedep_hashtbl = hashinit(desiredvnodes * 2, M_PAGEDEP, &pagedep_hash); 739 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); 740 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 741 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0); 742 newblk_hashtbl = hashinit(desiredvnodes / 10, M_NEWBLK, &newblk_hash); 743 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); 744} 745 746/* 747 * Called at mount time to notify the dependency code that a 748 * filesystem wishes to use it. 749 */ 750int 751softdep_mount(devvp, mp, fs, cred) 752 struct vnode *devvp; 753 struct mount *mp; 754 struct fs *fs; 755 struct ucred *cred; 756{ 757 struct csum cstotal; 758 struct cg *cgp; 759 struct buf *bp; 760 int error, cyl; 761 762 mp->mnt_flag |= MNT_SOFTDEP; 763 /* 764 * When doing soft updates, the counters in the 765 * superblock may have gotten out of sync, so we have 766 * to scan the cylinder groups and recalculate them. 767 */ 768 if (fs->fs_clean != 0) 769 return (0); 770 bzero(&cstotal, sizeof cstotal); 771 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 772 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 773 fs->fs_cgsize, cred, &bp)) != 0) { 774 brelse(bp); 775 return (error); 776 } 777 cgp = (struct cg *)bp->b_data; 778 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 779 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 780 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 781 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 782 fs->fs_cs(fs, cyl) = cgp->cg_cs; 783 brelse(bp); 784 } 785#ifdef DEBUG 786 if (!bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 787 printf("ffs_mountfs: superblock updated\n"); 788#endif 789 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 790 return (0); 791} 792 793/* 794 * Protecting the freemaps (or bitmaps). 
795 * 796 * To eliminate the need to execute fsck before mounting a file system 797 * after a power failure, one must (conservatively) guarantee that the 798 * on-disk copy of the bitmaps never indicate that a live inode or block is 799 * free. So, when a block or inode is allocated, the bitmap should be 800 * updated (on disk) before any new pointers. When a block or inode is 801 * freed, the bitmap should not be updated until all pointers have been 802 * reset. The latter dependency is handled by the delayed de-allocation 803 * approach described below for block and inode de-allocation. The former 804 * dependency is handled by calling the following procedure when a block or 805 * inode is allocated. When an inode is allocated an "inodedep" is created 806 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 807 * Each "inodedep" is also inserted into the hash indexing structure so 808 * that any additional link additions can be made dependent on the inode 809 * allocation. 810 * 811 * The ufs file system maintains a number of free block counts (e.g., per 812 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 813 * in addition to the bitmaps. These counts are used to improve efficiency 814 * during allocation and therefore must be consistent with the bitmaps. 815 * There is no convenient way to guarantee post-crash consistency of these 816 * counts with simple update ordering, for two main reasons: (1) The counts 817 * and bitmaps for a single cylinder group block are not in the same disk 818 * sector. If a disk write is interrupted (e.g., by power failure), one may 819 * be written and the other not. (2) Some of the counts are located in the 820 * superblock rather than the cylinder group block. So, we focus our soft 821 * updates implementation on protecting the bitmaps. When mounting a 822 * filesystem, we recompute the auxiliary counts from the bitmaps. 
823 */ 824 825/* 826 * Called just after updating the cylinder group block to allocate an inode. 827 */ 828void 829softdep_setup_inomapdep(bp, ip, newinum) 830 struct buf *bp; /* buffer for cylgroup block with inode map */ 831 struct inode *ip; /* inode related to allocation */ 832 ino_t newinum; /* new inode number being allocated */ 833{ 834 struct inodedep *inodedep; 835 struct bmsafemap *bmsafemap; 836 837 /* 838 * Create a dependency for the newly allocated inode. 839 * Panic if it already exists as something is seriously wrong. 840 * Otherwise add it to the dependency list for the buffer holding 841 * the cylinder group map from which it was allocated. 842 */ 843 ACQUIRE_LOCK(&lk); 844 if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0) 845 panic("softdep_setup_inomapdep: found inode"); 846 inodedep->id_buf = bp; 847 inodedep->id_state &= ~DEPCOMPLETE; 848 bmsafemap = bmsafemap_lookup(bp); 849 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 850 FREE_LOCK(&lk); 851} 852 853/* 854 * Called just after updating the cylinder group block to 855 * allocate block or fragment. 856 */ 857void 858softdep_setup_blkmapdep(bp, fs, newblkno) 859 struct buf *bp; /* buffer for cylgroup block with block map */ 860 struct fs *fs; /* filesystem doing allocation */ 861 ufs_daddr_t newblkno; /* number of newly allocated block */ 862{ 863 struct newblk *newblk; 864 struct bmsafemap *bmsafemap; 865 866 /* 867 * Create a dependency for the newly allocated block. 868 * Add it to the dependency list for the buffer holding 869 * the cylinder group map from which it was allocated. 870 */ 871 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) 872 panic("softdep_setup_blkmapdep: found block"); 873 ACQUIRE_LOCK(&lk); 874 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp); 875 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 876 FREE_LOCK(&lk); 877} 878 879/* 880 * Find the bmsafemap associated with a cylinder group buffer. 
881 * If none exists, create one. The buffer must be locked when 882 * this routine is called and this routine must be called with 883 * splbio interrupts blocked. 884 */ 885static struct bmsafemap * 886bmsafemap_lookup(bp) 887 struct buf *bp; 888{ 889 struct bmsafemap *bmsafemap; 890 struct worklist *wk; 891 892#ifdef DEBUG 893 if (lk.lkt_held == -1) 894 panic("bmsafemap_lookup: lock not held"); 895#endif 896 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) 897 if (wk->wk_type == M_BMSAFEMAP) 898 return (WK_BMSAFEMAP(wk)); 899 FREE_LOCK(&lk); 900 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), 901 M_BMSAFEMAP, M_WAITOK); 902 bmsafemap->sm_list.wk_type = M_BMSAFEMAP; 903 bmsafemap->sm_list.wk_state = 0; 904 bmsafemap->sm_buf = bp; 905 LIST_INIT(&bmsafemap->sm_allocdirecthd); 906 LIST_INIT(&bmsafemap->sm_allocindirhd); 907 LIST_INIT(&bmsafemap->sm_inodedephd); 908 LIST_INIT(&bmsafemap->sm_newblkhd); 909 ACQUIRE_LOCK(&lk); 910 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 911 return (bmsafemap); 912} 913 914/* 915 * Direct block allocation dependencies. 916 * 917 * When a new block is allocated, the corresponding disk locations must be 918 * initialized (with zeros or new data) before the on-disk inode points to 919 * them. Also, the freemap from which the block was allocated must be 920 * updated (on disk) before the inode's pointer. These two dependencies are 921 * independent of each other and are needed for all file blocks and indirect 922 * blocks that are pointed to directly by the inode. Just before the 923 * "in-core" version of the inode is updated with a newly allocated block 924 * number, a procedure (below) is called to setup allocation dependency 925 * structures. These structures are removed when the corresponding 926 * dependencies are satisfied or when the block allocation becomes obsolete 927 * (i.e., the file is deleted, the block is de-allocated, or the block is a 928 * fragment that gets upgraded). 
 * All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete). The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains. This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
	    M_ALLOCDIRECT, M_WAITOK);
	bzero(adp, sizeof(struct allocdirect));
	adp->ad_list.wk_type = M_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	/* A replaced fragment must be freed once the inode is on disk. */
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	/* The block must have been entered by softdep_setup_blkmapdep. */
	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	adp->ad_inodedep = inodedep;

	/*
	 * Inherit the bitmap-written state from the newblk: if the
	 * cylinder group has not hit the disk yet, the allocdirect must
	 * keep tracking that buffer.
	 */
	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	/* The newblk has served its purpose; the allocdirect takes over. */
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	for (oldadp = TAILQ_FIRST(adphead); oldadp;
	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("allocdirect_merge: lock not held");
#endif
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("allocdirect_check: old %d != new %d || lbn %d >= %d",
		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
		    NDADDR);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
1076 * The new dependency gains the old one's freefrag, and the 1077 * old one gets the new one and then immediately puts it on 1078 * the worklist when it is freed by free_allocdirect. It is 1079 * not possible to do this swap when the old dependency had a 1080 * non-zero size but no previous fragment to free. This condition 1081 * arises when the new block is an extension of the old block. 1082 * Here, the first part of the fragment allocated to the new 1083 * dependency is part of the block currently claimed on disk by 1084 * the old dependency, so cannot legitimately be freed until the 1085 * conditions for the new dependency are fulfilled. 1086 */ 1087 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 1088 freefrag = newadp->ad_freefrag; 1089 newadp->ad_freefrag = oldadp->ad_freefrag; 1090 oldadp->ad_freefrag = freefrag; 1091 } 1092 free_allocdirect(adphead, oldadp, 0); 1093} 1094 1095/* 1096 * Allocate a new freefrag structure if needed. 1097 */ 1098static struct freefrag * 1099newfreefrag(ip, blkno, size) 1100 struct inode *ip; 1101 ufs_daddr_t blkno; 1102 long size; 1103{ 1104 struct freefrag *freefrag; 1105 struct fs *fs; 1106 1107 if (blkno == 0) 1108 return (NULL); 1109 fs = ip->i_fs; 1110 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 1111 panic("newfreefrag: frag size"); 1112 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), 1113 M_FREEFRAG, M_WAITOK); 1114 freefrag->ff_list.wk_type = M_FREEFRAG; 1115 freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ 1116 freefrag->ff_inum = ip->i_number; 1117 freefrag->ff_fs = fs; 1118 freefrag->ff_devvp = ip->i_devvp; 1119 freefrag->ff_blkno = blkno; 1120 freefrag->ff_fragsize = size; 1121 return (freefrag); 1122} 1123 1124/* 1125 * This workitem de-allocates fragments that were replaced during 1126 * file block allocation. 
1127 */ 1128static void 1129handle_workitem_freefrag(freefrag) 1130 struct freefrag *freefrag; 1131{ 1132 struct inode tip; 1133 1134 tip.i_fs = freefrag->ff_fs; 1135 tip.i_devvp = freefrag->ff_devvp; 1136 tip.i_dev = freefrag->ff_devvp->v_rdev; 1137 tip.i_number = freefrag->ff_inum; 1138 tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */ 1139 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize); 1140 FREE(freefrag, M_FREEFRAG); 1141} 1142 1143/* 1144 * Indirect block allocation dependencies. 1145 * 1146 * The same dependencies that exist for a direct block also exist when 1147 * a new block is allocated and pointed to by an entry in a block of 1148 * indirect pointers. The undo/redo states described above are also 1149 * used here. Because an indirect block contains many pointers that 1150 * may have dependencies, a second copy of the entire in-memory indirect 1151 * block is kept. The buffer cache copy is always completely up-to-date. 1152 * The second copy, which is used only as a source for disk writes, 1153 * contains only the safe pointers (i.e., those that have no remaining 1154 * update dependencies). The second copy is freed when all pointers 1155 * are safe. The cache is not allowed to replace indirect blocks with 1156 * pending update dependencies. If a buffer containing an indirect 1157 * block with dependencies is written, these routines will mark it 1158 * dirty again. It can only be successfully written once all the 1159 * dependencies are removed. The ffs_fsync routine in conjunction with 1160 * softdep_sync_metadata work together to get all the dependencies 1161 * removed so that a file can be successfully written to disk. Three 1162 * procedures are used when setting up indirect block pointer 1163 * dependencies. The division is necessary because of the organization 1164 * of the "balloc" routine and because of the distinction between file 1165 * pages and file metadata blocks. 
 */

/*
 * Allocate a new allocindir structure.
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
{
	struct allocindir *aip;

	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
	    M_ALLOCINDIR, M_WAITOK);
	bzero(aip, sizeof(struct allocindir));
	aip->ai_list.wk_type = M_ALLOCINDIR;
	aip->ai_state = ATTACHED;
	aip->ai_offset = ptrno;
	aip->ai_newblkno = newblkno;
	aip->ai_oldblkno = oldblkno;
	/* A block displaced through an indirect pointer is a full block. */
	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
 */
void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;	/* inode for file being extended */
	ufs_lbn_t lbn;		/* allocated block number within file */
	struct buf *bp;		/* buffer with indirect blk referencing page */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
	struct buf *nbp;	/* buffer holding allocated page */
{
	struct allocindir *aip;
	struct pagedep *pagedep;

	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	ACQUIRE_LOCK(&lk);
	/*
	 * If we are allocating a directory page, then we must
	 * allocate an associated pagedep to track additions and
	 * deletions.
	 */
	if ((ip->i_mode & IFMT) == IFDIR &&
	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	/* Link the allocindir to the indirect block's indirdep. */
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called just before setting an indirect block pointer to a
 * newly allocated indirect block.
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
{
	struct allocindir *aip;

	/* A new indirect block never replaces an old one (oldblkno == 0). */
	aip = newallocindir(ip, ptrno, newblkno, 0);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
1248 */ 1249static void 1250setup_allocindir_phase2(bp, ip, aip) 1251 struct buf *bp; /* in-memory copy of the indirect block */ 1252 struct inode *ip; /* inode for file being extended */ 1253 struct allocindir *aip; /* allocindir allocated by the above routines */ 1254{ 1255 struct worklist *wk; 1256 struct indirdep *indirdep, *newindirdep; 1257 struct bmsafemap *bmsafemap; 1258 struct allocindir *oldaip; 1259 struct freefrag *freefrag; 1260 struct newblk *newblk; 1261 1262 if (bp->b_lblkno >= 0) 1263 panic("setup_allocindir_phase2: not indir blk"); 1264 for (indirdep = NULL, newindirdep = NULL; ; ) { 1265 ACQUIRE_LOCK(&lk); 1266 for (wk = LIST_FIRST(&bp->b_dep); wk; 1267 wk = LIST_NEXT(wk, wk_list)) { 1268 if (wk->wk_type != M_INDIRDEP) 1269 continue; 1270 indirdep = WK_INDIRDEP(wk); 1271 break; 1272 } 1273 if (indirdep == NULL && newindirdep) { 1274 indirdep = newindirdep; 1275 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 1276 newindirdep = NULL; 1277 } 1278 FREE_LOCK(&lk); 1279 if (indirdep) { 1280 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, 1281 &newblk) == 0) 1282 panic("setup_allocindir: lost block"); 1283 ACQUIRE_LOCK(&lk); 1284 if (newblk->nb_state == DEPCOMPLETE) { 1285 aip->ai_state |= DEPCOMPLETE; 1286 aip->ai_buf = NULL; 1287 } else { 1288 bmsafemap = newblk->nb_bmsafemap; 1289 aip->ai_buf = bmsafemap->sm_buf; 1290 LIST_REMOVE(newblk, nb_deps); 1291 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, 1292 aip, ai_deps); 1293 } 1294 LIST_REMOVE(newblk, nb_hash); 1295 FREE(newblk, M_NEWBLK); 1296 aip->ai_indirdep = indirdep; 1297 /* 1298 * Check to see if there is an existing dependency 1299 * for this block. If there is, merge the old 1300 * dependency into the new one. 
1301 */ 1302 if (aip->ai_oldblkno == 0) 1303 oldaip = NULL; 1304 else 1305 for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd); 1306 oldaip; oldaip = LIST_NEXT(oldaip, ai_next)) 1307 if (oldaip->ai_offset == aip->ai_offset) 1308 break; 1309 if (oldaip != NULL) { 1310 if (oldaip->ai_newblkno != aip->ai_oldblkno) 1311 panic("setup_allocindir_phase2: blkno"); 1312 aip->ai_oldblkno = oldaip->ai_oldblkno; 1313 freefrag = oldaip->ai_freefrag; 1314 oldaip->ai_freefrag = aip->ai_freefrag; 1315 aip->ai_freefrag = freefrag; 1316 free_allocindir(oldaip, NULL); 1317 } 1318 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 1319 ((ufs_daddr_t *)indirdep->ir_savebp->b_data) 1320 [aip->ai_offset] = aip->ai_oldblkno; 1321 FREE_LOCK(&lk); 1322 } 1323 if (newindirdep) { 1324 if (indirdep->ir_savebp != NULL) 1325 brelse(newindirdep->ir_savebp); 1326 WORKITEM_FREE((caddr_t)newindirdep, M_INDIRDEP); 1327 } 1328 if (indirdep) 1329 break; 1330 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), 1331 M_INDIRDEP, M_WAITOK); 1332 newindirdep->ir_list.wk_type = M_INDIRDEP; 1333 newindirdep->ir_state = ATTACHED; 1334 LIST_INIT(&newindirdep->ir_deplisthd); 1335 LIST_INIT(&newindirdep->ir_donehd); 1336 newindirdep->ir_saveddata = (ufs_daddr_t *)bp->b_data; 1337 newindirdep->ir_savebp = 1338 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); 1339 bcopy((caddr_t)newindirdep->ir_saveddata, 1340 newindirdep->ir_savebp->b_data, bp->b_bcount); 1341 } 1342} 1343 1344/* 1345 * Block de-allocation dependencies. 1346 * 1347 * When blocks are de-allocated, the on-disk pointers must be nullified before 1348 * the blocks are made available for use by other files. (The true 1349 * requirement is that old pointers must be nullified before new on-disk 1350 * pointers are set. We chose this slightly more stringent requirement to 1351 * reduce complexity.) 
Our implementation handles this dependency by updating 1352 * the inode (or indirect block) appropriately but delaying the actual block 1353 * de-allocation (i.e., freemap and free space count manipulation) until 1354 * after the updated versions reach stable storage. After the disk is 1355 * updated, the blocks can be safely de-allocated whenever it is convenient. 1356 * This implementation handles only the common case of reducing a file's 1357 * length to zero. Other cases are handled by the conventional synchronous 1358 * write approach. 1359 * 1360 * The ffs implementation with which we worked double-checks 1361 * the state of the block pointers and file size as it reduces 1362 * a file's length. Some of this code is replicated here in our 1363 * soft updates implementation. The freeblks->fb_chkcnt field is 1364 * used to transfer a part of this information to the procedure 1365 * that eventually de-allocates the blocks. 1366 * 1367 * This routine should be called from the routine that shortens 1368 * a file's length, before the inode's size or block pointers 1369 * are modified. It will save the block pointer information for 1370 * later release and zero the inode so that the calling routine 1371 * can release it. 
1372 */ 1373void 1374softdep_setup_freeblocks(ip, length) 1375 struct inode *ip; /* The inode whose length is to be reduced */ 1376 off_t length; /* The new length for the file */ 1377{ 1378 struct freeblks *freeblks; 1379 struct inodedep *inodedep; 1380 struct allocdirect *adp; 1381 struct vnode *vp; 1382 struct buf *bp; 1383 struct fs *fs; 1384 int i, error; 1385 1386 fs = ip->i_fs; 1387 if (length != 0) 1388 panic("softde_setup_freeblocks: non-zero length"); 1389 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), 1390 M_FREEBLKS, M_WAITOK); 1391 bzero(freeblks, sizeof(struct freeblks)); 1392 freeblks->fb_list.wk_type = M_FREEBLKS; 1393 freeblks->fb_uid = ip->i_uid; 1394 freeblks->fb_previousinum = ip->i_number; 1395 freeblks->fb_devvp = ip->i_devvp; 1396 freeblks->fb_fs = fs; 1397 freeblks->fb_oldsize = ip->i_size; 1398 freeblks->fb_newsize = length; 1399 freeblks->fb_chkcnt = ip->i_blocks; 1400 for (i = 0; i < NDADDR; i++) { 1401 freeblks->fb_dblks[i] = ip->i_db[i]; 1402 ip->i_db[i] = 0; 1403 } 1404 for (i = 0; i < NIADDR; i++) { 1405 freeblks->fb_iblks[i] = ip->i_ib[i]; 1406 ip->i_ib[i] = 0; 1407 } 1408 ip->i_blocks = 0; 1409 ip->i_size = 0; 1410 /* 1411 * Push the zero'ed inode to to its disk buffer so that we are free 1412 * to delete its dependencies below. Once the dependencies are gone 1413 * the buffer can be safely released. 1414 */ 1415 if ((error = bread(ip->i_devvp, 1416 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 1417 (int)fs->fs_bsize, NOCRED, &bp)) != 0) 1418 softdep_error("softdep_setup_freeblocks", error); 1419 *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = 1420 ip->i_din; 1421 /* 1422 * Find and eliminate any inode dependencies. 
1423 */ 1424 ACQUIRE_LOCK(&lk); 1425 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep); 1426 if ((inodedep->id_state & IOSTARTED) != 0) 1427 panic("softdep_setup_freeblocks: inode busy"); 1428 /* 1429 * Add the freeblks structure to the list of operations that 1430 * must await the zero'ed inode being written to disk. 1431 */ 1432 WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list); 1433 /* 1434 * Because the file length has been truncated to zero, any 1435 * pending block allocation dependency structures associated 1436 * with this inode are obsolete and can simply be de-allocated. 1437 * We must first merge the two dependency lists to get rid of 1438 * any duplicate freefrag structures, then purge the merged list. 1439 */ 1440 merge_inode_lists(inodedep); 1441 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 1442 free_allocdirect(&inodedep->id_inoupdt, adp, 1); 1443 bdwrite(bp); 1444 /* 1445 * We must wait for any I/O in progress to finish so that 1446 * all potential buffers on the dirty list will be visible. 1447 * Once they are all there, walk the list and get rid of 1448 * any dependencies. 1449 */ 1450 vp = ITOV(ip); 1451 while (vp->v_numoutput) { 1452 vp->v_flag |= VBWAIT; 1453 FREE_LOCK_INTERLOCKED(&lk); 1454 sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); 1455 ACQUIRE_LOCK_INTERLOCKED(&lk); 1456 } 1457 while (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { 1458 bp = LIST_FIRST(&vp->v_dirtyblkhd); 1459 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); 1460 deallocate_dependencies(bp, inodedep); 1461 bp->b_flags |= B_INVAL; 1462 brelse(bp); 1463 } 1464 /* 1465 * Try freeing the inodedep in case that was the last dependency. 1466 */ 1467 if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0) 1468 (void) free_inodedep(inodedep); 1469 FREE_LOCK(&lk); 1470} 1471 1472/* 1473 * Reclaim any dependency structures from a buffer that is about to 1474 * be reallocated to a new vnode. 
The buffer must be locked, thus, 1475 * no I/O completion operations can occur while we are manipulating 1476 * its associated dependencies. The mutex is held so that other I/O's 1477 * associated with related dependencies do not occur. 1478 */ 1479static void 1480deallocate_dependencies(bp, inodedep) 1481 struct buf *bp; 1482 struct inodedep *inodedep; 1483{ 1484 struct worklist *wk; 1485 struct indirdep *indirdep; 1486 struct allocindir *aip; 1487 struct pagedep *pagedep; 1488 struct dirrem *dirrem; 1489 struct diradd *dap; 1490 long tmpsize; 1491 caddr_t tmp; 1492 int i; 1493 1494 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 1495 switch (wk->wk_type) { 1496 1497 case M_INDIRDEP: 1498 indirdep = WK_INDIRDEP(wk); 1499 /* 1500 * None of the indirect pointers will ever be visible, 1501 * so they can simply be tossed. GOINGAWAY ensures 1502 * that allocated pointers will be saved in the buffer 1503 * cache until they are freed. Note that they will 1504 * only be able to be found by their physical address 1505 * since the inode mapping the logical address will 1506 * be gone. The save buffer used for the safe copy 1507 * was allocated in setup_allocindir_phase2 using 1508 * the physical address so it could be used for this 1509 * purpose. Hence we swap the safe copy with the real 1510 * copy, allowing the safe copy to be freed and holding 1511 * on to the real copy for later use in indir_trunc. 
1512 */ 1513 if (indirdep->ir_state & GOINGAWAY) 1514 panic("deallocate_dependencies: already gone"); 1515 indirdep->ir_state |= GOINGAWAY; 1516 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 1517 free_allocindir(aip, inodedep); 1518 if (bp->b_lblkno >= 0 || 1519 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 1520 panic("deallocate_dependencies: not indir"); 1521 tmp = indirdep->ir_savebp->b_data; 1522 indirdep->ir_savebp->b_data = bp->b_data; 1523 bp->b_data = tmp; 1524 tmpsize = indirdep->ir_savebp->b_bufsize; 1525 indirdep->ir_savebp->b_bufsize = bp->b_bufsize; 1526 bp->b_bufsize = tmpsize; 1527 WORKLIST_REMOVE(wk); 1528 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); 1529 continue; 1530 1531 case M_PAGEDEP: 1532 pagedep = WK_PAGEDEP(wk); 1533 /* 1534 * None of the directory additions will ever be 1535 * visible, so they can simply be tossed. 1536 */ 1537 for (i = 0; i < DAHASHSZ; i++) 1538 while (dap=LIST_FIRST(&pagedep->pd_diraddhd[i])) 1539 free_diradd(dap); 1540 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) 1541 free_diradd(dap); 1542 /* 1543 * Copy any directory remove dependencies to the list 1544 * to be processed after the zero'ed inode is written. 1545 * If the inode has already been written, then they 1546 * can be dumped directly onto the work list. 
1547 */ 1548 for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem; 1549 dirrem = LIST_NEXT(dirrem, dm_next)) { 1550 LIST_REMOVE(dirrem, dm_next); 1551 dirrem->dm_dirinum = pagedep->pd_ino; 1552 if (inodedep == NULL) 1553 add_to_worklist(&dirrem->dm_list); 1554 else 1555 WORKLIST_INSERT(&inodedep->id_inowait, 1556 &dirrem->dm_list); 1557 } 1558 WORKLIST_REMOVE(&pagedep->pd_list); 1559 LIST_REMOVE(pagedep, pd_hash); 1560 WORKITEM_FREE(pagedep, M_PAGEDEP); 1561 continue; 1562 1563 case M_ALLOCINDIR: 1564 free_allocindir(WK_ALLOCINDIR(wk), inodedep); 1565 continue; 1566 1567 case M_ALLOCDIRECT: 1568 case M_INODEDEP: 1569 panic("deallocate_dependencies: Unexpected type %s", 1570 TYPENAME(wk->wk_type)); 1571 /* NOTREACHED */ 1572 1573 default: 1574 panic("deallocate_dependencies: Unknown type %s", 1575 TYPENAME(wk->wk_type)); 1576 /* NOTREACHED */ 1577 } 1578 } 1579} 1580 1581/* 1582 * Free an allocdirect. Generate a new freefrag work request if appropriate. 1583 * This routine must be called with splbio interrupts blocked. 1584 */ 1585static void 1586free_allocdirect(adphead, adp, delay) 1587 struct allocdirectlst *adphead; 1588 struct allocdirect *adp; 1589 int delay; 1590{ 1591 1592#ifdef DEBUG 1593 if (lk.lkt_held == -1) 1594 panic("free_allocdirect: lock not held"); 1595#endif 1596 if ((adp->ad_state & DEPCOMPLETE) == 0) 1597 LIST_REMOVE(adp, ad_deps); 1598 TAILQ_REMOVE(adphead, adp, ad_next); 1599 if ((adp->ad_state & COMPLETE) == 0) 1600 WORKLIST_REMOVE(&adp->ad_list); 1601 if (adp->ad_freefrag != NULL) { 1602 if (delay) 1603 WORKLIST_INSERT(&adp->ad_inodedep->id_inowait, 1604 &adp->ad_freefrag->ff_list); 1605 else 1606 add_to_worklist(&adp->ad_freefrag->ff_list); 1607 } 1608 WORKITEM_FREE(adp, M_ALLOCDIRECT); 1609} 1610 1611/* 1612 * Prepare an inode to be freed. The actual free operation is not 1613 * done until the zero'ed inode has been written to disk. 
 */
void
softdep_freefile(ap)
	struct vop_vfree_args /* {
		struct vnode *a_pvp;
		ino_t a_ino;
		int a_mode;
	} */ *ap;
{
	struct inode *ip = VTOI(ap->a_pvp);
	struct inodedep *inodedep;
	struct freefile *freefile;

	/*
	 * This sets up the inode de-allocation dependency.
	 */
	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
	    M_FREEFILE, M_WAITOK);
	freefile->fx_list.wk_type = M_FREEFILE;
	freefile->fx_list.wk_state = 0;
	freefile->fx_mode = ap->a_mode;
	freefile->fx_oldinum = ap->a_ino;
	freefile->fx_devvp = ip->i_devvp;
	freefile->fx_fs = ip->i_fs;

	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk and we can free the file immediately.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, ap->a_ino, 0, &inodedep) == 0) {
		add_to_worklist(&freefile->fx_list);
		FREE_LOCK(&lk);
		return;
	}

	/*
	 * If we still have a bitmap dependency, then the inode has never
	 * been written to disk. Drop the dependency as it is no longer
	 * necessary since the inode is being deallocated. We could process
	 * the freefile immediately, but then we would have to clear the
	 * id_inowait dependencies here and it is easier just to let the
	 * zero'ed inode be written and let them be cleaned up in the
	 * normal followup actions that follow the inode write.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		inodedep->id_state |= DEPCOMPLETE;
		LIST_REMOVE(inodedep, id_deps);
		inodedep->id_buf = NULL;
	}
	/*
	 * If the inodedep has no dependencies associated with it,
	 * then we must free it here and free the file immediately.
	 * This case arises when an early allocation fails (for
	 * example, the user is over their file quota).
	 */
	if (free_inodedep(inodedep) == 0)
		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	else
		add_to_worklist(&freefile->fx_list);
	FREE_LOCK(&lk);
}

/*
 * Try to free an inodedep structure. Return 1 if it could be freed.
 */
static int
free_inodedep(inodedep)
	struct inodedep *inodedep;
{

	/*
	 * The inodedep may only be freed once it is off all worklists
	 * and carries no outstanding dependencies of any kind.
	 */
	if ((inodedep->id_state & ONWORKLIST) != 0 ||
	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
	    inodedep->id_nlinkdelta != 0 || inodedep->id_buf != NULL ||
	    inodedep->id_savedino != NULL)
		return (0);
	LIST_REMOVE(inodedep, id_hash);
	WORKITEM_FREE(inodedep, M_INODEDEP);
	return (1);
}

/*
 * This workitem routine performs the block de-allocation.
 * The workitem is added to the pending list after the updated
 * inode block has been written to disk. As mentioned above,
 * checks regarding the number of blocks de-allocated (compared
 * to the number of blocks allocated for the file) are also
 * performed in this function.
1705 */ 1706static void 1707handle_workitem_freeblocks(freeblks) 1708 struct freeblks *freeblks; 1709{ 1710 struct inode tip; 1711 ufs_daddr_t bn; 1712 struct fs *fs; 1713 int i, level, bsize; 1714 long nblocks, blocksreleased = 0; 1715 int error, allerror = 0; 1716 ufs_lbn_t baselbns[NIADDR], tmpval; 1717 1718 tip.i_number = freeblks->fb_previousinum; 1719 tip.i_devvp = freeblks->fb_devvp; 1720 tip.i_dev = freeblks->fb_devvp->v_rdev; 1721 tip.i_fs = freeblks->fb_fs; 1722 tip.i_size = freeblks->fb_oldsize; 1723 tip.i_uid = freeblks->fb_uid; 1724 fs = freeblks->fb_fs; 1725 tmpval = 1; 1726 baselbns[0] = NDADDR; 1727 for (i = 1; i < NIADDR; i++) { 1728 tmpval *= NINDIR(fs); 1729 baselbns[i] = baselbns[i - 1] + tmpval; 1730 } 1731 nblocks = btodb(fs->fs_bsize); 1732 blocksreleased = 0; 1733 /* 1734 * Indirect blocks first. 1735 */ 1736 for (level = (NIADDR - 1); level >= 0; level--) { 1737 if ((bn = freeblks->fb_iblks[level]) == 0) 1738 continue; 1739 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level, 1740 baselbns[level], &blocksreleased)) == 0) 1741 allerror = error; 1742 ffs_blkfree(&tip, bn, fs->fs_bsize); 1743 blocksreleased += nblocks; 1744 } 1745 /* 1746 * All direct blocks or frags. 1747 */ 1748 for (i = (NDADDR - 1); i >= 0; i--) { 1749 if ((bn = freeblks->fb_dblks[i]) == 0) 1750 continue; 1751 bsize = blksize(fs, &tip, i); 1752 ffs_blkfree(&tip, bn, bsize); 1753 blocksreleased += btodb(bsize); 1754 } 1755 1756#ifdef DIAGNOSTIC 1757 if (freeblks->fb_chkcnt != blocksreleased) 1758 panic("handle_workitem_freeblocks: block count"); 1759 if (allerror) 1760 softdep_error("handle_workitem_freeblks", allerror); 1761#endif /* DIAGNOSTIC */ 1762 WORKITEM_FREE(freeblks, M_FREEBLKS); 1763} 1764 1765/* 1766 * Release blocks associated with the inode ip and stored in the indirect 1767 * block dbn. If level is greater than SINGLE, the block is an indirect block 1768 * and recursive calls to indirtrunc must be used to cleanse other indirect 1769 * blocks. 
 */
static int
indir_trunc(ip, dbn, level, lbn, countp)
	struct inode *ip;	/* stand-in inode for the freed file */
	ufs_daddr_t dbn;	/* device address of the indirect block */
	int level;		/* depth of indirection; 0 is the leaf level */
	ufs_lbn_t lbn;		/* first logical block covered by this block */
	long *countp;		/* accumulates DEV_BSIZE blocks released */
{
	struct buf *bp;
	ufs_daddr_t *bap;
	ufs_daddr_t nb;
	struct fs *fs;
	struct worklist *wk;
	struct indirdep *indirdep;
	int i, lbnadd, nblocks;
	int error, allerror = 0;

	fs = ip->i_fs;
	/* File blocks addressed by one pointer at this level. */
	lbnadd = 1;
	for (i = level; i > 0; i--)
		lbnadd *= NINDIR(fs);
	/*
	 * Get buffer of block pointers to be freed. This routine is not
	 * called until the zero'ed inode has been written, so it is safe
	 * to free blocks as they are encountered. Because the inode has
	 * been zero'ed, calls to bmap on these blocks will fail. So, we
	 * have to use the on-disk address and the block device for the
	 * filesystem to look them up. If the file was deleted before its
	 * indirect blocks were all written to disk, the routine that set
	 * us up (deallocate_dependencies) will have arranged to leave
	 * a complete copy of the indirect block in memory for our use.
	 * Otherwise we have to read the blocks in from the disk.
	 */
	ACQUIRE_LOCK(&lk);
	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		if (wk->wk_type != M_INDIRDEP ||
		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
		    (indirdep->ir_state & GOINGAWAY) == 0)
			panic("indir_trunc: lost indirdep");
		WORKLIST_REMOVE(wk);
		WORKITEM_FREE(indirdep, M_INDIRDEP);
		if (LIST_FIRST(&bp->b_dep) != NULL)
			panic("indir_trunc: dangling dep");
		FREE_LOCK(&lk);
	} else {
		FREE_LOCK(&lk);
		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error)
			return (error);
	}
	/*
	 * Recursively free indirect blocks.
	 */
	bap = (ufs_daddr_t *)bp->b_data;
	nblocks = btodb(fs->fs_bsize);
	for (i = NINDIR(fs) - 1; i >= 0; i--) {
		if ((nb = bap[i]) == 0)
			continue;
		if (level != 0) {
			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
			    level - 1, lbn + (i * lbnadd), countp)) != 0)
				allerror = error;
		}
		ffs_blkfree(ip, nb, fs->fs_bsize);
		*countp += nblocks;
	}
	bp->b_flags |= B_INVAL;
	brelse(bp);
	return (allerror);
}

/*
 * Free an allocindir.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_allocindir(aip, inodedep)
	struct allocindir *aip;
	struct inodedep *inodedep;	/* NULL if inode already written */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("free_allocindir: lock not held");
#endif
	if ((aip->ai_state & DEPCOMPLETE) == 0)
		LIST_REMOVE(aip, ai_deps);
	if (aip->ai_state & ONWORKLIST)
		WORKLIST_REMOVE(&aip->ai_list);
	LIST_REMOVE(aip, ai_next);
	/*
	 * A pending freefrag either waits for the inode write (queued
	 * on id_inowait) or, with no inodedep, is released right away.
	 */
	if ((freefrag = aip->ai_freefrag) != NULL) {
		if (inodedep == NULL)
			add_to_worklist(&freefrag->ff_list);
		else
			WORKLIST_INSERT(&inodedep->id_inowait,
			    &freefrag->ff_list);
	}
	WORKITEM_FREE(aip, M_ALLOCINDIR);
}

/*
 * Directory entry addition dependencies.
 *
 * When adding a new directory entry, the inode (with its incremented link
 * count) must be written to disk before the directory entry's pointer to it.
 * Also, if the inode is newly allocated, the corresponding freemap must be
 * updated (on disk) before the directory entry's pointer. These requirements
 * are met via undo/redo on the directory entry's pointer, which consists
 * simply of the inode number.
 *
 * As directory entries are added and deleted, the free space within a
 * directory block can become fragmented.
The ufs file system will compact 1885 * a fragmented directory block to make space for a new entry. When this 1886 * occurs, the offsets of previously added entries change. Any "diradd" 1887 * dependency structures corresponding to these entries must be updated with 1888 * the new offsets. 1889 */ 1890 1891/* 1892 * This routine is called after the in-memory inode's link 1893 * count has been incremented, but before the directory entry's 1894 * pointer to the inode has been set. 1895 */ 1896void 1897softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) 1898 struct buf *bp; /* buffer containing directory block */ 1899 struct inode *dp; /* inode for directory */ 1900 off_t diroffset; /* offset of new entry in directory */ 1901 long newinum; /* inode referenced by new directory entry */ 1902 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 1903{ 1904 int offset; /* offset of new entry within directory block */ 1905 ufs_lbn_t lbn; /* block in directory containing new entry */ 1906 struct fs *fs; 1907 struct diradd *dap; 1908 struct pagedep *pagedep; 1909 struct inodedep *inodedep; 1910 struct mkdir *mkdir1, *mkdir2; 1911 1912 /* 1913 * Whiteouts have no dependencies. 
1914 */ 1915 if (newinum == WINO) { 1916 if (newdirbp != NULL) 1917 bdwrite(newdirbp); 1918 return; 1919 } 1920 1921 fs = dp->i_fs; 1922 lbn = lblkno(fs, diroffset); 1923 offset = blkoff(fs, diroffset); 1924 MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); 1925 bzero(dap, sizeof(struct diradd)); 1926 dap->da_list.wk_type = M_DIRADD; 1927 dap->da_offset = offset; 1928 dap->da_newinum = newinum; 1929 dap->da_state = ATTACHED; 1930 if (newdirbp == NULL) { 1931 dap->da_state |= DEPCOMPLETE; 1932 } else { 1933 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 1934 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, 1935 M_WAITOK); 1936 mkdir1->md_list.wk_type = M_MKDIR; 1937 mkdir1->md_state = MKDIR_BODY; 1938 mkdir1->md_diradd = dap; 1939 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, 1940 M_WAITOK); 1941 mkdir2->md_list.wk_type = M_MKDIR; 1942 mkdir2->md_state = MKDIR_PARENT; 1943 mkdir2->md_diradd = dap; 1944 1945 } 1946 1947 ACQUIRE_LOCK(&lk); 1948 /* 1949 * If this directory entry references a new directory, create 1950 * its two additional dependencies: its "." and ".." being written 1951 * to disk and the link count increase for its parent directory. 1952 */ 1953 if (newdirbp != NULL) { 1954 /* 1955 * Dependency on "." and ".." 
being written to disk 1956 */ 1957 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 1958 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); 1959 bdwrite(newdirbp); 1960 /* 1961 * Dependency on link count increase for parent directory 1962 */ 1963 if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0 1964 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 1965 dap->da_state &= ~MKDIR_PARENT; 1966 WORKITEM_FREE(mkdir2, M_MKDIR); 1967 } else { 1968 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 1969 WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list); 1970 } 1971 } 1972 /* 1973 * Link into parent directory pagedep and new inode inodedep 1974 * structures to await its being written. 1975 */ 1976 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 1977 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 1978 dap->da_pagedep = pagedep; 1979 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 1980 da_pdlist); 1981 if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 && 1982 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 1983 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 1984 else 1985 WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list); 1986 FREE_LOCK(&lk); 1987} 1988 1989/* 1990 * This procedure is called to change the offset of a directory 1991 * entry when compacting a directory block which must be owned 1992 * exclusively by the caller. Note that the actual entry movement 1993 * must be done in this procedure to ensure that no I/O completions 1994 * occur while the move is in progress. 
1995 */ 1996void 1997softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) 1998 struct inode *dp; /* inode for directory */ 1999 caddr_t base; /* address of dp->i_offset */ 2000 caddr_t oldloc; /* address of old directory location */ 2001 caddr_t newloc; /* address of new directory location */ 2002 int entrysize; /* size of directory entry */ 2003{ 2004 int oldoffset, newoffset; 2005 struct pagedep *pagedep; 2006 struct diradd *dap; 2007 ufs_lbn_t lbn; 2008 2009 ACQUIRE_LOCK(&lk); 2010 lbn = lblkno(dp->i_fs, dp->i_offset); 2011 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) 2012 goto done; 2013 oldoffset = dp->i_offset + (oldloc - base); 2014 newoffset = dp->i_offset + (newloc - base); 2015 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]); 2016 dap; dap = LIST_NEXT(dap, da_pdlist)) { 2017 if (dap->da_offset != oldoffset) 2018 continue; 2019 dap->da_offset = newoffset; 2020 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) 2021 break; 2022 LIST_REMOVE(dap, da_pdlist); 2023 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], 2024 dap, da_pdlist); 2025 break; 2026 } 2027done: 2028 bcopy(oldloc, newloc, entrysize); 2029 FREE_LOCK(&lk); 2030} 2031 2032/* 2033 * Free a diradd dependency structure. This routine must be called 2034 * with splbio interrupts blocked. 
2035 */ 2036static void 2037free_diradd(dap) 2038 struct diradd *dap; 2039{ 2040 struct dirrem *dirrem; 2041 struct pagedep *pagedep; 2042 struct inodedep *inodedep; 2043 struct mkdir *mkdir, *nextmd; 2044 2045#ifdef DEBUG 2046 if (lk.lkt_held == -1) 2047 panic("free_diradd: lock not held"); 2048#endif 2049 WORKLIST_REMOVE(&dap->da_list); 2050 LIST_REMOVE(dap, da_pdlist); 2051 if ((dap->da_state & DIRCHG) == 0) { 2052 pagedep = dap->da_pagedep; 2053 } else { 2054 dirrem = dap->da_previous; 2055 pagedep = dirrem->dm_pagedep; 2056 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); 2057 } 2058 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum, 2059 0, &inodedep) != 0) 2060 (void) free_inodedep(inodedep); 2061 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 2062 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 2063 nextmd = LIST_NEXT(mkdir, md_mkdirs); 2064 if (mkdir->md_diradd != dap) 2065 continue; 2066 dap->da_state &= ~mkdir->md_state; 2067 WORKLIST_REMOVE(&mkdir->md_list); 2068 LIST_REMOVE(mkdir, md_mkdirs); 2069 WORKITEM_FREE(mkdir, M_MKDIR); 2070 } 2071 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 2072 panic("free_diradd: unfound ref"); 2073 } 2074 WORKITEM_FREE(dap, M_DIRADD); 2075} 2076 2077/* 2078 * Directory entry removal dependencies. 2079 * 2080 * When removing a directory entry, the entry's inode pointer must be 2081 * zero'ed on disk before the corresponding inode's link count is decremented 2082 * (possibly freeing the inode for re-use). This dependency is handled by 2083 * updating the directory entry but delaying the inode count reduction until 2084 * after the directory block has been written to disk. After this point, the 2085 * inode count can be decremented whenever it is convenient. 2086 */ 2087 2088/* 2089 * This routine should be called immediately after removing 2090 * a directory entry. 
The inode's link count should not be 2091 * decremented by the calling procedure -- the soft updates 2092 * code will do this task when it is safe. 2093 */ 2094void 2095softdep_setup_remove(bp, dp, ip, isrmdir) 2096 struct buf *bp; /* buffer containing directory block */ 2097 struct inode *dp; /* inode for the directory being modified */ 2098 struct inode *ip; /* inode for directory entry being removed */ 2099 int isrmdir; /* indicates if doing RMDIR */ 2100{ 2101 struct dirrem *dirrem; 2102 2103 /* 2104 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. 2105 */ 2106 dirrem = newdirrem(bp, dp, ip, isrmdir); 2107 if ((dirrem->dm_state & COMPLETE) == 0) { 2108 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 2109 dm_next); 2110 } else { 2111 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 2112 add_to_worklist(&dirrem->dm_list); 2113 } 2114 FREE_LOCK(&lk); 2115} 2116 2117/* 2118 * Allocate a new dirrem if appropriate and return it along with 2119 * its associated pagedep. Called without a lock, returns with lock. 2120 */ 2121static struct dirrem * 2122newdirrem(bp, dp, ip, isrmdir) 2123 struct buf *bp; /* buffer containing directory block */ 2124 struct inode *dp; /* inode for the directory being modified */ 2125 struct inode *ip; /* inode for directory entry being removed */ 2126 int isrmdir; /* indicates if doing RMDIR */ 2127{ 2128 ufs_lbn_t lbn; 2129 struct diradd *dap; 2130 struct dirrem *dirrem; 2131 struct pagedep *pagedep; 2132 2133 /* 2134 * Whiteouts have no deletion dependencies. 2135 */ 2136 if (ip == NULL) 2137 panic("newdirrem: whiteout"); 2138 MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), 2139 M_DIRREM, M_WAITOK); 2140 bzero(dirrem, sizeof(struct dirrem)); 2141 dirrem->dm_list.wk_type = M_DIRREM; 2142 dirrem->dm_state = isrmdir ? 
RMDIR : 0; 2143 dirrem->dm_mnt = ITOV(ip)->v_mount; 2144 dirrem->dm_oldinum = ip->i_number; 2145 2146 ACQUIRE_LOCK(&lk); 2147 lbn = lblkno(dp->i_fs, dp->i_offset); 2148 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 2149 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2150 dirrem->dm_pagedep = pagedep; 2151 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(dp->i_offset)]); 2152 dap; dap = LIST_NEXT(dap, da_pdlist)) { 2153 /* 2154 * Check for a diradd dependency for the same directory entry. 2155 * If present, then both dependencies become obsolete and can 2156 * be de-allocated. 2157 */ 2158 if (dap->da_offset != dp->i_offset) 2159 continue; 2160 /* 2161 * Must be ATTACHED at this point, so just delete it. 2162 */ 2163 if ((dap->da_state & ATTACHED) == 0) 2164 panic("newdirrem: not ATTACHED"); 2165 if (dap->da_newinum != ip->i_number) 2166 panic("newdirrem: inum %d should be %d", 2167 ip->i_number, dap->da_newinum); 2168 free_diradd(dap); 2169 dirrem->dm_state |= COMPLETE; 2170 break; 2171 } 2172 return (dirrem); 2173} 2174 2175/* 2176 * Directory entry change dependencies. 2177 * 2178 * Changing an existing directory entry requires that an add operation 2179 * be completed first followed by a deletion. The semantics for the addition 2180 * are identical to the description of adding a new entry above except 2181 * that the rollback is to the old inode number rather than zero. Once 2182 * the addition dependency is completed, the removal is done as described 2183 * in the removal routine above. 2184 */ 2185 2186/* 2187 * This routine should be called immediately after changing 2188 * a directory entry. The inode's link count should not be 2189 * decremented by the calling procedure -- the soft updates 2190 * code will perform this task when it is safe. 
2191 */ 2192void 2193softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 2194 struct buf *bp; /* buffer containing directory block */ 2195 struct inode *dp; /* inode for the directory being modified */ 2196 struct inode *ip; /* inode for directory entry being removed */ 2197 long newinum; /* new inode number for changed entry */ 2198 int isrmdir; /* indicates if doing RMDIR */ 2199{ 2200 int offset; 2201 struct diradd *dap; 2202 struct dirrem *dirrem; 2203 struct inodedep *inodedep; 2204 2205 offset = blkoff(dp->i_fs, dp->i_offset); 2206 2207 /* 2208 * Whiteouts have no addition dependencies. 2209 */ 2210 if (newinum == WINO) { 2211 dap = NULL; 2212 } else { 2213 MALLOC(dap, struct diradd *, sizeof(struct diradd), 2214 M_DIRADD, M_WAITOK); 2215 bzero(dap, sizeof(struct diradd)); 2216 dap->da_list.wk_type = M_DIRADD; 2217 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 2218 dap->da_offset = offset; 2219 dap->da_newinum = newinum; 2220 } 2221 2222 /* 2223 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. 2224 */ 2225 dirrem = newdirrem(bp, dp, ip, isrmdir); 2226 2227 /* 2228 * If the inode has already been written, then no addition 2229 * dependency needs to be created. 2230 */ 2231 if (inodedep_lookup(dp->i_fs, newinum, 0, &inodedep) == 0 || 2232 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 2233 WORKITEM_FREE(dap, M_DIRADD); 2234 dap = NULL; 2235 } 2236 2237 if (dap) { 2238 dap->da_previous = dirrem; 2239 LIST_INSERT_HEAD( 2240 &dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)], 2241 dap, da_pdlist); 2242 WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list); 2243 } else if ((dirrem->dm_state & COMPLETE) == 0) { 2244 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 2245 dm_next); 2246 } else { 2247 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 2248 add_to_worklist(&dirrem->dm_list); 2249 } 2250 FREE_LOCK(&lk); 2251} 2252 2253/* 2254 * Called whenever the link count on an inode is increased. 
2255 * It creates an inode dependency so that the new reference(s) 2256 * to the inode cannot be committed to disk until the updated 2257 * inode has been written. 2258 */ 2259void 2260softdep_increase_linkcnt(ip) 2261 struct inode *ip; /* the inode with the increased link count */ 2262{ 2263 struct inodedep *inodedep; 2264 2265 ACQUIRE_LOCK(&lk); 2266 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); 2267 FREE_LOCK(&lk); 2268} 2269 2270/* 2271 * This workitem decrements the inode's link count. 2272 * If the link count reaches zero, the file is removed. 2273 */ 2274static void 2275handle_workitem_remove(dirrem) 2276 struct dirrem *dirrem; 2277{ 2278 struct proc *p = curproc; /* XXX */ 2279 struct inodedep *inodedep; 2280 struct vnode *vp; 2281 struct inode *ip; 2282 int error; 2283 2284 if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) { 2285 softdep_error("handle_workitem_remove: vget", error); 2286 return; 2287 } 2288 ip = VTOI(vp); 2289 /* 2290 * Normal file deletion. 2291 */ 2292 if ((dirrem->dm_state & RMDIR) == 0) { 2293 ip->i_nlink--; 2294 if (ip->i_nlink < ip->i_effnlink) 2295 panic("handle_workitem_remove: bad file delta"); 2296 ip->i_flag |= IN_CHANGE; 2297 vput(vp); 2298 WORKITEM_FREE(dirrem, M_DIRREM); 2299 return; 2300 } 2301 /* 2302 * Directory deletion. Decrement reference count for both the 2303 * just deleted parent directory entry and the reference for ".". 2304 * Next truncate the directory to length zero. When the 2305 * truncation completes, arrange to have the reference count on 2306 * the parent decremented to account for the loss of "..". 
2307 */ 2308 ip->i_nlink -= 2; 2309 if (ip->i_nlink < ip->i_effnlink) 2310 panic("handle_workitem_remove: bad dir delta"); 2311 ip->i_flag |= IN_CHANGE; 2312 if ((error = VOP_TRUNCATE(vp, (off_t)0, 0, p->p_cred, p)) != 0) 2313 softdep_error("handle_workitem_remove: truncate", error); 2314 ACQUIRE_LOCK(&lk); 2315 (void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC, 2316 &inodedep); 2317 dirrem->dm_state = 0; 2318 dirrem->dm_oldinum = dirrem->dm_dirinum; 2319 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 2320 FREE_LOCK(&lk); 2321 vput(vp); 2322} 2323 2324/* 2325 * Inode de-allocation dependencies. 2326 * 2327 * When an inode's link count is reduced to zero, it can be de-allocated. We 2328 * found it convenient to postpone de-allocation until after the inode is 2329 * written to disk with its new link count (zero). At this point, all of the 2330 * on-disk inode's block pointers are nullified and, with careful dependency 2331 * list ordering, all dependencies related to the inode will be satisfied and 2332 * the corresponding dependency structures de-allocated. So, if/when the 2333 * inode is reused, there will be no mixing of old dependencies with new 2334 * ones. This artificial dependency is set up by the block de-allocation 2335 * procedure above (softdep_setup_freeblocks) and completed by the 2336 * following procedure. 
2337 */ 2338static void 2339handle_workitem_freefile(freefile) 2340 struct freefile *freefile; 2341{ 2342 struct vnode vp; 2343 struct inode tip; 2344 struct inodedep *idp; 2345 struct vop_vfree_args args; 2346 int error; 2347 2348#ifdef DEBUG 2349 ACQUIRE_LOCK(&lk); 2350 if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp)) 2351 panic("handle_workitem_freefile: inodedep survived"); 2352 FREE_LOCK(&lk); 2353#endif 2354 tip.i_devvp = freefile->fx_devvp; 2355 tip.i_dev = freefile->fx_devvp->v_rdev; 2356 tip.i_fs = freefile->fx_fs; 2357 vp.v_data = &tip; 2358 args.a_pvp = &vp; 2359 args.a_ino = freefile->fx_oldinum; 2360 args.a_mode = freefile->fx_mode; 2361 if ((error = ffs_freefile(&args)) != 0) 2362 softdep_error("handle_workitem_freefile", error); 2363 WORKITEM_FREE(freefile, M_FREEFILE); 2364} 2365 2366/* 2367 * Disk writes. 2368 * 2369 * The dependency structures constructed above are most actively used when file 2370 * system blocks are written to disk. No constraints are placed on when a 2371 * block can be written, but unsatisfied update dependencies are made safe by 2372 * modifying (or replacing) the source memory for the duration of the disk 2373 * write. When the disk write completes, the memory block is again brought 2374 * up-to-date. 2375 * 2376 * In-core inode structure reclamation. 2377 * 2378 * Because there are a finite number of "in-core" inode structures, they are 2379 * reused regularly. By transferring all inode-related dependencies to the 2380 * in-memory inode block and indexing them separately (via "inodedep"s), we 2381 * can allow "in-core" inode structures to be reused at any time and avoid 2382 * any increase in contention. 2383 * 2384 * Called just before entering the device driver to initiate a new disk I/O. 2385 * The buffer must be locked, thus, no I/O completion operations can occur 2386 * while we are manipulating its associated dependencies. 
2387 */ 2388void 2389softdep_disk_io_initiation(bp) 2390 struct buf *bp; /* structure describing disk write to occur */ 2391{ 2392 struct worklist *wk, *nextwk; 2393 struct indirdep *indirdep; 2394 2395 /* 2396 * We only care about write operations. There should never 2397 * be dependencies for reads. 2398 */ 2399 if (bp->b_flags & B_READ) 2400 panic("softdep_disk_io_initiation: read"); 2401 /* 2402 * Do any necessary pre-I/O processing. 2403 */ 2404 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) { 2405 nextwk = LIST_NEXT(wk, wk_list); 2406 switch (wk->wk_type) { 2407 2408 case M_PAGEDEP: 2409 initiate_write_filepage(WK_PAGEDEP(wk), bp); 2410 continue; 2411 2412 case M_INODEDEP: 2413 initiate_write_inodeblock(WK_INODEDEP(wk), bp); 2414 continue; 2415 2416 case M_INDIRDEP: 2417 indirdep = WK_INDIRDEP(wk); 2418 if (indirdep->ir_state & GOINGAWAY) 2419 panic("disk_io_initiation: indirdep gone"); 2420 /* 2421 * If there are no remaining dependencies, this 2422 * will be writing the real pointers, so the 2423 * dependency can be freed. 2424 */ 2425 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { 2426 brelse(indirdep->ir_savebp); 2427 /* inline expand WORKLIST_REMOVE(wk); */ 2428 wk->wk_state &= ~ONWORKLIST; 2429 LIST_REMOVE(wk, wk_list); 2430 WORKITEM_FREE(indirdep, M_INDIRDEP); 2431 continue; 2432 } 2433 /* 2434 * Replace up-to-date version with safe version. 2435 */ 2436 ACQUIRE_LOCK(&lk); 2437 indirdep->ir_state &= ~ATTACHED; 2438 indirdep->ir_state |= UNDONE; 2439 bp->b_data = indirdep->ir_savebp->b_data; 2440 FREE_LOCK(&lk); 2441 continue; 2442 2443 case M_MKDIR: 2444 case M_BMSAFEMAP: 2445 case M_ALLOCDIRECT: 2446 case M_ALLOCINDIR: 2447 continue; 2448 2449 default: 2450 panic("handle_disk_io_initiation: Unexpected type %s", 2451 TYPENAME(wk->wk_type)); 2452 /* NOTREACHED */ 2453 } 2454 } 2455} 2456 2457/* 2458 * Called from within the procedure above to deal with unsatisfied 2459 * allocation dependencies in a directory. 
The buffer must be locked, 2460 * thus, no I/O completion operations can occur while we are 2461 * manipulating its associated dependencies. 2462 */ 2463static void 2464initiate_write_filepage(pagedep, bp) 2465 struct pagedep *pagedep; 2466 struct buf *bp; 2467{ 2468 struct diradd *dap; 2469 struct direct *ep; 2470 int i; 2471 2472 if (pagedep->pd_state & IOSTARTED) { 2473 /* 2474 * This can only happen if there is a driver that does not 2475 * understand chaining. Here biodone will reissue the call 2476 * to strategy for the incomplete buffers. 2477 */ 2478 printf("initiate_write_filepage: already started\n"); 2479 return; 2480 } 2481 pagedep->pd_state |= IOSTARTED; 2482 ACQUIRE_LOCK(&lk); 2483 for (i = 0; i < DAHASHSZ; i++) { 2484 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 2485 dap = LIST_NEXT(dap, da_pdlist)) { 2486 ep = (struct direct *) 2487 ((char *)bp->b_data + dap->da_offset); 2488 if (ep->d_ino != dap->da_newinum) 2489 panic("%s: dir inum %d != new %d", 2490 "initiate_write_filepage", 2491 ep->d_ino, dap->da_newinum); 2492 if (dap->da_state & DIRCHG) 2493 ep->d_ino = dap->da_previous->dm_oldinum; 2494 else 2495 ep->d_ino = 0; 2496 dap->da_state &= ~ATTACHED; 2497 dap->da_state |= UNDONE; 2498 } 2499 } 2500 FREE_LOCK(&lk); 2501} 2502 2503/* 2504 * Called from within the procedure above to deal with unsatisfied 2505 * allocation dependencies in an inodeblock. The buffer must be 2506 * locked, thus, no I/O completion operations can occur while we 2507 * are manipulating its associated dependencies. 
2508 */ 2509static void 2510initiate_write_inodeblock(inodedep, bp) 2511 struct inodedep *inodedep; 2512 struct buf *bp; /* The inode block */ 2513{ 2514 struct allocdirect *adp, *lastadp; 2515 struct dinode *dp; 2516 struct fs *fs; 2517 ufs_lbn_t prevlbn; 2518 int i, deplist; 2519 2520 if (inodedep->id_state & IOSTARTED) 2521 panic("initiate_write_inodeblock: already started"); 2522 inodedep->id_state |= IOSTARTED; 2523 fs = inodedep->id_fs; 2524 dp = (struct dinode *)bp->b_data + 2525 ino_to_fsbo(fs, inodedep->id_ino); 2526 /* 2527 * If the bitmap is not yet written, then the allocated 2528 * inode cannot be written to disk. 2529 */ 2530 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 2531 if (inodedep->id_savedino != NULL) 2532 panic("initiate_write_inodeblock: already doing I/O"); 2533 MALLOC(inodedep->id_savedino, struct dinode *, 2534 sizeof(struct dinode), M_INODEDEP, M_WAITOK); 2535 *inodedep->id_savedino = *dp; 2536 bzero((caddr_t)dp, sizeof(struct dinode)); 2537 return; 2538 } 2539 /* 2540 * If no dependencies, then there is nothing to roll back. 2541 */ 2542 inodedep->id_savedsize = dp->di_size; 2543 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) 2544 return; 2545 /* 2546 * Set the dependencies to busy. 
2547 */ 2548 ACQUIRE_LOCK(&lk); 2549 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 2550 adp = TAILQ_NEXT(adp, ad_next)) { 2551#ifdef DIAGNOSTIC 2552 if (deplist != 0 && prevlbn >= adp->ad_lbn) 2553 panic("softdep_write_inodeblock: lbn order"); 2554 prevlbn = adp->ad_lbn; 2555 if (adp->ad_lbn < NDADDR && 2556 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) 2557 panic("%s: direct pointer #%d mismatch %d != %d", 2558 "softdep_write_inodeblock", adp->ad_lbn, 2559 dp->di_db[adp->ad_lbn], adp->ad_newblkno); 2560 if (adp->ad_lbn >= NDADDR && 2561 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) 2562 panic("%s: indirect pointer #%d mismatch %d != %d", 2563 "softdep_write_inodeblock", adp->ad_lbn - NDADDR, 2564 dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno); 2565 deplist |= 1 << adp->ad_lbn; 2566 if ((adp->ad_state & ATTACHED) == 0) 2567 panic("softdep_write_inodeblock: Unknown state 0x%x", 2568 adp->ad_state); 2569#endif /* DIAGNOSTIC */ 2570 adp->ad_state &= ~ATTACHED; 2571 adp->ad_state |= UNDONE; 2572 } 2573 /* 2574 * The on-disk inode cannot claim to be any larger than the last 2575 * fragment that has been written. Otherwise, the on-disk inode 2576 * might have fragments that were not the last block in the file 2577 * which would corrupt the filesystem. 
2578 */ 2579 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 2580 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 2581 if (adp->ad_lbn >= NDADDR) 2582 break; 2583 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; 2584 /* keep going until hitting a rollback to a frag */ 2585 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 2586 continue; 2587 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; 2588 for (i = adp->ad_lbn + 1; i < NDADDR; i++) { 2589#ifdef DIAGNOSTIC 2590 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 2591 panic("softdep_write_inodeblock: lost dep1"); 2592#endif /* DIAGNOSTIC */ 2593 dp->di_db[i] = 0; 2594 } 2595 for (i = 0; i < NIADDR; i++) { 2596#ifdef DIAGNOSTIC 2597 if (dp->di_ib[i] != 0 && 2598 (deplist & ((1 << NDADDR) << i)) == 0) 2599 panic("softdep_write_inodeblock: lost dep2"); 2600#endif /* DIAGNOSTIC */ 2601 dp->di_ib[i] = 0; 2602 } 2603 FREE_LOCK(&lk); 2604 return; 2605 } 2606 /* 2607 * If we have zero'ed out the last allocated block of the file, 2608 * roll back the size to the last currently allocated block. 2609 * We know that this last allocated block is a full-sized as 2610 * we already checked for fragments in the loop above. 2611 */ 2612 if (lastadp != NULL && 2613 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { 2614 for (i = lastadp->ad_lbn; i >= 0; i--) 2615 if (dp->di_db[i] != 0) 2616 break; 2617 dp->di_size = (i + 1) * fs->fs_bsize; 2618 } 2619 /* 2620 * The only dependencies are for indirect blocks. 2621 * 2622 * The file size for indirect block additions is not guaranteed. 2623 * Such a guarantee would be non-trivial to achieve. The conventional 2624 * synchronous write implementation also does not make this guarantee. 2625 * Fsck should catch and fix discrepancies. Arguably, the file size 2626 * can be over-estimated without destroying integrity when the file 2627 * moves into the indirect blocks (i.e., is large). 
If we want to 2628 * postpone fsck, we are stuck with this argument. 2629 */ 2630 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 2631 dp->di_ib[adp->ad_lbn - NDADDR] = 0; 2632 FREE_LOCK(&lk); 2633} 2634 2635/* 2636 * This routine is called during the completion interrupt 2637 * service routine for a disk write (from the procedure called 2638 * by the device driver to inform the file system caches of 2639 * a request completion). It should be called early in this 2640 * procedure, before the block is made available to other 2641 * processes or other routines are called. 2642 */ 2643void 2644softdep_disk_write_complete(bp) 2645 struct buf *bp; /* describes the completed disk write */ 2646{ 2647 struct worklist *wk; 2648 struct workhead reattach; 2649 struct newblk *newblk; 2650 struct allocindir *aip; 2651 struct allocdirect *adp; 2652 struct indirdep *indirdep; 2653 struct inodedep *inodedep; 2654 struct bmsafemap *bmsafemap; 2655 2656#ifdef DEBUG 2657 if (lk.lkt_held != -1) 2658 panic("softdep_disk_write_complete: lock is held"); 2659 lk.lkt_held = -2; 2660#endif 2661 LIST_INIT(&reattach); 2662 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 2663 WORKLIST_REMOVE(wk); 2664 switch (wk->wk_type) { 2665 2666 case M_PAGEDEP: 2667 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 2668 WORKLIST_INSERT(&reattach, wk); 2669 continue; 2670 2671 case M_INODEDEP: 2672 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 2673 WORKLIST_INSERT(&reattach, wk); 2674 continue; 2675 2676 case M_BMSAFEMAP: 2677 bmsafemap = WK_BMSAFEMAP(wk); 2678 while (newblk = LIST_FIRST(&bmsafemap->sm_newblkhd)) { 2679 newblk->nb_state |= DEPCOMPLETE; 2680 newblk->nb_bmsafemap = NULL; 2681 LIST_REMOVE(newblk, nb_deps); 2682 } 2683 while (adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd)) { 2684 adp->ad_state |= DEPCOMPLETE; 2685 adp->ad_buf = NULL; 2686 LIST_REMOVE(adp, ad_deps); 2687 handle_allocdirect_partdone(adp); 2688 } 2689 while (aip = LIST_FIRST(&bmsafemap->sm_allocindirhd)) { 2690 aip->ai_state |= 
DEPCOMPLETE; 2691 aip->ai_buf = NULL; 2692 LIST_REMOVE(aip, ai_deps); 2693 handle_allocindir_partdone(aip); 2694 } 2695 while ((inodedep = 2696 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { 2697 inodedep->id_state |= DEPCOMPLETE; 2698 LIST_REMOVE(inodedep, id_deps); 2699 inodedep->id_buf = NULL; 2700 } 2701 WORKITEM_FREE(bmsafemap, M_BMSAFEMAP); 2702 continue; 2703 2704 case M_MKDIR: 2705 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 2706 continue; 2707 2708 case M_ALLOCDIRECT: 2709 adp = WK_ALLOCDIRECT(wk); 2710 adp->ad_state |= COMPLETE; 2711 handle_allocdirect_partdone(adp); 2712 continue; 2713 2714 case M_ALLOCINDIR: 2715 aip = WK_ALLOCINDIR(wk); 2716 aip->ai_state |= COMPLETE; 2717 handle_allocindir_partdone(aip); 2718 continue; 2719 2720 case M_INDIRDEP: 2721 indirdep = WK_INDIRDEP(wk); 2722 if (indirdep->ir_state & GOINGAWAY) 2723 panic("disk_write_complete: indirdep gone"); 2724 bp->b_data = (caddr_t)indirdep->ir_saveddata; 2725 indirdep->ir_state &= ~UNDONE; 2726 indirdep->ir_state |= ATTACHED; 2727 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 2728 LIST_REMOVE(aip, ai_next); 2729 handle_allocindir_partdone(aip); 2730 } 2731 WORKLIST_INSERT(&reattach, wk); 2732 bdirty(bp); 2733 continue; 2734 2735 default: 2736 panic("handle_disk_write_complete: Unknown type %s", 2737 TYPENAME(wk->wk_type)); 2738 /* NOTREACHED */ 2739 } 2740 } 2741 /* 2742 * Reattach any requests that must be redone. 2743 */ 2744 while ((wk = LIST_FIRST(&reattach)) != NULL) { 2745 WORKLIST_REMOVE(wk); 2746 WORKLIST_INSERT(&bp->b_dep, wk); 2747 } 2748#ifdef DEBUG 2749 if (lk.lkt_held != -2) 2750 panic("softdep_disk_write_complete: lock lost"); 2751 lk.lkt_held = -1; 2752#endif 2753} 2754 2755/* 2756 * Called from within softdep_disk_write_complete above. Note that 2757 * this routine is always called from interrupt level with further 2758 * splbio interrupts blocked. 
2759 */ 2760static void 2761handle_allocdirect_partdone(adp) 2762 struct allocdirect *adp; /* the completed allocdirect */ 2763{ 2764 struct allocdirect *listadp; 2765 struct inodedep *inodedep; 2766 long bsize; 2767 2768 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 2769 return; 2770 if (adp->ad_buf != NULL) 2771 panic("handle_allocdirect_partdone: dangling dep"); 2772 /* 2773 * The on-disk inode cannot claim to be any larger than the last 2774 * fragment that has been written. Otherwise, the on-disk inode 2775 * might have fragments that were not the last block in the file 2776 * which would corrupt the filesystem. Thus, we cannot free any 2777 * allocdirects after one whose ad_oldblkno claims a fragment as 2778 * these blocks must be rolled back to zero before writing the inode. 2779 * We check the currently active set of allocdirects in id_inoupdt. 2780 */ 2781 inodedep = adp->ad_inodedep; 2782 bsize = inodedep->id_fs->fs_bsize; 2783 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp; 2784 listadp = TAILQ_NEXT(listadp, ad_next)) { 2785 /* found our block */ 2786 if (listadp == adp) 2787 break; 2788 /* continue if ad_oldlbn is not a fragment */ 2789 if (listadp->ad_oldsize == 0 || 2790 listadp->ad_oldsize == bsize) 2791 continue; 2792 /* hit a fragment */ 2793 return; 2794 } 2795 /* 2796 * If we have reached the end of the current list without 2797 * finding the just finished dependency, then it must be 2798 * on the future dependency list. Future dependencies cannot 2799 * be freed until they are moved to the current list. 
2800 */ 2801 if (listadp == NULL) { 2802#ifdef DEBUG 2803 for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp; 2804 listadp = TAILQ_NEXT(listadp, ad_next)) 2805 /* found our block */ 2806 if (listadp == adp) 2807 break; 2808 if (listadp == NULL) 2809 panic("handle_allocdirect_partdone: lost dep"); 2810#endif /* DEBUG */ 2811 return; 2812 } 2813 /* 2814 * If we have found the just finished dependency, then free 2815 * it along with anything that follows it that is complete. 2816 */ 2817 for (; adp; adp = listadp) { 2818 listadp = TAILQ_NEXT(adp, ad_next); 2819 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 2820 return; 2821 free_allocdirect(&inodedep->id_inoupdt, adp, 1); 2822 } 2823 /* 2824 * Try freeing the inodedep in case that was the last dependency. 2825 */ 2826 (void) free_inodedep(inodedep); 2827} 2828 2829/* 2830 * Called from within softdep_disk_write_complete above. Note that 2831 * this routine is always called from interrupt level with further 2832 * splbio interrupts blocked. 2833 */ 2834static void 2835handle_allocindir_partdone(aip) 2836 struct allocindir *aip; /* the completed allocindir */ 2837{ 2838 struct indirdep *indirdep; 2839 2840 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 2841 return; 2842 if (aip->ai_buf != NULL) 2843 panic("handle_allocindir_partdone: dangling dependency"); 2844 indirdep = aip->ai_indirdep; 2845 if (indirdep->ir_state & UNDONE) { 2846 LIST_REMOVE(aip, ai_next); 2847 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 2848 return; 2849 } 2850 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 2851 aip->ai_newblkno; 2852 LIST_REMOVE(aip, ai_next); 2853 if (aip->ai_freefrag != NULL) 2854 add_to_worklist(&aip->ai_freefrag->ff_list); 2855 WORKITEM_FREE(aip, M_ALLOCINDIR); 2856} 2857 2858/* 2859 * Called from within softdep_disk_write_complete above to restore 2860 * in-memory inode block contents to their most up-to-date state. 
 * Note that this routine is always called from interrupt level with
 * further splbio interrupts blocked.
 *
 * Returns non-zero if the buffer must stay dirty (rollbacks were
 * applied or dependencies remain); zero otherwise.
 */
static int
handle_written_inodeblock(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;		/* buffer containing the inode block */
{
	struct pagedep *pagedep;
	struct worklist *wk, *filefree;
	struct allocdirect *adp, *nextadp;
	struct dinode *dp;
	struct diradd *dap;
	int hadchanges;

	if ((inodedep->id_state & IOSTARTED) == 0)
		panic("handle_written_inodeblock: not started");
	inodedep->id_state &= ~IOSTARTED;
	inodedep->id_state |= COMPLETE;
	/* Locate this inode's dinode within the inode block. */
	dp = (struct dinode *)bp->b_data +
	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
	/*
	 * If we had to rollback the inode allocation because of
	 * bitmaps being incomplete, then simply restore it.
	 * Keep the block dirty so that it will not be reclaimed until
	 * all associated dependencies have been cleared and the
	 * corresponding updates written to disk.
	 */
	if (inodedep->id_savedino != NULL) {
		*dp = *inodedep->id_savedino;
		FREE(inodedep->id_savedino, M_INODEDEP);
		inodedep->id_savedino = NULL;
		bdirty(bp);
		return (1);
	}
	/*
	 * Roll forward anything that had to be rolled back before
	 * the inode could be updated.
	 */
	hadchanges = 0;
	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
		nextadp = TAILQ_NEXT(adp, ad_next);
		if (adp->ad_state & ATTACHED)
			panic("handle_written_inodeblock: new entry");
		if (adp->ad_lbn < NDADDR) {
			/* Direct block: the on-disk copy must still hold
			 * the rolled-back (old) pointer. */
			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
				panic("%s: %s #%d mismatch %d != %d",
				    "handle_written_inodeblock",
				    "direct pointer", adp->ad_lbn,
				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
		} else {
			/* Indirect block: must have been rolled back to 0. */
			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
				panic("%s: %s #%d allocated as %d",
				    "handle_written_inodeblock",
				    "indirect pointer", adp->ad_lbn - NDADDR,
				    dp->di_ib[adp->ad_lbn - NDADDR]);
			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
		}
		adp->ad_state &= ~UNDONE;
		adp->ad_state |= ATTACHED;
		hadchanges = 1;
	}
	/*
	 * Reset the file size to its most up-to-date value.
	 */
	if (inodedep->id_savedsize == -1)
		panic("handle_written_inodeblock: bad size");
	if (dp->di_size != inodedep->id_savedsize) {
		dp->di_size = inodedep->id_savedsize;
		hadchanges = 1;
	}
	inodedep->id_savedsize = -1;
	/*
	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
	 * its correct form.
	 */
	if (hadchanges)
		bdirty(bp);
	/*
	 * Process any allocdirects that completed during the update.
	 */
	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
		handle_allocdirect_partdone(adp);
	/*
	 * Process deallocations that were held pending until the
	 * inode had been written to disk. Freeing of the inode
	 * is delayed until after all blocks have been freed to
	 * avoid creation of new <vfsid, inum, lbn> triples
	 * before the old ones have been deleted.
	 */
	filefree = NULL;
	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {

		case M_FREEFILE:
			/*
			 * We defer adding filefree to the worklist until
			 * all other additions have been made to ensure
			 * that it will be done after all the old blocks
			 * have been freed.
			 */
			if (filefree != NULL)
				panic("handle_written_inodeblock: filefree");
			filefree = wk;
			continue;

		case M_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
			continue;

		case M_DIRADD:
			/* The inode is on disk; its directory entry may
			 * now move to the pending list if all other
			 * dependencies have completed. */
			dap = WK_DIRADD(wk);
			dap->da_state |= COMPLETE;
			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
				if (dap->da_state & DIRCHG)
					pagedep = dap->da_previous->dm_pagedep;
				else
					pagedep = dap->da_pagedep;
				LIST_REMOVE(dap, da_pdlist);
				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
				    da_pdlist);
			}
			WORKLIST_INSERT(&inodedep->id_pendinghd, wk);
			continue;

		case M_FREEBLKS:
		case M_FREEFRAG:
		case M_DIRREM:
			add_to_worklist(wk);
			continue;

		default:
			panic("handle_written_inodeblock: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	if (filefree != NULL)
		add_to_worklist(filefree);

	/*
	 * If no outstanding dependencies, free it.
	 */
	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
		return (0);
	return (hadchanges);
}

/*
 * Handle the completion of a mkdir dependency.
 */
static void
handle_written_mkdir(mkdir, type)
	struct mkdir *mkdir;	/* the completed mkdir dependency */
	int type;		/* MKDIR_PARENT or MKDIR_BODY */
{
	struct diradd *dap;
	struct pagedep *pagedep;

	/* The caller must complete exactly the dependency this
	 * mkdir structure tracks. */
	if (mkdir->md_state != type)
		panic("handle_written_mkdir: bad type");
	dap = mkdir->md_diradd;
	dap->da_state &= ~type;
	/* Once both halves of the mkdir are done, the diradd's
	 * dependencies are complete. */
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
		dap->da_state |= DEPCOMPLETE;
	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	}
	LIST_REMOVE(mkdir, md_mkdirs);
	WORKITEM_FREE(mkdir, M_MKDIR);
}

/*
 * Called from within softdep_disk_write_complete above.
 * A write operation was just completed. Removed inodes can
 * now be freed and associated block pointers may be committed.
 * Note that this routine is always called from interrupt level
 * with further splbio interrupts blocked.
 *
 * Returns non-zero if the pagedep still has dependencies and the
 * page buffer must be kept; zero if the pagedep was freed.
 */
static int
handle_written_filepage(pagedep, bp)
	struct pagedep *pagedep;
	struct buf *bp;		/* buffer containing the written page */
{
	struct dirrem *dirrem;
	struct diradd *dap, *nextdap;
	struct direct *ep;
	int i, chgs;

	if ((pagedep->pd_state & IOSTARTED) == 0)
		panic("handle_written_filepage: not started");
	pagedep->pd_state &= ~IOSTARTED;
	/*
	 * Process any directory removals that have been committed.
	 */
	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
		LIST_REMOVE(dirrem, dm_next);
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	/*
	 * Free any directory additions that have been committed.
	 */
	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
		free_diradd(dap);
	/*
	 * Uncommitted directory entries must be restored.
	 */
	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
		     dap = nextdap) {
			nextdap = LIST_NEXT(dap, da_pdlist);
			if (dap->da_state & ATTACHED)
				panic("handle_written_filepage: attached");
			/* Roll the new inode number forward into the
			 * directory entry now that the write is done. */
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			ep->d_ino = dap->da_newinum;
			dap->da_state &= ~UNDONE;
			dap->da_state |= ATTACHED;
			chgs = 1;
			/*
			 * If the inode referenced by the directory has
			 * been written out, then the dependency can be
			 * moved to the pending list.
			 */
			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
				LIST_REMOVE(dap, da_pdlist);
				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
				    da_pdlist);
			}
		}
	}
	/*
	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
	 * its correct form.
	 */
	if (chgs)
		bdirty(bp);
	/*
	 * If no dependencies remain, the pagedep will be freed.
	 * Otherwise it will remain to update the page before it
	 * is written back to disk.
	 */
	if (LIST_FIRST(&pagedep->pd_dirremhd) == 0 &&
	    LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
		for (i = 0; i < DAHASHSZ; i++)
			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
				break;
		if (i == DAHASHSZ) {
			LIST_REMOVE(pagedep, pd_hash);
			WORKITEM_FREE(pagedep, M_PAGEDEP);
			return (0);
		}
	}
	return (1);
}

/*
 * Writing back in-core inode structures.
 *
 * The file system only accesses an inode's contents when it occupies an
 * "in-core" inode structure. These "in-core" structures are separate from
 * the page frames used to cache inode blocks. Only the latter are
 * transferred to/from the disk.
So, when the updated contents of the 3134 * "in-core" inode structure are copied to the corresponding in-memory inode 3135 * block, the dependencies are also transferred. The following procedure is 3136 * called when copying a dirty "in-core" inode to a cached inode block. 3137 */ 3138 3139/* 3140 * Called when an inode is loaded from disk. If the effective link count 3141 * differed from the actual link count when it was last flushed, then we 3142 * need to ensure that the correct effective link count is put back. 3143 */ 3144void 3145softdep_load_inodeblock(ip) 3146 struct inode *ip; /* the "in_core" copy of the inode */ 3147{ 3148 struct inodedep *inodedep; 3149 int error, gotit; 3150 3151 /* 3152 * Check for alternate nlink count. 3153 */ 3154 ip->i_effnlink = ip->i_nlink; 3155 ACQUIRE_LOCK(&lk); 3156 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { 3157 FREE_LOCK(&lk); 3158 return; 3159 } 3160 if (inodedep->id_nlinkdelta != 0) { 3161 ip->i_effnlink -= inodedep->id_nlinkdelta; 3162 inodedep->id_nlinkdelta = 0; 3163 (void) free_inodedep(inodedep); 3164 } 3165 FREE_LOCK(&lk); 3166} 3167 3168/* 3169 * This routine is called just before the "in-core" inode 3170 * information is to be copied to the in-memory inode block. 3171 * Recall that an inode block contains several inodes. If 3172 * the force flag is set, then the dependencies will be 3173 * cleared so that the update can always be made. Note that 3174 * the buffer is locked when this routine is called, so we 3175 * will never be in the middle of writing the inode block 3176 * to disk. 
 */
void
softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;	/* the "in_core" copy of the inode */
	struct buf *bp;		/* the buffer containing the inode block */
	int waitfor;		/* 1 => update must be allowed */
{
	struct inodedep *inodedep;
	int error, gotit;

	/*
	 * If the effective link count is not equal to the actual link
	 * count, then we must track the difference in an inodedep while
	 * the inode is (potentially) tossed out of the cache. Otherwise,
	 * if there is no existing inodedep, then there are no dependencies
	 * to track.
	 */
	ACQUIRE_LOCK(&lk);
	if (ip->i_effnlink != ip->i_nlink) {
		(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC,
		    &inodedep);
	} else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		return;
	}
	/* The effective count only ever lags (never exceeds) i_nlink. */
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_update_inodeblock: bad delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	/*
	 * If the last remaining use for the inodedep was to track the
	 * link count, and there is no difference between the effective
	 * and actual link count, then we can free the inodedep.
	 */
	if (free_inodedep(inodedep)) {
		FREE_LOCK(&lk);
		return;
	}
	/*
	 * Changes have been initiated. Anything depending on these
	 * changes cannot occur until this inode has been written.
	 */
	inodedep->id_state &= ~COMPLETE;
	if ((inodedep->id_state & ONWORKLIST) == 0)
		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
	/*
	 * Any new dependencies associated with the incore inode must
	 * now be moved to the list associated with the buffer holding
	 * the in-memory copy of the inode. Once merged process any
	 * allocdirects that are completed by the merger.
	 */
	merge_inode_lists(inodedep);
	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
	/*
	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them have been written (indicated by
	 * DEPCOMPLETE being set in id_state). If we are doing a
	 * forced sync (e.g., an fsync on a file), we force the bitmap
	 * to be written so that the update can be done.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
		FREE_LOCK(&lk);
		return;
	}
	/* Synchronously write the bitmap buffer holding the allocation. */
	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	FREE_LOCK(&lk);
	if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0)
		softdep_error("softdep_update_inodeblock: bwrite", error);
	if ((inodedep->id_state & DEPCOMPLETE) == 0)
		panic("softdep_update_inodeblock: update failed");
}

/*
 * Merge the new inode dependency list (id_newinoupdt) into the old
 * inode dependency list (id_inoupdt). This routine must be called
 * with splbio interrupts blocked.
 */
static void
merge_inode_lists(inodedep)
	struct inodedep *inodedep;
{
	struct allocdirect *listadp, *newadp;

	/* Both lists are kept sorted by logical block number (ad_lbn). */
	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
		/* Advance past old entries with smaller block numbers. */
		if (listadp->ad_lbn < newadp->ad_lbn) {
			listadp = TAILQ_NEXT(listadp, ad_next);
			continue;
		}
		/* Move the new entry into place in the old list. */
		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
		if (listadp->ad_lbn == newadp->ad_lbn) {
			/* Same block reallocated: merge the two entries;
			 * the new one supersedes the old. */
			allocdirect_merge(&inodedep->id_inoupdt, newadp,
			    listadp);
			listadp = newadp;
		}
		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	}
	/* Any remaining new entries sort after all old ones; append them. */
	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
	}
}

/*
 * If we are doing an fsync, then we must ensure that any directory
 * entries for the inode have been written after the inode gets to disk.
3284 */ 3285int 3286softdep_fsync(vp) 3287 struct vnode *vp; /* the "in_core" copy of the inode */ 3288{ 3289 struct diradd *dap, *olddap; 3290 struct inodedep *inodedep; 3291 struct pagedep *pagedep; 3292 struct worklist *wk; 3293 struct mount *mnt; 3294 struct vnode *pvp; 3295 struct inode *ip; 3296 struct buf *bp; 3297 struct fs *fs; 3298 struct proc *p = curproc; /* XXX */ 3299 int error, ret, flushparent; 3300 struct timeval tv; 3301 ino_t parentino; 3302 ufs_lbn_t lbn; 3303 3304 ip = VTOI(vp); 3305 fs = ip->i_fs; 3306 for (error = 0, flushparent = 0, olddap = NULL; ; ) { 3307 ACQUIRE_LOCK(&lk); 3308 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) 3309 break; 3310 if (LIST_FIRST(&inodedep->id_inowait) != NULL || 3311 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || 3312 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) 3313 panic("softdep_fsync: pending ops"); 3314 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 3315 break; 3316 if (wk->wk_type != M_DIRADD) 3317 panic("softdep_fsync: Unexpcted type %s", 3318 TYPENAME(wk->wk_type)); 3319 dap = WK_DIRADD(wk); 3320 /* 3321 * If we have failed to get rid of all the dependencies 3322 * then something is seriously wrong. 3323 */ 3324 if (dap == olddap) 3325 panic("softdep_fsync: flush failed"); 3326 olddap = dap; 3327 /* 3328 * Flush our parent if this directory entry 3329 * has a MKDIR_PARENT dependency. 3330 */ 3331 if (dap->da_state & DIRCHG) 3332 pagedep = dap->da_previous->dm_pagedep; 3333 else 3334 pagedep = dap->da_pagedep; 3335 mnt = pagedep->pd_mnt; 3336 parentino = pagedep->pd_ino; 3337 lbn = pagedep->pd_lbn; 3338 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 3339 panic("softdep_fsync: dirty"); 3340 flushparent = dap->da_state & MKDIR_PARENT; 3341 /* 3342 * If we are being fsync'ed as part of vgone'ing this vnode, 3343 * then we will not be able to release and recover the 3344 * vnode below, so we just have to give up on writing its 3345 * directory entry out. 
It will eventually be written, just 3346 * not now, but then the user was not asking to have it 3347 * written, so we are not breaking any promises. 3348 */ 3349 if (vp->v_flag & VXLOCK) 3350 break; 3351 /* 3352 * We prevent deadlock by always fetching inodes from the 3353 * root, moving down the directory tree. Thus, when fetching 3354 * our parent directory, we must unlock ourselves before 3355 * requesting the lock on our parent. See the comment in 3356 * ufs_lookup for details on possible races. 3357 */ 3358 FREE_LOCK(&lk); 3359 VOP_UNLOCK(vp, 0, p); 3360 if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) { 3361 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 3362 return (error); 3363 } 3364 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 3365 if (flushparent) { 3366 tv = time; 3367 if (error = VOP_UPDATE(pvp, &tv, &tv, MNT_WAIT)) { 3368 vput(pvp); 3369 return (error); 3370 } 3371 } 3372 /* 3373 * Flush directory page containing the inode's name. 3374 */ 3375 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, 3376 &bp); 3377 vput(pvp); 3378 ret = VOP_BWRITE(bp); 3379 if (error != 0) 3380 return (error); 3381 if (ret != 0) 3382 return (ret); 3383 } 3384 FREE_LOCK(&lk); 3385 return (0); 3386} 3387 3388/* 3389 * This routine is called when we are trying to synchronously flush a 3390 * file. This routine must eliminate any filesystem metadata dependencies 3391 * so that the syncing routine can succeed by pushing the dirty blocks 3392 * associated with the file. If any I/O errors occur, they are returned. 
 */
int
softdep_sync_metadata(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct allocdirect *adp;
	struct allocindir *aip;
	struct buf *bp, *nbp;
	struct worklist *wk;
	int error, waitfor;

	/*
	 * Check whether this vnode is involved in a filesystem
	 * that is doing soft dependency processing.
	 */
	if (vp->v_type != VBLK) {
		if (!DOINGSOFTDEP(vp))
			return (0);
	} else
		if (vp->v_specmountpoint == NULL ||
		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
			return (0);
	/*
	 * Ensure that any direct block dependencies have been cleared.
	 */
	ACQUIRE_LOCK(&lk);
	if (error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number)) {
		FREE_LOCK(&lk);
		return (error);
	}
	/*
	 * For most files, the only metadata dependencies are the
	 * cylinder group maps that allocate their inode or blocks.
	 * The block allocation dependencies can be found by traversing
	 * the dependency lists for any buffers that remain on their
	 * dirty buffer list. The inode allocation dependency will
	 * be resolved when the inode is updated with MNT_WAIT.
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 */
	waitfor = MNT_NOWAIT;
top:
	if (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
		FREE_LOCK(&lk);
		return (0);
	}
	bp = LIST_FIRST(&vp->v_dirtyblkhd);
loop:
	/*
	 * As we hold the buffer locked, none of its dependencies
	 * will disappear.
	 */
	for (wk = LIST_FIRST(&bp->b_dep); wk;
	     wk = LIST_NEXT(wk, wk_list)) {
		switch (wk->wk_type) {

		case M_ALLOCDIRECT:
			/* Push out the cylinder group bitmap buffer that
			 * this direct block allocation is waiting on. */
			adp = WK_ALLOCDIRECT(wk);
			if (adp->ad_state & DEPCOMPLETE)
				break;
			nbp = adp->ad_buf;
			if (getdirtybuf(&nbp, waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = VOP_BWRITE(nbp)) != 0) {
				bawrite(bp);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		case M_ALLOCINDIR:
			/* Same as above, for an indirect block entry. */
			aip = WK_ALLOCINDIR(wk);
			if (aip->ai_state & DEPCOMPLETE)
				break;
			nbp = aip->ai_buf;
			if (getdirtybuf(&nbp, waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = VOP_BWRITE(nbp)) != 0) {
				bawrite(bp);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;

		case M_INDIRDEP:
			/* Push each incomplete allocindir hanging off
			 * this indirect block dependency. */
			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
			     aip; aip = LIST_NEXT(aip, ai_next)) {
				if (aip->ai_state & DEPCOMPLETE)
					continue;
				nbp = aip->ai_buf;
				if (getdirtybuf(&nbp, waitfor) == 0)
					break;
				FREE_LOCK(&lk);
				if (waitfor == MNT_NOWAIT) {
					bawrite(nbp);
				} else if ((error = VOP_BWRITE(nbp)) != 0) {
					bawrite(bp);
					return (error);
				}
				ACQUIRE_LOCK(&lk);
				continue;
			}
			break;

		case M_INODEDEP:
			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
			    WK_INODEDEP(wk)->id_ino)) != 0) {
				FREE_LOCK(&lk);
				bawrite(bp);
				return (error);
			}
			break;

		case M_PAGEDEP:
			/*
			 * We are trying to sync a directory that may
			 * have dependencies on both its own metadata
			 * and/or dependencies on the inodes of any
			 * recently allocated files. We walk its diradd
			 * lists pushing out the associated inode.
			 */
			if (error = flush_pagedep_deps(vp, WK_PAGEDEP(wk))) {
				FREE_LOCK(&lk);
				bawrite(bp);
				return (error);
			}
			break;

		default:
			panic("softdep_sync_metadata: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/* Lock the next dirty buffer before releasing this one, then
	 * push this one out and move on. */
	(void) getdirtybuf(&LIST_NEXT(bp, b_vnbufs), MNT_WAIT);
	nbp = LIST_NEXT(bp, b_vnbufs);
	FREE_LOCK(&lk);
	bawrite(bp);
	ACQUIRE_LOCK(&lk);
	if (nbp != NULL) {
		bp = nbp;
		goto loop;
	}
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 * Once they are all there, proceed with the second pass
	 * which will wait for the I/O as per above.
	 */
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		FREE_LOCK_INTERLOCKED(&lk);
		sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1);
		ACQUIRE_LOCK_INTERLOCKED(&lk);
	}
	/*
	 * The brief unlock is to allow any pent up dependency
	 * processing to be done.
	 */
	if (waitfor == MNT_NOWAIT) {
		waitfor = MNT_WAIT;
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
		goto top;
	}

	/*
	 * If we have managed to get rid of all the dirty buffers,
	 * then we are done. For certain directories and block
	 * devices, we may need to do further work.
	 */
	if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		FREE_LOCK(&lk);
		return (0);
	}

	FREE_LOCK(&lk);
	/*
	 * If we are trying to sync a block device, some of its buffers may
	 * contain metadata that cannot be written until the contents of some
	 * partially written files have been written to disk. The only easy
	 * way to accomplish this is to sync the entire filesystem (luckily
	 * this happens rarely).
	 */
	if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
	     ap->a_p)) != 0)
		return (error);
	return (0);
}

/*
 * Flush the dependencies associated with an inodedep.
 * Called with splbio blocked.
 */
static int
flush_inodedep_deps(fs, ino)
	struct fs *fs;
	ino_t ino;
{
	struct inodedep *inodedep;
	struct allocdirect *adp;
	int error, waitfor;
	struct buf *bp;

	/*
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 * We give a brief window at the top of the loop to allow
	 * any pending I/O to complete.
	 */
	for (waitfor = MNT_NOWAIT; ; ) {
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
		/* All dependencies gone: nothing left to flush. */
		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
			return (0);
		/* Flush buffers tied to the active allocdirect list. */
		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
		     adp = TAILQ_NEXT(adp, ad_next)) {
			if (adp->ad_state & DEPCOMPLETE)
				continue;
			bp = adp->ad_buf;
			if (getdirtybuf(&bp, waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(bp);
			} else if ((error = VOP_BWRITE(bp)) != 0) {
				ACQUIRE_LOCK(&lk);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;
		}
		/* Restart the scan: the lists may have changed while
		 * the lock was dropped. */
		if (adp != NULL)
			continue;
		/* Same pass over the future allocdirect list. */
		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
		     adp = TAILQ_NEXT(adp, ad_next)) {
			if (adp->ad_state & DEPCOMPLETE)
				continue;
			bp = adp->ad_buf;
			if (getdirtybuf(&bp, waitfor) == 0)
				break;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(bp);
			} else if ((error = VOP_BWRITE(bp)) != 0) {
				ACQUIRE_LOCK(&lk);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			break;
		}
		if (adp != NULL)
			continue;
		/*
		 * If pass2, we are done, otherwise do pass 2.
		 */
		if (waitfor == MNT_WAIT)
			break;
		waitfor = MNT_WAIT;
	}
	/*
	 * Try freeing inodedep in case all dependencies have been removed.
	 */
	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	return (0);
}

/*
 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 * Called with splbio blocked.
 */
static int
flush_pagedep_deps(pvp, pagedep)
	struct vnode *pvp;	/* the directory vnode being synced */
	struct pagedep *pagedep;
{
	struct proc *p = curproc;	/* XXX */
	struct diradd *dap;
	struct timeval tv;
	struct vnode *vp;
	int i, error;
	ino_t inum;

	for (i = 0, error = 0; i < DAHASHSZ && error == 0; i++) {
		while ((dap = LIST_FIRST(&pagedep->pd_diraddhd[i])) != NULL) {
			/*
			 * Flush ourselves if this directory entry
			 * has a MKDIR_PARENT dependency.
			 */
			if (dap->da_state & MKDIR_PARENT) {
				tv = time;
				FREE_LOCK(&lk);
				/* NB: on error we break with the lock
				 * released; it is reacquired below. */
				if (error = VOP_UPDATE(pvp, &tv, &tv, MNT_WAIT))
					break;
				ACQUIRE_LOCK(&lk);
				/*
				 * If that cleared dependencies, go on to next.
				 */
				if (dap != LIST_FIRST(&pagedep->pd_diraddhd[i]))
					continue;
				if (dap->da_state & MKDIR_PARENT)
					panic("flush_pagedep_deps: MKDIR");
			}
			/*
			 * Flush the file on which the directory entry depends.
			 */
			inum = dap->da_newinum;
			FREE_LOCK(&lk);
			if ((error = VFS_VGET(pagedep->pd_mnt, inum, &vp)) != 0)
				break;
			if (vp->v_type == VDIR) {
				/*
				 * A newly allocated directory must have its
				 * "." and ".." entries written out before its
				 * name can be committed in its parent. We do
				 * not want or need the full semantics of a
				 * synchronous VOP_FSYNC as that may end up
				 * here again, once for each directory level in
				 * the filesystem. Instead, we push the blocks
				 * and wait for them to clear.
				 */
				if (error =
				    VOP_FSYNC(vp, p->p_cred, MNT_NOWAIT, p)) {
					vput(vp);
					break;
				}
				ACQUIRE_LOCK(&lk);
				/* Wait for the pushed writes to drain. */
				while (vp->v_numoutput) {
					vp->v_flag |= VBWAIT;
					FREE_LOCK_INTERLOCKED(&lk);
					sleep((caddr_t)&vp->v_numoutput,
					    PRIBIO + 1);
					ACQUIRE_LOCK_INTERLOCKED(&lk);
				}
				FREE_LOCK(&lk);
			}
			/* Synchronously update the dependent inode. */
			tv = time;
			error = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT);
			vput(vp);
			if (error)
				break;
			/*
			 * If we have failed to get rid of all the dependencies
			 * then something is seriously wrong.
			 */
			if (dap == LIST_FIRST(&pagedep->pd_diraddhd[i]))
				panic("flush_pagedep_deps: flush failed");
			ACQUIRE_LOCK(&lk);
		}
	}
	/* Error paths break out with the lock released; restore the
	 * caller's expectation of holding it. */
	if (error)
		ACQUIRE_LOCK(&lk);
	return (error);
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with splbio blocked.
 * Return 1 if buffer was acquired.
 */
static int
getdirtybuf(bpp, waitfor)
	struct buf **bpp;
	int waitfor;
{
	struct buf *bp;

	for (;;) {
		if ((bp = *bpp) == NULL)
			return (0);
		if ((bp->b_flags & B_BUSY) == 0)
			break;
		/* Busy: either give up (MNT_NOWAIT) or sleep for it. */
		if (waitfor != MNT_WAIT)
			return (0);
		bp->b_flags |= B_WANTED;
		FREE_LOCK_INTERLOCKED(&lk);
		sleep((caddr_t)bp, PRIBIO + 1);
		ACQUIRE_LOCK_INTERLOCKED(&lk);
	}
	/* Only delayed-write buffers are of interest to the flushers. */
	if ((bp->b_flags & B_DELWRI) == 0)
		return (0);
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	return (1);
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies. This should only happen if an I/O error has
 * occurred. The routine is called with the buffer locked.
3811 */ 3812void 3813softdep_deallocate_dependencies(bp) 3814 struct buf *bp; 3815{ 3816 struct worklist *wk; 3817 3818 if ((bp->b_flags & B_ERROR) == 0) 3819 panic("softdep_deallocate_dependencies: dangling deps"); 3820 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 3821 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 3822 WORKLIST_REMOVE(wk); 3823 switch (wk->wk_type) { 3824 /* 3825 * XXX - should really clean up, but for now we will 3826 * just leak memory and not worry about it. 3827 */ 3828 case M_PAGEDEP: case M_INDIRDEP: case M_INODEDEP: 3829#ifdef DEBUG 3830 printf("Lost %s\n", TYPENAME(wk->wk_type)); 3831#endif 3832 break; 3833 default: 3834 panic("softdep_deallocate_dependencies: bad type"); 3835 } 3836 } 3837} 3838 3839/* 3840 * Function to handle asynchronous write errors in the filesystem. 3841 */ 3842void 3843softdep_error(func, error) 3844 char *func; 3845 int error; 3846{ 3847 3848 /* XXX should do something better! */ 3849 log(LOG_ERR, "%s: got error %d while accessing filesystem\n", 3850 func, error); 3851} 3852