ffs_softdep.c revision 36207
1264790Sbapt/* 2264790Sbapt * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved. 3272955Srodrigc * 4264790Sbapt * The soft updates code is derived from the appendix of a University 5264790Sbapt * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 6264790Sbapt * "Soft Updates: A Solution to the Metadata Update Problem in File 7264790Sbapt * Systems", CSE-TR-254-95, August 1995). 8264790Sbapt * 9264790Sbapt * The following are the copyrights and redistribution conditions that 10264790Sbapt * apply to this copy of the soft update software. For a license 11264790Sbapt * to use, redistribute or sell the soft update software under 12264790Sbapt * conditions other than those described here, please contact the 13264790Sbapt * author at one of the following addresses: 14264790Sbapt * 15264790Sbapt * Marshall Kirk McKusick mckusick@mckusick.com 16 * 1614 Oxford Street +1-510-843-9542 17 * Berkeley, CA 94709-1608 18 * USA 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * 1. Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * 2. Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in the 28 * documentation and/or other materials provided with the distribution. 29 * 3. None of the names of McKusick, Ganger, Patt, or the University of 30 * Michigan may be used to endorse or promote products derived from 31 * this software without specific prior written permission. 32 * 4. Redistributions in any form must be accompanied by information on 33 * how to obtain complete source code for any accompanying software 34 * that uses this software. 
This source code must either be included 35 * in the distribution or be available for no more than the cost of 36 * distribution plus a nominal fee, and must be freely redistributable 37 * under reasonable conditions. For an executable file, complete 38 * source code means the source code for all modules it contains. 39 * It does not mean source code for modules or files that typically 40 * accompany the operating system on which the executable file runs, 41 * e.g., standard library modules or system header files. 42 * 43 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 44 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 45 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 46 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 47 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 53 * SUCH DAMAGE. 54 * 55 * 56 * from: @(#)ffs_softdep.c 9.14 (McKusick) 1/15/98 57 */ 58 59/* 60 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide. 
*/
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <machine/pcpu.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

/*
 * Internal function prototypes.
 */
static void softdep_error __P((char *, int));
static int getdirtybuf __P((struct buf **, int));
static int flush_pagedep_deps __P((struct vnode *, struct mount *,
	    struct diraddhd *));
static int flush_inodedep_deps __P((struct fs *, ino_t));
static int handle_written_filepage __P((struct pagedep *, struct buf *));
static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static void handle_allocdirect_partdone __P((struct allocdirect *));
static void handle_allocindir_partdone __P((struct allocindir *));
static void initiate_write_filepage __P((struct pagedep *, struct buf *));
static void handle_written_mkdir __P((struct mkdir *, int));
static void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
static void handle_workitem_freefile __P((struct freefile *));
static void handle_workitem_remove __P((struct dirrem *));
static struct dirrem *newdirrem __P((struct buf *, struct inode *,
	    struct inode *, int));
static void free_diradd __P((struct diradd *));
static void free_allocindir __P((struct allocindir *, struct inodedep *));
static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	    long *));
static void deallocate_dependencies __P((struct buf *, struct inodedep *));
static void free_allocdirect __P((struct allocdirectlst *,
	    struct allocdirect *, int));
static int free_inodedep __P((struct inodedep *));
static void handle_workitem_freeblocks __P((struct freeblks *));
static void merge_inode_lists __P((struct inodedep *));
static void setup_allocindir_phase2 __P((struct buf *, struct inode *,
	    struct allocindir *));
static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
	    ufs_daddr_t));
static void handle_workitem_freefrag __P((struct freefrag *));
static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
static void allocdirect_merge __P((struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *));
static struct bmsafemap *bmsafemap_lookup __P((struct buf *));
static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	    struct newblk **));
static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	    struct pagedep **));
static void add_to_worklist __P((struct worklist *));

/*
 * Exported softdep operations.
 *
 * The initializers are positional; the trailing comment on each line
 * names the bio_ops slot the routine is meant to fill.
 */
struct bio_ops bioops = {
	softdep_disk_io_initiation,		/* io_start */
	softdep_disk_write_complete,		/* io_complete */
	softdep_deallocate_dependencies,	/* io_deallocate */
	softdep_process_worklist,		/* io_sync */
};

/*
 * Names of malloc types.
 *
 * TYPENAME() yields the memname[] entry for a type number, or "???"
 * when the number is not below M_LAST.
 */
extern char *memname[];
#define TYPENAME(type) ((unsigned)(type) < M_LAST ? memname[type] : "???")

/*
 * Locking primitives.
 *
 * For a uniprocessor, all we need to do is protect against disk
 * interrupts. For a multiprocessor, this lock would have to be
 * a mutex. A single mutex is used throughout this file, though
 * finer grain locking could be used if contention warranted it.
 *
 * For a multiprocessor, the sleep call would accept a lock and
 * release it after the sleep processing was complete.
In a uniprocessor 160 * implementation there is no such interlock, so we simple mark 161 * the places where it needs to be done with the `interlocked' form 162 * of the lock calls. Since the uniprocessor sleep already interlocks 163 * the spl, there is nothing that really needs to be done. 164 */ 165#ifndef /* NOT */ DEBUG 166static struct lockit { 167 int lkt_spl; 168} lk = { 0 }; 169#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio() 170#define FREE_LOCK(lk) splx((lk)->lkt_spl) 171#define ACQUIRE_LOCK_INTERLOCKED(lk) 172#define FREE_LOCK_INTERLOCKED(lk) 173 174#else /* DEBUG */ 175static struct lockit { 176 int lkt_spl; 177 pid_t lkt_held; 178} lk = { 0, -1 }; 179static int lockcnt; 180 181static void acquire_lock __P((struct lockit *)); 182static void free_lock __P((struct lockit *)); 183static void acquire_lock_interlocked __P((struct lockit *)); 184static void free_lock_interlocked __P((struct lockit *)); 185 186#define ACQUIRE_LOCK(lk) acquire_lock(lk) 187#define FREE_LOCK(lk) free_lock(lk) 188#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk) 189#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk) 190 191static void 192acquire_lock(lk) 193 struct lockit *lk; 194{ 195 196 if (lk->lkt_held != -1) 197 if (lk->lkt_held == curproc->p_pid) 198 panic("softdep_lock: locking against myself"); 199 else 200 panic("softdep_lock: lock held by %d", lk->lkt_held); 201 lk->lkt_spl = splbio(); 202 lk->lkt_held = curproc->p_pid; 203 lockcnt++; 204} 205 206static void 207free_lock(lk) 208 struct lockit *lk; 209{ 210 211 if (lk->lkt_held == -1) 212 panic("softdep_unlock: lock not held"); 213 lk->lkt_held = -1; 214 splx(lk->lkt_spl); 215} 216 217static void 218acquire_lock_interlocked(lk) 219 struct lockit *lk; 220{ 221 222 if (lk->lkt_held != -1) 223 if (lk->lkt_held == curproc->p_pid) 224 panic("softdep_lock_interlocked: locking against self"); 225 else 226 panic("softdep_lock_interlocked: lock held by %d", 227 lk->lkt_held); 228 lk->lkt_held = 
curproc->p_pid; 229 lockcnt++; 230} 231 232static void 233free_lock_interlocked(lk) 234 struct lockit *lk; 235{ 236 237 if (lk->lkt_held == -1) 238 panic("softdep_unlock_interlocked: lock not held"); 239 lk->lkt_held = -1; 240} 241#endif /* DEBUG */ 242 243/* 244 * Place holder for real semaphores. 245 */ 246struct sema { 247 int value; 248 pid_t holder; 249 char *name; 250 int prio; 251 int timo; 252}; 253static void sema_init __P((struct sema *, char *, int, int)); 254static int sema_get __P((struct sema *, struct lockit *)); 255static void sema_release __P((struct sema *)); 256 257static void 258sema_init(semap, name, prio, timo) 259 struct sema *semap; 260 char *name; 261 int prio, timo; 262{ 263 264 semap->holder = -1; 265 semap->value = 0; 266 semap->name = name; 267 semap->prio = prio; 268 semap->timo = timo; 269} 270 271static int 272sema_get(semap, interlock) 273 struct sema *semap; 274 struct lockit *interlock; 275{ 276 277 if (semap->value++ > 0) { 278 if (interlock != NULL) 279 FREE_LOCK_INTERLOCKED(interlock); 280 tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo); 281 if (interlock != NULL) { 282 ACQUIRE_LOCK_INTERLOCKED(interlock); 283 FREE_LOCK(interlock); 284 } 285 return (0); 286 } 287 semap->holder = curproc->p_pid; 288 if (interlock != NULL) 289 FREE_LOCK(interlock); 290 return (1); 291} 292 293static void 294sema_release(semap) 295 struct sema *semap; 296{ 297 298 if (semap->value <= 0 || semap->holder != curproc->p_pid) 299 panic("sema_release: not held"); 300 if (--semap->value > 0) { 301 semap->value = 0; 302 wakeup(semap); 303 } 304 semap->holder = -1; 305} 306 307/* 308 * Worklist queue management. 309 * These routines require that the lock be held. 
310 */ 311#ifndef /* NOT */ DEBUG 312#define WORKLIST_INSERT(head, item) do { \ 313 (item)->wk_state |= ONWORKLIST; \ 314 LIST_INSERT_HEAD(head, item, wk_list); \ 315} while (0) 316#define WORKLIST_REMOVE(item) do { \ 317 (item)->wk_state &= ~ONWORKLIST; \ 318 LIST_REMOVE(item, wk_list); \ 319} while (0) 320#define WORKITEM_FREE(item, type) FREE(item, type) 321 322#else /* DEBUG */ 323static void worklist_insert __P((struct workhead *, struct worklist *)); 324static void worklist_remove __P((struct worklist *)); 325static void workitem_free __P((struct worklist *, int)); 326 327#define WORKLIST_INSERT(head, item) worklist_insert(head, item) 328#define WORKLIST_REMOVE(item) worklist_remove(item) 329#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) 330 331static void 332worklist_insert(head, item) 333 struct workhead *head; 334 struct worklist *item; 335{ 336 337 if (lk.lkt_held == -1) 338 panic("worklist_insert: lock not held"); 339 if (item->wk_state & ONWORKLIST) 340 panic("worklist_insert: already on list"); 341 item->wk_state |= ONWORKLIST; 342 LIST_INSERT_HEAD(head, item, wk_list); 343} 344 345static void 346worklist_remove(item) 347 struct worklist *item; 348{ 349 350 if (lk.lkt_held == -1) 351 panic("worklist_remove: lock not held"); 352 if ((item->wk_state & ONWORKLIST) == 0) 353 panic("worklist_remove: not on list"); 354 item->wk_state &= ~ONWORKLIST; 355 LIST_REMOVE(item, wk_list); 356} 357 358static void 359workitem_free(item, type) 360 struct worklist *item; 361 int type; 362{ 363 364 if (item->wk_state & ONWORKLIST) 365 panic("workitem_free: still on list"); 366 if (item->wk_type != type) 367 panic("workitem_free: type mismatch"); 368 FREE(item, type); 369} 370#endif /* DEBUG */ 371 372/* 373 * Workitem queue management 374 */ 375static struct workhead softdep_workitem_pending; 376static int softdep_worklist_busy; 377 378/* 379 * Add an item to the end of the work queue. 380 * This routine requires that the lock be held. 
* This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{
	/*
	 * Most recently appended item.  It is only dereferenced when the
	 * queue is non-empty; this relies on items being removed solely
	 * from the head, so the last item added is still queued then.
	 */
	static struct worklist *worklist_tail;

	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	worklist_tail = wk;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that everything is done in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 *
 * Returns -1 if the queue is marked busy and no mount point was given
 * (checked on entry and again after every item); otherwise returns the
 * number of processed items that belonged to matchmnt.
 */
int
softdep_process_worklist(matchmnt)
	struct mount *matchmnt;
{
	struct worklist *wk;
	struct fs *matchfs;
	int matchcnt;

	matchcnt = 0;
	matchfs = NULL;
	if (matchmnt != NULL)
		matchfs = VFSTOUFS(matchmnt)->um_fs;
	/*
	 * There is no danger of having multiple processes run this
	 * code. It is single threaded solely so that softdep_flushfiles
	 * (below) can get an accurate count of the number of items
	 * related to its mount point that are in the list.
	 */
	if (softdep_worklist_busy && matchmnt == NULL)
		return (-1);
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
		/*
		 * Dequeue under the lock, then drop the lock while the
		 * item's handler runs.
		 */
		WORKLIST_REMOVE(wk);
		FREE_LOCK(&lk);
		switch (wk->wk_type) {

		case M_DIRREM:
			/* removal of a directory entry */
			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
				matchcnt += 1;
			handle_workitem_remove(WK_DIRREM(wk));
			break;

		case M_FREEBLKS:
			/* releasing blocks and/or fragments from a file */
			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freeblocks(WK_FREEBLKS(wk));
			break;

		case M_FREEFRAG:
			/* releasing a fragment when replaced as a file grows */
			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefrag(WK_FREEFRAG(wk));
			break;

		case M_FREEFILE:
			/* releasing an inode when its link count drops to 0 */
			if (WK_FREEFILE(wk)->fx_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefile(WK_FREEFILE(wk));
			break;

		default:
			panic("%s_process_worklist: Unknown type %s",
			    "softdep", TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		/*
		 * The lock is not held here, so bailing out because a
		 * flush has claimed the queue leaves nothing to release.
		 */
		if (softdep_worklist_busy && matchmnt == NULL)
			return (-1);
		ACQUIRE_LOCK(&lk);
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 *
 * Returns 0 on success, the error from ffs_flushfiles() or VOP_FSYNC()
 * if one of them fails, or EBUSY if the retry budget runs out while
 * not unmounting (running out while unmounting panics).
 */
int
softdep_flushfiles(oldmnt, flags, p)
	struct mount *oldmnt;
	int flags;
	struct proc *p;
{
	struct vnode *devvp;
	int error, loopcnt;

	/*
	 * Await our turn to clear out the queue.
	 */
	while (softdep_worklist_busy)
		sleep(&lbolt, PRIBIO);
	softdep_worklist_busy = 1;
	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
		softdep_worklist_busy = 0;
		return (error);
	}
	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	devvp = VFSTOUFS(oldmnt)->um_devvp;
	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
		if (softdep_process_worklist(oldmnt) == 0) {
			/*
			 * Do another flush in case any vnodes were brought in
			 * as part of the cleanup operations.
			 */
			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
				break;
			/*
			 * If we still found nothing to do, we are really done.
			 */
			if (softdep_process_worklist(oldmnt) == 0)
				break;
		}
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_FSYNC(devvp, p->p_cred, MNT_WAIT, p);
		VOP_UNLOCK(devvp, 0, p);
		if (error)
			break;
	}
	softdep_worklist_busy = 0;
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	/* loopcnt reaches 0 only when every pass ran without a break. */
	if (loopcnt == 0) {
		if (oldmnt->mnt_flag & MNT_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures.
Lookup of newblk structures is needed to 556 * find newly allocated blocks when trying to associate them with 557 * their allocdirect or allocindir structure. 558 * 559 * The lookup routines optionally create and hash a new instance when 560 * an existing entry is not found. 561 */ 562#define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 563 564/* 565 * Structures and routines associated with pagedep caching. 566 */ 567LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 568u_long pagedep_hash; /* size of hash table - 1 */ 569#define PAGEDEP_HASH(mp, inum, lbn) \ 570 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 571 pagedep_hash]) 572static struct sema pagedep_in_progress; 573 574/* 575 * Look up a pagedep. Return 1 if found, 0 if not found. 576 * If not found, allocate if DEPALLOC flag is passed. 577 * Found or allocated entry is returned in pagedeppp. 578 * This routine must be called with splbio interrupts blocked. 579 */ 580static int 581pagedep_lookup(ip, lbn, flags, pagedeppp) 582 struct inode *ip; 583 ufs_lbn_t lbn; 584 int flags; 585 struct pagedep **pagedeppp; 586{ 587 struct pagedep *pagedep; 588 struct pagedep_hashhead *pagedephd; 589 struct mount *mp; 590 int i; 591 592#ifdef DEBUG 593 if (lk.lkt_held == -1) 594 panic("pagedep_lookup: lock not held"); 595#endif 596 mp = ITOV(ip)->v_mount; 597 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); 598top: 599 for (pagedep = LIST_FIRST(pagedephd); pagedep; 600 pagedep = LIST_NEXT(pagedep, pd_hash)) 601 if (ip->i_number == pagedep->pd_ino && 602 lbn == pagedep->pd_lbn && 603 mp == pagedep->pd_mnt) 604 break; 605 if (pagedep) { 606 *pagedeppp = pagedep; 607 return (1); 608 } 609 if ((flags & DEPALLOC) == 0) { 610 *pagedeppp = NULL; 611 return (0); 612 } 613 if (sema_get(&pagedep_in_progress, &lk) == 0) { 614 ACQUIRE_LOCK(&lk); 615 goto top; 616 } 617 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, 618 M_WAITOK); 619 bzero(pagedep, sizeof(struct pagedep)); 
620 pagedep->pd_list.wk_type = M_PAGEDEP; 621 pagedep->pd_mnt = mp; 622 pagedep->pd_ino = ip->i_number; 623 pagedep->pd_lbn = lbn; 624 LIST_INIT(&pagedep->pd_dirremhd); 625 LIST_INIT(&pagedep->pd_pendinghd); 626 for (i = 0; i < DAHASHSZ; i++) 627 LIST_INIT(&pagedep->pd_diraddhd[i]); 628 ACQUIRE_LOCK(&lk); 629 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 630 sema_release(&pagedep_in_progress); 631 *pagedeppp = pagedep; 632 return (0); 633} 634 635/* 636 * Structures and routines associated with inodedep caching. 637 */ 638LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 639u_long inodedep_hash; /* size of hash table - 1 */ 640#define INODEDEP_HASH(fs, inum) \ 641 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 642static struct sema inodedep_in_progress; 643 644/* 645 * Look up a inodedep. Return 1 if found, 0 if not found. 646 * If not found, allocate if DEPALLOC flag is passed. 647 * Found or allocated entry is returned in inodedeppp. 648 * This routine must be called with splbio interrupts blocked. 
649 */ 650static int 651inodedep_lookup(fs, inum, flags, inodedeppp) 652 struct fs *fs; 653 ino_t inum; 654 int flags; 655 struct inodedep **inodedeppp; 656{ 657 struct inodedep *inodedep; 658 struct inodedep_hashhead *inodedephd; 659 660#ifdef DEBUG 661 if (lk.lkt_held == -1) 662 panic("inodedep_lookup: lock not held"); 663#endif 664 inodedephd = INODEDEP_HASH(fs, inum); 665top: 666 for (inodedep = LIST_FIRST(inodedephd); inodedep; 667 inodedep = LIST_NEXT(inodedep, id_hash)) 668 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 669 break; 670 if (inodedep) { 671 *inodedeppp = inodedep; 672 return (1); 673 } 674 if ((flags & DEPALLOC) == 0) { 675 *inodedeppp = NULL; 676 return (0); 677 } 678 if (sema_get(&inodedep_in_progress, &lk) == 0) { 679 ACQUIRE_LOCK(&lk); 680 goto top; 681 } 682 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), 683 M_INODEDEP, M_WAITOK); 684 inodedep->id_list.wk_type = M_INODEDEP; 685 inodedep->id_fs = fs; 686 inodedep->id_ino = inum; 687 inodedep->id_state = ALLCOMPLETE; 688 inodedep->id_nlinkdelta = 0; 689 inodedep->id_savedino = NULL; 690 inodedep->id_savedsize = -1; 691 inodedep->id_buf = NULL; 692 LIST_INIT(&inodedep->id_pendinghd); 693 LIST_INIT(&inodedep->id_inowait); 694 TAILQ_INIT(&inodedep->id_inoupdt); 695 TAILQ_INIT(&inodedep->id_newinoupdt); 696 ACQUIRE_LOCK(&lk); 697 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 698 sema_release(&inodedep_in_progress); 699 *inodedeppp = inodedep; 700 return (0); 701} 702 703/* 704 * Structures and routines associated with newblk caching. 705 */ 706LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 707u_long newblk_hash; /* size of hash table - 1 */ 708#define NEWBLK_HASH(fs, inum) \ 709 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 710static struct sema newblk_in_progress; 711 712/* 713 * Look up a newblk. Return 1 if found, 0 if not found. 714 * If not found, allocate if DEPALLOC flag is passed. 
715 * Found or allocated entry is returned in newblkpp. 716 */ 717static int 718newblk_lookup(fs, newblkno, flags, newblkpp) 719 struct fs *fs; 720 ufs_daddr_t newblkno; 721 int flags; 722 struct newblk **newblkpp; 723{ 724 struct newblk *newblk; 725 struct newblk_hashhead *newblkhd; 726 727 newblkhd = NEWBLK_HASH(fs, newblkno); 728top: 729 for (newblk = LIST_FIRST(newblkhd); newblk; 730 newblk = LIST_NEXT(newblk, nb_hash)) 731 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) 732 break; 733 if (newblk) { 734 *newblkpp = newblk; 735 return (1); 736 } 737 if ((flags & DEPALLOC) == 0) { 738 *newblkpp = NULL; 739 return (0); 740 } 741 if (sema_get(&newblk_in_progress, 0) == 0) 742 goto top; 743 MALLOC(newblk, struct newblk *, sizeof(struct newblk), 744 M_NEWBLK, M_WAITOK); 745 newblk->nb_state = 0; 746 newblk->nb_fs = fs; 747 newblk->nb_newblkno = newblkno; 748 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 749 sema_release(&newblk_in_progress); 750 *newblkpp = newblk; 751 return (0); 752} 753 754/* 755 * Executed during filesystem system initialization before 756 * mounting any file systems. 757 */ 758void 759softdep_initialize() 760{ 761 762 LIST_INIT(&mkdirlisthd); 763 LIST_INIT(&softdep_workitem_pending); 764 pagedep_hashtbl = hashinit(desiredvnodes / 10, M_PAGEDEP, 765 &pagedep_hash); 766 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); 767 inodedep_hashtbl = hashinit(desiredvnodes / 2, M_INODEDEP, 768 &inodedep_hash); 769 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0); 770 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); 771 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); 772} 773 774/* 775 * Called at mount time to notify the dependency code that a 776 * filesystem wishes to use it. 
777 */ 778int 779softdep_mount(devvp, mp, fs, cred) 780 struct vnode *devvp; 781 struct mount *mp; 782 struct fs *fs; 783 struct ucred *cred; 784{ 785 struct csum cstotal; 786 struct cg *cgp; 787 struct buf *bp; 788 int error, cyl; 789 790 mp->mnt_flag |= MNT_SOFTDEP; 791 /* 792 * When doing soft updates, the counters in the 793 * superblock may have gotten out of sync, so we have 794 * to scan the cylinder groups and recalculate them. 795 */ 796 if (fs->fs_clean != 0) 797 return (0); 798 bzero(&cstotal, sizeof cstotal); 799 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 800 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 801 fs->fs_cgsize, cred, &bp)) != 0) { 802 brelse(bp); 803 return (error); 804 } 805 cgp = (struct cg *)bp->b_data; 806 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 807 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 808 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 809 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 810 fs->fs_cs(fs, cyl) = cgp->cg_cs; 811 brelse(bp); 812 } 813#ifdef DEBUG 814 if (!bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 815 printf("ffs_mountfs: superblock updated\n"); 816#endif 817 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 818 return (0); 819} 820 821/* 822 * Protecting the freemaps (or bitmaps). 823 * 824 * To eliminate the need to execute fsck before mounting a file system 825 * after a power failure, one must (conservatively) guarantee that the 826 * on-disk copy of the bitmaps never indicate that a live inode or block is 827 * free. So, when a block or inode is allocated, the bitmap should be 828 * updated (on disk) before any new pointers. When a block or inode is 829 * freed, the bitmap should not be updated until all pointers have been 830 * reset. The latter dependency is handled by the delayed de-allocation 831 * approach described below for block and inode de-allocation. The former 832 * dependency is handled by calling the following procedure when a block or 833 * inode is allocated. 
When an inode is allocated an "inodedep" is created 834 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 835 * Each "inodedep" is also inserted into the hash indexing structure so 836 * that any additional link additions can be made dependent on the inode 837 * allocation. 838 * 839 * The ufs file system maintains a number of free block counts (e.g., per 840 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 841 * in addition to the bitmaps. These counts are used to improve efficiency 842 * during allocation and therefore must be consistent with the bitmaps. 843 * There is no convenient way to guarantee post-crash consistency of these 844 * counts with simple update ordering, for two main reasons: (1) The counts 845 * and bitmaps for a single cylinder group block are not in the same disk 846 * sector. If a disk write is interrupted (e.g., by power failure), one may 847 * be written and the other not. (2) Some of the counts are located in the 848 * superblock rather than the cylinder group block. So, we focus our soft 849 * updates implementation on protecting the bitmaps. When mounting a 850 * filesystem, we recompute the auxiliary counts from the bitmaps. 851 */ 852 853/* 854 * Called just after updating the cylinder group block to allocate an inode. 855 */ 856void 857softdep_setup_inomapdep(bp, ip, newinum) 858 struct buf *bp; /* buffer for cylgroup block with inode map */ 859 struct inode *ip; /* inode related to allocation */ 860 ino_t newinum; /* new inode number being allocated */ 861{ 862 struct inodedep *inodedep; 863 struct bmsafemap *bmsafemap; 864 865 /* 866 * Create a dependency for the newly allocated inode. 867 * Panic if it already exists as something is seriously wrong. 868 * Otherwise add it to the dependency list for the buffer holding 869 * the cylinder group map from which it was allocated. 
870 */ 871 ACQUIRE_LOCK(&lk); 872 if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0) 873 panic("softdep_setup_inomapdep: found inode"); 874 inodedep->id_buf = bp; 875 inodedep->id_state &= ~DEPCOMPLETE; 876 bmsafemap = bmsafemap_lookup(bp); 877 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 878 FREE_LOCK(&lk); 879} 880 881/* 882 * Called just after updating the cylinder group block to 883 * allocate block or fragment. 884 */ 885void 886softdep_setup_blkmapdep(bp, fs, newblkno) 887 struct buf *bp; /* buffer for cylgroup block with block map */ 888 struct fs *fs; /* filesystem doing allocation */ 889 ufs_daddr_t newblkno; /* number of newly allocated block */ 890{ 891 struct newblk *newblk; 892 struct bmsafemap *bmsafemap; 893 894 /* 895 * Create a dependency for the newly allocated block. 896 * Add it to the dependency list for the buffer holding 897 * the cylinder group map from which it was allocated. 898 */ 899 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) 900 panic("softdep_setup_blkmapdep: found block"); 901 ACQUIRE_LOCK(&lk); 902 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp); 903 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 904 FREE_LOCK(&lk); 905} 906 907/* 908 * Find the bmsafemap associated with a cylinder group buffer. 909 * If none exists, create one. The buffer must be locked when 910 * this routine is called and this routine must be called with 911 * splbio interrupts blocked. 
912 */ 913static struct bmsafemap * 914bmsafemap_lookup(bp) 915 struct buf *bp; 916{ 917 struct bmsafemap *bmsafemap; 918 struct worklist *wk; 919 920#ifdef DEBUG 921 if (lk.lkt_held == -1) 922 panic("bmsafemap_lookup: lock not held"); 923#endif 924 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) 925 if (wk->wk_type == M_BMSAFEMAP) 926 return (WK_BMSAFEMAP(wk)); 927 FREE_LOCK(&lk); 928 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), 929 M_BMSAFEMAP, M_WAITOK); 930 bmsafemap->sm_list.wk_type = M_BMSAFEMAP; 931 bmsafemap->sm_list.wk_state = 0; 932 bmsafemap->sm_buf = bp; 933 LIST_INIT(&bmsafemap->sm_allocdirecthd); 934 LIST_INIT(&bmsafemap->sm_allocindirhd); 935 LIST_INIT(&bmsafemap->sm_inodedephd); 936 LIST_INIT(&bmsafemap->sm_newblkhd); 937 ACQUIRE_LOCK(&lk); 938 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 939 return (bmsafemap); 940} 941 942/* 943 * Direct block allocation dependencies. 944 * 945 * When a new block is allocated, the corresponding disk locations must be 946 * initialized (with zeros or new data) before the on-disk inode points to 947 * them. Also, the freemap from which the block was allocated must be 948 * updated (on disk) before the inode's pointer. These two dependencies are 949 * independent of each other and are needed for all file blocks and indirect 950 * blocks that are pointed to directly by the inode. Just before the 951 * "in-core" version of the inode is updated with a newly allocated block 952 * number, a procedure (below) is called to setup allocation dependency 953 * structures. These structures are removed when the corresponding 954 * dependencies are satisfied or when the block allocation becomes obsolete 955 * (i.e., the file is deleted, the block is de-allocated, or the block is a 956 * fragment that gets upgraded). All of these cases are handled in 957 * procedures described later. 
958 * 959 * When a file extension causes a fragment to be upgraded, either to a larger 960 * fragment or to a full block, the on-disk location may change (if the 961 * previous fragment could not simply be extended). In this case, the old 962 * fragment must be de-allocated, but not until after the inode's pointer has 963 * been updated. In most cases, this is handled by later procedures, which 964 * will construct a "freefrag" structure to be added to the workitem queue 965 * when the inode update is complete (or obsolete). The main exception to 966 * this is when an allocation occurs while a pending allocation dependency 967 * (for the same block pointer) remains. This case is handled in the main 968 * allocation dependency setup procedure by immediately freeing the 969 * unreferenced fragments. 970 */ 971void 972softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 973 struct inode *ip; /* inode to which block is being added */ 974 ufs_lbn_t lbn; /* block pointer within inode */ 975 ufs_daddr_t newblkno; /* disk block number being added */ 976 ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */ 977 long newsize; /* size of new block */ 978 long oldsize; /* size of new block */ 979 struct buf *bp; /* bp for allocated block */ 980{ 981 struct allocdirect *adp, *oldadp; 982 struct allocdirectlst *adphead; 983 struct bmsafemap *bmsafemap; 984 struct inodedep *inodedep; 985 struct pagedep *pagedep; 986 struct newblk *newblk; 987 988 MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), 989 M_ALLOCDIRECT, M_WAITOK); 990 bzero(adp, sizeof(struct allocdirect)); 991 adp->ad_list.wk_type = M_ALLOCDIRECT; 992 adp->ad_lbn = lbn; 993 adp->ad_newblkno = newblkno; 994 adp->ad_oldblkno = oldblkno; 995 adp->ad_newsize = newsize; 996 adp->ad_oldsize = oldsize; 997 adp->ad_state = ATTACHED; 998 if (newblkno == oldblkno) 999 adp->ad_freefrag = NULL; 1000 else 1001 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); 1002 1003 if 
(newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) 1004 panic("softdep_setup_allocdirect: lost block"); 1005 1006 ACQUIRE_LOCK(&lk); 1007 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); 1008 adp->ad_inodedep = inodedep; 1009 1010 if (newblk->nb_state == DEPCOMPLETE) { 1011 adp->ad_state |= DEPCOMPLETE; 1012 adp->ad_buf = NULL; 1013 } else { 1014 bmsafemap = newblk->nb_bmsafemap; 1015 adp->ad_buf = bmsafemap->sm_buf; 1016 LIST_REMOVE(newblk, nb_deps); 1017 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); 1018 } 1019 LIST_REMOVE(newblk, nb_hash); 1020 FREE(newblk, M_NEWBLK); 1021 1022 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); 1023 if (lbn >= NDADDR) { 1024 /* allocating an indirect block */ 1025 if (oldblkno != 0) 1026 panic("softdep_setup_allocdirect: non-zero indir"); 1027 } else { 1028 /* 1029 * Allocating a direct block. 1030 * 1031 * If we are allocating a directory block, then we must 1032 * allocate an associated pagedep to track additions and 1033 * deletions. 1034 */ 1035 if ((ip->i_mode & IFMT) == IFDIR && 1036 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) 1037 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 1038 } 1039 /* 1040 * The list of allocdirects must be kept in sorted and ascending 1041 * order so that the rollback routines can quickly determine the 1042 * first uncommitted block (the size of the file stored on disk 1043 * ends at the end of the lowest committed fragment, or if there 1044 * are no fragments, at the end of the highest committed block). 1045 * Since files generally grow, the typical case is that the new 1046 * block is to be added at the end of the list. We speed this 1047 * special case by checking against the last allocdirect in the 1048 * list before laboriously traversing the list looking for the 1049 * insertion point. 
1050 */ 1051 adphead = &inodedep->id_newinoupdt; 1052 oldadp = TAILQ_LAST(adphead, allocdirectlst); 1053 if (oldadp == NULL || oldadp->ad_lbn <= lbn) { 1054 /* insert at end of list */ 1055 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 1056 if (oldadp != NULL && oldadp->ad_lbn == lbn) 1057 allocdirect_merge(adphead, adp, oldadp); 1058 FREE_LOCK(&lk); 1059 return; 1060 } 1061 for (oldadp = TAILQ_FIRST(adphead); oldadp; 1062 oldadp = TAILQ_NEXT(oldadp, ad_next)) { 1063 if (oldadp->ad_lbn >= lbn) 1064 break; 1065 } 1066 if (oldadp == NULL) 1067 panic("softdep_setup_allocdirect: lost entry"); 1068 /* insert in middle of list */ 1069 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 1070 if (oldadp->ad_lbn == lbn) 1071 allocdirect_merge(adphead, adp, oldadp); 1072 FREE_LOCK(&lk); 1073} 1074 1075/* 1076 * Replace an old allocdirect dependency with a newer one. 1077 * This routine must be called with splbio interrupts blocked. 1078 */ 1079static void 1080allocdirect_merge(adphead, newadp, oldadp) 1081 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 1082 struct allocdirect *newadp; /* allocdirect being added */ 1083 struct allocdirect *oldadp; /* existing allocdirect being checked */ 1084{ 1085 struct freefrag *freefrag; 1086 1087#ifdef DEBUG 1088 if (lk.lkt_held == -1) 1089 panic("allocdirect_merge: lock not held"); 1090#endif 1091 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 1092 newadp->ad_oldsize != oldadp->ad_newsize || 1093 newadp->ad_lbn >= NDADDR) 1094 panic("allocdirect_check: old %d != new %d || lbn %d >= %d", 1095 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn, 1096 NDADDR); 1097 newadp->ad_oldblkno = oldadp->ad_oldblkno; 1098 newadp->ad_oldsize = oldadp->ad_oldsize; 1099 /* 1100 * If the old dependency had a fragment to free or had never 1101 * previously had a block allocated, then the new dependency 1102 * can immediately post its freefrag and adopt the old freefrag. 1103 * This action is done by swapping the freefrag dependencies. 
1104 * The new dependency gains the old one's freefrag, and the 1105 * old one gets the new one and then immediately puts it on 1106 * the worklist when it is freed by free_allocdirect. It is 1107 * not possible to do this swap when the old dependency had a 1108 * non-zero size but no previous fragment to free. This condition 1109 * arises when the new block is an extension of the old block. 1110 * Here, the first part of the fragment allocated to the new 1111 * dependency is part of the block currently claimed on disk by 1112 * the old dependency, so cannot legitimately be freed until the 1113 * conditions for the new dependency are fulfilled. 1114 */ 1115 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 1116 freefrag = newadp->ad_freefrag; 1117 newadp->ad_freefrag = oldadp->ad_freefrag; 1118 oldadp->ad_freefrag = freefrag; 1119 } 1120 free_allocdirect(adphead, oldadp, 0); 1121} 1122 1123/* 1124 * Allocate a new freefrag structure if needed. 1125 */ 1126static struct freefrag * 1127newfreefrag(ip, blkno, size) 1128 struct inode *ip; 1129 ufs_daddr_t blkno; 1130 long size; 1131{ 1132 struct freefrag *freefrag; 1133 struct fs *fs; 1134 1135 if (blkno == 0) 1136 return (NULL); 1137 fs = ip->i_fs; 1138 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 1139 panic("newfreefrag: frag size"); 1140 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), 1141 M_FREEFRAG, M_WAITOK); 1142 freefrag->ff_list.wk_type = M_FREEFRAG; 1143 freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ 1144 freefrag->ff_inum = ip->i_number; 1145 freefrag->ff_fs = fs; 1146 freefrag->ff_devvp = ip->i_devvp; 1147 freefrag->ff_blkno = blkno; 1148 freefrag->ff_fragsize = size; 1149 return (freefrag); 1150} 1151 1152/* 1153 * This workitem de-allocates fragments that were replaced during 1154 * file block allocation. 
1155 */ 1156static void 1157handle_workitem_freefrag(freefrag) 1158 struct freefrag *freefrag; 1159{ 1160 struct inode tip; 1161 1162 tip.i_fs = freefrag->ff_fs; 1163 tip.i_devvp = freefrag->ff_devvp; 1164 tip.i_dev = freefrag->ff_devvp->v_rdev; 1165 tip.i_number = freefrag->ff_inum; 1166 tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */ 1167 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize); 1168 FREE(freefrag, M_FREEFRAG); 1169} 1170 1171/* 1172 * Indirect block allocation dependencies. 1173 * 1174 * The same dependencies that exist for a direct block also exist when 1175 * a new block is allocated and pointed to by an entry in a block of 1176 * indirect pointers. The undo/redo states described above are also 1177 * used here. Because an indirect block contains many pointers that 1178 * may have dependencies, a second copy of the entire in-memory indirect 1179 * block is kept. The buffer cache copy is always completely up-to-date. 1180 * The second copy, which is used only as a source for disk writes, 1181 * contains only the safe pointers (i.e., those that have no remaining 1182 * update dependencies). The second copy is freed when all pointers 1183 * are safe. The cache is not allowed to replace indirect blocks with 1184 * pending update dependencies. If a buffer containing an indirect 1185 * block with dependencies is written, these routines will mark it 1186 * dirty again. It can only be successfully written once all the 1187 * dependencies are removed. The ffs_fsync routine in conjunction with 1188 * softdep_sync_metadata work together to get all the dependencies 1189 * removed so that a file can be successfully written to disk. Three 1190 * procedures are used when setting up indirect block pointer 1191 * dependencies. The division is necessary because of the organization 1192 * of the "balloc" routine and because of the distinction between file 1193 * pages and file metadata blocks. 
1194 */ 1195 1196/* 1197 * Allocate a new allocindir structure. 1198 */ 1199static struct allocindir * 1200newallocindir(ip, ptrno, newblkno, oldblkno) 1201 struct inode *ip; /* inode for file being extended */ 1202 int ptrno; /* offset of pointer in indirect block */ 1203 ufs_daddr_t newblkno; /* disk block number being added */ 1204 ufs_daddr_t oldblkno; /* previous block number, 0 if none */ 1205{ 1206 struct allocindir *aip; 1207 1208 MALLOC(aip, struct allocindir *, sizeof(struct allocindir), 1209 M_ALLOCINDIR, M_WAITOK); 1210 bzero(aip, sizeof(struct allocindir)); 1211 aip->ai_list.wk_type = M_ALLOCINDIR; 1212 aip->ai_state = ATTACHED; 1213 aip->ai_offset = ptrno; 1214 aip->ai_newblkno = newblkno; 1215 aip->ai_oldblkno = oldblkno; 1216 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); 1217 return (aip); 1218} 1219 1220/* 1221 * Called just before setting an indirect block pointer 1222 * to a newly allocated file page. 1223 */ 1224void 1225softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 1226 struct inode *ip; /* inode for file being extended */ 1227 ufs_lbn_t lbn; /* allocated block number within file */ 1228 struct buf *bp; /* buffer with indirect blk referencing page */ 1229 int ptrno; /* offset of pointer in indirect block */ 1230 ufs_daddr_t newblkno; /* disk block number being added */ 1231 ufs_daddr_t oldblkno; /* previous block number, 0 if none */ 1232 struct buf *nbp; /* buffer holding allocated page */ 1233{ 1234 struct allocindir *aip; 1235 struct pagedep *pagedep; 1236 1237 aip = newallocindir(ip, ptrno, newblkno, oldblkno); 1238 ACQUIRE_LOCK(&lk); 1239 /* 1240 * If we are allocating a directory page, then we must 1241 * allocate an associated pagedep to track additions and 1242 * deletions. 
1243 */ 1244 if ((ip->i_mode & IFMT) == IFDIR && 1245 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) 1246 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); 1247 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); 1248 FREE_LOCK(&lk); 1249 setup_allocindir_phase2(bp, ip, aip); 1250} 1251 1252/* 1253 * Called just before setting an indirect block pointer to a 1254 * newly allocated indirect block. 1255 */ 1256void 1257softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 1258 struct buf *nbp; /* newly allocated indirect block */ 1259 struct inode *ip; /* inode for file being extended */ 1260 struct buf *bp; /* indirect block referencing allocated block */ 1261 int ptrno; /* offset of pointer in indirect block */ 1262 ufs_daddr_t newblkno; /* disk block number being added */ 1263{ 1264 struct allocindir *aip; 1265 1266 aip = newallocindir(ip, ptrno, newblkno, 0); 1267 ACQUIRE_LOCK(&lk); 1268 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); 1269 FREE_LOCK(&lk); 1270 setup_allocindir_phase2(bp, ip, aip); 1271} 1272 1273/* 1274 * Called to finish the allocation of the "aip" allocated 1275 * by one of the two routines above. 
1276 */ 1277static void 1278setup_allocindir_phase2(bp, ip, aip) 1279 struct buf *bp; /* in-memory copy of the indirect block */ 1280 struct inode *ip; /* inode for file being extended */ 1281 struct allocindir *aip; /* allocindir allocated by the above routines */ 1282{ 1283 struct worklist *wk; 1284 struct indirdep *indirdep, *newindirdep; 1285 struct bmsafemap *bmsafemap; 1286 struct allocindir *oldaip; 1287 struct freefrag *freefrag; 1288 struct newblk *newblk; 1289 1290 if (bp->b_lblkno >= 0) 1291 panic("setup_allocindir_phase2: not indir blk"); 1292 for (indirdep = NULL, newindirdep = NULL; ; ) { 1293 ACQUIRE_LOCK(&lk); 1294 for (wk = LIST_FIRST(&bp->b_dep); wk; 1295 wk = LIST_NEXT(wk, wk_list)) { 1296 if (wk->wk_type != M_INDIRDEP) 1297 continue; 1298 indirdep = WK_INDIRDEP(wk); 1299 break; 1300 } 1301 if (indirdep == NULL && newindirdep) { 1302 indirdep = newindirdep; 1303 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 1304 newindirdep = NULL; 1305 } 1306 FREE_LOCK(&lk); 1307 if (indirdep) { 1308 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, 1309 &newblk) == 0) 1310 panic("setup_allocindir: lost block"); 1311 ACQUIRE_LOCK(&lk); 1312 if (newblk->nb_state == DEPCOMPLETE) { 1313 aip->ai_state |= DEPCOMPLETE; 1314 aip->ai_buf = NULL; 1315 } else { 1316 bmsafemap = newblk->nb_bmsafemap; 1317 aip->ai_buf = bmsafemap->sm_buf; 1318 LIST_REMOVE(newblk, nb_deps); 1319 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, 1320 aip, ai_deps); 1321 } 1322 LIST_REMOVE(newblk, nb_hash); 1323 FREE(newblk, M_NEWBLK); 1324 aip->ai_indirdep = indirdep; 1325 /* 1326 * Check to see if there is an existing dependency 1327 * for this block. If there is, merge the old 1328 * dependency into the new one. 
1329 */ 1330 if (aip->ai_oldblkno == 0) 1331 oldaip = NULL; 1332 else 1333 for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd); 1334 oldaip; oldaip = LIST_NEXT(oldaip, ai_next)) 1335 if (oldaip->ai_offset == aip->ai_offset) 1336 break; 1337 if (oldaip != NULL) { 1338 if (oldaip->ai_newblkno != aip->ai_oldblkno) 1339 panic("setup_allocindir_phase2: blkno"); 1340 aip->ai_oldblkno = oldaip->ai_oldblkno; 1341 freefrag = oldaip->ai_freefrag; 1342 oldaip->ai_freefrag = aip->ai_freefrag; 1343 aip->ai_freefrag = freefrag; 1344 free_allocindir(oldaip, NULL); 1345 } 1346 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 1347 ((ufs_daddr_t *)indirdep->ir_savebp->b_data) 1348 [aip->ai_offset] = aip->ai_oldblkno; 1349 FREE_LOCK(&lk); 1350 } 1351 if (newindirdep) { 1352 if (indirdep->ir_savebp != NULL) 1353 brelse(newindirdep->ir_savebp); 1354 WORKITEM_FREE((caddr_t)newindirdep, M_INDIRDEP); 1355 } 1356 if (indirdep) 1357 break; 1358 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), 1359 M_INDIRDEP, M_WAITOK); 1360 newindirdep->ir_list.wk_type = M_INDIRDEP; 1361 newindirdep->ir_state = ATTACHED; 1362 LIST_INIT(&newindirdep->ir_deplisthd); 1363 LIST_INIT(&newindirdep->ir_donehd); 1364 newindirdep->ir_saveddata = (ufs_daddr_t *)bp->b_data; 1365 newindirdep->ir_savebp = 1366 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); 1367 bcopy((caddr_t)newindirdep->ir_saveddata, 1368 newindirdep->ir_savebp->b_data, bp->b_bcount); 1369 } 1370} 1371 1372/* 1373 * Block de-allocation dependencies. 1374 * 1375 * When blocks are de-allocated, the on-disk pointers must be nullified before 1376 * the blocks are made available for use by other files. (The true 1377 * requirement is that old pointers must be nullified before new on-disk 1378 * pointers are set. We chose this slightly more stringent requirement to 1379 * reduce complexity.) 
Our implementation handles this dependency by updating 1380 * the inode (or indirect block) appropriately but delaying the actual block 1381 * de-allocation (i.e., freemap and free space count manipulation) until 1382 * after the updated versions reach stable storage. After the disk is 1383 * updated, the blocks can be safely de-allocated whenever it is convenient. 1384 * This implementation handles only the common case of reducing a file's 1385 * length to zero. Other cases are handled by the conventional synchronous 1386 * write approach. 1387 * 1388 * The ffs implementation with which we worked double-checks 1389 * the state of the block pointers and file size as it reduces 1390 * a file's length. Some of this code is replicated here in our 1391 * soft updates implementation. The freeblks->fb_chkcnt field is 1392 * used to transfer a part of this information to the procedure 1393 * that eventually de-allocates the blocks. 1394 * 1395 * This routine should be called from the routine that shortens 1396 * a file's length, before the inode's size or block pointers 1397 * are modified. It will save the block pointer information for 1398 * later release and zero the inode so that the calling routine 1399 * can release it. 
1400 */ 1401void 1402softdep_setup_freeblocks(ip, length) 1403 struct inode *ip; /* The inode whose length is to be reduced */ 1404 off_t length; /* The new length for the file */ 1405{ 1406 struct freeblks *freeblks; 1407 struct inodedep *inodedep; 1408 struct allocdirect *adp; 1409 struct vnode *vp; 1410 struct buf *bp; 1411 struct fs *fs; 1412 int i, error; 1413 1414 fs = ip->i_fs; 1415 if (length != 0) 1416 panic("softde_setup_freeblocks: non-zero length"); 1417 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), 1418 M_FREEBLKS, M_WAITOK); 1419 bzero(freeblks, sizeof(struct freeblks)); 1420 freeblks->fb_list.wk_type = M_FREEBLKS; 1421 freeblks->fb_uid = ip->i_uid; 1422 freeblks->fb_previousinum = ip->i_number; 1423 freeblks->fb_devvp = ip->i_devvp; 1424 freeblks->fb_fs = fs; 1425 freeblks->fb_oldsize = ip->i_size; 1426 freeblks->fb_newsize = length; 1427 freeblks->fb_chkcnt = ip->i_blocks; 1428 for (i = 0; i < NDADDR; i++) { 1429 freeblks->fb_dblks[i] = ip->i_db[i]; 1430 ip->i_db[i] = 0; 1431 } 1432 for (i = 0; i < NIADDR; i++) { 1433 freeblks->fb_iblks[i] = ip->i_ib[i]; 1434 ip->i_ib[i] = 0; 1435 } 1436 ip->i_blocks = 0; 1437 ip->i_size = 0; 1438 /* 1439 * Push the zero'ed inode to to its disk buffer so that we are free 1440 * to delete its dependencies below. Once the dependencies are gone 1441 * the buffer can be safely released. 1442 */ 1443 if ((error = bread(ip->i_devvp, 1444 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 1445 (int)fs->fs_bsize, NOCRED, &bp)) != 0) 1446 softdep_error("softdep_setup_freeblocks", error); 1447 *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = 1448 ip->i_din; 1449 /* 1450 * Find and eliminate any inode dependencies. 
1451 */ 1452 ACQUIRE_LOCK(&lk); 1453 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep); 1454 if ((inodedep->id_state & IOSTARTED) != 0) 1455 panic("softdep_setup_freeblocks: inode busy"); 1456 /* 1457 * Add the freeblks structure to the list of operations that 1458 * must await the zero'ed inode being written to disk. 1459 */ 1460 WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list); 1461 /* 1462 * Because the file length has been truncated to zero, any 1463 * pending block allocation dependency structures associated 1464 * with this inode are obsolete and can simply be de-allocated. 1465 * We must first merge the two dependency lists to get rid of 1466 * any duplicate freefrag structures, then purge the merged list. 1467 */ 1468 merge_inode_lists(inodedep); 1469 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 1470 free_allocdirect(&inodedep->id_inoupdt, adp, 1); 1471 bdwrite(bp); 1472 /* 1473 * We must wait for any I/O in progress to finish so that 1474 * all potential buffers on the dirty list will be visible. 1475 * Once they are all there, walk the list and get rid of 1476 * any dependencies. 1477 */ 1478 vp = ITOV(ip); 1479 while (vp->v_numoutput) { 1480 vp->v_flag |= VBWAIT; 1481 FREE_LOCK_INTERLOCKED(&lk); 1482 sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); 1483 ACQUIRE_LOCK_INTERLOCKED(&lk); 1484 } 1485 while (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { 1486 bp = LIST_FIRST(&vp->v_dirtyblkhd); 1487 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); 1488 deallocate_dependencies(bp, inodedep); 1489 bp->b_flags |= B_INVAL; 1490 brelse(bp); 1491 } 1492 /* 1493 * Try freeing the inodedep in case that was the last dependency. 1494 */ 1495 if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0) 1496 (void) free_inodedep(inodedep); 1497 FREE_LOCK(&lk); 1498} 1499 1500/* 1501 * Reclaim any dependency structures from a buffer that is about to 1502 * be reallocated to a new vnode. 
The buffer must be locked, thus, 1503 * no I/O completion operations can occur while we are manipulating 1504 * its associated dependencies. The mutex is held so that other I/O's 1505 * associated with related dependencies do not occur. 1506 */ 1507static void 1508deallocate_dependencies(bp, inodedep) 1509 struct buf *bp; 1510 struct inodedep *inodedep; 1511{ 1512 struct worklist *wk; 1513 struct indirdep *indirdep; 1514 struct allocindir *aip; 1515 struct pagedep *pagedep; 1516 struct dirrem *dirrem; 1517 struct diradd *dap; 1518 long tmpsize; 1519 caddr_t tmp; 1520 int i; 1521 1522 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 1523 switch (wk->wk_type) { 1524 1525 case M_INDIRDEP: 1526 indirdep = WK_INDIRDEP(wk); 1527 /* 1528 * None of the indirect pointers will ever be visible, 1529 * so they can simply be tossed. GOINGAWAY ensures 1530 * that allocated pointers will be saved in the buffer 1531 * cache until they are freed. Note that they will 1532 * only be able to be found by their physical address 1533 * since the inode mapping the logical address will 1534 * be gone. The save buffer used for the safe copy 1535 * was allocated in setup_allocindir_phase2 using 1536 * the physical address so it could be used for this 1537 * purpose. Hence we swap the safe copy with the real 1538 * copy, allowing the safe copy to be freed and holding 1539 * on to the real copy for later use in indir_trunc. 
1540 */ 1541 if (indirdep->ir_state & GOINGAWAY) 1542 panic("deallocate_dependencies: already gone"); 1543 indirdep->ir_state |= GOINGAWAY; 1544 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 1545 free_allocindir(aip, inodedep); 1546 if (bp->b_lblkno >= 0 || 1547 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 1548 panic("deallocate_dependencies: not indir"); 1549 tmp = indirdep->ir_savebp->b_data; 1550 indirdep->ir_savebp->b_data = bp->b_data; 1551 bp->b_data = tmp; 1552 tmpsize = indirdep->ir_savebp->b_bufsize; 1553 indirdep->ir_savebp->b_bufsize = bp->b_bufsize; 1554 bp->b_bufsize = tmpsize; 1555 WORKLIST_REMOVE(wk); 1556 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); 1557 continue; 1558 1559 case M_PAGEDEP: 1560 pagedep = WK_PAGEDEP(wk); 1561 /* 1562 * None of the directory additions will ever be 1563 * visible, so they can simply be tossed. 1564 */ 1565 for (i = 0; i < DAHASHSZ; i++) 1566 while (dap=LIST_FIRST(&pagedep->pd_diraddhd[i])) 1567 free_diradd(dap); 1568 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) 1569 free_diradd(dap); 1570 /* 1571 * Copy any directory remove dependencies to the list 1572 * to be processed after the zero'ed inode is written. 1573 * If the inode has already been written, then they 1574 * can be dumped directly onto the work list. 
1575 */ 1576 for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem; 1577 dirrem = LIST_NEXT(dirrem, dm_next)) { 1578 LIST_REMOVE(dirrem, dm_next); 1579 dirrem->dm_dirinum = pagedep->pd_ino; 1580 if (inodedep == NULL) 1581 add_to_worklist(&dirrem->dm_list); 1582 else 1583 WORKLIST_INSERT(&inodedep->id_inowait, 1584 &dirrem->dm_list); 1585 } 1586 WORKLIST_REMOVE(&pagedep->pd_list); 1587 LIST_REMOVE(pagedep, pd_hash); 1588 WORKITEM_FREE(pagedep, M_PAGEDEP); 1589 continue; 1590 1591 case M_ALLOCINDIR: 1592 free_allocindir(WK_ALLOCINDIR(wk), inodedep); 1593 continue; 1594 1595 case M_ALLOCDIRECT: 1596 case M_INODEDEP: 1597 panic("deallocate_dependencies: Unexpected type %s", 1598 TYPENAME(wk->wk_type)); 1599 /* NOTREACHED */ 1600 1601 default: 1602 panic("deallocate_dependencies: Unknown type %s", 1603 TYPENAME(wk->wk_type)); 1604 /* NOTREACHED */ 1605 } 1606 } 1607} 1608 1609/* 1610 * Free an allocdirect. Generate a new freefrag work request if appropriate. 1611 * This routine must be called with splbio interrupts blocked. 1612 */ 1613static void 1614free_allocdirect(adphead, adp, delay) 1615 struct allocdirectlst *adphead; 1616 struct allocdirect *adp; 1617 int delay; 1618{ 1619 1620#ifdef DEBUG 1621 if (lk.lkt_held == -1) 1622 panic("free_allocdirect: lock not held"); 1623#endif 1624 if ((adp->ad_state & DEPCOMPLETE) == 0) 1625 LIST_REMOVE(adp, ad_deps); 1626 TAILQ_REMOVE(adphead, adp, ad_next); 1627 if ((adp->ad_state & COMPLETE) == 0) 1628 WORKLIST_REMOVE(&adp->ad_list); 1629 if (adp->ad_freefrag != NULL) { 1630 if (delay) 1631 WORKLIST_INSERT(&adp->ad_inodedep->id_inowait, 1632 &adp->ad_freefrag->ff_list); 1633 else 1634 add_to_worklist(&adp->ad_freefrag->ff_list); 1635 } 1636 WORKITEM_FREE(adp, M_ALLOCDIRECT); 1637} 1638 1639/* 1640 * Prepare an inode to be freed. The actual free operation is not 1641 * done until the zero'ed inode has been written to disk. 
1642 */ 1643void 1644softdep_freefile(ap) 1645 struct vop_vfree_args /* { 1646 struct vnode *a_pvp; 1647 ino_t a_ino; 1648 int a_mode; 1649 } */ *ap; 1650{ 1651 struct inode *ip = VTOI(ap->a_pvp); 1652 struct inodedep *inodedep; 1653 struct freefile *freefile; 1654 1655 /* 1656 * This sets up the inode de-allocation dependency. 1657 */ 1658 MALLOC(freefile, struct freefile *, sizeof(struct freefile), 1659 M_FREEFILE, M_WAITOK); 1660 freefile->fx_list.wk_type = M_FREEFILE; 1661 freefile->fx_list.wk_state = 0; 1662 freefile->fx_mode = ap->a_mode; 1663 freefile->fx_oldinum = ap->a_ino; 1664 freefile->fx_devvp = ip->i_devvp; 1665 freefile->fx_fs = ip->i_fs; 1666 1667 /* 1668 * If the inodedep does not exist, then the zero'ed inode has 1669 * been written to disk and we can free the file immediately. 1670 */ 1671 ACQUIRE_LOCK(&lk); 1672 if (inodedep_lookup(ip->i_fs, ap->a_ino, 0, &inodedep) == 0) { 1673 add_to_worklist(&freefile->fx_list); 1674 FREE_LOCK(&lk); 1675 return; 1676 } 1677 1678 /* 1679 * If we still have a bitmap dependency, then the inode has never 1680 * been written to disk. Drop the dependency as it is no longer 1681 * necessary since the inode is being deallocated. We could process 1682 * the freefile immediately, but then we would have to clear the 1683 * id_inowait dependencies here and it is easier just to let the 1684 * zero'ed inode be written and let them be cleaned up in the 1685 * normal followup actions that follow the inode write. 1686 */ 1687 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 1688 inodedep->id_state |= DEPCOMPLETE; 1689 LIST_REMOVE(inodedep, id_deps); 1690 inodedep->id_buf = NULL; 1691 } 1692 /* 1693 * If the inodedep has no dependencies associated with it, 1694 * then we must free it here and free the file immediately. 1695 * This case arises when an early allocation fails (for 1696 * example, the user is over their file quota). 
1697 */ 1698 if (free_inodedep(inodedep) == 0) 1699 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 1700 else 1701 add_to_worklist(&freefile->fx_list); 1702 FREE_LOCK(&lk); 1703} 1704 1705/* 1706 * Try to free an inodedep structure. Return 1 if it could be freed. 1707 */ 1708static int 1709free_inodedep(inodedep) 1710 struct inodedep *inodedep; 1711{ 1712 1713 if ((inodedep->id_state & ONWORKLIST) != 0 || 1714 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 1715 LIST_FIRST(&inodedep->id_pendinghd) != NULL || 1716 LIST_FIRST(&inodedep->id_inowait) != NULL || 1717 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || 1718 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || 1719 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL) 1720 return (0); 1721 LIST_REMOVE(inodedep, id_hash); 1722 WORKITEM_FREE(inodedep, M_INODEDEP); 1723 return (1); 1724} 1725 1726/* 1727 * This workitem routine performs the block de-allocation. 1728 * The workitem is added to the pending list after the updated 1729 * inode block has been written to disk. As mentioned above, 1730 * checks regarding the number of blocks de-allocated (compared 1731 * to the number of blocks allocated for the file) are also 1732 * performed in this function. 
1733 */ 1734static void 1735handle_workitem_freeblocks(freeblks) 1736 struct freeblks *freeblks; 1737{ 1738 struct inode tip; 1739 ufs_daddr_t bn; 1740 struct fs *fs; 1741 int i, level, bsize; 1742 long nblocks, blocksreleased = 0; 1743 int error, allerror = 0; 1744 ufs_lbn_t baselbns[NIADDR], tmpval; 1745 1746 tip.i_number = freeblks->fb_previousinum; 1747 tip.i_devvp = freeblks->fb_devvp; 1748 tip.i_dev = freeblks->fb_devvp->v_rdev; 1749 tip.i_fs = freeblks->fb_fs; 1750 tip.i_size = freeblks->fb_oldsize; 1751 tip.i_uid = freeblks->fb_uid; 1752 fs = freeblks->fb_fs; 1753 tmpval = 1; 1754 baselbns[0] = NDADDR; 1755 for (i = 1; i < NIADDR; i++) { 1756 tmpval *= NINDIR(fs); 1757 baselbns[i] = baselbns[i - 1] + tmpval; 1758 } 1759 nblocks = btodb(fs->fs_bsize); 1760 blocksreleased = 0; 1761 /* 1762 * Indirect blocks first. 1763 */ 1764 for (level = (NIADDR - 1); level >= 0; level--) { 1765 if ((bn = freeblks->fb_iblks[level]) == 0) 1766 continue; 1767 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level, 1768 baselbns[level], &blocksreleased)) == 0) 1769 allerror = error; 1770 ffs_blkfree(&tip, bn, fs->fs_bsize); 1771 blocksreleased += nblocks; 1772 } 1773 /* 1774 * All direct blocks or frags. 1775 */ 1776 for (i = (NDADDR - 1); i >= 0; i--) { 1777 if ((bn = freeblks->fb_dblks[i]) == 0) 1778 continue; 1779 bsize = blksize(fs, &tip, i); 1780 ffs_blkfree(&tip, bn, bsize); 1781 blocksreleased += btodb(bsize); 1782 } 1783 1784#ifdef DIAGNOSTIC 1785 if (freeblks->fb_chkcnt != blocksreleased) 1786 panic("handle_workitem_freeblocks: block count"); 1787 if (allerror) 1788 softdep_error("handle_workitem_freeblks", allerror); 1789#endif /* DIAGNOSTIC */ 1790 WORKITEM_FREE(freeblks, M_FREEBLKS); 1791} 1792 1793/* 1794 * Release blocks associated with the inode ip and stored in the indirect 1795 * block dbn. If level is greater than SINGLE, the block is an indirect block 1796 * and recursive calls to indirtrunc must be used to cleanse other indirect 1797 * blocks. 
1798 */ 1799static int 1800indir_trunc(ip, dbn, level, lbn, countp) 1801 struct inode *ip; 1802 ufs_daddr_t dbn; 1803 int level; 1804 ufs_lbn_t lbn; 1805 long *countp; 1806{ 1807 struct buf *bp; 1808 ufs_daddr_t *bap; 1809 ufs_daddr_t nb; 1810 struct fs *fs; 1811 struct worklist *wk; 1812 struct indirdep *indirdep; 1813 int i, lbnadd, nblocks; 1814 int error, allerror = 0; 1815 1816 fs = ip->i_fs; 1817 lbnadd = 1; 1818 for (i = level; i > 0; i--) 1819 lbnadd *= NINDIR(fs); 1820 /* 1821 * Get buffer of block pointers to be freed. This routine is not 1822 * called until the zero'ed inode has been written, so it is safe 1823 * to free blocks as they are encountered. Because the inode has 1824 * been zero'ed, calls to bmap on these blocks will fail. So, we 1825 * have to use the on-disk address and the block device for the 1826 * filesystem to look them up. If the file was deleted before its 1827 * indirect blocks were all written to disk, the routine that set 1828 * us up (deallocate_dependencies) will have arranged to leave 1829 * a complete copy of the indirect block in memory for our use. 1830 * Otherwise we have to read the blocks in from the disk. 1831 */ 1832 ACQUIRE_LOCK(&lk); 1833 if ((bp = incore(ip->i_devvp, dbn)) != NULL && 1834 (wk = LIST_FIRST(&bp->b_dep)) != NULL) { 1835 if (wk->wk_type != M_INDIRDEP || 1836 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || 1837 (indirdep->ir_state & GOINGAWAY) == 0) 1838 panic("indir_trunc: lost indirdep"); 1839 WORKLIST_REMOVE(wk); 1840 WORKITEM_FREE(indirdep, M_INDIRDEP); 1841 if (LIST_FIRST(&bp->b_dep) != NULL) 1842 panic("indir_trunc: dangling dep"); 1843 FREE_LOCK(&lk); 1844 } else { 1845 FREE_LOCK(&lk); 1846 error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp); 1847 if (error) 1848 return (error); 1849 } 1850 /* 1851 * Recursively free indirect blocks. 
1852 */ 1853 bap = (ufs_daddr_t *)bp->b_data; 1854 nblocks = btodb(fs->fs_bsize); 1855 for (i = NINDIR(fs) - 1; i >= 0; i--) { 1856 if ((nb = bap[i]) == 0) 1857 continue; 1858 if (level != 0) { 1859 if ((error = indir_trunc(ip, fsbtodb(fs, nb), 1860 level - 1, lbn + (i * lbnadd), countp)) != 0) 1861 allerror = error; 1862 } 1863 ffs_blkfree(ip, nb, fs->fs_bsize); 1864 *countp += nblocks; 1865 } 1866 bp->b_flags |= B_INVAL; 1867 brelse(bp); 1868 return (allerror); 1869} 1870 1871/* 1872 * Free an allocindir. 1873 * This routine must be called with splbio interrupts blocked. 1874 */ 1875static void 1876free_allocindir(aip, inodedep) 1877 struct allocindir *aip; 1878 struct inodedep *inodedep; 1879{ 1880 struct freefrag *freefrag; 1881 1882#ifdef DEBUG 1883 if (lk.lkt_held == -1) 1884 panic("free_allocindir: lock not held"); 1885#endif 1886 if ((aip->ai_state & DEPCOMPLETE) == 0) 1887 LIST_REMOVE(aip, ai_deps); 1888 if (aip->ai_state & ONWORKLIST) 1889 WORKLIST_REMOVE(&aip->ai_list); 1890 LIST_REMOVE(aip, ai_next); 1891 if ((freefrag = aip->ai_freefrag) != NULL) { 1892 if (inodedep == NULL) 1893 add_to_worklist(&freefrag->ff_list); 1894 else 1895 WORKLIST_INSERT(&inodedep->id_inowait, 1896 &freefrag->ff_list); 1897 } 1898 WORKITEM_FREE(aip, M_ALLOCINDIR); 1899} 1900 1901/* 1902 * Directory entry addition dependencies. 1903 * 1904 * When adding a new directory entry, the inode (with its incremented link 1905 * count) must be written to disk before the directory entry's pointer to it. 1906 * Also, if the inode is newly allocated, the corresponding freemap must be 1907 * updated (on disk) before the directory entry's pointer. These requirements 1908 * are met via undo/redo on the directory entry's pointer, which consists 1909 * simply of the inode number. 1910 * 1911 * As directory entries are added and deleted, the free space within a 1912 * directory block can become fragmented. 
The ufs file system will compact 1913 * a fragmented directory block to make space for a new entry. When this 1914 * occurs, the offsets of previously added entries change. Any "diradd" 1915 * dependency structures corresponding to these entries must be updated with 1916 * the new offsets. 1917 */ 1918 1919/* 1920 * This routine is called after the in-memory inode's link 1921 * count has been incremented, but before the directory entry's 1922 * pointer to the inode has been set. 1923 */ 1924void 1925softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) 1926 struct buf *bp; /* buffer containing directory block */ 1927 struct inode *dp; /* inode for directory */ 1928 off_t diroffset; /* offset of new entry in directory */ 1929 long newinum; /* inode referenced by new directory entry */ 1930 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 1931{ 1932 int offset; /* offset of new entry within directory block */ 1933 ufs_lbn_t lbn; /* block in directory containing new entry */ 1934 struct fs *fs; 1935 struct diradd *dap; 1936 struct pagedep *pagedep; 1937 struct inodedep *inodedep; 1938 struct mkdir *mkdir1, *mkdir2; 1939 1940 /* 1941 * Whiteouts have no dependencies. 
1942 */ 1943 if (newinum == WINO) { 1944 if (newdirbp != NULL) 1945 bdwrite(newdirbp); 1946 return; 1947 } 1948 1949 fs = dp->i_fs; 1950 lbn = lblkno(fs, diroffset); 1951 offset = blkoff(fs, diroffset); 1952 MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); 1953 bzero(dap, sizeof(struct diradd)); 1954 dap->da_list.wk_type = M_DIRADD; 1955 dap->da_offset = offset; 1956 dap->da_newinum = newinum; 1957 dap->da_state = ATTACHED; 1958 if (newdirbp == NULL) { 1959 dap->da_state |= DEPCOMPLETE; 1960 } else { 1961 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 1962 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, 1963 M_WAITOK); 1964 mkdir1->md_list.wk_type = M_MKDIR; 1965 mkdir1->md_state = MKDIR_BODY; 1966 mkdir1->md_diradd = dap; 1967 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, 1968 M_WAITOK); 1969 mkdir2->md_list.wk_type = M_MKDIR; 1970 mkdir2->md_state = MKDIR_PARENT; 1971 mkdir2->md_diradd = dap; 1972 1973 } 1974 1975 ACQUIRE_LOCK(&lk); 1976 /* 1977 * If this directory entry references a new directory, create 1978 * its two additional dependencies: its "." and ".." being written 1979 * to disk and the link count increase for its parent directory. 1980 */ 1981 if (newdirbp != NULL) { 1982 /* 1983 * Dependency on "." and ".." 
being written to disk 1984 */ 1985 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 1986 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); 1987 bdwrite(newdirbp); 1988 /* 1989 * Dependency on link count increase for parent directory 1990 */ 1991 if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0 1992 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 1993 dap->da_state &= ~MKDIR_PARENT; 1994 WORKITEM_FREE(mkdir2, M_MKDIR); 1995 } else { 1996 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 1997 WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list); 1998 } 1999 } 2000 /* 2001 * Link into parent directory pagedep and new inode inodedep 2002 * structures to await its being written. 2003 */ 2004 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 2005 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2006 dap->da_pagedep = pagedep; 2007 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 2008 da_pdlist); 2009 if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 && 2010 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 2011 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 2012 else 2013 WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list); 2014 FREE_LOCK(&lk); 2015} 2016 2017/* 2018 * This procedure is called to change the offset of a directory 2019 * entry when compacting a directory block which must be owned 2020 * exclusively by the caller. Note that the actual entry movement 2021 * must be done in this procedure to ensure that no I/O completions 2022 * occur while the move is in progress. 
2023 */ 2024void 2025softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) 2026 struct inode *dp; /* inode for directory */ 2027 caddr_t base; /* address of dp->i_offset */ 2028 caddr_t oldloc; /* address of old directory location */ 2029 caddr_t newloc; /* address of new directory location */ 2030 int entrysize; /* size of directory entry */ 2031{ 2032 int offset, oldoffset, newoffset; 2033 struct pagedep *pagedep; 2034 struct diradd *dap; 2035 ufs_lbn_t lbn; 2036 2037 ACQUIRE_LOCK(&lk); 2038 lbn = lblkno(dp->i_fs, dp->i_offset); 2039 offset = blkoff(dp->i_fs, dp->i_offset); 2040 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) 2041 goto done; 2042 oldoffset = offset + (oldloc - base); 2043 newoffset = offset + (newloc - base); 2044 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]); 2045 dap; dap = LIST_NEXT(dap, da_pdlist)) { 2046 if (dap->da_offset != oldoffset) 2047 continue; 2048 dap->da_offset = newoffset; 2049 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) 2050 break; 2051 LIST_REMOVE(dap, da_pdlist); 2052 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], 2053 dap, da_pdlist); 2054 break; 2055 } 2056done: 2057 bcopy(oldloc, newloc, entrysize); 2058 FREE_LOCK(&lk); 2059} 2060 2061/* 2062 * Free a diradd dependency structure. This routine must be called 2063 * with splbio interrupts blocked. 
2064 */ 2065static void 2066free_diradd(dap) 2067 struct diradd *dap; 2068{ 2069 struct dirrem *dirrem; 2070 struct pagedep *pagedep; 2071 struct inodedep *inodedep; 2072 struct mkdir *mkdir, *nextmd; 2073 2074#ifdef DEBUG 2075 if (lk.lkt_held == -1) 2076 panic("free_diradd: lock not held"); 2077#endif 2078 WORKLIST_REMOVE(&dap->da_list); 2079 LIST_REMOVE(dap, da_pdlist); 2080 if ((dap->da_state & DIRCHG) == 0) { 2081 pagedep = dap->da_pagedep; 2082 } else { 2083 dirrem = dap->da_previous; 2084 pagedep = dirrem->dm_pagedep; 2085 add_to_worklist(&dirrem->dm_list); 2086 } 2087 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum, 2088 0, &inodedep) != 0) 2089 (void) free_inodedep(inodedep); 2090 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 2091 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 2092 nextmd = LIST_NEXT(mkdir, md_mkdirs); 2093 if (mkdir->md_diradd != dap) 2094 continue; 2095 dap->da_state &= ~mkdir->md_state; 2096 WORKLIST_REMOVE(&mkdir->md_list); 2097 LIST_REMOVE(mkdir, md_mkdirs); 2098 WORKITEM_FREE(mkdir, M_MKDIR); 2099 } 2100 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 2101 panic("free_diradd: unfound ref"); 2102 } 2103 WORKITEM_FREE(dap, M_DIRADD); 2104} 2105 2106/* 2107 * Directory entry removal dependencies. 2108 * 2109 * When removing a directory entry, the entry's inode pointer must be 2110 * zero'ed on disk before the corresponding inode's link count is decremented 2111 * (possibly freeing the inode for re-use). This dependency is handled by 2112 * updating the directory entry but delaying the inode count reduction until 2113 * after the directory block has been written to disk. After this point, the 2114 * inode count can be decremented whenever it is convenient. 2115 */ 2116 2117/* 2118 * This routine should be called immediately after removing 2119 * a directory entry. 
The inode's link count should not be 2120 * decremented by the calling procedure -- the soft updates 2121 * code will do this task when it is safe. 2122 */ 2123void 2124softdep_setup_remove(bp, dp, ip, isrmdir) 2125 struct buf *bp; /* buffer containing directory block */ 2126 struct inode *dp; /* inode for the directory being modified */ 2127 struct inode *ip; /* inode for directory entry being removed */ 2128 int isrmdir; /* indicates if doing RMDIR */ 2129{ 2130 struct dirrem *dirrem; 2131 2132 /* 2133 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. 2134 */ 2135 dirrem = newdirrem(bp, dp, ip, isrmdir); 2136 if ((dirrem->dm_state & COMPLETE) == 0) { 2137 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 2138 dm_next); 2139 } else { 2140 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 2141 add_to_worklist(&dirrem->dm_list); 2142 } 2143 FREE_LOCK(&lk); 2144} 2145 2146/* 2147 * Allocate a new dirrem if appropriate and return it along with 2148 * its associated pagedep. Called without a lock, returns with lock. 2149 */ 2150static struct dirrem * 2151newdirrem(bp, dp, ip, isrmdir) 2152 struct buf *bp; /* buffer containing directory block */ 2153 struct inode *dp; /* inode for the directory being modified */ 2154 struct inode *ip; /* inode for directory entry being removed */ 2155 int isrmdir; /* indicates if doing RMDIR */ 2156{ 2157 int offset; 2158 ufs_lbn_t lbn; 2159 struct diradd *dap; 2160 struct dirrem *dirrem; 2161 struct pagedep *pagedep; 2162 2163 /* 2164 * Whiteouts have no deletion dependencies. 2165 */ 2166 if (ip == NULL) 2167 panic("newdirrem: whiteout"); 2168 MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), 2169 M_DIRREM, M_WAITOK); 2170 bzero(dirrem, sizeof(struct dirrem)); 2171 dirrem->dm_list.wk_type = M_DIRREM; 2172 dirrem->dm_state = isrmdir ? 
RMDIR : 0; 2173 dirrem->dm_mnt = ITOV(ip)->v_mount; 2174 dirrem->dm_oldinum = ip->i_number; 2175 2176 ACQUIRE_LOCK(&lk); 2177 lbn = lblkno(dp->i_fs, dp->i_offset); 2178 offset = blkoff(dp->i_fs, dp->i_offset); 2179 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) 2180 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2181 dirrem->dm_pagedep = pagedep; 2182 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]); 2183 dap; dap = LIST_NEXT(dap, da_pdlist)) { 2184 /* 2185 * Check for a diradd dependency for the same directory entry. 2186 * If present, then both dependencies become obsolete and can 2187 * be de-allocated. 2188 */ 2189 if (dap->da_offset != offset) 2190 continue; 2191 /* 2192 * Must be ATTACHED at this point, so just delete it. 2193 */ 2194 if ((dap->da_state & ATTACHED) == 0) 2195 panic("newdirrem: not ATTACHED"); 2196 if (dap->da_newinum != ip->i_number) 2197 panic("newdirrem: inum %d should be %d", 2198 ip->i_number, dap->da_newinum); 2199 free_diradd(dap); 2200 dirrem->dm_state |= COMPLETE; 2201 break; 2202 } 2203 return (dirrem); 2204} 2205 2206/* 2207 * Directory entry change dependencies. 2208 * 2209 * Changing an existing directory entry requires that an add operation 2210 * be completed first followed by a deletion. The semantics for the addition 2211 * are identical to the description of adding a new entry above except 2212 * that the rollback is to the old inode number rather than zero. Once 2213 * the addition dependency is completed, the removal is done as described 2214 * in the removal routine above. 2215 */ 2216 2217/* 2218 * This routine should be called immediately after changing 2219 * a directory entry. The inode's link count should not be 2220 * decremented by the calling procedure -- the soft updates 2221 * code will perform this task when it is safe. 
2222 */ 2223void 2224softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 2225 struct buf *bp; /* buffer containing directory block */ 2226 struct inode *dp; /* inode for the directory being modified */ 2227 struct inode *ip; /* inode for directory entry being removed */ 2228 long newinum; /* new inode number for changed entry */ 2229 int isrmdir; /* indicates if doing RMDIR */ 2230{ 2231 int offset; 2232 struct diradd *dap; 2233 struct dirrem *dirrem; 2234 struct inodedep *inodedep; 2235 2236 offset = blkoff(dp->i_fs, dp->i_offset); 2237 2238 /* 2239 * Whiteouts have no addition dependencies. 2240 */ 2241 if (newinum == WINO) { 2242 dap = NULL; 2243 } else { 2244 MALLOC(dap, struct diradd *, sizeof(struct diradd), 2245 M_DIRADD, M_WAITOK); 2246 bzero(dap, sizeof(struct diradd)); 2247 dap->da_list.wk_type = M_DIRADD; 2248 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 2249 dap->da_offset = offset; 2250 dap->da_newinum = newinum; 2251 } 2252 2253 /* 2254 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. 2255 */ 2256 dirrem = newdirrem(bp, dp, ip, isrmdir); 2257 2258 /* 2259 * If the inode has already been written, then no addition 2260 * dependency needs to be created. 2261 */ 2262 if (inodedep_lookup(dp->i_fs, newinum, 0, &inodedep) == 0 || 2263 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 2264 WORKITEM_FREE(dap, M_DIRADD); 2265 dap = NULL; 2266 } 2267 2268 if (dap) { 2269 dap->da_previous = dirrem; 2270 LIST_INSERT_HEAD( 2271 &dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)], 2272 dap, da_pdlist); 2273 WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list); 2274 } else if ((dirrem->dm_state & COMPLETE) == 0) { 2275 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 2276 dm_next); 2277 } else { 2278 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 2279 add_to_worklist(&dirrem->dm_list); 2280 } 2281 FREE_LOCK(&lk); 2282} 2283 2284/* 2285 * Called whenever the link count on an inode is increased. 
2286 * It creates an inode dependency so that the new reference(s) 2287 * to the inode cannot be committed to disk until the updated 2288 * inode has been written. 2289 */ 2290void 2291softdep_increase_linkcnt(ip) 2292 struct inode *ip; /* the inode with the increased link count */ 2293{ 2294 struct inodedep *inodedep; 2295 2296 ACQUIRE_LOCK(&lk); 2297 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); 2298 FREE_LOCK(&lk); 2299} 2300 2301/* 2302 * This workitem decrements the inode's link count. 2303 * If the link count reaches zero, the file is removed. 2304 */ 2305static void 2306handle_workitem_remove(dirrem) 2307 struct dirrem *dirrem; 2308{ 2309 struct proc *p = curproc; /* XXX */ 2310 struct inodedep *inodedep; 2311 struct vnode *vp; 2312 struct inode *ip; 2313 int error; 2314 2315 if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) { 2316 softdep_error("handle_workitem_remove: vget", error); 2317 return; 2318 } 2319 ip = VTOI(vp); 2320 /* 2321 * Normal file deletion. 2322 */ 2323 if ((dirrem->dm_state & RMDIR) == 0) { 2324 ip->i_nlink--; 2325 if (ip->i_nlink < ip->i_effnlink) { 2326#ifdef DIAGNOSTIC 2327 vprint("handle_workitem_remove: bad file delta", vp); 2328#endif 2329 ip->i_effnlink = ip->i_nlink; 2330 } 2331 ip->i_flag |= IN_CHANGE; 2332 vput(vp); 2333 WORKITEM_FREE(dirrem, M_DIRREM); 2334 return; 2335 } 2336 /* 2337 * Directory deletion. Decrement reference count for both the 2338 * just deleted parent directory entry and the reference for ".". 2339 * Next truncate the directory to length zero. When the 2340 * truncation completes, arrange to have the reference count on 2341 * the parent decremented to account for the loss of "..". 
2342 */ 2343 ip->i_nlink -= 2; 2344 if (ip->i_nlink < ip->i_effnlink) 2345 panic("handle_workitem_remove: bad dir delta"); 2346 ip->i_flag |= IN_CHANGE; 2347 if ((error = VOP_TRUNCATE(vp, (off_t)0, 0, p->p_cred, p)) != 0) 2348 softdep_error("handle_workitem_remove: truncate", error); 2349 ACQUIRE_LOCK(&lk); 2350 (void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC, 2351 &inodedep); 2352 dirrem->dm_state = 0; 2353 dirrem->dm_oldinum = dirrem->dm_dirinum; 2354 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 2355 FREE_LOCK(&lk); 2356 vput(vp); 2357} 2358 2359/* 2360 * Inode de-allocation dependencies. 2361 * 2362 * When an inode's link count is reduced to zero, it can be de-allocated. We 2363 * found it convenient to postpone de-allocation until after the inode is 2364 * written to disk with its new link count (zero). At this point, all of the 2365 * on-disk inode's block pointers are nullified and, with careful dependency 2366 * list ordering, all dependencies related to the inode will be satisfied and 2367 * the corresponding dependency structures de-allocated. So, if/when the 2368 * inode is reused, there will be no mixing of old dependencies with new 2369 * ones. This artificial dependency is set up by the block de-allocation 2370 * procedure above (softdep_setup_freeblocks) and completed by the 2371 * following procedure. 
2372 */ 2373static void 2374handle_workitem_freefile(freefile) 2375 struct freefile *freefile; 2376{ 2377 struct vnode vp; 2378 struct inode tip; 2379 struct inodedep *idp; 2380 struct vop_vfree_args args; 2381 int error; 2382 2383#ifdef DEBUG 2384 ACQUIRE_LOCK(&lk); 2385 if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp)) 2386 panic("handle_workitem_freefile: inodedep survived"); 2387 FREE_LOCK(&lk); 2388#endif 2389 tip.i_devvp = freefile->fx_devvp; 2390 tip.i_dev = freefile->fx_devvp->v_rdev; 2391 tip.i_fs = freefile->fx_fs; 2392 vp.v_data = &tip; 2393 args.a_pvp = &vp; 2394 args.a_ino = freefile->fx_oldinum; 2395 args.a_mode = freefile->fx_mode; 2396 if ((error = ffs_freefile(&args)) != 0) 2397 softdep_error("handle_workitem_freefile", error); 2398 WORKITEM_FREE(freefile, M_FREEFILE); 2399} 2400 2401/* 2402 * Disk writes. 2403 * 2404 * The dependency structures constructed above are most actively used when file 2405 * system blocks are written to disk. No constraints are placed on when a 2406 * block can be written, but unsatisfied update dependencies are made safe by 2407 * modifying (or replacing) the source memory for the duration of the disk 2408 * write. When the disk write completes, the memory block is again brought 2409 * up-to-date. 2410 * 2411 * In-core inode structure reclamation. 2412 * 2413 * Because there are a finite number of "in-core" inode structures, they are 2414 * reused regularly. By transferring all inode-related dependencies to the 2415 * in-memory inode block and indexing them separately (via "inodedep"s), we 2416 * can allow "in-core" inode structures to be reused at any time and avoid 2417 * any increase in contention. 2418 * 2419 * Called just before entering the device driver to initiate a new disk I/O. 2420 * The buffer must be locked, thus, no I/O completion operations can occur 2421 * while we are manipulating its associated dependencies. 
2422 */ 2423void 2424softdep_disk_io_initiation(bp) 2425 struct buf *bp; /* structure describing disk write to occur */ 2426{ 2427 struct worklist *wk, *nextwk; 2428 struct indirdep *indirdep; 2429 2430 /* 2431 * We only care about write operations. There should never 2432 * be dependencies for reads. 2433 */ 2434 if (bp->b_flags & B_READ) 2435 panic("softdep_disk_io_initiation: read"); 2436 /* 2437 * Do any necessary pre-I/O processing. 2438 */ 2439 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) { 2440 nextwk = LIST_NEXT(wk, wk_list); 2441 switch (wk->wk_type) { 2442 2443 case M_PAGEDEP: 2444 initiate_write_filepage(WK_PAGEDEP(wk), bp); 2445 continue; 2446 2447 case M_INODEDEP: 2448 initiate_write_inodeblock(WK_INODEDEP(wk), bp); 2449 continue; 2450 2451 case M_INDIRDEP: 2452 indirdep = WK_INDIRDEP(wk); 2453 if (indirdep->ir_state & GOINGAWAY) 2454 panic("disk_io_initiation: indirdep gone"); 2455 /* 2456 * If there are no remaining dependencies, this 2457 * will be writing the real pointers, so the 2458 * dependency can be freed. 2459 */ 2460 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { 2461 brelse(indirdep->ir_savebp); 2462 /* inline expand WORKLIST_REMOVE(wk); */ 2463 wk->wk_state &= ~ONWORKLIST; 2464 LIST_REMOVE(wk, wk_list); 2465 WORKITEM_FREE(indirdep, M_INDIRDEP); 2466 continue; 2467 } 2468 /* 2469 * Replace up-to-date version with safe version. 2470 */ 2471 ACQUIRE_LOCK(&lk); 2472 indirdep->ir_state &= ~ATTACHED; 2473 indirdep->ir_state |= UNDONE; 2474 bp->b_data = indirdep->ir_savebp->b_data; 2475 FREE_LOCK(&lk); 2476 continue; 2477 2478 case M_MKDIR: 2479 case M_BMSAFEMAP: 2480 case M_ALLOCDIRECT: 2481 case M_ALLOCINDIR: 2482 continue; 2483 2484 default: 2485 panic("handle_disk_io_initiation: Unexpected type %s", 2486 TYPENAME(wk->wk_type)); 2487 /* NOTREACHED */ 2488 } 2489 } 2490} 2491 2492/* 2493 * Called from within the procedure above to deal with unsatisfied 2494 * allocation dependencies in a directory. 
The buffer must be locked, 2495 * thus, no I/O completion operations can occur while we are 2496 * manipulating its associated dependencies. 2497 */ 2498static void 2499initiate_write_filepage(pagedep, bp) 2500 struct pagedep *pagedep; 2501 struct buf *bp; 2502{ 2503 struct diradd *dap; 2504 struct direct *ep; 2505 int i; 2506 2507 if (pagedep->pd_state & IOSTARTED) { 2508 /* 2509 * This can only happen if there is a driver that does not 2510 * understand chaining. Here biodone will reissue the call 2511 * to strategy for the incomplete buffers. 2512 */ 2513 printf("initiate_write_filepage: already started\n"); 2514 return; 2515 } 2516 pagedep->pd_state |= IOSTARTED; 2517 ACQUIRE_LOCK(&lk); 2518 for (i = 0; i < DAHASHSZ; i++) { 2519 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 2520 dap = LIST_NEXT(dap, da_pdlist)) { 2521 ep = (struct direct *) 2522 ((char *)bp->b_data + dap->da_offset); 2523 if (ep->d_ino != dap->da_newinum) 2524 panic("%s: dir inum %d != new %d", 2525 "initiate_write_filepage", 2526 ep->d_ino, dap->da_newinum); 2527 if (dap->da_state & DIRCHG) 2528 ep->d_ino = dap->da_previous->dm_oldinum; 2529 else 2530 ep->d_ino = 0; 2531 dap->da_state &= ~ATTACHED; 2532 dap->da_state |= UNDONE; 2533 } 2534 } 2535 FREE_LOCK(&lk); 2536} 2537 2538/* 2539 * Called from within the procedure above to deal with unsatisfied 2540 * allocation dependencies in an inodeblock. The buffer must be 2541 * locked, thus, no I/O completion operations can occur while we 2542 * are manipulating its associated dependencies. 
2543 */ 2544static void 2545initiate_write_inodeblock(inodedep, bp) 2546 struct inodedep *inodedep; 2547 struct buf *bp; /* The inode block */ 2548{ 2549 struct allocdirect *adp, *lastadp; 2550 struct dinode *dp; 2551 struct fs *fs; 2552 ufs_lbn_t prevlbn; 2553 int i, deplist; 2554 2555 if (inodedep->id_state & IOSTARTED) 2556 panic("initiate_write_inodeblock: already started"); 2557 inodedep->id_state |= IOSTARTED; 2558 fs = inodedep->id_fs; 2559 dp = (struct dinode *)bp->b_data + 2560 ino_to_fsbo(fs, inodedep->id_ino); 2561 /* 2562 * If the bitmap is not yet written, then the allocated 2563 * inode cannot be written to disk. 2564 */ 2565 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 2566 if (inodedep->id_savedino != NULL) 2567 panic("initiate_write_inodeblock: already doing I/O"); 2568 MALLOC(inodedep->id_savedino, struct dinode *, 2569 sizeof(struct dinode), M_INODEDEP, M_WAITOK); 2570 *inodedep->id_savedino = *dp; 2571 bzero((caddr_t)dp, sizeof(struct dinode)); 2572 return; 2573 } 2574 /* 2575 * If no dependencies, then there is nothing to roll back. 2576 */ 2577 inodedep->id_savedsize = dp->di_size; 2578 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) 2579 return; 2580 /* 2581 * Set the dependencies to busy. 
2582 */ 2583 ACQUIRE_LOCK(&lk); 2584 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 2585 adp = TAILQ_NEXT(adp, ad_next)) { 2586#ifdef DIAGNOSTIC 2587 if (deplist != 0 && prevlbn >= adp->ad_lbn) 2588 panic("softdep_write_inodeblock: lbn order"); 2589 prevlbn = adp->ad_lbn; 2590 if (adp->ad_lbn < NDADDR && 2591 dp->di_db[adp->ad_lbn] != adp->ad_newblkno) 2592 panic("%s: direct pointer #%d mismatch %d != %d", 2593 "softdep_write_inodeblock", adp->ad_lbn, 2594 dp->di_db[adp->ad_lbn], adp->ad_newblkno); 2595 if (adp->ad_lbn >= NDADDR && 2596 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) 2597 panic("%s: indirect pointer #%d mismatch %d != %d", 2598 "softdep_write_inodeblock", adp->ad_lbn - NDADDR, 2599 dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno); 2600 deplist |= 1 << adp->ad_lbn; 2601 if ((adp->ad_state & ATTACHED) == 0) 2602 panic("softdep_write_inodeblock: Unknown state 0x%x", 2603 adp->ad_state); 2604#endif /* DIAGNOSTIC */ 2605 adp->ad_state &= ~ATTACHED; 2606 adp->ad_state |= UNDONE; 2607 } 2608 /* 2609 * The on-disk inode cannot claim to be any larger than the last 2610 * fragment that has been written. Otherwise, the on-disk inode 2611 * might have fragments that were not the last block in the file 2612 * which would corrupt the filesystem. 
2613 */ 2614 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 2615 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 2616 if (adp->ad_lbn >= NDADDR) 2617 break; 2618 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; 2619 /* keep going until hitting a rollback to a frag */ 2620 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 2621 continue; 2622 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; 2623 for (i = adp->ad_lbn + 1; i < NDADDR; i++) { 2624#ifdef DIAGNOSTIC 2625 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 2626 panic("softdep_write_inodeblock: lost dep1"); 2627#endif /* DIAGNOSTIC */ 2628 dp->di_db[i] = 0; 2629 } 2630 for (i = 0; i < NIADDR; i++) { 2631#ifdef DIAGNOSTIC 2632 if (dp->di_ib[i] != 0 && 2633 (deplist & ((1 << NDADDR) << i)) == 0) 2634 panic("softdep_write_inodeblock: lost dep2"); 2635#endif /* DIAGNOSTIC */ 2636 dp->di_ib[i] = 0; 2637 } 2638 FREE_LOCK(&lk); 2639 return; 2640 } 2641 /* 2642 * If we have zero'ed out the last allocated block of the file, 2643 * roll back the size to the last currently allocated block. 2644 * We know that this last allocated block is a full-sized as 2645 * we already checked for fragments in the loop above. 2646 */ 2647 if (lastadp != NULL && 2648 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { 2649 for (i = lastadp->ad_lbn; i >= 0; i--) 2650 if (dp->di_db[i] != 0) 2651 break; 2652 dp->di_size = (i + 1) * fs->fs_bsize; 2653 } 2654 /* 2655 * The only dependencies are for indirect blocks. 2656 * 2657 * The file size for indirect block additions is not guaranteed. 2658 * Such a guarantee would be non-trivial to achieve. The conventional 2659 * synchronous write implementation also does not make this guarantee. 2660 * Fsck should catch and fix discrepancies. Arguably, the file size 2661 * can be over-estimated without destroying integrity when the file 2662 * moves into the indirect blocks (i.e., is large). 
If we want to 2663 * postpone fsck, we are stuck with this argument. 2664 */ 2665 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 2666 dp->di_ib[adp->ad_lbn - NDADDR] = 0; 2667 FREE_LOCK(&lk); 2668} 2669 2670/* 2671 * This routine is called during the completion interrupt 2672 * service routine for a disk write (from the procedure called 2673 * by the device driver to inform the file system caches of 2674 * a request completion). It should be called early in this 2675 * procedure, before the block is made available to other 2676 * processes or other routines are called. 2677 */ 2678void 2679softdep_disk_write_complete(bp) 2680 struct buf *bp; /* describes the completed disk write */ 2681{ 2682 struct worklist *wk; 2683 struct workhead reattach; 2684 struct newblk *newblk; 2685 struct allocindir *aip; 2686 struct allocdirect *adp; 2687 struct indirdep *indirdep; 2688 struct inodedep *inodedep; 2689 struct bmsafemap *bmsafemap; 2690 2691#ifdef DEBUG 2692 if (lk.lkt_held != -1) 2693 panic("softdep_disk_write_complete: lock is held"); 2694 lk.lkt_held = -2; 2695#endif 2696 LIST_INIT(&reattach); 2697 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 2698 WORKLIST_REMOVE(wk); 2699 switch (wk->wk_type) { 2700 2701 case M_PAGEDEP: 2702 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 2703 WORKLIST_INSERT(&reattach, wk); 2704 continue; 2705 2706 case M_INODEDEP: 2707 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 2708 WORKLIST_INSERT(&reattach, wk); 2709 continue; 2710 2711 case M_BMSAFEMAP: 2712 bmsafemap = WK_BMSAFEMAP(wk); 2713 while (newblk = LIST_FIRST(&bmsafemap->sm_newblkhd)) { 2714 newblk->nb_state |= DEPCOMPLETE; 2715 newblk->nb_bmsafemap = NULL; 2716 LIST_REMOVE(newblk, nb_deps); 2717 } 2718 while (adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd)) { 2719 adp->ad_state |= DEPCOMPLETE; 2720 adp->ad_buf = NULL; 2721 LIST_REMOVE(adp, ad_deps); 2722 handle_allocdirect_partdone(adp); 2723 } 2724 while (aip = LIST_FIRST(&bmsafemap->sm_allocindirhd)) { 2725 aip->ai_state |= 
DEPCOMPLETE; 2726 aip->ai_buf = NULL; 2727 LIST_REMOVE(aip, ai_deps); 2728 handle_allocindir_partdone(aip); 2729 } 2730 while ((inodedep = 2731 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { 2732 inodedep->id_state |= DEPCOMPLETE; 2733 LIST_REMOVE(inodedep, id_deps); 2734 inodedep->id_buf = NULL; 2735 } 2736 WORKITEM_FREE(bmsafemap, M_BMSAFEMAP); 2737 continue; 2738 2739 case M_MKDIR: 2740 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 2741 continue; 2742 2743 case M_ALLOCDIRECT: 2744 adp = WK_ALLOCDIRECT(wk); 2745 adp->ad_state |= COMPLETE; 2746 handle_allocdirect_partdone(adp); 2747 continue; 2748 2749 case M_ALLOCINDIR: 2750 aip = WK_ALLOCINDIR(wk); 2751 aip->ai_state |= COMPLETE; 2752 handle_allocindir_partdone(aip); 2753 continue; 2754 2755 case M_INDIRDEP: 2756 indirdep = WK_INDIRDEP(wk); 2757 if (indirdep->ir_state & GOINGAWAY) 2758 panic("disk_write_complete: indirdep gone"); 2759 bp->b_data = (caddr_t)indirdep->ir_saveddata; 2760 indirdep->ir_state &= ~UNDONE; 2761 indirdep->ir_state |= ATTACHED; 2762 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 2763 LIST_REMOVE(aip, ai_next); 2764 handle_allocindir_partdone(aip); 2765 } 2766 WORKLIST_INSERT(&reattach, wk); 2767 bdirty(bp); 2768 continue; 2769 2770 default: 2771 panic("handle_disk_write_complete: Unknown type %s", 2772 TYPENAME(wk->wk_type)); 2773 /* NOTREACHED */ 2774 } 2775 } 2776 /* 2777 * Reattach any requests that must be redone. 2778 */ 2779 while ((wk = LIST_FIRST(&reattach)) != NULL) { 2780 WORKLIST_REMOVE(wk); 2781 WORKLIST_INSERT(&bp->b_dep, wk); 2782 } 2783#ifdef DEBUG 2784 if (lk.lkt_held != -2) 2785 panic("softdep_disk_write_complete: lock lost"); 2786 lk.lkt_held = -1; 2787#endif 2788} 2789 2790/* 2791 * Called from within softdep_disk_write_complete above. Note that 2792 * this routine is always called from interrupt level with further 2793 * splbio interrupts blocked. 
2794 */ 2795static void 2796handle_allocdirect_partdone(adp) 2797 struct allocdirect *adp; /* the completed allocdirect */ 2798{ 2799 struct allocdirect *listadp; 2800 struct inodedep *inodedep; 2801 long bsize; 2802 2803 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 2804 return; 2805 if (adp->ad_buf != NULL) 2806 panic("handle_allocdirect_partdone: dangling dep"); 2807 /* 2808 * The on-disk inode cannot claim to be any larger than the last 2809 * fragment that has been written. Otherwise, the on-disk inode 2810 * might have fragments that were not the last block in the file 2811 * which would corrupt the filesystem. Thus, we cannot free any 2812 * allocdirects after one whose ad_oldblkno claims a fragment as 2813 * these blocks must be rolled back to zero before writing the inode. 2814 * We check the currently active set of allocdirects in id_inoupdt. 2815 */ 2816 inodedep = adp->ad_inodedep; 2817 bsize = inodedep->id_fs->fs_bsize; 2818 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp; 2819 listadp = TAILQ_NEXT(listadp, ad_next)) { 2820 /* found our block */ 2821 if (listadp == adp) 2822 break; 2823 /* continue if ad_oldlbn is not a fragment */ 2824 if (listadp->ad_oldsize == 0 || 2825 listadp->ad_oldsize == bsize) 2826 continue; 2827 /* hit a fragment */ 2828 return; 2829 } 2830 /* 2831 * If we have reached the end of the current list without 2832 * finding the just finished dependency, then it must be 2833 * on the future dependency list. Future dependencies cannot 2834 * be freed until they are moved to the current list. 
2835 */ 2836 if (listadp == NULL) { 2837#ifdef DEBUG 2838 for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp; 2839 listadp = TAILQ_NEXT(listadp, ad_next)) 2840 /* found our block */ 2841 if (listadp == adp) 2842 break; 2843 if (listadp == NULL) 2844 panic("handle_allocdirect_partdone: lost dep"); 2845#endif /* DEBUG */ 2846 return; 2847 } 2848 /* 2849 * If we have found the just finished dependency, then free 2850 * it along with anything that follows it that is complete. 2851 */ 2852 for (; adp; adp = listadp) { 2853 listadp = TAILQ_NEXT(adp, ad_next); 2854 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 2855 return; 2856 free_allocdirect(&inodedep->id_inoupdt, adp, 1); 2857 } 2858} 2859 2860/* 2861 * Called from within softdep_disk_write_complete above. Note that 2862 * this routine is always called from interrupt level with further 2863 * splbio interrupts blocked. 2864 */ 2865static void 2866handle_allocindir_partdone(aip) 2867 struct allocindir *aip; /* the completed allocindir */ 2868{ 2869 struct indirdep *indirdep; 2870 2871 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 2872 return; 2873 if (aip->ai_buf != NULL) 2874 panic("handle_allocindir_partdone: dangling dependency"); 2875 indirdep = aip->ai_indirdep; 2876 if (indirdep->ir_state & UNDONE) { 2877 LIST_REMOVE(aip, ai_next); 2878 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 2879 return; 2880 } 2881 ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 2882 aip->ai_newblkno; 2883 LIST_REMOVE(aip, ai_next); 2884 if (aip->ai_freefrag != NULL) 2885 add_to_worklist(&aip->ai_freefrag->ff_list); 2886 WORKITEM_FREE(aip, M_ALLOCINDIR); 2887} 2888 2889/* 2890 * Called from within softdep_disk_write_complete above to restore 2891 * in-memory inode block contents to their most up-to-date state. Note 2892 * that this routine is always called from interrupt level with further 2893 * splbio interrupts blocked. 
2894 */ 2895static int 2896handle_written_inodeblock(inodedep, bp) 2897 struct inodedep *inodedep; 2898 struct buf *bp; /* buffer containing the inode block */ 2899{ 2900 struct pagedep *pagedep; 2901 struct worklist *wk, *filefree; 2902 struct allocdirect *adp, *nextadp; 2903 struct dinode *dp; 2904 struct diradd *dap; 2905 int hadchanges; 2906 2907 if ((inodedep->id_state & IOSTARTED) == 0) 2908 panic("handle_written_inodeblock: not started"); 2909 inodedep->id_state &= ~IOSTARTED; 2910 inodedep->id_state |= COMPLETE; 2911 dp = (struct dinode *)bp->b_data + 2912 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 2913 /* 2914 * If we had to rollback the inode allocation because of 2915 * bitmaps being incomplete, then simply restore it. 2916 * Keep the block dirty so that it will not be reclaimed until 2917 * all associated dependencies have been cleared and the 2918 * corresponding updates written to disk. 2919 */ 2920 if (inodedep->id_savedino != NULL) { 2921 *dp = *inodedep->id_savedino; 2922 FREE(inodedep->id_savedino, M_INODEDEP); 2923 inodedep->id_savedino = NULL; 2924 bdirty(bp); 2925 return (1); 2926 } 2927 /* 2928 * Roll forward anything that had to be rolled back before 2929 * the inode could be updated. 
2930 */ 2931 hadchanges = 0; 2932 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 2933 nextadp = TAILQ_NEXT(adp, ad_next); 2934 if (adp->ad_state & ATTACHED) 2935 panic("handle_written_inodeblock: new entry"); 2936 if (adp->ad_lbn < NDADDR) { 2937 if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) 2938 panic("%s: %s #%d mismatch %d != %d", 2939 "handle_written_inodeblock", 2940 "direct pointer", adp->ad_lbn, 2941 dp->di_db[adp->ad_lbn], adp->ad_oldblkno); 2942 dp->di_db[adp->ad_lbn] = adp->ad_newblkno; 2943 } else { 2944 if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) 2945 panic("%s: %s #%d allocated as %d", 2946 "handle_written_inodeblock", 2947 "indirect pointer", adp->ad_lbn - NDADDR, 2948 dp->di_ib[adp->ad_lbn - NDADDR]); 2949 dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; 2950 } 2951 adp->ad_state &= ~UNDONE; 2952 adp->ad_state |= ATTACHED; 2953 hadchanges = 1; 2954 } 2955 /* 2956 * Reset the file size to its most up-to-date value. 2957 */ 2958 if (inodedep->id_savedsize == -1) 2959 panic("handle_written_inodeblock: bad size"); 2960 if (dp->di_size != inodedep->id_savedsize) { 2961 dp->di_size = inodedep->id_savedsize; 2962 hadchanges = 1; 2963 } 2964 inodedep->id_savedsize = -1; 2965 /* 2966 * If there were any rollbacks in the inode block, then it must be 2967 * marked dirty so that its will eventually get written back in 2968 * its correct form. 2969 */ 2970 if (hadchanges) 2971 bdirty(bp); 2972 /* 2973 * Process any allocdirects that completed during the update. 2974 */ 2975 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 2976 handle_allocdirect_partdone(adp); 2977 /* 2978 * Process deallocations that were held pending until the 2979 * inode had been written to disk. Freeing of the inode 2980 * is delayed until after all blocks have been freed to 2981 * avoid creation of new <vfsid, inum, lbn> triples 2982 * before the old ones have been deleted. 
2983 */ 2984 filefree = NULL; 2985 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 2986 WORKLIST_REMOVE(wk); 2987 switch (wk->wk_type) { 2988 2989 case M_FREEFILE: 2990 /* 2991 * We defer adding filefree to the worklist until 2992 * all other additions have been made to ensure 2993 * that it will be done after all the old blocks 2994 * have been freed. 2995 */ 2996 if (filefree != NULL) 2997 panic("handle_written_inodeblock: filefree"); 2998 filefree = wk; 2999 continue; 3000 3001 case M_MKDIR: 3002 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 3003 continue; 3004 3005 case M_DIRADD: 3006 dap = WK_DIRADD(wk); 3007 dap->da_state |= COMPLETE; 3008 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3009 if (dap->da_state & DIRCHG) 3010 pagedep = dap->da_previous->dm_pagedep; 3011 else 3012 pagedep = dap->da_pagedep; 3013 LIST_REMOVE(dap, da_pdlist); 3014 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 3015 da_pdlist); 3016 } 3017 WORKLIST_INSERT(&inodedep->id_pendinghd, wk); 3018 continue; 3019 3020 case M_FREEBLKS: 3021 case M_FREEFRAG: 3022 case M_DIRREM: 3023 add_to_worklist(wk); 3024 continue; 3025 3026 default: 3027 panic("handle_written_inodeblock: Unknown type %s", 3028 TYPENAME(wk->wk_type)); 3029 /* NOTREACHED */ 3030 } 3031 } 3032 if (filefree != NULL) 3033 add_to_worklist(filefree); 3034 3035 /* 3036 * If no outstanding dependencies, free it. 3037 */ 3038 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0) 3039 return (0); 3040 return (hadchanges); 3041} 3042 3043/* 3044 * Handle the completion of a mkdir dependency. 
3045 */ 3046static void 3047handle_written_mkdir(mkdir, type) 3048 struct mkdir *mkdir; 3049 int type; 3050{ 3051 struct diradd *dap; 3052 struct pagedep *pagedep; 3053 3054 if (mkdir->md_state != type) 3055 panic("handle_written_mkdir: bad type"); 3056 dap = mkdir->md_diradd; 3057 dap->da_state &= ~type; 3058 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 3059 dap->da_state |= DEPCOMPLETE; 3060 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3061 if (dap->da_state & DIRCHG) 3062 pagedep = dap->da_previous->dm_pagedep; 3063 else 3064 pagedep = dap->da_pagedep; 3065 LIST_REMOVE(dap, da_pdlist); 3066 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 3067 } 3068 LIST_REMOVE(mkdir, md_mkdirs); 3069 WORKITEM_FREE(mkdir, M_MKDIR); 3070} 3071 3072/* 3073 * Called from within softdep_disk_write_complete above. 3074 * A write operation was just completed. Removed inodes can 3075 * now be freed and associated block pointers may be committed. 3076 * Note that this routine is always called from interrupt level 3077 * with further splbio interrupts blocked. 3078 */ 3079static int 3080handle_written_filepage(pagedep, bp) 3081 struct pagedep *pagedep; 3082 struct buf *bp; /* buffer containing the written page */ 3083{ 3084 struct dirrem *dirrem; 3085 struct diradd *dap, *nextdap; 3086 struct direct *ep; 3087 int i, chgs; 3088 3089 if ((pagedep->pd_state & IOSTARTED) == 0) 3090 panic("handle_written_filepage: not started"); 3091 pagedep->pd_state &= ~IOSTARTED; 3092 /* 3093 * Process any directory removals that have been committed. 3094 */ 3095 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 3096 LIST_REMOVE(dirrem, dm_next); 3097 dirrem->dm_dirinum = pagedep->pd_ino; 3098 add_to_worklist(&dirrem->dm_list); 3099 } 3100 /* 3101 * Free any directory additions that have been committed. 3102 */ 3103 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 3104 free_diradd(dap); 3105 /* 3106 * Uncommitted directory entries must be restored. 
3107 */ 3108 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 3109 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 3110 dap = nextdap) { 3111 nextdap = LIST_NEXT(dap, da_pdlist); 3112 if (dap->da_state & ATTACHED) 3113 panic("handle_written_filepage: attached"); 3114 ep = (struct direct *) 3115 ((char *)bp->b_data + dap->da_offset); 3116 ep->d_ino = dap->da_newinum; 3117 dap->da_state &= ~UNDONE; 3118 dap->da_state |= ATTACHED; 3119 chgs = 1; 3120 /* 3121 * If the inode referenced by the directory has 3122 * been written out, then the dependency can be 3123 * moved to the pending list. 3124 */ 3125 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 3126 LIST_REMOVE(dap, da_pdlist); 3127 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 3128 da_pdlist); 3129 } 3130 } 3131 } 3132 /* 3133 * If there were any rollbacks in the directory, then it must be 3134 * marked dirty so that its will eventually get written back in 3135 * its correct form. 3136 */ 3137 if (chgs) 3138 bdirty(bp); 3139 /* 3140 * If no dependencies remain, the pagedep will be freed. 3141 * Otherwise it will remain to update the page before it 3142 * is written back to disk. 3143 */ 3144 if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) { 3145 for (i = 0; i < DAHASHSZ; i++) 3146 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL) 3147 break; 3148 if (i == DAHASHSZ) { 3149 LIST_REMOVE(pagedep, pd_hash); 3150 WORKITEM_FREE(pagedep, M_PAGEDEP); 3151 return (0); 3152 } 3153 } 3154 return (1); 3155} 3156 3157/* 3158 * Writing back in-core inode structures. 3159 * 3160 * The file system only accesses an inode's contents when it occupies an 3161 * "in-core" inode structure. These "in-core" structures are separate from 3162 * the page frames used to cache inode blocks. Only the latter are 3163 * transferred to/from the disk. So, when the updated contents of the 3164 * "in-core" inode structure are copied to the corresponding in-memory inode 3165 * block, the dependencies are also transferred. 
The following procedure is 3166 * called when copying a dirty "in-core" inode to a cached inode block. 3167 */ 3168 3169/* 3170 * Called when an inode is loaded from disk. If the effective link count 3171 * differed from the actual link count when it was last flushed, then we 3172 * need to ensure that the correct effective link count is put back. 3173 */ 3174void 3175softdep_load_inodeblock(ip) 3176 struct inode *ip; /* the "in_core" copy of the inode */ 3177{ 3178 struct inodedep *inodedep; 3179 int error, gotit; 3180 3181 /* 3182 * Check for alternate nlink count. 3183 */ 3184 ip->i_effnlink = ip->i_nlink; 3185 ACQUIRE_LOCK(&lk); 3186 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { 3187 FREE_LOCK(&lk); 3188 return; 3189 } 3190 if (inodedep->id_nlinkdelta != 0) { 3191 ip->i_effnlink -= inodedep->id_nlinkdelta; 3192 inodedep->id_nlinkdelta = 0; 3193 (void) free_inodedep(inodedep); 3194 } 3195 FREE_LOCK(&lk); 3196} 3197 3198/* 3199 * This routine is called just before the "in-core" inode 3200 * information is to be copied to the in-memory inode block. 3201 * Recall that an inode block contains several inodes. If 3202 * the force flag is set, then the dependencies will be 3203 * cleared so that the update can always be made. Note that 3204 * the buffer is locked when this routine is called, so we 3205 * will never be in the middle of writing the inode block 3206 * to disk. 3207 */ 3208void 3209softdep_update_inodeblock(ip, bp, waitfor) 3210 struct inode *ip; /* the "in_core" copy of the inode */ 3211 struct buf *bp; /* the buffer containing the inode block */ 3212 int waitfor; /* 1 => update must be allowed */ 3213{ 3214 struct inodedep *inodedep; 3215 int error, gotit; 3216 3217 /* 3218 * If the effective link count is not equal to the actual link 3219 * count, then we must track the difference in an inodedep while 3220 * the inode is (potentially) tossed out of the cache. 
Otherwise, 3221 * if there is no existing inodedep, then there are no dependencies 3222 * to track. 3223 */ 3224 ACQUIRE_LOCK(&lk); 3225 if (ip->i_effnlink != ip->i_nlink) { 3226 (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, 3227 &inodedep); 3228 } else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { 3229 FREE_LOCK(&lk); 3230 return; 3231 } 3232 if (ip->i_nlink < ip->i_effnlink) 3233 panic("softdep_update_inodeblock: bad delta"); 3234 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 3235 /* 3236 * If the last remaining use for the inodedep was to track the 3237 * link count, and there is no difference between the effective 3238 * and actual link count, then we can free the inodedep. 3239 */ 3240 if (free_inodedep(inodedep)) { 3241 FREE_LOCK(&lk); 3242 return; 3243 } 3244 /* 3245 * Changes have been initiated. Anything depending on these 3246 * changes cannot occur until this inode has been written. 3247 */ 3248 inodedep->id_state &= ~COMPLETE; 3249 if ((inodedep->id_state & ONWORKLIST) == 0) 3250 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 3251 /* 3252 * Any new dependencies associated with the incore inode must 3253 * now be moved to the list associated with the buffer holding 3254 * the in-memory copy of the inode. Once merged process any 3255 * allocdirects that are completed by the merger. 3256 */ 3257 merge_inode_lists(inodedep); 3258 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) 3259 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); 3260 /* 3261 * Newly allocated inodes cannot be written until the bitmap 3262 * that allocates them have been written (indicated by 3263 * DEPCOMPLETE being set in id_state). If we are doing a 3264 * forced sync (e.g., an fsync on a file), we force the bitmap 3265 * to be written so that the update can be done. 
3266 */ 3267 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) { 3268 FREE_LOCK(&lk); 3269 return; 3270 } 3271 gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); 3272 FREE_LOCK(&lk); 3273 if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0) 3274 softdep_error("softdep_update_inodeblock: bwrite", error); 3275 if ((inodedep->id_state & DEPCOMPLETE) == 0) 3276 panic("softdep_update_inodeblock: update failed"); 3277} 3278 3279/* 3280 * Merge the new inode dependency list (id_newinoupdt) into the old 3281 * inode dependency list (id_inoupdt). This routine must be called 3282 * with splbio interrupts blocked. 3283 */ 3284static void 3285merge_inode_lists(inodedep) 3286 struct inodedep *inodedep; 3287{ 3288 struct allocdirect *listadp, *newadp; 3289 3290 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); 3291 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) { 3292 if (listadp->ad_lbn < newadp->ad_lbn) { 3293 listadp = TAILQ_NEXT(listadp, ad_next); 3294 continue; 3295 } 3296 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); 3297 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 3298 if (listadp->ad_lbn == newadp->ad_lbn) { 3299 allocdirect_merge(&inodedep->id_inoupdt, newadp, 3300 listadp); 3301 listadp = newadp; 3302 } 3303 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); 3304 } 3305 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) { 3306 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); 3307 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next); 3308 } 3309} 3310 3311/* 3312 * If we are doing an fsync, then we must ensure that any directory 3313 * entries for the inode have been written after the inode gets to disk. 
3314 */ 3315int 3316softdep_fsync(vp) 3317 struct vnode *vp; /* the "in_core" copy of the inode */ 3318{ 3319 struct diradd *dap, *olddap; 3320 struct inodedep *inodedep; 3321 struct pagedep *pagedep; 3322 struct worklist *wk; 3323 struct mount *mnt; 3324 struct vnode *pvp; 3325 struct inode *ip; 3326 struct buf *bp; 3327 struct fs *fs; 3328 struct proc *p = curproc; /* XXX */ 3329 int error, ret, flushparent; 3330 struct timeval tv; 3331 ino_t parentino; 3332 ufs_lbn_t lbn; 3333 3334 ip = VTOI(vp); 3335 fs = ip->i_fs; 3336 for (error = 0, flushparent = 0, olddap = NULL; ; ) { 3337 ACQUIRE_LOCK(&lk); 3338 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) 3339 break; 3340 if (LIST_FIRST(&inodedep->id_inowait) != NULL || 3341 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || 3342 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) 3343 panic("softdep_fsync: pending ops"); 3344 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 3345 break; 3346 if (wk->wk_type != M_DIRADD) 3347 panic("softdep_fsync: Unexpected type %s", 3348 TYPENAME(wk->wk_type)); 3349 dap = WK_DIRADD(wk); 3350 /* 3351 * If we have failed to get rid of all the dependencies 3352 * then something is seriously wrong. 3353 */ 3354 if (dap == olddap) 3355 panic("softdep_fsync: flush failed"); 3356 olddap = dap; 3357 /* 3358 * Flush our parent if this directory entry 3359 * has a MKDIR_PARENT dependency. 3360 */ 3361 if (dap->da_state & DIRCHG) 3362 pagedep = dap->da_previous->dm_pagedep; 3363 else 3364 pagedep = dap->da_pagedep; 3365 mnt = pagedep->pd_mnt; 3366 parentino = pagedep->pd_ino; 3367 lbn = pagedep->pd_lbn; 3368 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 3369 panic("softdep_fsync: dirty"); 3370 flushparent = dap->da_state & MKDIR_PARENT; 3371 /* 3372 * If we are being fsync'ed as part of vgone'ing this vnode, 3373 * then we will not be able to release and recover the 3374 * vnode below, so we just have to give up on writing its 3375 * directory entry out. 
It will eventually be written, just 3376 * not now, but then the user was not asking to have it 3377 * written, so we are not breaking any promises. 3378 */ 3379 if (vp->v_flag & VXLOCK) 3380 break; 3381 /* 3382 * We prevent deadlock by always fetching inodes from the 3383 * root, moving down the directory tree. Thus, when fetching 3384 * our parent directory, we must unlock ourselves before 3385 * requesting the lock on our parent. See the comment in 3386 * ufs_lookup for details on possible races. 3387 */ 3388 FREE_LOCK(&lk); 3389 VOP_UNLOCK(vp, 0, p); 3390 if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) { 3391 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 3392 return (error); 3393 } 3394 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 3395 if (flushparent) { 3396 tv = time; 3397 if (error = VOP_UPDATE(pvp, &tv, &tv, MNT_WAIT)) { 3398 vput(pvp); 3399 return (error); 3400 } 3401 } 3402 /* 3403 * Flush directory page containing the inode's name. 3404 */ 3405 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, 3406 &bp); 3407 vput(pvp); 3408 ret = VOP_BWRITE(bp); 3409 if (error != 0) 3410 return (error); 3411 if (ret != 0) 3412 return (ret); 3413 } 3414 FREE_LOCK(&lk); 3415 return (0); 3416} 3417 3418/* 3419 * This routine is called when we are trying to synchronously flush a 3420 * file. This routine must eliminate any filesystem metadata dependencies 3421 * so that the syncing routine can succeed by pushing the dirty blocks 3422 * associated with the file. If any I/O errors occur, they are returned. 
3423 */ 3424int 3425softdep_sync_metadata(ap) 3426 struct vop_fsync_args /* { 3427 struct vnode *a_vp; 3428 struct ucred *a_cred; 3429 int a_waitfor; 3430 struct proc *a_p; 3431 } */ *ap; 3432{ 3433 struct vnode *vp = ap->a_vp; 3434 struct pagedep *pagedep; 3435 struct allocdirect *adp; 3436 struct allocindir *aip; 3437 struct buf *bp, *nbp; 3438 struct worklist *wk; 3439 int i, error, waitfor; 3440 3441 /* 3442 * Check whether this vnode is involved in a filesystem 3443 * that is doing soft dependency processing. 3444 */ 3445 if (vp->v_type != VBLK) { 3446 if (!DOINGSOFTDEP(vp)) 3447 return (0); 3448 } else 3449 if (vp->v_specmountpoint == NULL || 3450 (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0) 3451 return (0); 3452 /* 3453 * Ensure that any direct block dependencies have been cleared. 3454 */ 3455 ACQUIRE_LOCK(&lk); 3456 if (error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number)) { 3457 FREE_LOCK(&lk); 3458 return (error); 3459 } 3460 /* 3461 * For most files, the only metadata dependencies are the 3462 * cylinder group maps that allocate their inode or blocks. 3463 * The block allocation dependencies can be found by traversing 3464 * the dependency lists for any buffers that remain on their 3465 * dirty buffer list. The inode allocation dependency will 3466 * be resolved when the inode is updated with MNT_WAIT. 3467 * This work is done in two passes. The first pass grabs most 3468 * of the buffers and begins asynchronously writing them. The 3469 * only way to wait for these asynchronous writes is to sleep 3470 * on the filesystem vnode which may stay busy for a long time 3471 * if the filesystem is active. So, instead, we make a second 3472 * pass over the dependencies blocking on each write. In the 3473 * usual case we will be blocking against a write that we 3474 * initiated, so when it is done the dependency will have been 3475 * resolved. Thus the second pass is expected to end quickly. 
3476 */ 3477 waitfor = MNT_NOWAIT; 3478top: 3479 if (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) { 3480 FREE_LOCK(&lk); 3481 return (0); 3482 } 3483 bp = LIST_FIRST(&vp->v_dirtyblkhd); 3484loop: 3485 /* 3486 * As we hold the buffer locked, none of its dependencies 3487 * will disappear. 3488 */ 3489 for (wk = LIST_FIRST(&bp->b_dep); wk; 3490 wk = LIST_NEXT(wk, wk_list)) { 3491 switch (wk->wk_type) { 3492 3493 case M_ALLOCDIRECT: 3494 adp = WK_ALLOCDIRECT(wk); 3495 if (adp->ad_state & DEPCOMPLETE) 3496 break; 3497 nbp = adp->ad_buf; 3498 if (getdirtybuf(&nbp, waitfor) == 0) 3499 break; 3500 FREE_LOCK(&lk); 3501 if (waitfor == MNT_NOWAIT) { 3502 bawrite(nbp); 3503 } else if ((error = VOP_BWRITE(nbp)) != 0) { 3504 bawrite(bp); 3505 return (error); 3506 } 3507 ACQUIRE_LOCK(&lk); 3508 break; 3509 3510 case M_ALLOCINDIR: 3511 aip = WK_ALLOCINDIR(wk); 3512 if (aip->ai_state & DEPCOMPLETE) 3513 break; 3514 nbp = aip->ai_buf; 3515 if (getdirtybuf(&nbp, waitfor) == 0) 3516 break; 3517 FREE_LOCK(&lk); 3518 if (waitfor == MNT_NOWAIT) { 3519 bawrite(nbp); 3520 } else if ((error = VOP_BWRITE(nbp)) != 0) { 3521 bawrite(bp); 3522 return (error); 3523 } 3524 ACQUIRE_LOCK(&lk); 3525 break; 3526 3527 case M_INDIRDEP: 3528 restart: 3529 for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd); 3530 aip; aip = LIST_NEXT(aip, ai_next)) { 3531 if (aip->ai_state & DEPCOMPLETE) 3532 continue; 3533 nbp = aip->ai_buf; 3534 if (getdirtybuf(&nbp, MNT_WAIT) == 0) 3535 goto restart; 3536 FREE_LOCK(&lk); 3537 if ((error = VOP_BWRITE(nbp)) != 0) { 3538 bawrite(bp); 3539 return (error); 3540 } 3541 ACQUIRE_LOCK(&lk); 3542 goto restart; 3543 } 3544 break; 3545 3546 case M_INODEDEP: 3547 if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs, 3548 WK_INODEDEP(wk)->id_ino)) != 0) { 3549 FREE_LOCK(&lk); 3550 bawrite(bp); 3551 return (error); 3552 } 3553 break; 3554 3555 case M_PAGEDEP: 3556 /* 3557 * We are trying to sync a directory that may 3558 * have dependencies on both its own 
metadata 3559 * and/or dependencies on the inodes of any 3560 * recently allocated files. We walk its diradd 3561 * lists pushing out the associated inode. 3562 */ 3563 pagedep = WK_PAGEDEP(wk); 3564 for (i = 0; i < DAHASHSZ; i++) { 3565 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 3566 continue; 3567 if (error = flush_pagedep_deps(vp, 3568 pagedep->pd_mnt, &pagedep->pd_diraddhd[i])) { 3569 FREE_LOCK(&lk); 3570 bawrite(bp); 3571 return (error); 3572 } 3573 } 3574 break; 3575 3576 default: 3577 panic("softdep_sync_metadata: Unknown type %s", 3578 TYPENAME(wk->wk_type)); 3579 /* NOTREACHED */ 3580 } 3581 } 3582 (void) getdirtybuf(&LIST_NEXT(bp, b_vnbufs), MNT_WAIT); 3583 nbp = LIST_NEXT(bp, b_vnbufs); 3584 FREE_LOCK(&lk); 3585 bawrite(bp); 3586 ACQUIRE_LOCK(&lk); 3587 if (nbp != NULL) { 3588 bp = nbp; 3589 goto loop; 3590 } 3591 /* 3592 * We must wait for any I/O in progress to finish so that 3593 * all potential buffers on the dirty list will be visible. 3594 * Once they are all there, proceed with the second pass 3595 * which will wait for the I/O as per above. 3596 */ 3597 while (vp->v_numoutput) { 3598 vp->v_flag |= VBWAIT; 3599 FREE_LOCK_INTERLOCKED(&lk); 3600 sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); 3601 ACQUIRE_LOCK_INTERLOCKED(&lk); 3602 } 3603 /* 3604 * The brief unlock is to allow any pent up dependency 3605 * processing to be done. 3606 */ 3607 if (waitfor == MNT_NOWAIT) { 3608 waitfor = MNT_WAIT; 3609 FREE_LOCK(&lk); 3610 ACQUIRE_LOCK(&lk); 3611 goto top; 3612 } 3613 3614 /* 3615 * If we have managed to get rid of all the dirty buffers, 3616 * then we are done. For certain directories and block 3617 * devices, we may need to do further work. 
3618 */ 3619 if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 3620 FREE_LOCK(&lk); 3621 return (0); 3622 } 3623 3624 FREE_LOCK(&lk); 3625 /* 3626 * If we are trying to sync a block device, some of its buffers may 3627 * contain metadata that cannot be written until the contents of some 3628 * partially written files have been written to disk. The only easy 3629 * way to accomplish this is to sync the entire filesystem (luckily 3630 * this happens rarely). 3631 */ 3632 if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) && 3633 (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred, 3634 ap->a_p)) != 0) 3635 return (error); 3636 return (0); 3637} 3638 3639/* 3640 * Flush the dependencies associated with an inodedep. 3641 * Called with splbio blocked. 3642 */ 3643static int 3644flush_inodedep_deps(fs, ino) 3645 struct fs *fs; 3646 ino_t ino; 3647{ 3648 struct inodedep *inodedep; 3649 struct allocdirect *adp; 3650 int error, waitfor; 3651 struct buf *bp; 3652 3653 /* 3654 * This work is done in two passes. The first pass grabs most 3655 * of the buffers and begins asynchronously writing them. The 3656 * only way to wait for these asynchronous writes is to sleep 3657 * on the filesystem vnode which may stay busy for a long time 3658 * if the filesystem is active. So, instead, we make a second 3659 * pass over the dependencies blocking on each write. In the 3660 * usual case we will be blocking against a write that we 3661 * initiated, so when it is done the dependency will have been 3662 * resolved. Thus the second pass is expected to end quickly. 3663 * We give a brief window at the top of the loop to allow 3664 * any pending I/O to complete. 
3665 */ 3666 for (waitfor = MNT_NOWAIT; ; ) { 3667 FREE_LOCK(&lk); 3668 ACQUIRE_LOCK(&lk); 3669 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) 3670 return (0); 3671 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 3672 adp = TAILQ_NEXT(adp, ad_next)) { 3673 if (adp->ad_state & DEPCOMPLETE) 3674 continue; 3675 bp = adp->ad_buf; 3676 if (getdirtybuf(&bp, waitfor) == 0) 3677 break; 3678 FREE_LOCK(&lk); 3679 if (waitfor == MNT_NOWAIT) { 3680 bawrite(bp); 3681 } else if ((error = VOP_BWRITE(bp)) != 0) { 3682 ACQUIRE_LOCK(&lk); 3683 return (error); 3684 } 3685 ACQUIRE_LOCK(&lk); 3686 break; 3687 } 3688 if (adp != NULL) 3689 continue; 3690 for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp; 3691 adp = TAILQ_NEXT(adp, ad_next)) { 3692 if (adp->ad_state & DEPCOMPLETE) 3693 continue; 3694 bp = adp->ad_buf; 3695 if (getdirtybuf(&bp, waitfor) == 0) 3696 break; 3697 FREE_LOCK(&lk); 3698 if (waitfor == MNT_NOWAIT) { 3699 bawrite(bp); 3700 } else if ((error = VOP_BWRITE(bp)) != 0) { 3701 ACQUIRE_LOCK(&lk); 3702 return (error); 3703 } 3704 ACQUIRE_LOCK(&lk); 3705 break; 3706 } 3707 if (adp != NULL) 3708 continue; 3709 /* 3710 * If pass2, we are done, otherwise do pass 2. 3711 */ 3712 if (waitfor == MNT_WAIT) 3713 break; 3714 waitfor = MNT_WAIT; 3715 } 3716 /* 3717 * Try freeing inodedep in case all dependencies have been removed. 3718 */ 3719 if (inodedep_lookup(fs, ino, 0, &inodedep) != 0) 3720 (void) free_inodedep(inodedep); 3721 return (0); 3722} 3723 3724/* 3725 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 3726 * Called with splbio blocked. 
3727 */ 3728static int 3729flush_pagedep_deps(pvp, mp, diraddhdp) 3730 struct vnode *pvp; 3731 struct mount *mp; 3732 struct diraddhd *diraddhdp; 3733{ 3734 struct proc *p = curproc; /* XXX */ 3735 struct inodedep *inodedep; 3736 struct ufsmount *ump; 3737 struct diradd *dap; 3738 struct timeval tv; 3739 struct vnode *vp; 3740 int gotit, error; 3741 struct buf *bp; 3742 ino_t inum; 3743 3744 ump = VFSTOUFS(mp); 3745 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 3746 /* 3747 * Flush ourselves if this directory entry 3748 * has a MKDIR_PARENT dependency. 3749 */ 3750 if (dap->da_state & MKDIR_PARENT) { 3751 tv = time; 3752 FREE_LOCK(&lk); 3753 if (error = VOP_UPDATE(pvp, &tv, &tv, MNT_WAIT)) 3754 break; 3755 ACQUIRE_LOCK(&lk); 3756 /* 3757 * If that cleared dependencies, go on to next. 3758 */ 3759 if (dap != LIST_FIRST(diraddhdp)) 3760 continue; 3761 if (dap->da_state & MKDIR_PARENT) 3762 panic("flush_pagedep_deps: MKDIR"); 3763 } 3764 /* 3765 * Flush the file on which the directory entry depends. 3766 * If the inode has already been pushed out of the cache, 3767 * then all the block dependencies will have been flushed 3768 * leaving only inode dependencies (e.g., bitmaps). Thus, 3769 * we do a ufs_ihashget to check for the vnode in the cache. 3770 * If it is there, we do a full flush. If it is no longer 3771 * there we need only dispose of any remaining bitmap 3772 * dependencies and write the inode to disk. 3773 */ 3774 inum = dap->da_newinum; 3775 FREE_LOCK(&lk); 3776 if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) { 3777 ACQUIRE_LOCK(&lk); 3778 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0 3779 && dap == LIST_FIRST(diraddhdp)) 3780 panic("flush_pagedep_deps: flush 1 failed"); 3781 /* 3782 * If the inode still has bitmap dependencies, 3783 * push them to disk. 
3784 */ 3785 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 3786 gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT); 3787 FREE_LOCK(&lk); 3788 if (gotit && 3789 (error = VOP_BWRITE(inodedep->id_buf)) != 0) 3790 break; 3791 ACQUIRE_LOCK(&lk); 3792 } 3793 if (dap != LIST_FIRST(diraddhdp)) 3794 continue; 3795 /* 3796 * If the inode is still sitting in a buffer waiting 3797 * to be written, push it to disk. 3798 */ 3799 FREE_LOCK(&lk); 3800 if ((error = bread(ump->um_devvp, 3801 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), 3802 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) 3803 break; 3804 if ((error = VOP_BWRITE(bp)) != 0) 3805 break; 3806 ACQUIRE_LOCK(&lk); 3807 if (dap == LIST_FIRST(diraddhdp)) 3808 panic("flush_pagedep_deps: flush 2 failed"); 3809 continue; 3810 } 3811 if (vp->v_type == VDIR) { 3812 /* 3813 * A newly allocated directory must have its "." and 3814 * ".." entries written out before its name can be 3815 * committed in its parent. We do not want or need 3816 * the full semantics of a synchronous VOP_FSYNC as 3817 * that may end up here again, once for each directory 3818 * level in the filesystem. Instead, we push the blocks 3819 * and wait for them to clear. 3820 */ 3821 if (error = VOP_FSYNC(vp, p->p_cred, MNT_NOWAIT, p)) { 3822 vput(vp); 3823 break; 3824 } 3825 ACQUIRE_LOCK(&lk); 3826 while (vp->v_numoutput) { 3827 vp->v_flag |= VBWAIT; 3828 FREE_LOCK_INTERLOCKED(&lk); 3829 sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); 3830 ACQUIRE_LOCK_INTERLOCKED(&lk); 3831 } 3832 FREE_LOCK(&lk); 3833 } 3834 tv = time; 3835 error = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT); 3836 vput(vp); 3837 if (error) 3838 break; 3839 /* 3840 * If we have failed to get rid of all the dependencies 3841 * then something is seriously wrong. 
3842 */ 3843 if (dap == LIST_FIRST(diraddhdp)) 3844 panic("flush_pagedep_deps: flush 3 failed"); 3845 ACQUIRE_LOCK(&lk); 3846 } 3847 if (error) 3848 ACQUIRE_LOCK(&lk); 3849 return (error); 3850} 3851 3852/* 3853 * Acquire exclusive access to a buffer. 3854 * Must be called with splbio blocked. 3855 * Return 1 if buffer was acquired. 3856 */ 3857static int 3858getdirtybuf(bpp, waitfor) 3859 struct buf **bpp; 3860 int waitfor; 3861{ 3862 struct buf *bp; 3863 3864 for (;;) { 3865 if ((bp = *bpp) == NULL) 3866 return (0); 3867 if ((bp->b_flags & B_BUSY) == 0) 3868 break; 3869 if (waitfor != MNT_WAIT) 3870 return (0); 3871 bp->b_flags |= B_WANTED; 3872 FREE_LOCK_INTERLOCKED(&lk); 3873 sleep((caddr_t)bp, PRIBIO + 1); 3874 ACQUIRE_LOCK_INTERLOCKED(&lk); 3875 } 3876 if ((bp->b_flags & B_DELWRI) == 0) 3877 return (0); 3878 bremfree(bp); 3879 bp->b_flags |= B_BUSY; 3880 return (1); 3881} 3882 3883/* 3884 * Called whenever a buffer that is being invalidated or reallocated 3885 * contains dependencies. This should only happen if an I/O error has 3886 * occurred. The routine is called with the buffer locked. 3887 */ 3888void 3889softdep_deallocate_dependencies(bp) 3890 struct buf *bp; 3891{ 3892 struct worklist *wk; 3893 3894 if ((bp->b_flags & B_ERROR) == 0) 3895 panic("softdep_deallocate_dependencies: dangling deps"); 3896 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 3897 ACQUIRE_LOCK(&lk); 3898 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 3899 WORKLIST_REMOVE(wk); 3900 FREE_LOCK(&lk); 3901 switch (wk->wk_type) { 3902 /* 3903 * XXX - should really clean up, but for now we will 3904 * just leak memory and not worry about it. Also should 3905 * mark the filesystem permanently dirty so that it will 3906 * force fsck to be run (though this would best be done 3907 * in the mainline code). 
3908 */ 3909 case M_PAGEDEP: 3910 case M_INODEDEP: 3911 case M_BMSAFEMAP: 3912 case M_ALLOCDIRECT: 3913 case M_INDIRDEP: 3914 case M_ALLOCINDIR: 3915 case M_MKDIR: 3916#ifdef DEBUG 3917 printf("Lost type %s\n", TYPENAME(wk->wk_type)); 3918#endif 3919 break; 3920 default: 3921 panic("%s: Unexpected type %s", 3922 "softdep_deallocate_dependencies", 3923 TYPENAME(wk->wk_type)); 3924 /* NOTREACHED */ 3925 } 3926 ACQUIRE_LOCK(&lk); 3927 } 3928 FREE_LOCK(&lk); 3929} 3930 3931/* 3932 * Function to handle asynchronous write errors in the filesystem. 3933 */ 3934void 3935softdep_error(func, error) 3936 char *func; 3937 int error; 3938{ 3939 3940 /* XXX should do something better! */ 3941 log(LOG_ERR, "%s: got error %d while accessing filesystem\n", 3942 func, error); 3943} 3944