/* ffs_softdep.c revision 231091 */
1/*- 2 * Copyright 1998, 2000 Marshall Kirk McKusick. 3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> 4 * All rights reserved. 5 * 6 * The soft updates code is derived from the appendix of a University 7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 8 * "Soft Updates: A Solution to the Metadata Update Problem in File 9 * Systems", CSE-TR-254-95, August 1995). 10 * 11 * Further information about soft updates can be obtained from: 12 * 13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 14 * 1614 Oxford Street mckusick@mckusick.com 15 * Berkeley, CA 94709-1608 +1-510-843-9542 16 * USA 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 * 39 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 40 */ 41 42#include <sys/cdefs.h> 43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 231091 2012-02-06 17:59:14Z kib $"); 44 45#include "opt_ffs.h" 46#include "opt_quota.h" 47#include "opt_ddb.h" 48 49/* 50 * For now we want the safety net that the DEBUG flag provides. 51 */ 52#ifndef DEBUG 53#define DEBUG 54#endif 55 56#include <sys/param.h> 57#include <sys/kernel.h> 58#include <sys/systm.h> 59#include <sys/bio.h> 60#include <sys/buf.h> 61#include <sys/kdb.h> 62#include <sys/kthread.h> 63#include <sys/limits.h> 64#include <sys/lock.h> 65#include <sys/malloc.h> 66#include <sys/mount.h> 67#include <sys/mutex.h> 68#include <sys/namei.h> 69#include <sys/priv.h> 70#include <sys/proc.h> 71#include <sys/stat.h> 72#include <sys/sysctl.h> 73#include <sys/syslog.h> 74#include <sys/vnode.h> 75#include <sys/conf.h> 76 77#include <ufs/ufs/dir.h> 78#include <ufs/ufs/extattr.h> 79#include <ufs/ufs/quota.h> 80#include <ufs/ufs/inode.h> 81#include <ufs/ufs/ufsmount.h> 82#include <ufs/ffs/fs.h> 83#include <ufs/ffs/softdep.h> 84#include <ufs/ffs/ffs_extern.h> 85#include <ufs/ufs/ufs_extern.h> 86 87#include <vm/vm.h> 88#include <vm/vm_extern.h> 89#include <vm/vm_object.h> 90 91#include <ddb/ddb.h> 92 93#ifndef SOFTUPDATES 94 95int 96softdep_flushfiles(oldmnt, flags, td) 97 struct mount *oldmnt; 98 int flags; 99 struct thread *td; 100{ 101 102 panic("softdep_flushfiles called"); 103} 104 105int 106softdep_mount(devvp, mp, fs, cred) 107 struct vnode *devvp; 108 struct mount *mp; 109 struct fs *fs; 110 struct ucred *cred; 111{ 112 113 return (0); 114} 115 116void 117softdep_initialize() 118{ 119 120 return; 121} 122 123void 124softdep_uninitialize() 125{ 126 127 return; 128} 129 130void 131softdep_unmount(mp) 132 struct mount *mp; 133{ 134 135} 136 137void 138softdep_setup_sbupdate(ump, fs, bp) 139 struct ufsmount *ump; 140 struct fs *fs; 141 struct buf *bp; 142{ 143} 144 145void 146softdep_setup_inomapdep(bp, ip, newinum, 
mode) 147 struct buf *bp; 148 struct inode *ip; 149 ino_t newinum; 150 int mode; 151{ 152 153 panic("softdep_setup_inomapdep called"); 154} 155 156void 157softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 158 struct buf *bp; 159 struct mount *mp; 160 ufs2_daddr_t newblkno; 161 int frags; 162 int oldfrags; 163{ 164 165 panic("softdep_setup_blkmapdep called"); 166} 167 168void 169softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 170 struct inode *ip; 171 ufs_lbn_t lbn; 172 ufs2_daddr_t newblkno; 173 ufs2_daddr_t oldblkno; 174 long newsize; 175 long oldsize; 176 struct buf *bp; 177{ 178 179 panic("softdep_setup_allocdirect called"); 180} 181 182void 183softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 184 struct inode *ip; 185 ufs_lbn_t lbn; 186 ufs2_daddr_t newblkno; 187 ufs2_daddr_t oldblkno; 188 long newsize; 189 long oldsize; 190 struct buf *bp; 191{ 192 193 panic("softdep_setup_allocext called"); 194} 195 196void 197softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 198 struct inode *ip; 199 ufs_lbn_t lbn; 200 struct buf *bp; 201 int ptrno; 202 ufs2_daddr_t newblkno; 203 ufs2_daddr_t oldblkno; 204 struct buf *nbp; 205{ 206 207 panic("softdep_setup_allocindir_page called"); 208} 209 210void 211softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 212 struct buf *nbp; 213 struct inode *ip; 214 struct buf *bp; 215 int ptrno; 216 ufs2_daddr_t newblkno; 217{ 218 219 panic("softdep_setup_allocindir_meta called"); 220} 221 222void 223softdep_journal_freeblocks(ip, cred, length, flags) 224 struct inode *ip; 225 struct ucred *cred; 226 off_t length; 227 int flags; 228{ 229 230 panic("softdep_journal_freeblocks called"); 231} 232 233void 234softdep_journal_fsync(ip) 235 struct inode *ip; 236{ 237 238 panic("softdep_journal_fsync called"); 239} 240 241void 242softdep_setup_freeblocks(ip, length, flags) 243 struct inode *ip; 244 off_t length; 245 int flags; 246{ 247 248 
panic("softdep_setup_freeblocks called"); 249} 250 251void 252softdep_freefile(pvp, ino, mode) 253 struct vnode *pvp; 254 ino_t ino; 255 int mode; 256{ 257 258 panic("softdep_freefile called"); 259} 260 261int 262softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 263 struct buf *bp; 264 struct inode *dp; 265 off_t diroffset; 266 ino_t newinum; 267 struct buf *newdirbp; 268 int isnewblk; 269{ 270 271 panic("softdep_setup_directory_add called"); 272} 273 274void 275softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 276 struct buf *bp; 277 struct inode *dp; 278 caddr_t base; 279 caddr_t oldloc; 280 caddr_t newloc; 281 int entrysize; 282{ 283 284 panic("softdep_change_directoryentry_offset called"); 285} 286 287void 288softdep_setup_remove(bp, dp, ip, isrmdir) 289 struct buf *bp; 290 struct inode *dp; 291 struct inode *ip; 292 int isrmdir; 293{ 294 295 panic("softdep_setup_remove called"); 296} 297 298void 299softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 300 struct buf *bp; 301 struct inode *dp; 302 struct inode *ip; 303 ino_t newinum; 304 int isrmdir; 305{ 306 307 panic("softdep_setup_directory_change called"); 308} 309 310void 311softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 312 struct mount *mp; 313 struct buf *bp; 314 ufs2_daddr_t blkno; 315 int frags; 316 struct workhead *wkhd; 317{ 318 319 panic("%s called", __FUNCTION__); 320} 321 322void 323softdep_setup_inofree(mp, bp, ino, wkhd) 324 struct mount *mp; 325 struct buf *bp; 326 ino_t ino; 327 struct workhead *wkhd; 328{ 329 330 panic("%s called", __FUNCTION__); 331} 332 333void 334softdep_setup_unlink(dp, ip) 335 struct inode *dp; 336 struct inode *ip; 337{ 338 339 panic("%s called", __FUNCTION__); 340} 341 342void 343softdep_setup_link(dp, ip) 344 struct inode *dp; 345 struct inode *ip; 346{ 347 348 panic("%s called", __FUNCTION__); 349} 350 351void 352softdep_revert_link(dp, ip) 353 struct inode *dp; 354 struct inode *ip; 355{ 356 357 
panic("%s called", __FUNCTION__); 358} 359 360void 361softdep_setup_rmdir(dp, ip) 362 struct inode *dp; 363 struct inode *ip; 364{ 365 366 panic("%s called", __FUNCTION__); 367} 368 369void 370softdep_revert_rmdir(dp, ip) 371 struct inode *dp; 372 struct inode *ip; 373{ 374 375 panic("%s called", __FUNCTION__); 376} 377 378void 379softdep_setup_create(dp, ip) 380 struct inode *dp; 381 struct inode *ip; 382{ 383 384 panic("%s called", __FUNCTION__); 385} 386 387void 388softdep_revert_create(dp, ip) 389 struct inode *dp; 390 struct inode *ip; 391{ 392 393 panic("%s called", __FUNCTION__); 394} 395 396void 397softdep_setup_mkdir(dp, ip) 398 struct inode *dp; 399 struct inode *ip; 400{ 401 402 panic("%s called", __FUNCTION__); 403} 404 405void 406softdep_revert_mkdir(dp, ip) 407 struct inode *dp; 408 struct inode *ip; 409{ 410 411 panic("%s called", __FUNCTION__); 412} 413 414void 415softdep_setup_dotdot_link(dp, ip) 416 struct inode *dp; 417 struct inode *ip; 418{ 419 420 panic("%s called", __FUNCTION__); 421} 422 423int 424softdep_prealloc(vp, waitok) 425 struct vnode *vp; 426 int waitok; 427{ 428 429 panic("%s called", __FUNCTION__); 430 431 return (0); 432} 433 434int 435softdep_journal_lookup(mp, vpp) 436 struct mount *mp; 437 struct vnode **vpp; 438{ 439 440 return (ENOENT); 441} 442 443void 444softdep_change_linkcnt(ip) 445 struct inode *ip; 446{ 447 448 panic("softdep_change_linkcnt called"); 449} 450 451void 452softdep_load_inodeblock(ip) 453 struct inode *ip; 454{ 455 456 panic("softdep_load_inodeblock called"); 457} 458 459void 460softdep_update_inodeblock(ip, bp, waitfor) 461 struct inode *ip; 462 struct buf *bp; 463 int waitfor; 464{ 465 466 panic("softdep_update_inodeblock called"); 467} 468 469int 470softdep_fsync(vp) 471 struct vnode *vp; /* the "in_core" copy of the inode */ 472{ 473 474 return (0); 475} 476 477void 478softdep_fsync_mountdev(vp) 479 struct vnode *vp; 480{ 481 482 return; 483} 484 485int 486softdep_flushworklist(oldmnt, countp, td) 487 
struct mount *oldmnt; 488 int *countp; 489 struct thread *td; 490{ 491 492 *countp = 0; 493 return (0); 494} 495 496int 497softdep_sync_metadata(struct vnode *vp) 498{ 499 500 return (0); 501} 502 503int 504softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 505{ 506 507 return (0); 508} 509 510int 511softdep_slowdown(vp) 512 struct vnode *vp; 513{ 514 515 panic("softdep_slowdown called"); 516} 517 518void 519softdep_releasefile(ip) 520 struct inode *ip; /* inode with the zero effective link count */ 521{ 522 523 panic("softdep_releasefile called"); 524} 525 526int 527softdep_request_cleanup(fs, vp, cred, resource) 528 struct fs *fs; 529 struct vnode *vp; 530 struct ucred *cred; 531 int resource; 532{ 533 534 return (0); 535} 536 537int 538softdep_check_suspend(struct mount *mp, 539 struct vnode *devvp, 540 int softdep_deps, 541 int softdep_accdeps, 542 int secondary_writes, 543 int secondary_accwrites) 544{ 545 struct bufobj *bo; 546 int error; 547 548 (void) softdep_deps, 549 (void) softdep_accdeps; 550 551 bo = &devvp->v_bufobj; 552 ASSERT_BO_LOCKED(bo); 553 554 MNT_ILOCK(mp); 555 while (mp->mnt_secondary_writes != 0) { 556 BO_UNLOCK(bo); 557 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 558 (PUSER - 1) | PDROP, "secwr", 0); 559 BO_LOCK(bo); 560 MNT_ILOCK(mp); 561 } 562 563 /* 564 * Reasons for needing more work before suspend: 565 * - Dirty buffers on devvp. 
566 * - Secondary writes occurred after start of vnode sync loop 567 */ 568 error = 0; 569 if (bo->bo_numoutput > 0 || 570 bo->bo_dirty.bv_cnt > 0 || 571 secondary_writes != 0 || 572 mp->mnt_secondary_writes != 0 || 573 secondary_accwrites != mp->mnt_secondary_accwrites) 574 error = EAGAIN; 575 BO_UNLOCK(bo); 576 return (error); 577} 578 579void 580softdep_get_depcounts(struct mount *mp, 581 int *softdepactivep, 582 int *softdepactiveaccp) 583{ 584 (void) mp; 585 *softdepactivep = 0; 586 *softdepactiveaccp = 0; 587} 588 589void 590softdep_buf_append(bp, wkhd) 591 struct buf *bp; 592 struct workhead *wkhd; 593{ 594 595 panic("softdep_buf_appendwork called"); 596} 597 598void 599softdep_inode_append(ip, cred, wkhd) 600 struct inode *ip; 601 struct ucred *cred; 602 struct workhead *wkhd; 603{ 604 605 panic("softdep_inode_appendwork called"); 606} 607 608void 609softdep_freework(wkhd) 610 struct workhead *wkhd; 611{ 612 613 panic("softdep_freework called"); 614} 615 616#else 617 618FEATURE(softupdates, "FFS soft-updates support"); 619 620/* 621 * These definitions need to be adapted to the system to which 622 * this file is being ported. 
623 */ 624 625#define M_SOFTDEP_FLAGS (M_WAITOK) 626 627#define D_PAGEDEP 0 628#define D_INODEDEP 1 629#define D_BMSAFEMAP 2 630#define D_NEWBLK 3 631#define D_ALLOCDIRECT 4 632#define D_INDIRDEP 5 633#define D_ALLOCINDIR 6 634#define D_FREEFRAG 7 635#define D_FREEBLKS 8 636#define D_FREEFILE 9 637#define D_DIRADD 10 638#define D_MKDIR 11 639#define D_DIRREM 12 640#define D_NEWDIRBLK 13 641#define D_FREEWORK 14 642#define D_FREEDEP 15 643#define D_JADDREF 16 644#define D_JREMREF 17 645#define D_JMVREF 18 646#define D_JNEWBLK 19 647#define D_JFREEBLK 20 648#define D_JFREEFRAG 21 649#define D_JSEG 22 650#define D_JSEGDEP 23 651#define D_SBDEP 24 652#define D_JTRUNC 25 653#define D_JFSYNC 26 654#define D_SENTINAL 27 655#define D_LAST D_SENTINAL 656 657unsigned long dep_current[D_LAST + 1]; 658unsigned long dep_total[D_LAST + 1]; 659unsigned long dep_write[D_LAST + 1]; 660 661 662static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, 663 "soft updates stats"); 664static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, 665 "total dependencies allocated"); 666static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, 667 "current dependencies allocated"); 668static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0, 669 "current dependencies written"); 670 671#define SOFTDEP_TYPE(type, str, long) \ 672 static MALLOC_DEFINE(M_ ## type, #str, long); \ 673 SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ 674 &dep_total[D_ ## type], 0, ""); \ 675 SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 676 &dep_current[D_ ## type], 0, ""); \ 677 SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ 678 &dep_write[D_ ## type], 0, ""); 679 680SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 681SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 682SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 683 "Block or frag allocated from cyl group map"); 684SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation 
dependency"); 685SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 686SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 687SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 688SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); 689SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 690SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 691SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 692SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 693SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 694SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 695SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 696SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 697SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 698SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 699SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 700SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 701SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 702SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 703SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 704SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 705SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 706SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 707SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); 708 709static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); 710static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 711 712/* 713 * translate from workitem type to memory type 714 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 715 */ 716static struct malloc_type *memtype[] = { 717 M_PAGEDEP, 718 M_INODEDEP, 719 M_BMSAFEMAP, 720 M_NEWBLK, 721 M_ALLOCDIRECT, 722 M_INDIRDEP, 723 M_ALLOCINDIR, 724 M_FREEFRAG, 725 M_FREEBLKS, 726 M_FREEFILE, 727 M_DIRADD, 728 M_MKDIR, 729 M_DIRREM, 730 M_NEWDIRBLK, 731 
M_FREEWORK, 732 M_FREEDEP, 733 M_JADDREF, 734 M_JREMREF, 735 M_JMVREF, 736 M_JNEWBLK, 737 M_JFREEBLK, 738 M_JFREEFRAG, 739 M_JSEG, 740 M_JSEGDEP, 741 M_SBDEP, 742 M_JTRUNC, 743 M_JFSYNC 744}; 745 746static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; 747 748#define DtoM(type) (memtype[type]) 749 750/* 751 * Names of malloc types. 752 */ 753#define TYPENAME(type) \ 754 ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") 755/* 756 * End system adaptation definitions. 757 */ 758 759#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 760#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 761 762/* 763 * Forward declarations. 764 */ 765struct inodedep_hashhead; 766struct newblk_hashhead; 767struct pagedep_hashhead; 768struct bmsafemap_hashhead; 769 770/* 771 * Internal function prototypes. 772 */ 773static void softdep_error(char *, int); 774static void drain_output(struct vnode *); 775static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 776static void clear_remove(struct thread *); 777static void clear_inodedeps(struct thread *); 778static void unlinked_inodedep(struct mount *, struct inodedep *); 779static void clear_unlinked_inodedep(struct inodedep *); 780static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 781static int flush_pagedep_deps(struct vnode *, struct mount *, 782 struct diraddhd *); 783static int free_pagedep(struct pagedep *); 784static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 785static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); 786static int flush_deplist(struct allocdirectlst *, int, int *); 787static int sync_cgs(struct mount *, int); 788static int handle_written_filepage(struct pagedep *, struct buf *); 789static int handle_written_sbdep(struct sbdep *, struct buf *); 790static void initiate_write_sbdep(struct sbdep *); 791static void diradd_inode_written(struct diradd *, struct inodedep *); 792static int handle_written_indirdep(struct indirdep *, 
struct buf *, 793 struct buf**); 794static int handle_written_inodeblock(struct inodedep *, struct buf *); 795static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, 796 uint8_t *); 797static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); 798static void handle_written_jaddref(struct jaddref *); 799static void handle_written_jremref(struct jremref *); 800static void handle_written_jseg(struct jseg *, struct buf *); 801static void handle_written_jnewblk(struct jnewblk *); 802static void handle_written_jblkdep(struct jblkdep *); 803static void handle_written_jfreefrag(struct jfreefrag *); 804static void complete_jseg(struct jseg *); 805static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); 806static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 807static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 808static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 809static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 810static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); 811static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 812static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 813static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 814static inline void inoref_write(struct inoref *, struct jseg *, 815 struct jrefrec *); 816static void handle_allocdirect_partdone(struct allocdirect *, 817 struct workhead *); 818static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, 819 struct workhead *); 820static void indirdep_complete(struct indirdep *); 821static int indirblk_lookup(struct mount *, ufs2_daddr_t); 822static void indirblk_insert(struct freework *); 823static void indirblk_remove(struct freework *); 824static void handle_allocindir_partdone(struct allocindir *); 825static void initiate_write_filepage(struct pagedep *, struct buf *); 826static void 
initiate_write_indirdep(struct indirdep*, struct buf *); 827static void handle_written_mkdir(struct mkdir *, int); 828static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, 829 uint8_t *); 830static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 831static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 832static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 833static void handle_workitem_freefile(struct freefile *); 834static int handle_workitem_remove(struct dirrem *, int); 835static struct dirrem *newdirrem(struct buf *, struct inode *, 836 struct inode *, int, struct dirrem **); 837static struct indirdep *indirdep_lookup(struct mount *, struct inode *, 838 struct buf *); 839static void cancel_indirdep(struct indirdep *, struct buf *, 840 struct freeblks *); 841static void free_indirdep(struct indirdep *); 842static void free_diradd(struct diradd *, struct workhead *); 843static void merge_diradd(struct inodedep *, struct diradd *); 844static void complete_diradd(struct diradd *); 845static struct diradd *diradd_lookup(struct pagedep *, int); 846static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 847 struct jremref *); 848static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 849 struct jremref *); 850static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 851 struct jremref *, struct jremref *); 852static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 853 struct jremref *); 854static void cancel_allocindir(struct allocindir *, struct buf *bp, 855 struct freeblks *, int); 856static int setup_trunc_indir(struct freeblks *, struct inode *, 857 ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); 858static void complete_trunc_indir(struct freework *); 859static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, 860 int); 861static void complete_mkdir(struct mkdir *); 862static void 
free_newdirblk(struct newdirblk *); 863static void free_jremref(struct jremref *); 864static void free_jaddref(struct jaddref *); 865static void free_jsegdep(struct jsegdep *); 866static void free_jsegs(struct jblocks *); 867static void rele_jseg(struct jseg *); 868static void free_jseg(struct jseg *, struct jblocks *); 869static void free_jnewblk(struct jnewblk *); 870static void free_jblkdep(struct jblkdep *); 871static void free_jfreefrag(struct jfreefrag *); 872static void free_freedep(struct freedep *); 873static void journal_jremref(struct dirrem *, struct jremref *, 874 struct inodedep *); 875static void cancel_jnewblk(struct jnewblk *, struct workhead *); 876static int cancel_jaddref(struct jaddref *, struct inodedep *, 877 struct workhead *); 878static void cancel_jfreefrag(struct jfreefrag *); 879static inline void setup_freedirect(struct freeblks *, struct inode *, 880 int, int); 881static inline void setup_freeext(struct freeblks *, struct inode *, int, int); 882static inline void setup_freeindir(struct freeblks *, struct inode *, int, 883 ufs_lbn_t, int); 884static inline struct freeblks *newfreeblks(struct mount *, struct inode *); 885static void freeblks_free(struct ufsmount *, struct freeblks *, int); 886static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); 887ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); 888static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); 889static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, 890 int, int); 891static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); 892static int cancel_pagedep(struct pagedep *, struct freeblks *, int); 893static int deallocate_dependencies(struct buf *, struct freeblks *, int); 894static void newblk_freefrag(struct newblk*); 895static void free_newblk(struct newblk *); 896static void cancel_allocdirect(struct allocdirectlst *, 897 struct allocdirect *, struct freeblks *); 898static int check_inode_unwritten(struct 
inodedep *); 899static int free_inodedep(struct inodedep *); 900static void freework_freeblock(struct freework *); 901static void freework_enqueue(struct freework *); 902static int handle_workitem_freeblocks(struct freeblks *, int); 903static int handle_complete_freeblocks(struct freeblks *, int); 904static void handle_workitem_indirblk(struct freework *); 905static void handle_written_freework(struct freework *); 906static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 907static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, 908 struct workhead *); 909static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, 910 struct inodedep *, struct allocindir *, ufs_lbn_t); 911static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 912 ufs2_daddr_t, ufs_lbn_t); 913static void handle_workitem_freefrag(struct freefrag *); 914static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 915 ufs_lbn_t); 916static void allocdirect_merge(struct allocdirectlst *, 917 struct allocdirect *, struct allocdirect *); 918static struct freefrag *allocindir_merge(struct allocindir *, 919 struct allocindir *); 920static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, 921 struct bmsafemap **); 922static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 923 int cg); 924static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, 925 int, struct newblk **); 926static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 927static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 928 struct inodedep **); 929static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 930static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, 931 int, struct pagedep **); 932static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 933 struct mount *mp, int, struct pagedep **); 934static 
void pause_timer(void *); 935static int request_cleanup(struct mount *, int); 936static int process_worklist_item(struct mount *, int, int); 937static void process_removes(struct vnode *); 938static void process_truncates(struct vnode *); 939static void jwork_move(struct workhead *, struct workhead *); 940static void jwork_insert(struct workhead *, struct jsegdep *); 941static void add_to_worklist(struct worklist *, int); 942static void wake_worklist(struct worklist *); 943static void wait_worklist(struct worklist *, char *); 944static void remove_from_worklist(struct worklist *); 945static void softdep_flush(void); 946static void softdep_flushjournal(struct mount *); 947static int softdep_speedup(void); 948static void worklist_speedup(void); 949static int journal_mount(struct mount *, struct fs *, struct ucred *); 950static void journal_unmount(struct mount *); 951static int journal_space(struct ufsmount *, int); 952static void journal_suspend(struct ufsmount *); 953static int journal_unsuspend(struct ufsmount *ump); 954static void softdep_prelink(struct vnode *, struct vnode *); 955static void add_to_journal(struct worklist *); 956static void remove_from_journal(struct worklist *); 957static void softdep_process_journal(struct mount *, struct worklist *, int); 958static struct jremref *newjremref(struct dirrem *, struct inode *, 959 struct inode *ip, off_t, nlink_t); 960static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 961 uint16_t); 962static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 963 uint16_t); 964static inline struct jsegdep *inoref_jseg(struct inoref *); 965static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 966static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 967 ufs2_daddr_t, int); 968static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); 969static void move_newblock_dep(struct jaddref *, struct inodedep *); 970static void cancel_jfreeblk(struct freeblks *, 
ufs2_daddr_t); 971static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 972 ufs2_daddr_t, long, ufs_lbn_t); 973static struct freework *newfreework(struct ufsmount *, struct freeblks *, 974 struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); 975static int jwait(struct worklist *, int); 976static struct inodedep *inodedep_lookup_ip(struct inode *); 977static int bmsafemap_rollbacks(struct bmsafemap *); 978static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 979static void handle_jwork(struct workhead *); 980static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, 981 struct mkdir **); 982static struct jblocks *jblocks_create(void); 983static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 984static void jblocks_free(struct jblocks *, struct mount *, int); 985static void jblocks_destroy(struct jblocks *); 986static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); 987 988/* 989 * Exported softdep operations. 990 */ 991static void softdep_disk_io_initiation(struct buf *); 992static void softdep_disk_write_complete(struct buf *); 993static void softdep_deallocate_dependencies(struct buf *); 994static int softdep_count_dependencies(struct buf *bp, int); 995 996static struct mtx lk; 997MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); 998 999#define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) 1000#define ACQUIRE_LOCK(lk) mtx_lock(lk) 1001#define FREE_LOCK(lk) mtx_unlock(lk) 1002 1003#define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) 1004#define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) 1005 1006/* 1007 * Worklist queue management. 1008 * These routines require that the lock be held. 
 */
#ifndef /* NOT */ DEBUG
/*
 * Production variants: plain macros that set/clear the ONWORKLIST flag
 * and link/unlink the item, with no sanity checking.  Note that DEBUG
 * is currently forced on near the top of this file, so the checked
 * versions below are the ones normally compiled in.
 */
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;	\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT
#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE

#else /* DEBUG */
static	void worklist_insert(struct workhead *, struct worklist *, int);
static	void worklist_remove(struct worklist *, int);

/*
 * Debug variants: the trailing argument says whether the softdep lock
 * ("lk") must already be held (1) or not (0); the checked helpers also
 * panic on double-insert or removal of an unlisted item.
 */
#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)

/*
 * Checked insert: assert lock ownership when requested and panic if the
 * item is already on a worklist, then flag and link it.
 */
static void
worklist_insert(head, item, locked)
	struct workhead *head;
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: %p %s(0x%X) already on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

/*
 * Checked remove: assert lock ownership when requested and panic if the
 * item is not currently on a worklist, then unflag and unlink it.
 */
static void
worklist_remove(item, locked)
	struct worklist *item;
	int locked;
{

	if (locked)
		mtx_assert(&lk, MA_OWNED);
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: %p %s(0x%X) not on list",
		    item, TYPENAME(item->wk_type), item->wk_state);
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}
#endif /* DEBUG */

/*
 * Merge two jsegdeps keeping only the oldest one as newer references
 * can't be discarded until after older references.
 */
static inline struct jsegdep *
jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
{
	struct jsegdep *swp;

	if (two == NULL)
		return (one);

	/* Keep the dep with the smaller (older) journal sequence number. */
	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
		swp = one;
		one = two;
		two = swp;
	}
	/* The newer of the two is discarded. */
	WORKLIST_REMOVE(&two->jd_list);
	free_jsegdep(two);

	return (one);
}

/*
 * If two freedeps are compatible free one to reduce list size.
 * Two freedeps are "compatible" when they track the same freework.
 */
static inline struct freedep *
freedep_merge(struct freedep *one, struct freedep *two)
{
	if (two == NULL)
		return (one);

	if (one->fd_freework == two->fd_freework) {
		WORKLIST_REMOVE(&two->fd_list);
		free_freedep(two);
	}
	return (one);
}

/*
 * Move journal work from one list to another.  Duplicate freedeps and
 * jsegdeps are coalesced to keep the lists as small as possible.  The
 * softdep lock must be held; dst and src must be distinct lists.
 */
static void
jwork_move(dst, src)
	struct workhead *dst;
	struct workhead *src;
{
	struct freedep *freedep;
	struct jsegdep *jsegdep;
	struct worklist *wkn;
	struct worklist *wk;

	KASSERT(dst != src,
	    ("jwork_move: dst == src"));
	freedep = NULL;
	jsegdep = NULL;
	/*
	 * First pass: coalesce duplicates already present on dst.  The
	 * SAFE iterator is required because the merge helpers may unlink
	 * and free the current entry.
	 */
	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
		if (wk->wk_type == D_JSEGDEP)
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
		if (wk->wk_type == D_FREEDEP)
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
	}

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Second pass: drain src into dst, merging each moved jsegdep or
	 * freedep against the survivor found above.
	 */
	while ((wk = LIST_FIRST(src)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(dst, wk);
		if (wk->wk_type == D_JSEGDEP) {
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
			continue;
		}
		if (wk->wk_type == D_FREEDEP)
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
	}
}

/*
 * Insert a single jsegdep on dst, keeping at most one jsegdep per list:
 * whichever of the new dep and any existing dep has the older journal
 * sequence number survives; the other is freed.
 */
static void
jwork_insert(dst, jsegdep)
	struct workhead *dst;
	struct jsegdep *jsegdep;
{
	struct jsegdep *jsegdepn;
	struct worklist *wk;

	/* Find any jsegdep already on the list. */
	LIST_FOREACH(wk, dst, wk_list)
		if (wk->wk_type == D_JSEGDEP)
			break;
	if (wk == NULL) {
		WORKLIST_INSERT(dst, &jsegdep->jd_list);
		return;
	}
	jsegdepn = WK_JSEGDEP(wk);
	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
		/* New dep is older: replace the existing one. */
		WORKLIST_REMOVE(wk);
		free_jsegdep(jsegdepn);
		WORKLIST_INSERT(dst, &jsegdep->jd_list);
	} else
		free_jsegdep(jsegdep);
}

/*
 * Routines for tracking and managing workitems.
 */
static	void workitem_free(struct worklist *, int);
static	void workitem_alloc(struct worklist *, int, struct mount *);

#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))

/*
 * Release a workitem: wake any thread waiting on it, drop the per-mount
 * and global dependency counts, and free the memory.  Requires the
 * softdep lock.  The item must already be off all worklists.
 */
static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{
	struct ufsmount *ump;
	mtx_assert(&lk, MA_OWNED);

#ifdef DEBUG
	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: %s(0x%X) still on list",
		    TYPENAME(item->wk_type), item->wk_state);
	if (item->wk_type != type)
		panic("workitem_free: type mismatch %s != %s",
		    TYPENAME(item->wk_type), TYPENAME(type));
#endif
	if (item->wk_state & IOWAITING)
		wakeup(item);
	ump = VFSTOUFS(item->wk_mp);
	/* Wake softdep_waitidle() when the last dependency drains. */
	if (--ump->softdep_deps == 0 && ump->softdep_req)
		wakeup(&ump->softdep_deps);
	dep_current[type]--;
	free(item, DtoM(type));
}

/*
 * Initialize a freshly allocated workitem and account for it in the
 * per-type and per-mount dependency counters.  Takes and releases the
 * softdep lock itself, so call it unlocked.
 */
static void
workitem_alloc(item, type, mp)
	struct worklist *item;
	int type;
	struct mount *mp;
{
	struct ufsmount *ump;

	item->wk_type = type;
	item->wk_mp = mp;
	item->wk_state = 0;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	dep_current[type]++;
	dep_total[type]++;
	ump->softdep_deps++;
	ump->softdep_accdeps++;
	FREE_LOCK(&lk);
}

/*
 * Workitem queue management
 */
static int max_softdeps;	/* maximum number of structs before slowdown */
static int
maxindirdeps = 50; /* max number of indirdeps before slowdown */ 1223static int tickdelay = 2; /* number of ticks to pause during slowdown */ 1224static int proc_waiting; /* tracks whether we have a timeout posted */ 1225static int *stat_countp; /* statistic to count in proc_waiting timeout */ 1226static struct callout softdep_callout; 1227static int req_pending; 1228static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 1229static int req_clear_remove; /* syncer process flush some freeblks */ 1230 1231/* 1232 * runtime statistics 1233 */ 1234static int stat_worklist_push; /* number of worklist cleanups */ 1235static int stat_blk_limit_push; /* number of times block limit neared */ 1236static int stat_ino_limit_push; /* number of times inode limit neared */ 1237static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 1238static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 1239static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 1240static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 1241static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 1242static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 1243static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 1244static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ 1245static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ 1246static int stat_journal_min; /* Times hit journal min threshold */ 1247static int stat_journal_low; /* Times hit journal low threshold */ 1248static int stat_journal_wait; /* Times blocked in jwait(). */ 1249static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ 1250static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ 1251static int stat_jwait_inode; /* Times blocked in jwait() for inodes. 
*/ 1252static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ 1253static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ 1254static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ 1255static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ 1256static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ 1257static int stat_cleanup_failures; /* Number of cleanup requests that failed */ 1258 1259SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, 1260 &max_softdeps, 0, ""); 1261SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, 1262 &tickdelay, 0, ""); 1263SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, 1264 &maxindirdeps, 0, ""); 1265SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, 1266 &stat_worklist_push, 0,""); 1267SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, 1268 &stat_blk_limit_push, 0,""); 1269SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, 1270 &stat_ino_limit_push, 0,""); 1271SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, 1272 &stat_blk_limit_hit, 0, ""); 1273SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, 1274 &stat_ino_limit_hit, 0, ""); 1275SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, 1276 &stat_sync_limit_hit, 0, ""); 1277SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, 1278 &stat_indir_blk_ptrs, 0, ""); 1279SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, 1280 &stat_inode_bitmap, 0, ""); 1281SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, 1282 &stat_direct_blk_ptrs, 0, ""); 1283SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, 1284 &stat_dir_entry, 0, ""); 1285SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, 1286 &stat_jaddref, 0, ""); 1287SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, 1288 &stat_jnewblk, 0, ""); 
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
    &stat_journal_low, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
    &stat_journal_min, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
    &stat_journal_wait, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
    &stat_jwait_filepage, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
    &stat_jwait_freeblks, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
    &stat_jwait_inode, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
    &stat_jwait_newblk, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
    &stat_cleanup_blkrequests, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
    &stat_cleanup_inorequests, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
    &stat_cleanup_high_delay, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
    &stat_cleanup_retries, 0, "");
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
    &stat_cleanup_failures, 0, "");

SYSCTL_DECL(_vfs_ffs);

/* Hash table of bmsafemap dependencies; sized at softdep initialization. */
LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
static u_long	bmsafemap_hash;	/* size of hash table - 1 */

static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
    &compute_summary_at_mount, 0, "Recompute summary at mount");

/* The "softdepflush" kernel thread, started at boot via SYSINIT. */
static struct proc *softdepproc;
static struct kproc_desc softdep_kp = {
	"softdepflush",
	softdep_flush,
	&softdepproc
};
SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
    &softdep_kp);

/*
 * Main loop of the softdep flush kernel thread.  Repeatedly services
 * syncer requests to clear inodedeps/removal dependencies, then walks
 * the mountlist processing each softdep-mounted filesystem's worklist.
 * Sleeps (up to one second) on req_pending when no forward progress is
 * being made.  Never returns.
 */
static void
softdep_flush(void)
{
	struct mount *nmp;
	struct mount *mp;
	struct ufsmount *ump;
	struct thread *td;
	int remaining;
	int progress;
	int vfslocked;

	td = curthread;
	td->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		kproc_suspend_check(softdepproc);
		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
		ACQUIRE_LOCK(&lk);
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		FREE_LOCK(&lk);
		VFS_UNLOCK_GIANT(vfslocked);
		remaining = progress = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			if (MOUNTEDSOFTDEP(mp) == 0)
				continue;
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
				continue;
			/* vfs_busy() dropped mountlist_mtx for us. */
			vfslocked = VFS_LOCK_GIANT(mp);
			progress += softdep_process_worklist(mp, 0);
			ump = VFSTOUFS(mp);
			remaining += ump->softdep_on_worklist;
			VFS_UNLOCK_GIANT(vfslocked);
			mtx_lock(&mountlist_mtx);
			/*
			 * Recompute the successor after re-taking the
			 * mountlist lock; the list may have changed while
			 * we processed this mount.
			 */
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		/* Keep going without sleeping while work remains. */
		if (remaining && progress)
			continue;
		ACQUIRE_LOCK(&lk);
		if (!req_pending)
			msleep(&req_pending, &lk, PVM, "sdflush", hz);
		req_pending = 0;
		FREE_LOCK(&lk);
	}
}

/*
 * Wake the flush thread if it is not already scheduled to run.
 * Requires the softdep lock.
 */
static void
worklist_speedup(void)
{
	mtx_assert(&lk, MA_OWNED);
	if (req_pending == 0) {
		req_pending = 1;
		wakeup(&req_pending);
	}
}

/*
 * Kick all the background daemons: the softdep flush thread, the buf
 * daemon, and the syncer.  Returns the syncer speedup result.
 */
static int
softdep_speedup(void)
{

	worklist_speedup();
	bd_speedup();
	return speedup_syncer();
}

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */

#define WK_HEAD		0x0001	/* Add to HEAD. */
#define WK_NODELAY	0x0002	/* Process immediately. */

/*
 * Append a workitem to the per-mount pending queue (or prepend it when
 * WK_HEAD is set), maintaining the cached tail pointer so appends are
 * O(1).  WK_NODELAY additionally wakes the flush thread.  The softdep
 * lock must be held; panics if the item is already queued.
 */
static void
add_to_worklist(wk, flags)
	struct worklist *wk;
	int flags;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	wk->wk_state |= ONWORKLIST;
	if (ump->softdep_on_worklist == 0) {
		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
		ump->softdep_worklist_tail = wk;
	} else if (flags & WK_HEAD) {
		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
	} else {
		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
		ump->softdep_worklist_tail = wk;
	}
	ump->softdep_on_worklist += 1;
	if (flags & WK_NODELAY)
		worklist_speedup();
}

/*
 * Remove the item to be processed. If we are removing the last
 * item on the list, we need to recalculate the tail pointer.
 */
static void
remove_from_worklist(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(wk->wk_mp);
	WORKLIST_REMOVE(wk);
	if (ump->softdep_worklist_tail == wk)
		/*
		 * le_prev points at the previous element's le_next field,
		 * which is the first member of struct worklist, so this
		 * cast recovers the predecessor element.
		 */
		ump->softdep_worklist_tail =
		    (struct worklist *)wk->wk_list.le_prev;
	ump->softdep_on_worklist -= 1;
}

/*
 * Wake any thread sleeping in wait_worklist() on this item.
 */
static void
wake_worklist(wk)
	struct worklist *wk;
{
	if (wk->wk_state & IOWAITING) {
		wk->wk_state &= ~IOWAITING;
		wakeup(wk);
	}
}

/*
 * Sleep until another thread calls wake_worklist() on this item.
 * The softdep lock is released while sleeping.
 */
static void
wait_worklist(wk, wmesg)
	struct worklist *wk;
	char *wmesg;
{

	wk->wk_state |= IOWAITING;
	msleep(wk, &lk, PVM, wmesg, 0);
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that everything is done in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 *
 * Returns the number of items processed.  When "full" is zero this is a
 * best-effort pass bounded to roughly one second of work; when non-zero
 * the journal is flushed with MNT_WAIT and processing is not time-capped.
 */
int
softdep_process_worklist(mp, full)
	struct mount *mp;
	int full;
{
	struct thread *td = curthread;
	int cnt, matchcnt;
	struct ufsmount *ump;
	long starttime;

	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
	/*
	 * Record the process identifier of our caller so that we can give
	 * this process preferential treatment in request_cleanup below.
	 */
	matchcnt = 0;
	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	starttime = time_second;
	softdep_process_journal(mp, NULL, full?MNT_WAIT:0);
	while (ump->softdep_on_worklist > 0) {
		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
			break;
		else
			matchcnt += cnt;
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		/*
		 * We do not generally want to stop for buffer space, but if
		 * we are really being a buffer hog, we will stop and wait.
		 */
		if (should_yield()) {
			FREE_LOCK(&lk);
			kern_yield(PRI_UNCHANGED);
			bwillwrite();
			ACQUIRE_LOCK(&lk);
		}
		/*
		 * Never allow processing to run for more than one
		 * second. Otherwise the other mountpoints may get
		 * excessively backlogged.
		 */
		if (!full && starttime != time_second)
			break;
	}
	if (full == 0)
		journal_unsuspend(ump);
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Process all removes associated with a vnode if we are running out of
 * journal space. Any other process which attempts to flush these will
 * be unable as we have the vnodes locked.
 */
static void
process_removes(vp)
	struct vnode *vp;
{
	struct inodedep *inodedep;
	struct dirrem *dirrem;
	struct mount *mp;
	ino_t inum;

	mtx_assert(&lk, MA_OWNED);

	mp = vp->v_mount;
	inum = VTOI(vp)->i_number;
	for (;;) {
top:
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
			return;
		/* Find a dirrem that is ready to be run. */
		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
			/*
			 * If another thread is trying to lock this vnode
			 * it will fail but we must wait for it to do so
			 * before we can proceed.
			 */
			if (dirrem->dm_state & INPROGRESS) {
				wait_worklist(&dirrem->dm_list, "pwrwait");
				/* The lookup must be redone after sleeping. */
				goto top;
			}
			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
			    (COMPLETE | ONWORKLIST))
				break;
		}
		if (dirrem == NULL)
			return;
		remove_from_worklist(&dirrem->dm_list);
		FREE_LOCK(&lk);
		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
			panic("process_removes: suspended filesystem");
		handle_workitem_remove(dirrem, 0);
		vn_finished_secondary_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Process all truncations associated with a vnode if we are running out
 * of journal space. This is called when the vnode lock is already held
 * and no other process can clear the truncation. This function returns
 * a value greater than zero if it did any work.
 */
static void
process_truncates(vp)
	struct vnode *vp;
{
	struct inodedep *inodedep;
	struct freeblks *freeblks;
	struct mount *mp;
	ino_t inum;
	int cgwait;

	mtx_assert(&lk, MA_OWNED);

	mp = vp->v_mount;
	inum = VTOI(vp)->i_number;
	for (;;) {
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
			return;
		cgwait = 0;
		/*
		 * Walk the pending freeblks, pushing each one as far along
		 * as its current state allows.  Each blocking case breaks
		 * out so the inodedep lookup can be redone from the top.
		 */
		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
			/* Journal entries not yet written. */
			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
				jwait(&LIST_FIRST(
				    &freeblks->fb_jblkdephd)->jb_list,
				    MNT_WAIT);
				break;
			}
			/* Another thread is executing this item. */
			if (freeblks->fb_state & INPROGRESS) {
				wait_worklist(&freeblks->fb_list, "ptrwait");
				break;
			}
			/* Freeblks is waiting on a inode write. */
			if ((freeblks->fb_state & COMPLETE) == 0) {
				FREE_LOCK(&lk);
				ffs_update(vp, 1);
				ACQUIRE_LOCK(&lk);
				break;
			}
			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
			    (ALLCOMPLETE | ONWORKLIST)) {
				/* Fully ready: run the free directly. */
				remove_from_worklist(&freeblks->fb_list);
				freeblks->fb_state |= INPROGRESS;
				FREE_LOCK(&lk);
				if (vn_start_secondary_write(NULL, &mp,
				    V_NOWAIT))
					panic("process_truncates: "
					    "suspended filesystem");
				handle_workitem_freeblocks(freeblks, 0);
				vn_finished_secondary_write(mp);
				ACQUIRE_LOCK(&lk);
				break;
			}
			if (freeblks->fb_cgwait)
				cgwait++;
		}
		/* Flush cylinder groups (and snapshots) if any dep needs it. */
		if (cgwait) {
			FREE_LOCK(&lk);
			sync_cgs(mp, MNT_WAIT);
			ffs_sync_snap(mp, MNT_WAIT);
			ACQUIRE_LOCK(&lk);
			continue;
		}
		/* Loop terminated without finding work: we are done. */
		if (freeblks == NULL)
			break;
	}
	return;
}

/*
 * Process one item on the worklist.
1683 */ 1684static int 1685process_worklist_item(mp, target, flags) 1686 struct mount *mp; 1687 int target; 1688 int flags; 1689{ 1690 struct worklist sintenel; 1691 struct worklist *wk; 1692 struct ufsmount *ump; 1693 int matchcnt; 1694 int error; 1695 1696 mtx_assert(&lk, MA_OWNED); 1697 KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); 1698 /* 1699 * If we are being called because of a process doing a 1700 * copy-on-write, then it is not safe to write as we may 1701 * recurse into the copy-on-write routine. 1702 */ 1703 if (curthread->td_pflags & TDP_COWINPROGRESS) 1704 return (-1); 1705 PHOLD(curproc); /* Don't let the stack go away. */ 1706 ump = VFSTOUFS(mp); 1707 matchcnt = 0; 1708 sintenel.wk_mp = NULL; 1709 sintenel.wk_type = D_SENTINAL; 1710 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sintenel, wk_list); 1711 for (wk = LIST_NEXT(&sintenel, wk_list); wk != NULL; 1712 wk = LIST_NEXT(&sintenel, wk_list)) { 1713 if (wk->wk_type == D_SENTINAL) { 1714 LIST_REMOVE(&sintenel, wk_list); 1715 LIST_INSERT_AFTER(wk, &sintenel, wk_list); 1716 continue; 1717 } 1718 if (wk->wk_state & INPROGRESS) 1719 panic("process_worklist_item: %p already in progress.", 1720 wk); 1721 wk->wk_state |= INPROGRESS; 1722 remove_from_worklist(wk); 1723 FREE_LOCK(&lk); 1724 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1725 panic("process_worklist_item: suspended filesystem"); 1726 switch (wk->wk_type) { 1727 case D_DIRREM: 1728 /* removal of a directory entry */ 1729 error = handle_workitem_remove(WK_DIRREM(wk), flags); 1730 break; 1731 1732 case D_FREEBLKS: 1733 /* releasing blocks and/or fragments from a file */ 1734 error = handle_workitem_freeblocks(WK_FREEBLKS(wk), 1735 flags); 1736 break; 1737 1738 case D_FREEFRAG: 1739 /* releasing a fragment when replaced as a file grows */ 1740 handle_workitem_freefrag(WK_FREEFRAG(wk)); 1741 error = 0; 1742 break; 1743 1744 case D_FREEFILE: 1745 /* releasing an inode when its link count drops to 0 */ 1746 
handle_workitem_freefile(WK_FREEFILE(wk)); 1747 error = 0; 1748 break; 1749 1750 default: 1751 panic("%s_process_worklist: Unknown type %s", 1752 "softdep", TYPENAME(wk->wk_type)); 1753 /* NOTREACHED */ 1754 } 1755 vn_finished_secondary_write(mp); 1756 ACQUIRE_LOCK(&lk); 1757 if (error == 0) { 1758 if (++matchcnt == target) 1759 break; 1760 continue; 1761 } 1762 /* 1763 * We have to retry the worklist item later. Wake up any 1764 * waiters who may be able to complete it immediately and 1765 * add the item back to the head so we don't try to execute 1766 * it again. 1767 */ 1768 wk->wk_state &= ~INPROGRESS; 1769 wake_worklist(wk); 1770 add_to_worklist(wk, WK_HEAD); 1771 } 1772 LIST_REMOVE(&sintenel, wk_list); 1773 /* Sentinal could've become the tail from remove_from_worklist. */ 1774 if (ump->softdep_worklist_tail == &sintenel) 1775 ump->softdep_worklist_tail = 1776 (struct worklist *)sintenel.wk_list.le_prev; 1777 PRELE(curproc); 1778 return (matchcnt); 1779} 1780 1781/* 1782 * Move dependencies from one buffer to another. 1783 */ 1784int 1785softdep_move_dependencies(oldbp, newbp) 1786 struct buf *oldbp; 1787 struct buf *newbp; 1788{ 1789 struct worklist *wk, *wktail; 1790 int dirty; 1791 1792 dirty = 0; 1793 wktail = NULL; 1794 ACQUIRE_LOCK(&lk); 1795 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 1796 LIST_REMOVE(wk, wk_list); 1797 if (wk->wk_type == D_BMSAFEMAP && 1798 bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) 1799 dirty = 1; 1800 if (wktail == 0) 1801 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 1802 else 1803 LIST_INSERT_AFTER(wktail, wk, wk_list); 1804 wktail = wk; 1805 } 1806 FREE_LOCK(&lk); 1807 1808 return (dirty); 1809} 1810 1811/* 1812 * Purge the work list of all items associated with a particular mount point. 
 */
int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{
	struct vnode *devvp;
	int count, error = 0;
	struct ufsmount *ump;

	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. We continue until no more worklist dependencies
	 * are found.  The total number of items processed is returned
	 * through *countp; the first fsync error stops the loop.
	 */
	*countp = 0;
	ump = VFSTOUFS(oldmnt);
	devvp = ump->um_devvp;
	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
		*countp += count;
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(devvp, MNT_WAIT, td);
		VOP_UNLOCK(devvp, 0);
		if (error)
			break;
	}
	return (error);
}

/*
 * Wait (up to ~10 ticks) for all outstanding dependencies on the mount
 * to drain.  Returns 0 on success or EBUSY if the worklist could not be
 * fully flushed in time.
 */
int
softdep_waitidle(struct mount *mp)
{
	struct ufsmount *ump;
	int error;
	int i;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	for (i = 0; i < 10 && ump->softdep_deps; i++) {
		/* softdep_req makes workitem_free wake us on the last dep. */
		ump->softdep_req = 1;
		if (ump->softdep_on_worklist)
			panic("softdep_waitidle: work added after flush.");
		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
	}
	ump->softdep_req = 0;
	FREE_LOCK(&lk);
	error = 0;
	if (i == 10) {
		error = EBUSY;
		printf("softdep_waitidle: Failed to flush worklist for %p\n",
		    mp);
	}

	return (error);
}

/*
 * Flush all vnodes and worklist items associated with a specified mount point.
 */
int
softdep_flushfiles(oldmnt, flags, td)
	struct mount *oldmnt;
	int flags;
	struct thread *td;
{
	int error, depcount, loopcnt, retry_flush_count, retry;

	loopcnt = 10;
	retry_flush_count = 3;
retry_flush:
	error = 0;

	/*
	 * Alternately flush the vnodes associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	for (; loopcnt > 0; loopcnt--) {
		/*
		 * Do another flush in case any vnodes were brought in
		 * as part of the cleanup operations.
		 */
		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
			break;
		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
		    depcount == 0)
			break;
	}
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	if (!error)
		error = softdep_waitidle(oldmnt);
	if (!error) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
			retry = 0;
			MNT_ILOCK(oldmnt);
			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
			/*
			 * New vnodes may have appeared; retry the whole
			 * flush a bounded number of times before giving up.
			 */
			if (oldmnt->mnt_nvnodelistsize > 0) {
				if (--retry_flush_count > 0) {
					retry = 1;
					loopcnt = 3;
				} else
					error = EBUSY;
			}
			MNT_IUNLOCK(oldmnt);
			if (retry)
				goto retry_flush;
		}
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced.
It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
#define NODELAY		0x0002	/* cannot do background work */

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])

/*
 * Search one pagedep hash chain for an entry matching (mp, ino, lbn).
 * Returns 1 and sets *pagedeppp on a match, otherwise returns 0 and
 * sets *pagedeppp to NULL.  The 'flags' argument is currently unused.
 */
static int
pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
	struct pagedep_hashhead *pagedephd;
	ino_t ino;
	ufs_lbn_t lbn;
	struct mount *mp;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;

	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_list.wk_mp) {
			*pagedeppp = pagedep;
			return (1);
		}
	}
	*pagedeppp = NULL;
	return (0);
}
/*
 * Look up a pagedep. Return 1 if found, 0 otherwise.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
	struct mount *mp;
	struct buf *bp;
	ino_t ino;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	struct worklist *wk;
	int ret;
	int i;

	mtx_assert(&lk, MA_OWNED);
	/* A pagedep already attached to the buffer wins outright. */
	if (bp) {
		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
			if (wk->wk_type == D_PAGEDEP) {
				*pagedeppp = WK_PAGEDEP(wk);
				return (1);
			}
		}
	}
	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
	if (ret) {
		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
		return (1);
	}
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * Drop the lock for the (possibly sleeping) allocation, then
	 * re-run the lookup in case another thread raced us.
	 */
	FREE_LOCK(&lk);
	pagedep = malloc(sizeof(struct pagedep),
	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
	ACQUIRE_LOCK(&lk);
	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
	if (*pagedeppp) {
		/*
		 * This should never happen since we only create pagedeps
		 * with the vnode lock held.  Could be an assert.
		 */
		WORKITEM_FREE(pagedep, D_PAGEDEP);
		return (ret);
	}
	pagedep->pd_ino = ino;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	/* NOTE(review): bp is assumed non-NULL on the DEPALLOC path. */
	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])

/*
 * Search one inodedep hash chain for an entry matching (fs, inum).
 * Returns 1 and sets *inodedeppp on a match, otherwise returns 0 and
 * sets *inodedeppp to NULL.
 */
static int
inodedep_find(inodedephd, fs, inum, inodedeppp)
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;
	ino_t inum;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;

	LIST_FOREACH(inodedep, inodedephd, id_hash)
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	*inodedeppp = NULL;

	return (0);
}
/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(mp, inum, flags, inodedeppp)
	struct mount *mp;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;

	mtx_assert(&lk, MA_OWNED);
	fs = VFSTOUFS(mp)->um_fs;
	inodedephd = INODEDEP_HASH(fs, inum);

	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * If we are over our limit, try to improve the situation.
	 */
	if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
		request_cleanup(mp, FLUSH_INODES);
	/*
	 * Drop the lock for the allocation, then re-run the lookup in
	 * case another thread raced us and installed the entry first.
	 */
	FREE_LOCK(&lk);
	inodedep = malloc(sizeof(struct inodedep),
		M_INODEDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
	ACQUIRE_LOCK(&lk);
	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
		WORKITEM_FREE(inodedep, D_INODEDEP);
		return (1);
	}
	/* Not zero-allocated: every field is initialized by hand below. */
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino1 = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_savedextsize = -1;
	inodedep->id_savednlink = -1;
	inodedep->id_bmsafemap = NULL;
	inodedep->id_mkdiradd = NULL;
	LIST_INIT(&inodedep->id_dirremhd);
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoreflst);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	TAILQ_INIT(&inodedep->id_extupdt);
	TAILQ_INIT(&inodedep->id_newextupdt);
	TAILQ_INIT(&inodedep->id_freeblklst);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])

/*
 * Search one newblk hash chain for an entry matching (mp, newblkno).
 * Returns 1 and sets *newblkpp on a match, otherwise returns 0 and
 * sets *newblkpp to NULL.
 */
static int
newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
	struct newblk_hashhead *newblkhd;
	struct mount *mp;
	ufs2_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;

	LIST_FOREACH(newblk, newblkhd, nb_hash) {
		if (newblkno != newblk->nb_newblkno)
			continue;
		if (mp != newblk->nb_list.wk_mp)
			continue;
		/*
		 * If we're creating a new dependency don't match those that
		 * have already been converted to allocdirects.  This is for
		 * a frag extend.
		 */
		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
			continue;
		break;
	}
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	*newblkpp = NULL;
	return (0);
}

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(mp, newblkno, flags, newblkpp)
	struct mount *mp;
	ufs2_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * Drop the lock for the allocation, then re-run the lookup in
	 * case another thread raced us.  Allocated as a union allblk,
	 * presumably so it can later be converted in place to an
	 * allocdirect/allocindir -- TODO confirm against the union decl.
	 */
	FREE_LOCK(&lk);
	newblk = malloc(sizeof(union allblk), M_NEWBLK,
	    M_SOFTDEP_FLAGS | M_ZERO);
	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
	ACQUIRE_LOCK(&lk);
	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
		WORKITEM_FREE(newblk, D_NEWBLK);
		return (1);
	}
	newblk->nb_freefrag = NULL;
	LIST_INIT(&newblk->nb_indirdeps);
	LIST_INIT(&newblk->nb_newdirblk);
	LIST_INIT(&newblk->nb_jwork);
	newblk->nb_state = ATTACHED;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	*newblkpp = newblk;
	return (0);
}

/*
 * Structures and routines associated with freed indirect block caching.
 */
struct freeworklst *indir_hashtbl;
u_long	indir_hash;		/* size of hash table - 1 */
#define	INDIR_HASH(mp, blkno) \
	(&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])

/*
 * Lookup an indirect block in the indir hash table.  The freework is
 * removed and potentially freed.  The caller must do a blocking journal
 * write before writing to the blkno.
2241 */ 2242static int 2243indirblk_lookup(mp, blkno) 2244 struct mount *mp; 2245 ufs2_daddr_t blkno; 2246{ 2247 struct freework *freework; 2248 struct freeworklst *wkhd; 2249 2250 wkhd = INDIR_HASH(mp, blkno); 2251 TAILQ_FOREACH(freework, wkhd, fw_next) { 2252 if (freework->fw_blkno != blkno) 2253 continue; 2254 if (freework->fw_list.wk_mp != mp) 2255 continue; 2256 indirblk_remove(freework); 2257 return (1); 2258 } 2259 return (0); 2260} 2261 2262/* 2263 * Insert an indirect block represented by freework into the indirblk 2264 * hash table so that it may prevent the block from being re-used prior 2265 * to the journal being written. 2266 */ 2267static void 2268indirblk_insert(freework) 2269 struct freework *freework; 2270{ 2271 struct freeblks *freeblks; 2272 struct jsegdep *jsegdep; 2273 struct worklist *wk; 2274 2275 freeblks = freework->fw_freeblks; 2276 LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list) 2277 if (wk->wk_type == D_JSEGDEP) 2278 break; 2279 if (wk == NULL) 2280 return; 2281 2282 jsegdep = WK_JSEGDEP(wk); 2283 LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs); 2284 TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp, 2285 freework->fw_blkno), freework, fw_next); 2286 freework->fw_state &= ~DEPCOMPLETE; 2287} 2288 2289static void 2290indirblk_remove(freework) 2291 struct freework *freework; 2292{ 2293 2294 LIST_REMOVE(freework, fw_segs); 2295 TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp, 2296 freework->fw_blkno), freework, fw_next); 2297 freework->fw_state |= DEPCOMPLETE; 2298 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 2299 WORKITEM_FREE(freework, D_FREEWORK); 2300} 2301 2302/* 2303 * Executed during filesystem system initialization before 2304 * mounting any filesystems. 
2305 */ 2306void 2307softdep_initialize() 2308{ 2309 int i; 2310 2311 LIST_INIT(&mkdirlisthd); 2312 max_softdeps = desiredvnodes * 4; 2313 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); 2314 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 2315 newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); 2316 bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); 2317 i = 1 << (ffs(desiredvnodes / 10) - 1); 2318 indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK, 2319 M_WAITOK); 2320 indir_hash = i - 1; 2321 for (i = 0; i <= indir_hash; i++) 2322 TAILQ_INIT(&indir_hashtbl[i]); 2323 2324 /* initialise bioops hack */ 2325 bioops.io_start = softdep_disk_io_initiation; 2326 bioops.io_complete = softdep_disk_write_complete; 2327 bioops.io_deallocate = softdep_deallocate_dependencies; 2328 bioops.io_countdeps = softdep_count_dependencies; 2329 2330 /* Initialize the callout with an mtx. */ 2331 callout_init_mtx(&softdep_callout, &lk, 0); 2332} 2333 2334/* 2335 * Executed after all filesystems have been unmounted during 2336 * filesystem module unload. 2337 */ 2338void 2339softdep_uninitialize() 2340{ 2341 2342 callout_drain(&softdep_callout); 2343 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 2344 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 2345 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 2346 hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); 2347 free(indir_hashtbl, M_FREEWORK); 2348} 2349 2350/* 2351 * Called at mount time to notify the dependency code that a 2352 * filesystem wishes to use it. 
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum_total cstotal;
	struct ufsmount *ump;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	/* Soft updates supersedes async writes on this mount. */
	MNT_ILOCK(mp);
	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 
			MNTK_SOFTDEP;
		mp->mnt_noasync++;
	}
	MNT_IUNLOCK(mp);
	ump = VFSTOUFS(mp);
	LIST_INIT(&ump->softdep_workitem_pending);
	LIST_INIT(&ump->softdep_journal_pending);
	TAILQ_INIT(&ump->softdep_unlinked);
	LIST_INIT(&ump->softdep_dirtycg);
	ump->softdep_worklist_tail = NULL;
	ump->softdep_on_worklist = 0;
	ump->softdep_deps = 0;
	if ((fs->fs_flags & FS_SUJ) &&
	    (error = journal_mount(mp, fs, cred)) != 0) {
		printf("Failed to start journal: %d\n", error);
		return (error);
	}
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync. Recomputation
	 * can take a long time and can be deferred for background
	 * fsck. However, the old behavior of scanning the cylinder
	 * groups and recalculating them at mount time is available
	 * by setting vfs.ffs.compute_summary_at_mount to one.
	 */
	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Called at unmount time to tear down the per-mount soft updates
 * state; also unmounts the journal for SUJ filesystems.
 */
void
softdep_unmount(mp)
	struct mount *mp;
{

	MNT_ILOCK(mp);
	mp->mnt_flag &= ~MNT_SOFTDEP;
	if (MOUNTEDSUJ(mp) == 0) {
		MNT_IUNLOCK(mp);
		return;
	}
	mp->mnt_flag &= ~MNT_SUJ;
	MNT_IUNLOCK(mp);
	journal_unmount(mp);
}

/*
 * Per-mount journal space allocator state.
 */
struct jblocks {
	struct jseglst	jb_segs;	/* TAILQ of current segments. */
	struct jseg	*jb_writeseg;	/* Next write to complete. */
	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
	struct jextent	*jb_extent;	/* Extent array. */
	uint64_t	jb_nextseq;	/* Next sequence number. */
	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
	uint8_t		jb_needseg;	/* Need a forced segment. */
	uint8_t		jb_suspended;	/* Did journal suspend writes? */
	int		jb_avail;	/* Available extents. */
	int		jb_used;	/* Last used extent. */
	int		jb_head;	/* Allocator head. */
	int		jb_off;		/* Allocator extent offset. */
	int		jb_blocks;	/* Total disk blocks covered. */
	int		jb_free;	/* Total disk blocks free. */
	int		jb_min;		/* Minimum free space. */
	int		jb_low;		/* Low on space. */
	int		jb_age;		/* Insertion time of oldest rec. */
};

/* One contiguous run of journal disk blocks. */
struct jextent {
	ufs2_daddr_t	je_daddr;	/* Disk block address. */
	int		je_blocks;	/* Disk block count. */
};

/*
 * Allocate and initialize an empty jblocks allocator with an initial
 * extent array of ten entries (grown on demand by jblocks_add()).
 */
static struct jblocks *
jblocks_create(void)
{
	struct jblocks *jblocks;

	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
	TAILQ_INIT(&jblocks->jb_segs);
	jblocks->jb_avail = 10;
	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
	    M_JBLOCKS, M_WAITOK | M_ZERO);

	return (jblocks);
}

/*
 * Allocate up to 'bytes' of contiguous journal space, advancing the
 * circular allocator head.  The actual number of bytes granted (which
 * may be less when the current extent is short) is returned via
 * *actual; the starting disk address is the return value.
 */
static ufs2_daddr_t
jblocks_alloc(jblocks, bytes, actual)
	struct jblocks *jblocks;
	int bytes;
	int *actual;
{
	ufs2_daddr_t daddr;
	struct jextent *jext;
	int freecnt;
	int blocks;

	blocks = bytes / DEV_BSIZE;
	jext = &jblocks->jb_extent[jblocks->jb_head];
	freecnt = jext->je_blocks - jblocks->jb_off;
	if (freecnt == 0) {
		/* Current extent exhausted; wrap to the next one. */
		jblocks->jb_off = 0;
		if (++jblocks->jb_head > jblocks->jb_used)
			jblocks->jb_head = 0;
		jext = &jblocks->jb_extent[jblocks->jb_head];
		freecnt = jext->je_blocks;
	}
	if (freecnt > blocks)
		freecnt = blocks;
	*actual = freecnt * DEV_BSIZE;
	daddr = jext->je_daddr + jblocks->jb_off;
	jblocks->jb_off += freecnt;
	jblocks->jb_free -= freecnt;

	return (daddr);
}

/*
 * Return 'bytes' of journal space to the free pool and wake anyone
 * sleeping on space; kicks the worklist if the journal had suspended
 * writes.
 */
static void
jblocks_free(jblocks, mp, bytes)
	struct jblocks *jblocks;
	struct mount *mp;
	int bytes;
{

	jblocks->jb_free += bytes / DEV_BSIZE;
	if (jblocks->jb_suspended)
		worklist_speedup();
	wakeup(jblocks);
}

/*
 * Release a jblocks allocator and its extent array.
 */
static void
jblocks_destroy(jblocks)
	struct jblocks *jblocks;
{

	if (jblocks->jb_extent)
		free(jblocks->jb_extent, M_JBLOCKS);
	free(jblocks, M_JBLOCKS);
}

/*
 * Add 'blocks' disk blocks starting at 'daddr' to the allocator,
 * merging with the last extent when contiguous and doubling the
 * extent array when it fills.
 */
static void
jblocks_add(jblocks, daddr, blocks)
	struct jblocks *jblocks;
	ufs2_daddr_t daddr;
	int blocks;
{
	struct jextent *jext;

	jblocks->jb_blocks += blocks;
	jblocks->jb_free += blocks;
	jext = &jblocks->jb_extent[jblocks->jb_used];
	/* Adding the first block. */
	if (jext->je_daddr == 0) {
		jext->je_daddr = daddr;
		jext->je_blocks = blocks;
		return;
	}
	/* Extending the last extent. */
	if (jext->je_daddr + jext->je_blocks == daddr) {
		jext->je_blocks += blocks;
		return;
	}
	/* Adding a new extent. */
	if (++jblocks->jb_used == jblocks->jb_avail) {
		jblocks->jb_avail *= 2;
		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
		    M_JBLOCKS, M_WAITOK | M_ZERO);
		memcpy(jext, jblocks->jb_extent,
		    sizeof(struct jextent) * jblocks->jb_used);
		free(jblocks->jb_extent, M_JBLOCKS);
		jblocks->jb_extent = jext;
	}
	jext = &jblocks->jb_extent[jblocks->jb_used];
	jext->je_daddr = daddr;
	jext->je_blocks = blocks;
	return;
}

/*
 * Resolve the inode number of the SUJ_FILE journal file by looking it
 * up in the root directory, then vget the journal vnode into *vpp
 * (locked exclusively).  Returns 0 on success or an errno.
 */
int
softdep_journal_lookup(mp, vpp)
	struct mount *mp;
	struct vnode **vpp;
{
	struct componentname cnp;
	struct vnode *dvp;
	ino_t sujournal;
	int error;

	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
	if (error)
		return (error);
	bzero(&cnp, sizeof(cnp));
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN;
	cnp.cn_thread = curthread;
	cnp.cn_cred = curthread->td_ucred;
	cnp.cn_pnbuf = SUJ_FILE;
	cnp.cn_nameptr = SUJ_FILE;
	cnp.cn_namelen = strlen(SUJ_FILE);
	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
	vput(dvp);
	if (error != 0)
		return (error);
	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
	return (error);
}

/*
 * Open and verify the journal file.
 */
static int
journal_mount(mp, fs, cred)
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct jblocks *jblocks;
	struct vnode *vp;
	struct inode *ip;
	ufs2_daddr_t blkno;
	int bcount;
	int error;
	int i;

	error = softdep_journal_lookup(mp, &vp);
	if (error != 0) {
		printf("Failed to find journal. Use tunefs to create one\n");
		return (error);
	}
	ip = VTOI(vp);
	if (ip->i_size < SUJ_MIN) {
		error = ENOSPC;
		goto out;
	}
	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
	jblocks = jblocks_create();
	/* Feed each journal file block's disk address to the allocator. */
	for (i = 0; i < bcount; i++) {
		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
		if (error)
			break;
		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
	}
	if (error) {
		jblocks_destroy(jblocks);
		goto out;
	}
	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
	VFSTOUFS(mp)->softdep_jblocks = jblocks;
out:
	if (error == 0) {
		MNT_ILOCK(mp);
		mp->mnt_flag |= MNT_SUJ;
		mp->mnt_flag &= ~MNT_SOFTDEP;
		MNT_IUNLOCK(mp);
		/*
		 * Only validate the journal contents if the
		 * filesystem is clean, otherwise we write the logs
		 * but they'll never be used.  If the filesystem was
		 * still dirty when we mounted it the journal is
		 * invalid and a new journal can only be valid if it
		 * starts from a clean mount.
		 */
		if (fs->fs_clean) {
			DIP_SET(ip, i_modrev, fs->fs_mtime);
			ip->i_flags |= IN_MODIFIED;
			ffs_update(vp, 1);
		}
	}
	vput(vp);
	return (error);
}

/*
 * Tear down the per-mount journal allocator state, if any.
 */
static void
journal_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	if (ump->softdep_jblocks)
		jblocks_destroy(ump->softdep_jblocks);
	ump->softdep_jblocks = NULL;
}

/*
 * Called when a journal record is ready to be written.  Space is allocated
 * and the journal entry is created when the journal is flushed to stable
 * store.
 */
static void
add_to_journal(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_journal: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
		/* First pending record; start the aging clock. */
		ump->softdep_jblocks->jb_age = ticks;
		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
	} else
		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
	ump->softdep_journal_tail = wk;
	ump->softdep_on_journal += 1;
}

/*
 * Remove an arbitrary item for the journal worklist maintain the tail
 * pointer.  This happens when a new operation obviates the need to
 * journal an old operation.
 */
static void
remove_from_journal(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
#ifdef SUJ_DEBUG
	{
		struct worklist *wkn;

		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
			if (wkn == wk)
				break;
		if (wkn == NULL)
			panic("remove_from_journal: %p is not in journal", wk);
	}
#endif
	/*
	 * We emulate a TAILQ to save space in most structures which do not
	 * require TAILQ semantics.  Here we must update the tail position
	 * when removing the tail which is not the final entry. This works
	 * only if the worklist linkage are at the beginning of the structure.
	 */
	if (ump->softdep_journal_tail == wk)
		ump->softdep_journal_tail =
		    (struct worklist *)wk->wk_list.le_prev;
	
	WORKLIST_REMOVE(wk);
	ump->softdep_on_journal -= 1;
}

/*
 * Check for journal space as well as dependency limits so the prelink
 * code can throttle both journaled and non-journaled filesystems.
 * Threshold is 0 for low and 1 for min.  Returns non-zero when space
 * is available.
 */
static int
journal_space(ump, thresh)
	struct ufsmount *ump;
	int thresh;
{
	struct jblocks *jblocks;
	int avail;

	jblocks = ump->softdep_jblocks;
	/* Non-journaled mounts never run out of journal space. */
	if (jblocks == NULL)
		return (1);
	/*
	 * We use a tighter restriction here to prevent request_cleanup()
	 * running in threads from running into locks we currently hold.
	 */
	if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9)
		return (0);
	if (thresh)
		thresh = jblocks->jb_min;
	else
		thresh = jblocks->jb_low;
	/* Space already committed to pending records counts as used. */
	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
	avail = jblocks->jb_free - avail;

	return (avail > thresh);
}

/*
 * Suspend writes on the filesystem because the journal has hit its
 * minimum free space.  Marks the journal suspended; resumed later by
 * journal_unsuspend().
 */
static void
journal_suspend(ump)
	struct ufsmount *ump;
{
	struct jblocks *jblocks;
	struct mount *mp;

	mp = UFSTOVFS(ump);
	jblocks = ump->softdep_jblocks;
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
		stat_journal_min++;
		mp->mnt_kern_flag |= MNTK_SUSPEND;
		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
	}
	jblocks->jb_suspended = 1;
	MNT_IUNLOCK(mp);
}

/*
 * Resume writes on a journal-suspended filesystem once enough journal
 * space is free again.  Returns 1 if a resume was performed.  Drops
 * and re-acquires the softdep lock 'lk' around vfs_write_resume().
 */
static int
journal_unsuspend(struct ufsmount *ump)
{
	struct jblocks *jblocks;
	struct mount *mp;

	mp = UFSTOVFS(ump);
	jblocks = ump->softdep_jblocks;

	if (jblocks != NULL && jblocks->jb_suspended &&
	    journal_space(ump, jblocks->jb_min)) {
		jblocks->jb_suspended = 0;
		FREE_LOCK(&lk);
		mp->mnt_susp_owner = curthread;
		vfs_write_resume(mp);
		ACQUIRE_LOCK(&lk);
		return (1);
	}
	return (0);
}

/*
 * Called before any allocation function to be certain that there is
 * sufficient space in the journal prior to creating any new records.
 * Since in the case of block allocation we may have multiple locked
 * buffers at the time of the actual allocation we can not block
 * when the journal records are created.  Doing so would create a deadlock
 * if any of these buffers needed to be flushed to reclaim space.  Instead
 * we require a sufficiently large amount of available space such that
 * each thread in the system could have passed this allocation check and
 * still have sufficient free space.  With 20% of a minimum journal size
 * of 1MB we have 6553 records available.
 */
int
softdep_prealloc(vp, waitok)
	struct vnode *vp;
	int waitok;
{
	struct ufsmount *ump;

	if (DOINGSUJ(vp) == 0)
		return (0);
	ump = VFSTOUFS(vp->v_mount);
	ACQUIRE_LOCK(&lk);
	if (journal_space(ump, 0)) {
		FREE_LOCK(&lk);
		return (0);
	}
	stat_journal_low++;
	FREE_LOCK(&lk);
	if (waitok == MNT_NOWAIT)
		return (ENOSPC);
	/*
	 * Attempt to sync this vnode once to flush any journal
	 * work attached to it.
	 */
	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
		ffs_syncvnode(vp, waitok);
	ACQUIRE_LOCK(&lk);
	process_removes(vp);
	process_truncates(vp);
	if (journal_space(ump, 0) == 0) {
		softdep_speedup();
		if (journal_space(ump, 1) == 0)
			journal_suspend(ump);
	}
	FREE_LOCK(&lk);

	return (0);
}

/*
 * Before adjusting a link count on a vnode verify that we have sufficient
 * journal space.  If not, process operations that depend on the currently
 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
 * and softdep flush threads can not acquire these locks to reclaim space.
 */
static void
softdep_prelink(dvp, vp)
	struct vnode *dvp;
	struct vnode *vp;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(dvp->v_mount);
	mtx_assert(&lk, MA_OWNED);
	if (journal_space(ump, 0))
		return;
	stat_journal_low++;
	FREE_LOCK(&lk);
	if (vp)
		ffs_syncvnode(vp, MNT_NOWAIT);
	ffs_syncvnode(dvp, MNT_WAIT);
	ACQUIRE_LOCK(&lk);
	/* Process vp before dvp as it may create .. removes. */
	if (vp) {
		process_removes(vp);
		process_truncates(vp);
	}
	process_removes(dvp);
	process_truncates(dvp);
	softdep_speedup();
	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
	if (journal_space(ump, 0) == 0) {
		softdep_speedup();
		if (journal_space(ump, 1) == 0)
			journal_suspend(ump);
	}
}

/*
 * Fill in a segment-header record (jsegrec) at 'data' from the jseg.
 */
static void
jseg_write(ump, jseg, data)
	struct ufsmount *ump;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jsegrec *rec;

	rec = (struct jsegrec *)data;
	rec->jsr_seq = jseg->js_seq;
	rec->jsr_oldest = jseg->js_oldseq;
	rec->jsr_cnt = jseg->js_cnt;
	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
	rec->jsr_crc = 0;
	rec->jsr_time = ump->um_fs->fs_mtime;
}

/*
 * Fill in the fields common to add/rem reference records and bind the
 * inoref's jsegdep to the segment being written.
 */
static inline void
inoref_write(inoref, jseg, rec)
	struct inoref *inoref;
	struct jseg *jseg;
	struct jrefrec *rec;
{

	inoref->if_jsegdep->jd_seg = jseg;
	rec->jr_ino = inoref->if_ino;
	rec->jr_parent = inoref->if_parent;
	rec->jr_nlink = inoref->if_nlink;
	rec->jr_mode = inoref->if_mode;
	rec->jr_diroff = inoref->if_diroff;
}

/*
 * Emit a JOP_ADDREF record for a jaddref into the segment buffer.
 */
static void
jaddref_write(jaddref, jseg, data)
	struct jaddref *jaddref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jrefrec *rec;

	rec = (struct jrefrec *)data;
	rec->jr_op = JOP_ADDREF;
	inoref_write(&jaddref->ja_ref, jseg, rec);
}

/*
 * Emit a JOP_REMREF record for a jremref into the segment buffer.
 */
static void
jremref_write(jremref, jseg, data)
	struct jremref *jremref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jrefrec *rec;

	rec = (struct jrefrec *)data;
	rec->jr_op = JOP_REMREF;
	inoref_write(&jremref->jr_ref, jseg, rec);
}

/*
 * Emit a JOP_MVREF record for a jmvref into the segment buffer.
 */
static void
jmvref_write(jmvref, jseg, data)
	struct jmvref *jmvref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jmvrec *rec;

	rec = (struct jmvrec *)data;
	rec->jm_op =
JOP_MVREF;
	rec->jm_ino = jmvref->jm_ino;
	rec->jm_parent = jmvref->jm_parent;
	rec->jm_oldoff = jmvref->jm_oldoff;
	rec->jm_newoff = jmvref->jm_newoff;
}

/*
 * Emit a JOP_NEWBLK record for a jnewblk into the segment buffer and
 * bind its jsegdep to the segment being written.
 */
static void
jnewblk_write(jnewblk, jseg, data)
	struct jnewblk *jnewblk;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jnewblk->jn_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_NEWBLK;
	rec->jb_ino = jnewblk->jn_ino;
	rec->jb_blkno = jnewblk->jn_blkno;
	rec->jb_lbn = jnewblk->jn_lbn;
	rec->jb_frags = jnewblk->jn_frags;
	rec->jb_oldfrags = jnewblk->jn_oldfrags;
}

/*
 * Emit a JOP_FREEBLK record for a jfreeblk into the segment buffer and
 * bind its jsegdep to the segment being written.
 */
static void
jfreeblk_write(jfreeblk, jseg, data)
	struct jfreeblk *jfreeblk;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_FREEBLK;
	rec->jb_ino = jfreeblk->jf_ino;
	rec->jb_blkno = jfreeblk->jf_blkno;
	rec->jb_lbn = jfreeblk->jf_lbn;
	rec->jb_frags = jfreeblk->jf_frags;
	rec->jb_oldfrags = 0;
}

/*
 * Emit a JOP_FREEBLK record for a freed fragment (jfreefrag) into the
 * segment buffer and bind its jsegdep to the segment being written.
 */
static void
jfreefrag_write(jfreefrag, jseg, data)
	struct jfreefrag *jfreefrag;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jfreefrag->fr_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_FREEBLK;
	rec->jb_ino = jfreefrag->fr_ino;
	rec->jb_blkno = jfreefrag->fr_blkno;
	rec->jb_lbn = jfreefrag->fr_lbn;
	rec->jb_frags = jfreefrag->fr_frags;
	rec->jb_oldfrags = 0;
}

/*
 * Emit a JOP_TRUNC record for a jtrunc into the segment buffer and
 * bind its jsegdep to the segment being written.
 */
static void
jtrunc_write(jtrunc, jseg, data)
	struct jtrunc *jtrunc;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jtrncrec *rec;

	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
	rec = (struct jtrncrec *)data;
	rec->jt_op = JOP_TRUNC;
	rec->jt_ino = jtrunc->jt_ino;
	rec->jt_size = jtrunc->jt_size;
	rec->jt_extsize = jtrunc->jt_extsize;
}

/*
 * Emit a JOP_SYNC record for a jfsync into the segment buffer.
 */
static void
jfsync_write(jfsync, jseg, data)
	struct jfsync *jfsync;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jtrncrec *rec;

	rec = (struct jtrncrec *)data;
	rec->jt_op = JOP_SYNC;
	rec->jt_ino = jfsync->jfs_ino;
	rec->jt_size = jfsync->jfs_size;
	rec->jt_extsize = jfsync->jfs_extsize;
}

/*
 * Synchronously write out every pending journal record on the mount.
 * No-op for non-SUJ mounts.  Acquires and releases the softdep lock.
 */
static void
softdep_flushjournal(mp)
	struct mount *mp;
{
	struct jblocks *jblocks;
	struct ufsmount *ump;

	if (MOUNTEDSUJ(mp) == 0)
		return;
	ump = VFSTOUFS(mp);
	jblocks = ump->softdep_jblocks;
	ACQUIRE_LOCK(&lk);
	while (ump->softdep_on_journal) {
		jblocks->jb_needseg = 1;
		softdep_process_journal(mp, NULL, MNT_WAIT);
	}
	FREE_LOCK(&lk);
}

/*
 * Flush some journal records to disk.
 */
static void
softdep_process_journal(mp, needwk, flags)
	struct mount *mp;
	struct worklist *needwk;
	int flags;
{
	struct jblocks *jblocks;
	struct ufsmount *ump;
	struct worklist *wk;
	struct jseg *jseg;
	struct buf *bp;
	uint8_t *data;
	struct fs *fs;
	int segwritten;
	int jrecmin;	/* Minimum records per block. */
	int jrecmax;	/* Maximum records per block. */
	int size;
	int cnt;
	int off;
	int devbsize;

	if (MOUNTEDSUJ(mp) == 0)
		return;
	ump = VFSTOUFS(mp);
	fs = ump->um_fs;
	jblocks = ump->softdep_jblocks;
	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
	/*
	 * We write anywhere between a disk block and fs block.  The upper
	 * bound is picked to prevent buffer cache fragmentation and limit
	 * processing time per I/O.
3112 */ 3113 jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ 3114 jrecmax = (fs->fs_bsize / devbsize) * jrecmin; 3115 segwritten = 0; 3116 for (;;) { 3117 cnt = ump->softdep_on_journal; 3118 /* 3119 * Criteria for writing a segment: 3120 * 1) We have a full block. 3121 * 2) We're called from jwait() and haven't found the 3122 * journal item yet. 3123 * 3) Always write if needseg is set. 3124 * 4) If we are called from process_worklist and have 3125 * not yet written anything we write a partial block 3126 * to enforce a 1 second maximum latency on journal 3127 * entries. 3128 */ 3129 if (cnt < (jrecmax - 1) && needwk == NULL && 3130 jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) 3131 break; 3132 cnt++; 3133 /* 3134 * Verify some free journal space. softdep_prealloc() should 3135 * guarantee that we don't run out so this is indicative of 3136 * a problem with the flow control. Try to recover 3137 * gracefully in any event. 3138 */ 3139 while (jblocks->jb_free == 0) { 3140 if (flags != MNT_WAIT) 3141 break; 3142 printf("softdep: Out of journal space!\n"); 3143 softdep_speedup(); 3144 msleep(jblocks, &lk, PRIBIO, "jblocks", hz); 3145 } 3146 FREE_LOCK(&lk); 3147 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 3148 workitem_alloc(&jseg->js_list, D_JSEG, mp); 3149 LIST_INIT(&jseg->js_entries); 3150 LIST_INIT(&jseg->js_indirs); 3151 jseg->js_state = ATTACHED; 3152 jseg->js_jblocks = jblocks; 3153 bp = geteblk(fs->fs_bsize, 0); 3154 ACQUIRE_LOCK(&lk); 3155 /* 3156 * If there was a race while we were allocating the block 3157 * and jseg the entry we care about was likely written. 3158 * We bail out in both the WAIT and NOWAIT case and assume 3159 * the caller will loop if the entry it cares about is 3160 * not written. 
3161 */ 3162 cnt = ump->softdep_on_journal; 3163 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { 3164 bp->b_flags |= B_INVAL | B_NOCACHE; 3165 WORKITEM_FREE(jseg, D_JSEG); 3166 FREE_LOCK(&lk); 3167 brelse(bp); 3168 ACQUIRE_LOCK(&lk); 3169 break; 3170 } 3171 /* 3172 * Calculate the disk block size required for the available 3173 * records rounded to the min size. 3174 */ 3175 if (cnt == 0) 3176 size = devbsize; 3177 else if (cnt < jrecmax) 3178 size = howmany(cnt, jrecmin) * devbsize; 3179 else 3180 size = fs->fs_bsize; 3181 /* 3182 * Allocate a disk block for this journal data and account 3183 * for truncation of the requested size if enough contiguous 3184 * space was not available. 3185 */ 3186 bp->b_blkno = jblocks_alloc(jblocks, size, &size); 3187 bp->b_lblkno = bp->b_blkno; 3188 bp->b_offset = bp->b_blkno * DEV_BSIZE; 3189 bp->b_bcount = size; 3190 bp->b_bufobj = &ump->um_devvp->v_bufobj; 3191 bp->b_flags &= ~B_INVAL; 3192 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; 3193 /* 3194 * Initialize our jseg with cnt records. Assign the next 3195 * sequence number to it and link it in-order. 3196 */ 3197 cnt = MIN(cnt, (size / devbsize) * jrecmin); 3198 jseg->js_buf = bp; 3199 jseg->js_cnt = cnt; 3200 jseg->js_refs = cnt + 1; /* Self ref. */ 3201 jseg->js_size = size; 3202 jseg->js_seq = jblocks->jb_nextseq++; 3203 if (jblocks->jb_oldestseg == NULL) 3204 jblocks->jb_oldestseg = jseg; 3205 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; 3206 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); 3207 if (jblocks->jb_writeseg == NULL) 3208 jblocks->jb_writeseg = jseg; 3209 /* 3210 * Start filling in records from the pending list. 3211 */ 3212 data = bp->b_data; 3213 off = 0; 3214 while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) 3215 != NULL) { 3216 if (cnt == 0) 3217 break; 3218 /* Place a segment header on every device block. 
*/ 3219 if ((off % devbsize) == 0) { 3220 jseg_write(ump, jseg, data); 3221 off += JREC_SIZE; 3222 data = bp->b_data + off; 3223 } 3224 if (wk == needwk) 3225 needwk = NULL; 3226 remove_from_journal(wk); 3227 wk->wk_state |= INPROGRESS; 3228 WORKLIST_INSERT(&jseg->js_entries, wk); 3229 switch (wk->wk_type) { 3230 case D_JADDREF: 3231 jaddref_write(WK_JADDREF(wk), jseg, data); 3232 break; 3233 case D_JREMREF: 3234 jremref_write(WK_JREMREF(wk), jseg, data); 3235 break; 3236 case D_JMVREF: 3237 jmvref_write(WK_JMVREF(wk), jseg, data); 3238 break; 3239 case D_JNEWBLK: 3240 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 3241 break; 3242 case D_JFREEBLK: 3243 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 3244 break; 3245 case D_JFREEFRAG: 3246 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 3247 break; 3248 case D_JTRUNC: 3249 jtrunc_write(WK_JTRUNC(wk), jseg, data); 3250 break; 3251 case D_JFSYNC: 3252 jfsync_write(WK_JFSYNC(wk), jseg, data); 3253 break; 3254 default: 3255 panic("process_journal: Unknown type %s", 3256 TYPENAME(wk->wk_type)); 3257 /* NOTREACHED */ 3258 } 3259 off += JREC_SIZE; 3260 data = bp->b_data + off; 3261 cnt--; 3262 } 3263 /* 3264 * Write this one buffer and continue. 3265 */ 3266 segwritten = 1; 3267 jblocks->jb_needseg = 0; 3268 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 3269 FREE_LOCK(&lk); 3270 BO_LOCK(bp->b_bufobj); 3271 bgetvp(ump->um_devvp, bp); 3272 BO_UNLOCK(bp->b_bufobj); 3273 /* 3274 * We only do the blocking wait once we find the journal 3275 * entry we're looking for. 3276 */ 3277 if (needwk == NULL && flags == MNT_WAIT) 3278 bwrite(bp); 3279 else 3280 bawrite(bp); 3281 ACQUIRE_LOCK(&lk); 3282 } 3283 /* 3284 * If we've suspended the filesystem because we ran out of journal 3285 * space either try to sync it here to make some progress or 3286 * unsuspend it if we already have. 
3287 */ 3288 if (flags == 0 && jblocks->jb_suspended) { 3289 if (journal_unsuspend(ump)) 3290 return; 3291 FREE_LOCK(&lk); 3292 VFS_SYNC(mp, MNT_NOWAIT); 3293 ffs_sbupdate(ump, MNT_WAIT, 0); 3294 ACQUIRE_LOCK(&lk); 3295 } 3296} 3297 3298/* 3299 * Complete a jseg, allowing all dependencies awaiting journal writes 3300 * to proceed. Each journal dependency also attaches a jsegdep to dependent 3301 * structures so that the journal segment can be freed to reclaim space. 3302 */ 3303static void 3304complete_jseg(jseg) 3305 struct jseg *jseg; 3306{ 3307 struct worklist *wk; 3308 struct jmvref *jmvref; 3309 int waiting; 3310#ifdef INVARIANTS 3311 int i = 0; 3312#endif 3313 3314 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 3315 WORKLIST_REMOVE(wk); 3316 waiting = wk->wk_state & IOWAITING; 3317 wk->wk_state &= ~(INPROGRESS | IOWAITING); 3318 wk->wk_state |= COMPLETE; 3319 KASSERT(i++ < jseg->js_cnt, 3320 ("handle_written_jseg: overflow %d >= %d", 3321 i - 1, jseg->js_cnt)); 3322 switch (wk->wk_type) { 3323 case D_JADDREF: 3324 handle_written_jaddref(WK_JADDREF(wk)); 3325 break; 3326 case D_JREMREF: 3327 handle_written_jremref(WK_JREMREF(wk)); 3328 break; 3329 case D_JMVREF: 3330 rele_jseg(jseg); /* No jsegdep. */ 3331 jmvref = WK_JMVREF(wk); 3332 LIST_REMOVE(jmvref, jm_deps); 3333 if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) 3334 free_pagedep(jmvref->jm_pagedep); 3335 WORKITEM_FREE(jmvref, D_JMVREF); 3336 break; 3337 case D_JNEWBLK: 3338 handle_written_jnewblk(WK_JNEWBLK(wk)); 3339 break; 3340 case D_JFREEBLK: 3341 handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); 3342 break; 3343 case D_JTRUNC: 3344 handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); 3345 break; 3346 case D_JFSYNC: 3347 rele_jseg(jseg); /* No jsegdep. 
*/ 3348 WORKITEM_FREE(wk, D_JFSYNC); 3349 break; 3350 case D_JFREEFRAG: 3351 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 3352 break; 3353 default: 3354 panic("handle_written_jseg: Unknown type %s", 3355 TYPENAME(wk->wk_type)); 3356 /* NOTREACHED */ 3357 } 3358 if (waiting) 3359 wakeup(wk); 3360 } 3361 /* Release the self reference so the structure may be freed. */ 3362 rele_jseg(jseg); 3363} 3364 3365/* 3366 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg 3367 * completions in order only. 3368 */ 3369static void 3370handle_written_jseg(jseg, bp) 3371 struct jseg *jseg; 3372 struct buf *bp; 3373{ 3374 struct jblocks *jblocks; 3375 struct jseg *jsegn; 3376 3377 if (jseg->js_refs == 0) 3378 panic("handle_written_jseg: No self-reference on %p", jseg); 3379 jseg->js_state |= DEPCOMPLETE; 3380 /* 3381 * We'll never need this buffer again, set flags so it will be 3382 * discarded. 3383 */ 3384 bp->b_flags |= B_INVAL | B_NOCACHE; 3385 jblocks = jseg->js_jblocks; 3386 /* 3387 * Don't allow out of order completions. If this isn't the first 3388 * block wait for it to write before we're done. 3389 */ 3390 if (jseg != jblocks->jb_writeseg) 3391 return; 3392 /* Iterate through available jsegs processing their entries. */ 3393 do { 3394 jblocks->jb_oldestwrseq = jseg->js_oldseq; 3395 jsegn = TAILQ_NEXT(jseg, js_next); 3396 complete_jseg(jseg); 3397 jseg = jsegn; 3398 } while (jseg && jseg->js_state & DEPCOMPLETE); 3399 jblocks->jb_writeseg = jseg; 3400 /* 3401 * Attempt to free jsegs now that oldestwrseq may have advanced. 3402 */ 3403 free_jsegs(jblocks); 3404} 3405 3406static inline struct jsegdep * 3407inoref_jseg(inoref) 3408 struct inoref *inoref; 3409{ 3410 struct jsegdep *jsegdep; 3411 3412 jsegdep = inoref->if_jsegdep; 3413 inoref->if_jsegdep = NULL; 3414 3415 return (jsegdep); 3416} 3417 3418/* 3419 * Called once a jremref has made it to stable store. The jremref is marked 3420 * complete and we attempt to free it. 
Any pagedeps writes sleeping waiting
 * for the jremref to complete will be awoken by free_jremref.
 */
static void
handle_written_jremref(jremref)
	struct jremref *jremref;
{
	struct inodedep *inodedep;
	struct jsegdep *jsegdep;
	struct dirrem *dirrem;

	/* Grab the jsegdep. */
	jsegdep = inoref_jseg(&jremref->jr_ref);
	/*
	 * Remove us from the inoref list.
	 */
	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
	    0, &inodedep) == 0)
		panic("handle_written_jremref: Lost inodedep");
	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
	/*
	 * Complete the dirrem.
	 */
	dirrem = jremref->jr_dirrem;
	jremref->jr_dirrem = NULL;
	LIST_REMOVE(jremref, jr_deps);
	/* Propagate MKDIR_PARENT so handle_workitem_remove can sort it out. */
	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
	jwork_insert(&dirrem->dm_jwork, jsegdep);
	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
	    (dirrem->dm_state & COMPLETE) != 0)
		add_to_worklist(&dirrem->dm_list, 0);
	free_jremref(jremref);
}

/*
 * Called once a jaddref has made it to stable store.  The dependency is
 * marked complete and any dependent structures are added to the inode
 * bufwait list to be completed as soon as it is written.  If a bitmap write
 * depends on this entry we move the inode into the inodedephd of the
 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
 */
static void
handle_written_jaddref(jaddref)
	struct jaddref *jaddref;
{
	struct jsegdep *jsegdep;
	struct inodedep *inodedep;
	struct diradd *diradd;
	struct mkdir *mkdir;

	/* Grab the jsegdep. */
	jsegdep = inoref_jseg(&jaddref->ja_ref);
	mkdir = NULL;
	diradd = NULL;
	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
	    0, &inodedep) == 0)
		panic("handle_written_jaddref: Lost inodedep.");
	if (jaddref->ja_diradd == NULL)
		panic("handle_written_jaddref: No dependency");
	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
		diradd = jaddref->ja_diradd;
		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
	} else if (jaddref->ja_state & MKDIR_PARENT) {
		mkdir = jaddref->ja_mkdir;
		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
	} else if (jaddref->ja_state & MKDIR_BODY)
		mkdir = jaddref->ja_mkdir;
	else
		panic("handle_written_jaddref: Unknown dependency %p",
		    jaddref->ja_diradd);
	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
	/*
	 * Remove us from the inode list.
	 */
	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
	/*
	 * The mkdir may be waiting on the jaddref to clear before freeing.
	 */
	if (mkdir) {
		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
		    ("handle_written_jaddref: Incorrect type for mkdir %s",
		    TYPENAME(mkdir->md_list.wk_type)));
		mkdir->md_jaddref = NULL;
		diradd = mkdir->md_diradd;
		mkdir->md_state |= DEPCOMPLETE;
		complete_mkdir(mkdir);
	}
	jwork_insert(&diradd->da_jwork, jsegdep);
	if (jaddref->ja_state & NEWBLOCK) {
		inodedep->id_state |= ONDEPLIST;
		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
		    inodedep, id_deps);
	}
	free_jaddref(jaddref);
}

/*
 * Called once a jnewblk journal is written.  The allocdirect or allocindir
 * is placed in the bmsafemap to await notification of a written bitmap.  If
 * the operation was canceled we add the segdep to the appropriate
 * dependency to free the journal space once the canceling operation
 * completes.
 */
static void
handle_written_jnewblk(jnewblk)
	struct jnewblk *jnewblk;
{
	struct bmsafemap *bmsafemap;
	struct freefrag *freefrag;
	struct freework *freework;
	struct jsegdep *jsegdep;
	struct newblk *newblk;

	/* Grab the jsegdep. */
	jsegdep = jnewblk->jn_jsegdep;
	jnewblk->jn_jsegdep = NULL;
	if (jnewblk->jn_dep == NULL)
		panic("handle_written_jnewblk: No dependency for the segdep.");
	switch (jnewblk->jn_dep->wk_type) {
	case D_NEWBLK:
	case D_ALLOCDIRECT:
	case D_ALLOCINDIR:
		/*
		 * Add the written block to the bmsafemap so it can
		 * be notified when the bitmap is on disk.
		 */
		newblk = WK_NEWBLK(jnewblk->jn_dep);
		newblk->nb_jnewblk = NULL;
		if ((newblk->nb_state & GOINGAWAY) == 0) {
			bmsafemap = newblk->nb_bmsafemap;
			newblk->nb_state |= ONDEPLIST;
			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
			    nb_deps);
		}
		jwork_insert(&newblk->nb_jwork, jsegdep);
		break;
	case D_FREEFRAG:
		/*
		 * A newblock being removed by a freefrag when replaced by
		 * frag extension.
		 */
		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
		freefrag->ff_jdep = NULL;
		WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
		break;
	case D_FREEWORK:
		/*
		 * A direct block was removed by truncate.
		 */
		freework = WK_FREEWORK(jnewblk->jn_dep);
		freework->fw_jnewblk = NULL;
		WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork,
		    &jsegdep->jd_list);
		break;
	default:
		panic("handle_written_jnewblk: Unknown type %d.",
		    jnewblk->jn_dep->wk_type);
	}
	jnewblk->jn_dep = NULL;
	free_jnewblk(jnewblk);
}

/*
 * Cancel a jfreefrag that won't be needed, probably due to colliding with
 * an in-flight allocation that has not yet been committed.  Divorce us
 * from the freefrag and mark it DEPCOMPLETE so that it may be added
 * to the worklist.
 */
static void
cancel_jfreefrag(jfreefrag)
	struct jfreefrag *jfreefrag;
{
	struct freefrag *freefrag;

	if (jfreefrag->fr_jsegdep) {
		free_jsegdep(jfreefrag->fr_jsegdep);
		jfreefrag->fr_jsegdep = NULL;
	}
	freefrag = jfreefrag->fr_freefrag;
	jfreefrag->fr_freefrag = NULL;
	free_jfreefrag(jfreefrag);
	freefrag->ff_state |= DEPCOMPLETE;
}

/*
 * Free a jfreefrag when the parent freefrag is rendered obsolete.
 */
static void
free_jfreefrag(jfreefrag)
	struct jfreefrag *jfreefrag;
{

	if (jfreefrag->fr_state & INPROGRESS)
		WORKLIST_REMOVE(&jfreefrag->fr_list);
	else if (jfreefrag->fr_state & ONWORKLIST)
		remove_from_journal(&jfreefrag->fr_list);
	if (jfreefrag->fr_freefrag != NULL)
		panic("free_jfreefrag: Still attached to a freefrag.");
	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
}

/*
 * Called when the journal write for a jfreefrag completes.  The parent
 * freefrag is added to the worklist if this completes its dependencies.
 */
static void
handle_written_jfreefrag(jfreefrag)
	struct jfreefrag *jfreefrag;
{
	struct jsegdep *jsegdep;
	struct freefrag *freefrag;

	/* Grab the jsegdep. */
	jsegdep = jfreefrag->fr_jsegdep;
	jfreefrag->fr_jsegdep = NULL;
	freefrag = jfreefrag->fr_freefrag;
	if (freefrag == NULL)
		panic("handle_written_jfreefrag: No freefrag.");
	freefrag->ff_state |= DEPCOMPLETE;
	freefrag->ff_jdep = NULL;
	jwork_insert(&freefrag->ff_jwork, jsegdep);
	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
		add_to_worklist(&freefrag->ff_list, 0);
	jfreefrag->fr_freefrag = NULL;
	free_jfreefrag(jfreefrag);
}

/*
 * Called when the journal write for a jfreeblk completes.
The jfreeblk
 * is removed from the freeblks list of pending journal writes and the
 * jsegdep is moved to the freeblks jwork to be completed when all blocks
 * have been reclaimed.
 */
static void
handle_written_jblkdep(jblkdep)
	struct jblkdep *jblkdep;
{
	struct freeblks *freeblks;
	struct jsegdep *jsegdep;

	/* Grab the jsegdep. */
	jsegdep = jblkdep->jb_jsegdep;
	jblkdep->jb_jsegdep = NULL;
	freeblks = jblkdep->jb_freeblks;
	LIST_REMOVE(jblkdep, jb_deps);
	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
	/*
	 * If the freeblks is all journaled, we can add it to the worklist.
	 */
	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
		add_to_worklist(&freeblks->fb_list, WK_NODELAY);

	free_jblkdep(jblkdep);
}

/*
 * Allocate a jsegdep on the same mount as wk.  The owning journal
 * segment (jd_seg) is left NULL here and filled in when the associated
 * record is written (see the *_write() routines above).
 */
static struct jsegdep *
newjsegdep(struct worklist *wk)
{
	struct jsegdep *jsegdep;

	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
	jsegdep->jd_seg = NULL;

	return (jsegdep);
}

/*
 * Allocate a jmvref recording the move of inode ino within directory dp
 * from offset oldoff to newoff.  The item is created ATTACHED and
 * DEPCOMPLETE since all the information needed for the journal write is
 * available at allocation time.
 */
static struct jmvref *
newjmvref(dp, ino, oldoff, newoff)
	struct inode *dp;
	ino_t ino;
	off_t oldoff;
	off_t newoff;
{
	struct jmvref *jmvref;

	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
	jmvref->jm_parent = dp->i_number;
	jmvref->jm_ino = ino;
	jmvref->jm_oldoff = oldoff;
	jmvref->jm_newoff = newoff;

	return (jmvref);
}

/*
 * Allocate a new jremref that tracks the removal of ip from dp with the
 * directory entry offset of diroff.  Mark the entry as ATTACHED and
 * DEPCOMPLETE as we have all the information required for the journal write
 * and the directory has already been removed from the buffer.  The caller
 * is responsible for linking the jremref into the pagedep and adding it
 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
 * a DOTDOT addition so handle_workitem_remove() can properly assign
 * the jsegdep when we're done.
 */
static struct jremref *
newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
    off_t diroff, nlink_t nlink)
{
	struct jremref *jremref;

	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
	jremref->jr_state = ATTACHED;
	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
	   nlink, ip->i_mode);
	jremref->jr_dirrem = dirrem;

	return (jremref);
}

/*
 * Initialize the common inoref portion of a jaddref/jremref, including
 * allocation of its jsegdep.
 */
static inline void
newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
    nlink_t nlink, uint16_t mode)
{

	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
	inoref->if_diroff = diroff;
	inoref->if_ino = ino;
	inoref->if_parent = parent;
	inoref->if_nlink = nlink;
	inoref->if_mode = mode;
}

/*
 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
 * directory offset may not be known until later.  The caller is responsible
 * adding the entry to the journal when this information is available.  nlink
 * should be the link count prior to the addition and mode is only required
 * to have the correct FMT.
 */
static struct jaddref *
newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
    uint16_t mode)
{
	struct jaddref *jaddref;

	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
	jaddref->ja_state = ATTACHED;
	jaddref->ja_mkdir = NULL;
	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);

	return (jaddref);
}

/*
 * Create a new free dependency for a freework.  The caller is responsible
 * for adjusting the reference count when it has the lock held.  The freedep
 * will track an outstanding bitmap write that will ultimately clear the
 * freework to continue.
 */
static struct freedep *
newfreedep(struct freework *freework)
{
	struct freedep *freedep;

	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
	freedep->fd_freework = freework;

	return (freedep);
}

/*
 * Free a freedep structure once the buffer it is linked to is written.  If
 * this is the last reference to the freework schedule it for completion.
 */
static void
free_freedep(freedep)
	struct freedep *freedep;
{
	struct freework *freework;

	freework = freedep->fd_freework;
	freework->fw_freeblks->fb_cgwait--;
	if (--freework->fw_ref == 0)
		freework_enqueue(freework);
	WORKITEM_FREE(freedep, D_FREEDEP);
}

/*
 * Allocate a new freework structure that may be a level in an indirect
 * when parent is not NULL or a top level block when it is.  The top level
 * freework structures are allocated without lk held and before the freeblks
 * is visible outside of softdep_setup_freeblocks().
 */
static struct freework *
newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
	struct ufsmount *ump;
	struct freeblks *freeblks;
	struct freework *parent;
	ufs_lbn_t lbn;
	ufs2_daddr_t nb;
	int frags;
	int off;
	int journal;
{
	struct freework *freework;

	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
	freework->fw_state = ATTACHED;
	freework->fw_jnewblk = NULL;
	freework->fw_freeblks = freeblks;
	freework->fw_parent = parent;
	freework->fw_lbn = lbn;
	freework->fw_blkno = nb;
	freework->fw_frags = frags;
	freework->fw_indir = NULL;
	/*
	 * Non-SUJ mounts and non-indirect blocks take no references;
	 * indirects take one per pointer plus one for themselves.
	 */
	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
		? 0 : NINDIR(ump->um_fs) + 1;
	freework->fw_start = freework->fw_off = off;
	if (journal)
		newjfreeblk(freeblks, lbn, nb, frags);
	if (parent == NULL) {
		ACQUIRE_LOCK(&lk);
		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
		freeblks->fb_ref++;
		FREE_LOCK(&lk);
	}

	return (freework);
}

/*
 * Eliminate a jfreeblk for a block that does not need journaling.
 */
static void
cancel_jfreeblk(freeblks, blkno)
	struct freeblks *freeblks;
	ufs2_daddr_t blkno;
{
	struct jfreeblk *jfreeblk;
	struct jblkdep *jblkdep;

	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
			continue;
		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
		if (jfreeblk->jf_blkno == blkno)
			break;
	}
	if (jblkdep == NULL)
		return;
	free_jsegdep(jblkdep->jb_jsegdep);
	LIST_REMOVE(jblkdep, jb_deps);
	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
}

/*
 * Allocate a new jfreeblk to journal top level block pointer when truncating
 * a file.
The caller must add this to the worklist when lk is held.
 */
static struct jfreeblk *
newjfreeblk(freeblks, lbn, blkno, frags)
	struct freeblks *freeblks;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int frags;
{
	struct jfreeblk *jfreeblk;

	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
	    freeblks->fb_list.wk_mp);
	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
	jfreeblk->jf_dep.jb_freeblks = freeblks;
	jfreeblk->jf_ino = freeblks->fb_inum;
	jfreeblk->jf_lbn = lbn;
	jfreeblk->jf_blkno = blkno;
	jfreeblk->jf_frags = frags;
	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);

	return (jfreeblk);
}

/*
 * Allocate a new jtrunc to track a partial truncation.
 */
static struct jtrunc *
newjtrunc(freeblks, size, extsize)
	struct freeblks *freeblks;
	off_t size;
	int extsize;
{
	struct jtrunc *jtrunc;

	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
	    freeblks->fb_list.wk_mp);
	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
	jtrunc->jt_dep.jb_freeblks = freeblks;
	jtrunc->jt_ino = freeblks->fb_inum;
	jtrunc->jt_size = size;
	jtrunc->jt_extsize = extsize;
	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);

	return (jtrunc);
}

/*
 * If we're canceling a new bitmap we have to search for another ref
 * to move into the bmsafemap dep.  This might be better expressed
 * with another structure.
 */
static void
move_newblock_dep(jaddref, inodedep)
	struct jaddref *jaddref;
	struct inodedep *inodedep;
{
	struct inoref *inoref;
	struct jaddref *jaddrefn;

	jaddrefn = NULL;
	/* Scan the inorefs following jaddref for the next jaddref. */
	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
	    inoref = TAILQ_NEXT(inoref, if_deps)) {
		if ((jaddref->ja_state & NEWBLOCK) &&
		    inoref->if_list.wk_type == D_JADDREF) {
			jaddrefn = (struct jaddref *)inoref;
			break;
		}
	}
	if (jaddrefn == NULL)
		return;
	/* Transfer the NEWBLOCK-related state bits to the successor. */
	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
	jaddrefn->ja_state |= jaddref->ja_state &
	    (ATTACHED | UNDONE | NEWBLOCK);
	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
	jaddref->ja_state |= ATTACHED;
	LIST_REMOVE(jaddref, ja_bmdeps);
	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
	    ja_bmdeps);
}

/*
 * Cancel a jaddref either before it has been written or while it is being
 * written.  This happens when a link is removed before the add reaches
 * the disk.  The jaddref dependency is kept linked into the bmsafemap
 * and inode to prevent the link count or bitmap from reaching the disk
 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
 * required.
 *
 * Returns 1 if the canceled addref requires journaling of the remove and
 * 0 otherwise.
 */
static int
cancel_jaddref(jaddref, inodedep, wkhd)
	struct jaddref *jaddref;
	struct inodedep *inodedep;
	struct workhead *wkhd;
{
	struct inoref *inoref;
	struct jsegdep *jsegdep;
	int needsj;

	KASSERT((jaddref->ja_state & COMPLETE) == 0,
	    ("cancel_jaddref: Canceling complete jaddref"));
	/* A record already on or past its way to disk must be journaled. */
	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
		needsj = 1;
	else
		needsj = 0;
	if (inodedep == NULL)
		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
		    0, &inodedep) == 0)
			panic("cancel_jaddref: Lost inodedep");
	/*
	 * We must adjust the nlink of any reference operation that follows
	 * us so that it is consistent with the in-memory reference.  This
	 * ensures that inode nlink rollbacks always have the correct link.
	 */
	if (needsj == 0) {
		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
		    inoref = TAILQ_NEXT(inoref, if_deps)) {
			if (inoref->if_state & GOINGAWAY)
				break;
			inoref->if_nlink--;
		}
	}
	jsegdep = inoref_jseg(&jaddref->ja_ref);
	if (jaddref->ja_state & NEWBLOCK)
		move_newblock_dep(jaddref, inodedep);
	wake_worklist(&jaddref->ja_list);
	jaddref->ja_mkdir = NULL;
	if (jaddref->ja_state & INPROGRESS) {
		jaddref->ja_state &= ~INPROGRESS;
		WORKLIST_REMOVE(&jaddref->ja_list);
		jwork_insert(wkhd, jsegdep);
	} else {
		free_jsegdep(jsegdep);
		if (jaddref->ja_state & DEPCOMPLETE)
			remove_from_journal(&jaddref->ja_list);
	}
	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
	/*
	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
	 * can arrange for them to be freed with the bitmap.  Otherwise we
	 * no longer need this addref attached to the inoreflst and it
	 * will incorrectly adjust nlink if we leave it.
	 */
	if ((jaddref->ja_state & NEWBLOCK) == 0) {
		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
		    if_deps);
		jaddref->ja_state |= COMPLETE;
		free_jaddref(jaddref);
		return (needsj);
	}
	/*
	 * Leave the head of the list for jsegdeps for fast merging.
	 */
	if (LIST_FIRST(wkhd) != NULL) {
		jaddref->ja_state |= ONWORKLIST;
		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
	} else
		WORKLIST_INSERT(wkhd, &jaddref->ja_list);

	return (needsj);
}

/*
 * Attempt to free a jaddref structure when some work completes.  This
 * should only succeed once the entry is written and all dependencies have
 * been notified.
 */
static void
free_jaddref(jaddref)
	struct jaddref *jaddref;
{

	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	if (jaddref->ja_ref.if_jsegdep)
		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
		    jaddref, jaddref->ja_state);
	if (jaddref->ja_state & NEWBLOCK)
		LIST_REMOVE(jaddref, ja_bmdeps);
	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
		panic("free_jaddref: Bad state %p(0x%X)",
		    jaddref, jaddref->ja_state);
	if (jaddref->ja_mkdir != NULL)
		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
	WORKITEM_FREE(jaddref, D_JADDREF);
}

/*
 * Free a jremref structure once it has been written or discarded.
 */
static void
free_jremref(jremref)
	struct jremref *jremref;
{

	if (jremref->jr_ref.if_jsegdep)
		free_jsegdep(jremref->jr_ref.if_jsegdep);
	if (jremref->jr_state & INPROGRESS)
		panic("free_jremref: IO still pending");
	WORKITEM_FREE(jremref, D_JREMREF);
}

/*
 * Free a jnewblk structure.
4084 */ 4085static void 4086free_jnewblk(jnewblk) 4087 struct jnewblk *jnewblk; 4088{ 4089 4090 if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) 4091 return; 4092 LIST_REMOVE(jnewblk, jn_deps); 4093 if (jnewblk->jn_dep != NULL) 4094 panic("free_jnewblk: Dependency still attached."); 4095 WORKITEM_FREE(jnewblk, D_JNEWBLK); 4096} 4097 4098/* 4099 * Cancel a jnewblk which has been been made redundant by frag extension. 4100 */ 4101static void 4102cancel_jnewblk(jnewblk, wkhd) 4103 struct jnewblk *jnewblk; 4104 struct workhead *wkhd; 4105{ 4106 struct jsegdep *jsegdep; 4107 4108 jsegdep = jnewblk->jn_jsegdep; 4109 if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) 4110 panic("cancel_jnewblk: Invalid state"); 4111 jnewblk->jn_jsegdep = NULL; 4112 jnewblk->jn_dep = NULL; 4113 jnewblk->jn_state |= GOINGAWAY; 4114 if (jnewblk->jn_state & INPROGRESS) { 4115 jnewblk->jn_state &= ~INPROGRESS; 4116 WORKLIST_REMOVE(&jnewblk->jn_list); 4117 jwork_insert(wkhd, jsegdep); 4118 } else { 4119 free_jsegdep(jsegdep); 4120 remove_from_journal(&jnewblk->jn_list); 4121 } 4122 wake_worklist(&jnewblk->jn_list); 4123 WORKLIST_INSERT(wkhd, &jnewblk->jn_list); 4124} 4125 4126static void 4127free_jblkdep(jblkdep) 4128 struct jblkdep *jblkdep; 4129{ 4130 4131 if (jblkdep->jb_list.wk_type == D_JFREEBLK) 4132 WORKITEM_FREE(jblkdep, D_JFREEBLK); 4133 else if (jblkdep->jb_list.wk_type == D_JTRUNC) 4134 WORKITEM_FREE(jblkdep, D_JTRUNC); 4135 else 4136 panic("free_jblkdep: Unexpected type %s", 4137 TYPENAME(jblkdep->jb_list.wk_type)); 4138} 4139 4140/* 4141 * Free a single jseg once it is no longer referenced in memory or on 4142 * disk. Reclaim journal blocks and dependencies waiting for the segment 4143 * to disappear. 
 */
static void
free_jseg(jseg, jblocks)
	struct jseg *jseg;		/* segment to reclaim */
	struct jblocks *jblocks;	/* journal block accounting for the mount */
{
	struct freework *freework;

	/*
	 * Free freework structures that were lingering to indicate freed
	 * indirect blocks that forced journal write ordering on reallocate.
	 */
	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
		indirblk_remove(freework);
	/* Advance oldestseg past the segment being removed. */
	if (jblocks->jb_oldestseg == jseg)
		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
	KASSERT(LIST_EMPTY(&jseg->js_entries),
	    ("free_jseg: Freed jseg has valid entries."));
	WORKITEM_FREE(jseg, D_JSEG);
}

/*
 * Free all jsegs that meet the criteria for being reclaimed and update
 * oldestseg.
 */
static void
free_jsegs(jblocks)
	struct jblocks *jblocks;
{
	struct jseg *jseg;

	/*
	 * Free only those jsegs which have none allocated before them to
	 * preserve the journal space ordering.
	 */
	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
		/*
		 * Only reclaim space when nothing depends on this journal
		 * set and another set has written that it is no longer
		 * valid.
		 */
		if (jseg->js_refs != 0) {
			jblocks->jb_oldestseg = jseg;
			return;
		}
		if (!LIST_EMPTY(&jseg->js_indirs) &&
		    jseg->js_seq >= jblocks->jb_oldestwrseq)
			break;
		free_jseg(jseg, jblocks);
	}
	/*
	 * If we exited the loop above we still must discover the
	 * oldest valid segment.
	 */
	if (jseg)
		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
		     jseg = TAILQ_NEXT(jseg, js_next))
			if (jseg->js_refs != 0)
				break;
	jblocks->jb_oldestseg = jseg;
	/*
	 * The journal has no valid records but some jsegs may still be
	 * waiting on oldestwrseq to advance.  We force a small record
	 * out to permit these lingering records to be reclaimed.
	 */
	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
		jblocks->jb_needseg = 1;
}

/*
 * Release one reference to a jseg and free it if the count reaches 0.  This
 * should eventually reclaim journal space as well.
 */
static void
rele_jseg(jseg)
	struct jseg *jseg;
{

	KASSERT(jseg->js_refs > 0,
	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
	if (--jseg->js_refs != 0)
		return;
	/* Last reference: try to reclaim journal space from the front. */
	free_jsegs(jseg->js_jblocks);
}

/*
 * Release a jsegdep and decrement the jseg count.
 */
static void
free_jsegdep(jsegdep)
	struct jsegdep *jsegdep;
{

	if (jsegdep->jd_seg)
		rele_jseg(jsegdep->jd_seg);
	WORKITEM_FREE(jsegdep, D_JSEGDEP);
}

/*
 * Wait for a journal item to make it to disk.  Initiate journal processing
 * if required.  Returns 0 once the item is stable, or EBUSY for
 * non-blocking (!MNT_WAIT) callers whose item is not yet on disk.
 */
static int
jwait(wk, waitfor)
	struct worklist *wk;	/* journal work item to wait on */
	int waitfor;		/* MNT_WAIT to block, else poll */
{

	/*
	 * Blocking journal waits cause slow synchronous behavior.  Record
	 * stats on the frequency of these blocking operations.
	 */
	if (waitfor == MNT_WAIT) {
		stat_journal_wait++;
		switch (wk->wk_type) {
		case D_JREMREF:
		case D_JMVREF:
			stat_jwait_filepage++;
			break;
		case D_JTRUNC:
		case D_JFREEBLK:
			stat_jwait_freeblks++;
			break;
		case D_JNEWBLK:
			stat_jwait_newblk++;
			break;
		case D_JADDREF:
			stat_jwait_inode++;
			break;
		default:
			break;
		}
	}
	/*
	 * If IO has not started we process the journal.  We can't mark the
	 * worklist item as IOWAITING because we drop the lock while
	 * processing the journal and the worklist entry may be freed after
	 * this point.  The caller may call back in and re-issue the request.
	 */
	if ((wk->wk_state & INPROGRESS) == 0) {
		softdep_process_journal(wk->wk_mp, wk, waitfor);
		if (waitfor != MNT_WAIT)
			return (EBUSY);
		return (0);
	}
	if (waitfor != MNT_WAIT)
		return (EBUSY);
	wait_worklist(wk, "jwait");
	return (0);
}

/*
 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
 * appropriate.  This is a convenience function to reduce duplicate code
 * for the setup and revert functions below.  Called with lk held.
 */
static struct inodedep *
inodedep_lookup_ip(ip)
	struct inode *ip;
{
	struct inodedep *inodedep;

	KASSERT(ip->i_nlink >= ip->i_effnlink,
	    ("inodedep_lookup_ip: bad delta"));
	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
	    DEPALLOC, &inodedep);
	/* Record how far the on-disk link count lags the effective one. */
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;

	return (inodedep);
}

/*
 * Called prior to creating a new inode and linking it to a directory.  The
 * jaddref structure must already be allocated by softdep_setup_inomapdep
 * and it is discovered here so we can initialize the mode and update
 * nlinkdelta.
 */
void
softdep_setup_create(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* newly created inode */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	KASSERT(ip->i_nlink == 1,
	    ("softdep_setup_create: Invalid link count."));
	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		/* The inomapdep jaddref must be last on the inoref list. */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_create: No addref structure present."));
	}
	softdep_prelink(dvp, NULL);
	FREE_LOCK(&lk);
}

/*
 * Create a jaddref structure to track the addition of a DOTDOT link when
 * we are reparenting an inode as part of a rename.
 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
 * non-journaling softdep.
 */
void
softdep_setup_dotdot_link(dp, ip)
	struct inode *dp;	/* new parent directory */
	struct inode *ip;	/* inode whose ".." is being repointed */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;
	struct vnode *vp;

	dvp = ITOV(dp);
	vp = ITOV(ip);
	jaddref = NULL;
	/*
	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
	 * is used as a normal link would be.
	 */
	if (DOINGSUJ(dvp))
		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
		    dp->i_effnlink - 1, dp->i_mode);
	ACQUIRE_LOCK(&lk);
	/* Note: the reference is tracked against the parent's inodedep. */
	inodedep = inodedep_lookup_ip(dp);
	if (jaddref)
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
		    if_deps);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Create a jaddref structure to track a new link to an inode.  The directory
 * offset is not known until softdep_setup_directory_add or
 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
 * softdep.
 */
void
softdep_setup_link(dp, ip)
	struct inode *dp;	/* directory gaining the entry */
	struct inode *ip;	/* inode gaining the link */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	jaddref = NULL;
	if (DOINGSUJ(dvp))
		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
		    ip->i_mode);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (jaddref)
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
		    if_deps);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Called to create the jaddref structures to track . and .. references as
 * well as lookup and further initialize the incomplete jaddref created
 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
 * nlinkdelta for non-journaling softdep.
 */
void
softdep_setup_mkdir(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* new directory inode */
{
	struct inodedep *inodedep;
	struct jaddref *dotdotaddref;
	struct jaddref *dotaddref;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	dotaddref = dotdotaddref = NULL;
	if (DOINGSUJ(dvp)) {
		/* "." references the new directory itself. */
		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
		    ip->i_mode);
		dotaddref->ja_state |= MKDIR_BODY;
		/* ".." adds a link to the parent. */
		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
		    dp->i_effnlink - 1, dp->i_mode);
		dotdotaddref->ja_state |= MKDIR_PARENT;
	}
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		/* The inomapdep jaddref must already be on the list. */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL,
		    ("softdep_setup_mkdir: No addref structure present."));
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_mkdir: bad parent %d",
		    jaddref->ja_parent));
		/* The "." ref must precede the allocation jaddref. */
		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
		    if_deps);
	}
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp))
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
		    &dotdotaddref->ja_ref, if_deps);
	softdep_prelink(ITOV(dp), NULL);
	FREE_LOCK(&lk);
}

/*
 * Called to track nlinkdelta of the inode and parent directories prior to
 * unlinking a directory.
 */
void
softdep_setup_rmdir(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* directory being removed */
{
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup_ip(ip);
	(void) inodedep_lookup_ip(dp);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Called to track nlinkdelta of the inode and parent directories prior to
 * unlink.
 */
void
softdep_setup_unlink(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* inode being unlinked */
{
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup_ip(ip);
	(void) inodedep_lookup_ip(dp);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed non-directory
 * creation.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_create(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* inode whose creation failed */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		/* Cancel the jaddref queued by the failed create. */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_create: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed dotdot link
 * creation.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_dotdot_link(dp, ip)
	struct inode *dp;	/* directory the ".." pointed at */
	struct inode *ip;	/* inode whose reparent failed */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	/* The dotdot jaddref was recorded on the parent's inodedep. */
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == ip->i_number,
		    ("softdep_revert_dotdot_link: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed link
 * addition.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_link(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* inode whose link addition failed */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_link: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed mkdir
 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_mkdir(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* directory whose creation failed */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct jaddref *dotaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);

	ACQUIRE_LOCK(&lk);
	/* First cancel the ".." jaddref recorded on the parent. */
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == ip->i_number,
		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	/* Then cancel the name jaddref and the preceding "." jaddref. */
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_mkdir: addref parent mismatch"));
		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
		    inoreflst, if_deps);
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
		KASSERT(dotaddref->ja_parent == ip->i_number,
		    ("softdep_revert_mkdir: dot addref parent mismatch"));
		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to
 * correct nlinkdelta after a failed rmdir.
 */
void
softdep_revert_rmdir(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* directory whose removal failed */
{

	ACQUIRE_LOCK(&lk);
	/* Re-derive nlinkdelta for both inodes from their link counts. */
	(void) inodedep_lookup_ip(ip);
	(void) inodedep_lookup_ip(dp);
	FREE_LOCK(&lk);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a filesystem
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated.  When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs filesystem maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.
If a disk write is interrupted (e.g., by power failure), one may 4657 * be written and the other not. (2) Some of the counts are located in the 4658 * superblock rather than the cylinder group block. So, we focus our soft 4659 * updates implementation on protecting the bitmaps. When mounting a 4660 * filesystem, we recompute the auxiliary counts from the bitmaps. 4661 */ 4662 4663/* 4664 * Called just after updating the cylinder group block to allocate an inode. 4665 */ 4666void 4667softdep_setup_inomapdep(bp, ip, newinum, mode) 4668 struct buf *bp; /* buffer for cylgroup block with inode map */ 4669 struct inode *ip; /* inode related to allocation */ 4670 ino_t newinum; /* new inode number being allocated */ 4671 int mode; 4672{ 4673 struct inodedep *inodedep; 4674 struct bmsafemap *bmsafemap; 4675 struct jaddref *jaddref; 4676 struct mount *mp; 4677 struct fs *fs; 4678 4679 mp = UFSTOVFS(ip->i_ump); 4680 fs = ip->i_ump->um_fs; 4681 jaddref = NULL; 4682 4683 /* 4684 * Allocate the journal reference add structure so that the bitmap 4685 * can be dependent on it. 4686 */ 4687 if (MOUNTEDSUJ(mp)) { 4688 jaddref = newjaddref(ip, newinum, 0, 0, mode); 4689 jaddref->ja_state |= NEWBLOCK; 4690 } 4691 4692 /* 4693 * Create a dependency for the newly allocated inode. 4694 * Panic if it already exists as something is seriously wrong. 4695 * Otherwise add it to the dependency list for the buffer holding 4696 * the cylinder group map from which it was allocated. 
4697 */ 4698 ACQUIRE_LOCK(&lk); 4699 if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep))) 4700 panic("softdep_setup_inomapdep: dependency %p for new" 4701 "inode already exists", inodedep); 4702 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum)); 4703 if (jaddref) { 4704 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); 4705 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4706 if_deps); 4707 } else { 4708 inodedep->id_state |= ONDEPLIST; 4709 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 4710 } 4711 inodedep->id_bmsafemap = bmsafemap; 4712 inodedep->id_state &= ~DEPCOMPLETE; 4713 FREE_LOCK(&lk); 4714} 4715 4716/* 4717 * Called just after updating the cylinder group block to 4718 * allocate block or fragment. 4719 */ 4720void 4721softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 4722 struct buf *bp; /* buffer for cylgroup block with block map */ 4723 struct mount *mp; /* filesystem doing allocation */ 4724 ufs2_daddr_t newblkno; /* number of newly allocated block */ 4725 int frags; /* Number of fragments. */ 4726 int oldfrags; /* Previous number of fragments for extend. */ 4727{ 4728 struct newblk *newblk; 4729 struct bmsafemap *bmsafemap; 4730 struct jnewblk *jnewblk; 4731 struct fs *fs; 4732 4733 fs = VFSTOUFS(mp)->um_fs; 4734 jnewblk = NULL; 4735 /* 4736 * Create a dependency for the newly allocated block. 4737 * Add it to the dependency list for the buffer holding 4738 * the cylinder group map from which it was allocated. 
4739 */ 4740 if (MOUNTEDSUJ(mp)) { 4741 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4742 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4743 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4744 jnewblk->jn_state = ATTACHED; 4745 jnewblk->jn_blkno = newblkno; 4746 jnewblk->jn_frags = frags; 4747 jnewblk->jn_oldfrags = oldfrags; 4748#ifdef SUJ_DEBUG 4749 { 4750 struct cg *cgp; 4751 uint8_t *blksfree; 4752 long bno; 4753 int i; 4754 4755 cgp = (struct cg *)bp->b_data; 4756 blksfree = cg_blksfree(cgp); 4757 bno = dtogd(fs, jnewblk->jn_blkno); 4758 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4759 i++) { 4760 if (isset(blksfree, bno + i)) 4761 panic("softdep_setup_blkmapdep: " 4762 "free fragment %d from %d-%d " 4763 "state 0x%X dep %p", i, 4764 jnewblk->jn_oldfrags, 4765 jnewblk->jn_frags, 4766 jnewblk->jn_state, 4767 jnewblk->jn_dep); 4768 } 4769 } 4770#endif 4771 } 4772 ACQUIRE_LOCK(&lk); 4773 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4774 panic("softdep_setup_blkmapdep: found block"); 4775 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4776 dtog(fs, newblkno)); 4777 if (jnewblk) { 4778 jnewblk->jn_dep = (struct worklist *)newblk; 4779 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4780 } else { 4781 newblk->nb_state |= ONDEPLIST; 4782 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4783 } 4784 newblk->nb_bmsafemap = bmsafemap; 4785 newblk->nb_jnewblk = jnewblk; 4786 FREE_LOCK(&lk); 4787} 4788 4789#define BMSAFEMAP_HASH(fs, cg) \ 4790 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4791 4792static int 4793bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4794 struct bmsafemap_hashhead *bmsafemaphd; 4795 struct mount *mp; 4796 int cg; 4797 struct bmsafemap **bmsafemapp; 4798{ 4799 struct bmsafemap *bmsafemap; 4800 4801 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4802 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4803 break; 4804 if 
(bmsafemap) { 4805 *bmsafemapp = bmsafemap; 4806 return (1); 4807 } 4808 *bmsafemapp = NULL; 4809 4810 return (0); 4811} 4812 4813/* 4814 * Find the bmsafemap associated with a cylinder group buffer. 4815 * If none exists, create one. The buffer must be locked when 4816 * this routine is called and this routine must be called with 4817 * splbio interrupts blocked. 4818 */ 4819static struct bmsafemap * 4820bmsafemap_lookup(mp, bp, cg) 4821 struct mount *mp; 4822 struct buf *bp; 4823 int cg; 4824{ 4825 struct bmsafemap_hashhead *bmsafemaphd; 4826 struct bmsafemap *bmsafemap, *collision; 4827 struct worklist *wk; 4828 struct fs *fs; 4829 4830 mtx_assert(&lk, MA_OWNED); 4831 if (bp) 4832 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4833 if (wk->wk_type == D_BMSAFEMAP) 4834 return (WK_BMSAFEMAP(wk)); 4835 fs = VFSTOUFS(mp)->um_fs; 4836 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4837 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4838 return (bmsafemap); 4839 FREE_LOCK(&lk); 4840 bmsafemap = malloc(sizeof(struct bmsafemap), 4841 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4842 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4843 bmsafemap->sm_buf = bp; 4844 LIST_INIT(&bmsafemap->sm_inodedephd); 4845 LIST_INIT(&bmsafemap->sm_inodedepwr); 4846 LIST_INIT(&bmsafemap->sm_newblkhd); 4847 LIST_INIT(&bmsafemap->sm_newblkwr); 4848 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4849 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4850 LIST_INIT(&bmsafemap->sm_freehd); 4851 LIST_INIT(&bmsafemap->sm_freewr); 4852 ACQUIRE_LOCK(&lk); 4853 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4854 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4855 return (collision); 4856 } 4857 bmsafemap->sm_cg = cg; 4858 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4859 LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next); 4860 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4861 return (bmsafemap); 4862} 4863 4864/* 4865 * Direct block allocation dependencies. 
4866 * 4867 * When a new block is allocated, the corresponding disk locations must be 4868 * initialized (with zeros or new data) before the on-disk inode points to 4869 * them. Also, the freemap from which the block was allocated must be 4870 * updated (on disk) before the inode's pointer. These two dependencies are 4871 * independent of each other and are needed for all file blocks and indirect 4872 * blocks that are pointed to directly by the inode. Just before the 4873 * "in-core" version of the inode is updated with a newly allocated block 4874 * number, a procedure (below) is called to setup allocation dependency 4875 * structures. These structures are removed when the corresponding 4876 * dependencies are satisfied or when the block allocation becomes obsolete 4877 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4878 * fragment that gets upgraded). All of these cases are handled in 4879 * procedures described later. 4880 * 4881 * When a file extension causes a fragment to be upgraded, either to a larger 4882 * fragment or to a full block, the on-disk location may change (if the 4883 * previous fragment could not simply be extended). In this case, the old 4884 * fragment must be de-allocated, but not until after the inode's pointer has 4885 * been updated. In most cases, this is handled by later procedures, which 4886 * will construct a "freefrag" structure to be added to the workitem queue 4887 * when the inode update is complete (or obsolete). The main exception to 4888 * this is when an allocation occurs while a pending allocation dependency 4889 * (for the same block pointer) remains. This case is handled in the main 4890 * allocation dependency setup procedure by immediately freeing the 4891 * unreferenced fragments. 
4892 */ 4893void 4894softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 4895 struct inode *ip; /* inode to which block is being added */ 4896 ufs_lbn_t off; /* block pointer within inode */ 4897 ufs2_daddr_t newblkno; /* disk block number being added */ 4898 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ 4899 long newsize; /* size of new block */ 4900 long oldsize; /* size of new block */ 4901 struct buf *bp; /* bp for allocated block */ 4902{ 4903 struct allocdirect *adp, *oldadp; 4904 struct allocdirectlst *adphead; 4905 struct freefrag *freefrag; 4906 struct inodedep *inodedep; 4907 struct pagedep *pagedep; 4908 struct jnewblk *jnewblk; 4909 struct newblk *newblk; 4910 struct mount *mp; 4911 ufs_lbn_t lbn; 4912 4913 lbn = bp->b_lblkno; 4914 mp = UFSTOVFS(ip->i_ump); 4915 if (oldblkno && oldblkno != newblkno) 4916 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 4917 else 4918 freefrag = NULL; 4919 4920 ACQUIRE_LOCK(&lk); 4921 if (off >= NDADDR) { 4922 if (lbn > 0) 4923 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", 4924 lbn, off); 4925 /* allocating an indirect block */ 4926 if (oldblkno != 0) 4927 panic("softdep_setup_allocdirect: non-zero indir"); 4928 } else { 4929 if (off != lbn) 4930 panic("softdep_setup_allocdirect: lbn %jd != off %jd", 4931 lbn, off); 4932 /* 4933 * Allocating a direct block. 4934 * 4935 * If we are allocating a directory block, then we must 4936 * allocate an associated pagedep to track additions and 4937 * deletions. 4938 */ 4939 if ((ip->i_mode & IFMT) == IFDIR) 4940 pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC, 4941 &pagedep); 4942 } 4943 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 4944 panic("softdep_setup_allocdirect: lost block"); 4945 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4946 ("softdep_setup_allocdirect: newblk already initialized")); 4947 /* 4948 * Convert the newblk to an allocdirect. 
4949 */ 4950 newblk->nb_list.wk_type = D_ALLOCDIRECT; 4951 adp = (struct allocdirect *)newblk; 4952 newblk->nb_freefrag = freefrag; 4953 adp->ad_offset = off; 4954 adp->ad_oldblkno = oldblkno; 4955 adp->ad_newsize = newsize; 4956 adp->ad_oldsize = oldsize; 4957 4958 /* 4959 * Finish initializing the journal. 4960 */ 4961 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4962 jnewblk->jn_ino = ip->i_number; 4963 jnewblk->jn_lbn = lbn; 4964 add_to_journal(&jnewblk->jn_list); 4965 } 4966 if (freefrag && freefrag->ff_jdep != NULL && 4967 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 4968 add_to_journal(freefrag->ff_jdep); 4969 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 4970 adp->ad_inodedep = inodedep; 4971 4972 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 4973 /* 4974 * The list of allocdirects must be kept in sorted and ascending 4975 * order so that the rollback routines can quickly determine the 4976 * first uncommitted block (the size of the file stored on disk 4977 * ends at the end of the lowest committed fragment, or if there 4978 * are no fragments, at the end of the highest committed block). 4979 * Since files generally grow, the typical case is that the new 4980 * block is to be added at the end of the list. We speed this 4981 * special case by checking against the last allocdirect in the 4982 * list before laboriously traversing the list looking for the 4983 * insertion point. 
4984 */ 4985 adphead = &inodedep->id_newinoupdt; 4986 oldadp = TAILQ_LAST(adphead, allocdirectlst); 4987 if (oldadp == NULL || oldadp->ad_offset <= off) { 4988 /* insert at end of list */ 4989 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 4990 if (oldadp != NULL && oldadp->ad_offset == off) 4991 allocdirect_merge(adphead, adp, oldadp); 4992 FREE_LOCK(&lk); 4993 return; 4994 } 4995 TAILQ_FOREACH(oldadp, adphead, ad_next) { 4996 if (oldadp->ad_offset >= off) 4997 break; 4998 } 4999 if (oldadp == NULL) 5000 panic("softdep_setup_allocdirect: lost entry"); 5001 /* insert in middle of list */ 5002 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5003 if (oldadp->ad_offset == off) 5004 allocdirect_merge(adphead, adp, oldadp); 5005 5006 FREE_LOCK(&lk); 5007} 5008 5009/* 5010 * Merge a newer and older journal record to be stored either in a 5011 * newblock or freefrag. This handles aggregating journal records for 5012 * fragment allocation into a second record as well as replacing a 5013 * journal free with an aborted journal allocation. A segment for the 5014 * oldest record will be placed on wkhd if it has been written. If not 5015 * the segment for the newer record will suffice. 5016 */ 5017static struct worklist * 5018jnewblk_merge(new, old, wkhd) 5019 struct worklist *new; 5020 struct worklist *old; 5021 struct workhead *wkhd; 5022{ 5023 struct jnewblk *njnewblk; 5024 struct jnewblk *jnewblk; 5025 5026 /* Handle NULLs to simplify callers. */ 5027 if (new == NULL) 5028 return (old); 5029 if (old == NULL) 5030 return (new); 5031 /* Replace a jfreefrag with a jnewblk. */ 5032 if (new->wk_type == D_JFREEFRAG) { 5033 cancel_jfreefrag(WK_JFREEFRAG(new)); 5034 return (old); 5035 } 5036 /* 5037 * Handle merging of two jnewblk records that describe 5038 * different sets of fragments in the same block. 
5039 */ 5040 jnewblk = WK_JNEWBLK(old); 5041 njnewblk = WK_JNEWBLK(new); 5042 if (jnewblk->jn_blkno != njnewblk->jn_blkno) 5043 panic("jnewblk_merge: Merging disparate blocks."); 5044 /* 5045 * The record may be rolled back in the cg. 5046 */ 5047 if (jnewblk->jn_state & UNDONE) { 5048 jnewblk->jn_state &= ~UNDONE; 5049 njnewblk->jn_state |= UNDONE; 5050 njnewblk->jn_state &= ~ATTACHED; 5051 } 5052 /* 5053 * We modify the newer addref and free the older so that if neither 5054 * has been written the most up-to-date copy will be on disk. If 5055 * both have been written but rolled back we only temporarily need 5056 * one of them to fix the bits when the cg write completes. 5057 */ 5058 jnewblk->jn_state |= ATTACHED | COMPLETE; 5059 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 5060 cancel_jnewblk(jnewblk, wkhd); 5061 WORKLIST_REMOVE(&jnewblk->jn_list); 5062 free_jnewblk(jnewblk); 5063 return (new); 5064} 5065 5066/* 5067 * Replace an old allocdirect dependency with a newer one. 5068 * This routine must be called with splbio interrupts blocked. 
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct worklist *wk;
	struct freefrag *freefrag;

	freefrag = NULL;
	mtx_assert(&lk, MA_OWNED);
	/*
	 * Sanity check: the new dependency must pick up exactly where the
	 * old one left off (its on-disk "old" block/size must equal the
	 * old dependency's pending "new" block/size), and only direct
	 * blocks (< NDADDR) are handled by this routine.
	 */
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_offset >= NDADDR)
		panic("%s %jd != new %jd || old size %ld != new %ld",
		    "allocdirect_merge: old blkno",
		    (intmax_t)newadp->ad_oldblkno,
		    (intmax_t)oldadp->ad_newblkno,
		    newadp->ad_oldsize, oldadp->ad_newsize);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_newblk. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	freefrag = newadp->ad_freefrag;
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	/*
	 * If we are tracking a new directory-block allocation,
	 * move it from the old allocdirect to the new allocdirect.
	 */
	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
		WORKLIST_REMOVE(wk);
		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
			panic("allocdirect_merge: extra newdirblk");
		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
	}
	TAILQ_REMOVE(adphead, oldadp, ad_next);
	/*
	 * We need to move any journal dependencies over to the freefrag
	 * that releases this block if it exists.  Otherwise we are
	 * extending an existing block and we'll wait until that is
	 * complete to release the journal space and extend the
	 * new journal to cover this old space as well.
	 */
	if (freefrag == NULL) {
		if (oldadp->ad_newblkno != newadp->ad_newblkno)
			panic("allocdirect_merge: %jd != %jd",
			    oldadp->ad_newblkno, newadp->ad_newblkno);
		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
		    &oldadp->ad_block.nb_jnewblk->jn_list,
		    &newadp->ad_block.nb_jwork);
		oldadp->ad_block.nb_jnewblk = NULL;
		cancel_newblk(&oldadp->ad_block, NULL,
		    &newadp->ad_block.nb_jwork);
	} else {
		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
		    &freefrag->ff_list, &freefrag->ff_jwork);
		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
		    &freefrag->ff_jwork);
	}
	/* The old allocdirect (a newblk underneath) is no longer needed. */
	free_newblk(&oldadp->ad_block);
}

/*
 * Allocate a jfreefrag structure to journal a single block free.
 */
static struct jfreefrag *
newjfreefrag(freefrag, ip, blkno, size, lbn)
	struct freefrag *freefrag;	/* freefrag this journal entry covers */
	struct inode *ip;		/* inode owning the fragment */
	ufs2_daddr_t blkno;		/* first block of the fragment */
	long size;			/* fragment size in bytes */
	ufs_lbn_t lbn;			/* logical block number of fragment */
{
	struct jfreefrag *jfreefrag;
	struct fs *fs;

	fs = ip->i_fs;
	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
	    M_SOFTDEP_FLAGS);
	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
	/* Each journal record carries a jsegdep to track segment writes. */
	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
	jfreefrag->fr_ino = ip->i_number;
	jfreefrag->fr_lbn = lbn;
	jfreefrag->fr_blkno = blkno;
	jfreefrag->fr_frags = numfrags(fs, size);
	jfreefrag->fr_freefrag = freefrag;

	return (jfreefrag);
}

/*
 * Allocate a new freefrag structure.  The freefrag records a fragment
 * that must be released once the dependencies permit it.
 */
static struct freefrag *
newfreefrag(ip, blkno, size, lbn)
	struct inode *ip;		/* inode owning the fragment */
	ufs2_daddr_t blkno;		/* first block of the fragment */
	long size;			/* fragment size in bytes */
	ufs_lbn_t lbn;			/* logical block number of fragment */
{
	struct freefrag *freefrag;
	struct fs *fs;

	fs = ip->i_fs;
	/* The fragment must not cross a filesystem block boundary. */
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	freefrag = malloc(sizeof(struct freefrag),
	    M_FREEFRAG, M_SOFTDEP_FLAGS);
	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
	freefrag->ff_state = ATTACHED;
	LIST_INIT(&freefrag->ff_jwork);
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_vtype = ITOV(ip)->v_type;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;

	/*
	 * Journaled (SUJ) filesystems must record the free in the journal;
	 * otherwise the freefrag has no journal dependency and is complete.
	 */
	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
		freefrag->ff_jdep = (struct worklist *)
		    newjfreefrag(freefrag, ip, blkno, size, lbn);
	} else {
		freefrag->ff_state |= DEPCOMPLETE;
		freefrag->ff_jdep = NULL;
	}

	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;	/* fragment to release; freed on return */
{
	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
	struct workhead wkhd;

	/*
	 * It would be illegal to add new completion items to the
	 * freefrag after it was scheduled to be done so it must be
	 * safe to modify the list head here.
	 */
	LIST_INIT(&wkhd);
	ACQUIRE_LOCK(&lk);
	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
	/*
	 * If the journal has not been written we must cancel it here.
	 */
	if (freefrag->ff_jdep) {
		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
			panic("handle_workitem_freefrag: Unexpected type %d\n",
			    freefrag->ff_jdep->wk_type);
		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
	}
	/*
	 * Drop the softdep lock around the actual block free;
	 * ffs_blkfree may sleep and takes its own locks.  The pending
	 * journal work in wkhd is handed off with the free.
	 */
	FREE_LOCK(&lk);
	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
	    freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
	ACQUIRE_LOCK(&lk);
	WORKITEM_FREE(freefrag, D_FREEFRAG);
	FREE_LOCK(&lk);
}

/*
 * Set up a dependency structure for an external attributes data block.
 * This routine follows much of the structure of softdep_setup_allocdirect.
 * See the description of softdep_setup_allocdirect above for details.
 */
void
softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode whose extattr area is being extended */
	ufs_lbn_t off;		/* offset of the pointer in the ext block array */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block, if any */
	struct buf *bp;		/* buffer holding the allocated ext data block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct freefrag *freefrag;
	struct inodedep *inodedep;
	struct jnewblk *jnewblk;
	struct newblk *newblk;
	struct mount *mp;
	ufs_lbn_t lbn;

	if (off >= NXADDR)
		panic("softdep_setup_allocext: lbn %lld > NXADDR",
		    (long long)off);

	lbn = bp->b_lblkno;
	mp = UFSTOVFS(ip->i_ump);
	/* A replaced fragment must be scheduled for release. */
	if (oldblkno && oldblkno != newblkno)
		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
	else
		freefrag = NULL;

	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocext: lost block");
	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
	    ("softdep_setup_allocext: newblk already initialized"));
	/*
	 * Convert the newblk to an allocdirect.
	 */
	newblk->nb_list.wk_type = D_ALLOCDIRECT;
	adp = (struct allocdirect *)newblk;
	newblk->nb_freefrag = freefrag;
	adp->ad_offset = off;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	/* EXTDATA distinguishes ext-area allocdirects from file data. */
	adp->ad_state |= EXTDATA;

	/*
	 * Finish initializing the journal.
	 */
	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
		jnewblk->jn_ino = ip->i_number;
		jnewblk->jn_lbn = lbn;
		add_to_journal(&jnewblk->jn_list);
	}
	if (freefrag && freefrag->ff_jdep != NULL &&
	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
		add_to_journal(freefrag->ff_jdep);
	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	adp->ad_inodedep = inodedep;

	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newextupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_offset <= off) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_offset == off)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
		if (oldadp->ad_offset >= off)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocext: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_offset == off)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here. Because an indirect block contains many pointers that
 * may have dependencies, a second copy of the entire in-memory indirect
 * block is kept. The buffer cache copy is always completely up-to-date.
 * The second copy, which is used only as a source for disk writes,
 * contains only the safe pointers (i.e., those that have no remaining
 * update dependencies). The second copy is freed when all pointers
 * are safe. The cache is not allowed to replace indirect blocks with
 * pending update dependencies. If a buffer containing an indirect
 * block with dependencies is written, these routines will mark it
 * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine in conjunction with
 * softdep_sync_metadata work together to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
 * procedures are used when setting up indirect block pointer
 * dependencies. The division is necessary because of the organization
 * of the "balloc" routine and because of the distinction between file
 * pages and file metadata blocks.
 */

/*
 * Allocate a new allocindir structure.
 *
 * Called without the softdep lock; returns with lk held (the caller
 * is expected to release it, e.g. via setup_allocindir_phase2 paths).
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
	ufs_lbn_t lbn;
{
	struct newblk *newblk;
	struct allocindir *aip;
	struct freefrag *freefrag;
	struct jnewblk *jnewblk;

	/* A replaced full block must be scheduled for release. */
	if (oldblkno)
		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
	else
		freefrag = NULL;
	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
		panic("new_allocindir: lost block");
	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
	    ("newallocindir: newblk already initialized"));
	/* Convert the newblk created at allocation time to an allocindir. */
	newblk->nb_list.wk_type = D_ALLOCINDIR;
	newblk->nb_freefrag = freefrag;
	aip = (struct allocindir *)newblk;
	aip->ai_offset = ptrno;
	aip->ai_oldblkno = oldblkno;
	aip->ai_lbn = lbn;
	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
		jnewblk->jn_ino = ip->i_number;
		jnewblk->jn_lbn = lbn;
		add_to_journal(&jnewblk->jn_list);
	}
	if (freefrag && freefrag->ff_jdep != NULL &&
	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
		add_to_journal(freefrag->ff_jdep);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
5424 */ 5425void 5426softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 5427 struct inode *ip; /* inode for file being extended */ 5428 ufs_lbn_t lbn; /* allocated block number within file */ 5429 struct buf *bp; /* buffer with indirect blk referencing page */ 5430 int ptrno; /* offset of pointer in indirect block */ 5431 ufs2_daddr_t newblkno; /* disk block number being added */ 5432 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5433 struct buf *nbp; /* buffer holding allocated page */ 5434{ 5435 struct inodedep *inodedep; 5436 struct freefrag *freefrag; 5437 struct allocindir *aip; 5438 struct pagedep *pagedep; 5439 struct mount *mp; 5440 5441 if (lbn != nbp->b_lblkno) 5442 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 5443 lbn, bp->b_lblkno); 5444 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 5445 mp = UFSTOVFS(ip->i_ump); 5446 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 5447 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5448 /* 5449 * If we are allocating a directory page, then we must 5450 * allocate an associated pagedep to track additions and 5451 * deletions. 5452 */ 5453 if ((ip->i_mode & IFMT) == IFDIR) 5454 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); 5455 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5456 freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 5457 FREE_LOCK(&lk); 5458 if (freefrag) 5459 handle_workitem_freefrag(freefrag); 5460} 5461 5462/* 5463 * Called just before setting an indirect block pointer to a 5464 * newly allocated indirect block. 
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs2_daddr_t newblkno;	/* disk block number being added */
{
	struct inodedep *inodedep;
	struct allocindir *aip;
	ufs_lbn_t lbn;

	lbn = nbp->b_lblkno;
	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
	/* newallocindir() returns with the softdep lock (lk) held. */
	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
	/*
	 * A non-NULL return means a previous dependency existed at this
	 * offset, which cannot happen for a freshly allocated indirect.
	 */
	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
		panic("softdep_setup_allocindir_meta: Block already existed");
	FREE_LOCK(&lk);
}

/*
 * Mark an indirdep's pointers valid on disk and release any allocindirs
 * that were only waiting for that to happen.
 */
static void
indirdep_complete(indirdep)
	struct indirdep *indirdep;
{
	struct allocindir *aip;

	LIST_REMOVE(indirdep, ir_next);
	indirdep->ir_state |= DEPCOMPLETE;

	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
		LIST_REMOVE(aip, ai_next);
		free_newblk(&aip->ai_block);
	}
	/*
	 * If this indirdep is not attached to a buf it was simply waiting
	 * on completion to clear completehd.  free_indirdep() asserts
	 * that nothing is dangling.
	 */
	if ((indirdep->ir_state & ONWORKLIST) == 0)
		free_indirdep(indirdep);
}

/*
 * Find the indirdep tracking bp, creating one if none exists.
 * Called and returns with the softdep lock held; the lock is dropped
 * while a new structure is allocated, so the buffer's dependency list
 * is re-checked after reacquiring it (hence the retry loop).
 */
static struct indirdep *
indirdep_lookup(mp, ip, bp)
	struct mount *mp;
	struct inode *ip;
	struct buf *bp;
{
	struct indirdep *indirdep, *newindirdep;
	struct newblk *newblk;
	struct worklist *wk;
	struct fs *fs;
	ufs2_daddr_t blkno;

	mtx_assert(&lk, MA_OWNED);
	indirdep = NULL;
	newindirdep = NULL;
	fs = ip->i_fs;
	for (;;) {
		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
			if (wk->wk_type != D_INDIRDEP)
				continue;
			indirdep = WK_INDIRDEP(wk);
			break;
		}
		/* Found on the buffer worklist, no new structure to free. */
		if (indirdep != NULL && newindirdep == NULL)
			return (indirdep);
		if (indirdep != NULL && newindirdep != NULL)
			panic("indirdep_lookup: simultaneous create");
		/* None found on the buffer and a new structure is ready. */
		if (indirdep == NULL && newindirdep != NULL)
			break;
		/* None found and no new structure available. */
		FREE_LOCK(&lk);
		newindirdep = malloc(sizeof(struct indirdep),
		    M_INDIRDEP, M_SOFTDEP_FLAGS);
		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
		newindirdep->ir_state = ATTACHED;
		if (ip->i_ump->um_fstype == UFS1)
			newindirdep->ir_state |= UFS1FMT;
		TAILQ_INIT(&newindirdep->ir_trunc);
		newindirdep->ir_saveddata = NULL;
		LIST_INIT(&newindirdep->ir_deplisthd);
		LIST_INIT(&newindirdep->ir_donehd);
		LIST_INIT(&newindirdep->ir_writehd);
		LIST_INIT(&newindirdep->ir_completehd);
		/* Resolve the physical block number if still unmapped. */
		if (bp->b_blkno == bp->b_lblkno) {
			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
			    NULL, NULL);
			bp->b_blkno = blkno;
		}
		newindirdep->ir_freeblks = NULL;
		/*
		 * ir_savebp holds the "safe copy" of the indirect block
		 * that is used as the source for disk writes.
		 */
		newindirdep->ir_savebp =
		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
		newindirdep->ir_bp = bp;
		BUF_KERNPROC(newindirdep->ir_savebp);
		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
		ACQUIRE_LOCK(&lk);
	}
	indirdep = newindirdep;
	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
	/*
	 * If the block is not yet allocated we don't set DEPCOMPLETE so
	 * that we don't free dependencies until the pointers are valid.
	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
	 * than using the hash.
	 */
	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
	else
		indirdep->ir_state |= DEPCOMPLETE;
	return (indirdep);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
 */
static struct freefrag *
setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
	struct buf *bp;		/* in-memory copy of the indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct inodedep *inodedep; /* Inodedep for ip */
	struct allocindir *aip;	/* allocindir allocated by the above routines */
	ufs_lbn_t lbn;		/* Logical block number for this block. */
{
	struct fs *fs;
	struct indirdep *indirdep;
	struct allocindir *oldaip;
	struct freefrag *freefrag;
	struct mount *mp;

	mtx_assert(&lk, MA_OWNED);
	mp = UFSTOVFS(ip->i_ump);
	fs = ip->i_fs;
	/* Indirect blocks always have negative logical block numbers. */
	if (bp->b_lblkno >= 0)
		panic("setup_allocindir_phase2: not indir blk");
	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
	indirdep = indirdep_lookup(mp, ip, bp);
	KASSERT(indirdep->ir_savebp != NULL,
	    ("setup_allocindir_phase2 NULL ir_savebp"));
	aip->ai_indirdep = indirdep;
	/*
	 * Check for an unwritten dependency for this indirect offset.  If
	 * there is, merge the old dependency into the new one.  This happens
	 * as a result of reallocblk only.
	 */
	freefrag = NULL;
	if (aip->ai_oldblkno != 0) {
		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
			if (oldaip->ai_offset == aip->ai_offset) {
				freefrag = allocindir_merge(aip, oldaip);
				goto done;
			}
		}
		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
			if (oldaip->ai_offset == aip->ai_offset) {
				freefrag = allocindir_merge(aip, oldaip);
				goto done;
			}
		}
	}
done:
	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
	/* Non-NULL when a merged dependency left a fragment to free. */
	return (freefrag);
}

/*
 * Merge two allocindirs which refer to the same block.  Move newblock
 * dependencies and setup the freefrags appropriately.
 */
static struct freefrag *
allocindir_merge(aip, oldaip)
	struct allocindir *aip;		/* newer allocindir, kept */
	struct allocindir *oldaip;	/* older allocindir, freed here */
{
	struct freefrag *freefrag;
	struct worklist *wk;

	/* The new dependency must be replacing the old one's block. */
	if (oldaip->ai_newblkno != aip->ai_oldblkno)
		panic("allocindir_merge: blkno");
	aip->ai_oldblkno = oldaip->ai_oldblkno;
	freefrag = aip->ai_freefrag;
	aip->ai_freefrag = oldaip->ai_freefrag;
	oldaip->ai_freefrag = NULL;
	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
	/*
	 * If we are tracking a new directory-block allocation,
	 * move it from the old allocindir to the new allocindir.
	 */
	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
		WORKLIST_REMOVE(wk);
		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
			panic("allocindir_merge: extra newdirblk");
		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
	}
	/*
	 * We can skip journaling for this freefrag and just complete
	 * any pending journal work for the allocindir that is being
	 * removed after the freefrag completes.
	 */
	if (freefrag->ff_jdep)
		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
	LIST_REMOVE(oldaip, ai_next);
	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
	    &freefrag->ff_list, &freefrag->ff_jwork);
	free_newblk(&oldaip->ai_block);

	return (freefrag);
}

/*
 * Schedule the release of direct block number i of ip, clearing the
 * inode's pointer to it.
 */
static inline void
setup_freedirect(freeblks, ip, i, needj)
	struct freeblks *freeblks;
	struct inode *ip;
	int i;		/* index into i_db[] */
	int needj;	/* non-zero if a journal record is required */
{
	ufs2_daddr_t blkno;
	int frags;

	blkno = DIP(ip, i_db[i]);
	if (blkno == 0)
		return;
	DIP_SET(ip, i_db[i], 0);
	frags = sblksize(ip->i_fs, ip->i_size, i);
	frags = numfrags(ip->i_fs, frags);
	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
}

/*
 * Schedule the release of external-attribute block i, clearing the
 * inode's pointer to it.  UFS2 only (di_extb exists only in ufs2 dinode).
 */
static inline void
setup_freeext(freeblks, ip, i, needj)
	struct freeblks *freeblks;
	struct inode *ip;
	int i;		/* index into di_extb[] */
	int needj;	/* non-zero if a journal record is required */
{
	ufs2_daddr_t blkno;
	int frags;

	blkno = ip->i_din2->di_extb[i];
	if (blkno == 0)
		return;
	ip->i_din2->di_extb[i] = 0;
	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
	frags = numfrags(ip->i_fs, frags);
	/* Ext blocks use negative lbns: -1 - i. */
	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
}

/*
 * Schedule the release of an entire indirect tree rooted at i_ib[i],
 * clearing the inode's pointer to it.
 */
static inline void
setup_freeindir(freeblks, ip, i, lbn, needj)
	struct freeblks *freeblks;
	struct inode *ip;
	int i;		/* index into i_ib[] */
	ufs_lbn_t lbn;	/* (negative) lbn of the indirect block */
	int needj;	/* non-zero if a journal record is required */
{
	ufs2_daddr_t blkno;

	blkno = DIP(ip, i_ib[i]);
	if (blkno == 0)
		return;
	DIP_SET(ip, i_ib[i], 0);
	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
	    0, needj);
}

/*
 * Allocate and initialize a freeblks structure recording the identity
 * of ip, to track all the blocks released by a truncation.
 */
static inline struct freeblks *
newfreeblks(mp, ip)
	struct mount *mp;
	struct inode *ip;
{
	struct freeblks *freeblks;

	freeblks = malloc(sizeof(struct freeblks),
	    M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
	LIST_INIT(&freeblks->fb_jblkdephd);
	LIST_INIT(&freeblks->fb_jwork);
	freeblks->fb_ref = 0;
	freeblks->fb_cgwait = 0;
	freeblks->fb_state = ATTACHED;
	freeblks->fb_uid = ip->i_uid;
	freeblks->fb_inum = ip->i_number;
	freeblks->fb_vtype = ITOV(ip)->v_type;
	freeblks->fb_modrev = DIP(ip, i_modrev);
	freeblks->fb_devvp = ip->i_devvp;
	freeblks->fb_chkcnt = 0;
	freeblks->fb_len = 0;

	return (freeblks);
}

/*
 * Cancel every allocindir on the indirdep whose pointer offset lies
 * beyond "off", the last offset the truncation keeps.
 */
static void
trunc_indirdep(indirdep, freeblks, bp, off)
	struct indirdep *indirdep;
	struct freeblks *freeblks;
	struct buf *bp;
	int off;	/* last pointer offset being kept */
{
	struct allocindir *aip, *aipn;

	/*
	 * The first set of allocindirs won't be in savedbp.
	 */
	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
		if (aip->ai_offset > off)
			cancel_allocindir(aip, bp, freeblks, 1);
	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
		if (aip->ai_offset > off)
			cancel_allocindir(aip, bp, freeblks, 1);
	/*
	 * These will exist in savedbp.
	 */
	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
		if (aip->ai_offset > off)
			cancel_allocindir(aip, NULL, freeblks, 0);
	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
		if (aip->ai_offset > off)
			cancel_allocindir(aip, NULL, freeblks, 0);
}

/*
 * Follow the chain of indirects down to lastlbn creating a freework
 * structure for each.  This will be used to start indir_trunc() at
 * the right offset and create the journal records for the partial
 * truncation.  A second step will handle the truncated dependencies.
 */
static int
setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
	struct freeblks *freeblks;	/* freeblks tracking this truncation */
	struct inode *ip;		/* inode being truncated */
	ufs_lbn_t lbn;			/* (negative) lbn of this indirect */
	ufs_lbn_t lastlbn;		/* last logical block being kept */
	ufs2_daddr_t blkno;		/* disk address of this indirect */
{
	struct indirdep *indirdep;
	struct indirdep *indirn;
	struct freework *freework;
	struct newblk *newblk;
	struct mount *mp;
	struct buf *bp;
	uint8_t *start;
	uint8_t *end;
	ufs_lbn_t lbnadd;
	int level;
	int error;
	int off;


	freework = NULL;
	if (blkno == 0)
		return (0);
	mp = freeblks->fb_list.wk_mp;
	/* Read the indirect block if it is not already cached. */
	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		vfs_busy_pages(bp, 0);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
		curthread->td_ru.ru_inblock++;
		error = bufwait(bp);
		if (error) {
			brelse(bp);
			return (error);
		}
	}
	level = lbn_level(lbn);
	lbnadd = lbn_offset(ip->i_fs, level);
	/*
	 * Compute the offset of the last block we want to keep.  Store
	 * in the freework the first block we want to completely free.
	 */
	off = (lastlbn - -(lbn + level)) / lbnadd;
	/* If the whole indirect survives there is no work at this level. */
	if (off + 1 == NINDIR(ip->i_fs))
		goto nowork;
	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
	    0);
	/*
	 * Link the freework into the indirdep.  This will prevent any new
	 * allocations from proceeding until we are finished with the
	 * truncate and the block is written.
	 */
	ACQUIRE_LOCK(&lk);
	indirdep = indirdep_lookup(mp, ip, bp);
	if (indirdep->ir_freeblks)
		panic("setup_trunc_indir: indirdep already truncated.");
	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
	freework->fw_indir = indirdep;
	/*
	 * Cancel any allocindirs that will not make it to disk.
	 * We have to do this for all copies of the indirdep that
	 * live on this newblk.
	 */
	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
			trunc_indirdep(indirn, freeblks, bp, off);
	} else
		trunc_indirdep(indirdep, freeblks, bp, off);
	FREE_LOCK(&lk);
	/*
	 * Creation is protected by the buf lock.  The saveddata is only
	 * needed if a full truncation follows a partial truncation but it
	 * is difficult to allocate in that case so we fetch it anyway.
	 */
	if (indirdep->ir_saveddata == NULL)
		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
		    M_SOFTDEP_FLAGS);
nowork:
	/* Fetch the blkno of the child and the zero start offset. */
	if (ip->i_ump->um_fstype == UFS1) {
		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
	} else {
		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
	}
	if (freework) {
		/* Zero the truncated pointers. */
		end = bp->b_data + bp->b_bcount;
		bzero(start, end - start);
		bdwrite(bp);
	} else
		bqrelse(bp);
	if (level == 0)
		return (0);
	/* Recurse one level down toward lastlbn. */
	lbn++; /* adjust level */
	lbn -= (off * lbnadd);
	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
}

/*
 * Complete the partial truncation of an indirect block setup by
 * setup_trunc_indir().
This zeros the truncated pointers in the saved
 * copy and writes them to disk before the freeblks is allowed to complete.
 */
static void
complete_trunc_indir(freework)
	struct freework *freework;	/* freework created by setup_trunc_indir */
{
	struct freework *fwn;
	struct indirdep *indirdep;
	struct buf *bp;
	uintptr_t start;
	int count;

	indirdep = freework->fw_indir;
	/*
	 * Try to lock the in-core indirect buffer without removing it
	 * from the dirty queues; retry if the lock must be slept on.
	 */
	for (;;) {
		bp = indirdep->ir_bp;
		/* See if the block was discarded. */
		if (bp == NULL)
			break;
		/* Inline part of getdirtybuf().  We dont want bremfree. */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
			break;
		if (BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0)
			BUF_UNLOCK(bp);
		ACQUIRE_LOCK(&lk);
	}
	mtx_assert(&lk, MA_OWNED);
	freework->fw_state |= DEPCOMPLETE;
	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
	/*
	 * Zero the pointers in the saved copy.
	 */
	if (indirdep->ir_state & UFS1FMT)
		start = sizeof(ufs1_daddr_t);
	else
		start = sizeof(ufs2_daddr_t);
	start *= freework->fw_start;
	count = indirdep->ir_savebp->b_bcount - start;
	start += (uintptr_t)indirdep->ir_savebp->b_data;
	bzero((char *)start, count);
	/*
	 * We need to start the next truncation in the list if it has not
	 * been started yet.
	 */
	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
	if (fwn != NULL) {
		if (fwn->fw_freeblks == indirdep->ir_freeblks)
			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
		if ((fwn->fw_state & ONWORKLIST) == 0)
			freework_enqueue(fwn);
	}
	/*
	 * If bp is NULL the block was fully truncated, restore
	 * the saved block list otherwise free it if it is no
	 * longer needed.
	 */
	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
		if (bp == NULL)
			bcopy(indirdep->ir_saveddata,
			    indirdep->ir_savebp->b_data,
			    indirdep->ir_savebp->b_bcount);
		free(indirdep->ir_saveddata, M_INDIRDEP);
		indirdep->ir_saveddata = NULL;
	}
	/*
	 * When bp is NULL there is a full truncation pending.  We
	 * must wait for this full truncation to be journaled before
	 * we can release this freework because the disk pointers will
	 * never be written as zero.
	 */
	if (bp == NULL) {
		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
			handle_written_freework(freework);
		else
			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
			    &freework->fw_list);
	} else {
		/* Complete when the real copy is written. */
		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
		BUF_UNLOCK(bp);
	}
}

/*
 * Calculate the number of blocks we are going to release where datablocks
 * is the current total and length is the new file size.  The result is
 * in DEV_BSIZE units (fsbtodb), like di_blocks.
 */
ufs2_daddr_t
blkcount(fs, datablocks, length)
	struct fs *fs;
	ufs2_daddr_t datablocks;	/* current block count of the inode */
	off_t length;			/* new file size */
{
	off_t totblks, numblks;

	totblks = 0;
	numblks = howmany(length, fs->fs_bsize);
	/* Short files are counted in fragments; no indirects involved. */
	if (numblks <= NDADDR) {
		totblks = howmany(length, fs->fs_fsize);
		goto out;
	}
	totblks = blkstofrags(fs, numblks);
	numblks -= NDADDR;
	/*
	 * Count all single, then double, then triple indirects required.
	 * Subtracting one indirects worth of blocks for each pass
	 * acknowledges one of each pointed to by the inode.
	 */
	for (;;) {
		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
		numblks -= NINDIR(fs);
		if (numblks <= 0)
			break;
		numblks = howmany(numblks, NINDIR(fs));
	}
out:
	totblks = fsbtodb(fs, totblks);
	/*
	 * Handle sparse files.  We can't reclaim more blocks than the inode
	 * references.  We will correct it later in handle_complete_freeblks()
	 * when we know the real count.
	 */
	if (totblks > datablocks)
		return (0);
	return (datablocks - totblks);
}

/*
 * Handle freeblocks for journaled softupdate filesystems.
 *
 * Contrary to normal softupdates, we must preserve the block pointers in
 * indirects until their subordinates are free. This is to avoid journaling
 * every block that is freed which may consume more space than the journal
 * itself. The recovery program will see the free block journals at the
 * base of the truncated area and traverse them to reclaim space. The
 * pointers in the inode may be cleared immediately after the journal
 * records are written because each direct and indirect pointer in the
 * inode is recorded in a journal. This permits full truncation to proceed
 * asynchronously. The write order is journal -> inode -> cgs -> indirects.
 *
 * The algorithm is as follows:
 * 1) Traverse the in-memory state and create journal entries to release
 *    the relevant blocks and full indirect trees.
 * 2) Traverse the indirect block chain adding partial truncation freework
 *    records to indirects in the path to lastlbn.  The freework will
 *    prevent new allocation dependencies from being satisfied in this
 *    indirect until the truncation completes.
 * 3) Read and lock the inode block, performing an update with the new size
 *    and pointers.  This prevents truncated data from becoming valid on
 *    disk through step 4.
 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
 *    eliminate journal work for those records that do not require it.
 * 5) Schedule the journal records to be written followed by the inode block.
 * 6) Allocate any necessary frags for the end of file.
 * 7) Zero any partially truncated blocks.
 *
 * From this truncation proceeds asynchronously using the freework and
 * indir_trunc machinery.  The file will not be extended again into a
 * partially truncated indirect block until all work is completed but
 * the normal dependency mechanism ensures that it is rolled back/forward
 * as appropriate.  Further truncation may occur without delay and is
 * serialized in indir_trunc().
 */
void
softdep_journal_freeblocks(ip, cred, length, flags)
	struct inode *ip;	/* The inode whose length is to be reduced */
	struct ucred *cred;
	off_t length;		/* The new length for the file */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct freeblks *freeblks, *fbn;
	struct inodedep *inodedep;
	struct jblkdep *jblkdep;
	struct allocdirect *adp, *adpn;
	struct fs *fs;
	struct buf *bp;
	struct vnode *vp;
	struct mount *mp;
	ufs2_daddr_t extblocks, datablocks;
	ufs_lbn_t tmpval, lbn, lastlbn;
	int frags;
	int lastoff, iboff;
	int allocblock;
	int error, i;
	int needj;

	fs = ip->i_fs;
	mp = UFSTOVFS(ip->i_ump);
	vp = ITOV(ip);
	needj = 1;		/* Journal the frees unless proven unneeded. */
	iboff = -1;		/* Index of last allocdirect to keep; -1 = none. */
	allocblock = 0;
	extblocks = 0;
	datablocks = 0;
	frags = 0;
	freeblks = newfreeblks(mp, ip);
	ACQUIRE_LOCK(&lk);
	/*
	 * If we're truncating a removed file that will never be written
	 * we don't need to journal the block frees.  The canceled journals
	 * for the allocations will suffice.
	 */
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
	    length == 0)
		needj = 0;
	FREE_LOCK(&lk);
	/*
	 * Calculate the lbn that we are truncating to.  This results in -1
	 * if we're truncating the 0 bytes.  So it is the last lbn we want
	 * to keep, not the first lbn we want to truncate.
	 */
	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastoff = blkoff(fs, length);
	/*
	 * Compute frags we are keeping in lastlbn.  0 means all.
	 */
	if (lastlbn >= 0 && lastlbn < NDADDR) {
		frags = fragroundup(fs, lastoff);
		/* adp offset of last valid allocdirect. */
		iboff = lastlbn;
	} else if (lastlbn > 0)
		iboff = NDADDR;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	/*
	 * Handle normal data blocks and indirects.  This section saves
	 * values used after the inode update to complete frag and indirect
	 * truncation.
	 */
	if ((flags & IO_NORMAL) != 0) {
		/*
		 * Handle truncation of whole direct and indirect blocks.
		 */
		for (i = iboff + 1; i < NDADDR; i++)
			setup_freedirect(freeblks, ip, i, needj);
		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
			/* Release a whole indirect tree. */
			if (lbn > lastlbn) {
				setup_freeindir(freeblks, ip, i, -lbn -i,
				    needj);
				continue;
			}
			iboff = i + NDADDR;
			/*
			 * Traverse partially truncated indirect tree.
			 */
			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
				setup_trunc_indir(freeblks, ip, -lbn - i,
				    lastlbn, DIP(ip, i_ib[i]));
		}
		/*
		 * Handle partial truncation to a frag boundary.
		 */
		if (frags) {
			ufs2_daddr_t blkno;
			long oldfrags;

			oldfrags = blksize(fs, ip, lastlbn);
			blkno = DIP(ip, i_db[lastlbn]);
			if (blkno && oldfrags != frags) {
				/* Free the tail frags of the last block. */
				oldfrags -= frags;
				oldfrags = numfrags(ip->i_fs, oldfrags);
				blkno += numfrags(ip->i_fs, frags);
				newfreework(ip->i_ump, freeblks, NULL, lastlbn,
				    blkno, oldfrags, 0, needj);
			} else if (blkno == 0)
				allocblock = 1;
		}
		/*
		 * Add a journal record for partial truncate if we are
		 * handling indirect blocks.  Non-indirects need no extra
		 * journaling.
		 */
		if (length != 0 && lastlbn >= NDADDR) {
			ip->i_flag |= IN_TRUNCATED;
			newjtrunc(freeblks, length, 0);
		}
		ip->i_size = length;
		DIP_SET(ip, i_size, ip->i_size);
		datablocks = DIP(ip, i_blocks) - extblocks;
		if (length != 0)
			datablocks = blkcount(ip->i_fs, datablocks, length);
		freeblks->fb_len = length;
	}
	if ((flags & IO_EXT) != 0) {
		for (i = 0; i < NXADDR; i++)
			setup_freeext(freeblks, ip, i, needj);
		ip->i_din2->di_extsize = 0;
		datablocks += extblocks;
	}
#ifdef QUOTA
	/* Reference the quotas in case the block count is wrong in the end. */
	quotaref(vp, freeblks->fb_quota);
	(void) chkdq(ip, -datablocks, NOCRED, 0);
#endif
	freeblks->fb_chkcnt = -datablocks;
	UFS_LOCK(ip->i_ump);
	fs->fs_pendingblocks += datablocks;
	UFS_UNLOCK(ip->i_ump);
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
	/*
	 * Handle truncation of incomplete alloc direct dependencies.  We
	 * hold the inode block locked to prevent incomplete dependencies
	 * from reaching the disk while we are eliminating those that
	 * have been truncated.  This is a partially inlined ffs_update().
	 */
	ufs_itimes(vp);
	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, cred, &bp);
	if (error) {
		brelse(bp);
		softdep_error("softdep_journal_freeblocks", error);
		return;
	}
	if (bp->b_bufsize == fs->fs_bsize)
		bp->b_flags |= B_CLUSTEROK;
	softdep_update_inodeblock(ip, bp, 0);
	if (ip->i_ump->um_fstype == UFS1)
		*((struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
	else
		*((struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & IOSTARTED) != 0)
		/*
		 * NOTE(review): message misnames this function (copied
		 * from softdep_setup_freeblocks); should say
		 * softdep_journal_freeblocks.
		 */
		panic("softdep_setup_freeblocks: inode busy");
	/*
	 * Add the freeblks structure to the list of operations that
	 * must await the zero'ed inode being written to disk.  If we
	 * still have a bitmap dependency (needj), then the inode
	 * has never been written to disk, so we can process the
	 * freeblks below once we have deleted the dependencies.
	 */
	if (needj)
		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
	else
		freeblks->fb_state |= COMPLETE;
	if ((flags & IO_NORMAL) != 0) {
		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
			if (adp->ad_offset > iboff)
				cancel_allocdirect(&inodedep->id_inoupdt, adp,
				    freeblks);
			/*
			 * Truncate the allocdirect.  We could eliminate
			 * or modify journal records as well.
			 */
			else if (adp->ad_offset == iboff && frags)
				adp->ad_newsize = frags;
		}
	}
	if ((flags & IO_EXT) != 0)
		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
			cancel_allocdirect(&inodedep->id_extupdt, adp,
			    freeblks);
	/*
	 * Add journal work.
	 */
	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
		add_to_journal(&jblkdep->jb_list);
	FREE_LOCK(&lk);
	bdwrite(bp);
	/*
	 * Truncate dependency structures beyond length.
	 */
	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
	/*
	 * This is only set when we need to allocate a fragment because
	 * none existed at the end of a frag-sized file.  It handles only
	 * allocating a new, zero filled block.
	 */
	if (allocblock) {
		ip->i_size = length - lastoff;
		DIP_SET(ip, i_size, ip->i_size);
		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
		if (error != 0) {
			/* NOTE(review): tag misspells the function name. */
			softdep_error("softdep_journal_freeblks", error);
			return;
		}
		ip->i_size = length;
		DIP_SET(ip, i_size, length);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		allocbuf(bp, frags);
		ffs_update(vp, MNT_NOWAIT);
		bawrite(bp);
	} else if (lastoff != 0 && vp->v_type != VDIR) {
		int size;

		/*
		 * Zero the end of a truncated frag or block.
		 */
		size = sblksize(fs, length, lastlbn);
		error = bread(vp, lastlbn, size, cred, &bp);
		if (error) {
			/* NOTE(review): tag misspells the function name. */
			softdep_error("softdep_journal_freeblks", error);
			return;
		}
		bzero((char *)bp->b_data + lastoff, size - lastoff);
		bawrite(bp);

	}
	ACQUIRE_LOCK(&lk);
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
	/*
	 * We zero earlier truncations so they don't erroneously
	 * update i_blocks.
	 */
	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
			fbn->fb_len = 0;
	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
	    LIST_EMPTY(&freeblks->fb_jblkdephd))
		freeblks->fb_state |= INPROGRESS;
	else
		freeblks = NULL;
	FREE_LOCK(&lk);
	if (freeblks)
		handle_workitem_freeblocks(freeblks, 0);
	trunc_pages(ip, length, extblocks, flags);

}

/*
 * Flush a JOP_SYNC to the journal.  Only does work when a prior journaled
 * partial truncation marked the inode IN_TRUNCATED; the flag is cleared
 * here.  Sleeps until the jfsync record reaches the journal.
 */
void
softdep_journal_fsync(ip)
	struct inode *ip;
{
	struct jfsync *jfsync;

	if ((ip->i_flag & IN_TRUNCATED) == 0)
		return;
	ip->i_flag &= ~IN_TRUNCATED;
	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
	jfsync->jfs_size = ip->i_size;
	jfsync->jfs_ino = ip->i_number;
	ACQUIRE_LOCK(&lk);
	add_to_journal(&jfsync->jfs_list);
	jwait(&jfsync->jfs_list, MNT_WAIT);
	FREE_LOCK(&lk);
}

/*
 * Block de-allocation dependencies.
 *
 * When blocks are de-allocated, the on-disk pointers must be nullified before
 * the blocks are made available for use by other files.  (The true
 * requirement is that old pointers must be nullified before new on-disk
 * pointers are set.  We chose this slightly more stringent requirement to
 * reduce complexity.)  Our implementation handles this dependency by updating
 * the inode (or indirect block) appropriately but delaying the actual block
 * de-allocation (i.e., freemap and free space count manipulation) until
 * after the updated versions reach stable storage.  After the disk is
 * updated, the blocks can be safely de-allocated whenever it is convenient.
 * This implementation handles only the common case of reducing a file's
 * length to zero.
 * Other cases are handled by the conventional synchronous
 * write approach.
 *
 * The ffs implementation with which we worked double-checks
 * the state of the block pointers and file size as it reduces
 * a file's length.  Some of this code is replicated here in our
 * soft updates implementation.  The freeblks->fb_chkcnt field is
 * used to transfer a part of this information to the procedure
 * that eventually de-allocates the blocks.
 *
 * This routine should be called from the routine that shortens
 * a file's length, before the inode's size or block pointers
 * are modified.  It will save the block pointer information for
 * later release and zero the inode so that the calling routine
 * can release it.
 */
void
softdep_setup_freeblocks(ip, length, flags)
	struct inode *ip;	/* The inode whose length is to be reduced */
	off_t length;		/* The new length for the file */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct ufs1_dinode *dp1;
	struct ufs2_dinode *dp2;
	struct freeblks *freeblks;
	struct inodedep *inodedep;
	struct allocdirect *adp;
	struct buf *bp;
	struct fs *fs;
	ufs2_daddr_t extblocks, datablocks;
	struct mount *mp;
	int i, delay, error;
	ufs_lbn_t tmpval;
	ufs_lbn_t lbn;

	fs = ip->i_fs;
	mp = UFSTOVFS(ip->i_ump);
	/* This routine only supports truncation to zero length. */
	if (length != 0)
		panic("softdep_setup_freeblocks: non-zero length");
	freeblks = newfreeblks(mp, ip);
	extblocks = 0;
	datablocks = 0;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	if ((flags & IO_NORMAL) != 0) {
		for (i = 0; i < NDADDR; i++)
			setup_freedirect(freeblks, ip, i, 0);
		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
		ip->i_size = 0;
		DIP_SET(ip, i_size, 0);
		datablocks = DIP(ip, i_blocks) - extblocks;
	}
	if ((flags & IO_EXT) != 0) {
		for (i = 0; i < NXADDR; i++)
			setup_freeext(freeblks, ip, i, 0);
		ip->i_din2->di_extsize = 0;
		datablocks += extblocks;
	}
#ifdef QUOTA
	/* Reference the quotas in case the block count is wrong in the end. */
	quotaref(ITOV(ip), freeblks->fb_quota);
	(void) chkdq(ip, -datablocks, NOCRED, 0);
#endif
	freeblks->fb_chkcnt = -datablocks;
	UFS_LOCK(ip->i_ump);
	fs->fs_pendingblocks += datablocks;
	UFS_UNLOCK(ip->i_ump);
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
	 * to delete its dependencies below.  Once the dependencies are gone
	 * the buffer can be safely released.
	 */
	if ((error = bread(ip->i_devvp,
	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
		/*
		 * NOTE(review): unlike softdep_journal_freeblocks(), this
		 * error path does not return and bp is used below after
		 * brelse() — confirm softdep_error() does not return here.
		 */
		brelse(bp);
		softdep_error("softdep_setup_freeblocks", error);
	}
	if (ip->i_ump->um_fstype == UFS1) {
		dp1 = ((struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number));
		ip->i_din1->di_freelink = dp1->di_freelink;
		*dp1 = *ip->i_din1;
	} else {
		dp2 = ((struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number));
		ip->i_din2->di_freelink = dp2->di_freelink;
		*dp2 = *ip->i_din2;
	}
	/*
	 * Find and eliminate any inode dependencies.
	 */
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & IOSTARTED) != 0)
		panic("softdep_setup_freeblocks: inode busy");
	/*
	 * Add the freeblks structure to the list of operations that
	 * must await the zero'ed inode being written to disk.  If we
	 * still have a bitmap dependency (delay == 0), then the inode
	 * has never been written to disk, so we can process the
	 * freeblks below once we have deleted the dependencies.
	 */
	delay = (inodedep->id_state & DEPCOMPLETE);
	if (delay)
		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
	else
		freeblks->fb_state |= COMPLETE;
	/*
	 * Because the file length has been truncated to zero, any
	 * pending block allocation dependency structures associated
	 * with this inode are obsolete and can simply be de-allocated.
	 * We must first merge the two dependency lists to get rid of
	 * any duplicate freefrag structures, then purge the merged list.
	 * If we still have a bitmap dependency, then the inode has never
	 * been written to disk, so we can free any fragments without delay.
	 */
	if (flags & IO_NORMAL) {
		merge_inode_lists(&inodedep->id_newinoupdt,
		    &inodedep->id_inoupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
			cancel_allocdirect(&inodedep->id_inoupdt, adp,
			    freeblks);
	}
	if (flags & IO_EXT) {
		merge_inode_lists(&inodedep->id_newextupdt,
		    &inodedep->id_extupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
			cancel_allocdirect(&inodedep->id_extupdt, adp,
			    freeblks);
	}
	FREE_LOCK(&lk);
	bdwrite(bp);
	/* Full truncation: lastlbn of -1 matches every buffer. */
	trunc_dependencies(ip, freeblks, -1, 0, flags);
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	freeblks->fb_state |= DEPCOMPLETE;
	/*
	 * If the inode with zeroed block pointers is now on disk
	 * we can start freeing blocks.
	 */
	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
		freeblks->fb_state |= INPROGRESS;
	else
		freeblks = NULL;
	FREE_LOCK(&lk);
	if (freeblks)
		handle_workitem_freeblocks(freeblks, 0);
	trunc_pages(ip, length, extblocks, flags);
}

/*
 * Eliminate pages from the page cache that back parts of this inode and
 * adjust the vnode pager's idea of our size.
 This prevents stale data
 * from hanging around in the page cache.
 */
static void
trunc_pages(ip, length, extblocks, flags)
	struct inode *ip;
	off_t length;		/* New file length. */
	ufs2_daddr_t extblocks;	/* Size of the ext attr area, in disk blocks. */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct vnode *vp;
	struct fs *fs;
	ufs_lbn_t lbn;
	off_t end, extend;

	vp = ITOV(ip);
	fs = ip->i_fs;
	/* Ext attr pages live at negative offsets; see extend below. */
	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
	if ((flags & IO_EXT) != 0)
		vn_pages_remove(vp, extend, 0);
	if ((flags & IO_NORMAL) == 0)
		return;
	BO_LOCK(&vp->v_bufobj);
	drain_output(vp);
	BO_UNLOCK(&vp->v_bufobj);
	/*
	 * The vnode pager eliminates file pages we eliminate indirects
	 * below.
	 */
	vnode_pager_setsize(vp, length);
	/*
	 * Calculate the end based on the last indirect we want to keep.  If
	 * the block extends into indirects we can just use the negative of
	 * its lbn.  Doubles and triples exist at lower numbers so we must
	 * be careful not to remove those, if they exist.  double and triple
	 * indirect lbns do not overlap with others so it is not important
	 * to verify how many levels are required.
	 */
	lbn = lblkno(fs, length);
	if (lbn >= NDADDR) {
		/* Calculate the virtual lbn of the triple indirect. */
		lbn = -lbn - (NIADDR - 1);
		end = OFF_TO_IDX(lblktosize(fs, lbn));
	} else
		end = extend;
	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
}

/*
 * See if the buf bp is in the range eliminated by truncation.
 * Returns non-zero when the buffer should be truncated; on a partial
 * match of the last block, *blkoffp is set to the byte offset to keep.
 */
static int
trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
	struct buf *bp;
	int *blkoffp;		/* Out: offset kept in a partial last block. */
	ufs_lbn_t lastlbn;	/* Last lbn to keep; -1 = full truncation. */
	int lastoff;		/* Byte offset kept within lastlbn. */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	ufs_lbn_t lbn;

	*blkoffp = 0;
	/* Only match ext/normal blocks as appropriate. */
	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
		return (0);
	/* ALTDATA is always a full truncation. */
	if ((bp->b_xflags & BX_ALTDATA) != 0)
		return (1);
	/* -1 is full truncation. */
	if (lastlbn == -1)
		return (1);
	/*
	 * If this is a partial truncate we only want those
	 * blocks and indirect blocks that cover the range
	 * we're after.
	 */
	lbn = bp->b_lblkno;
	if (lbn < 0)
		/* Map an indirect's virtual lbn back to the data range. */
		lbn = -(lbn + lbn_level(lbn));
	if (lbn < lastlbn)
		return (0);
	/* Here we only truncate lblkno if it's partial. */
	if (lbn == lastlbn) {
		if (lastoff == 0)
			return (0);
		*blkoffp = lastoff;
	}
	return (1);
}

/*
 * Eliminate any dependencies that exist in memory beyond lblkno:off
 */
static void
trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
	struct inode *ip;
	struct freeblks *freeblks;
	ufs_lbn_t lastlbn;	/* Last lbn to keep; -1 = full truncation. */
	int lastoff;		/* Frags/bytes kept within lastlbn. */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct bufobj *bo;
	struct vnode *vp;
	struct buf *bp;
	struct fs *fs;
	int blkoff;

	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 * Once they are all there, walk the list and get rid of
	 * any dependencies.
	 */
	fs = ip->i_fs;
	vp = ITOV(ip);
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	drain_output(vp);
	/* BV_SCANNED marks buffers already examined across restarts. */
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
restart:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
		if (bp->b_vflags & BV_SCANNED)
			continue;
		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
			bp->b_vflags |= BV_SCANNED;
			continue;
		}
		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
			goto restart;
		BO_UNLOCK(bo);
		if (deallocate_dependencies(bp, freeblks, blkoff))
			bqrelse(bp);
		else
			brelse(bp);
		BO_LOCK(bo);
		goto restart;
	}
	/*
	 * Now do the work of vtruncbuf while also matching indirect blocks.
	 */
	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
cleanrestart:
	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
		if (bp->b_vflags & BV_SCANNED)
			continue;
		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
			bp->b_vflags |= BV_SCANNED;
			continue;
		}
		if (BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
		    BO_MTX(bo)) == ENOLCK) {
			BO_LOCK(bo);
			goto cleanrestart;
		}
		bp->b_vflags |= BV_SCANNED;
		BO_LOCK(bo);
		bremfree(bp);
		BO_UNLOCK(bo);
		if (blkoff != 0) {
			/* Partial last block: shrink rather than discard. */
			allocbuf(bp, blkoff);
			bqrelse(bp);
		} else {
			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
			brelse(bp);
		}
		BO_LOCK(bo);
		goto cleanrestart;
	}
	drain_output(vp);
	BO_UNLOCK(bo);
}

/*
 * Cancel the dependencies on a directory pagedep that is being truncated
 * away.  Returns ERESTART when we slept on a journal write (the lock was
 * dropped, so the caller must rescan the buffer list) and 0 when done.
 */
static int
cancel_pagedep(pagedep, freeblks, blkoff)
	struct pagedep *pagedep;
	struct freeblks *freeblks;
	int blkoff;		/* Bytes kept in the block; 0 = full block. */
{
	struct jremref *jremref;
	struct jmvref *jmvref;
	struct dirrem *dirrem, *tmp;
	int i;

	/*
	 * Copy any directory remove dependencies to the list
	 * to be processed after the freeblks proceeds.  If
	 * directory entry never made it to disk they
	 * can be dumped directly onto the work list.
	 */
	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
		/* Skip this directory removal if it is intended to remain. */
		if (dirrem->dm_offset < blkoff)
			continue;
		/*
		 * If there are any dirrems we wait for the journal write
		 * to complete and then restart the buf scan as the lock
		 * has been dropped.
		 */
		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
			jwait(&jremref->jr_list, MNT_WAIT);
			return (ERESTART);
		}
		LIST_REMOVE(dirrem, dm_next);
		dirrem->dm_dirinum = pagedep->pd_ino;
		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
	}
	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
		jwait(&jmvref->jm_list, MNT_WAIT);
		return (ERESTART);
	}
	/*
	 * When we're partially truncating a pagedep we just want to flush
	 * journal entries and return.  There can not be any adds in the
	 * truncated portion of the directory and newblk must remain if
	 * part of the block remains.
	 */
	if (blkoff != 0) {
		struct diradd *dap;

		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
			if (dap->da_offset > blkoff)
				panic("cancel_pagedep: diradd %p off %d > %d",
				    dap, dap->da_offset, blkoff);
		for (i = 0; i < DAHASHSZ; i++)
			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
				if (dap->da_offset > blkoff)
					panic("cancel_pagedep: diradd %p off %d > %d",
					    dap, dap->da_offset, blkoff);
		return (0);
	}
	/*
	 * There should be no directory add dependencies present
	 * as the directory could not be truncated until all
	 * children were removed.
	 *
	 * NOTE(review): KASSERT messages below name the old caller,
	 * deallocate_dependencies, not this function.
	 */
	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
	    ("deallocate_dependencies: pendinghd != NULL"));
	for (i = 0; i < DAHASHSZ; i++)
		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
		    ("deallocate_dependencies: diraddhd != NULL"));
	if ((pagedep->pd_state & NEWBLOCK) != 0)
		free_newdirblk(pagedep->pd_newdirblk);
	if (free_pagedep(pagedep) == 0)
		panic("Failed to free pagedep %p", pagedep);
	return (0);
}

/*
 * Reclaim any dependency structures from a buffer that is about to
 * be reallocated to a new vnode.  The buffer must be locked, thus,
 * no I/O completion operations can occur while we are manipulating
 * its associated dependencies.  The mutex is held so that other I/O's
 * associated with related dependencies do not occur.
 *
 * Returns 0 when the buffer may be discarded, EBUSY for a partial
 * truncation where the (shrunken) buffer must be kept, and ERESTART
 * when a pagedep cancel slept and the caller must rescan.
 */
static int
deallocate_dependencies(bp, freeblks, off)
	struct buf *bp;
	struct freeblks *freeblks;
	int off;		/* Bytes kept in the block; 0 = full block. */
{
	struct indirdep *indirdep;
	struct pagedep *pagedep;
	struct allocdirect *adp;
	struct worklist *wk, *wkn;

	ACQUIRE_LOCK(&lk);
	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
		switch (wk->wk_type) {
		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			if (bp->b_lblkno >= 0 ||
			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
				panic("deallocate_dependencies: not indir");
			cancel_indirdep(indirdep, bp, freeblks);
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			if (cancel_pagedep(pagedep, freeblks, off)) {
				FREE_LOCK(&lk);
				return (ERESTART);
			}
			continue;

		case D_ALLOCINDIR:
			/*
			 * Simply remove the allocindir, we'll find it via
			 * the indirdep where we can clear pointers if
			 * needed.
			 */
			WORKLIST_REMOVE(wk);
			continue;

		case D_FREEWORK:
			/*
			 * A truncation is waiting for the zero'd pointers
			 * to be written.  It can be freed when the freeblks
			 * is journaled.
			 */
			WORKLIST_REMOVE(wk);
			wk->wk_state |= ONDEPLIST;
			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
			break;

		case D_ALLOCDIRECT:
			/*
			 * Tolerated only for partial truncations (off != 0);
			 * a full truncation should have canceled these
			 * already, so fall into the panic below.
			 */
			adp = WK_ALLOCDIRECT(wk);
			if (off != 0)
				continue;
			/* FALLTHROUGH */
		default:
			panic("deallocate_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	FREE_LOCK(&lk);
	/*
	 * Don't throw away this buf, we were partially truncating and
	 * some deps may always remain.
	 */
	if (off) {
		allocbuf(bp, off);
		bp->b_vflags |= BV_SCANNED;
		return (EBUSY);
	}
	bp->b_flags |= B_INVAL | B_NOCACHE;

	return (0);
}

/*
 * An allocdirect is being canceled due to a truncate.  We must make sure
 * the journal entry is released in concert with the blkfree that releases
 * the storage.  Completed journal entries must not be released until the
 * space is no longer pointed to by the inode or in the bitmap.
 */
static void
cancel_allocdirect(adphead, adp, freeblks)
	struct allocdirectlst *adphead;
	struct allocdirect *adp;
	struct freeblks *freeblks;
{
	struct freework *freework;
	struct newblk *newblk;
	struct worklist *wk;

	TAILQ_REMOVE(adphead, adp, ad_next);
	/* An allocdirect's initial member is an embedded newblk. */
	newblk = (struct newblk *)adp;
	freework = NULL;
	/*
	 * Find the correct freework structure.
	 */
	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
		if (wk->wk_type != D_FREEWORK)
			continue;
		freework = WK_FREEWORK(wk);
		if (freework->fw_blkno == newblk->nb_newblkno)
			break;
	}
	if (freework == NULL)
		panic("cancel_allocdirect: Freework not found");
	/*
	 * If a newblk exists at all we still have the journal entry that
	 * initiated the allocation so we do not need to journal the free.
	 */
	cancel_jfreeblk(freeblks, freework->fw_blkno);
	/*
	 * If the journal hasn't been written the jnewblk must be passed
	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
	 * this by linking the journal dependency into the freework to be
	 * freed when freework_freeblock() is called.  If the journal has
	 * been written we can simply reclaim the journal space when the
	 * freeblks work is complete.
	 */
	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
	    &freeblks->fb_jwork);
	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
}


/*
 * Cancel a new block allocation.  May be an indirect or direct block.  We
 * remove it from various lists and return any journal record that needs to
 * be resolved by the caller.
 *
 * A special consideration is made for indirects which were never pointed
 * at on disk and will never be found once this block is released.
 */
static struct jnewblk *
cancel_newblk(newblk, wk, wkhd)
	struct newblk *newblk;
	struct worklist *wk;	/* Dependency to carry an unwritten jnewblk. */
	struct workhead *wkhd;	/* Where completed journal work is moved. */
{
	struct jnewblk *jnewblk;

	newblk->nb_state |= GOINGAWAY;
	/*
	 * Previously we traversed the completedhd on each indirdep
	 * attached to this newblk to cancel them and gather journal
	 * work.  Since we need only the oldest journal segment and
	 * the lowest point on the tree will always have the oldest
	 * journal segment we are free to release the segments
	 * of any subordinates and may leave the indirdep list to
	 * indirdep_complete() when this newblk is freed.
	 */
	if (newblk->nb_state & ONDEPLIST) {
		newblk->nb_state &= ~ONDEPLIST;
		LIST_REMOVE(newblk, nb_deps);
	}
	if (newblk->nb_state & ONWORKLIST)
		WORKLIST_REMOVE(&newblk->nb_list);
	/*
	 * If the journal entry hasn't been written we save a pointer to
	 * the dependency that frees it until it is written or the
	 * superseding operation completes.
	 */
	jnewblk = newblk->nb_jnewblk;
	if (jnewblk != NULL && wk != NULL) {
		newblk->nb_jnewblk = NULL;
		jnewblk->jn_dep = wk;
	}
	if (!LIST_EMPTY(&newblk->nb_jwork))
		jwork_move(wkhd, &newblk->nb_jwork);
	/*
	 * When truncating we must free the newdirblk early to remove
	 * the pagedep from the hash before returning.
	 */
	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
		free_newdirblk(WK_NEWDIRBLK(wk));
	if (!LIST_EMPTY(&newblk->nb_newdirblk))
		panic("cancel_newblk: extra newdirblk");

	return (jnewblk);
}

/*
 * Schedule the freefrag associated with a newblk to be released once
 * the pointers are written and the previous block is no longer needed.
 */
static void
newblk_freefrag(newblk)
	struct newblk *newblk;
{
	struct freefrag *freefrag;

	if (newblk->nb_freefrag == NULL)
		return;
	freefrag = newblk->nb_freefrag;
	newblk->nb_freefrag = NULL;
	freefrag->ff_state |= COMPLETE;
	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
		add_to_worklist(&freefrag->ff_list, 0);
}

/*
 * Free a newblk.  Generate a new freefrag work request if appropriate.
 * This must be called after the inode pointer and any direct block pointers
 * are valid or fully removed via truncate or frag extension.
 */
static void
free_newblk(newblk)
	struct newblk *newblk;
{
	struct indirdep *indirdep;
	struct worklist *wk;

	KASSERT(newblk->nb_jnewblk == NULL,
	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
	/* Caller must hold the softdep lock. */
	mtx_assert(&lk, MA_OWNED);
	newblk_freefrag(newblk);
	if (newblk->nb_state & ONDEPLIST)
		LIST_REMOVE(newblk, nb_deps);
	if (newblk->nb_state & ONWORKLIST)
		WORKLIST_REMOVE(&newblk->nb_list);
	LIST_REMOVE(newblk, nb_hash);
	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
		free_newdirblk(WK_NEWDIRBLK(wk));
	if (!LIST_EMPTY(&newblk->nb_newdirblk))
		panic("free_newblk: extra newdirblk");
	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
		indirdep_complete(indirdep);
	handle_jwork(&newblk->nb_jwork);
	/* Reset the type so WORKITEM_FREE accounting matches. */
	newblk->nb_list.wk_type = D_NEWBLK;
	WORKITEM_FREE(newblk, D_NEWBLK);
}

/*
 * Free a newdirblk.  Clear the NEWBLOCK flag on its associated pagedep.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_newdirblk(newdirblk)
	struct newdirblk *newdirblk;
{
	struct pagedep *pagedep;
	struct diradd *dap;
	struct worklist *wk;

	mtx_assert(&lk, MA_OWNED);
	WORKLIST_REMOVE(&newdirblk->db_list);
	/*
	 * If the pagedep is still linked onto the directory buffer
	 * dependency chain, then some of the entries on the
	 * pd_pendinghd list may not be committed to disk yet.  In
	 * this case, we will simply clear the NEWBLOCK flag and
	 * let the pd_pendinghd list be processed when the pagedep
	 * is next written.  If the pagedep is no longer on the buffer
	 * dependency chain, then all the entries on the pd_pending
	 * list are committed to disk and we can free them here.
	 */
	pagedep = newdirblk->db_pagedep;
	pagedep->pd_state &= ~NEWBLOCK;
	if ((pagedep->pd_state & ONWORKLIST) == 0) {
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
			free_diradd(dap, NULL);
		/*
		 * If no dependencies remain, the pagedep will be freed.
		 */
		free_pagedep(pagedep);
	}
	/* Should only ever be one item in the list. */
	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
		WORKLIST_REMOVE(wk);
		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
	}
	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}

/*
 * Prepare an inode to be freed.  The actual free operation is not
 * done until the zero'ed inode has been written to disk.
 */
void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;	/* Vnode of the inode being freed. */
	ino_t ino;		/* Inode number to free. */
	int mode;		/* Mode of the freed inode. */
{
	struct inode *ip = VTOI(pvp);
	struct inodedep *inodedep;
	struct freefile *freefile;
	struct freeblks *freeblks;

	/*
	 * This sets up the inode de-allocation dependency.
	 */
	freefile = malloc(sizeof(struct freefile),
		M_FREEFILE, M_SOFTDEP_FLAGS);
	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
	freefile->fx_mode = mode;
	freefile->fx_oldinum = ino;
	freefile->fx_devvp = ip->i_devvp;
	LIST_INIT(&freefile->fx_jwork);
	UFS_LOCK(ip->i_ump);
	ip->i_fs->fs_pendinginodes += 1;
	UFS_UNLOCK(ip->i_ump);

	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk.  If the allocated inode has never been
	 * written to disk, then the on-disk inode is zero'ed.  In either
	 * case we can free the file immediately.  If the journal was
	 * canceled before being written the inode will never make it to
	 * disk and we must send the canceled journal entries to
	 * ffs_freefile() to be cleared in conjunction with the bitmap.
	 * Any blocks waiting on the inode to write can be safely freed
	 * here as it will never be written.
	 */
	ACQUIRE_LOCK(&lk);
	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
	if (inodedep) {
		/*
		 * Clear out freeblks that no longer need to reference
		 * this inode.
		 */
		while ((freeblks =
		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
			    fb_next);
			freeblks->fb_state &= ~ONDEPLIST;
		}
		/*
		 * Remove this inode from the unlinked list.
		 */
		if (inodedep->id_state & UNLINKED) {
			/*
			 * Save the journal work to be freed with the bitmap
			 * before we clear UNLINKED.  Otherwise it can be lost
			 * if the inode block is written.
			 */
			handle_bufwait(inodedep, &freefile->fx_jwork);
			clear_unlinked_inodedep(inodedep);
			/* Re-acquire inodedep as we've dropped lk. */
			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
		}
	}
	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
		FREE_LOCK(&lk);
		handle_workitem_freefile(freefile);
		return;
	}
	if ((inodedep->id_state & DEPCOMPLETE) == 0)
		inodedep->id_state |= GOINGAWAY;
	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	FREE_LOCK(&lk);
	if (ip->i_number == ino)
		ip->i_flag |= IN_MODIFIED;
}

/*
 * Check to see if an inode has never been written to disk.  If
 * so free the inodedep and return success, otherwise return failure.
 * This routine must be called with splbio interrupts blocked.
 *
 * If we still have a bitmap dependency, then the inode has never
 * been written to disk.  Drop the dependency as it is no longer
 * necessary since the inode is being deallocated.  We set the
 * ALLCOMPLETE flags since the bitmap now properly shows that the
 * inode is not allocated.
 Even if the inode is actively being
 * written, it has been rolled back to its zero'ed state, so we
 * are ensured that a zero inode is what is on the disk. For short
 * lived files, this change will usually result in removing all the
 * dependencies from the inode so that it can be freed immediately.
 */
static int
check_inode_unwritten(inodedep)
	struct inodedep *inodedep;
{

	mtx_assert(&lk, MA_OWNED);

	/*
	 * Any pending dependency list or link-count work means the inode
	 * has (or will have) an on-disk presence; refuse to short-circuit.
	 */
	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
	    !LIST_EMPTY(&inodedep->id_inowait) ||
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
	    inodedep->id_mkdiradd != NULL ||
	    inodedep->id_nlinkdelta != 0)
		return (0);
	/*
	 * Another process might be in initiate_write_inodeblock_ufs[12]
	 * trying to allocate memory without holding "Softdep Lock".
	 */
	if ((inodedep->id_state & IOSTARTED) != 0 &&
	    inodedep->id_savedino1 == NULL)
		return (0);

	if (inodedep->id_state & ONDEPLIST)
		LIST_REMOVE(inodedep, id_deps);
	inodedep->id_state &= ~ONDEPLIST;
	inodedep->id_state |= ALLCOMPLETE;
	inodedep->id_bmsafemap = NULL;
	if (inodedep->id_state & ONWORKLIST)
		WORKLIST_REMOVE(&inodedep->id_list);
	if (inodedep->id_savedino1 != NULL) {
		free(inodedep->id_savedino1, M_SAVEDINO);
		inodedep->id_savedino1 = NULL;
	}
	/* After the cleanup above the inodedep must be freeable. */
	if (free_inodedep(inodedep) == 0)
		panic("check_inode_unwritten: busy inode");
	return (1);
}

/*
 * Try to free an inodedep structure. Return 1 if it could be freed.
 * Returns 0 (and leaves the inodedep untouched) while any dependency
 * list, journal reference, or link-count delta still references it.
 * Caller must hold the softdep lock.
 */
static int
free_inodedep(inodedep)
	struct inodedep *inodedep;
{

	mtx_assert(&lk, MA_OWNED);
	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
	    !LIST_EMPTY(&inodedep->id_inowait) ||
	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
	    inodedep->id_mkdiradd != NULL ||
	    inodedep->id_nlinkdelta != 0 ||
	    inodedep->id_savedino1 != NULL)
		return (0);
	if (inodedep->id_state & ONDEPLIST)
		LIST_REMOVE(inodedep, id_deps);
	LIST_REMOVE(inodedep, id_hash);
	WORKITEM_FREE(inodedep, D_INODEDEP);
	return (1);
}

/*
 * Free the block referenced by a freework structure. The parent freeblks
 * structure is released and completed when the final cg bitmap reaches
 * the disk. This routine may be freeing a jnewblk which never made it to
 * disk in which case we do not have to wait as the operation is undone
 * in memory immediately.
 *
 * Called with lk held; lk is dropped around the freeblks_free()/
 * ffs_blkfree() calls below and re-acquired before returning.
 */
static void
freework_freeblock(freework)
	struct freework *freework;
{
	struct freeblks *freeblks;
	struct jnewblk *jnewblk;
	struct ufsmount *ump;
	struct workhead wkhd;
	struct fs *fs;
	int bsize;
	int needj;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Handle partial truncate separately.
	 */
	if (freework->fw_indir) {
		complete_trunc_indir(freework);
		return;
	}
	freeblks = freework->fw_freeblks;
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
	bsize = lfragtosize(fs, freework->fw_frags);
	LIST_INIT(&wkhd);
	/*
	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
	 * on the indirblk hashtable and prevents premature freeing.
	 */
	freework->fw_state |= DEPCOMPLETE;
	/*
	 * SUJ needs to wait for the segment referencing freed indirect
	 * blocks to expire so that we know the checker will not confuse
	 * a re-allocated indirect block with its old contents.
	 */
	if (needj && freework->fw_lbn <= -NDADDR)
		indirblk_insert(freework);
	/*
	 * If we are canceling an existing jnewblk pass it to the free
	 * routine, otherwise pass the freeblk which will ultimately
	 * release the freeblks.  If we're not journaling, we can just
	 * free the freeblks immediately.
	 */
	jnewblk = freework->fw_jnewblk;
	if (jnewblk != NULL) {
		cancel_jnewblk(jnewblk, &wkhd);
		needj = 0;
	} else if (needj) {
		freework->fw_state |= DELAYEDFREE;
		freeblks->fb_cgwait++;
		WORKLIST_INSERT(&wkhd, &freework->fw_list);
	}
	FREE_LOCK(&lk);
	freeblks_free(ump, freeblks, btodb(bsize));
	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
	ACQUIRE_LOCK(&lk);
	/*
	 * The jnewblk will be discarded and the bits in the map never
	 * made it to disk.  We can immediately free the freeblk.
	 */
	if (needj == 0)
		handle_written_freework(freework);
}

/*
 * We enqueue freework items that need processing back on the freeblks and
 * add the freeblks to the worklist.  This makes it easier to find all work
 * required to flush a truncation in process_truncates().
 */
static void
freework_enqueue(freework)
	struct freework *freework;
{
	struct freeblks *freeblks;

	freeblks = freework->fw_freeblks;
	/* Only queue work that is not already being processed. */
	if ((freework->fw_state & INPROGRESS) == 0)
		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
	/*
	 * Kick the parent freeblks onto the worklist once it is fully
	 * complete, idle, and has no journal block dependencies left.
	 */
	if ((freeblks->fb_state &
	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
	    LIST_EMPTY(&freeblks->fb_jblkdephd))
		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
}

/*
 * Start, continue, or finish the process of freeing an indirect block tree.
 * The free operation may be paused at any point with fw_off containing the
 * offset to restart from.  This enables us to implement some flow control
 * for large truncates which may fan out and generate a huge number of
 * dependencies.
 */
static void
handle_workitem_indirblk(freework)
	struct freework *freework;
{
	struct freeblks *freeblks;
	struct ufsmount *ump;
	struct fs *fs;

	freeblks = freework->fw_freeblks;
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	/* Dependencies already satisfied: just retire the freework. */
	if (freework->fw_state & DEPCOMPLETE) {
		handle_written_freework(freework);
		return;
	}
	/* All child pointers processed: free the indirect block itself. */
	if (freework->fw_off == NINDIR(fs)) {
		freework_freeblock(freework);
		return;
	}
	/* Otherwise resume the traversal; lk is dropped for the I/O. */
	freework->fw_state |= INPROGRESS;
	FREE_LOCK(&lk);
	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
	    freework->fw_lbn);
	ACQUIRE_LOCK(&lk);
}

/*
 * Called when a freework structure attached to a cg buf is written.  The
 * ref on either the parent or the freeblks structure is released and
 * the freeblks is added back to the worklist if there is more work to do.
 */
static void
handle_written_freework(freework)
	struct freework *freework;
{
	struct freeblks *freeblks;
	struct freework *parent;

	/*
	 * Capture the back pointers first: the freework itself may be
	 * freed by WORKITEM_FREE below and must not be touched after.
	 */
	freeblks = freework->fw_freeblks;
	parent = freework->fw_parent;
	if (freework->fw_state & DELAYEDFREE)
		freeblks->fb_cgwait--;
	freework->fw_state |= COMPLETE;
	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
		WORKITEM_FREE(freework, D_FREEWORK);
	if (parent) {
		if (--parent->fw_ref == 0)
			freework_enqueue(parent);
		return;
	}
	if (--freeblks->fb_ref != 0)
		return;
	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
}

/*
 * This workitem routine performs the block de-allocation.
 * The workitem is added to the pending list after the updated
 * inode block has been written to disk.  As mentioned above,
 * checks regarding the number of blocks de-allocated (compared
 * to the number of blocks allocated for the file) are also
 * performed in this function.
 */
static int
handle_workitem_freeblocks(freeblks, flags)
	struct freeblks *freeblks;
	int flags;
{
	struct freework *freework;
	struct newblk *newblk;
	struct allocindir *aip;
	struct ufsmount *ump;
	struct worklist *wk;

	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
	    ("handle_workitem_freeblocks: Journal entries not written."));
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	ACQUIRE_LOCK(&lk);
	/* Drain the per-freeblks work queue, dispatching by item type. */
	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {
		case D_DIRREM:
			wk->wk_state |= COMPLETE;
			add_to_worklist(wk, 0);
			continue;

		case D_ALLOCDIRECT:
			free_newblk(WK_NEWBLK(wk));
			continue;

		case D_ALLOCINDIR:
			aip = WK_ALLOCINDIR(wk);
			freework = NULL;
			if (aip->ai_state & DELAYEDFREE) {
				/* newfreework() may sleep; drop lk. */
				FREE_LOCK(&lk);
				freework = newfreework(ump, freeblks, NULL,
				    aip->ai_lbn, aip->ai_newblkno,
				    ump->um_fs->fs_frag, 0, 0);
				ACQUIRE_LOCK(&lk);
			}
			newblk = WK_NEWBLK(wk);
			/*
			 * NOTE(review): this assumes any surviving jnewblk
			 * implies DELAYEDFREE was set (so freework is
			 * non-NULL here); otherwise the assignment below
			 * would dereference NULL — confirm against
			 * cancel_allocindir()'s invariants.
			 */
			if (newblk->nb_jnewblk) {
				freework->fw_jnewblk = newblk->nb_jnewblk;
				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
				newblk->nb_jnewblk = NULL;
			}
			free_newblk(newblk);
			continue;

		case D_FREEWORK:
			freework = WK_FREEWORK(wk);
			if (freework->fw_lbn <= -NDADDR)
				handle_workitem_indirblk(freework);
			else
				freework_freeblock(freework);
			continue;
		default:
			panic("handle_workitem_freeblocks: Unknown type %s",
			    TYPENAME(wk->wk_type));
		}
	}
	/*
	 * If references remain the freeblks cannot be retired yet;
	 * clear INPROGRESS and wake any waiters instead.
	 */
	if (freeblks->fb_ref != 0) {
		freeblks->fb_state &= ~INPROGRESS;
		wake_worklist(&freeblks->fb_list);
		freeblks = NULL;
	}
	FREE_LOCK(&lk);
	if (freeblks)
		return handle_complete_freeblocks(freeblks, flags);
	return (0);
}

/*
 * Handle completion of block free via truncate.  This allows fs_pending
 * to track the actual free block count more closely than if we only updated
 * it at the end.  We must be careful to handle cases where the block count
 * on free was incorrect.
 *
 * 'blocks' is the device-block count just released; only the portion
 * still outstanding (remain) is subtracted from fs_pendingblocks.
 */
static void
freeblks_free(ump, freeblks, blocks)
	struct ufsmount *ump;
	struct freeblks *freeblks;
	int blocks;
{
	struct fs *fs;
	ufs2_daddr_t remain;

	UFS_LOCK(ump);
	remain = -freeblks->fb_chkcnt;
	freeblks->fb_chkcnt += blocks;
	if (remain > 0) {
		if (remain < blocks)
			blocks = remain;
		fs = ump->um_fs;
		fs->fs_pendingblocks -= blocks;
	}
	UFS_UNLOCK(ump);
}

/*
 * Once all of the freework workitems are complete we can retire the
 * freeblocks dependency and any journal work awaiting completion.  This
 * can not be called until all other dependencies are stable on disk.
 *
 * Returns 0 on success or EBUSY if the inode's vnode could not be
 * obtained to correct its block count.
 */
static int
handle_complete_freeblocks(freeblks, flags)
	struct freeblks *freeblks;
	int flags;
{
	struct inodedep *inodedep;
	struct inode *ip;
	struct vnode *vp;
	struct fs *fs;
	struct ufsmount *ump;
	ufs2_daddr_t spare;

	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	flags = LK_EXCLUSIVE | flags;
	spare = freeblks->fb_chkcnt;

	/*
	 * If we did not release the expected number of blocks we may have
	 * to adjust the inode block count here.  Only do so if it wasn't
	 * a truncation to zero and the modrev still matches.
	 */
	if (spare && freeblks->fb_len != 0) {
		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
		    flags, &vp, FFSV_FORCEINSMQ) != 0)
			return (EBUSY);
		ip = VTOI(vp);
		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
			ip->i_flag |= IN_CHANGE;
			/*
			 * We must wait so this happens before the
			 * journal is reclaimed.
			 */
			ffs_update(vp, 1);
		}
		vput(vp);
	}
	/* A negative residue means we over-counted pending blocks. */
	if (spare < 0) {
		UFS_LOCK(ump);
		fs->fs_pendingblocks += spare;
		UFS_UNLOCK(ump);
	}
#ifdef QUOTA
	/* Handle spare. */
	if (spare)
		quotaadj(freeblks->fb_quota, ump, -spare);
	quotarele(freeblks->fb_quota);
#endif
	ACQUIRE_LOCK(&lk);
	if (freeblks->fb_state & ONDEPLIST) {
		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
		    0, &inodedep);
		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
		freeblks->fb_state &= ~ONDEPLIST;
		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
			free_inodedep(inodedep);
	}
	/*
	 * All of the freeblock deps must be complete prior to this call
	 * so it's now safe to complete earlier outstanding journal entries.
	 */
	handle_jwork(&freeblks->fb_jwork);
	WORKITEM_FREE(freeblks, D_FREEBLKS);
	FREE_LOCK(&lk);
	return (0);
}

/*
 * Release blocks associated with the freeblks and stored in the indirect
 * block dbn. If level is greater than SINGLE, the block is an indirect block
 * and recursive calls to indirtrunc must be used to cleanse other indirect
 * blocks.
 *
 * This handles partial and complete truncation of blocks.  Partial is noted
 * with goingaway == 0.  In this case the freework is completed after the
 * zero'd indirects are written to disk.  For full truncation the freework
 * is completed after the block is freed.
 */
static void
indir_trunc(freework, dbn, lbn)
	struct freework *freework;	/* tracks this level of the tree */
	ufs2_daddr_t dbn;		/* device block number of the indirect */
	ufs_lbn_t lbn;			/* (negative) logical block number */
{
	struct freework *nfreework;
	struct workhead wkhd;
	struct freeblks *freeblks;
	struct buf *bp;
	struct fs *fs;
	struct indirdep *indirdep;
	struct ufsmount *ump;
	ufs1_daddr_t *bap1 = 0;
	ufs2_daddr_t nb, nnb, *bap2 = 0;
	ufs_lbn_t lbnadd, nlbn;
	int i, nblocks, ufs1fmt;
	int freedblocks;
	int goingaway;
	int freedeps;
	int needj;
	int level;
	int cnt;

	freeblks = freework->fw_freeblks;
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	/*
	 * Get buffer of block pointers to be freed.  There are three cases:
	 *
	 * 1) Partial truncate caches the indirdep pointer in the freework
	 *    which provides us a back copy to the save bp which holds the
	 *    pointers we want to clear.  When this completes the zero
	 *    pointers are written to the real copy.
	 * 2) The indirect is being completely truncated, cancel_indirdep()
	 *    eliminated the real copy and placed the indirdep on the saved
	 *    copy.  The indirdep and buf are discarded when this completes.
	 * 3) The indirect was not in memory, we read a copy off of the disk
	 *    using the devvp and drop and invalidate the buffer when we're
	 *    done.
	 */
	goingaway = 1;
	indirdep = NULL;
	if (freework->fw_indir != NULL) {
		/* Case 1: partial truncate. */
		goingaway = 0;
		indirdep = freework->fw_indir;
		bp = indirdep->ir_savebp;
		if (bp == NULL || bp->b_blkno != dbn)
			panic("indir_trunc: Bad saved buf %p blkno %jd",
			    bp, (intmax_t)dbn);
	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
		/*
		 * Case 2: the saved copy is still in core.
		 * The lock prevents the buf dep list from changing and
		 * indirects on devvp should only ever have one dependency.
		 */
		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
			panic("indir_trunc: Bad indirdep %p from buf %p",
			    indirdep, bp);
	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
	    NOCRED, &bp) != 0) {
		/* Case 3 failed: release the buffer and give up. */
		brelse(bp);
		return;
	}
	ACQUIRE_LOCK(&lk);
	/* Protects against a race with complete_trunc_indir(). */
	freework->fw_state &= ~INPROGRESS;
	/*
	 * If we have an indirdep we need to enforce the truncation order
	 * and discard it when it is complete.
	 */
	if (indirdep) {
		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
			/*
			 * Add the complete truncate to the list on the
			 * indirdep to enforce in-order processing.
			 */
			if (freework->fw_indir == NULL)
				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
				    freework, fw_next);
			FREE_LOCK(&lk);
			return;
		}
		/*
		 * If we're goingaway, free the indirdep.  Otherwise it will
		 * linger until the write completes.
		 */
		if (goingaway) {
			free_indirdep(indirdep);
			ump->um_numindirdeps -= 1;
		}
	}
	FREE_LOCK(&lk);
	/* Initialize pointers depending on block size. */
	if (ump->um_fstype == UFS1) {
		bap1 = (ufs1_daddr_t *)bp->b_data;
		nb = bap1[freework->fw_off];
		ufs1fmt = 1;
	} else {
		bap2 = (ufs2_daddr_t *)bp->b_data;
		nb = bap2[freework->fw_off];
		ufs1fmt = 0;
	}
	level = lbn_level(lbn);
	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
	lbnadd = lbn_offset(fs, level);
	nblocks = btodb(fs->fs_bsize);
	nfreework = freework;
	freedeps = 0;
	cnt = 0;
	/*
	 * Reclaim blocks.  Traverses into nested indirect levels and
	 * arranges for the current level to be freed when subordinates
	 * are free when journaling.
	 */
	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
		/* Peek at the next pointer to detect CG boundaries below. */
		if (i != NINDIR(fs) - 1) {
			if (ufs1fmt)
				nnb = bap1[i+1];
			else
				nnb = bap2[i+1];
		} else
			nnb = 0;
		if (nb == 0)
			continue;
		cnt++;
		if (level != 0) {
			/* Recurse into the next lower indirect level. */
			nlbn = (lbn + 1) - (i * lbnadd);
			if (needj != 0) {
				nfreework = newfreework(ump, freeblks, freework,
				    nlbn, nb, fs->fs_frag, 0, 0);
				freedeps++;
			}
			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
		} else {
			struct freedep *freedep;

			/*
			 * Attempt to aggregate freedep dependencies for
			 * all blocks being released to the same CG.
			 */
			LIST_INIT(&wkhd);
			if (needj != 0 &&
			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
				freedep = newfreedep(freework);
				WORKLIST_INSERT_UNLOCKED(&wkhd,
				    &freedep->fd_list);
				freedeps++;
			}
			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
			    fs->fs_bsize, freeblks->fb_inum,
			    freeblks->fb_vtype, &wkhd);
		}
	}
	if (goingaway) {
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	freedblocks = 0;
	if (level == 0)
		freedblocks = (nblocks * cnt);
	/* Without journaling the indirect itself is freed below too. */
	if (needj == 0)
		freedblocks += nblocks;
	freeblks_free(ump, freeblks, freedblocks);
	/*
	 * If we are journaling set up the ref counts and offset so this
	 * indirect can be completed when its children are free.
	 */
	if (needj) {
		ACQUIRE_LOCK(&lk);
		freework->fw_off = i;
		freework->fw_ref += freedeps;
		freework->fw_ref -= NINDIR(fs) + 1;
		if (level == 0)
			freeblks->fb_cgwait += freedeps;
		if (freework->fw_ref == 0)
			freework_freeblock(freework);
		FREE_LOCK(&lk);
		return;
	}
	/*
	 * If we're not journaling we can free the indirect now.
	 */
	dbn = dbtofsb(fs, dbn);
	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
	/* Non SUJ softdep does single-threaded truncations. */
	if (freework->fw_blkno == dbn) {
		freework->fw_state |= ALLCOMPLETE;
		ACQUIRE_LOCK(&lk);
		handle_written_freework(freework);
		FREE_LOCK(&lk);
	}
	return;
}

/*
 * Cancel an allocindir when it is removed via truncation.  When bp is not
 * NULL the indirect never appeared on disk and is scheduled to be freed
 * independently of the indir so we can more easily track journal work.
 */
static void
cancel_allocindir(aip, bp, freeblks, trunc)
	struct allocindir *aip;
	struct buf *bp;
	struct freeblks *freeblks;
	int trunc;
{
	struct indirdep *indirdep;
	struct freefrag *freefrag;
	struct newblk *newblk;

	/* An allocindir embeds a newblk as its first member. */
	newblk = (struct newblk *)aip;
	LIST_REMOVE(aip, ai_next);
	/*
	 * We must eliminate the pointer in bp if it must be freed on its
	 * own due to partial truncate or pending journal work.
	 */
	if (bp && (trunc || newblk->nb_jnewblk)) {
		/*
		 * Clear the pointer and mark the aip to be freed
		 * directly if it never existed on disk.
		 */
		aip->ai_state |= DELAYEDFREE;
		indirdep = aip->ai_indirdep;
		if (indirdep->ir_state & UFS1FMT)
			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
		else
			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
	}
	/*
	 * When truncating the previous pointer will be freed via
	 * savedbp.  Eliminate the freefrag which would dup free.
	 */
	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
		newblk->nb_freefrag = NULL;
		if (freefrag->ff_jdep)
			cancel_jfreefrag(
			    WK_JFREEFRAG(freefrag->ff_jdep));
		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
		WORKITEM_FREE(freefrag, D_FREEFRAG);
	}
	/*
	 * If the journal hasn't been written the jnewblk must be passed
	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
	 * this by leaving the journal dependency on the newblk to be freed
	 * when a freework is created in handle_workitem_freeblocks().
	 */
	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
}

/*
 * Create the mkdir dependencies for . and .. in a new directory.  Link them
 * in to a newdirblk so any subsequent additions are tracked properly.  The
 * caller is responsible for adding the mkdir1 dependency to the journal
 * and updating id_mkdiradd.  This function returns with lk held.
 */
static struct mkdir *
setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
	struct diradd *dap;	/* diradd for the new directory's entry */
	ino_t newinum;		/* inode number of the new directory */
	ino_t dinum;		/* inode number of the parent directory */
	struct buf *newdirbp;	/* buffer containing "." and ".." */
	struct mkdir **mkdirp;	/* out: the MKDIR_PARENT dependency (or freed) */
{
	struct newblk *newblk;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct newdirblk *newdirblk = 0;
	struct mkdir *mkdir1, *mkdir2;
	struct worklist *wk;
	struct jaddref *jaddref;
	struct mount *mp;

	mp = dap->da_list.wk_mp;
	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
	    M_SOFTDEP_FLAGS);
	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
	LIST_INIT(&newdirblk->db_mkdir);
	/* mkdir1 tracks the "." entry, mkdir2 the ".." entry. */
	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
	mkdir1->md_state = ATTACHED | MKDIR_BODY;
	mkdir1->md_diradd = dap;
	mkdir1->md_jaddref = NULL;
	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
	mkdir2->md_diradd = dap;
	mkdir2->md_jaddref = NULL;
	if (MOUNTEDSUJ(mp) == 0) {
		mkdir1->md_state |= DEPCOMPLETE;
		mkdir2->md_state |= DEPCOMPLETE;
	}
	/*
	 * Dependency on "." and ".." being written to disk.
	 */
	mkdir1->md_buf = newdirbp;
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
	/*
	 * We must link the pagedep, allocdirect, and newdirblk for
	 * the initial file page so the pointer to the new directory
	 * is not written until the directory contents are live and
	 * any subsequent additions are not marked live until the
	 * block is reachable via the inode.
	 */
	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
		panic("setup_newdir: lost pagedep");
	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
		if (wk->wk_type == D_ALLOCDIRECT)
			break;
	if (wk == NULL)
		panic("setup_newdir: lost allocdirect");
	if (pagedep->pd_state & NEWBLOCK)
		panic("setup_newdir: NEWBLOCK already set");
	newblk = WK_NEWBLK(wk);
	pagedep->pd_state |= NEWBLOCK;
	pagedep->pd_newdirblk = newdirblk;
	newdirblk->db_pagedep = pagedep;
	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
	/*
	 * Look up the inodedep for the parent directory so that we
	 * can link mkdir2 into the pending dotdot jaddref or
	 * the inode write if there is none.  If the inode is
	 * ALLCOMPLETE and no jaddref is present all dependencies have
	 * been satisfied and mkdir2 can be freed.
	 */
	inodedep_lookup(mp, dinum, 0, &inodedep);
	if (MOUNTEDSUJ(mp)) {
		if (inodedep == NULL)
			panic("setup_newdir: Lost parent.");
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
		    (jaddref->ja_state & MKDIR_PARENT),
		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
		mkdir2->md_jaddref = jaddref;
		jaddref->ja_mkdir = mkdir2;
	} else if (inodedep == NULL ||
	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/* Parent already stable on disk: no ".." dependency needed. */
		dap->da_state &= ~MKDIR_PARENT;
		WORKITEM_FREE(mkdir2, D_MKDIR);
	} else {
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
	}
	*mkdirp = mkdir2;

	return (mkdir1);
}

/*
 * Directory entry addition dependencies.
 *
 * When adding a new directory entry, the inode (with its incremented link
 * count) must be written to disk before the directory entry's pointer to it.
 * Also, if the inode is newly allocated, the corresponding freemap must be
 * updated (on disk) before the directory entry's pointer. These requirements
 * are met via undo/redo on the directory entry's pointer, which consists
 * simply of the inode number.
 *
 * As directory entries are added and deleted, the free space within a
 * directory block can become fragmented.  The ufs filesystem will compact
 * a fragmented directory block to make space for a new entry. When this
 * occurs, the offsets of previously added entries change. Any "diradd"
 * dependency structures corresponding to these entries must be updated with
 * the new offsets.
 */

/*
 * This routine is called after the in-memory inode's link
 * count has been incremented, but before the directory entry's
 * pointer to the inode has been set.
 *
 * Returns 1 when the caller must sync (entry extended the directory
 * into an indirect block), 0 otherwise.
 */
int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for directory */
	off_t diroffset;	/* offset of new entry in directory */
	ino_t newinum;		/* inode referenced by new directory entry */
	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
	int isnewblk;		/* entry is in a newly allocated block */
{
	int offset;		/* offset of new entry within directory block */
	ufs_lbn_t lbn;		/* block in directory containing new entry */
	struct fs *fs;
	struct diradd *dap;
	struct newblk *newblk;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct newdirblk *newdirblk = 0;
	struct mkdir *mkdir1, *mkdir2;
	struct jaddref *jaddref;
	struct mount *mp;
	int isindir;

	/*
	 * Whiteouts have no dependencies.
	 */
	if (newinum == WINO) {
		if (newdirbp != NULL)
			bdwrite(newdirbp);
		return (0);
	}
	jaddref = NULL;
	mkdir1 = mkdir2 = NULL;
	mp = UFSTOVFS(dp->i_ump);
	fs = dp->i_fs;
	lbn = lblkno(fs, diroffset);
	offset = blkoff(fs, diroffset);
	dap = malloc(sizeof(struct diradd), M_DIRADD,
		M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&dap->da_list, D_DIRADD, mp);
	dap->da_offset = offset;
	dap->da_newinum = newinum;
	dap->da_state = ATTACHED;
	LIST_INIT(&dap->da_jwork);
	isindir = bp->b_lblkno >= NDADDR;
	/*
	 * A newdirblk is only needed when the entry starts a fresh
	 * block (or a fresh fragment for direct blocks).
	 */
	if (isnewblk &&
	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
		newdirblk = malloc(sizeof(struct newdirblk),
		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
		LIST_INIT(&newdirblk->db_mkdir);
	}
	/*
	 * If we're creating a new directory setup the dependencies and set
	 * the dap state to wait for them.  Otherwise it's COMPLETE and
	 * we can move on.
	 */
	if (newdirbp == NULL) {
		dap->da_state |= DEPCOMPLETE;
		ACQUIRE_LOCK(&lk);
	} else {
		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
		/* setup_newdir() returns with lk held. */
		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
		    &mkdir2);
	}
	/*
	 * Link into parent directory pagedep to await its being written.
	 */
	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
#ifdef DEBUG
	if (diradd_lookup(pagedep, offset) != NULL)
		panic("softdep_setup_directory_add: %p already at off %d\n",
		    diradd_lookup(pagedep, offset), offset);
#endif
	dap->da_pagedep = pagedep;
	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	    da_pdlist);
	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
	/*
	 * If we're journaling, link the diradd into the jaddref so it
	 * may be completed after the journal entry is written.  Otherwise,
	 * link the diradd into its inodedep.  If the inode is not yet
	 * written place it on the bufwait list, otherwise do the post-inode
	 * write processing to put it on the id_pendinghd list.
	 */
	if (MOUNTEDSUJ(mp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
		jaddref->ja_diroff = diroffset;
		jaddref->ja_diradd = dap;
		add_to_journal(&jaddref->ja_list);
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
		diradd_inode_written(dap, inodedep);
	else
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	/*
	 * Add the journal entries for . and .. links now that the primary
	 * link is written.
	 */
	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
		    inoreflst, if_deps);
		KASSERT(jaddref != NULL &&
		    jaddref->ja_ino == jaddref->ja_parent &&
		    (jaddref->ja_state & MKDIR_BODY),
		    ("softdep_setup_directory_add: bad dot jaddref %p",
		    jaddref));
		mkdir1->md_jaddref = jaddref;
		jaddref->ja_mkdir = mkdir1;
		/*
		 * It is important that the dotdot journal entry
		 * is added prior to the dot entry since dot writes
		 * both the dot and dotdot links.  These both must
		 * be added after the primary link for the journal
		 * to remain consistent.
		 */
		add_to_journal(&mkdir2->md_jaddref->ja_list);
		add_to_journal(&jaddref->ja_list);
	}
	/*
	 * If we are adding a new directory remember this diradd so that if
	 * we rename it we can keep the dot and dotdot dependencies.  If
	 * we are adding a new name for an inode that has a mkdiradd we
	 * must be in rename and we have to move the dot and dotdot
	 * dependencies to this new name.  The old name is being orphaned
	 * soon.
	 */
	if (mkdir1 != NULL) {
		if (inodedep->id_mkdiradd != NULL)
			panic("softdep_setup_directory_add: Existing mkdir");
		inodedep->id_mkdiradd = dap;
	} else if (inodedep->id_mkdiradd)
		merge_diradd(inodedep, dap);
	if (newdirblk) {
		/*
		 * There is nothing to do if we are already tracking
		 * this block.
		 */
		if ((pagedep->pd_state & NEWBLOCK) != 0) {
			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
			FREE_LOCK(&lk);
			return (0);
		}
		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
		    == 0)
			panic("softdep_setup_directory_add: lost entry");
		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
		pagedep->pd_state |= NEWBLOCK;
		pagedep->pd_newdirblk = newdirblk;
		newdirblk->db_pagedep = pagedep;
		FREE_LOCK(&lk);
		/*
		 * If we extended into an indirect signal direnter to sync.
		 */
		if (isindir)
			return (1);
		return (0);
	}
	FREE_LOCK(&lk);
	return (0);
}

/*
 * This procedure is called to change the offset of a directory
 * entry when compacting a directory block which must be owned
 * exclusively by the caller.  Note that the actual entry movement
 * must be done in this procedure to ensure that no I/O completions
 * occur while the move is in progress.
 */
void
softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
	struct buf *bp;	/* Buffer holding directory block.
*/ 8126 struct inode *dp; /* inode for directory */ 8127 caddr_t base; /* address of dp->i_offset */ 8128 caddr_t oldloc; /* address of old directory location */ 8129 caddr_t newloc; /* address of new directory location */ 8130 int entrysize; /* size of directory entry */ 8131{ 8132 int offset, oldoffset, newoffset; 8133 struct pagedep *pagedep; 8134 struct jmvref *jmvref; 8135 struct diradd *dap; 8136 struct direct *de; 8137 struct mount *mp; 8138 ufs_lbn_t lbn; 8139 int flags; 8140 8141 mp = UFSTOVFS(dp->i_ump); 8142 de = (struct direct *)oldloc; 8143 jmvref = NULL; 8144 flags = 0; 8145 /* 8146 * Moves are always journaled as it would be too complex to 8147 * determine if any affected adds or removes are present in the 8148 * journal. 8149 */ 8150 if (MOUNTEDSUJ(mp)) { 8151 flags = DEPALLOC; 8152 jmvref = newjmvref(dp, de->d_ino, 8153 dp->i_offset + (oldloc - base), 8154 dp->i_offset + (newloc - base)); 8155 } 8156 lbn = lblkno(dp->i_fs, dp->i_offset); 8157 offset = blkoff(dp->i_fs, dp->i_offset); 8158 oldoffset = offset + (oldloc - base); 8159 newoffset = offset + (newloc - base); 8160 ACQUIRE_LOCK(&lk); 8161 if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) 8162 goto done; 8163 dap = diradd_lookup(pagedep, oldoffset); 8164 if (dap) { 8165 dap->da_offset = newoffset; 8166 newoffset = DIRADDHASH(newoffset); 8167 oldoffset = DIRADDHASH(oldoffset); 8168 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && 8169 newoffset != oldoffset) { 8170 LIST_REMOVE(dap, da_pdlist); 8171 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], 8172 dap, da_pdlist); 8173 } 8174 } 8175done: 8176 if (jmvref) { 8177 jmvref->jm_pagedep = pagedep; 8178 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); 8179 add_to_journal(&jmvref->jm_list); 8180 } 8181 bcopy(oldloc, newloc, entrysize); 8182 FREE_LOCK(&lk); 8183} 8184 8185/* 8186 * Move the mkdir dependencies and journal work from one diradd to another 8187 * when renaming a directory. 
 * The new name must depend on the mkdir deps
 * completing as the old name did.  Directories can only have one valid link
 * at a time so one must be canonical.
 */
static void
merge_diradd(inodedep, newdap)
	struct inodedep *inodedep;
	struct diradd *newdap;
{
	struct diradd *olddap;
	struct mkdir *mkdir, *nextmd;
	short state;

	/*
	 * NOTE(review): callers invoke this with the softdep interlock (lk)
	 * held -- confirm; there is no mtx_assert here.
	 */
	olddap = inodedep->id_mkdiradd;
	inodedep->id_mkdiradd = newdap;
	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		newdap->da_state &= ~DEPCOMPLETE;
		/*
		 * Transfer every mkdir still pointing at the old diradd to
		 * the new one, moving the MKDIR_* state bits along with it.
		 * Stop early once the old diradd has shed both bits.
		 */
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != olddap)
				continue;
			mkdir->md_diradd = newdap;
			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
			newdap->da_state |= state;
			olddap->da_state &= ~state;
			if ((olddap->da_state &
			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
				break;
		}
		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("merge_diradd: unfound ref");
	}
	/*
	 * Any mkdir related journal items are not safe to be freed until
	 * the new name is stable.
	 */
	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
	olddap->da_state |= DEPCOMPLETE;
	complete_diradd(olddap);
}

/*
 * Move the diradd to the pending list when all diradd dependencies are
 * complete.
 */
static void
complete_diradd(dap)
	struct diradd *dap;
{
	struct pagedep *pagedep;

	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/*
		 * A DIRCHG diradd rolls back to a previous remove, so its
		 * pagedep lives on that dirrem rather than on the diradd.
		 */
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	}
}

/*
 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
 * add entries and conditionally journal the remove.
 */
static void
cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
	struct diradd *dap;
	struct dirrem *dirrem;
	struct jremref *jremref;
	struct jremref *dotremref;
	struct jremref *dotdotremref;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct inoref *inoref;
	struct mkdir *mkdir;

	/*
	 * If no remove references were allocated we're on a non-journaled
	 * filesystem and can skip the cancel step.
	 */
	if (jremref == NULL) {
		free_diradd(dap, NULL);
		return;
	}
	/*
	 * Cancel the primary name and free it if it does not require
	 * journaling.
	 *
	 * NOTE(review): if this lookup fails, 'inodedep' is used below
	 * without being set here -- presumably inodedep_lookup() NULLs
	 * the pointer on failure and journal_jremref() tolerates NULL;
	 * verify against inodedep_lookup().
	 */
	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
	    0, &inodedep) != 0) {
		/* Abort the addref that references this diradd. */
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if (inoref->if_list.wk_type != D_JADDREF)
				continue;
			jaddref = (struct jaddref *)inoref;
			if (jaddref->ja_diradd != dap)
				continue;
			/*
			 * If the add never reached the journal the remove
			 * need not be journaled either.
			 */
			if (cancel_jaddref(jaddref, inodedep,
			    &dirrem->dm_jwork) == 0) {
				free_jremref(jremref);
				jremref = NULL;
			}
			break;
		}
	}
	/*
	 * Cancel subordinate names and free them if they do not require
	 * journaling.
	 */
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
			if (mkdir->md_diradd != dap)
				continue;
			if ((jaddref = mkdir->md_jaddref) == NULL)
				continue;
			mkdir->md_jaddref = NULL;
			if (mkdir->md_state & MKDIR_PARENT) {
				/* The .. link of the new directory. */
				if (cancel_jaddref(jaddref, NULL,
				    &dirrem->dm_jwork) == 0) {
					free_jremref(dotdotremref);
					dotdotremref = NULL;
				}
			} else {
				/* The . link of the new directory. */
				if (cancel_jaddref(jaddref, inodedep,
				    &dirrem->dm_jwork) == 0) {
					free_jremref(dotremref);
					dotremref = NULL;
				}
			}
		}
	}

	/* Journal whichever remove references survived cancellation. */
	if (jremref)
		journal_jremref(dirrem, jremref, inodedep);
	if (dotremref)
		journal_jremref(dirrem, dotremref, inodedep);
	if (dotdotremref)
		journal_jremref(dirrem, dotdotremref, NULL);
	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
	free_diradd(dap, &dirrem->dm_jwork);
}

/*
 * Free a diradd dependency structure.  This routine must be called
 * with splbio interrupts blocked.
 */
static void
free_diradd(dap, wkhd)
	struct diradd *dap;
	struct workhead *wkhd;	/* NOTE(review): not referenced in this body */
{
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mkdir *mkdir, *nextmd;

	mtx_assert(&lk, MA_OWNED);
	LIST_REMOVE(dap, da_pdlist);
	if (dap->da_state & ONWORKLIST)
		WORKLIST_REMOVE(&dap->da_list);
	if ((dap->da_state & DIRCHG) == 0) {
		pagedep = dap->da_pagedep;
	} else {
		/*
		 * A DIRCHG diradd was rolling back to a previous remove;
		 * that remove is now complete and can be queued (unless it
		 * still has journal remove references outstanding).
		 */
		dirrem = dap->da_previous;
		pagedep = dirrem->dm_pagedep;
		dirrem->dm_dirinum = pagedep->pd_ino;
		dirrem->dm_state |= COMPLETE;
		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
			add_to_worklist(&dirrem->dm_list, 0);
	}
	/*
	 * NOTE(review): 'inodedep' is tested below even when this lookup
	 * fails -- presumably inodedep_lookup() sets the pointer to NULL
	 * on failure; verify.
	 */
	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
	    0, &inodedep) != 0)
		if (inodedep->id_mkdiradd == dap)
			inodedep->id_mkdiradd = NULL;
	/*
	 * Tear down any mkdir structures still referencing this diradd,
	 * clearing the corresponding MKDIR_* bits as they are found.
	 */
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != dap)
				continue;
			dap->da_state &=
			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
			LIST_REMOVE(mkdir, md_mkdirs);
			if (mkdir->md_state & ONWORKLIST)
				WORKLIST_REMOVE(&mkdir->md_list);
			if (mkdir->md_jaddref != NULL)
				panic("free_diradd: Unexpected jaddref");
			WORKITEM_FREE(mkdir, D_MKDIR);
			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
				break;
		}
		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("free_diradd: unfound ref");
	}
	if (inodedep)
		free_inodedep(inodedep);
	/*
	 * Free any journal segments waiting for the directory write.
	 */
	handle_jwork(&dap->da_jwork);
	WORKITEM_FREE(dap, D_DIRADD);
}

/*
 * Directory entry removal dependencies.
 *
 * When removing a directory entry, the entry's inode pointer must be
 * zero'ed on disk before the corresponding inode's link count is decremented
 * (possibly freeing the inode for re-use).  This dependency is handled by
 * updating the directory entry but delaying the inode count reduction until
 * after the directory block has been written to disk.  After this point, the
 * inode count can be decremented whenever it is convenient.
 */

/*
 * This routine should be called immediately after removing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will do this task when it is safe.
 */
void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
{
	struct dirrem *dirrem, *prevdirrem;
	struct inodedep *inodedep;
	int direct;

	/*
	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
	 * newdirrem() to setup the full directory remove which requires
	 * isrmdir > 1.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	/*
	 * Add the dirrem to the inodedep's pending remove list for quick
	 * discovery later.
	 */
	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
	    &inodedep) == 0)
		panic("softdep_setup_remove: Lost inodedep.");
	dirrem->dm_state |= ONDEPLIST;
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to a zeroed entry until
	 * the new inode is committed to disk.  If the COMPLETE flag is
	 * set then we have deleted an entry that never made it to
	 * disk.  If the entry we deleted resulted from a name change,
	 * then the old name still resides on disk.  We cannot delete
	 * its inode (returned to us in prevdirrem) until the zeroed
	 * directory entry gets to disk.  The new inode has never been
	 * referenced on the disk, so can be deleted immediately.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
		    dm_next);
		FREE_LOCK(&lk);
	} else {
		if (prevdirrem != NULL)
			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
			    prevdirrem, dm_next);
		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
		/*
		 * Decide before dropping the lock whether the remove can be
		 * processed immediately (no journal remove refs pending).
		 */
		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
		FREE_LOCK(&lk);
		if (direct)
			handle_workitem_remove(dirrem, 0);
	}
}

/*
 * Check for an entry matching 'offset' on both the pd_diraddhd list and the
 * pd_pendinghd list of a pagedep.
 */
static struct diradd *
diradd_lookup(pagedep, offset)
	struct pagedep *pagedep;
	int offset;
{
	struct diradd *dap;

	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
		if (dap->da_offset == offset)
			return (dap);
	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
		if (dap->da_offset == offset)
			return (dap);
	return (NULL);
}

/*
 * Search for a .. diradd dependency in a directory that is being removed.
 * If the directory was renamed to a new parent we have a diradd rather
 * than a mkdir for the .. entry.  We need to cancel it now before
 * it is found in truncate().
 */
static struct jremref *
cancel_diradd_dotdot(ip, dirrem, jremref)
	struct inode *ip;
	struct dirrem *dirrem;
	struct jremref *jremref;
{
	struct pagedep *pagedep;
	struct diradd *dap;
	struct worklist *wk;

	/*
	 * Returns the caller's jremref untouched when no .. diradd exists;
	 * returns NULL after the jremref has been consumed by the cancel.
	 */
	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
	    &pagedep) == 0)
		return (jremref);
	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
	if (dap == NULL)
		return (jremref);
	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
	/*
	 * Mark any journal work as belonging to the parent so it is freed
	 * with the .. reference.
	 */
	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
		wk->wk_state |= MKDIR_PARENT;
	return (NULL);
}

/*
 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
 * replace it with a dirrem/diradd pair as a result of re-parenting a
 * directory.  This ensures that we don't simultaneously have a mkdir and
 * a diradd for the same .. entry.
 */
static struct jremref *
cancel_mkdir_dotdot(ip, dirrem, jremref)
	struct inode *ip;
	struct dirrem *dirrem;
	struct jremref *jremref;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct mkdir *mkdir;
	struct diradd *dap;

	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
	    &inodedep) == 0)
		panic("cancel_mkdir_dotdot: Lost inodedep");
	dap = inodedep->id_mkdiradd;
	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
		return (jremref);
	/* Find the mkdir that carries the MKDIR_PARENT (..) dependency. */
	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
			break;
	if (mkdir == NULL)
		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
	if ((jaddref = mkdir->md_jaddref) != NULL) {
		mkdir->md_jaddref = NULL;
		jaddref->ja_state &= ~MKDIR_PARENT;
		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
		    &inodedep) == 0)
			panic("cancel_mkdir_dotdot: Lost parent inodedep");
		/*
		 * If the add had already been journaled, the remove must be
		 * journaled too, consuming the caller's jremref.
		 */
		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
			journal_jremref(dirrem, jremref, inodedep);
			jremref = NULL;
		}
	}
	if (mkdir->md_state & ONWORKLIST)
		WORKLIST_REMOVE(&mkdir->md_list);
	mkdir->md_state |= ALLCOMPLETE;
	complete_mkdir(mkdir);
	return (jremref);
}

/*
 * Attach a jremref to a dirrem and to its inode's reference list, then
 * queue it in the journal.  A NULL inodedep is looked up from the jremref.
 */
static void
journal_jremref(dirrem, jremref, inodedep)
	struct dirrem *dirrem;
	struct jremref *jremref;
	struct inodedep *inodedep;
{

	if (inodedep == NULL)
		if (inodedep_lookup(jremref->jr_list.wk_mp,
		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
			panic("journal_jremref: Lost inodedep");
	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
	add_to_journal(&jremref->jr_list);
}

/*
 * Journal the primary, dot, and dotdot remove references for a dirrem.
 * The dot reference shares the primary name's inodedep; dotdot's is
 * looked up by journal_jremref() itself.
 */
static void
dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
	struct dirrem *dirrem;
	struct jremref *jremref;
	struct jremref *dotremref;
	struct jremref *dotdotremref;
{
	struct inodedep *inodedep;

	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
	    &inodedep) == 0)
		panic("dirrem_journal: Lost inodedep");
	journal_jremref(dirrem, jremref, inodedep);
	if (dotremref)
		journal_jremref(dirrem, dotremref, inodedep);
	if (dotdotremref)
		journal_jremref(dirrem, dotdotremref, NULL);
}

/*
 * Allocate a new dirrem if appropriate and return it along with
 * its associated pagedep.  Called without a lock, returns with lock.
 */
static struct dirrem *
newdirrem(bp, dp, ip, isrmdir, prevdirremp)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
	struct dirrem **prevdirremp; /* previously referenced inode, if any */
{
	int offset;
	ufs_lbn_t lbn;
	struct diradd *dap;
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct jremref *jremref;
	struct jremref *dotremref;
	struct jremref *dotdotremref;
	struct vnode *dvp;

	/*
	 * Whiteouts have no deletion dependencies.
	 */
	if (ip == NULL)
		panic("newdirrem: whiteout");
	dvp = ITOV(dp);
	/*
	 * If we are over our limit, try to improve the situation.
	 * Limiting the number of dirrem structures will also limit
	 * the number of freefile and freeblks structures.
	 */
	ACQUIRE_LOCK(&lk);
	if (!(ip->i_flags & SF_SNAPSHOT) &&
	    dep_current[D_DIRREM] > max_softdeps / 2)
		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
	FREE_LOCK(&lk);
	dirrem = malloc(sizeof(struct dirrem),
		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
	LIST_INIT(&dirrem->dm_jremrefhd);
	LIST_INIT(&dirrem->dm_jwork);
	dirrem->dm_state = isrmdir ? RMDIR : 0;
	dirrem->dm_oldinum = ip->i_number;
	*prevdirremp = NULL;
	/*
	 * Allocate remove reference structures to track journal write
	 * dependencies.  We will always have one for the link and
	 * when doing directories we will always have one more for dot.
	 * When renaming a directory we skip the dotdot link change so
	 * this is not needed.
	 */
	jremref = dotremref = dotdotremref = NULL;
	if (DOINGSUJ(dvp)) {
		if (isrmdir) {
			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
			    ip->i_effnlink + 2);
			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
			    ip->i_effnlink + 1);
			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
			    dp->i_effnlink + 1);
			dotdotremref->jr_state |= MKDIR_PARENT;
		} else
			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
			    ip->i_effnlink + 1);
	}
	ACQUIRE_LOCK(&lk);
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
	    &pagedep);
	dirrem->dm_pagedep = pagedep;
	dirrem->dm_offset = offset;
	/*
	 * If we're renaming a .. link to a new directory, cancel any
	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
	 * the jremref is preserved for any potential diradd in this
	 * location.  This can not coincide with a rmdir.
	 */
	if (dp->i_offset == DOTDOT_OFFSET) {
		if (isrmdir)
			panic("newdirrem: .. directory change during remove?");
		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
	}
	/*
	 * If we're removing a directory search for the .. dependency now and
	 * cancel it.  Any pending journal work will be added to the dirrem
	 * to be completed when the workitem remove completes.
	 */
	if (isrmdir)
		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
	/*
	 * Check for a diradd dependency for the same directory entry.
	 * If present, then both dependencies become obsolete and can
	 * be de-allocated.
	 */
	dap = diradd_lookup(pagedep, offset);
	if (dap == NULL) {
		/*
		 * Link the jremref structures into the dirrem so they are
		 * written prior to the pagedep.
		 */
		if (jremref)
			dirrem_journal(dirrem, jremref, dotremref,
			    dotdotremref);
		return (dirrem);
	}
	/*
	 * Must be ATTACHED at this point.
	 */
	if ((dap->da_state & ATTACHED) == 0)
		panic("newdirrem: not ATTACHED");
	if (dap->da_newinum != ip->i_number)
		panic("newdirrem: inum %d should be %d",
		    ip->i_number, dap->da_newinum);
	/*
	 * If we are deleting a changed name that never made it to disk,
	 * then return the dirrem describing the previous inode (which
	 * represents the inode currently referenced from this entry on disk).
	 */
	if ((dap->da_state & DIRCHG) != 0) {
		*prevdirremp = dap->da_previous;
		dap->da_state &= ~DIRCHG;
		dap->da_pagedep = pagedep;
	}
	/*
	 * We are deleting an entry that never made it to disk.
	 * Mark it COMPLETE so we can delete its inode immediately.
	 */
	dirrem->dm_state |= COMPLETE;
	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
#ifdef SUJ_DEBUG
	if (isrmdir == 0) {
		struct worklist *wk;

		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
	}
#endif

	return (dirrem);
}

/*
 * Directory entry change dependencies.
 *
 * Changing an existing directory entry requires that an add operation
 * be completed first followed by a deletion.  The semantics for the addition
 * are identical to the description of adding a new entry above except
 * that the rollback is to the old inode number rather than zero.  Once
 * the addition dependency is completed, the removal is done as described
 * in the removal routine above.
 */

/*
 * This routine should be called immediately after changing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will perform this task when it is safe.
 */
void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	ino_t newinum;		/* new inode number for changed entry */
	int isrmdir;		/* indicates if doing RMDIR */
{
	int offset;
	struct diradd *dap = NULL;
	struct dirrem *dirrem, *prevdirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct mount *mp;

	offset = blkoff(dp->i_fs, dp->i_offset);
	mp = UFSTOVFS(dp->i_ump);

	/*
	 * Whiteouts do not need diradd dependencies.
	 */
	if (newinum != WINO) {
		dap = malloc(sizeof(struct diradd),
		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
		workitem_alloc(&dap->da_list, D_DIRADD, mp);
		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
		dap->da_offset = offset;
		dap->da_newinum = newinum;
		LIST_INIT(&dap->da_jwork);
	}

	/*
	 * Allocate a new dirrem and ACQUIRE_LOCK.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	pagedep = dirrem->dm_pagedep;
	/*
	 * The possible values for isrmdir:
	 *	0 - non-directory file rename
	 *	1 - directory rename within same directory
	 *	inum - directory rename to new directory of given inode number
	 * When renaming to a new directory, we are both deleting and
	 * creating a new directory entry, so the link count on the new
	 * directory should not change.  Thus we do not need the followup
	 * dirrem which is usually done in handle_workitem_remove.  We set
	 * the DIRCHG flag to tell handle_workitem_remove to skip the
	 * followup dirrem.
	 */
	if (isrmdir > 1)
		dirrem->dm_state |= DIRCHG;

	/*
	 * Whiteouts have no additional dependencies,
	 * so just put the dirrem on the correct list.
	 */
	if (newinum == WINO) {
		if ((dirrem->dm_state & COMPLETE) == 0) {
			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
			    dm_next);
		} else {
			dirrem->dm_dirinum = pagedep->pd_ino;
			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
				add_to_worklist(&dirrem->dm_list, 0);
		}
		FREE_LOCK(&lk);
		return;
	}
	/*
	 * Add the dirrem to the inodedep's pending remove list for quick
	 * discovery later.  A valid nlinkdelta ensures that this lookup
	 * will not fail.
	 */
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
		panic("softdep_setup_directory_change: Lost inodedep.");
	dirrem->dm_state |= ONDEPLIST;
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to the previous inode until
	 * the new inode is committed to disk.  If the COMPLETE flag is
	 * set, then we have deleted an entry that never made it to disk.
	 * If the entry we deleted resulted from a name change, then the old
	 * inode reference still resides on disk.  Any rollback that we do
	 * needs to be to that old inode (returned to us in prevdirrem).  If
	 * the entry we deleted resulted from a create, then there is
	 * no entry on the disk, so we want to roll back to zero rather
	 * than the uncommitted inode.  In either of the COMPLETE cases we
	 * want to immediately free the unwritten and unreferenced inode.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		dap->da_previous = dirrem;
	} else {
		if (prevdirrem != NULL) {
			dap->da_previous = prevdirrem;
		} else {
			dap->da_state &= ~DIRCHG;
			dap->da_pagedep = pagedep;
		}
		dirrem->dm_dirinum = pagedep->pd_ino;
		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
			add_to_worklist(&dirrem->dm_list, 0);
	}
	/*
	 * Lookup the jaddref for this journal entry.  We must finish
	 * initializing it and make the diradd write dependent on it.
	 * If we're not journaling, put it on the id_bufwait list if the
	 * inode is not yet written.  If it is written, do the post-inode
	 * write processing to put it on the id_pendinghd list.
	 */
	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
	if (MOUNTEDSUJ(mp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_directory_change: bad jaddref %p",
		    jaddref));
		jaddref->ja_diroff = dp->i_offset;
		jaddref->ja_diradd = dap;
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		add_to_journal(&jaddref->ja_list);
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		dap->da_state |= COMPLETE;
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	} else {
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	}
	/*
	 * If we're making a new name for a directory that has not been
	 * committed we need to move the dot and dotdot references to
	 * this new name.
	 */
	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
		merge_diradd(inodedep, dap);
	FREE_LOCK(&lk);
}

/*
 * Called whenever the link count on an inode is changed.
 * It creates an inode dependency so that the new reference(s)
 * to the inode cannot be committed to disk until the updated
 * inode has been written.
 */
void
softdep_change_linkcnt(ip)
	struct inode *ip;	/* the inode with the increased link count */
{
	struct inodedep *inodedep;

	ACQUIRE_LOCK(&lk);
	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
	/* The effective link count may never exceed the on-disk count. */
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_change_linkcnt: bad delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
}

/*
 * Attach a sbdep dependency to the superblock buf so that we can keep
 * track of the head of the linked list of referenced but unlinked inodes.
 */
void
softdep_setup_sbupdate(ump, fs, bp)
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
{
	struct sbdep *sbdep;
	struct worklist *wk;

	/* Only journaled (SUJ) mounts track the unlinked list head. */
	if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0)
		return;
	/* Nothing to do if the buffer already carries a sbdep. */
	LIST_FOREACH(wk, &bp->b_dep, wk_list)
		if (wk->wk_type == D_SBDEP)
			break;
	if (wk != NULL)
		return;
	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
	sbdep->sb_fs = fs;
	sbdep->sb_ump = ump;
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
	FREE_LOCK(&lk);
}

/*
 * Return the first unlinked inodedep which is ready to be the head of the
 * list.  The inodedep and all those after it must have valid next pointers.
 */
static struct inodedep *
first_unlinked_inodedep(ump)
	struct ufsmount *ump;
{
	struct inodedep *inodedep;
	struct inodedep *idp;

	/*
	 * Walk backwards from the tail of the unlinked list to the first
	 * entry of the contiguous run whose members all have UNLINKNEXT
	 * set.  If the tail itself lacks UNLINKNEXT nothing is ready.
	 */
	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
	    inodedep; inodedep = idp) {
		if ((inodedep->id_state & UNLINKNEXT) == 0)
			return (NULL);
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
			break;
		if ((inodedep->id_state & UNLINKPREV) == 0)
			panic("first_unlinked_inodedep: prev != next");
	}
	if (inodedep == NULL)
		return (NULL);

	return (inodedep);
}

/*
 * Set the sujfree unlinked head pointer prior to writing a superblock.
 */
static void
initiate_write_sbdep(sbdep)
	struct sbdep *sbdep;
{
	struct inodedep *inodedep;
	struct fs *bpfs;	/* presumably the fs image in the buffer being written -- verify */
	struct fs *fs;		/* in-core superblock */

	bpfs = sbdep->sb_fs;
	fs = sbdep->sb_ump->um_fs;
	/*
	 * Point fs_sujfree at the first ready unlinked inode, or clear it
	 * when the list is empty, then propagate into the buffer copy.
	 */
	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
	if (inodedep) {
		fs->fs_sujfree = inodedep->id_ino;
		inodedep->id_state |= UNLINKPREV;
	} else
		fs->fs_sujfree = 0;
	bpfs->fs_sujfree = fs->fs_sujfree;
}

/*
 * After a superblock is written determine whether it must be written again
 * due to a changing unlinked list head.
 */
static int
handle_written_sbdep(sbdep, bp)
	struct sbdep *sbdep;
	struct buf *bp;
{
	struct inodedep *inodedep;
	struct mount *mp;
	struct fs *fs;

	fs = sbdep->sb_fs;
	mp = UFSTOVFS(sbdep->sb_ump);
	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
	/*
	 * If the unlinked-list head changed while the superblock was in
	 * flight, the on-disk fs_sujfree is stale: redirty the buffer and
	 * keep the sbdep so the superblock is written again.
	 */
	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
	    (inodedep == NULL && fs->fs_sujfree != 0)) {
		bdirty(bp);
		return (1);
	}
	WORKITEM_FREE(sbdep, D_SBDEP);
	if (fs->fs_sujfree == 0)
		return (0);
	if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
		panic("handle_written_sbdep: lost inodedep");
	/*
	 * Now that we have a record of this inode in stable store allow it
	 * to be written to free up pending work.  Inodes may see a lot of
	 * write activity after they are unlinked which we must not hold up.
	 */
	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
			    inodedep, inodedep->id_state);
		/* Stop at the first entry already marked on-list. */
		if (inodedep->id_state & UNLINKONLIST)
			break;
		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
	}

	return (0);
}

/*
 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
 */
static void
unlinked_inodedep(mp, inodedep)
	struct mount *mp;
	struct inodedep *inodedep;
{
	struct ufsmount *ump;

	/* Only journaled (SUJ) mounts maintain the unlinked list. */
	if (MOUNTEDSUJ(mp) == 0)
		return;
	ump = VFSTOUFS(mp);
	/* The superblock must be rewritten to record the new list head. */
	ump->um_fs->fs_fmod = 1;
	inodedep->id_state |= UNLINKED;
	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
}

/*
 * Remove an inodedep from the unlinked inodedep list.  This may require
 * disk writes if the inode has made it that far.
 */
/*
 * Called with lk held; lk is dropped and reacquired around all buffer
 * I/O below, so the unlinked list must be revalidated after each wait.
 */
static void
clear_unlinked_inodedep(inodedep)
	struct inodedep *inodedep;
{
	struct ufsmount *ump;
	struct inodedep *idp;	/* predecessor on the unlinked list */
	struct inodedep *idn;	/* successor on the unlinked list */
	struct fs *fs;
	struct buf *bp;
	ino_t ino;
	ino_t nino;
	ino_t pino;
	int error;

	ump = VFSTOUFS(inodedep->id_list.wk_mp);
	fs = ump->um_fs;
	ino = inodedep->id_ino;
	error = 0;
	for (;;) {
		/*
		 * If nothing has yet been written simply remove us from
		 * the in memory list and return.  This is the most common
		 * case where handle_workitem_remove() loses the final
		 * reference.
		 */
		if ((inodedep->id_state & UNLINKLINKS) == 0)
			break;
		/*
		 * If we have a NEXT pointer and no PREV pointer we can simply
		 * clear NEXT's PREV and remove ourselves from the list.  Be
		 * careful not to clear PREV if the superblock points at
		 * next as well.
		 */
		idn = TAILQ_NEXT(inodedep, id_unlinked);
		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
			if (idn && fs->fs_sujfree != idn->id_ino)
				idn->id_state &= ~UNLINKPREV;
			break;
		}
		/*
		 * Here we have an inodedep which is actually linked into
		 * the list.  We must remove it by forcing a write to the
		 * link before us, whether it be the superblock or an inode.
		 * Unfortunately the list may change while we're waiting
		 * on the buf lock for either resource so we must loop until
		 * we lock the right one.  If both the superblock and an
		 * inode point to this inode we must clear the inode first
		 * followed by the superblock.
		 */
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		pino = 0;
		if (idp && (idp->id_state & UNLINKNEXT))
			pino = idp->id_ino;
		/* Drop lk for the blocking buffer acquisition. */
		FREE_LOCK(&lk);
		if (pino == 0)
			/* Predecessor is the superblock itself. */
			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
			    (int)fs->fs_sbsize, 0, 0, 0);
		else
			error = bread(ump->um_devvp,
			    fsbtodb(fs, ino_to_fsba(fs, pino)),
			    (int)fs->fs_bsize, NOCRED, &bp);
		ACQUIRE_LOCK(&lk);
		if (error)
			break;
		/* If the list has changed restart the loop. */
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		nino = 0;
		if (idp && (idp->id_state & UNLINKNEXT))
			nino = idp->id_ino;
		if (nino != pino ||
		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
			FREE_LOCK(&lk);
			brelse(bp);
			ACQUIRE_LOCK(&lk);
			continue;
		}
		/*
		 * Remove us from the in memory list.  After this we cannot
		 * access the inodedep.
		 */
		idn = TAILQ_NEXT(inodedep, id_unlinked);
		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
		/*
		 * Determine the next inode number.
		 */
		nino = 0;
		if (idn) {
			/*
			 * If next isn't on the list we can just clear prev's
			 * state and schedule it to be fixed later.  No need
			 * to synchronously write if we're not in the real
			 * list.
			 */
			if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
				idp->id_state &= ~UNLINKNEXT;
				if ((idp->id_state & ONWORKLIST) == 0)
					WORKLIST_INSERT(&bp->b_dep,
					    &idp->id_list);
				FREE_LOCK(&lk);
				bawrite(bp);
				ACQUIRE_LOCK(&lk);
				return;
			}
			nino = idn->id_ino;
		}
		FREE_LOCK(&lk);
		/*
		 * The predecessor's next pointer is manually updated here
		 * so that the NEXT flag is never cleared for an element
		 * that is in the list.
		 */
		if (pino == 0) {
			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
			    bp);
		} else if (fs->fs_magic == FS_UFS1_MAGIC)
			((struct ufs1_dinode *)bp->b_data +
			    ino_to_fsbo(fs, pino))->di_freelink = nino;
		else
			((struct ufs2_dinode *)bp->b_data +
			    ino_to_fsbo(fs, pino))->di_freelink = nino;
		/*
		 * If the bwrite fails we have no recourse to recover.  The
		 * filesystem is corrupted already.
		 */
		bwrite(bp);
		ACQUIRE_LOCK(&lk);
		/*
		 * If the superblock pointer still needs to be cleared force
		 * a write here.
		 */
		if (fs->fs_sujfree == ino) {
			FREE_LOCK(&lk);
			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
			    (int)fs->fs_sbsize, 0, 0, 0);
			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
			    bp);
			bwrite(bp);
			ACQUIRE_LOCK(&lk);
		}
		if (fs->fs_sujfree != ino)
			return;
		panic("clear_unlinked_inodedep: Failed to clear free head");
	}
	if (inodedep->id_ino == fs->fs_sujfree)
		panic("clear_unlinked_inodedep: Freeing head of free list");
	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
	return;
}

/*
 * This workitem decrements the inode's link count.
 * If the link count reaches zero, the file is removed.
 */
/*
 * Returns 0 on success, EBUSY if the vnode could not be obtained with
 * the given locking flags.  May tail-recurse once when the inode was
 * never written and can therefore be freed immediately.
 */
static int
handle_workitem_remove(dirrem, flags)
	struct dirrem *dirrem;
	int flags;
{
	struct inodedep *inodedep;
	struct workhead dotdotwk;
	struct worklist *wk;
	struct ufsmount *ump;
	struct mount *mp;
	struct vnode *vp;
	struct inode *ip;
	ino_t oldinum;

	if (dirrem->dm_state & ONWORKLIST)
		panic("handle_workitem_remove: dirrem %p still on worklist",
		    dirrem);
	oldinum = dirrem->dm_oldinum;
	mp = dirrem->dm_list.wk_mp;
	ump = VFSTOUFS(mp);
	flags |= LK_EXCLUSIVE;
	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
		return (EBUSY);
	ip = VTOI(vp);
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
		panic("handle_workitem_remove: lost inodedep");
	if (dirrem->dm_state & ONDEPLIST)
		LIST_REMOVE(dirrem, dm_inonext);
	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
	    ("handle_workitem_remove: Journal entries not written."));

	/*
	 * Move all dependencies waiting on the remove to complete
	 * from the dirrem to the inode inowait list to be completed
	 * after the inode has been updated and written to disk.  Any
	 * marked MKDIR_PARENT are saved to be completed when the .. ref
	 * is removed.
	 */
	LIST_INIT(&dotdotwk);
	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
		WORKLIST_REMOVE(wk);
		if (wk->wk_state & MKDIR_PARENT) {
			wk->wk_state &= ~MKDIR_PARENT;
			WORKLIST_INSERT(&dotdotwk, wk);
			continue;
		}
		WORKLIST_INSERT(&inodedep->id_inowait, wk);
	}
	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
	/*
	 * Normal file deletion.
	 */
	if ((dirrem->dm_state & RMDIR) == 0) {
		ip->i_nlink--;
		DIP_SET(ip, i_nlink, ip->i_nlink);
		ip->i_flag |= IN_CHANGE;
		if (ip->i_nlink < ip->i_effnlink)
			panic("handle_workitem_remove: bad file delta");
		if (ip->i_nlink == 0)
			unlinked_inodedep(mp, inodedep);
		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
		    ("handle_workitem_remove: worklist not empty. %s",
		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
		WORKITEM_FREE(dirrem, D_DIRREM);
		FREE_LOCK(&lk);
		goto out;
	}
	/*
	 * Directory deletion.  Decrement reference count for both the
	 * just deleted parent directory entry and the reference for ".".
	 * Arrange to have the reference count on the parent decremented
	 * to account for the loss of "..".
	 */
	ip->i_nlink -= 2;
	DIP_SET(ip, i_nlink, ip->i_nlink);
	ip->i_flag |= IN_CHANGE;
	if (ip->i_nlink < ip->i_effnlink)
		panic("handle_workitem_remove: bad dir delta");
	if (ip->i_nlink == 0)
		unlinked_inodedep(mp, inodedep);
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	/*
	 * Rename a directory to a new parent.  Since, we are both deleting
	 * and creating a new directory entry, the link count on the new
	 * directory should not change.  Thus we skip the followup dirrem.
	 */
	if (dirrem->dm_state & DIRCHG) {
		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
		WORKITEM_FREE(dirrem, D_DIRREM);
		FREE_LOCK(&lk);
		goto out;
	}
	/* Reuse the dirrem for the ".." reference on the parent. */
	dirrem->dm_state = ONDEPLIST;
	dirrem->dm_oldinum = dirrem->dm_dirinum;
	/*
	 * Place the dirrem on the parent's diremhd list.
	 */
	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
		panic("handle_workitem_remove: lost dir inodedep");
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
	/*
	 * If the allocated inode has never been written to disk, then
	 * the on-disk inode is zero'ed and we can remove the file
	 * immediately.  When journaling if the inode has been marked
	 * unlinked and not DEPCOMPLETE we know it can never be written.
	 */
	inodedep_lookup(mp, oldinum, 0, &inodedep);
	if (inodedep == NULL ||
	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
	    check_inode_unwritten(inodedep)) {
		FREE_LOCK(&lk);
		vput(vp);
		return handle_workitem_remove(dirrem, flags);
	}
	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	FREE_LOCK(&lk);
	ip->i_flag |= IN_CHANGE;
out:
	ffs_update(vp, 0);
	vput(vp);
	return (0);
}

/*
 * Inode de-allocation dependencies.
 *
 * When an inode's link count is reduced to zero, it can be de-allocated.  We
 * found it convenient to postpone de-allocation until after the inode is
 * written to disk with its new link count (zero).  At this point, all of the
 * on-disk inode's block pointers are nullified and, with careful dependency
 * list ordering, all dependencies related to the inode will be satisfied and
 * the corresponding dependency structures de-allocated.  So, if/when the
 * inode is reused, there will be no mixing of old dependencies with new
 * ones.  This artificial dependency is set up by the block de-allocation
 * procedure above (softdep_setup_freeblocks) and completed by the
 * following procedure.
 */
static void
handle_workitem_freefile(freefile)
	struct freefile *freefile;
{
	struct workhead wkhd;
	struct fs *fs;
	struct inodedep *idp;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(freefile->fx_list.wk_mp);
	fs = ump->um_fs;
#ifdef DEBUG
	/* Sanity check: no inodedep may outlive the freefile workitem. */
	ACQUIRE_LOCK(&lk);
	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
	FREE_LOCK(&lk);
	if (error)
		panic("handle_workitem_freefile: inodedep %p survived", idp);
#endif
	UFS_LOCK(ump);
	fs->fs_pendinginodes -= 1;
	UFS_UNLOCK(ump);
	/* Hand any remaining journal work to ffs_freefile(). */
	LIST_INIT(&wkhd);
	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
		softdep_error("handle_workitem_freefile", error);
	ACQUIRE_LOCK(&lk);
	WORKITEM_FREE(freefile, D_FREEFILE);
	FREE_LOCK(&lk);
}


/*
 * Helper function which unlinks marker element from work list and returns
 * the next element on the list.
 */
static __inline struct worklist *
markernext(struct worklist *marker)
{
	struct worklist *next;

	next = LIST_NEXT(marker, wk_list);
	LIST_REMOVE(marker, wk_list);
	return next;
}

/*
 * Disk writes.
 *
 * The dependency structures constructed above are most actively used when file
 * system blocks are written to disk.  No constraints are placed on when a
 * block can be written, but unsatisfied update dependencies are made safe by
 * modifying (or replacing) the source memory for the duration of the disk
 * write.  When the disk write completes, the memory block is again brought
 * up-to-date.
 *
 * In-core inode structure reclamation.
 *
 * Because there are a finite number of "in-core" inode structures, they are
 * reused regularly.
 * By transferring all inode-related dependencies to the
 * in-memory inode block and indexing them separately (via "inodedep"s), we
 * can allow "in-core" inode structures to be reused at any time and avoid
 * any increase in contention.
 *
 * Called just before entering the device driver to initiate a new disk I/O.
 * The buffer must be locked, thus, no I/O completion operations can occur
 * while we are manipulating its associated dependencies.
 */
static void
softdep_disk_io_initiation(bp)
	struct buf *bp;		/* structure describing disk write to occur */
{
	struct worklist *wk;
	struct worklist marker;
	struct inodedep *inodedep;
	struct freeblks *freeblks;
	struct jblkdep *jblkdep;
	struct newblk *newblk;

	/*
	 * We only care about write operations.  There should never
	 * be dependencies for reads.
	 */
	if (bp->b_iocmd != BIO_WRITE)
		panic("softdep_disk_io_initiation: not write");

	if (bp->b_vflags & BV_BKGRDINPROG)
		panic("softdep_disk_io_initiation: Writing buffer with "
		    "background write in progress: %p", bp);

	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
	PHOLD(curproc);			/* Don't swap out kernel stack */

	ACQUIRE_LOCK(&lk);
	/*
	 * Do any necessary pre-I/O processing.  The marker is kept on
	 * the list so that iteration survives jwait() dropping lk.
	 */
	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
	     wk = markernext(&marker)) {
		LIST_INSERT_AFTER(wk, &marker, wk_list);
		switch (wk->wk_type) {

		case D_PAGEDEP:
			initiate_write_filepage(WK_PAGEDEP(wk), bp);
			continue;

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
				initiate_write_inodeblock_ufs1(inodedep, bp);
			else
				initiate_write_inodeblock_ufs2(inodedep, bp);
			continue;

		case D_INDIRDEP:
			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
			continue;

		case D_BMSAFEMAP:
			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
			continue;

		case D_JSEG:
			WK_JSEG(wk)->js_buf = NULL;
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
			/*
			 * We have to wait for the freeblks to be journaled
			 * before we can write an inodeblock with updated
			 * pointers.  Be careful to arrange the marker so
			 * we revisit the freeblks if it's not removed by
			 * the first jwait().
			 */
			if (jblkdep != NULL) {
				LIST_REMOVE(&marker, wk_list);
				LIST_INSERT_BEFORE(wk, &marker, wk_list);
				jwait(&jblkdep->jb_list, MNT_WAIT);
			}
			continue;
		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			/*
			 * We have to wait for the jnewblk to be journaled
			 * before we can write to a block if the contents
			 * may be confused with an earlier file's indirect
			 * at recovery time.  Handle the marker as described
			 * above.
			 */
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk != NULL &&
			    indirblk_lookup(newblk->nb_list.wk_mp,
			    newblk->nb_newblkno)) {
				LIST_REMOVE(&marker, wk_list);
				LIST_INSERT_BEFORE(wk, &marker, wk_list);
				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
			}
			continue;

		case D_SBDEP:
			initiate_write_sbdep(WK_SBDEP(wk));
			continue;

		case D_MKDIR:
		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
			/* No pre-I/O processing required for these. */
			continue;

		default:
			panic("handle_disk_io_initiation: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	FREE_LOCK(&lk);
	PRELE(curproc);			/* Allow swapout of kernel stack */
}

/*
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in a directory.  The buffer must be locked,
 * thus, no I/O completion operations can occur while we are
 * manipulating its associated dependencies.
 */
static void
initiate_write_filepage(pagedep, bp)
	struct pagedep *pagedep;
	struct buf *bp;
{
	struct jremref *jremref;
	struct jmvref *jmvref;
	struct dirrem *dirrem;
	struct diradd *dap;
	struct direct *ep;
	int i;

	if (pagedep->pd_state & IOSTARTED) {
		/*
		 * This can only happen if there is a driver that does not
		 * understand chaining.  Here biodone will reissue the call
		 * to strategy for the incomplete buffers.
		 */
		printf("initiate_write_filepage: already started\n");
		return;
	}
	pagedep->pd_state |= IOSTARTED;
	/*
	 * Wait for all journal remove dependencies to hit the disk.
	 * We can not allow any potentially conflicting directory adds
	 * to be visible before removes and rollback is too difficult.
	 * lk may be dropped and re-acquired, however we hold the buf
	 * locked so the dependency can not go away.
	 */
	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
			jwait(&jremref->jr_list, MNT_WAIT);
	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
		jwait(&jmvref->jm_list, MNT_WAIT);
	/* Roll back un-committed directory additions in the page image. */
	for (i = 0; i < DAHASHSZ; i++) {
		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			if (ep->d_ino != dap->da_newinum)
				panic("%s: dir inum %d != new %d",
				    "initiate_write_filepage",
				    ep->d_ino, dap->da_newinum);
			if (dap->da_state & DIRCHG)
				ep->d_ino = dap->da_previous->dm_oldinum;
			else
				ep->d_ino = 0;
			dap->da_state &= ~ATTACHED;
			dap->da_state |= UNDONE;
		}
	}
}

/*
 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
 * Note that any bug fixes made to this routine must be done in the
 * version found below.
 *
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock.  The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 */
static void
initiate_write_inodeblock_ufs1(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;			/* The inode block */
{
	struct allocdirect *adp, *lastadp;
	struct ufs1_dinode *dp;
	struct ufs1_dinode *sip;	/* saved copy for rollback */
	struct inoref *inoref;
	struct fs *fs;
	ufs_lbn_t i;
#ifdef INVARIANTS
	ufs_lbn_t prevlbn = 0;
#endif
	int deplist;

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock_ufs1: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	dp = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);

	/*
	 * If we're on the unlinked list but have not yet written our
	 * next pointer initialize it here.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
		struct inodedep *inon;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		dp->di_freelink = inon ? inon->id_ino : 0;
	}
	/*
	 * If the bitmap is not yet written, then the allocated
	 * inode cannot be written to disk.  Save the dinode aside
	 * and write a zeroed copy (keeping di_gen and di_freelink).
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		if (inodedep->id_savedino1 != NULL)
			panic("initiate_write_inodeblock_ufs1: I/O underway");
		FREE_LOCK(&lk);
		sip = malloc(sizeof(struct ufs1_dinode),
		    M_SAVEDINO, M_SOFTDEP_FLAGS);
		ACQUIRE_LOCK(&lk);
		inodedep->id_savedino1 = sip;
		*inodedep->id_savedino1 = *dp;
		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
		dp->di_gen = inodedep->id_savedino1->di_gen;
		dp->di_freelink = inodedep->id_savedino1->di_freelink;
		return;
	}
	/*
	 * If no dependencies, then there is nothing to roll back.
	 */
	inodedep->id_savedsize = dp->di_size;
	inodedep->id_savedextsize = 0;
	inodedep->id_savednlink = dp->di_nlink;
	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
	    TAILQ_EMPTY(&inodedep->id_inoreflst))
		return;
	/*
	 * Revert the link count to that of the first unwritten journal entry.
	 */
	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
	if (inoref)
		dp->di_nlink = inoref->if_nlink;
	/*
	 * Set the dependencies to busy.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
		if (deplist != 0 && prevlbn >= adp->ad_offset)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_offset;
		if (adp->ad_offset < NDADDR &&
		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %d != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset,
			    dp->di_db[adp->ad_offset],
			    (intmax_t)adp->ad_newblkno);
		if (adp->ad_offset >= NDADDR &&
		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
			panic("%s: indirect pointer #%jd mismatch %d != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset - NDADDR,
			    dp->di_ib[adp->ad_offset - NDADDR],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_offset;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* INVARIANTS */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written.  Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		if (adp->ad_offset >= NDADDR)
			break;
		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep1");
#endif /* INVARIANTS */
			dp->di_db[i] = 0;
		}
		for (i = 0; i < NIADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_ib[i] != 0 &&
			    (deplist & ((1 << NDADDR) << i)) == 0)
				panic("softdep_write_inodeblock: lost dep2");
#endif /* INVARIANTS */
			dp->di_ib[i] = 0;
		}
		return;
	}
	/*
	 * If we have zero'ed out the last allocated block of the file,
	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_offset; i >= 0; i--)
			if (dp->di_db[i] != 0)
				break;
		dp->di_size = (i + 1) * fs->fs_bsize;
	}
	/*
	 * The only dependencies are for indirect blocks.
	 *
	 * The file size for indirect block additions is not guaranteed.
	 * Such a guarantee would be non-trivial to achieve.  The conventional
	 * synchronous write implementation also does not make this guarantee.
	 * Fsck should catch and fix discrepancies.  Arguably, the file size
	 * can be over-estimated without destroying integrity when the file
	 * moves into the indirect blocks (i.e., is large).  If we want to
	 * postpone fsck, we are stuck with this argument.
	 */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_offset - NDADDR] = 0;
}

/*
 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
 * Note that any bug fixes made to this routine must be done in the
 * version found above.
 *
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock.  The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 */
static void
initiate_write_inodeblock_ufs2(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;			/* The inode block */
{
	struct allocdirect *adp, *lastadp;
	struct ufs2_dinode *dp;
	struct ufs2_dinode *sip;	/* saved copy for rollback */
	struct inoref *inoref;
	struct fs *fs;
	ufs_lbn_t i;
#ifdef INVARIANTS
	ufs_lbn_t prevlbn = 0;
#endif
	int deplist;

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock_ufs2: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	dp = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);

	/*
	 * If we're on the unlinked list but have not yet written our
	 * next pointer initialize it here.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
		struct inodedep *inon;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		dp->di_freelink = inon ? inon->id_ino : 0;
	}
	/* Already-written next pointers must agree with the in-memory list. */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
	    (UNLINKED | UNLINKNEXT)) {
		struct inodedep *inon;
		ino_t freelink;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		freelink = inon ?
inon->id_ino : 0; 9820 if (freelink != dp->di_freelink) 9821 panic("ino %p(0x%X) %d, %d != %d", 9822 inodedep, inodedep->id_state, inodedep->id_ino, 9823 freelink, dp->di_freelink); 9824 } 9825 /* 9826 * If the bitmap is not yet written, then the allocated 9827 * inode cannot be written to disk. 9828 */ 9829 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9830 if (inodedep->id_savedino2 != NULL) 9831 panic("initiate_write_inodeblock_ufs2: I/O underway"); 9832 FREE_LOCK(&lk); 9833 sip = malloc(sizeof(struct ufs2_dinode), 9834 M_SAVEDINO, M_SOFTDEP_FLAGS); 9835 ACQUIRE_LOCK(&lk); 9836 inodedep->id_savedino2 = sip; 9837 *inodedep->id_savedino2 = *dp; 9838 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 9839 dp->di_gen = inodedep->id_savedino2->di_gen; 9840 dp->di_freelink = inodedep->id_savedino2->di_freelink; 9841 return; 9842 } 9843 /* 9844 * If no dependencies, then there is nothing to roll back. 9845 */ 9846 inodedep->id_savedsize = dp->di_size; 9847 inodedep->id_savedextsize = dp->di_extsize; 9848 inodedep->id_savednlink = dp->di_nlink; 9849 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9850 TAILQ_EMPTY(&inodedep->id_extupdt) && 9851 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9852 return; 9853 /* 9854 * Revert the link count to that of the first unwritten journal entry. 9855 */ 9856 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9857 if (inoref) 9858 dp->di_nlink = inoref->if_nlink; 9859 9860 /* 9861 * Set the ext data dependencies to busy. 
9862 */ 9863 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9864 adp = TAILQ_NEXT(adp, ad_next)) { 9865#ifdef INVARIANTS 9866 if (deplist != 0 && prevlbn >= adp->ad_offset) 9867 panic("softdep_write_inodeblock: lbn order"); 9868 prevlbn = adp->ad_offset; 9869 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) 9870 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9871 "softdep_write_inodeblock", 9872 (intmax_t)adp->ad_offset, 9873 (intmax_t)dp->di_extb[adp->ad_offset], 9874 (intmax_t)adp->ad_newblkno); 9875 deplist |= 1 << adp->ad_offset; 9876 if ((adp->ad_state & ATTACHED) == 0) 9877 panic("softdep_write_inodeblock: Unknown state 0x%x", 9878 adp->ad_state); 9879#endif /* INVARIANTS */ 9880 adp->ad_state &= ~ATTACHED; 9881 adp->ad_state |= UNDONE; 9882 } 9883 /* 9884 * The on-disk inode cannot claim to be any larger than the last 9885 * fragment that has been written. Otherwise, the on-disk inode 9886 * might have fragments that were not the last block in the ext 9887 * data which would corrupt the filesystem. 9888 */ 9889 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9890 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9891 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; 9892 /* keep going until hitting a rollback to a frag */ 9893 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9894 continue; 9895 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9896 for (i = adp->ad_offset + 1; i < NXADDR; i++) { 9897#ifdef INVARIANTS 9898 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 9899 panic("softdep_write_inodeblock: lost dep1"); 9900#endif /* INVARIANTS */ 9901 dp->di_extb[i] = 0; 9902 } 9903 lastadp = NULL; 9904 break; 9905 } 9906 /* 9907 * If we have zero'ed out the last allocated block of the ext 9908 * data, roll back the size to the last currently allocated block. 9909 * We know that this last allocated block is a full-sized as 9910 * we already checked for fragments in the loop above. 
9911 */ 9912 if (lastadp != NULL && 9913 dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9914 for (i = lastadp->ad_offset; i >= 0; i--) 9915 if (dp->di_extb[i] != 0) 9916 break; 9917 dp->di_extsize = (i + 1) * fs->fs_bsize; 9918 } 9919 /* 9920 * Set the file data dependencies to busy. 9921 */ 9922 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9923 adp = TAILQ_NEXT(adp, ad_next)) { 9924#ifdef INVARIANTS 9925 if (deplist != 0 && prevlbn >= adp->ad_offset) 9926 panic("softdep_write_inodeblock: lbn order"); 9927 if ((adp->ad_state & ATTACHED) == 0) 9928 panic("inodedep %p and adp %p not attached", inodedep, adp); 9929 prevlbn = adp->ad_offset; 9930 if (adp->ad_offset < NDADDR && 9931 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9932 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9933 "softdep_write_inodeblock", 9934 (intmax_t)adp->ad_offset, 9935 (intmax_t)dp->di_db[adp->ad_offset], 9936 (intmax_t)adp->ad_newblkno); 9937 if (adp->ad_offset >= NDADDR && 9938 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9939 panic("%s indirect pointer #%jd mismatch %jd != %jd", 9940 "softdep_write_inodeblock:", 9941 (intmax_t)adp->ad_offset - NDADDR, 9942 (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], 9943 (intmax_t)adp->ad_newblkno); 9944 deplist |= 1 << adp->ad_offset; 9945 if ((adp->ad_state & ATTACHED) == 0) 9946 panic("softdep_write_inodeblock: Unknown state 0x%x", 9947 adp->ad_state); 9948#endif /* INVARIANTS */ 9949 adp->ad_state &= ~ATTACHED; 9950 adp->ad_state |= UNDONE; 9951 } 9952 /* 9953 * The on-disk inode cannot claim to be any larger than the last 9954 * fragment that has been written. Otherwise, the on-disk inode 9955 * might have fragments that were not the last block in the file 9956 * which would corrupt the filesystem. 
9957 */ 9958 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9959 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9960 if (adp->ad_offset >= NDADDR) 9961 break; 9962 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9963 /* keep going until hitting a rollback to a frag */ 9964 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9965 continue; 9966 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9967 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9968#ifdef INVARIANTS 9969 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9970 panic("softdep_write_inodeblock: lost dep2"); 9971#endif /* INVARIANTS */ 9972 dp->di_db[i] = 0; 9973 } 9974 for (i = 0; i < NIADDR; i++) { 9975#ifdef INVARIANTS 9976 if (dp->di_ib[i] != 0 && 9977 (deplist & ((1 << NDADDR) << i)) == 0) 9978 panic("softdep_write_inodeblock: lost dep3"); 9979#endif /* INVARIANTS */ 9980 dp->di_ib[i] = 0; 9981 } 9982 return; 9983 } 9984 /* 9985 * If we have zero'ed out the last allocated block of the file, 9986 * roll back the size to the last currently allocated block. 9987 * We know that this last allocated block is a full-sized as 9988 * we already checked for fragments in the loop above. 9989 */ 9990 if (lastadp != NULL && 9991 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9992 for (i = lastadp->ad_offset; i >= 0; i--) 9993 if (dp->di_db[i] != 0) 9994 break; 9995 dp->di_size = (i + 1) * fs->fs_bsize; 9996 } 9997 /* 9998 * The only dependencies are for indirect blocks. 9999 * 10000 * The file size for indirect block additions is not guaranteed. 10001 * Such a guarantee would be non-trivial to achieve. The conventional 10002 * synchronous write implementation also does not make this guarantee. 10003 * Fsck should catch and fix discrepancies. Arguably, the file size 10004 * can be over-estimated without destroying integrity when the file 10005 * moves into the indirect blocks (i.e., is large). 
If we want to
 * postpone fsck, we are stuck with this argument.
 */
	/* Any remaining dependencies are on indirect blocks; clear them. */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_offset - NDADDR] = 0;
}

/*
 * Cancel an indirdep as a result of truncation.  Release all of the
 * children allocindirs and place their journal work on the appropriate
 * list.
 */
static void
cancel_indirdep(indirdep, bp, freeblks)
	struct indirdep *indirdep;	/* indirect block dep being canceled */
	struct buf *bp;			/* buffer holding the indirect block */
	struct freeblks *freeblks;	/* truncation this is on behalf of */
{
	struct allocindir *aip;

	/*
	 * None of the indirect pointers will ever be visible,
	 * so they can simply be tossed. GOINGAWAY ensures
	 * that allocated pointers will be saved in the buffer
	 * cache until they are freed. Note that they will
	 * only be able to be found by their physical address
	 * since the inode mapping the logical address will
	 * be gone. The save buffer used for the safe copy
	 * was allocated in setup_allocindir_phase2 using
	 * the physical address so it could be used for this
	 * purpose. Hence we swap the safe copy with the real
	 * copy, allowing the safe copy to be freed and holding
	 * on to the real copy for later use in indir_trunc.
	 */
	if (indirdep->ir_state & GOINGAWAY)
		panic("cancel_indirdep: already gone");
	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
		/* Still linked on a parent's dependency list; detach. */
		indirdep->ir_state |= DEPCOMPLETE;
		LIST_REMOVE(indirdep, ir_next);
	}
	indirdep->ir_state |= GOINGAWAY;
	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
	/*
	 * Pass in bp for blocks that still have journal writes
	 * pending so we can cancel them on their own.
	 */
	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
		cancel_allocindir(aip, bp, freeblks, 0);
	/* The remaining lists no longer have journal writes pending. */
	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
		cancel_allocindir(aip, NULL, freeblks, 0);
	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
		cancel_allocindir(aip, NULL, freeblks, 0);
	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
		cancel_allocindir(aip, NULL, freeblks, 0);
	/*
	 * If there are pending partial truncations we need to keep the
	 * old block copy around until they complete.  This is because
	 * the current b_data is not a perfect superset of the available
	 * blocks.
	 */
	if (TAILQ_EMPTY(&indirdep->ir_trunc))
		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
	else
		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	/* Move the dependency from the live buffer to the save buffer. */
	WORKLIST_REMOVE(&indirdep->ir_list);
	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
	indirdep->ir_bp = NULL;
	indirdep->ir_freeblks = freeblks;
}

/*
 * Free an indirdep once it no longer has new pointers to track.
 * All of its dependency lists must already be drained; asserted below.
 */
static void
free_indirdep(indirdep)
	struct indirdep *indirdep;	/* fully-completed indirdep to free */
{

	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
	    ("free_indirdep: Indir trunc list not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
	    ("free_indirdep: Complete head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
	    ("free_indirdep: write head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
	    ("free_indirdep: done head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
	    ("free_indirdep: deplist head not empty."));
	KASSERT((indirdep->ir_state & DEPCOMPLETE),
	    ("free_indirdep: %p still on newblk list.", indirdep));
	KASSERT(indirdep->ir_saveddata == NULL,
	    ("free_indirdep: %p still has saved data.", indirdep));
	if (indirdep->ir_state & ONWORKLIST)
		WORKLIST_REMOVE(&indirdep->ir_list);
	WORKITEM_FREE(indirdep, D_INDIRDEP);
}

/*
 * Called before a write to an indirdep.  This routine is responsible for
 * rolling back pointers to a safe state which includes only those
 * allocindirs which have been completed.
 */
static void
initiate_write_indirdep(indirdep, bp)
	struct indirdep *indirdep;	/* dependency of the block being written */
	struct buf *bp;			/* the indirect block about to be written */
{

	indirdep->ir_state |= IOSTARTED;
	if (indirdep->ir_state & GOINGAWAY)
		panic("disk_io_initiation: indirdep gone");
	/*
	 * If there are no remaining dependencies, this will be writing
	 * the real pointers.
	 */
	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
	    TAILQ_EMPTY(&indirdep->ir_trunc))
		return;
	/*
	 * Replace up-to-date version with safe version.
	 */
	if (indirdep->ir_saveddata == NULL) {
		/*
		 * NOTE(review): lk is dropped around the (possibly
		 * sleeping) allocation; presumably the locked buffer
		 * keeps this indirdep stable meanwhile -- confirm.
		 */
		FREE_LOCK(&lk);
		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
		    M_SOFTDEP_FLAGS);
		ACQUIRE_LOCK(&lk);
	}
	indirdep->ir_state &= ~ATTACHED;
	indirdep->ir_state |= UNDONE;
	/* Stash the up-to-date copy; write the rolled-back safe copy. */
	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
	    bp->b_bcount);
}

/*
 * Called when an inode has been cleared in a cg bitmap.  This finally
 * eliminates any canceled jaddrefs
 */
void
softdep_setup_inofree(mp, bp, ino, wkhd)
	struct mount *mp;	/* mount point of the filesystem */
	struct buf *bp;		/* buffer holding the cg with ino cleared */
	ino_t ino;		/* inode number being freed */
	struct workhead *wkhd;	/* pending work to move to the cg buffer */
{
	struct worklist *wk, *wkn;
	struct inodedep *inodedep;
	uint8_t *inosused;
	struct cg *cgp;
	struct fs *fs;

	ACQUIRE_LOCK(&lk);
	fs = VFSTOUFS(mp)->um_fs;
	cgp = (struct cg *)bp->b_data;
	inosused = cg_inosused(cgp);
	/* The caller must already have cleared the bitmap bit. */
	if (isset(inosused, ino % fs->fs_ipg))
		panic("softdep_setup_inofree: inode %d not freed.", ino);
	if (inodedep_lookup(mp, ino, 0, &inodedep))
		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
		    ino, inodedep);
	if (wkhd) {
		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
			if (wk->wk_type != D_JADDREF)
				continue;
			WORKLIST_REMOVE(wk);
			/*
			 * We can free immediately even if the jaddref
			 * isn't attached in a background write as now
			 * the bitmaps are reconciled.
			 */
			wk->wk_state |= COMPLETE | ATTACHED;
			free_jaddref(WK_JADDREF(wk));
		}
		/* Remaining items wait on this cg buffer's write. */
		jwork_move(&bp->b_dep, wkhd);
	}
	FREE_LOCK(&lk);
}


/*
 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
 * map.  Any dependencies waiting for the write to clear are added to the
 * buf's list and any jnewblks that are being canceled are discarded
 * immediately.
 */
void
softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
	struct mount *mp;	/* mount point of the filesystem */
	struct buf *bp;		/* buffer holding the updated cg map */
	ufs2_daddr_t blkno;	/* first block of the freed range */
	int frags;		/* number of frags freed */
	struct workhead *wkhd;	/* dependencies waiting on the cg write */
{
	struct bmsafemap *bmsafemap;
	struct jnewblk *jnewblk;
	struct worklist *wk;
	struct fs *fs;
#ifdef SUJ_DEBUG
	uint8_t *blksfree;
	struct cg *cgp;
	ufs2_daddr_t jstart;
	ufs2_daddr_t jend;
	ufs2_daddr_t end;
	long bno;
	int i;
#endif

	ACQUIRE_LOCK(&lk);
	/* Lookup the bmsafemap so we track when it is dirty. */
	fs = VFSTOUFS(mp)->um_fs;
	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
	/*
	 * Detach any jnewblks which have been canceled.  They must linger
	 * until the bitmap is cleared again by ffs_blkfree() to prevent
	 * an unjournaled allocation from hitting the disk.
	 */
	if (wkhd) {
		while ((wk = LIST_FIRST(wkhd)) != NULL) {
			WORKLIST_REMOVE(wk);
			if (wk->wk_type != D_JNEWBLK) {
				/* Non-jnewblk work waits on the cg write. */
				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
				continue;
			}
			jnewblk = WK_JNEWBLK(wk);
			KASSERT(jnewblk->jn_state & GOINGAWAY,
			    ("softdep_setup_blkfree: jnewblk not canceled."));
#ifdef SUJ_DEBUG
			/*
			 * Assert that this block is free in the bitmap
			 * before we discard the jnewblk.
			 */
			cgp = (struct cg *)bp->b_data;
			blksfree = cg_blksfree(cgp);
			bno = dtogd(fs, jnewblk->jn_blkno);
			for (i = jnewblk->jn_oldfrags;
			    i < jnewblk->jn_frags; i++) {
				if (isset(blksfree, bno + i))
					continue;
				panic("softdep_setup_blkfree: not free");
			}
#endif
			/*
			 * Even if it's not attached we can free immediately
			 * as the new bitmap is correct.
			 */
			wk->wk_state |= COMPLETE | ATTACHED;
			free_jnewblk(jnewblk);
		}
	}

#ifdef SUJ_DEBUG
	/*
	 * Assert that we are not freeing a block which has an outstanding
	 * allocation dependency.
	 */
	fs = VFSTOUFS(mp)->um_fs;
	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
	end = blkno + frags;
	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
		/*
		 * Don't match against blocks that will be freed when the
		 * background write is done.
		 */
		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
		    (COMPLETE | DEPCOMPLETE))
			continue;
		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
		if ((blkno >= jstart && blkno < jend) ||
		    (end > jstart && end <= jend)) {
			printf("state 0x%X %jd - %d %d dep %p\n",
			    jnewblk->jn_state, jnewblk->jn_blkno,
			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
			    jnewblk->jn_dep);
			panic("softdep_setup_blkfree: "
			    "%jd-%jd(%d) overlaps with %jd-%jd",
			    blkno, end, frags, jstart, jend);
		}
	}
#endif
	FREE_LOCK(&lk);
}

/*
 * Revert a block allocation when the journal record that describes it
 * is not yet written.  Returns the number of frags that were rolled
 * back (0 when the bitmap already shows them free).
 */
int
jnewblk_rollback(jnewblk, fs, cgp, blksfree)
	struct jnewblk *jnewblk;	/* journaled allocation to revert */
	struct fs *fs;
	struct cg *cgp;			/* cg holding the allocation */
	uint8_t *blksfree;		/* free-block bitmap for the cg */
{
	ufs1_daddr_t fragno;
	long cgbno, bbase;
	int frags, blk;
	int i;

	frags = 0;
	cgbno = dtogd(fs, jnewblk->jn_blkno);
	/*
	 * We have to test which frags need to be rolled back.  We may
	 * be operating on a stale copy when doing background writes.
10309 */ 10310 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) 10311 if (isclr(blksfree, cgbno + i)) 10312 frags++; 10313 if (frags == 0) 10314 return (0); 10315 /* 10316 * This is mostly ffs_blkfree() sans some validation and 10317 * superblock updates. 10318 */ 10319 if (frags == fs->fs_frag) { 10320 fragno = fragstoblks(fs, cgbno); 10321 ffs_setblock(fs, blksfree, fragno); 10322 ffs_clusteracct(fs, cgp, fragno, 1); 10323 cgp->cg_cs.cs_nbfree++; 10324 } else { 10325 cgbno += jnewblk->jn_oldfrags; 10326 bbase = cgbno - fragnum(fs, cgbno); 10327 /* Decrement the old frags. */ 10328 blk = blkmap(fs, blksfree, bbase); 10329 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 10330 /* Deallocate the fragment */ 10331 for (i = 0; i < frags; i++) 10332 setbit(blksfree, cgbno + i); 10333 cgp->cg_cs.cs_nffree += frags; 10334 /* Add back in counts associated with the new frags */ 10335 blk = blkmap(fs, blksfree, bbase); 10336 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 10337 /* If a complete block has been reassembled, account for it. */ 10338 fragno = fragstoblks(fs, bbase); 10339 if (ffs_isblock(fs, blksfree, fragno)) { 10340 cgp->cg_cs.cs_nffree -= fs->fs_frag; 10341 ffs_clusteracct(fs, cgp, fragno, 1); 10342 cgp->cg_cs.cs_nbfree++; 10343 } 10344 } 10345 stat_jnewblk++; 10346 jnewblk->jn_state &= ~ATTACHED; 10347 jnewblk->jn_state |= UNDONE; 10348 10349 return (frags); 10350} 10351 10352static void 10353initiate_write_bmsafemap(bmsafemap, bp) 10354 struct bmsafemap *bmsafemap; 10355 struct buf *bp; /* The cg block. */ 10356{ 10357 struct jaddref *jaddref; 10358 struct jnewblk *jnewblk; 10359 uint8_t *inosused; 10360 uint8_t *blksfree; 10361 struct cg *cgp; 10362 struct fs *fs; 10363 ino_t ino; 10364 10365 if (bmsafemap->sm_state & IOSTARTED) 10366 panic("initiate_write_bmsafemap: Already started\n"); 10367 bmsafemap->sm_state |= IOSTARTED; 10368 /* 10369 * Clear any inode allocations which are pending journal writes. 
	 */
	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
		cgp = (struct cg *)bp->b_data;
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
		inosused = cg_inosused(cgp);
		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
			ino = jaddref->ja_ino % fs->fs_ipg;
			/*
			 * If this is a background copy the inode may not
			 * be marked used yet.
			 */
			if (isset(inosused, ino)) {
				/* Undo the allocation in this bitmap copy. */
				if ((jaddref->ja_mode & IFMT) == IFDIR)
					cgp->cg_cs.cs_ndir--;
				cgp->cg_cs.cs_nifree++;
				clrbit(inosused, ino);
				jaddref->ja_state &= ~ATTACHED;
				jaddref->ja_state |= UNDONE;
				stat_jaddref++;
			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
				panic("initiate_write_bmsafemap: inode %d "
				    "marked free", jaddref->ja_ino);
		}
	}
	/*
	 * Clear any block allocations which are pending journal writes.
	 */
	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
		cgp = (struct cg *)bp->b_data;
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
		blksfree = cg_blksfree(cgp);
		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
				continue;
			/* Nothing rolled back: only legal in a background copy. */
			if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
				panic("initiate_write_bmsafemap: block %jd "
				    "marked free", jnewblk->jn_blkno);
		}
	}
	/*
	 * Move allocation lists to the written lists so they can be
	 * cleared once the block write is complete.
	 */
	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
	    inodedep, id_deps);
	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
	    newblk, nb_deps);
	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
	    wk_list);
}

/*
 * This routine is called during the completion interrupt
 * service routine for a disk write (from the procedure called
 * by the device driver to inform the filesystem caches of
 * a request completion).  It should be called early in this
 * procedure, before the block is made available to other
 * processes or other routines are called.
 *
 */
static void
softdep_disk_write_complete(bp)
	struct buf *bp;		/* describes the completed disk write */
{
	struct worklist *wk;
	struct worklist *owk;
	struct workhead reattach;
	struct freeblks *freeblks;
	struct buf *sbp;

	/*
	 * If an error occurred while doing the write, then the data
	 * has not hit the disk and the dependencies cannot be unrolled.
	 */
	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
		return;
	LIST_INIT(&reattach);
	/*
	 * This lock must not be released anywhere in this code segment.
	 */
	sbp = NULL;
	owk = NULL;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		WORKLIST_REMOVE(wk);
		dep_write[wk->wk_type]++;
		/* owk catches the same item being seen twice in a row. */
		if (wk == owk)
			panic("duplicate worklist: %p\n", wk);
		owk = wk;
		/* Dispatch each dependency type to its completion handler. */
		switch (wk->wk_type) {

		case D_PAGEDEP:
			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_INODEDEP:
			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_BMSAFEMAP:
			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
			continue;

		case D_ALLOCDIRECT:
			wk->wk_state |= COMPLETE;
			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
			continue;

		case D_ALLOCINDIR:
			wk->wk_state |= COMPLETE;
			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
			continue;

		case D_INDIRDEP:
			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_FREEBLKS:
			wk->wk_state |= COMPLETE;
			freeblks = WK_FREEBLKS(wk);
			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
			    LIST_EMPTY(&freeblks->fb_jblkdephd))
				add_to_worklist(wk, WK_NODELAY);
			continue;

		case D_FREEWORK:
			handle_written_freework(WK_FREEWORK(wk));
			/* break is equivalent to continue: nothing follows
			   the switch in this loop body. */
			break;

		case D_JSEGDEP:
			free_jsegdep(WK_JSEGDEP(wk));
			continue;

		case D_JSEG:
			handle_written_jseg(WK_JSEG(wk), bp);
			continue;

		case D_SBDEP:
			if (handle_written_sbdep(WK_SBDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_FREEDEP:
			free_freedep(WK_FREEDEP(wk));
			continue;

		default:
			panic("handle_disk_write_complete: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/*
	 * Reattach any requests that must be redone.
	 */
	while ((wk = LIST_FIRST(&reattach)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&bp->b_dep, wk);
	}
	FREE_LOCK(&lk);
	if (sbp)
		brelse(sbp);
}

/*
 * Called from within softdep_disk_write_complete above.  Note that
 * this routine is always called from interrupt level with further
 * splbio interrupts blocked.
 */
static void
handle_allocdirect_partdone(adp, wkhd)
	struct allocdirect *adp;	/* the completed allocdirect */
	struct workhead *wkhd;		/* Work to do when inode is written. */
{
	struct allocdirectlst *listhead;
	struct allocdirect *listadp;
	struct inodedep *inodedep;
	long bsize;

	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem. Thus, we cannot free any
	 * allocdirects after one whose ad_oldblkno claims a fragment as
	 * these blocks must be rolled back to zero before writing the inode.
	 * We check the currently active set of allocdirects in id_inoupdt
	 * or id_extupdt as appropriate.
10568 */ 10569 inodedep = adp->ad_inodedep; 10570 bsize = inodedep->id_fs->fs_bsize; 10571 if (adp->ad_state & EXTDATA) 10572 listhead = &inodedep->id_extupdt; 10573 else 10574 listhead = &inodedep->id_inoupdt; 10575 TAILQ_FOREACH(listadp, listhead, ad_next) { 10576 /* found our block */ 10577 if (listadp == adp) 10578 break; 10579 /* continue if ad_oldlbn is not a fragment */ 10580 if (listadp->ad_oldsize == 0 || 10581 listadp->ad_oldsize == bsize) 10582 continue; 10583 /* hit a fragment */ 10584 return; 10585 } 10586 /* 10587 * If we have reached the end of the current list without 10588 * finding the just finished dependency, then it must be 10589 * on the future dependency list. Future dependencies cannot 10590 * be freed until they are moved to the current list. 10591 */ 10592 if (listadp == NULL) { 10593#ifdef DEBUG 10594 if (adp->ad_state & EXTDATA) 10595 listhead = &inodedep->id_newextupdt; 10596 else 10597 listhead = &inodedep->id_newinoupdt; 10598 TAILQ_FOREACH(listadp, listhead, ad_next) 10599 /* found our block */ 10600 if (listadp == adp) 10601 break; 10602 if (listadp == NULL) 10603 panic("handle_allocdirect_partdone: lost dep"); 10604#endif /* DEBUG */ 10605 return; 10606 } 10607 /* 10608 * If we have found the just finished dependency, then queue 10609 * it along with anything that follows it that is complete. 10610 * Since the pointer has not yet been written in the inode 10611 * as the dependency prevents it, place the allocdirect on the 10612 * bufwait list where it will be freed once the pointer is 10613 * valid. 10614 */ 10615 if (wkhd == NULL) 10616 wkhd = &inodedep->id_bufwait; 10617 for (; adp; adp = listadp) { 10618 listadp = TAILQ_NEXT(adp, ad_next); 10619 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10620 return; 10621 TAILQ_REMOVE(listhead, adp, ad_next); 10622 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); 10623 } 10624} 10625 10626/* 10627 * Called from within softdep_disk_write_complete above. 
This routine 10628 * completes successfully written allocindirs. 10629 */ 10630static void 10631handle_allocindir_partdone(aip) 10632 struct allocindir *aip; /* the completed allocindir */ 10633{ 10634 struct indirdep *indirdep; 10635 10636 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 10637 return; 10638 indirdep = aip->ai_indirdep; 10639 LIST_REMOVE(aip, ai_next); 10640 /* 10641 * Don't set a pointer while the buffer is undergoing IO or while 10642 * we have active truncations. 10643 */ 10644 if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { 10645 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 10646 return; 10647 } 10648 if (indirdep->ir_state & UFS1FMT) 10649 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10650 aip->ai_newblkno; 10651 else 10652 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10653 aip->ai_newblkno; 10654 /* 10655 * Await the pointer write before freeing the allocindir. 10656 */ 10657 LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); 10658} 10659 10660/* 10661 * Release segments held on a jwork list. 10662 */ 10663static void 10664handle_jwork(wkhd) 10665 struct workhead *wkhd; 10666{ 10667 struct worklist *wk; 10668 10669 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10670 WORKLIST_REMOVE(wk); 10671 switch (wk->wk_type) { 10672 case D_JSEGDEP: 10673 free_jsegdep(WK_JSEGDEP(wk)); 10674 continue; 10675 case D_FREEDEP: 10676 free_freedep(WK_FREEDEP(wk)); 10677 continue; 10678 case D_FREEFRAG: 10679 rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); 10680 WORKITEM_FREE(wk, D_FREEFRAG); 10681 case D_FREEWORK: 10682 handle_written_freework(WK_FREEWORK(wk)); 10683 continue; 10684 default: 10685 panic("handle_jwork: Unknown type %s\n", 10686 TYPENAME(wk->wk_type)); 10687 } 10688 } 10689} 10690 10691/* 10692 * Handle the bufwait list on an inode when it is safe to release items 10693 * held there. 
This normally happens after an inode block is written but
 * may be delayed and handled later if there are pending journal items that
 * are not yet safe to be released.  Returns the deferred freefile, if
 * any, for the caller to queue.
 */
static struct freefile *
handle_bufwait(inodedep, refhd)
	struct inodedep *inodedep;	/* inode whose bufwait list is drained */
	struct workhead *refhd;		/* optional list for deferred journal refs */
{
	struct jaddref *jaddref;
	struct freefile *freefile;
	struct worklist *wk;

	freefile = NULL;
	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {
		case D_FREEFILE:
			/*
			 * We defer adding freefile to the worklist
			 * until all other additions have been made to
			 * ensure that it will be done after all the
			 * old blocks have been freed.
			 */
			if (freefile != NULL)
				panic("handle_bufwait: freefile");
			freefile = WK_FREEFILE(wk);
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
			continue;

		case D_DIRADD:
			diradd_inode_written(WK_DIRADD(wk), inodedep);
			continue;

		case D_FREEFRAG:
			wk->wk_state |= COMPLETE;
			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
				add_to_worklist(wk, 0);
			continue;

		case D_DIRREM:
			wk->wk_state |= COMPLETE;
			add_to_worklist(wk, 0);
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			free_newblk(WK_NEWBLK(wk));
			continue;

		case D_JNEWBLK:
			wk->wk_state |= COMPLETE;
			free_jnewblk(WK_JNEWBLK(wk));
			continue;

		/*
		 * Save freed journal segments and add references on
		 * the supplied list which will delay their release
		 * until the cg bitmap is cleared on disk.
		 */
		case D_JSEGDEP:
			if (refhd == NULL)
				free_jsegdep(WK_JSEGDEP(wk));
			else
				WORKLIST_INSERT(refhd, wk);
			continue;

		case D_JADDREF:
			jaddref = WK_JADDREF(wk);
			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
			    if_deps);
			/*
			 * Transfer any jaddrefs to the list to be freed with
			 * the bitmap if we're handling a removed file.
			 */
			if (refhd == NULL) {
				wk->wk_state |= COMPLETE;
				free_jaddref(jaddref);
			} else
				WORKLIST_INSERT(refhd, wk);
			continue;

		default:
			panic("handle_bufwait: Unknown type %p(%s)",
			    wk, TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	return (freefile);
}
/*
 * Called from within softdep_disk_write_complete above to restore
 * in-memory inode block contents to their most up-to-date state. Note
 * that this routine is always called from interrupt level with further
 * splbio interrupts blocked.
10791 */ 10792static int 10793handle_written_inodeblock(inodedep, bp) 10794 struct inodedep *inodedep; 10795 struct buf *bp; /* buffer containing the inode block */ 10796{ 10797 struct freefile *freefile; 10798 struct allocdirect *adp, *nextadp; 10799 struct ufs1_dinode *dp1 = NULL; 10800 struct ufs2_dinode *dp2 = NULL; 10801 struct workhead wkhd; 10802 int hadchanges, fstype; 10803 ino_t freelink; 10804 10805 LIST_INIT(&wkhd); 10806 hadchanges = 0; 10807 freefile = NULL; 10808 if ((inodedep->id_state & IOSTARTED) == 0) 10809 panic("handle_written_inodeblock: not started"); 10810 inodedep->id_state &= ~IOSTARTED; 10811 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 10812 fstype = UFS1; 10813 dp1 = (struct ufs1_dinode *)bp->b_data + 10814 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10815 freelink = dp1->di_freelink; 10816 } else { 10817 fstype = UFS2; 10818 dp2 = (struct ufs2_dinode *)bp->b_data + 10819 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10820 freelink = dp2->di_freelink; 10821 } 10822 /* 10823 * If we wrote a valid freelink pointer during the last write 10824 * record it here. 10825 */ 10826 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 10827 struct inodedep *inon; 10828 10829 inon = TAILQ_NEXT(inodedep, id_unlinked); 10830 if ((inon == NULL && freelink == 0) || 10831 (inon && inon->id_ino == freelink)) { 10832 if (inon) 10833 inon->id_state |= UNLINKPREV; 10834 inodedep->id_state |= UNLINKNEXT; 10835 } else 10836 hadchanges = 1; 10837 } 10838 /* Leave this inodeblock dirty until it's in the list. */ 10839 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) 10840 hadchanges = 1; 10841 /* 10842 * If we had to rollback the inode allocation because of 10843 * bitmaps being incomplete, then simply restore it. 10844 * Keep the block dirty so that it will not be reclaimed until 10845 * all associated dependencies have been cleared and the 10846 * corresponding updates written to disk. 
10847 */ 10848 if (inodedep->id_savedino1 != NULL) { 10849 hadchanges = 1; 10850 if (fstype == UFS1) 10851 *dp1 = *inodedep->id_savedino1; 10852 else 10853 *dp2 = *inodedep->id_savedino2; 10854 free(inodedep->id_savedino1, M_SAVEDINO); 10855 inodedep->id_savedino1 = NULL; 10856 if ((bp->b_flags & B_DELWRI) == 0) 10857 stat_inode_bitmap++; 10858 bdirty(bp); 10859 /* 10860 * If the inode is clear here and GOINGAWAY it will never 10861 * be written. Process the bufwait and clear any pending 10862 * work which may include the freefile. 10863 */ 10864 if (inodedep->id_state & GOINGAWAY) 10865 goto bufwait; 10866 return (1); 10867 } 10868 inodedep->id_state |= COMPLETE; 10869 /* 10870 * Roll forward anything that had to be rolled back before 10871 * the inode could be updated. 10872 */ 10873 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 10874 nextadp = TAILQ_NEXT(adp, ad_next); 10875 if (adp->ad_state & ATTACHED) 10876 panic("handle_written_inodeblock: new entry"); 10877 if (fstype == UFS1) { 10878 if (adp->ad_offset < NDADDR) { 10879 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10880 panic("%s %s #%jd mismatch %d != %jd", 10881 "handle_written_inodeblock:", 10882 "direct pointer", 10883 (intmax_t)adp->ad_offset, 10884 dp1->di_db[adp->ad_offset], 10885 (intmax_t)adp->ad_oldblkno); 10886 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 10887 } else { 10888 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) 10889 panic("%s: %s #%jd allocated as %d", 10890 "handle_written_inodeblock", 10891 "indirect pointer", 10892 (intmax_t)adp->ad_offset - NDADDR, 10893 dp1->di_ib[adp->ad_offset - NDADDR]); 10894 dp1->di_ib[adp->ad_offset - NDADDR] = 10895 adp->ad_newblkno; 10896 } 10897 } else { 10898 if (adp->ad_offset < NDADDR) { 10899 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10900 panic("%s: %s #%jd %s %jd != %jd", 10901 "handle_written_inodeblock", 10902 "direct pointer", 10903 (intmax_t)adp->ad_offset, "mismatch", 10904 
(intmax_t)dp2->di_db[adp->ad_offset], 10905 (intmax_t)adp->ad_oldblkno); 10906 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 10907 } else { 10908 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) 10909 panic("%s: %s #%jd allocated as %jd", 10910 "handle_written_inodeblock", 10911 "indirect pointer", 10912 (intmax_t)adp->ad_offset - NDADDR, 10913 (intmax_t) 10914 dp2->di_ib[adp->ad_offset - NDADDR]); 10915 dp2->di_ib[adp->ad_offset - NDADDR] = 10916 adp->ad_newblkno; 10917 } 10918 } 10919 adp->ad_state &= ~UNDONE; 10920 adp->ad_state |= ATTACHED; 10921 hadchanges = 1; 10922 } 10923 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 10924 nextadp = TAILQ_NEXT(adp, ad_next); 10925 if (adp->ad_state & ATTACHED) 10926 panic("handle_written_inodeblock: new entry"); 10927 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 10928 panic("%s: direct pointers #%jd %s %jd != %jd", 10929 "handle_written_inodeblock", 10930 (intmax_t)adp->ad_offset, "mismatch", 10931 (intmax_t)dp2->di_extb[adp->ad_offset], 10932 (intmax_t)adp->ad_oldblkno); 10933 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 10934 adp->ad_state &= ~UNDONE; 10935 adp->ad_state |= ATTACHED; 10936 hadchanges = 1; 10937 } 10938 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 10939 stat_direct_blk_ptrs++; 10940 /* 10941 * Reset the file size to its most up-to-date value. 
10942 */ 10943 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) 10944 panic("handle_written_inodeblock: bad size"); 10945 if (inodedep->id_savednlink > LINK_MAX) 10946 panic("handle_written_inodeblock: Invalid link count " 10947 "%d for inodedep %p", inodedep->id_savednlink, inodedep); 10948 if (fstype == UFS1) { 10949 if (dp1->di_nlink != inodedep->id_savednlink) { 10950 dp1->di_nlink = inodedep->id_savednlink; 10951 hadchanges = 1; 10952 } 10953 if (dp1->di_size != inodedep->id_savedsize) { 10954 dp1->di_size = inodedep->id_savedsize; 10955 hadchanges = 1; 10956 } 10957 } else { 10958 if (dp2->di_nlink != inodedep->id_savednlink) { 10959 dp2->di_nlink = inodedep->id_savednlink; 10960 hadchanges = 1; 10961 } 10962 if (dp2->di_size != inodedep->id_savedsize) { 10963 dp2->di_size = inodedep->id_savedsize; 10964 hadchanges = 1; 10965 } 10966 if (dp2->di_extsize != inodedep->id_savedextsize) { 10967 dp2->di_extsize = inodedep->id_savedextsize; 10968 hadchanges = 1; 10969 } 10970 } 10971 inodedep->id_savedsize = -1; 10972 inodedep->id_savedextsize = -1; 10973 inodedep->id_savednlink = -1; 10974 /* 10975 * If there were any rollbacks in the inode block, then it must be 10976 * marked dirty so that its will eventually get written back in 10977 * its correct form. 10978 */ 10979 if (hadchanges) 10980 bdirty(bp); 10981bufwait: 10982 /* 10983 * Process any allocdirects that completed during the update. 10984 */ 10985 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 10986 handle_allocdirect_partdone(adp, &wkhd); 10987 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 10988 handle_allocdirect_partdone(adp, &wkhd); 10989 /* 10990 * Process deallocations that were held pending until the 10991 * inode had been written to disk. Freeing of the inode 10992 * is delayed until after all blocks have been freed to 10993 * avoid creation of new <vfsid, inum, lbn> triples 10994 * before the old ones have been deleted. 
Completely 10995 * unlinked inodes are not processed until the unlinked 10996 * inode list is written or the last reference is removed. 10997 */ 10998 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { 10999 freefile = handle_bufwait(inodedep, NULL); 11000 if (freefile && !LIST_EMPTY(&wkhd)) { 11001 WORKLIST_INSERT(&wkhd, &freefile->fx_list); 11002 freefile = NULL; 11003 } 11004 } 11005 /* 11006 * Move rolled forward dependency completions to the bufwait list 11007 * now that those that were already written have been processed. 11008 */ 11009 if (!LIST_EMPTY(&wkhd) && hadchanges == 0) 11010 panic("handle_written_inodeblock: bufwait but no changes"); 11011 jwork_move(&inodedep->id_bufwait, &wkhd); 11012 11013 if (freefile != NULL) { 11014 /* 11015 * If the inode is goingaway it was never written. Fake up 11016 * the state here so free_inodedep() can succeed. 11017 */ 11018 if (inodedep->id_state & GOINGAWAY) 11019 inodedep->id_state |= COMPLETE | DEPCOMPLETE; 11020 if (free_inodedep(inodedep) == 0) 11021 panic("handle_written_inodeblock: live inodedep %p", 11022 inodedep); 11023 add_to_worklist(&freefile->fx_list, 0); 11024 return (0); 11025 } 11026 11027 /* 11028 * If no outstanding dependencies, free it. 
11029 */ 11030 if (free_inodedep(inodedep) || 11031 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && 11032 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 11033 TAILQ_FIRST(&inodedep->id_extupdt) == 0 && 11034 LIST_FIRST(&inodedep->id_bufwait) == 0)) 11035 return (0); 11036 return (hadchanges); 11037} 11038 11039static int 11040handle_written_indirdep(indirdep, bp, bpp) 11041 struct indirdep *indirdep; 11042 struct buf *bp; 11043 struct buf **bpp; 11044{ 11045 struct allocindir *aip; 11046 struct buf *sbp; 11047 int chgs; 11048 11049 if (indirdep->ir_state & GOINGAWAY) 11050 panic("handle_written_indirdep: indirdep gone"); 11051 if ((indirdep->ir_state & IOSTARTED) == 0) 11052 panic("handle_written_indirdep: IO not started"); 11053 chgs = 0; 11054 /* 11055 * If there were rollbacks revert them here. 11056 */ 11057 if (indirdep->ir_saveddata) { 11058 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 11059 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11060 free(indirdep->ir_saveddata, M_INDIRDEP); 11061 indirdep->ir_saveddata = NULL; 11062 } 11063 chgs = 1; 11064 } 11065 indirdep->ir_state &= ~(UNDONE | IOSTARTED); 11066 indirdep->ir_state |= ATTACHED; 11067 /* 11068 * Move allocindirs with written pointers to the completehd if 11069 * the indirdep's pointer is not yet written. Otherwise 11070 * free them here. 11071 */ 11072 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { 11073 LIST_REMOVE(aip, ai_next); 11074 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 11075 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, 11076 ai_next); 11077 newblk_freefrag(&aip->ai_block); 11078 continue; 11079 } 11080 free_newblk(&aip->ai_block); 11081 } 11082 /* 11083 * Move allocindirs that have finished dependency processing from 11084 * the done list to the write list after updating the pointers. 
11085 */ 11086 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11087 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 11088 handle_allocindir_partdone(aip); 11089 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 11090 panic("disk_write_complete: not gone"); 11091 chgs = 1; 11092 } 11093 } 11094 /* 11095 * Preserve the indirdep if there were any changes or if it is not 11096 * yet valid on disk. 11097 */ 11098 if (chgs) { 11099 stat_indir_blk_ptrs++; 11100 bdirty(bp); 11101 return (1); 11102 } 11103 /* 11104 * If there were no changes we can discard the savedbp and detach 11105 * ourselves from the buf. We are only carrying completed pointers 11106 * in this case. 11107 */ 11108 sbp = indirdep->ir_savebp; 11109 sbp->b_flags |= B_INVAL | B_NOCACHE; 11110 indirdep->ir_savebp = NULL; 11111 indirdep->ir_bp = NULL; 11112 if (*bpp != NULL) 11113 panic("handle_written_indirdep: bp already exists."); 11114 *bpp = sbp; 11115 /* 11116 * The indirdep may not be freed until its parent points at it. 11117 */ 11118 if (indirdep->ir_state & DEPCOMPLETE) 11119 free_indirdep(indirdep); 11120 11121 return (0); 11122} 11123 11124/* 11125 * Process a diradd entry after its dependent inode has been written. 11126 * This routine must be called with splbio interrupts blocked. 11127 */ 11128static void 11129diradd_inode_written(dap, inodedep) 11130 struct diradd *dap; 11131 struct inodedep *inodedep; 11132{ 11133 11134 dap->da_state |= COMPLETE; 11135 complete_diradd(dap); 11136 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 11137} 11138 11139/* 11140 * Returns true if the bmsafemap will have rollbacks when written. Must 11141 * only be called with lk and the buf lock on the cg held. 11142 */ 11143static int 11144bmsafemap_rollbacks(bmsafemap) 11145 struct bmsafemap *bmsafemap; 11146{ 11147 11148 return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 11149 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); 11150} 11151 11152/* 11153 * Re-apply an allocation when a cg write is complete. 
11154 */ 11155static int 11156jnewblk_rollforward(jnewblk, fs, cgp, blksfree) 11157 struct jnewblk *jnewblk; 11158 struct fs *fs; 11159 struct cg *cgp; 11160 uint8_t *blksfree; 11161{ 11162 ufs1_daddr_t fragno; 11163 ufs2_daddr_t blkno; 11164 long cgbno, bbase; 11165 int frags, blk; 11166 int i; 11167 11168 frags = 0; 11169 cgbno = dtogd(fs, jnewblk->jn_blkno); 11170 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { 11171 if (isclr(blksfree, cgbno + i)) 11172 panic("jnewblk_rollforward: re-allocated fragment"); 11173 frags++; 11174 } 11175 if (frags == fs->fs_frag) { 11176 blkno = fragstoblks(fs, cgbno); 11177 ffs_clrblock(fs, blksfree, (long)blkno); 11178 ffs_clusteracct(fs, cgp, blkno, -1); 11179 cgp->cg_cs.cs_nbfree--; 11180 } else { 11181 bbase = cgbno - fragnum(fs, cgbno); 11182 cgbno += jnewblk->jn_oldfrags; 11183 /* If a complete block had been reassembled, account for it. */ 11184 fragno = fragstoblks(fs, bbase); 11185 if (ffs_isblock(fs, blksfree, fragno)) { 11186 cgp->cg_cs.cs_nffree += fs->fs_frag; 11187 ffs_clusteracct(fs, cgp, fragno, -1); 11188 cgp->cg_cs.cs_nbfree--; 11189 } 11190 /* Decrement the old frags. */ 11191 blk = blkmap(fs, blksfree, bbase); 11192 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 11193 /* Allocate the fragment */ 11194 for (i = 0; i < frags; i++) 11195 clrbit(blksfree, cgbno + i); 11196 cgp->cg_cs.cs_nffree -= frags; 11197 /* Add back in counts associated with the new frags */ 11198 blk = blkmap(fs, blksfree, bbase); 11199 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 11200 } 11201 return (frags); 11202} 11203 11204/* 11205 * Complete a write to a bmsafemap structure. Roll forward any bitmap 11206 * changes if it's not a background write. Set all written dependencies 11207 * to DEPCOMPLETE and free the structure if possible. 
11208 */ 11209static int 11210handle_written_bmsafemap(bmsafemap, bp) 11211 struct bmsafemap *bmsafemap; 11212 struct buf *bp; 11213{ 11214 struct newblk *newblk; 11215 struct inodedep *inodedep; 11216 struct jaddref *jaddref, *jatmp; 11217 struct jnewblk *jnewblk, *jntmp; 11218 struct ufsmount *ump; 11219 uint8_t *inosused; 11220 uint8_t *blksfree; 11221 struct cg *cgp; 11222 struct fs *fs; 11223 ino_t ino; 11224 int chgs; 11225 11226 if ((bmsafemap->sm_state & IOSTARTED) == 0) 11227 panic("initiate_write_bmsafemap: Not started\n"); 11228 ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); 11229 chgs = 0; 11230 bmsafemap->sm_state &= ~IOSTARTED; 11231 /* 11232 * Release journal work that was waiting on the write. 11233 */ 11234 handle_jwork(&bmsafemap->sm_freewr); 11235 11236 /* 11237 * Restore unwritten inode allocation pending jaddref writes. 11238 */ 11239 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 11240 cgp = (struct cg *)bp->b_data; 11241 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11242 inosused = cg_inosused(cgp); 11243 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 11244 ja_bmdeps, jatmp) { 11245 if ((jaddref->ja_state & UNDONE) == 0) 11246 continue; 11247 ino = jaddref->ja_ino % fs->fs_ipg; 11248 if (isset(inosused, ino)) 11249 panic("handle_written_bmsafemap: " 11250 "re-allocated inode"); 11251 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { 11252 if ((jaddref->ja_mode & IFMT) == IFDIR) 11253 cgp->cg_cs.cs_ndir++; 11254 cgp->cg_cs.cs_nifree--; 11255 setbit(inosused, ino); 11256 chgs = 1; 11257 } 11258 jaddref->ja_state &= ~UNDONE; 11259 jaddref->ja_state |= ATTACHED; 11260 free_jaddref(jaddref); 11261 } 11262 } 11263 /* 11264 * Restore any block allocations which are pending journal writes. 
11265 */ 11266 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 11267 cgp = (struct cg *)bp->b_data; 11268 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11269 blksfree = cg_blksfree(cgp); 11270 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 11271 jntmp) { 11272 if ((jnewblk->jn_state & UNDONE) == 0) 11273 continue; 11274 if ((bp->b_xflags & BX_BKGRDMARKER) == 0 && 11275 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) 11276 chgs = 1; 11277 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 11278 jnewblk->jn_state |= ATTACHED; 11279 free_jnewblk(jnewblk); 11280 } 11281 } 11282 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 11283 newblk->nb_state |= DEPCOMPLETE; 11284 newblk->nb_state &= ~ONDEPLIST; 11285 newblk->nb_bmsafemap = NULL; 11286 LIST_REMOVE(newblk, nb_deps); 11287 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 11288 handle_allocdirect_partdone( 11289 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 11290 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 11291 handle_allocindir_partdone( 11292 WK_ALLOCINDIR(&newblk->nb_list)); 11293 else if (newblk->nb_list.wk_type != D_NEWBLK) 11294 panic("handle_written_bmsafemap: Unexpected type: %s", 11295 TYPENAME(newblk->nb_list.wk_type)); 11296 } 11297 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 11298 inodedep->id_state |= DEPCOMPLETE; 11299 inodedep->id_state &= ~ONDEPLIST; 11300 LIST_REMOVE(inodedep, id_deps); 11301 inodedep->id_bmsafemap = NULL; 11302 } 11303 LIST_REMOVE(bmsafemap, sm_next); 11304 if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 11305 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 11306 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 11307 LIST_EMPTY(&bmsafemap->sm_inodedephd) && 11308 LIST_EMPTY(&bmsafemap->sm_freehd)) { 11309 LIST_REMOVE(bmsafemap, sm_hash); 11310 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 11311 return (0); 11312 } 11313 LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); 11314 bdirty(bp); 11315 return (1); 11316} 11317 11318/* 11319 * Try to 
free a mkdir dependency. 11320 */ 11321static void 11322complete_mkdir(mkdir) 11323 struct mkdir *mkdir; 11324{ 11325 struct diradd *dap; 11326 11327 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 11328 return; 11329 LIST_REMOVE(mkdir, md_mkdirs); 11330 dap = mkdir->md_diradd; 11331 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 11332 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 11333 dap->da_state |= DEPCOMPLETE; 11334 complete_diradd(dap); 11335 } 11336 WORKITEM_FREE(mkdir, D_MKDIR); 11337} 11338 11339/* 11340 * Handle the completion of a mkdir dependency. 11341 */ 11342static void 11343handle_written_mkdir(mkdir, type) 11344 struct mkdir *mkdir; 11345 int type; 11346{ 11347 11348 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 11349 panic("handle_written_mkdir: bad type"); 11350 mkdir->md_state |= COMPLETE; 11351 complete_mkdir(mkdir); 11352} 11353 11354static int 11355free_pagedep(pagedep) 11356 struct pagedep *pagedep; 11357{ 11358 int i; 11359 11360 if (pagedep->pd_state & NEWBLOCK) 11361 return (0); 11362 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 11363 return (0); 11364 for (i = 0; i < DAHASHSZ; i++) 11365 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 11366 return (0); 11367 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 11368 return (0); 11369 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 11370 return (0); 11371 if (pagedep->pd_state & ONWORKLIST) 11372 WORKLIST_REMOVE(&pagedep->pd_list); 11373 LIST_REMOVE(pagedep, pd_hash); 11374 WORKITEM_FREE(pagedep, D_PAGEDEP); 11375 11376 return (1); 11377} 11378 11379/* 11380 * Called from within softdep_disk_write_complete above. 11381 * A write operation was just completed. Removed inodes can 11382 * now be freed and associated block pointers may be committed. 11383 * Note that this routine is always called from interrupt level 11384 * with further splbio interrupts blocked. 
11385 */ 11386static int 11387handle_written_filepage(pagedep, bp) 11388 struct pagedep *pagedep; 11389 struct buf *bp; /* buffer containing the written page */ 11390{ 11391 struct dirrem *dirrem; 11392 struct diradd *dap, *nextdap; 11393 struct direct *ep; 11394 int i, chgs; 11395 11396 if ((pagedep->pd_state & IOSTARTED) == 0) 11397 panic("handle_written_filepage: not started"); 11398 pagedep->pd_state &= ~IOSTARTED; 11399 /* 11400 * Process any directory removals that have been committed. 11401 */ 11402 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 11403 LIST_REMOVE(dirrem, dm_next); 11404 dirrem->dm_state |= COMPLETE; 11405 dirrem->dm_dirinum = pagedep->pd_ino; 11406 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 11407 ("handle_written_filepage: Journal entries not written.")); 11408 add_to_worklist(&dirrem->dm_list, 0); 11409 } 11410 /* 11411 * Free any directory additions that have been committed. 11412 * If it is a newly allocated block, we have to wait until 11413 * the on-disk directory inode claims the new block. 11414 */ 11415 if ((pagedep->pd_state & NEWBLOCK) == 0) 11416 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 11417 free_diradd(dap, NULL); 11418 /* 11419 * Uncommitted directory entries must be restored. 11420 */ 11421 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 11422 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 11423 dap = nextdap) { 11424 nextdap = LIST_NEXT(dap, da_pdlist); 11425 if (dap->da_state & ATTACHED) 11426 panic("handle_written_filepage: attached"); 11427 ep = (struct direct *) 11428 ((char *)bp->b_data + dap->da_offset); 11429 ep->d_ino = dap->da_newinum; 11430 dap->da_state &= ~UNDONE; 11431 dap->da_state |= ATTACHED; 11432 chgs = 1; 11433 /* 11434 * If the inode referenced by the directory has 11435 * been written out, then the dependency can be 11436 * moved to the pending list. 
11437 */ 11438 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 11439 LIST_REMOVE(dap, da_pdlist); 11440 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 11441 da_pdlist); 11442 } 11443 } 11444 } 11445 /* 11446 * If there were any rollbacks in the directory, then it must be 11447 * marked dirty so that its will eventually get written back in 11448 * its correct form. 11449 */ 11450 if (chgs) { 11451 if ((bp->b_flags & B_DELWRI) == 0) 11452 stat_dir_entry++; 11453 bdirty(bp); 11454 return (1); 11455 } 11456 /* 11457 * If we are not waiting for a new directory block to be 11458 * claimed by its inode, then the pagedep will be freed. 11459 * Otherwise it will remain to track any new entries on 11460 * the page in case they are fsync'ed. 11461 */ 11462 free_pagedep(pagedep); 11463 return (0); 11464} 11465 11466/* 11467 * Writing back in-core inode structures. 11468 * 11469 * The filesystem only accesses an inode's contents when it occupies an 11470 * "in-core" inode structure. These "in-core" structures are separate from 11471 * the page frames used to cache inode blocks. Only the latter are 11472 * transferred to/from the disk. So, when the updated contents of the 11473 * "in-core" inode structure are copied to the corresponding in-memory inode 11474 * block, the dependencies are also transferred. The following procedure is 11475 * called when copying a dirty "in-core" inode to a cached inode block. 11476 */ 11477 11478/* 11479 * Called when an inode is loaded from disk. If the effective link count 11480 * differed from the actual link count when it was last flushed, then we 11481 * need to ensure that the correct effective link count is put back. 11482 */ 11483void 11484softdep_load_inodeblock(ip) 11485 struct inode *ip; /* the "in_core" copy of the inode */ 11486{ 11487 struct inodedep *inodedep; 11488 11489 /* 11490 * Check for alternate nlink count. 
11491 */ 11492 ip->i_effnlink = ip->i_nlink; 11493 ACQUIRE_LOCK(&lk); 11494 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 11495 &inodedep) == 0) { 11496 FREE_LOCK(&lk); 11497 return; 11498 } 11499 ip->i_effnlink -= inodedep->id_nlinkdelta; 11500 FREE_LOCK(&lk); 11501} 11502 11503/* 11504 * This routine is called just before the "in-core" inode 11505 * information is to be copied to the in-memory inode block. 11506 * Recall that an inode block contains several inodes. If 11507 * the force flag is set, then the dependencies will be 11508 * cleared so that the update can always be made. Note that 11509 * the buffer is locked when this routine is called, so we 11510 * will never be in the middle of writing the inode block 11511 * to disk. 11512 */ 11513void 11514softdep_update_inodeblock(ip, bp, waitfor) 11515 struct inode *ip; /* the "in_core" copy of the inode */ 11516 struct buf *bp; /* the buffer containing the inode block */ 11517 int waitfor; /* nonzero => update must be allowed */ 11518{ 11519 struct inodedep *inodedep; 11520 struct inoref *inoref; 11521 struct worklist *wk; 11522 struct mount *mp; 11523 struct buf *ibp; 11524 struct fs *fs; 11525 int error; 11526 11527 mp = UFSTOVFS(ip->i_ump); 11528 fs = ip->i_fs; 11529 /* 11530 * Preserve the freelink that is on disk. clear_unlinked_inodedep() 11531 * does not have access to the in-core ip so must write directly into 11532 * the inode block buffer when setting freelink. 11533 */ 11534 if (fs->fs_magic == FS_UFS1_MAGIC) 11535 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + 11536 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11537 else 11538 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + 11539 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11540 /* 11541 * If the effective link count is not equal to the actual link 11542 * count, then we must track the difference in an inodedep while 11543 * the inode is (potentially) tossed out of the cache. 
Otherwise, 11544 * if there is no existing inodedep, then there are no dependencies 11545 * to track. 11546 */ 11547 ACQUIRE_LOCK(&lk); 11548again: 11549 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11550 FREE_LOCK(&lk); 11551 if (ip->i_effnlink != ip->i_nlink) 11552 panic("softdep_update_inodeblock: bad link count"); 11553 return; 11554 } 11555 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 11556 panic("softdep_update_inodeblock: bad delta"); 11557 /* 11558 * If we're flushing all dependencies we must also move any waiting 11559 * for journal writes onto the bufwait list prior to I/O. 11560 */ 11561 if (waitfor) { 11562 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11563 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11564 == DEPCOMPLETE) { 11565 jwait(&inoref->if_list, MNT_WAIT); 11566 goto again; 11567 } 11568 } 11569 } 11570 /* 11571 * Changes have been initiated. Anything depending on these 11572 * changes cannot occur until this inode has been written. 11573 */ 11574 inodedep->id_state &= ~COMPLETE; 11575 if ((inodedep->id_state & ONWORKLIST) == 0) 11576 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 11577 /* 11578 * Any new dependencies associated with the incore inode must 11579 * now be moved to the list associated with the buffer holding 11580 * the in-memory copy of the inode. Once merged process any 11581 * allocdirects that are completed by the merger. 
11582 */ 11583 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); 11584 if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) 11585 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), 11586 NULL); 11587 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); 11588 if (!TAILQ_EMPTY(&inodedep->id_extupdt)) 11589 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), 11590 NULL); 11591 /* 11592 * Now that the inode has been pushed into the buffer, the 11593 * operations dependent on the inode being written to disk 11594 * can be moved to the id_bufwait so that they will be 11595 * processed when the buffer I/O completes. 11596 */ 11597 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 11598 WORKLIST_REMOVE(wk); 11599 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 11600 } 11601 /* 11602 * Newly allocated inodes cannot be written until the bitmap 11603 * that allocates them have been written (indicated by 11604 * DEPCOMPLETE being set in id_state). If we are doing a 11605 * forced sync (e.g., an fsync on a file), we force the bitmap 11606 * to be written so that the update can be done. 11607 */ 11608 if (waitfor == 0) { 11609 FREE_LOCK(&lk); 11610 return; 11611 } 11612retry: 11613 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { 11614 FREE_LOCK(&lk); 11615 return; 11616 } 11617 ibp = inodedep->id_bmsafemap->sm_buf; 11618 ibp = getdirtybuf(ibp, &lk, MNT_WAIT); 11619 if (ibp == NULL) { 11620 /* 11621 * If ibp came back as NULL, the dependency could have been 11622 * freed while we slept. Look it up again, and check to see 11623 * that it has completed. 
11624 */ 11625 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 11626 goto retry; 11627 FREE_LOCK(&lk); 11628 return; 11629 } 11630 FREE_LOCK(&lk); 11631 if ((error = bwrite(ibp)) != 0) 11632 softdep_error("softdep_update_inodeblock: bwrite", error); 11633} 11634 11635/* 11636 * Merge the a new inode dependency list (such as id_newinoupdt) into an 11637 * old inode dependency list (such as id_inoupdt). This routine must be 11638 * called with splbio interrupts blocked. 11639 */ 11640static void 11641merge_inode_lists(newlisthead, oldlisthead) 11642 struct allocdirectlst *newlisthead; 11643 struct allocdirectlst *oldlisthead; 11644{ 11645 struct allocdirect *listadp, *newadp; 11646 11647 newadp = TAILQ_FIRST(newlisthead); 11648 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 11649 if (listadp->ad_offset < newadp->ad_offset) { 11650 listadp = TAILQ_NEXT(listadp, ad_next); 11651 continue; 11652 } 11653 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11654 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 11655 if (listadp->ad_offset == newadp->ad_offset) { 11656 allocdirect_merge(oldlisthead, newadp, 11657 listadp); 11658 listadp = newadp; 11659 } 11660 newadp = TAILQ_FIRST(newlisthead); 11661 } 11662 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 11663 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11664 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 11665 } 11666} 11667 11668/* 11669 * If we are doing an fsync, then we must ensure that any directory 11670 * entries for the inode have been written after the inode gets to disk. 
11671 */ 11672int 11673softdep_fsync(vp) 11674 struct vnode *vp; /* the "in_core" copy of the inode */ 11675{ 11676 struct inodedep *inodedep; 11677 struct pagedep *pagedep; 11678 struct inoref *inoref; 11679 struct worklist *wk; 11680 struct diradd *dap; 11681 struct mount *mp; 11682 struct vnode *pvp; 11683 struct inode *ip; 11684 struct buf *bp; 11685 struct fs *fs; 11686 struct thread *td = curthread; 11687 int error, flushparent, pagedep_new_block; 11688 ino_t parentino; 11689 ufs_lbn_t lbn; 11690 11691 ip = VTOI(vp); 11692 fs = ip->i_fs; 11693 mp = vp->v_mount; 11694 ACQUIRE_LOCK(&lk); 11695restart: 11696 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11697 FREE_LOCK(&lk); 11698 return (0); 11699 } 11700 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11701 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11702 == DEPCOMPLETE) { 11703 jwait(&inoref->if_list, MNT_WAIT); 11704 goto restart; 11705 } 11706 } 11707 if (!LIST_EMPTY(&inodedep->id_inowait) || 11708 !TAILQ_EMPTY(&inodedep->id_extupdt) || 11709 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 11710 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 11711 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 11712 panic("softdep_fsync: pending ops %p", inodedep); 11713 for (error = 0, flushparent = 0; ; ) { 11714 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 11715 break; 11716 if (wk->wk_type != D_DIRADD) 11717 panic("softdep_fsync: Unexpected type %s", 11718 TYPENAME(wk->wk_type)); 11719 dap = WK_DIRADD(wk); 11720 /* 11721 * Flush our parent if this directory entry has a MKDIR_PARENT 11722 * dependency or is contained in a newly allocated block. 
11723 */ 11724 if (dap->da_state & DIRCHG) 11725 pagedep = dap->da_previous->dm_pagedep; 11726 else 11727 pagedep = dap->da_pagedep; 11728 parentino = pagedep->pd_ino; 11729 lbn = pagedep->pd_lbn; 11730 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 11731 panic("softdep_fsync: dirty"); 11732 if ((dap->da_state & MKDIR_PARENT) || 11733 (pagedep->pd_state & NEWBLOCK)) 11734 flushparent = 1; 11735 else 11736 flushparent = 0; 11737 /* 11738 * If we are being fsync'ed as part of vgone'ing this vnode, 11739 * then we will not be able to release and recover the 11740 * vnode below, so we just have to give up on writing its 11741 * directory entry out. It will eventually be written, just 11742 * not now, but then the user was not asking to have it 11743 * written, so we are not breaking any promises. 11744 */ 11745 if (vp->v_iflag & VI_DOOMED) 11746 break; 11747 /* 11748 * We prevent deadlock by always fetching inodes from the 11749 * root, moving down the directory tree. Thus, when fetching 11750 * our parent directory, we first try to get the lock. If 11751 * that fails, we must unlock ourselves before requesting 11752 * the lock on our parent. See the comment in ufs_lookup 11753 * for details on possible races. 
11754 */ 11755 FREE_LOCK(&lk); 11756 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 11757 FFSV_FORCEINSMQ)) { 11758 error = vfs_busy(mp, MBF_NOWAIT); 11759 if (error != 0) { 11760 vfs_ref(mp); 11761 VOP_UNLOCK(vp, 0); 11762 error = vfs_busy(mp, 0); 11763 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11764 vfs_rel(mp); 11765 if (error != 0) 11766 return (ENOENT); 11767 if (vp->v_iflag & VI_DOOMED) { 11768 vfs_unbusy(mp); 11769 return (ENOENT); 11770 } 11771 } 11772 VOP_UNLOCK(vp, 0); 11773 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 11774 &pvp, FFSV_FORCEINSMQ); 11775 vfs_unbusy(mp); 11776 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11777 if (vp->v_iflag & VI_DOOMED) { 11778 if (error == 0) 11779 vput(pvp); 11780 error = ENOENT; 11781 } 11782 if (error != 0) 11783 return (error); 11784 } 11785 /* 11786 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 11787 * that are contained in direct blocks will be resolved by 11788 * doing a ffs_update. Pagedeps contained in indirect blocks 11789 * may require a complete sync'ing of the directory. So, we 11790 * try the cheap and fast ffs_update first, and if that fails, 11791 * then we do the slower ffs_syncvnode of the directory. 
11792 */ 11793 if (flushparent) { 11794 int locked; 11795 11796 if ((error = ffs_update(pvp, 1)) != 0) { 11797 vput(pvp); 11798 return (error); 11799 } 11800 ACQUIRE_LOCK(&lk); 11801 locked = 1; 11802 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 11803 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 11804 if (wk->wk_type != D_DIRADD) 11805 panic("softdep_fsync: Unexpected type %s", 11806 TYPENAME(wk->wk_type)); 11807 dap = WK_DIRADD(wk); 11808 if (dap->da_state & DIRCHG) 11809 pagedep = dap->da_previous->dm_pagedep; 11810 else 11811 pagedep = dap->da_pagedep; 11812 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 11813 FREE_LOCK(&lk); 11814 locked = 0; 11815 if (pagedep_new_block && 11816 (error = ffs_syncvnode(pvp, MNT_WAIT))) { 11817 vput(pvp); 11818 return (error); 11819 } 11820 } 11821 } 11822 if (locked) 11823 FREE_LOCK(&lk); 11824 } 11825 /* 11826 * Flush directory page containing the inode's name. 11827 */ 11828 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 11829 &bp); 11830 if (error == 0) 11831 error = bwrite(bp); 11832 else 11833 brelse(bp); 11834 vput(pvp); 11835 if (error != 0) 11836 return (error); 11837 ACQUIRE_LOCK(&lk); 11838 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 11839 break; 11840 } 11841 FREE_LOCK(&lk); 11842 return (0); 11843} 11844 11845/* 11846 * Flush all the dirty bitmaps associated with the block device 11847 * before flushing the rest of the dirty blocks so as to reduce 11848 * the number of dependencies that will have to be rolled back. 11849 * 11850 * XXX Unused? 
11851 */ 11852void 11853softdep_fsync_mountdev(vp) 11854 struct vnode *vp; 11855{ 11856 struct buf *bp, *nbp; 11857 struct worklist *wk; 11858 struct bufobj *bo; 11859 11860 if (!vn_isdisk(vp, NULL)) 11861 panic("softdep_fsync_mountdev: vnode not a disk"); 11862 bo = &vp->v_bufobj; 11863restart: 11864 BO_LOCK(bo); 11865 ACQUIRE_LOCK(&lk); 11866 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 11867 /* 11868 * If it is already scheduled, skip to the next buffer. 11869 */ 11870 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 11871 continue; 11872 11873 if ((bp->b_flags & B_DELWRI) == 0) 11874 panic("softdep_fsync_mountdev: not dirty"); 11875 /* 11876 * We are only interested in bitmaps with outstanding 11877 * dependencies. 11878 */ 11879 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 11880 wk->wk_type != D_BMSAFEMAP || 11881 (bp->b_vflags & BV_BKGRDINPROG)) { 11882 BUF_UNLOCK(bp); 11883 continue; 11884 } 11885 FREE_LOCK(&lk); 11886 BO_UNLOCK(bo); 11887 bremfree(bp); 11888 (void) bawrite(bp); 11889 goto restart; 11890 } 11891 FREE_LOCK(&lk); 11892 drain_output(vp); 11893 BO_UNLOCK(bo); 11894} 11895 11896/* 11897 * Sync all cylinder groups that were dirty at the time this function is 11898 * called. Newly dirtied cgs will be inserted before the sintenel. This 11899 * is used to flush freedep activity that may be holding up writes to a 11900 * indirect block. 
11901 */ 11902static int 11903sync_cgs(mp, waitfor) 11904 struct mount *mp; 11905 int waitfor; 11906{ 11907 struct bmsafemap *bmsafemap; 11908 struct bmsafemap *sintenel; 11909 struct ufsmount *ump; 11910 struct buf *bp; 11911 int error; 11912 11913 sintenel = malloc(sizeof(*sintenel), M_BMSAFEMAP, M_ZERO | M_WAITOK); 11914 sintenel->sm_cg = -1; 11915 ump = VFSTOUFS(mp); 11916 error = 0; 11917 ACQUIRE_LOCK(&lk); 11918 LIST_INSERT_HEAD(&ump->softdep_dirtycg, sintenel, sm_next); 11919 for (bmsafemap = LIST_NEXT(sintenel, sm_next); bmsafemap != NULL; 11920 bmsafemap = LIST_NEXT(sintenel, sm_next)) { 11921 /* Skip sintenels and cgs with no work to release. */ 11922 if (bmsafemap->sm_cg == -1 || 11923 (LIST_EMPTY(&bmsafemap->sm_freehd) && 11924 LIST_EMPTY(&bmsafemap->sm_freewr))) { 11925 LIST_REMOVE(sintenel, sm_next); 11926 LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next); 11927 continue; 11928 } 11929 /* 11930 * If we don't get the lock and we're waiting try again, if 11931 * not move on to the next buf and try to sync it. 11932 */ 11933 bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor); 11934 if (bp == NULL && waitfor == MNT_WAIT) 11935 continue; 11936 LIST_REMOVE(sintenel, sm_next); 11937 LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next); 11938 if (bp == NULL) 11939 continue; 11940 FREE_LOCK(&lk); 11941 if (waitfor == MNT_NOWAIT) 11942 bawrite(bp); 11943 else 11944 error = bwrite(bp); 11945 ACQUIRE_LOCK(&lk); 11946 if (error) 11947 break; 11948 } 11949 LIST_REMOVE(sintenel, sm_next); 11950 FREE_LOCK(&lk); 11951 free(sintenel, M_BMSAFEMAP); 11952 return (error); 11953} 11954 11955/* 11956 * This routine is called when we are trying to synchronously flush a 11957 * file. This routine must eliminate any filesystem metadata dependencies 11958 * so that the syncing routine can succeed. 
 */
int
softdep_sync_metadata(struct vnode *vp)
{
	int error;

	/*
	 * Ensure that any direct block dependencies have been cleared,
	 * truncations are started, and inode references are journaled.
	 */
	ACQUIRE_LOCK(&lk);
	/*
	 * Write all journal records to prevent rollbacks on devvp.
	 */
	if (vp->v_type == VCHR)
		softdep_flushjournal(vp->v_mount);
	error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number);
	/*
	 * Ensure that all truncates are written so we won't find deps on
	 * indirect blocks.
	 */
	process_truncates(vp);
	FREE_LOCK(&lk);

	return (error);
}

/*
 * This routine is called when we are attempting to sync a buf with
 * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any
 * other IO it can but returns EBUSY if the buffer is not yet able to
 * be written. Dependencies which will not cause rollbacks will always
 * return 0.
 *
 * Called with bp locked; returns with lk released on all paths.
 */
int
softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
{
	struct indirdep *indirdep;
	struct pagedep *pagedep;
	struct allocindir *aip;
	struct newblk *newblk;
	struct buf *nbp;
	struct worklist *wk;
	int i, error;

	/*
	 * For VCHR we just don't want to force flush any dependencies that
	 * will cause rollbacks.
	 */
	if (vp->v_type == VCHR) {
		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
			return (EBUSY);
		return (0);
	}
	ACQUIRE_LOCK(&lk);
	/*
	 * As we hold the buffer locked, none of its dependencies
	 * will disappear.
	 */
	error = 0;
top:
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk != NULL) {
				if (waitfor == MNT_NOWAIT) {
					error = EBUSY;
					goto out_unlock;
				}
				/*
				 * jwait() may sleep and drop lk, so the
				 * dependency list must be rescanned.
				 */
				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
				goto top;
			}
			if (newblk->nb_state & DEPCOMPLETE ||
			    waitfor == MNT_NOWAIT)
				continue;
			nbp = newblk->nb_bmsafemap->sm_buf;
			nbp = getdirtybuf(nbp, &lk, waitfor);
			if (nbp == NULL)
				goto top;
			FREE_LOCK(&lk);
			if ((error = bwrite(nbp)) != 0)
				goto out;
			ACQUIRE_LOCK(&lk);
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			if (waitfor == MNT_NOWAIT) {
				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
					error = EBUSY;
					goto out_unlock;
				}
			}
			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
				panic("softdep_sync_buf: truncation pending.");
		restart:
			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				newblk = (struct newblk *)aip;
				if (newblk->nb_jnewblk != NULL) {
					jwait(&newblk->nb_jnewblk->jn_list,
					    waitfor);
					goto restart;
				}
				if (newblk->nb_state & DEPCOMPLETE)
					continue;
				nbp = newblk->nb_bmsafemap->sm_buf;
				nbp = getdirtybuf(nbp, &lk, waitfor);
				if (nbp == NULL)
					goto restart;
				FREE_LOCK(&lk);
				if ((error = bwrite(nbp)) != 0)
					goto out;
				ACQUIRE_LOCK(&lk);
				goto restart;
			}
			continue;

		case D_PAGEDEP:
			/*
			 * Only flush directory entries in synchronous passes.
			 */
			if (waitfor != MNT_WAIT) {
				error = EBUSY;
				goto out_unlock;
			}
			/*
			 * While syncing snapshots, we must allow recursive
			 * lookups.
			 */
			BUF_AREC(bp);
			/*
			 * We are trying to sync a directory that may
			 * have dependencies on both its own metadata
			 * and/or dependencies on the inodes of any
			 * recently allocated files. We walk its diradd
			 * lists pushing out the associated inode.
			 */
			pagedep = WK_PAGEDEP(wk);
			for (i = 0; i < DAHASHSZ; i++) {
				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
					continue;
				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
				    &pagedep->pd_diraddhd[i]))) {
					BUF_NOREC(bp);
					goto out_unlock;
				}
			}
			BUF_NOREC(bp);
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JNEWBLK:
			/* These never cause rollbacks on this buffer. */
			continue;

		default:
			panic("softdep_sync_buf: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out_unlock:
	FREE_LOCK(&lk);
out:
	return (error);
}

/*
 * Flush the dependencies associated with an inodedep.
 * Called with splbio blocked.
 */
static int
flush_inodedep_deps(vp, mp, ino)
	struct vnode *vp;
	struct mount *mp;
	ino_t ino;
{
	struct inodedep *inodedep;
	struct inoref *inoref;
	int error, waitfor;

	/*
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 * We give a brief window at the top of the loop to allow
	 * any pending I/O to complete.
	 */
	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
		if (error)
			return (error);
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
restart:
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			return (0);
		/*
		 * Wait out pending journal references first so they do
		 * not force rollbacks; jwait() drops lk, hence restart.
		 */
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
			    == DEPCOMPLETE) {
				jwait(&inoref->if_list, MNT_WAIT);
				goto restart;
			}
		}
		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
			continue;
		/*
		 * If this was the MNT_WAIT pass we are done; otherwise
		 * escalate to the second, blocking (MNT_WAIT) pass.
		 */
		if (waitfor == MNT_WAIT)
			break;
		waitfor = MNT_WAIT;
	}
	/*
	 * Try freeing inodedep in case all dependencies have been removed.
	 */
	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	return (0);
}

/*
 * Flush an inode dependency list.
 * Called with splbio blocked.
 * Returns 1 if it processed an entry (possibly dropping and reacquiring
 * lk along the way), 0 if the list held no work.
 */
static int
flush_deplist(listhead, waitfor, errorp)
	struct allocdirectlst *listhead;
	int waitfor;
	int *errorp;
{
	struct allocdirect *adp;
	struct newblk *newblk;
	struct buf *bp;

	mtx_assert(&lk, MA_OWNED);
	TAILQ_FOREACH(adp, listhead, ad_next) {
		newblk = (struct newblk *)adp;
		if (newblk->nb_jnewblk != NULL) {
			/* Journal record must be written first. */
			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
			return (1);
		}
		if (newblk->nb_state & DEPCOMPLETE)
			continue;
		bp = newblk->nb_bmsafemap->sm_buf;
		bp = getdirtybuf(bp, &lk, waitfor);
		if (bp == NULL) {
			if (waitfor == MNT_NOWAIT)
				continue;
			return (1);
		}
		FREE_LOCK(&lk);
		if (waitfor == MNT_NOWAIT)
			bawrite(bp);
		else
			*errorp = bwrite(bp);
		ACQUIRE_LOCK(&lk);
		return (1);
	}
	return (0);
}

/*
 * Flush dependencies associated with an allocdirect block.
 */
static int
flush_newblk_dep(vp, mp, lbn)
	struct vnode *vp;
	struct mount *mp;
	ufs_lbn_t lbn;
{
	struct newblk *newblk;
	struct bufobj *bo;
	struct inode *ip;
	struct buf *bp;
	ufs2_daddr_t blkno;
	int error;

	error = 0;
	bo = &vp->v_bufobj;
	ip = VTOI(vp);
	blkno = DIP(ip, i_db[lbn]);
	if (blkno == 0)
		panic("flush_newblk_dep: Missing block");
	ACQUIRE_LOCK(&lk);
	/*
	 * Loop until all dependencies related to this block are satisfied.
	 * We must be careful to restart after each sleep in case a write
	 * completes some part of this process for us.
	 */
	for (;;) {
		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
			FREE_LOCK(&lk);
			break;
		}
		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
			panic("flush_newblk_deps: Bad newblk %p", newblk);
		/*
		 * Flush the journal.
		 */
		if (newblk->nb_jnewblk != NULL) {
			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
			continue;
		}
		/*
		 * Write the bitmap dependency.
		 */
		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
			bp = newblk->nb_bmsafemap->sm_buf;
			bp = getdirtybuf(bp, &lk, MNT_WAIT);
			if (bp == NULL)
				continue;
			FREE_LOCK(&lk);
			error = bwrite(bp);
			if (error)
				break;
			ACQUIRE_LOCK(&lk);
			continue;
		}
		/*
		 * Write the buffer.
		 */
		FREE_LOCK(&lk);
		BO_LOCK(bo);
		bp = gbincore(bo, lbn);
		if (bp != NULL) {
			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
			    LK_INTERLOCK, BO_MTX(bo));
			if (error == ENOLCK) {
				ACQUIRE_LOCK(&lk);
				continue; /* Slept, retry */
			}
			if (error != 0)
				break;	/* Failed */
			if (bp->b_flags & B_DELWRI) {
				bremfree(bp);
				error = bwrite(bp);
				if (error)
					break;
			} else
				BUF_UNLOCK(bp);
		} else
			BO_UNLOCK(bo);
		/*
		 * We have to wait for the direct pointers to
		 * point at the newdirblk before the dependency
		 * will go away.
		 */
		error = ffs_update(vp, MNT_WAIT);
		if (error)
			break;
		ACQUIRE_LOCK(&lk);
	}
	/* NOTE: lk is not held on return; all exits above release it. */
	return (error);
}

/*
 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 * Called with splbio blocked.
 */
static int
flush_pagedep_deps(pvp, mp, diraddhdp)
	struct vnode *pvp;
	struct mount *mp;
	struct diraddhd *diraddhdp;
{
	struct inodedep *inodedep;
	struct inoref *inoref;
	struct ufsmount *ump;
	struct diradd *dap;
	struct vnode *vp;
	int error = 0;
	struct buf *bp;
	ino_t inum;

	ump = VFSTOUFS(mp);
restart:
	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
		/*
		 * Flush ourselves if this directory entry
		 * has a MKDIR_PARENT dependency.
		 */
		if (dap->da_state & MKDIR_PARENT) {
			FREE_LOCK(&lk);
			if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
				break;
			ACQUIRE_LOCK(&lk);
			/*
			 * If that cleared dependencies, go on to next.
			 */
			if (dap != LIST_FIRST(diraddhdp))
				continue;
			if (dap->da_state & MKDIR_PARENT)
				panic("flush_pagedep_deps: MKDIR_PARENT");
		}
		/*
		 * A newly allocated directory must have its "." and
		 * ".." entries written out before its name can be
		 * committed in its parent.
		 */
		inum = dap->da_newinum;
		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
			panic("flush_pagedep_deps: lost inode1");
		/*
		 * Wait for any pending journal adds to complete so we don't
		 * cause rollbacks while syncing.
		 */
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
			    == DEPCOMPLETE) {
				jwait(&inoref->if_list, MNT_WAIT);
				goto restart;
			}
		}
		if (dap->da_state & MKDIR_BODY) {
			FREE_LOCK(&lk);
			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ)))
				break;
			error = flush_newblk_dep(vp, mp, 0);
			/*
			 * If we still have the dependency we might need to
			 * update the vnode to sync the new link count to
			 * disk.
			 */
			if (error == 0 && dap == LIST_FIRST(diraddhdp))
				error = ffs_update(vp, MNT_WAIT);
			vput(vp);
			if (error != 0)
				break;
			ACQUIRE_LOCK(&lk);
			/*
			 * If that cleared dependencies, go on to next.
			 */
			if (dap != LIST_FIRST(diraddhdp))
				continue;
			if (dap->da_state & MKDIR_BODY) {
				inodedep_lookup(UFSTOVFS(ump), inum, 0,
				    &inodedep);
				panic("flush_pagedep_deps: MKDIR_BODY "
				    "inodedep %p dap %p vp %p",
				    inodedep, dap, vp);
			}
		}
		/*
		 * Flush the inode on which the directory entry depends.
		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
		 * the only remaining dependency is that the updated inode
		 * count must get pushed to disk. The inode has already
		 * been pushed into its inode buffer (via VOP_UPDATE) at
		 * the time of the reference count change. So we need only
		 * locate that buffer, ensure that there will be no rollback
		 * caused by a bitmap dependency, then write the inode buffer.
		 */
retry:
		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
			panic("flush_pagedep_deps: lost inode");
		/*
		 * If the inode still has bitmap dependencies,
		 * push them to disk.
		 */
		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
			bp = inodedep->id_bmsafemap->sm_buf;
			bp = getdirtybuf(bp, &lk, MNT_WAIT);
			if (bp == NULL)
				goto retry;
			FREE_LOCK(&lk);
			if ((error = bwrite(bp)) != 0)
				break;
			ACQUIRE_LOCK(&lk);
			if (dap != LIST_FIRST(diraddhdp))
				continue;
		}
		/*
		 * If the inode is still sitting in a buffer waiting
		 * to be written or waiting for the link count to be
		 * adjusted update it here to flush it to disk.
		 */
		if (dap == LIST_FIRST(diraddhdp)) {
			FREE_LOCK(&lk);
			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ)))
				break;
			error = ffs_update(vp, MNT_WAIT);
			vput(vp);
			if (error)
				break;
			ACQUIRE_LOCK(&lk);
		}
		/*
		 * If we have failed to get rid of all the dependencies
		 * then something is seriously wrong.
		 */
		if (dap == LIST_FIRST(diraddhdp)) {
			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
			panic("flush_pagedep_deps: failed to flush " 
			    "inodedep %p ino %d dap %p", inodedep, inum, dap);
		}
	}
	/*
	 * Every break above exits with lk dropped; reacquire it so the
	 * caller always sees lk held on return.
	 */
	if (error)
		ACQUIRE_LOCK(&lk);
	return (error);
}

/*
 * A large burst of file addition or deletion activity can drive the
 * memory load excessively high. First attempt to slow things down
 * using the techniques below. If that fails, this routine requests
 * the offending operations to fall back to running synchronously
 * until the memory load returns to a reasonable level.
 */
int
softdep_slowdown(vp)
	struct vnode *vp;
{
	struct ufsmount *ump;
	int jlow;
	int max_softdeps_hard;

	ACQUIRE_LOCK(&lk);
	jlow = 0;
	/*
	 * Check for journal space if needed.
	 */
	if (DOINGSUJ(vp)) {
		ump = VFSTOUFS(vp->v_mount);
		if (journal_space(ump, 0) == 0)
			jlow = 1;
	}
	/* Allow 10% headroom over max_softdeps before throttling. */
	max_softdeps_hard = max_softdeps * 11 / 10;
	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
	    dep_current[D_INODEDEP] < max_softdeps_hard &&
	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
		FREE_LOCK(&lk);
		return (0);
	}
	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
		softdep_speedup();
	stat_sync_limit_hit += 1;
	FREE_LOCK(&lk);
	/* With journaling the journal itself throttles; don't slow down. */
	if (DOINGSUJ(vp))
		return (0);
	return (1);
}

/*
 * Called by the allocation routines when they are about to fail
 * in the hope that we can free up the requested resource (inodes
 * or disk space).
 *
 * First check to see if the work list has anything on it. If it has,
 * clean up entries until we successfully free the requested resource.
 * Because this process holds inodes locked, we cannot handle any remove
 * requests that might block on a locked inode as that could lead to
 * deadlock. If the worklist yields none of the requested resource,
 * start syncing out vnodes to free up the needed space.
 */
int
softdep_request_cleanup(fs, vp, cred, resource)
	struct fs *fs;
	struct vnode *vp;
	struct ucred *cred;
	int resource;
{
	struct ufsmount *ump;
	struct mount *mp;
	struct vnode *lvp, *mvp;
	long starttime;
	ufs2_daddr_t needed;
	int error;

	mp = vp->v_mount;
	ump = VFSTOUFS(mp);
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	if (resource == FLUSH_BLOCKS_WAIT)
		stat_cleanup_blkrequests += 1;
	else
		stat_cleanup_inorequests += 1;

	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to process any
	 * worklist items as we will recurse into the copyonwrite
	 * routine.  This will result in an incoherent snapshot.
	 */
	if (curthread->td_pflags & TDP_COWINPROGRESS)
		return (0);
	UFS_UNLOCK(ump);
	error = ffs_update(vp, 1);
	if (error != 0) {
		UFS_LOCK(ump);
		return (0);
	}
	/*
	 * If we are in need of resources, consider pausing for
	 * tickdelay to give ourselves some breathing room.
	 */
	ACQUIRE_LOCK(&lk);
	process_removes(vp);
	process_truncates(vp);
	request_cleanup(UFSTOVFS(ump), resource);
	FREE_LOCK(&lk);
	/*
	 * Now clean up at least as many resources as we will need.
	 *
	 * When requested to clean up inodes, the number that are needed
	 * is set by the number of simultaneous writers (mnt_writeopcount)
	 * plus a bit of slop (2) in case some more writers show up while
	 * we are cleaning.
	 *
	 * When requested to free up space, the amount of space that
	 * we need is enough blocks to allocate a full-sized segment
	 * (fs_contigsumsize). The number of such segments that will
	 * be needed is set by the number of simultaneous writers
	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
	 * writers show up while we are cleaning.
	 *
	 * Additionally, if we are unprivileged and allocating space,
	 * we need to ensure that we clean up enough blocks to get the
	 * needed number of blocks over the threshold of the minimum
	 * number of blocks required to be kept free by the filesystem
	 * (fs_minfree).
	 */
	if (resource == FLUSH_INODES_WAIT) {
		needed = vp->v_mount->mnt_writeopcount + 2;
	} else if (resource == FLUSH_BLOCKS_WAIT) {
		needed = (vp->v_mount->mnt_writeopcount + 2) *
		    fs->fs_contigsumsize;
		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
			needed += fragstoblks(fs,
			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
	} else {
		UFS_LOCK(ump);
		printf("softdep_request_cleanup: Unknown resource type %d\n",
		    resource);
		return (0);
	}
	starttime = time_second;
retry:
	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
	    fs->fs_cstotal.cs_nbfree <= needed) ||
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
	    fs->fs_cstotal.cs_nifree <= needed)) {
		ACQUIRE_LOCK(&lk);
		if (ump->softdep_on_worklist > 0 &&
		    process_worklist_item(UFSTOVFS(ump),
		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
			stat_worklist_push += 1;
		FREE_LOCK(&lk);
	}
	/*
	 * If we still need resources and there are no more worklist
	 * entries to process to obtain them, we have to start flushing
	 * the dirty vnodes to force the release of additional requests
	 * to the worklist that we can then process to reap additional
	 * resources. We walk the vnodes associated with the mount point
	 * until we get the needed worklist requests that we can reap.
	 */
	if ((resource == FLUSH_BLOCKS_WAIT &&
	    fs->fs_cstotal.cs_nbfree <= needed) ||
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
	    fs->fs_cstotal.cs_nifree <= needed)) {
		MNT_ILOCK(mp);
		MNT_VNODE_FOREACH(lvp, mp, mvp) {
			VI_LOCK(lvp);
			/* Skip vnodes with no dirty buffers to flush. */
			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
				VI_UNLOCK(lvp);
				continue;
			}
			MNT_IUNLOCK(mp);
			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
			    curthread)) {
				MNT_ILOCK(mp);
				continue;
			}
			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
				vput(lvp);
				MNT_ILOCK(mp);
				continue;
			}
			(void) ffs_syncvnode(lvp, MNT_NOWAIT);
			vput(lvp);
			MNT_ILOCK(mp);
		}
		MNT_IUNLOCK(mp);
		/* Also push out the device vnode itself, best-effort. */
		lvp = ump->um_devvp;
		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
			VOP_UNLOCK(lvp, 0);
		}
		if (ump->softdep_on_worklist > 0) {
			stat_cleanup_retries += 1;
			goto retry;
		}
		stat_cleanup_failures += 1;
	}
	if (time_second - starttime > stat_cleanup_high_delay)
		stat_cleanup_high_delay = time_second - starttime;
	UFS_LOCK(ump);
	return (1);
}

/*
 * If memory utilization has gotten too high, deliberately slow things
 * down and speed up the I/O processing.
 */
extern struct thread *syncertd;
static int
request_cleanup(mp, resource)
	struct mount *mp;
	int resource;
{
	struct thread *td = curthread;
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * We never hold up the filesystem syncer or buf daemon.
	 */
	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
		return (0);
	ump = VFSTOUFS(mp);
	/*
	 * First check to see if the work list has gotten backlogged.
	 * If it has, co-opt this process to help clean up two entries.
	 * Because this process may hold inodes locked, we cannot
	 * handle any remove requests that might block on a locked
	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
	 * to avoid recursively processing the worklist.
	 */
	if (ump->softdep_on_worklist > max_softdeps / 10) {
		td->td_pflags |= TDP_SOFTDEP;
		process_worklist_item(mp, 2, LK_NOWAIT);
		td->td_pflags &= ~TDP_SOFTDEP;
		stat_worklist_push += 2;
		return(1);
	}
	/*
	 * Next, we attempt to speed up the syncer process. If that
	 * is successful, then we allow the process to continue.
	 */
	if (softdep_speedup() &&
	    resource != FLUSH_BLOCKS_WAIT &&
	    resource != FLUSH_INODES_WAIT)
		return(0);
	/*
	 * If we are resource constrained on inode dependencies, try
	 * flushing some dirty inodes. Otherwise, we are constrained
	 * by file deletions, so try accelerating flushes of directories
	 * with removal dependencies. We would like to do the cleanup
	 * here, but we probably hold an inode locked at this point and
	 * that might deadlock against one that we try to clean. So,
	 * the best that we can do is request the syncer daemon to do
	 * the cleanup for us.
	 */
	switch (resource) {

	case FLUSH_INODES:
	case FLUSH_INODES_WAIT:
		stat_ino_limit_push += 1;
		req_clear_inodedeps += 1;
		stat_countp = &stat_ino_limit_hit;
		break;

	case FLUSH_BLOCKS:
	case FLUSH_BLOCKS_WAIT:
		stat_blk_limit_push += 1;
		req_clear_remove += 1;
		stat_countp = &stat_blk_limit_hit;
		break;

	default:
		panic("request_cleanup: unknown type");
	}
	/*
	 * Hopefully the syncer daemon will catch up and awaken us.
	 * We wait at most tickdelay before proceeding in any case.
	 */
	proc_waiting += 1;
	if (callout_pending(&softdep_callout) == FALSE)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);

	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
	proc_waiting -= 1;
	return (1);
}

/*
 * Awaken processes pausing in request_cleanup and clear proc_waiting
 * to indicate that there is no longer a timer running.
 */
static void
pause_timer(arg)
	void *arg;
{

	/*
	 * The callout_ API has acquired mtx and will hold it around this
	 * function call.
	 */
	*stat_countp += 1;
	wakeup_one(&proc_waiting);
	/* Rearm while anyone is still waiting to be woken. */
	if (proc_waiting > 0)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);
}

/*
 * Flush out a directory with at least one removal dependency in an effort to
 * reduce the number of dirrem, freefile, and freeblks dependency structures.
 */
static void
clear_remove(td)
	struct thread *td;
{
	struct pagedep_hashhead *pagedephd;
	struct pagedep *pagedep;
	static int next = 0;	/* rotates the starting hash bucket */
	struct mount *mp;
	struct vnode *vp;
	struct bufobj *bo;
	int error, cnt;
	ino_t ino;

	mtx_assert(&lk, MA_OWNED);

	for (cnt = 0; cnt < pagedep_hash; cnt++) {
		pagedephd = &pagedep_hashtbl[next++];
		if (next >= pagedep_hash)
			next = 0;
		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
			if (LIST_EMPTY(&pagedep->pd_dirremhd))
				continue;
			mp = pagedep->pd_list.wk_mp;
			ino = pagedep->pd_ino;
			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
				continue;
			FREE_LOCK(&lk);

			/*
			 * Let unmount clear deps
			 */
			error = vfs_busy(mp, MBF_NOWAIT);
			if (error != 0)
				goto finish_write;
			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
			     FFSV_FORCEINSMQ);
			vfs_unbusy(mp);
			if (error != 0) {
				softdep_error("clear_remove: vget", error);
				goto finish_write;
			}
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_remove: fsync", error);
			bo = &vp->v_bufobj;
			BO_LOCK(bo);
			drain_output(vp);
			BO_UNLOCK(bo);
			vput(vp);
		finish_write:
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			/* One directory flushed per call is enough. */
			return;
		}
	}
}

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 */
static void
clear_inodedeps(td)
	struct thread *td;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	static int next = 0;	/* rotates the starting hash bucket */
	struct mount *mp;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Pick a random inode dependency to be cleared.
	 * We will then gather up all the inodes in its block
	 * that have dependencies and flush them out.
	 */
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[next++];
		if (next >= inodedep_hash)
			next = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL)
		return;
	fs = inodedep->id_fs;
	mp = inodedep->id_list.wk_mp;
	/*
	 * Find the last inode in the block with dependencies.
	 */
	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			continue;
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
			continue;
		FREE_LOCK(&lk);
		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
		if (error != 0) {
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
		    FFSV_FORCEINSMQ)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			vfs_unbusy(mp);
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		vfs_unbusy(mp);
		if (ino == lastino) {
			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_inodedeps: fsync2", error);
			BO_LOCK(&vp->v_bufobj);
			drain_output(vp);
			BO_UNLOCK(&vp->v_bufobj);
		}
		vput(vp);
		vn_finished_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Move all work items on wkhd onto bp's dependency list so they are
 * processed when the buffer write completes.
 */
void
softdep_buf_append(bp, wkhd)
	struct buf *bp;
	struct workhead *wkhd;
{
	struct worklist *wk;

	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(wkhd)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&bp->b_dep, wk);
	}
	FREE_LOCK(&lk);

}

/*
 * Read the inode block holding ip and attach the work items on wkhd to
 * it.  If the read fails, the items are released via softdep_freework().
 */
void
softdep_inode_append(ip, cred, wkhd)
	struct inode *ip;
	struct ucred *cred;
	struct workhead *wkhd;
{
	struct buf *bp;
	struct fs *fs;
	int error;

	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, cred, &bp);
	if (error) {
		softdep_freework(wkhd);
		return;
	}
	softdep_buf_append(bp, wkhd);
	bqrelse(bp);
}

/*
 * Release the journal work items on wkhd via handle_jwork() under lk.
 */
void
softdep_freework(wkhd)
	struct workhead *wkhd;
{

	ACQUIRE_LOCK(&lk);
	handle_jwork(wkhd);
	FREE_LOCK(&lk);
}

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written. If wantcount
 * is set, return number of dependencies, otherwise just yes or no.
 */
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;
{
	struct worklist *wk;
	struct bmsafemap *bmsafemap;
	struct freework *freework;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct freeblks *freeblks;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct newblk *newblk;
	struct mkdir *mkdir;
	struct diradd *dap;
	int i, retval;

	retval = 0;
	ACQUIRE_LOCK(&lk);
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);

			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
				/* indirect truncation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}

			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
					/* Journal remove ref dependency. */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			for (i = 0; i < DAHASHSZ; i++) {

				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
			bmsafemap = WK_BMSAFEMAP(wk);
			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
				/* Allocate block dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
				/* Freeblk journal dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk) {
				/* Journal allocate dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_MKDIR:
			mkdir = WK_MKDIR(wk);
			if (mkdir->md_jaddref) {
				/* Journal reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JSEG:
		case D_SBDEP:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/* When !wantcount, the early goto makes retval act as a boolean. */
out:
	FREE_LOCK(&lk);
	return retval;
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with a locked mtx parameter.
 * Return acquired buffer or NULL on failure.
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;		/* buffer to lock; must be protected by mtx */
	struct mtx *mtx;	/* mutex held on entry and on every return */
	int waitfor;		/* MNT_WAIT to sleep for the buffer */
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		/* Buffer is busy; only MNT_WAIT callers may sleep for it. */
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * mtx, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		/* Restore the caller's lock before reporting failure. */
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		/*
		 * A background write of the buffer is in progress.  For a
		 * MNT_WAIT caller holding the softdep lock, drop our hold,
		 * sleep until the background write finishes, and fail so
		 * the caller retries.
		 */
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef	DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_LOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		/* Only delayed-write (dirty) buffers are of interest. */
		BUF_UNLOCK(bp);
		return (NULL);
	}
	/* Success: take the buffer off its queue and hand it back locked. */
	bremfree(bp);
	return (bp);
}


/*
 * Check if it is safe to suspend the file system now. On entry,
 * the vnode interlock for devvp should be held. Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	/*
	 * Obtain the softdep lock and the mount interlock while holding
	 * the bufobj lock.  We may not sleep for lk with the bufobj lock
	 * held, so on trylock failure drop the bufobj lock, wait for lk
	 * by acquiring and releasing it, then retry from the top.
	 * Likewise, wait out any in-progress secondary writes before
	 * proceeding with both locks held.
	 */
	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			FREE_LOCK(&lk);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_deps != 0 ||
	    ump->softdep_deps != 0 ||
	    softdep_accdeps != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	/* Return with only the mount interlock held, per the contract. */
	FREE_LOCK(&lk);
	BO_UNLOCK(bo);
	return (error);
}


/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated. These will
 * later be used to detect that softdep processing has occurred.
13271 */ 13272void 13273softdep_get_depcounts(struct mount *mp, 13274 int *softdep_depsp, 13275 int *softdep_accdepsp) 13276{ 13277 struct ufsmount *ump; 13278 13279 ump = VFSTOUFS(mp); 13280 ACQUIRE_LOCK(&lk); 13281 *softdep_depsp = ump->softdep_deps; 13282 *softdep_accdepsp = ump->softdep_accdeps; 13283 FREE_LOCK(&lk); 13284} 13285 13286/* 13287 * Wait for pending output on a vnode to complete. 13288 * Must be called with vnode lock and interlock locked. 13289 * 13290 * XXX: Should just be a call to bufobj_wwait(). 13291 */ 13292static void 13293drain_output(vp) 13294 struct vnode *vp; 13295{ 13296 struct bufobj *bo; 13297 13298 bo = &vp->v_bufobj; 13299 ASSERT_VOP_LOCKED(vp, "drain_output"); 13300 ASSERT_BO_LOCKED(bo); 13301 13302 while (bo->bo_numoutput) { 13303 bo->bo_flag |= BO_WWAIT; 13304 msleep((caddr_t)&bo->bo_numoutput, 13305 BO_MTX(bo), PRIBIO + 1, "drainvp", 0); 13306 } 13307} 13308 13309/* 13310 * Called whenever a buffer that is being invalidated or reallocated 13311 * contains dependencies. This should only happen if an I/O error has 13312 * occurred. The routine is called with the buffer locked. 13313 */ 13314static void 13315softdep_deallocate_dependencies(bp) 13316 struct buf *bp; 13317{ 13318 13319 if ((bp->b_ioflags & BIO_ERROR) == 0) 13320 panic("softdep_deallocate_dependencies: dangling deps"); 13321 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 13322 panic("softdep_deallocate_dependencies: unrecovered I/O error"); 13323} 13324 13325/* 13326 * Function to handle asynchronous write errors in the filesystem. 13327 */ 13328static void 13329softdep_error(func, error) 13330 char *func; 13331 int error; 13332{ 13333 13334 /* XXX should do something better! 
*/ 13335 printf("%s: got error %d while accessing filesystem\n", func, error); 13336} 13337 13338#ifdef DDB 13339 13340static void 13341inodedep_print(struct inodedep *inodedep, int verbose) 13342{ 13343 db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d" 13344 " saveino %p\n", 13345 inodedep, inodedep->id_fs, inodedep->id_state, 13346 (intmax_t)inodedep->id_ino, 13347 (intmax_t)fsbtodb(inodedep->id_fs, 13348 ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), 13349 inodedep->id_nlinkdelta, inodedep->id_savednlink, 13350 inodedep->id_savedino1); 13351 13352 if (verbose == 0) 13353 return; 13354 13355 db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, " 13356 "mkdiradd %p\n", 13357 LIST_FIRST(&inodedep->id_pendinghd), 13358 LIST_FIRST(&inodedep->id_bufwait), 13359 LIST_FIRST(&inodedep->id_inowait), 13360 TAILQ_FIRST(&inodedep->id_inoreflst), 13361 inodedep->id_mkdiradd); 13362 db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", 13363 TAILQ_FIRST(&inodedep->id_inoupdt), 13364 TAILQ_FIRST(&inodedep->id_newinoupdt), 13365 TAILQ_FIRST(&inodedep->id_extupdt), 13366 TAILQ_FIRST(&inodedep->id_newextupdt)); 13367} 13368 13369DB_SHOW_COMMAND(inodedep, db_show_inodedep) 13370{ 13371 13372 if (have_addr == 0) { 13373 db_printf("Address required\n"); 13374 return; 13375 } 13376 inodedep_print((struct inodedep*)addr, 1); 13377} 13378 13379DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) 13380{ 13381 struct inodedep_hashhead *inodedephd; 13382 struct inodedep *inodedep; 13383 struct fs *fs; 13384 int cnt; 13385 13386 fs = have_addr ? 
(struct fs *)addr : NULL; 13387 for (cnt = 0; cnt < inodedep_hash; cnt++) { 13388 inodedephd = &inodedep_hashtbl[cnt]; 13389 LIST_FOREACH(inodedep, inodedephd, id_hash) { 13390 if (fs != NULL && fs != inodedep->id_fs) 13391 continue; 13392 inodedep_print(inodedep, 0); 13393 } 13394 } 13395} 13396 13397DB_SHOW_COMMAND(worklist, db_show_worklist) 13398{ 13399 struct worklist *wk; 13400 13401 if (have_addr == 0) { 13402 db_printf("Address required\n"); 13403 return; 13404 } 13405 wk = (struct worklist *)addr; 13406 printf("worklist: %p type %s state 0x%X\n", 13407 wk, TYPENAME(wk->wk_type), wk->wk_state); 13408} 13409 13410DB_SHOW_COMMAND(workhead, db_show_workhead) 13411{ 13412 struct workhead *wkhd; 13413 struct worklist *wk; 13414 int i; 13415 13416 if (have_addr == 0) { 13417 db_printf("Address required\n"); 13418 return; 13419 } 13420 wkhd = (struct workhead *)addr; 13421 wk = LIST_FIRST(wkhd); 13422 for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) 13423 db_printf("worklist: %p type %s state 0x%X", 13424 wk, TYPENAME(wk->wk_type), wk->wk_state); 13425 if (i == 100) 13426 db_printf("workhead overflow"); 13427 printf("\n"); 13428} 13429 13430 13431DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) 13432{ 13433 struct jaddref *jaddref; 13434 struct diradd *diradd; 13435 struct mkdir *mkdir; 13436 13437 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { 13438 diradd = mkdir->md_diradd; 13439 db_printf("mkdir: %p state 0x%X dap %p state 0x%X", 13440 mkdir, mkdir->md_state, diradd, diradd->da_state); 13441 if ((jaddref = mkdir->md_jaddref) != NULL) 13442 db_printf(" jaddref %p jaddref state 0x%X", 13443 jaddref, jaddref->ja_state); 13444 db_printf("\n"); 13445 } 13446} 13447 13448#endif /* DDB */ 13449 13450#endif /* SOFTUPDATES */ 13451