/* ffs_softdep.c revision 207142 */
1/*- 2 * Copyright 1998, 2000 Marshall Kirk McKusick. 3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> 4 * All rights reserved. 5 * 6 * The soft updates code is derived from the appendix of a University 7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 8 * "Soft Updates: A Solution to the Metadata Update Problem in File 9 * Systems", CSE-TR-254-95, August 1995). 10 * 11 * Further information about soft updates can be obtained from: 12 * 13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 14 * 1614 Oxford Street mckusick@mckusick.com 15 * Berkeley, CA 94709-1608 +1-510-843-9542 16 * USA 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 * 39 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 40 */ 41 42#include <sys/cdefs.h> 43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 207142 2010-04-24 07:36:33Z pjd $"); 44 45#include "opt_ffs.h" 46#include "opt_ddb.h" 47 48/* 49 * For now we want the safety net that the DEBUG flag provides. 50 */ 51#ifndef DEBUG 52#define DEBUG 53#endif 54#define SUJ_DEBUG 55 56#include <sys/param.h> 57#include <sys/kernel.h> 58#include <sys/systm.h> 59#include <sys/bio.h> 60#include <sys/buf.h> 61#include <sys/kdb.h> 62#include <sys/kthread.h> 63#include <sys/lock.h> 64#include <sys/malloc.h> 65#include <sys/mount.h> 66#include <sys/mutex.h> 67#include <sys/namei.h> 68#include <sys/proc.h> 69#include <sys/stat.h> 70#include <sys/sysctl.h> 71#include <sys/syslog.h> 72#include <sys/vnode.h> 73#include <sys/conf.h> 74#include <ufs/ufs/dir.h> 75#include <ufs/ufs/extattr.h> 76#include <ufs/ufs/quota.h> 77#include <ufs/ufs/inode.h> 78#include <ufs/ufs/ufsmount.h> 79#include <ufs/ffs/fs.h> 80#include <ufs/ffs/softdep.h> 81#include <ufs/ffs/ffs_extern.h> 82#include <ufs/ufs/ufs_extern.h> 83 84#include <vm/vm.h> 85 86#include <ddb/ddb.h> 87 88#ifndef SOFTUPDATES 89 90int 91softdep_flushfiles(oldmnt, flags, td) 92 struct mount *oldmnt; 93 int flags; 94 struct thread *td; 95{ 96 97 panic("softdep_flushfiles called"); 98} 99 100int 101softdep_mount(devvp, mp, fs, cred) 102 struct vnode *devvp; 103 struct mount *mp; 104 struct fs *fs; 105 struct ucred *cred; 106{ 107 108 return (0); 109} 110 111void 112softdep_initialize() 113{ 114 115 return; 116} 117 118void 119softdep_uninitialize() 120{ 121 122 return; 123} 124 125void 126softdep_setup_inomapdep(bp, ip, newinum) 127 struct buf *bp; 128 struct inode *ip; 129 ino_t newinum; 130{ 131 132 panic("softdep_setup_inomapdep called"); 133} 134 135void 136softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 137 struct buf *bp; 138 struct mount *mp; 139 ufs2_daddr_t newblkno; 140 int frags; 141 int oldfrags; 142{ 143 144 
panic("softdep_setup_blkmapdep called"); 145} 146 147void 148softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 149 struct inode *ip; 150 ufs_lbn_t lbn; 151 ufs2_daddr_t newblkno; 152 ufs2_daddr_t oldblkno; 153 long newsize; 154 long oldsize; 155 struct buf *bp; 156{ 157 158 panic("softdep_setup_allocdirect called"); 159} 160 161void 162softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 163 struct inode *ip; 164 ufs_lbn_t lbn; 165 ufs2_daddr_t newblkno; 166 ufs2_daddr_t oldblkno; 167 long newsize; 168 long oldsize; 169 struct buf *bp; 170{ 171 172 panic("softdep_setup_allocext called"); 173} 174 175void 176softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 177 struct inode *ip; 178 ufs_lbn_t lbn; 179 struct buf *bp; 180 int ptrno; 181 ufs2_daddr_t newblkno; 182 ufs2_daddr_t oldblkno; 183 struct buf *nbp; 184{ 185 186 panic("softdep_setup_allocindir_page called"); 187} 188 189void 190softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 191 struct buf *nbp; 192 struct inode *ip; 193 struct buf *bp; 194 int ptrno; 195 ufs2_daddr_t newblkno; 196{ 197 198 panic("softdep_setup_allocindir_meta called"); 199} 200 201void 202softdep_setup_freeblocks(ip, length, flags) 203 struct inode *ip; 204 off_t length; 205 int flags; 206{ 207 208 panic("softdep_setup_freeblocks called"); 209} 210 211void 212softdep_freefile(pvp, ino, mode) 213 struct vnode *pvp; 214 ino_t ino; 215 int mode; 216{ 217 218 panic("softdep_freefile called"); 219} 220 221int 222softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 223 struct buf *bp; 224 struct inode *dp; 225 off_t diroffset; 226 ino_t newinum; 227 struct buf *newdirbp; 228 int isnewblk; 229{ 230 231 panic("softdep_setup_directory_add called"); 232} 233 234void 235softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 236 struct buf *bp; 237 struct inode *dp; 238 caddr_t base; 239 caddr_t oldloc; 240 caddr_t newloc; 
241 int entrysize; 242{ 243 244 panic("softdep_change_directoryentry_offset called"); 245} 246 247void 248softdep_setup_remove(bp, dp, ip, isrmdir) 249 struct buf *bp; 250 struct inode *dp; 251 struct inode *ip; 252 int isrmdir; 253{ 254 255 panic("softdep_setup_remove called"); 256} 257 258void 259softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 260 struct buf *bp; 261 struct inode *dp; 262 struct inode *ip; 263 ino_t newinum; 264 int isrmdir; 265{ 266 267 panic("softdep_setup_directory_change called"); 268} 269 270void 271softdep_change_linkcnt(ip) 272 struct inode *ip; 273{ 274 275 panic("softdep_change_linkcnt called"); 276} 277 278void 279softdep_load_inodeblock(ip) 280 struct inode *ip; 281{ 282 283 panic("softdep_load_inodeblock called"); 284} 285 286void 287softdep_update_inodeblock(ip, bp, waitfor) 288 struct inode *ip; 289 struct buf *bp; 290 int waitfor; 291{ 292 293 panic("softdep_update_inodeblock called"); 294} 295 296int 297softdep_fsync(vp) 298 struct vnode *vp; /* the "in_core" copy of the inode */ 299{ 300 301 return (0); 302} 303 304void 305softdep_fsync_mountdev(vp) 306 struct vnode *vp; 307{ 308 309 return; 310} 311 312int 313softdep_flushworklist(oldmnt, countp, td) 314 struct mount *oldmnt; 315 int *countp; 316 struct thread *td; 317{ 318 319 *countp = 0; 320 return (0); 321} 322 323int 324softdep_sync_metadata(struct vnode *vp) 325{ 326 327 return (0); 328} 329 330int 331softdep_slowdown(vp) 332 struct vnode *vp; 333{ 334 335 panic("softdep_slowdown called"); 336} 337 338void 339softdep_releasefile(ip) 340 struct inode *ip; /* inode with the zero effective link count */ 341{ 342 343 panic("softdep_releasefile called"); 344} 345 346int 347softdep_request_cleanup(fs, vp) 348 struct fs *fs; 349 struct vnode *vp; 350{ 351 352 return (0); 353} 354 355int 356softdep_check_suspend(struct mount *mp, 357 struct vnode *devvp, 358 int softdep_deps, 359 int softdep_accdeps, 360 int secondary_writes, 361 int secondary_accwrites) 362{ 363 
struct bufobj *bo; 364 int error; 365 366 (void) softdep_deps, 367 (void) softdep_accdeps; 368 369 bo = &devvp->v_bufobj; 370 ASSERT_BO_LOCKED(bo); 371 372 MNT_ILOCK(mp); 373 while (mp->mnt_secondary_writes != 0) { 374 BO_UNLOCK(bo); 375 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 376 (PUSER - 1) | PDROP, "secwr", 0); 377 BO_LOCK(bo); 378 MNT_ILOCK(mp); 379 } 380 381 /* 382 * Reasons for needing more work before suspend: 383 * - Dirty buffers on devvp. 384 * - Secondary writes occurred after start of vnode sync loop 385 */ 386 error = 0; 387 if (bo->bo_numoutput > 0 || 388 bo->bo_dirty.bv_cnt > 0 || 389 secondary_writes != 0 || 390 mp->mnt_secondary_writes != 0 || 391 secondary_accwrites != mp->mnt_secondary_accwrites) 392 error = EAGAIN; 393 BO_UNLOCK(bo); 394 return (error); 395} 396 397void 398softdep_get_depcounts(struct mount *mp, 399 int *softdepactivep, 400 int *softdepactiveaccp) 401{ 402 (void) mp; 403 *softdepactivep = 0; 404 *softdepactiveaccp = 0; 405} 406 407#else 408/* 409 * These definitions need to be adapted to the system to which 410 * this file is being ported. 
411 */ 412 413#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) 414 415#define D_PAGEDEP 0 416#define D_INODEDEP 1 417#define D_BMSAFEMAP 2 418#define D_NEWBLK 3 419#define D_ALLOCDIRECT 4 420#define D_INDIRDEP 5 421#define D_ALLOCINDIR 6 422#define D_FREEFRAG 7 423#define D_FREEBLKS 8 424#define D_FREEFILE 9 425#define D_DIRADD 10 426#define D_MKDIR 11 427#define D_DIRREM 12 428#define D_NEWDIRBLK 13 429#define D_FREEWORK 14 430#define D_FREEDEP 15 431#define D_JADDREF 16 432#define D_JREMREF 17 433#define D_JMVREF 18 434#define D_JNEWBLK 19 435#define D_JFREEBLK 20 436#define D_JFREEFRAG 21 437#define D_JSEG 22 438#define D_JSEGDEP 23 439#define D_SBDEP 24 440#define D_JTRUNC 25 441#define D_LAST D_JTRUNC 442 443unsigned long dep_current[D_LAST + 1]; 444unsigned long dep_total[D_LAST + 1]; 445 446 447SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); 448SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, 449 "total dependencies allocated"); 450SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, 451 "current dependencies allocated"); 452 453#define SOFTDEP_TYPE(type, str, long) \ 454 static MALLOC_DEFINE(M_ ## type, #str, long); \ 455 SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ 456 &dep_total[D_ ## type], 0, ""); \ 457 SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 458 &dep_current[D_ ## type], 0, ""); 459 460SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 461SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 462SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 463 "Block or frag allocated from cyl group map"); 464SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); 465SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 466SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 467SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 468SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an 
inode"); 469SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 470SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 471SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 472SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 473SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 474SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 475SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 476SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 477SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 478SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 479SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 480SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 481SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 482SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 483SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 484SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 485SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 486SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 487 488static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); 489static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 490 491/* 492 * translate from workitem type to memory type 493 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 494 */ 495static struct malloc_type *memtype[] = { 496 M_PAGEDEP, 497 M_INODEDEP, 498 M_BMSAFEMAP, 499 M_NEWBLK, 500 M_ALLOCDIRECT, 501 M_INDIRDEP, 502 M_ALLOCINDIR, 503 M_FREEFRAG, 504 M_FREEBLKS, 505 M_FREEFILE, 506 M_DIRADD, 507 M_MKDIR, 508 M_DIRREM, 509 M_NEWDIRBLK, 510 M_FREEWORK, 511 M_FREEDEP, 512 M_JADDREF, 513 M_JREMREF, 514 M_JMVREF, 515 M_JNEWBLK, 516 M_JFREEBLK, 517 M_JFREEFRAG, 518 M_JSEG, 519 M_JSEGDEP, 520 M_SBDEP, 521 M_JTRUNC 522}; 523 524#define DtoM(type) (memtype[type]) 525 526/* 527 * Names of malloc types. 528 */ 529#define TYPENAME(type) \ 530 ((unsigned)(type) <= D_LAST ? 
memtype[type]->ks_shortdesc : "???") 531/* 532 * End system adaptation definitions. 533 */ 534 535#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 536#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 537 538/* 539 * Forward declarations. 540 */ 541struct inodedep_hashhead; 542struct newblk_hashhead; 543struct pagedep_hashhead; 544struct bmsafemap_hashhead; 545 546/* 547 * Internal function prototypes. 548 */ 549static void softdep_error(char *, int); 550static void drain_output(struct vnode *); 551static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 552static void clear_remove(struct thread *); 553static void clear_inodedeps(struct thread *); 554static void unlinked_inodedep(struct mount *, struct inodedep *); 555static void clear_unlinked_inodedep(struct inodedep *); 556static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 557static int flush_pagedep_deps(struct vnode *, struct mount *, 558 struct diraddhd *); 559static void free_pagedep(struct pagedep *); 560static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 561static int flush_inodedep_deps(struct mount *, ino_t); 562static int flush_deplist(struct allocdirectlst *, int, int *); 563static int handle_written_filepage(struct pagedep *, struct buf *); 564static int handle_written_sbdep(struct sbdep *, struct buf *); 565static void initiate_write_sbdep(struct sbdep *); 566static void diradd_inode_written(struct diradd *, struct inodedep *); 567static int handle_written_indirdep(struct indirdep *, struct buf *, 568 struct buf**); 569static int handle_written_inodeblock(struct inodedep *, struct buf *); 570static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); 571static void handle_written_jaddref(struct jaddref *); 572static void handle_written_jremref(struct jremref *); 573static void handle_written_jseg(struct jseg *, struct buf *); 574static void handle_written_jnewblk(struct jnewblk *); 575static void 
handle_written_jfreeblk(struct jfreeblk *); 576static void handle_written_jfreefrag(struct jfreefrag *); 577static void complete_jseg(struct jseg *); 578static void jseg_write(struct fs *, struct jblocks *, struct jseg *, 579 uint8_t *); 580static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 581static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 582static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 583static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 584static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 585static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 586static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 587static inline void inoref_write(struct inoref *, struct jseg *, 588 struct jrefrec *); 589static void handle_allocdirect_partdone(struct allocdirect *, 590 struct workhead *); 591static void cancel_newblk(struct newblk *, struct workhead *); 592static void indirdep_complete(struct indirdep *); 593static void handle_allocindir_partdone(struct allocindir *); 594static void initiate_write_filepage(struct pagedep *, struct buf *); 595static void initiate_write_indirdep(struct indirdep*, struct buf *); 596static void handle_written_mkdir(struct mkdir *, int); 597static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 598static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 599static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 600static void handle_workitem_freefile(struct freefile *); 601static void handle_workitem_remove(struct dirrem *, struct vnode *); 602static struct dirrem *newdirrem(struct buf *, struct inode *, 603 struct inode *, int, struct dirrem **); 604static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *, 605 struct freeblks *); 606static void free_indirdep(struct indirdep *); 607static void free_diradd(struct diradd *, 
struct workhead *); 608static void merge_diradd(struct inodedep *, struct diradd *); 609static void complete_diradd(struct diradd *); 610static struct diradd *diradd_lookup(struct pagedep *, int); 611static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 612 struct jremref *); 613static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 614 struct jremref *); 615static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 616 struct jremref *, struct jremref *); 617static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 618 struct jremref *); 619static void cancel_allocindir(struct allocindir *, struct inodedep *, 620 struct freeblks *); 621static void complete_mkdir(struct mkdir *); 622static void free_newdirblk(struct newdirblk *); 623static void free_jremref(struct jremref *); 624static void free_jaddref(struct jaddref *); 625static void free_jsegdep(struct jsegdep *); 626static void free_jseg(struct jseg *); 627static void free_jnewblk(struct jnewblk *); 628static void free_jfreeblk(struct jfreeblk *); 629static void free_jfreefrag(struct jfreefrag *); 630static void free_freedep(struct freedep *); 631static void journal_jremref(struct dirrem *, struct jremref *, 632 struct inodedep *); 633static void cancel_jnewblk(struct jnewblk *, struct workhead *); 634static int cancel_jaddref(struct jaddref *, struct inodedep *, 635 struct workhead *); 636static void cancel_jfreefrag(struct jfreefrag *); 637static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); 638static int deallocate_dependencies(struct buf *, struct inodedep *, 639 struct freeblks *); 640static void free_newblk(struct newblk *); 641static void cancel_allocdirect(struct allocdirectlst *, 642 struct allocdirect *, struct freeblks *, int); 643static int check_inode_unwritten(struct inodedep *); 644static int free_inodedep(struct inodedep *); 645static void freework_freeblock(struct freework *); 646static void 
handle_workitem_freeblocks(struct freeblks *, int); 647static void handle_complete_freeblocks(struct freeblks *); 648static void handle_workitem_indirblk(struct freework *); 649static void handle_written_freework(struct freework *); 650static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 651static void setup_allocindir_phase2(struct buf *, struct inode *, 652 struct inodedep *, struct allocindir *, ufs_lbn_t); 653static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 654 ufs2_daddr_t, ufs_lbn_t); 655static void handle_workitem_freefrag(struct freefrag *); 656static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 657 ufs_lbn_t); 658static void allocdirect_merge(struct allocdirectlst *, 659 struct allocdirect *, struct allocdirect *); 660static struct freefrag *allocindir_merge(struct allocindir *, 661 struct allocindir *); 662static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, 663 struct bmsafemap **); 664static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 665 int cg); 666static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, 667 int, struct newblk **); 668static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 669static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 670 struct inodedep **); 671static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 672static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int, 673 struct pagedep **); 674static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 675 struct mount *mp, int, struct pagedep **); 676static void pause_timer(void *); 677static int request_cleanup(struct mount *, int); 678static int process_worklist_item(struct mount *, int); 679static void process_removes(struct vnode *); 680static void jwork_move(struct workhead *, struct workhead *); 681static void add_to_worklist(struct worklist *, int); 682static 
void remove_from_worklist(struct worklist *); 683static void softdep_flush(void); 684static int softdep_speedup(void); 685static void worklist_speedup(void); 686static int journal_mount(struct mount *, struct fs *, struct ucred *); 687static void journal_unmount(struct mount *); 688static int journal_space(struct ufsmount *, int); 689static void journal_suspend(struct ufsmount *); 690static void softdep_prelink(struct vnode *, struct vnode *); 691static void add_to_journal(struct worklist *); 692static void remove_from_journal(struct worklist *); 693static void softdep_process_journal(struct mount *, int); 694static struct jremref *newjremref(struct dirrem *, struct inode *, 695 struct inode *ip, off_t, nlink_t); 696static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 697 uint16_t); 698static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 699 uint16_t); 700static inline struct jsegdep *inoref_jseg(struct inoref *); 701static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 702static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 703 ufs2_daddr_t, int); 704static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 705 ufs2_daddr_t, long, ufs_lbn_t); 706static struct freework *newfreework(struct freeblks *, struct freework *, 707 ufs_lbn_t, ufs2_daddr_t, int, int); 708static void jwait(struct worklist *wk); 709static struct inodedep *inodedep_lookup_ip(struct inode *); 710static int bmsafemap_rollbacks(struct bmsafemap *); 711static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 712static void handle_jwork(struct workhead *); 713static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, 714 struct mkdir **); 715static struct jblocks *jblocks_create(void); 716static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 717static void jblocks_free(struct jblocks *, struct mount *, int); 718static void jblocks_destroy(struct jblocks *); 
719static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); 720 721/* 722 * Exported softdep operations. 723 */ 724static void softdep_disk_io_initiation(struct buf *); 725static void softdep_disk_write_complete(struct buf *); 726static void softdep_deallocate_dependencies(struct buf *); 727static int softdep_count_dependencies(struct buf *bp, int); 728 729static struct mtx lk; 730MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); 731 732#define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) 733#define ACQUIRE_LOCK(lk) mtx_lock(lk) 734#define FREE_LOCK(lk) mtx_unlock(lk) 735 736#define BUF_AREC(bp) ((bp)->b_lock.lock_object.lo_flags |= LO_RECURSABLE) 737#define BUF_NOREC(bp) ((bp)->b_lock.lock_object.lo_flags &= ~LO_RECURSABLE) 738 739/* 740 * Worklist queue management. 741 * These routines require that the lock be held. 742 */ 743#ifndef /* NOT */ DEBUG 744#define WORKLIST_INSERT(head, item) do { \ 745 (item)->wk_state |= ONWORKLIST; \ 746 LIST_INSERT_HEAD(head, item, wk_list); \ 747} while (0) 748#define WORKLIST_REMOVE(item) do { \ 749 (item)->wk_state &= ~ONWORKLIST; \ 750 LIST_REMOVE(item, wk_list); \ 751} while (0) 752#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT 753#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE 754 755#else /* DEBUG */ 756static void worklist_insert(struct workhead *, struct worklist *, int); 757static void worklist_remove(struct worklist *, int); 758 759#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) 760#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) 761#define WORKLIST_REMOVE(item) worklist_remove(item, 1) 762#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) 763 764static void 765worklist_insert(head, item, locked) 766 struct workhead *head; 767 struct worklist *item; 768 int locked; 769{ 770 771 if (locked) 772 mtx_assert(&lk, MA_OWNED); 773 if (item->wk_state & ONWORKLIST) 774 panic("worklist_insert: %p %s(0x%X) already on list", 775 item, TYPENAME(item->wk_type), 
item->wk_state); 776 item->wk_state |= ONWORKLIST; 777 LIST_INSERT_HEAD(head, item, wk_list); 778} 779 780static void 781worklist_remove(item, locked) 782 struct worklist *item; 783 int locked; 784{ 785 786 if (locked) 787 mtx_assert(&lk, MA_OWNED); 788 if ((item->wk_state & ONWORKLIST) == 0) 789 panic("worklist_remove: %p %s(0x%X) not on list", 790 item, TYPENAME(item->wk_type), item->wk_state); 791 item->wk_state &= ~ONWORKLIST; 792 LIST_REMOVE(item, wk_list); 793} 794#endif /* DEBUG */ 795 796/* 797 * Merge two jsegdeps keeping only the oldest one as newer references 798 * can't be discarded until after older references. 799 */ 800static inline struct jsegdep * 801jsegdep_merge(struct jsegdep *one, struct jsegdep *two) 802{ 803 struct jsegdep *swp; 804 805 if (two == NULL) 806 return (one); 807 808 if (one->jd_seg->js_seq > two->jd_seg->js_seq) { 809 swp = one; 810 one = two; 811 two = swp; 812 } 813 WORKLIST_REMOVE(&two->jd_list); 814 free_jsegdep(two); 815 816 return (one); 817} 818 819/* 820 * If two freedeps are compatible free one to reduce list size. 821 */ 822static inline struct freedep * 823freedep_merge(struct freedep *one, struct freedep *two) 824{ 825 if (two == NULL) 826 return (one); 827 828 if (one->fd_freework == two->fd_freework) { 829 WORKLIST_REMOVE(&two->fd_list); 830 free_freedep(two); 831 } 832 return (one); 833} 834 835/* 836 * Move journal work from one list to another. Duplicate freedeps and 837 * jsegdeps are coalesced to keep the lists as small as possible. 
838 */ 839static void 840jwork_move(dst, src) 841 struct workhead *dst; 842 struct workhead *src; 843{ 844 struct freedep *freedep; 845 struct jsegdep *jsegdep; 846 struct worklist *wkn; 847 struct worklist *wk; 848 849 KASSERT(dst != src, 850 ("jwork_move: dst == src")); 851 freedep = NULL; 852 jsegdep = NULL; 853 LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { 854 if (wk->wk_type == D_JSEGDEP) 855 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 856 if (wk->wk_type == D_FREEDEP) 857 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 858 } 859 860 mtx_assert(&lk, MA_OWNED); 861 while ((wk = LIST_FIRST(src)) != NULL) { 862 WORKLIST_REMOVE(wk); 863 WORKLIST_INSERT(dst, wk); 864 if (wk->wk_type == D_JSEGDEP) { 865 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 866 continue; 867 } 868 if (wk->wk_type == D_FREEDEP) 869 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 870 } 871} 872 873/* 874 * Routines for tracking and managing workitems. 875 */ 876static void workitem_free(struct worklist *, int); 877static void workitem_alloc(struct worklist *, int, struct mount *); 878 879#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type)) 880 881static void 882workitem_free(item, type) 883 struct worklist *item; 884 int type; 885{ 886 struct ufsmount *ump; 887 mtx_assert(&lk, MA_OWNED); 888 889#ifdef DEBUG 890 if (item->wk_state & ONWORKLIST) 891 panic("workitem_free: %s(0x%X) still on list", 892 TYPENAME(item->wk_type), item->wk_state); 893 if (item->wk_type != type) 894 panic("workitem_free: type mismatch %s != %s", 895 TYPENAME(item->wk_type), TYPENAME(type)); 896#endif 897 ump = VFSTOUFS(item->wk_mp); 898 if (--ump->softdep_deps == 0 && ump->softdep_req) 899 wakeup(&ump->softdep_deps); 900 dep_current[type]--; 901 free(item, DtoM(type)); 902} 903 904static void 905workitem_alloc(item, type, mp) 906 struct worklist *item; 907 int type; 908 struct mount *mp; 909{ 910 item->wk_type = type; 911 item->wk_mp = mp; 912 item->wk_state = 0; 913 
ACQUIRE_LOCK(&lk); 914 dep_current[type]++; 915 dep_total[type]++; 916 VFSTOUFS(mp)->softdep_deps++; 917 VFSTOUFS(mp)->softdep_accdeps++; 918 FREE_LOCK(&lk); 919} 920 921/* 922 * Workitem queue management 923 */ 924static int max_softdeps; /* maximum number of structs before slowdown */ 925static int maxindirdeps = 50; /* max number of indirdeps before slowdown */ 926static int tickdelay = 2; /* number of ticks to pause during slowdown */ 927static int proc_waiting; /* tracks whether we have a timeout posted */ 928static int *stat_countp; /* statistic to count in proc_waiting timeout */ 929static struct callout softdep_callout; 930static int req_pending; 931static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 932#define FLUSH_INODES 1 933static int req_clear_remove; /* syncer process flush some freeblks */ 934#define FLUSH_REMOVE 2 935#define FLUSH_REMOVE_WAIT 3 936static long num_freeblkdep; /* number of freeblks workitems allocated */ 937 938/* 939 * runtime statistics 940 */ 941static int stat_worklist_push; /* number of worklist cleanups */ 942static int stat_blk_limit_push; /* number of times block limit neared */ 943static int stat_ino_limit_push; /* number of times inode limit neared */ 944static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 945static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 946static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 947static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 948static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 949static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 950static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 951static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ 952static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ 953static int stat_journal_min; /* Times hit 
journal min threshold */ 954static int stat_journal_low; /* Times hit journal low threshold */ 955static int stat_journal_wait; /* Times blocked in jwait(). */ 956static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ 957static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ 958static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ 959static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ 960 961SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, 962 &max_softdeps, 0, ""); 963SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, 964 &tickdelay, 0, ""); 965SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, 966 &maxindirdeps, 0, ""); 967SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, 968 &stat_worklist_push, 0,""); 969SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, 970 &stat_blk_limit_push, 0,""); 971SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, 972 &stat_ino_limit_push, 0,""); 973SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, 974 &stat_blk_limit_hit, 0, ""); 975SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, 976 &stat_ino_limit_hit, 0, ""); 977SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, 978 &stat_sync_limit_hit, 0, ""); 979SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, 980 &stat_indir_blk_ptrs, 0, ""); 981SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, 982 &stat_inode_bitmap, 0, ""); 983SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, 984 &stat_direct_blk_ptrs, 0, ""); 985SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, 986 &stat_dir_entry, 0, ""); 987SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, 988 &stat_jaddref, 0, ""); 989SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, 990 &stat_jnewblk, 0, ""); 991SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, 
CTLFLAG_RW, 992 &stat_journal_low, 0, ""); 993SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, 994 &stat_journal_min, 0, ""); 995SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, 996 &stat_journal_wait, 0, ""); 997SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, 998 &stat_jwait_filepage, 0, ""); 999SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, 1000 &stat_jwait_freeblks, 0, ""); 1001SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, 1002 &stat_jwait_inode, 0, ""); 1003SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, 1004 &stat_jwait_newblk, 0, ""); 1005 1006SYSCTL_DECL(_vfs_ffs); 1007 1008LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; 1009static u_long bmsafemap_hash; /* size of hash table - 1 */ 1010 1011static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ 1012SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, 1013 &compute_summary_at_mount, 0, "Recompute summary at mount"); 1014 1015static struct proc *softdepproc; 1016static struct kproc_desc softdep_kp = { 1017 "softdepflush", 1018 softdep_flush, 1019 &softdepproc 1020}; 1021SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, 1022 &softdep_kp); 1023 1024static void 1025softdep_flush(void) 1026{ 1027 struct mount *nmp; 1028 struct mount *mp; 1029 struct ufsmount *ump; 1030 struct thread *td; 1031 int remaining; 1032 int vfslocked; 1033 1034 td = curthread; 1035 td->td_pflags |= TDP_NORUNNINGBUF; 1036 1037 for (;;) { 1038 kproc_suspend_check(softdepproc); 1039 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); 1040 ACQUIRE_LOCK(&lk); 1041 /* 1042 * If requested, try removing inode or removal dependencies. 
1043 */ 1044 if (req_clear_inodedeps) { 1045 clear_inodedeps(td); 1046 req_clear_inodedeps -= 1; 1047 wakeup_one(&proc_waiting); 1048 } 1049 if (req_clear_remove) { 1050 clear_remove(td); 1051 req_clear_remove -= 1; 1052 wakeup_one(&proc_waiting); 1053 } 1054 FREE_LOCK(&lk); 1055 VFS_UNLOCK_GIANT(vfslocked); 1056 remaining = 0; 1057 mtx_lock(&mountlist_mtx); 1058 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1059 nmp = TAILQ_NEXT(mp, mnt_list); 1060 if ((mp->mnt_flag & MNT_SOFTDEP) == 0) 1061 continue; 1062 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 1063 continue; 1064 vfslocked = VFS_LOCK_GIANT(mp); 1065 softdep_process_worklist(mp, 0); 1066 ump = VFSTOUFS(mp); 1067 remaining += ump->softdep_on_worklist - 1068 ump->softdep_on_worklist_inprogress; 1069 VFS_UNLOCK_GIANT(vfslocked); 1070 mtx_lock(&mountlist_mtx); 1071 nmp = TAILQ_NEXT(mp, mnt_list); 1072 vfs_unbusy(mp); 1073 } 1074 mtx_unlock(&mountlist_mtx); 1075 if (remaining) 1076 continue; 1077 ACQUIRE_LOCK(&lk); 1078 if (!req_pending) 1079 msleep(&req_pending, &lk, PVM, "sdflush", hz); 1080 req_pending = 0; 1081 FREE_LOCK(&lk); 1082 } 1083} 1084 1085static void 1086worklist_speedup(void) 1087{ 1088 mtx_assert(&lk, MA_OWNED); 1089 if (req_pending == 0) { 1090 req_pending = 1; 1091 wakeup(&req_pending); 1092 } 1093} 1094 1095static int 1096softdep_speedup(void) 1097{ 1098 1099 worklist_speedup(); 1100 bd_speedup(); 1101 return speedup_syncer(); 1102} 1103 1104/* 1105 * Add an item to the end of the work queue. 1106 * This routine requires that the lock be held. 1107 * This is the only routine that adds items to the list. 1108 * The following routine is the only one that removes items 1109 * and does so in order from first to last. 
1110 */ 1111static void 1112add_to_worklist(wk, nodelay) 1113 struct worklist *wk; 1114 int nodelay; 1115{ 1116 struct ufsmount *ump; 1117 1118 mtx_assert(&lk, MA_OWNED); 1119 ump = VFSTOUFS(wk->wk_mp); 1120 if (wk->wk_state & ONWORKLIST) 1121 panic("add_to_worklist: %s(0x%X) already on list", 1122 TYPENAME(wk->wk_type), wk->wk_state); 1123 wk->wk_state |= ONWORKLIST; 1124 if (LIST_EMPTY(&ump->softdep_workitem_pending)) 1125 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1126 else 1127 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); 1128 ump->softdep_worklist_tail = wk; 1129 ump->softdep_on_worklist += 1; 1130 if (nodelay) 1131 worklist_speedup(); 1132} 1133 1134/* 1135 * Remove the item to be processed. If we are removing the last 1136 * item on the list, we need to recalculate the tail pointer. 1137 */ 1138static void 1139remove_from_worklist(wk) 1140 struct worklist *wk; 1141{ 1142 struct ufsmount *ump; 1143 struct worklist *wkend; 1144 1145 ump = VFSTOUFS(wk->wk_mp); 1146 WORKLIST_REMOVE(wk); 1147 if (wk == ump->softdep_worklist_tail) { 1148 LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) 1149 if (LIST_NEXT(wkend, wk_list) == NULL) 1150 break; 1151 ump->softdep_worklist_tail = wkend; 1152 } 1153 ump->softdep_on_worklist -= 1; 1154} 1155 1156/* 1157 * Process that runs once per second to handle items in the background queue. 1158 * 1159 * Note that we ensure that everything is done in the order in which they 1160 * appear in the queue. The code below depends on this property to ensure 1161 * that blocks of a file are freed before the inode itself is freed. This 1162 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 1163 * until all the old ones have been purged from the dependency lists. 
1164 */ 1165int 1166softdep_process_worklist(mp, full) 1167 struct mount *mp; 1168 int full; 1169{ 1170 struct thread *td = curthread; 1171 int cnt, matchcnt, loopcount; 1172 struct ufsmount *ump; 1173 long starttime; 1174 1175 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); 1176 /* 1177 * Record the process identifier of our caller so that we can give 1178 * this process preferential treatment in request_cleanup below. 1179 */ 1180 matchcnt = 0; 1181 ump = VFSTOUFS(mp); 1182 ACQUIRE_LOCK(&lk); 1183 loopcount = 1; 1184 starttime = time_second; 1185 softdep_process_journal(mp, full?MNT_WAIT:0); 1186 while (ump->softdep_on_worklist > 0) { 1187 if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1) 1188 break; 1189 else 1190 matchcnt += cnt; 1191 /* 1192 * If requested, try removing inode or removal dependencies. 1193 */ 1194 if (req_clear_inodedeps) { 1195 clear_inodedeps(td); 1196 req_clear_inodedeps -= 1; 1197 wakeup_one(&proc_waiting); 1198 } 1199 if (req_clear_remove) { 1200 clear_remove(td); 1201 req_clear_remove -= 1; 1202 wakeup_one(&proc_waiting); 1203 } 1204 /* 1205 * We do not generally want to stop for buffer space, but if 1206 * we are really being a buffer hog, we will stop and wait. 1207 */ 1208 if (loopcount++ % 128 == 0) { 1209 FREE_LOCK(&lk); 1210 uio_yield(); 1211 bwillwrite(); 1212 ACQUIRE_LOCK(&lk); 1213 } 1214 /* 1215 * Never allow processing to run for more than one 1216 * second. Otherwise the other mountpoints may get 1217 * excessively backlogged. 1218 */ 1219 if (!full && starttime != time_second) 1220 break; 1221 } 1222 FREE_LOCK(&lk); 1223 return (matchcnt); 1224} 1225 1226/* 1227 * Process all removes associated with a vnode if we are running out of 1228 * journal space. Any other process which attempts to flush these will 1229 * be unable as we have the vnodes locked. 
1230 */ 1231static void 1232process_removes(vp) 1233 struct vnode *vp; 1234{ 1235 struct inodedep *inodedep; 1236 struct dirrem *dirrem; 1237 struct mount *mp; 1238 ino_t inum; 1239 1240 mtx_assert(&lk, MA_OWNED); 1241 1242 mp = vp->v_mount; 1243 inum = VTOI(vp)->i_number; 1244 for (;;) { 1245 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) 1246 return; 1247 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) 1248 if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 1249 (COMPLETE | ONWORKLIST)) 1250 break; 1251 if (dirrem == NULL) 1252 return; 1253 /* 1254 * If another thread is trying to lock this vnode it will 1255 * fail but we must wait for it to do so before we can 1256 * proceed. 1257 */ 1258 if (dirrem->dm_state & INPROGRESS) { 1259 dirrem->dm_state |= IOWAITING; 1260 msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0); 1261 continue; 1262 } 1263 remove_from_worklist(&dirrem->dm_list); 1264 FREE_LOCK(&lk); 1265 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1266 panic("process_removes: suspended filesystem"); 1267 handle_workitem_remove(dirrem, vp); 1268 vn_finished_secondary_write(mp); 1269 ACQUIRE_LOCK(&lk); 1270 } 1271} 1272 1273/* 1274 * Process one item on the worklist. 1275 */ 1276static int 1277process_worklist_item(mp, flags) 1278 struct mount *mp; 1279 int flags; 1280{ 1281 struct worklist *wk, *wkXXX; 1282 struct ufsmount *ump; 1283 struct vnode *vp; 1284 int matchcnt = 0; 1285 1286 mtx_assert(&lk, MA_OWNED); 1287 KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); 1288 /* 1289 * If we are being called because of a process doing a 1290 * copy-on-write, then it is not safe to write as we may 1291 * recurse into the copy-on-write routine. 1292 */ 1293 if (curthread->td_pflags & TDP_COWINPROGRESS) 1294 return (-1); 1295 /* 1296 * Normally we just process each item on the worklist in order. 
1297 * However, if we are in a situation where we cannot lock any 1298 * inodes, we have to skip over any dirrem requests whose 1299 * vnodes are resident and locked. 1300 */ 1301 vp = NULL; 1302 ump = VFSTOUFS(mp); 1303 LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) { 1304 if (wk->wk_state & INPROGRESS) { 1305 wkXXX = wk; 1306 continue; 1307 } 1308 wkXXX = wk; /* Record the last valid wk pointer. */ 1309 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) 1310 break; 1311 wk->wk_state |= INPROGRESS; 1312 ump->softdep_on_worklist_inprogress++; 1313 FREE_LOCK(&lk); 1314 ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum, 1315 LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); 1316 ACQUIRE_LOCK(&lk); 1317 if (wk->wk_state & IOWAITING) { 1318 wk->wk_state &= ~IOWAITING; 1319 wakeup(wk); 1320 } 1321 wk->wk_state &= ~INPROGRESS; 1322 ump->softdep_on_worklist_inprogress--; 1323 if (vp != NULL) 1324 break; 1325 } 1326 if (wk == 0) 1327 return (-1); 1328 remove_from_worklist(wk); 1329 FREE_LOCK(&lk); 1330 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1331 panic("process_worklist_item: suspended filesystem"); 1332 matchcnt++; 1333 switch (wk->wk_type) { 1334 1335 case D_DIRREM: 1336 /* removal of a directory entry */ 1337 handle_workitem_remove(WK_DIRREM(wk), vp); 1338 if (vp) 1339 vput(vp); 1340 break; 1341 1342 case D_FREEBLKS: 1343 /* releasing blocks and/or fragments from a file */ 1344 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT); 1345 break; 1346 1347 case D_FREEFRAG: 1348 /* releasing a fragment when replaced as a file grows */ 1349 handle_workitem_freefrag(WK_FREEFRAG(wk)); 1350 break; 1351 1352 case D_FREEFILE: 1353 /* releasing an inode when its link count drops to 0 */ 1354 handle_workitem_freefile(WK_FREEFILE(wk)); 1355 break; 1356 1357 case D_FREEWORK: 1358 /* Final block in an indirect was freed. 
*/ 1359 handle_workitem_indirblk(WK_FREEWORK(wk)); 1360 break; 1361 1362 default: 1363 panic("%s_process_worklist: Unknown type %s", 1364 "softdep", TYPENAME(wk->wk_type)); 1365 /* NOTREACHED */ 1366 } 1367 vn_finished_secondary_write(mp); 1368 ACQUIRE_LOCK(&lk); 1369 return (matchcnt); 1370} 1371 1372/* 1373 * Move dependencies from one buffer to another. 1374 */ 1375int 1376softdep_move_dependencies(oldbp, newbp) 1377 struct buf *oldbp; 1378 struct buf *newbp; 1379{ 1380 struct worklist *wk, *wktail; 1381 int dirty; 1382 1383 dirty = 0; 1384 wktail = NULL; 1385 ACQUIRE_LOCK(&lk); 1386 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 1387 LIST_REMOVE(wk, wk_list); 1388 if (wk->wk_type == D_BMSAFEMAP && 1389 bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) 1390 dirty = 1; 1391 if (wktail == 0) 1392 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 1393 else 1394 LIST_INSERT_AFTER(wktail, wk, wk_list); 1395 wktail = wk; 1396 } 1397 FREE_LOCK(&lk); 1398 1399 return (dirty); 1400} 1401 1402/* 1403 * Purge the work list of all items associated with a particular mount point. 1404 */ 1405int 1406softdep_flushworklist(oldmnt, countp, td) 1407 struct mount *oldmnt; 1408 int *countp; 1409 struct thread *td; 1410{ 1411 struct vnode *devvp; 1412 int count, error = 0; 1413 struct ufsmount *ump; 1414 1415 /* 1416 * Alternately flush the block device associated with the mount 1417 * point and process any dependencies that the flushing 1418 * creates. We continue until no more worklist dependencies 1419 * are found. 
1420 */ 1421 *countp = 0; 1422 ump = VFSTOUFS(oldmnt); 1423 devvp = ump->um_devvp; 1424 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 1425 *countp += count; 1426 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1427 error = VOP_FSYNC(devvp, MNT_WAIT, td); 1428 VOP_UNLOCK(devvp, 0); 1429 if (error) 1430 break; 1431 } 1432 return (error); 1433} 1434 1435int 1436softdep_waitidle(struct mount *mp) 1437{ 1438 struct ufsmount *ump; 1439 int error; 1440 int i; 1441 1442 ump = VFSTOUFS(mp); 1443 ACQUIRE_LOCK(&lk); 1444 for (i = 0; i < 10 && ump->softdep_deps; i++) { 1445 ump->softdep_req = 1; 1446 if (ump->softdep_on_worklist) 1447 panic("softdep_waitidle: work added after flush."); 1448 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); 1449 } 1450 ump->softdep_req = 0; 1451 FREE_LOCK(&lk); 1452 error = 0; 1453 if (i == 10) { 1454 error = EBUSY; 1455 printf("softdep_waitidle: Failed to flush worklist for %p\n", 1456 mp); 1457 } 1458 1459 return (error); 1460} 1461 1462/* 1463 * Flush all vnodes and worklist items associated with a specified mount point. 1464 */ 1465int 1466softdep_flushfiles(oldmnt, flags, td) 1467 struct mount *oldmnt; 1468 int flags; 1469 struct thread *td; 1470{ 1471 int error, depcount, loopcnt, retry_flush_count, retry; 1472 1473 loopcnt = 10; 1474 retry_flush_count = 3; 1475retry_flush: 1476 error = 0; 1477 1478 /* 1479 * Alternately flush the vnodes associated with the mount 1480 * point and process any dependencies that the flushing 1481 * creates. In theory, this loop can happen at most twice, 1482 * but we give it a few extra just to be sure. 1483 */ 1484 for (; loopcnt > 0; loopcnt--) { 1485 /* 1486 * Do another flush in case any vnodes were brought in 1487 * as part of the cleanup operations. 
1488 */ 1489 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 1490 break; 1491 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || 1492 depcount == 0) 1493 break; 1494 } 1495 /* 1496 * If we are unmounting then it is an error to fail. If we 1497 * are simply trying to downgrade to read-only, then filesystem 1498 * activity can keep us busy forever, so we just fail with EBUSY. 1499 */ 1500 if (loopcnt == 0) { 1501 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 1502 panic("softdep_flushfiles: looping"); 1503 error = EBUSY; 1504 } 1505 if (!error) 1506 error = softdep_waitidle(oldmnt); 1507 if (!error) { 1508 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { 1509 retry = 0; 1510 MNT_ILOCK(oldmnt); 1511 KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, 1512 ("softdep_flushfiles: !MNTK_NOINSMNTQ")); 1513 if (oldmnt->mnt_nvnodelistsize > 0) { 1514 if (--retry_flush_count > 0) { 1515 retry = 1; 1516 loopcnt = 3; 1517 } else 1518 error = EBUSY; 1519 } 1520 MNT_IUNLOCK(oldmnt); 1521 if (retry) 1522 goto retry_flush; 1523 } 1524 } 1525 return (error); 1526} 1527 1528/* 1529 * Structure hashing. 1530 * 1531 * There are three types of structures that can be looked up: 1532 * 1) pagedep structures identified by mount point, inode number, 1533 * and logical block. 1534 * 2) inodedep structures identified by mount point and inode number. 1535 * 3) newblk structures identified by mount point and 1536 * physical block number. 1537 * 1538 * The "pagedep" and "inodedep" dependency structures are hashed 1539 * separately from the file blocks and inodes to which they correspond. 1540 * This separation helps when the in-memory copy of an inode or 1541 * file block must be replaced. It also obviates the need to access 1542 * an inode or file page when simply updating (or de-allocating) 1543 * dependency structures. Lookup of newblk structures is needed to 1544 * find newly allocated blocks when trying to associate them with 1545 * their allocdirect or allocindir structure. 
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
#define NODELAY		0x0002	/* cannot do background work */

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])

/*
 * Search one hash chain for a pagedep matching <mp, ino, lbn>.
 * Returns 1 and sets *pagedeppp when a usable entry is found; returns 0
 * with *pagedeppp set when the entry exists but the caller asked to
 * allocate (DEPALLOC) and the entry is not yet on a buffer worklist,
 * and 0 with *pagedeppp == NULL when there is no match at all.
 */
static int
pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
	struct pagedep_hashhead *pagedephd;
	ino_t ino;
	ufs_lbn_t lbn;
	struct mount *mp;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;

	LIST_FOREACH(pagedep, pagedephd, pd_hash)
		if (ino == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_list.wk_mp)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		if ((flags & DEPALLOC) != 0 &&
		    (pagedep->pd_state & ONWORKLIST) == 0)
			return (0);
		return (1);
	}
	*pagedeppp = NULL;
	return (0);
}
/*
 * Look up a pagedep. Return 1 if found, 0 if not found or found
 * when asked to allocate but not associated with any buffer.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
	struct mount *mp;
	ino_t ino;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	int ret;
	int i;

	mtx_assert(&lk, MA_OWNED);
	pagedephd = PAGEDEP_HASH(mp, ino, lbn);

	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
	if (*pagedeppp || (flags & DEPALLOC) == 0)
		return (ret);
	/*
	 * The lock is dropped for the (possibly sleeping) allocation, so
	 * another thread may have inserted the entry meanwhile; re-check
	 * after reacquiring and free our copy if we lost the race.
	 */
	FREE_LOCK(&lk);
	pagedep = malloc(sizeof(struct pagedep),
	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
	ACQUIRE_LOCK(&lk);
	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
	if (*pagedeppp) {
		WORKITEM_FREE(pagedep, D_PAGEDEP);
		return (ret);
	}
	pagedep->pd_ino = ino;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
static long	num_inodedep;	/* number of inodedep allocated */
#define	INODEDEP_HASH(fs, inum) \
	(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])

/*
 * Search one hash chain for an inodedep matching <fs, inum>.
 * Returns 1 and sets *inodedeppp on a match; otherwise returns 0 and
 * sets *inodedeppp to NULL.
 */
static int
inodedep_find(inodedephd, fs, inum, inodedeppp)
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;
	ino_t inum;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;

	LIST_FOREACH(inodedep, inodedephd, id_hash)
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	*inodedeppp = NULL;

	return (0);
}
/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(mp, inum, flags, inodedeppp)
	struct mount *mp;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;

	mtx_assert(&lk, MA_OWNED);
	fs = VFSTOUFS(mp)->um_fs;
	inodedephd = INODEDEP_HASH(fs, inum);

	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * If we are over our limit, try to improve the situation.
	 */
	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
		request_cleanup(mp, FLUSH_INODES);
	/*
	 * Drop the lock to allocate, then re-look-up in case another
	 * thread inserted the same inodedep while we slept.
	 */
	FREE_LOCK(&lk);
	inodedep = malloc(sizeof(struct inodedep),
		M_INODEDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
	ACQUIRE_LOCK(&lk);
	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
		WORKITEM_FREE(inodedep, D_INODEDEP);
		return (1);
	}
	num_inodedep += 1;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino1 = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_savedextsize = -1;
	inodedep->id_savednlink = -1;
	inodedep->id_bmsafemap = NULL;
	inodedep->id_mkdiradd = NULL;
	LIST_INIT(&inodedep->id_dirremhd);
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoreflst);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	TAILQ_INIT(&inodedep->id_extupdt);
	TAILQ_INIT(&inodedep->id_newextupdt);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])

/*
 * Search one hash chain for a newblk matching <mp, newblkno>.
 * Returns 1 and sets *newblkpp on a match; otherwise returns 0 and
 * sets *newblkpp to NULL.
 */
static int
newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
	struct newblk_hashhead *newblkhd;
	struct mount *mp;
	ufs2_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;

	LIST_FOREACH(newblk, newblkhd, nb_hash) {
		if (newblkno != newblk->nb_newblkno)
			continue;
		if (mp != newblk->nb_list.wk_mp)
			continue;
		/*
		 * If we're creating a new dependency don't match those that
		 * have already been converted to allocdirects.  This is for
		 * a frag extend.
		 */
		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
			continue;
		break;
	}
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	*newblkpp = NULL;
	return (0);
}

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(mp, newblkno, flags, newblkpp)
	struct mount *mp;
	ufs2_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * Allocate a union allblk (large enough for any of the newblk
	 * variants) with the lock dropped, then re-check for a race.
	 */
	FREE_LOCK(&lk);
	newblk = malloc(sizeof(union allblk), M_NEWBLK,
	    M_SOFTDEP_FLAGS | M_ZERO);
	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
	ACQUIRE_LOCK(&lk);
	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
		WORKITEM_FREE(newblk, D_NEWBLK);
		return (1);
	}
	newblk->nb_freefrag = NULL;
	LIST_INIT(&newblk->nb_indirdeps);
	LIST_INIT(&newblk->nb_newdirblk);
	LIST_INIT(&newblk->nb_jwork);
	newblk->nb_state = ATTACHED;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during filesystem system initialization before
 * mounting any filesystems.
 */
void
softdep_initialize()
{

	LIST_INIT(&mkdirlisthd);
	max_softdeps = desiredvnodes * 4;
	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash);
	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);

	/* initialise bioops hack */
	bioops.io_start = softdep_disk_io_initiation;
	bioops.io_complete = softdep_disk_write_complete;
	bioops.io_deallocate = softdep_deallocate_dependencies;
	bioops.io_countdeps = softdep_count_dependencies;

	/* Initialize the callout with an mtx. */
	callout_init_mtx(&softdep_callout, &lk, 0);
}

/*
 * Executed after all filesystems have been unmounted during
 * filesystem module unload.
 */
void
softdep_uninitialize()
{

	callout_drain(&softdep_callout);
	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
}

/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum_total cstotal;
	struct ufsmount *ump;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	MNT_ILOCK(mp);
	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
			MNTK_SOFTDEP;
		mp->mnt_noasync++;
	}
	MNT_IUNLOCK(mp);
	ump = VFSTOUFS(mp);
	LIST_INIT(&ump->softdep_workitem_pending);
	LIST_INIT(&ump->softdep_journal_pending);
	TAILQ_INIT(&ump->softdep_unlinked);
	ump->softdep_worklist_tail = NULL;
	ump->softdep_on_worklist = 0;
	ump->softdep_deps = 0;
	if ((fs->fs_flags & FS_SUJ) &&
	    (error = journal_mount(mp, fs, cred)) != 0) {
		printf("Failed to start journal: %d\n", error);
		return (error);
	}
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync. Recomputation
	 * can take a long time and can be deferred for background
	 * fsck. However, the old behavior of scanning the cylinder
	 * groups and recalculating them at mount time is available
	 * by setting vfs.ffs.compute_summary_at_mount to one.
	 */
	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Called at unmount time to release journaling resources, if any.
 */
void
softdep_unmount(mp)
	struct mount *mp;
{

	if (mp->mnt_kern_flag & MNTK_SUJ)
		journal_unmount(mp);
}

/*
 * Per-mount journal space allocator state: a ring of disk extents
 * covering the journal file's blocks, consumed in order by
 * jblocks_alloc() and replenished by jblocks_free().
 */
struct jblocks {
	struct jseglst	jb_segs;	/* TAILQ of current segments. */
	struct jseg	*jb_writeseg;	/* Next write to complete. */
	struct jextent	*jb_extent;	/* Extent array. */
	uint64_t	jb_nextseq;	/* Next sequence number. */
	uint64_t	jb_oldestseq;	/* Oldest active sequence number. */
	int		jb_avail;	/* Available extents. */
	int		jb_used;	/* Last used extent. */
	int		jb_head;	/* Allocator head. */
	int		jb_off;		/* Allocator extent offset. */
	int		jb_blocks;	/* Total disk blocks covered. */
	int		jb_free;	/* Total disk blocks free. */
	int		jb_min;		/* Minimum free space. */
	int		jb_low;		/* Low on space. */
	int		jb_age;		/* Insertion time of oldest rec. */
	int		jb_suspended;	/* Did journal suspend writes? */
};

/* One contiguous run of journal disk blocks. */
struct jextent {
	ufs2_daddr_t	je_daddr;	/* Disk block address. */
	int		je_blocks;	/* Disk block count. */
};

/*
 * Allocate and initialize an empty jblocks structure with room for
 * 10 extents (grown on demand by jblocks_add()).
 */
static struct jblocks *
jblocks_create(void)
{
	struct jblocks *jblocks;

	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
	TAILQ_INIT(&jblocks->jb_segs);
	jblocks->jb_avail = 10;
	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
	    M_JBLOCKS, M_WAITOK | M_ZERO);

	return (jblocks);
}

/*
 * Allocate up to 'bytes' of journal space from the current extent,
 * advancing to the next extent (wrapping the ring) when the current
 * one is exhausted.  The actual number of bytes granted — possibly
 * less than requested, never crossing an extent — is returned via
 * *actual; the return value is the starting disk block address.
 */
static ufs2_daddr_t
jblocks_alloc(jblocks, bytes, actual)
	struct jblocks *jblocks;
	int bytes;
	int *actual;
{
	ufs2_daddr_t daddr;
	struct jextent *jext;
	int freecnt;
	int blocks;

	blocks = bytes / DEV_BSIZE;
	jext = &jblocks->jb_extent[jblocks->jb_head];
	freecnt = jext->je_blocks - jblocks->jb_off;
	if (freecnt == 0) {
		jblocks->jb_off = 0;
		if (++jblocks->jb_head > jblocks->jb_used)
			jblocks->jb_head = 0;
		jext = &jblocks->jb_extent[jblocks->jb_head];
		freecnt = jext->je_blocks;
	}
	if (freecnt > blocks)
		freecnt = blocks;
	*actual = freecnt * DEV_BSIZE;
	daddr = jext->je_daddr + jblocks->jb_off;
	jblocks->jb_off += freecnt;
	jblocks->jb_free -= freecnt;

	return (daddr);
}

/*
 * Return 'bytes' of journal space to the free pool and wake anyone
 * waiting on space (including the worklist if the journal had
 * suspended writes).
 */
static void
jblocks_free(jblocks, mp, bytes)
	struct jblocks *jblocks;
	struct mount *mp;
	int bytes;
{

	jblocks->jb_free += bytes / DEV_BSIZE;
	if (jblocks->jb_suspended)
		worklist_speedup();
	wakeup(jblocks);
}

/* Release a jblocks structure and its extent array. */
static void
jblocks_destroy(jblocks)
	struct jblocks *jblocks;
{

	if (jblocks->jb_extent)
		free(jblocks->jb_extent, M_JBLOCKS);
	free(jblocks, M_JBLOCKS);
}

/*
 * Record that journal blocks [daddr, daddr + blocks) are available,
 * merging with the last extent when contiguous and doubling the
 * extent array when a new extent is needed.
 */
static void
jblocks_add(jblocks, daddr, blocks)
	struct jblocks *jblocks;
	ufs2_daddr_t daddr;
	int blocks;
{
	struct jextent *jext;

	jblocks->jb_blocks += blocks;
	jblocks->jb_free += blocks;
	jext = &jblocks->jb_extent[jblocks->jb_used];
	/* Adding the first block. */
	if (jext->je_daddr == 0) {
		jext->je_daddr = daddr;
		jext->je_blocks = blocks;
		return;
	}
	/* Extending the last extent. */
	if (jext->je_daddr + jext->je_blocks == daddr) {
		jext->je_blocks += blocks;
		return;
	}
	/* Adding a new extent. */
	if (++jblocks->jb_used == jblocks->jb_avail) {
		jblocks->jb_avail *= 2;
		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
		    M_JBLOCKS, M_WAITOK | M_ZERO);
		memcpy(jext, jblocks->jb_extent,
		    sizeof(struct jextent) * jblocks->jb_used);
		free(jblocks->jb_extent, M_JBLOCKS);
		jblocks->jb_extent = jext;
	}
	jext = &jblocks->jb_extent[jblocks->jb_used];
	jext->je_daddr = daddr;
	jext->je_blocks = blocks;
	return;
}

/*
 * Look up the journal file (SUJ_FILE) in the filesystem root and return
 * its locked vnode in *vpp.  Returns 0 on success or an errno.
 */
int
softdep_journal_lookup(mp, vpp)
	struct mount *mp;
	struct vnode **vpp;
{
	struct componentname cnp;
	struct vnode *dvp;
	ino_t sujournal;
	int error;

	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
	if (error)
		return (error);
	bzero(&cnp, sizeof(cnp));
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN;
	cnp.cn_thread = curthread;
	cnp.cn_cred = curthread->td_ucred;
	cnp.cn_pnbuf = SUJ_FILE;
	cnp.cn_nameptr = SUJ_FILE;
	cnp.cn_namelen = strlen(SUJ_FILE);
	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
	vput(dvp);
	if (error != 0)
		return (error);
	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
	return (error);
}

/*
 * Open and verify the journal file.
 */
static int
journal_mount(mp, fs, cred)
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct jblocks *jblocks;
	struct vnode *vp;
	struct inode *ip;
	ufs2_daddr_t blkno;
	int bcount;
	int error;
	int i;

	/*
	 * NOTE(review): MNTK_SUJ is set before the journal is located or
	 * validated and is not cleared on the error paths below;
	 * presumably the mount is aborted entirely when we return an
	 * error -- confirm in the caller.
	 */
	mp->mnt_kern_flag |= MNTK_SUJ;
	error = softdep_journal_lookup(mp, &vp);
	if (error != 0) {
		printf("Failed to find journal. Use tunefs to create one\n");
		return (error);
	}
	ip = VTOI(vp);
	if (ip->i_size < SUJ_MIN) {
		/* Journal file too small to be usable. */
		error = ENOSPC;
		goto out;
	}
	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
	jblocks = jblocks_create();
	/*
	 * Map each logical block of the journal file to its disk address
	 * and record the resulting extents.
	 */
	for (i = 0; i < bcount; i++) {
		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
		if (error)
			break;
		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
	}
	if (error) {
		jblocks_destroy(jblocks);
		goto out;
	}
	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
	/*
	 * Only validate the journal contents if the filesystem is clean,
	 * otherwise we write the logs but they'll never be used. If the
	 * filesystem was still dirty when we mounted it the journal is
	 * invalid and a new journal can only be valid if it starts from a
	 * clean mount.
	 */
	if (fs->fs_clean) {
		DIP_SET(ip, i_modrev, fs->fs_mtime);
		ip->i_flags |= IN_MODIFIED;
		ffs_update(vp, 1);
	}
	VFSTOUFS(mp)->softdep_jblocks = jblocks;
out:
	vput(vp);
	return (error);
}

/*
 * Release the journal block tracking for this mount, if any was set up.
 */
static void
journal_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	if (ump->softdep_jblocks)
		jblocks_destroy(ump->softdep_jblocks);
	ump->softdep_jblocks = NULL;
}

/*
 * Called when a journal record is ready to be written.
 * Space is allocated
 * and the journal entry is created when the journal is flushed to stable
 * store.
 */
static void
add_to_journal(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_journal: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
		/* List was empty; record when it became non-empty. */
		ump->softdep_jblocks->jb_age = ticks;
		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
	} else
		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
	ump->softdep_journal_tail = wk;
	ump->softdep_on_journal += 1;
}

/*
 * Remove an arbitrary item from the journal worklist, maintaining the
 * tail pointer.  This happens when a new operation obviates the need to
 * journal an old operation.
 */
static void
remove_from_journal(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	ump = VFSTOUFS(wk->wk_mp);
#ifdef DEBUG	/* XXX Expensive, temporary. */
	{
	struct worklist *wkn;

	LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
		if (wkn == wk)
			break;
	if (wkn == NULL)
		panic("remove_from_journal: %p is not in journal", wk);
	}
#endif
	/*
	 * We emulate a TAILQ to save space in most structures which do not
	 * require TAILQ semantics.  Here we must update the tail position
	 * when removing the tail which is not the final entry.
	 * NOTE(review): the cast of le_prev (which points at the previous
	 * entry's le_next field) to a worklist pointer presumably relies
	 * on wk_list being the first member of struct worklist -- verify
	 * against the structure declaration.
	 */
	if (ump->softdep_journal_tail == wk)
		ump->softdep_journal_tail =
		    (struct worklist *)wk->wk_list.le_prev;

	WORKLIST_REMOVE(wk);
	ump->softdep_on_journal -= 1;
}

/*
 * Check for journal space as well as dependency limits so the prelink
 * code can throttle both journaled and non-journaled filesystems.
 * Threshold is 0 for low and 1 for min.  Returns non-zero when there
 * is sufficient space.
 */
static int
journal_space(ump, thresh)
	struct ufsmount *ump;
	int thresh;
{
	struct jblocks *jblocks;
	int avail;

	/*
	 * We use a tighter restriction here to prevent request_cleanup()
	 * running in threads from running into locks we currently hold.
	 */
	if (num_inodedep > (max_softdeps / 10) * 9)
		return (0);

	jblocks = ump->softdep_jblocks;
	/* A non-journaled filesystem always has journal space. */
	if (jblocks == NULL)
		return (1);
	if (thresh)
		thresh = jblocks->jb_min;
	else
		thresh = jblocks->jb_low;
	/*
	 * Estimate the disk blocks needed by the records already queued
	 * and subtract from the free block count.
	 */
	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
	avail = jblocks->jb_free - avail;

	return (avail > thresh);
}

/*
 * Suspend writes to the filesystem because the journal is too low on
 * space.  Ownership of the suspension is assigned to the softdep
 * flusher thread, which is responsible for resuming it.
 */
static void
journal_suspend(ump)
	struct ufsmount *ump;
{
	struct jblocks *jblocks;
	struct mount *mp;

	mp = UFSTOVFS(ump);
	jblocks = ump->softdep_jblocks;
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
		stat_journal_min++;
		mp->mnt_kern_flag |= MNTK_SUSPEND;
		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
	}
	jblocks->jb_suspended = 1;
	MNT_IUNLOCK(mp);
}

/*
 * Called before any allocation function to be certain that there is
 * sufficient space in the journal prior to creating any new records.
 * Since in the case of block allocation we may have multiple locked
 * buffers at the time of the actual allocation we can not block
 * when the journal records are created.
 * Doing so would create a deadlock
 * if any of these buffers needed to be flushed to reclaim space.  Instead
 * we require a sufficiently large amount of available space such that
 * each thread in the system could have passed this allocation check and
 * still have sufficient free space.  With 20% of a minimum journal size
 * of 1MB we have 6553 records available.
 */
int
softdep_prealloc(vp, waitok)
	struct vnode *vp;
	int waitok;
{
	struct ufsmount *ump;

	/* Nothing to do on non-journaled filesystems. */
	if (DOINGSUJ(vp) == 0)
		return (0);
	ump = VFSTOUFS(vp->v_mount);
	ACQUIRE_LOCK(&lk);
	if (journal_space(ump, 0)) {
		FREE_LOCK(&lk);
		return (0);
	}
	stat_journal_low++;
	FREE_LOCK(&lk);
	if (waitok == MNT_NOWAIT)
		return (ENOSPC);
	/*
	 * Attempt to sync this vnode once to flush any journal
	 * work attached to it.
	 */
	ffs_syncvnode(vp, waitok);
	ACQUIRE_LOCK(&lk);
	process_removes(vp);
	if (journal_space(ump, 0) == 0) {
		softdep_speedup();
		/* Still below the minimum; suspend writes entirely. */
		if (journal_space(ump, 1) == 0)
			journal_suspend(ump);
	}
	FREE_LOCK(&lk);

	return (0);
}

/*
 * Before adjusting a link count on a vnode verify that we have sufficient
 * journal space.  If not, process operations that depend on the currently
 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
 * and softdep flush threads can not acquire these locks to reclaim space.
 * Called and returns with lk held; the lock is dropped and reacquired
 * internally around the vnode syncs.
 */
static void
softdep_prelink(dvp, vp)
	struct vnode *dvp;
	struct vnode *vp;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(dvp->v_mount);
	mtx_assert(&lk, MA_OWNED);
	if (journal_space(ump, 0))
		return;
	stat_journal_low++;
	FREE_LOCK(&lk);
	if (vp)
		ffs_syncvnode(vp, MNT_NOWAIT);
	ffs_syncvnode(dvp, MNT_WAIT);
	ACQUIRE_LOCK(&lk);
	/* Process vp before dvp as it may create .. removes. */
	if (vp)
		process_removes(vp);
	process_removes(dvp);
	softdep_speedup();
	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
	if (journal_space(ump, 0) == 0) {
		softdep_speedup();
		if (journal_space(ump, 1) == 0)
			journal_suspend(ump);
	}
}

/*
 * Fill in the journal segment header record placed at the start of each
 * device block.  NOTE(review): jsr_crc is always written as 0 here; no
 * checksum is computed.
 */
static void
jseg_write(fs, jblocks, jseg, data)
	struct fs *fs;
	struct jblocks *jblocks;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jsegrec *rec;

	rec = (struct jsegrec *)data;
	rec->jsr_seq = jseg->js_seq;
	rec->jsr_oldest = jblocks->jb_oldestseq;
	rec->jsr_cnt = jseg->js_cnt;
	rec->jsr_blocks = jseg->js_size / DEV_BSIZE;
	rec->jsr_crc = 0;
	rec->jsr_time = fs->fs_mtime;
}

/*
 * Copy the fields common to jaddref and jremref records and bind the
 * inoref's jsegdep to the segment being written.
 */
static inline void
inoref_write(inoref, jseg, rec)
	struct inoref *inoref;
	struct jseg *jseg;
	struct jrefrec *rec;
{

	inoref->if_jsegdep->jd_seg = jseg;
	rec->jr_ino = inoref->if_ino;
	rec->jr_parent = inoref->if_parent;
	rec->jr_nlink = inoref->if_nlink;
	rec->jr_mode = inoref->if_mode;
	rec->jr_diroff = inoref->if_diroff;
}

/*
 * Serialize a link-addition (JOP_ADDREF) record into the segment buffer.
 */
static void
jaddref_write(jaddref, jseg, data)
	struct jaddref *jaddref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jrefrec *rec;

	rec = (struct jrefrec *)data;
	rec->jr_op = JOP_ADDREF;
	inoref_write(&jaddref->ja_ref, jseg, rec);
}

/*
 * Serialize a link-removal (JOP_REMREF) record into the segment buffer.
 */
static void
jremref_write(jremref, jseg, data)
	struct jremref *jremref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jrefrec *rec;

	rec = (struct jrefrec *)data;
	rec->jr_op = JOP_REMREF;
	inoref_write(&jremref->jr_ref, jseg, rec);
}

/*
 * Serialize a directory-entry move (JOP_MVREF) record into the segment
 * buffer.
 */
static void
jmvref_write(jmvref, jseg, data)
	struct jmvref *jmvref;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jmvrec *rec;

	rec = (struct jmvrec *)data;
	rec->jm_op = JOP_MVREF;
	rec->jm_ino = jmvref->jm_ino;
	rec->jm_parent = jmvref->jm_parent;
	rec->jm_oldoff = jmvref->jm_oldoff;
	rec->jm_newoff = jmvref->jm_newoff;
}

/*
 * Serialize a block-allocation (JOP_NEWBLK) record into the segment
 * buffer and bind the jsegdep to this segment.
 */
static void
jnewblk_write(jnewblk, jseg, data)
	struct jnewblk *jnewblk;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jnewblk->jn_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_NEWBLK;
	rec->jb_ino = jnewblk->jn_ino;
	rec->jb_blkno = jnewblk->jn_blkno;
	rec->jb_lbn = jnewblk->jn_lbn;
	rec->jb_frags = jnewblk->jn_frags;
	rec->jb_oldfrags = jnewblk->jn_oldfrags;
}

/*
 * Serialize a block-free (JOP_FREEBLK) record into the segment buffer
 * and bind the jsegdep to this segment.
 */
static void
jfreeblk_write(jfreeblk, jseg, data)
	struct jfreeblk *jfreeblk;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jfreeblk->jf_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_FREEBLK;
	rec->jb_ino = jfreeblk->jf_ino;
	rec->jb_blkno = jfreeblk->jf_blkno;
	rec->jb_lbn = jfreeblk->jf_lbn;
	rec->jb_frags = jfreeblk->jf_frags;
	rec->jb_oldfrags = 0;
}

/*
 * Serialize a fragment-free record into the segment buffer.  Fragment
 * frees are journaled as JOP_FREEBLK records as well.
 */
static void
jfreefrag_write(jfreefrag, jseg, data)
	struct jfreefrag *jfreefrag;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jblkrec *rec;

	jfreefrag->fr_jsegdep->jd_seg = jseg;
	rec = (struct jblkrec *)data;
	rec->jb_op = JOP_FREEBLK;
	rec->jb_ino = jfreefrag->fr_ino;
	rec->jb_blkno = jfreefrag->fr_blkno;
	rec->jb_lbn = jfreefrag->fr_lbn;
	rec->jb_frags = jfreefrag->fr_frags;
	rec->jb_oldfrags = 0;
}

/*
 * Serialize a truncation (JOP_TRUNC) record into the segment buffer.
 */
static void
jtrunc_write(jtrunc, jseg, data)
	struct jtrunc *jtrunc;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jtrncrec *rec;

	rec = (struct jtrncrec *)data;
	rec->jt_op = JOP_TRUNC;
	rec->jt_ino = jtrunc->jt_ino;
	rec->jt_size = jtrunc->jt_size;
	rec->jt_extsize = jtrunc->jt_extsize;
}

/*
 * Flush some journal records
to disk. 2503 */ 2504static void 2505softdep_process_journal(mp, flags) 2506 struct mount *mp; 2507 int flags; 2508{ 2509 struct jblocks *jblocks; 2510 struct ufsmount *ump; 2511 struct worklist *wk; 2512 struct jseg *jseg; 2513 struct buf *bp; 2514 uint8_t *data; 2515 struct fs *fs; 2516 int segwritten; 2517 int jrecmin; /* Minimum records per block. */ 2518 int jrecmax; /* Maximum records per block. */ 2519 int size; 2520 int cnt; 2521 int off; 2522 2523 if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) 2524 return; 2525 ump = VFSTOUFS(mp); 2526 fs = ump->um_fs; 2527 jblocks = ump->softdep_jblocks; 2528 /* 2529 * We write anywhere between a disk block and fs block. The upper 2530 * bound is picked to prevent buffer cache fragmentation and limit 2531 * processing time per I/O. 2532 */ 2533 jrecmin = (DEV_BSIZE / JREC_SIZE) - 1; /* -1 for seg header */ 2534 jrecmax = (fs->fs_bsize / DEV_BSIZE) * jrecmin; 2535 segwritten = 0; 2536 while ((cnt = ump->softdep_on_journal) != 0) { 2537 /* 2538 * Create a new segment to hold as many as 'cnt' journal 2539 * entries and add them to the segment. Notice cnt is 2540 * off by one to account for the space required by the 2541 * jsegrec. If we don't have a full block to log skip it 2542 * unless we haven't written anything. 2543 */ 2544 cnt++; 2545 if (cnt < jrecmax && segwritten) 2546 break; 2547 /* 2548 * Verify some free journal space. softdep_prealloc() should 2549 * guarantee that we don't run out so this is indicative of 2550 * a problem with the flow control. Try to recover 2551 * gracefully in any event. 
2552 */ 2553 while (jblocks->jb_free == 0) { 2554 if (flags != MNT_WAIT) 2555 break; 2556 printf("softdep: Out of journal space!\n"); 2557 softdep_speedup(); 2558 msleep(jblocks, &lk, PRIBIO, "jblocks", 1); 2559 } 2560 FREE_LOCK(&lk); 2561 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 2562 workitem_alloc(&jseg->js_list, D_JSEG, mp); 2563 LIST_INIT(&jseg->js_entries); 2564 jseg->js_state = ATTACHED; 2565 jseg->js_jblocks = jblocks; 2566 bp = geteblk(fs->fs_bsize, 0); 2567 ACQUIRE_LOCK(&lk); 2568 /* 2569 * If there was a race while we were allocating the block 2570 * and jseg the entry we care about was likely written. 2571 * We bail out in both the WAIT and NOWAIT case and assume 2572 * the caller will loop if the entry it cares about is 2573 * not written. 2574 */ 2575 if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) { 2576 bp->b_flags |= B_INVAL | B_NOCACHE; 2577 WORKITEM_FREE(jseg, D_JSEG); 2578 FREE_LOCK(&lk); 2579 brelse(bp); 2580 ACQUIRE_LOCK(&lk); 2581 break; 2582 } 2583 /* 2584 * Calculate the disk block size required for the available 2585 * records rounded to the min size. 2586 */ 2587 cnt = ump->softdep_on_journal; 2588 if (cnt < jrecmax) 2589 size = howmany(cnt, jrecmin) * DEV_BSIZE; 2590 else 2591 size = fs->fs_bsize; 2592 /* 2593 * Allocate a disk block for this journal data and account 2594 * for truncation of the requested size if enough contiguous 2595 * space was not available. 2596 */ 2597 bp->b_blkno = jblocks_alloc(jblocks, size, &size); 2598 bp->b_lblkno = bp->b_blkno; 2599 bp->b_offset = bp->b_blkno * DEV_BSIZE; 2600 bp->b_bcount = size; 2601 bp->b_bufobj = &ump->um_devvp->v_bufobj; 2602 bp->b_flags &= ~B_INVAL; 2603 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; 2604 /* 2605 * Initialize our jseg with cnt records. Assign the next 2606 * sequence number to it and link it in-order. 
2607 */ 2608 cnt = MIN(ump->softdep_on_journal, 2609 (size / DEV_BSIZE) * jrecmin); 2610 jseg->js_buf = bp; 2611 jseg->js_cnt = cnt; 2612 jseg->js_refs = cnt + 1; /* Self ref. */ 2613 jseg->js_size = size; 2614 jseg->js_seq = jblocks->jb_nextseq++; 2615 if (TAILQ_EMPTY(&jblocks->jb_segs)) 2616 jblocks->jb_oldestseq = jseg->js_seq; 2617 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); 2618 if (jblocks->jb_writeseg == NULL) 2619 jblocks->jb_writeseg = jseg; 2620 /* 2621 * Start filling in records from the pending list. 2622 */ 2623 data = bp->b_data; 2624 off = 0; 2625 while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) 2626 != NULL) { 2627 /* Place a segment header on every device block. */ 2628 if ((off % DEV_BSIZE) == 0) { 2629 jseg_write(fs, jblocks, jseg, data); 2630 off += JREC_SIZE; 2631 data = bp->b_data + off; 2632 } 2633 remove_from_journal(wk); 2634 wk->wk_state |= IOSTARTED; 2635 WORKLIST_INSERT(&jseg->js_entries, wk); 2636 switch (wk->wk_type) { 2637 case D_JADDREF: 2638 jaddref_write(WK_JADDREF(wk), jseg, data); 2639 break; 2640 case D_JREMREF: 2641 jremref_write(WK_JREMREF(wk), jseg, data); 2642 break; 2643 case D_JMVREF: 2644 jmvref_write(WK_JMVREF(wk), jseg, data); 2645 break; 2646 case D_JNEWBLK: 2647 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 2648 break; 2649 case D_JFREEBLK: 2650 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 2651 break; 2652 case D_JFREEFRAG: 2653 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 2654 break; 2655 case D_JTRUNC: 2656 jtrunc_write(WK_JTRUNC(wk), jseg, data); 2657 break; 2658 default: 2659 panic("process_journal: Unknown type %s", 2660 TYPENAME(wk->wk_type)); 2661 /* NOTREACHED */ 2662 } 2663 if (--cnt == 0) 2664 break; 2665 off += JREC_SIZE; 2666 data = bp->b_data + off; 2667 } 2668 /* 2669 * Write this one buffer and continue. 
2670 */ 2671 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 2672 FREE_LOCK(&lk); 2673 BO_LOCK(bp->b_bufobj); 2674 bgetvp(ump->um_devvp, bp); 2675 BO_UNLOCK(bp->b_bufobj); 2676 if (flags == MNT_NOWAIT) 2677 bawrite(bp); 2678 else 2679 bwrite(bp); 2680 ACQUIRE_LOCK(&lk); 2681 } 2682 /* 2683 * If we've suspended the filesystem because we ran out of journal 2684 * space either try to sync it here to make some progress or 2685 * unsuspend it if we already have. 2686 */ 2687 if (flags == 0 && jblocks && jblocks->jb_suspended) { 2688 if (journal_space(ump, jblocks->jb_min)) { 2689 FREE_LOCK(&lk); 2690 jblocks->jb_suspended = 0; 2691 mp->mnt_susp_owner = curthread; 2692 vfs_write_resume(mp); 2693 ACQUIRE_LOCK(&lk); 2694 return; 2695 } 2696 FREE_LOCK(&lk); 2697 VFS_SYNC(mp, MNT_NOWAIT); 2698 ffs_sbupdate(ump, MNT_WAIT, 0); 2699 ACQUIRE_LOCK(&lk); 2700 } 2701} 2702 2703/* 2704 * Complete a jseg, allowing all dependencies awaiting journal writes 2705 * to proceed. Each journal dependency also attaches a jsegdep to dependent 2706 * structures so that the journal segment can be freed to reclaim space. 2707 */ 2708static void 2709complete_jseg(jseg) 2710 struct jseg *jseg; 2711{ 2712 struct worklist *wk; 2713 struct jmvref *jmvref; 2714 int waiting; 2715 int i; 2716 2717 i = 0; 2718 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 2719 WORKLIST_REMOVE(wk); 2720 waiting = wk->wk_state & IOWAITING; 2721 wk->wk_state &= ~(IOSTARTED | IOWAITING); 2722 wk->wk_state |= COMPLETE; 2723 KASSERT(i < jseg->js_cnt, 2724 ("handle_written_jseg: overflow %d >= %d", 2725 i, jseg->js_cnt)); 2726 switch (wk->wk_type) { 2727 case D_JADDREF: 2728 handle_written_jaddref(WK_JADDREF(wk)); 2729 break; 2730 case D_JREMREF: 2731 handle_written_jremref(WK_JREMREF(wk)); 2732 break; 2733 case D_JMVREF: 2734 /* No jsegdep here. 
*/ 2735 free_jseg(jseg); 2736 jmvref = WK_JMVREF(wk); 2737 LIST_REMOVE(jmvref, jm_deps); 2738 free_pagedep(jmvref->jm_pagedep); 2739 WORKITEM_FREE(jmvref, D_JMVREF); 2740 break; 2741 case D_JNEWBLK: 2742 handle_written_jnewblk(WK_JNEWBLK(wk)); 2743 break; 2744 case D_JFREEBLK: 2745 handle_written_jfreeblk(WK_JFREEBLK(wk)); 2746 break; 2747 case D_JFREEFRAG: 2748 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 2749 break; 2750 case D_JTRUNC: 2751 WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg; 2752 WORKITEM_FREE(wk, D_JTRUNC); 2753 break; 2754 default: 2755 panic("handle_written_jseg: Unknown type %s", 2756 TYPENAME(wk->wk_type)); 2757 /* NOTREACHED */ 2758 } 2759 if (waiting) 2760 wakeup(wk); 2761 } 2762 /* Release the self reference so the structure may be freed. */ 2763 free_jseg(jseg); 2764} 2765 2766/* 2767 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg 2768 * completions in order only. 2769 */ 2770static void 2771handle_written_jseg(jseg, bp) 2772 struct jseg *jseg; 2773 struct buf *bp; 2774{ 2775 struct jblocks *jblocks; 2776 struct jseg *jsegn; 2777 2778 if (jseg->js_refs == 0) 2779 panic("handle_written_jseg: No self-reference on %p", jseg); 2780 jseg->js_state |= DEPCOMPLETE; 2781 /* 2782 * We'll never need this buffer again, set flags so it will be 2783 * discarded. 2784 */ 2785 bp->b_flags |= B_INVAL | B_NOCACHE; 2786 jblocks = jseg->js_jblocks; 2787 /* 2788 * Don't allow out of order completions. If this isn't the first 2789 * block wait for it to write before we're done. 2790 */ 2791 if (jseg != jblocks->jb_writeseg) 2792 return; 2793 /* Iterate through available jsegs processing their entries. 
*/ 2794 do { 2795 jsegn = TAILQ_NEXT(jseg, js_next); 2796 complete_jseg(jseg); 2797 jseg = jsegn; 2798 } while (jseg && jseg->js_state & DEPCOMPLETE); 2799 jblocks->jb_writeseg = jseg; 2800} 2801 2802static inline struct jsegdep * 2803inoref_jseg(inoref) 2804 struct inoref *inoref; 2805{ 2806 struct jsegdep *jsegdep; 2807 2808 jsegdep = inoref->if_jsegdep; 2809 inoref->if_jsegdep = NULL; 2810 2811 return (jsegdep); 2812} 2813 2814/* 2815 * Called once a jremref has made it to stable store. The jremref is marked 2816 * complete and we attempt to free it. Any pagedeps writes sleeping waiting 2817 * for the jremref to complete will be awoken by free_jremref. 2818 */ 2819static void 2820handle_written_jremref(jremref) 2821 struct jremref *jremref; 2822{ 2823 struct inodedep *inodedep; 2824 struct jsegdep *jsegdep; 2825 struct dirrem *dirrem; 2826 2827 /* Grab the jsegdep. */ 2828 jsegdep = inoref_jseg(&jremref->jr_ref); 2829 /* 2830 * Remove us from the inoref list. 2831 */ 2832 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 2833 0, &inodedep) == 0) 2834 panic("handle_written_jremref: Lost inodedep"); 2835 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 2836 /* 2837 * Complete the dirrem. 2838 */ 2839 dirrem = jremref->jr_dirrem; 2840 jremref->jr_dirrem = NULL; 2841 LIST_REMOVE(jremref, jr_deps); 2842 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; 2843 WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list); 2844 if (LIST_EMPTY(&dirrem->dm_jremrefhd) && 2845 (dirrem->dm_state & COMPLETE) != 0) 2846 add_to_worklist(&dirrem->dm_list, 0); 2847 free_jremref(jremref); 2848} 2849 2850/* 2851 * Called once a jaddref has made it to stable store. The dependency is 2852 * marked complete and any dependent structures are added to the inode 2853 * bufwait list to be completed as soon as it is written. 
 * If a bitmap write
 * depends on this entry we move the inode into the inodedephd of the
 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
 */
static void
handle_written_jaddref(jaddref)
	struct jaddref *jaddref;
{
	struct jsegdep *jsegdep;
	struct inodedep *inodedep;
	struct diradd *diradd;
	struct mkdir *mkdir;

	/* Grab the jsegdep. */
	jsegdep = inoref_jseg(&jaddref->ja_ref);
	mkdir = NULL;
	diradd = NULL;
	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
	    0, &inodedep) == 0)
		panic("handle_written_jaddref: Lost inodedep.");
	if (jaddref->ja_diradd == NULL)
		panic("handle_written_jaddref: No dependency");
	/*
	 * The dependency is either a plain diradd, or a mkdir for the
	 * parent (".." link) or body ("." link) of a new directory;
	 * ja_diradd and ja_mkdir overlay the same storage.
	 */
	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
		diradd = jaddref->ja_diradd;
		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
	} else if (jaddref->ja_state & MKDIR_PARENT) {
		mkdir = jaddref->ja_mkdir;
		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
	} else if (jaddref->ja_state & MKDIR_BODY)
		mkdir = jaddref->ja_mkdir;
	else
		panic("handle_written_jaddref: Unknown dependency %p",
		    jaddref->ja_diradd);
	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
	/*
	 * Remove us from the inode list.
	 */
	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
	/*
	 * The mkdir may be waiting on the jaddref to clear before freeing.
	 */
	if (mkdir) {
		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
		    ("handle_written_jaddref: Incorrect type for mkdir %s",
		    TYPENAME(mkdir->md_list.wk_type)));
		mkdir->md_jaddref = NULL;
		/* Take the diradd from the mkdir for the jwork below. */
		diradd = mkdir->md_diradd;
		mkdir->md_state |= DEPCOMPLETE;
		complete_mkdir(mkdir);
	}
	WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
	if (jaddref->ja_state & NEWBLOCK) {
		/* Bitmap write pending; park the inodedep on the bmsafemap. */
		inodedep->id_state |= ONDEPLIST;
		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
		    inodedep, id_deps);
	}
	free_jaddref(jaddref);
}

/*
 * Called once a jnewblk journal is written.  The allocdirect or allocindir
 * is placed in the bmsafemap to await notification of a written bitmap.
 */
static void
handle_written_jnewblk(jnewblk)
	struct jnewblk *jnewblk;
{
	struct bmsafemap *bmsafemap;
	struct jsegdep *jsegdep;
	struct newblk *newblk;

	/* Grab the jsegdep. */
	jsegdep = jnewblk->jn_jsegdep;
	jnewblk->jn_jsegdep = NULL;
	/*
	 * Add the written block to the bmsafemap so it can be notified when
	 * the bitmap is on disk.
	 */
	newblk = jnewblk->jn_newblk;
	jnewblk->jn_newblk = NULL;
	if (newblk == NULL)
		panic("handle_written_jnewblk: No dependency for the segdep.");

	newblk->nb_jnewblk = NULL;
	bmsafemap = newblk->nb_bmsafemap;
	WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
	newblk->nb_state |= ONDEPLIST;
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	free_jnewblk(jnewblk);
}

/*
 * Cancel a jfreefrag that won't be needed, probably due to colliding with
 * an in-flight allocation that has not yet been committed.  Divorce us
 * from the freefrag and mark it DEPCOMPLETE so that it may be added
 * to the worklist.
 */
static void
cancel_jfreefrag(jfreefrag)
	struct jfreefrag *jfreefrag;
{
	struct freefrag *freefrag;

	if (jfreefrag->fr_jsegdep) {
		free_jsegdep(jfreefrag->fr_jsegdep);
		jfreefrag->fr_jsegdep = NULL;
	}
	/* Detach both directions of the freefrag linkage before freeing;
	 * free_jfreefrag() panics if fr_freefrag is still set. */
	freefrag = jfreefrag->fr_freefrag;
	jfreefrag->fr_freefrag = NULL;
	freefrag->ff_jfreefrag = NULL;
	free_jfreefrag(jfreefrag);
	freefrag->ff_state |= DEPCOMPLETE;
}

/*
 * Free a jfreefrag when the parent freefrag is rendered obsolete.
 */
static void
free_jfreefrag(jfreefrag)
	struct jfreefrag *jfreefrag;
{

	/*
	 * IOSTARTED means the record was moved onto a jseg's entry list;
	 * ONWORKLIST alone means it is still on the pending journal list.
	 */
	if (jfreefrag->fr_state & IOSTARTED)
		WORKLIST_REMOVE(&jfreefrag->fr_list);
	else if (jfreefrag->fr_state & ONWORKLIST)
		remove_from_journal(&jfreefrag->fr_list);
	if (jfreefrag->fr_freefrag != NULL)
		panic("free_jfreefrag: Still attached to a freefrag.");
	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
}

/*
 * Called when the journal write for a jfreefrag completes.  The parent
 * freefrag is added to the worklist if this completes its dependencies.
 */
static void
handle_written_jfreefrag(jfreefrag)
	struct jfreefrag *jfreefrag;
{
	struct jsegdep *jsegdep;
	struct freefrag *freefrag;

	/* Grab the jsegdep. */
	jsegdep = jfreefrag->fr_jsegdep;
	jfreefrag->fr_jsegdep = NULL;
	freefrag = jfreefrag->fr_freefrag;
	if (freefrag == NULL)
		panic("handle_written_jfreefrag: No freefrag.");
	freefrag->ff_state |= DEPCOMPLETE;
	freefrag->ff_jfreefrag = NULL;
	WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
		add_to_worklist(&freefrag->ff_list, 0);
	/* Detach before freeing so free_jfreefrag() does not panic. */
	jfreefrag->fr_freefrag = NULL;
	free_jfreefrag(jfreefrag);
}

/*
 * Called when the journal write for a jfreeblk completes.
 * The jfreeblk
 * is removed from the freeblks list of pending journal writes and the
 * jsegdep is moved to the freeblks jwork to be completed when all blocks
 * have been reclaimed.
 */
static void
handle_written_jfreeblk(jfreeblk)
	struct jfreeblk *jfreeblk;
{
	struct freeblks *freeblks;
	struct jsegdep *jsegdep;

	/* Grab the jsegdep. */
	jsegdep = jfreeblk->jf_jsegdep;
	jfreeblk->jf_jsegdep = NULL;
	freeblks = jfreeblk->jf_freeblks;
	LIST_REMOVE(jfreeblk, jf_deps);
	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
	/*
	 * If the freeblks is all journaled, we can add it to the worklist.
	 */
	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/* Remove from the b_dep that is waiting on this write. */
		if (freeblks->fb_state & ONWORKLIST)
			WORKLIST_REMOVE(&freeblks->fb_list);
		add_to_worklist(&freeblks->fb_list, 1);
	}

	free_jfreeblk(jfreeblk);
}

/*
 * Allocate a jsegdep on behalf of worklist item 'wk'.  The segment
 * pointer is filled in when the owning record is written to the journal.
 */
static struct jsegdep *
newjsegdep(struct worklist *wk)
{
	struct jsegdep *jsegdep;

	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
	jsegdep->jd_seg = NULL;

	return (jsegdep);
}

/*
 * Allocate a jmvref recording the move of inode 'ino' within directory
 * 'dp' from offset 'oldoff' to 'newoff'.  Created ATTACHED and
 * DEPCOMPLETE since all information for the journal write is present.
 */
static struct jmvref *
newjmvref(dp, ino, oldoff, newoff)
	struct inode *dp;
	ino_t ino;
	off_t oldoff;
	off_t newoff;
{
	struct jmvref *jmvref;

	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
	jmvref->jm_parent = dp->i_number;
	jmvref->jm_ino = ino;
	jmvref->jm_oldoff = oldoff;
	jmvref->jm_newoff = newoff;

	return (jmvref);
}

/*
 * Allocate a new jremref that tracks the removal of ip from dp with the
 * directory entry offset of diroff.  Mark the entry as ATTACHED and
 * DEPCOMPLETE as we have all the information required for the journal write
 * and the directory has already been removed from the buffer.  The caller
 * is responsible for linking the jremref into the pagedep and adding it
 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
 * a DOTDOT addition so handle_workitem_remove() can properly assign
 * the jsegdep when we're done.
 */
static struct jremref *
newjremref(dirrem, dp, ip, diroff, nlink)
	struct dirrem *dirrem;
	struct inode *dp;
	struct inode *ip;
	off_t diroff;
	nlink_t nlink;
{
	struct jremref *jremref;

	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
	jremref->jr_state = ATTACHED;
	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
	   nlink, ip->i_mode);
	jremref->jr_dirrem = dirrem;

	return (jremref);
}

/*
 * Initialize the common inoref portion of a jaddref/jremref, allocating
 * its jsegdep in the process.
 */
static inline void
newinoref(inoref, ino, parent, diroff, nlink, mode)
	struct inoref *inoref;
	ino_t ino;
	ino_t parent;
	off_t diroff;
	nlink_t nlink;
	uint16_t mode;
{

	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
	inoref->if_diroff = diroff;
	inoref->if_ino = ino;
	inoref->if_parent = parent;
	inoref->if_nlink = nlink;
	inoref->if_mode = mode;
}

/*
 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
 * directory offset may not be known until later.  The caller is responsible
 * adding the entry to the journal when this information is available.  nlink
 * should be the link count prior to the addition and mode is only required
 * to have the correct FMT.
 */
static struct jaddref *
newjaddref(dp, ino, diroff, nlink, mode)
	struct inode *dp;
	ino_t ino;
	off_t diroff;
	int16_t nlink;
	uint16_t mode;
{
	struct jaddref *jaddref;

	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
	/* Not DEPCOMPLETE: the directory offset may still be unknown. */
	jaddref->ja_state = ATTACHED;
	jaddref->ja_mkdir = NULL;
	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);

	return (jaddref);
}

/*
 * Create a new free dependency for a freework.  The caller is responsible
 * for adjusting the reference count when it has the lock held.  The freedep
 * will track an outstanding bitmap write that will ultimately clear the
 * freework to continue.
 */
static struct freedep *
newfreedep(struct freework *freework)
{
	struct freedep *freedep;

	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
	freedep->fd_freework = freework;

	return (freedep);
}

/*
 * Free a freedep structure once the buffer it is linked to is written.  If
 * this is the last reference to the freework schedule it for completion.
 */
static void
free_freedep(freedep)
	struct freedep *freedep;
{

	if (--freedep->fd_freework->fw_ref == 0)
		add_to_worklist(&freedep->fd_freework->fw_list, 1);
	WORKITEM_FREE(freedep, D_FREEDEP);
}

/*
 * Allocate a new freework structure that may be a level in an indirect
 * when parent is not NULL or a top level block when it is.  The top level
 * freework structures are allocated without lk held and before the freeblks
 * is visible outside of softdep_setup_freeblocks().
 */
static struct freework *
newfreework(freeblks, parent, lbn, nb, frags, journal)
	struct freeblks *freeblks;	/* freeblks this work belongs to */
	struct freework *parent;	/* enclosing indirect, NULL if top */
	ufs_lbn_t lbn;			/* logical block number being freed */
	ufs2_daddr_t nb;		/* disk block number being freed */
	int frags;			/* number of fragments */
	int journal;			/* non-zero to journal via jfreeblk */
{
	struct freework *freework;

	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
	freework->fw_freeblks = freeblks;
	freework->fw_parent = parent;
	freework->fw_lbn = lbn;
	freework->fw_blkno = nb;
	freework->fw_frags = frags;
	freework->fw_ref = 0;
	freework->fw_off = 0;
	LIST_INIT(&freework->fw_jwork);

	if (parent == NULL) {
		/*
		 * Top level blocks are tracked directly on the freeblks.
		 * The unlocked insert is safe because the freeblks is not
		 * yet visible outside softdep_setup_freeblocks().
		 */
		WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
		    &freework->fw_list);
		freeblks->fb_ref++;
	}
	if (journal)
		newjfreeblk(freeblks, lbn, nb, frags);

	return (freework);
}

/*
 * Allocate a new jfreeblk to journal top level block pointer when truncating
 * a file.  The caller must add this to the worklist when lk is held.
 */
static struct jfreeblk *
newjfreeblk(freeblks, lbn, blkno, frags)
	struct freeblks *freeblks;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int frags;
{
	struct jfreeblk *jfreeblk;

	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
	workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
	jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
	/* DEPCOMPLETE: all information for the journal write is present. */
	jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
	jfreeblk->jf_ino = freeblks->fb_previousinum;
	jfreeblk->jf_lbn = lbn;
	jfreeblk->jf_blkno = blkno;
	jfreeblk->jf_frags = frags;
	jfreeblk->jf_freeblks = freeblks;
	LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);

	return (jfreeblk);
}

static void move_newblock_dep(struct jaddref *, struct inodedep *);
/*
 * If we're canceling a new bitmap we have to search for another ref
 * to move into the bmsafemap dep.  This might be better expressed
 * with another structure.
 */
static void
move_newblock_dep(jaddref, inodedep)
	struct jaddref *jaddref;	/* ref being canceled */
	struct inodedep *inodedep;	/* inodedep owning the inoreflst */
{
	struct inoref *inoref;
	struct jaddref *jaddrefn;

	/*
	 * Find the next jaddref on the inoreflst that can take over the
	 * bitmap dependency from the canceled one.
	 */
	jaddrefn = NULL;
	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
	    inoref = TAILQ_NEXT(inoref, if_deps)) {
		if ((jaddref->ja_state & NEWBLOCK) &&
		    inoref->if_list.wk_type == D_JADDREF) {
			jaddrefn = (struct jaddref *)inoref;
			break;
		}
	}
	if (jaddrefn == NULL)
		return;
	/*
	 * Transfer the ATTACHED/UNDONE rollback state and the NEWBLOCK
	 * marker to the successor, then re-link it into the bmsafemap's
	 * jaddref list in place of the canceled entry.
	 */
	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
	jaddrefn->ja_state |= jaddref->ja_state &
	    (ATTACHED | UNDONE | NEWBLOCK);
	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
	jaddref->ja_state |= ATTACHED;
	LIST_REMOVE(jaddref, ja_bmdeps);
	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
	    ja_bmdeps);
}

/*
 * Cancel a jaddref either before it has been written or while it is being
 * written.  This happens when a link is removed before the add reaches
 * the disk.  The jaddref dependency is kept linked into the bmsafemap
 * and inode to prevent the link count or bitmap from reaching the disk
 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
 * required.
 *
 * Returns 1 if the canceled addref requires journaling of the remove and
 * 0 otherwise.
 */
static int
cancel_jaddref(jaddref, inodedep, wkhd)
	struct jaddref *jaddref;
	struct inodedep *inodedep;	/* may be NULL; looked up if so */
	struct workhead *wkhd;		/* list to park surviving work on */
{
	struct inoref *inoref;
	struct jsegdep *jsegdep;
	int needsj;

	KASSERT((jaddref->ja_state & COMPLETE) == 0,
	    ("cancel_jaddref: Canceling complete jaddref"));
	/* Journaling of the remove is needed once the add has hit the log. */
	if (jaddref->ja_state & (IOSTARTED | COMPLETE))
		needsj = 1;
	else
		needsj = 0;
	if (inodedep == NULL)
		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
		    0, &inodedep) == 0)
			panic("cancel_jaddref: Lost inodedep");
	/*
	 * We must adjust the nlink of any reference operation that follows
	 * us so that it is consistent with the in-memory reference.  This
	 * ensures that inode nlink rollbacks always have the correct link.
	 */
	if (needsj == 0)
		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
		    inoref = TAILQ_NEXT(inoref, if_deps))
			inoref->if_nlink--;
	jsegdep = inoref_jseg(&jaddref->ja_ref);
	if (jaddref->ja_state & NEWBLOCK)
		move_newblock_dep(jaddref, inodedep);
	if (jaddref->ja_state & IOWAITING) {
		jaddref->ja_state &= ~IOWAITING;
		wakeup(&jaddref->ja_list);
	}
	jaddref->ja_mkdir = NULL;
	if (jaddref->ja_state & IOSTARTED) {
		/* Write in flight: hand the jsegdep off via wkhd. */
		jaddref->ja_state &= ~IOSTARTED;
		WORKLIST_REMOVE(&jaddref->ja_list);
		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
	} else {
		/* Never written: drop it from the journal entirely. */
		free_jsegdep(jsegdep);
		remove_from_journal(&jaddref->ja_list);
	}
	/*
	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
	 * can arrange for them to be freed with the bitmap.  Otherwise we
	 * no longer need this addref attached to the inoreflst and it
	 * will incorrectly adjust nlink if we leave it.
	 */
	if ((jaddref->ja_state & NEWBLOCK) == 0) {
		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
		    if_deps);
		jaddref->ja_state |= COMPLETE;
		free_jaddref(jaddref);
		return (needsj);
	}
	jaddref->ja_state |= GOINGAWAY;
	/*
	 * Leave the head of the list for jsegdeps for fast merging.
	 */
	if (LIST_FIRST(wkhd) != NULL) {
		jaddref->ja_state |= ONWORKLIST;
		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
	} else
		WORKLIST_INSERT(wkhd, &jaddref->ja_list);

	return (needsj);
}

/*
 * Attempt to free a jaddref structure when some work completes.  This
 * should only succeed once the entry is written and all dependencies have
 * been notified.
 */
static void
free_jaddref(jaddref)
	struct jaddref *jaddref;
{

	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	if (jaddref->ja_ref.if_jsegdep)
		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
		    jaddref, jaddref->ja_state);
	if (jaddref->ja_state & NEWBLOCK)
		LIST_REMOVE(jaddref, ja_bmdeps);
	if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
		panic("free_jaddref: Bad state %p(0x%X)",
		    jaddref, jaddref->ja_state);
	if (jaddref->ja_mkdir != NULL)
		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
	WORKITEM_FREE(jaddref, D_JADDREF);
}

/*
 * Free a jremref structure once it has been written or discarded.
 */
static void
free_jremref(jremref)
	struct jremref *jremref;
{

	if (jremref->jr_ref.if_jsegdep)
		free_jsegdep(jremref->jr_ref.if_jsegdep);
	if (jremref->jr_state & IOSTARTED)
		panic("free_jremref: IO still pending");
	WORKITEM_FREE(jremref, D_JREMREF);
}

/*
 * Free a jnewblk structure.
 */
static void
free_jnewblk(jnewblk)
	struct jnewblk *jnewblk;
{

	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	LIST_REMOVE(jnewblk, jn_deps);
	if (jnewblk->jn_newblk != NULL)
		panic("free_jnewblk: Dependency still attached.");
	WORKITEM_FREE(jnewblk, D_JNEWBLK);
}

/*
 * Cancel a jnewblk which has been superseded by a freeblk.  The jnewblk
 * is kept linked into the bmsafemap until the free completes, thus
 * preventing the modified state from ever reaching disk.  The free
 * routine must pass this structure via ffs_blkfree() to
 * softdep_setup_freeblks() so there is no race in releasing the space.
 */
static void
cancel_jnewblk(jnewblk, wkhd)
	struct jnewblk *jnewblk;
	struct workhead *wkhd;
{
	struct jsegdep *jsegdep;

	jsegdep = jnewblk->jn_jsegdep;
	jnewblk->jn_jsegdep = NULL;
	free_jsegdep(jsegdep);
	jnewblk->jn_newblk = NULL;
	jnewblk->jn_state |= GOINGAWAY;
	if (jnewblk->jn_state & IOSTARTED) {
		jnewblk->jn_state &= ~IOSTARTED;
		WORKLIST_REMOVE(&jnewblk->jn_list);
	} else
		remove_from_journal(&jnewblk->jn_list);
	/*
	 * Leave the head of the list for jsegdeps for fast merging.
	 */
	if (LIST_FIRST(wkhd) != NULL) {
		jnewblk->jn_state |= ONWORKLIST;
		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
	} else
		WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
	if (jnewblk->jn_state & IOWAITING) {
		jnewblk->jn_state &= ~IOWAITING;
		wakeup(&jnewblk->jn_list);
	}
}

/*
 * Release a jfreeblk structure.
 */
static void
free_jfreeblk(jfreeblk)
	struct jfreeblk *jfreeblk;
{

	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
}

/*
 * Release one reference to a jseg and free it if the count reaches 0.  This
 * should eventually reclaim journal space as well.
 */
static void
free_jseg(jseg)
	struct jseg *jseg;
{
	struct jblocks *jblocks;

	KASSERT(jseg->js_refs > 0,
	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
	if (--jseg->js_refs != 0)
		return;
	/*
	 * Free only those jsegs which have none allocated before them to
	 * preserve the journal space ordering.
	 */
	jblocks = jseg->js_jblocks;
	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
		jblocks->jb_oldestseq = jseg->js_seq;
		if (jseg->js_refs != 0)
			break;
		TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
		jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
		KASSERT(LIST_EMPTY(&jseg->js_entries),
		    ("free_jseg: Freed jseg has valid entries."));
		WORKITEM_FREE(jseg, D_JSEG);
	}
}

/*
 * Release a jsegdep and decrement the jseg count.
 */
static void
free_jsegdep(jsegdep)
	struct jsegdep *jsegdep;
{

	if (jsegdep->jd_seg)
		free_jseg(jsegdep->jd_seg);
	WORKITEM_FREE(jsegdep, D_JSEGDEP);
}

/*
 * Wait for a journal item to make it to disk.  Initiate journal processing
 * if required.  Called with lk held; msleep drops it while sleeping.
 */
static void
jwait(wk)
	struct worklist *wk;
{

	stat_journal_wait++;
	/*
	 * If IO has not started we process the journal.  We can't mark the
	 * worklist item as IOWAITING because we drop the lock while
	 * processing the journal and the worklist entry may be freed after
	 * this point.  The caller may call back in and re-issue the request.
	 */
	if ((wk->wk_state & IOSTARTED) == 0) {
		softdep_process_journal(wk->wk_mp, MNT_WAIT);
		return;
	}
	wk->wk_state |= IOWAITING;
	msleep(wk, &lk, PRIBIO, "jwait", 0);
}

/*
 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
 * appropriate.  This is a convenience function to reduce duplicate code
 * for the setup and revert functions below.
 */
static struct inodedep *
inodedep_lookup_ip(ip)
	struct inode *ip;
{
	struct inodedep *inodedep;

	KASSERT(ip->i_nlink >= ip->i_effnlink,
	    ("inodedep_lookup_ip: bad delta"));
	/* DEPALLOC guarantees an inodedep is returned; ignore the result. */
	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
	    DEPALLOC, &inodedep);
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;

	return (inodedep);
}

/*
 * Create a journal entry that describes a truncate that we're about to
 * perform.  The inode allocations and frees between here and the completion
 * of the operation are done asynchronously and without journaling.  At
 * the end of the operation the vnode is sync'd and the journal space
 * is released.  Recovery will discover the partially completed truncate
 * and complete it.
 *
 * Returns an opaque cookie (the jsegdep) to be passed to
 * softdep_complete_trunc() when the truncation is finished.
 */
void *
softdep_setup_trunc(vp, length, flags)
	struct vnode *vp;
	off_t length;
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct jsegdep *jsegdep;
	struct jtrunc *jtrunc;
	struct ufsmount *ump;
	struct inode *ip;

	softdep_prealloc(vp, MNT_WAIT);
	ip = VTOI(vp);
	ump = VFSTOUFS(vp->v_mount);
	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
	jtrunc->jt_ino = ip->i_number;
	jtrunc->jt_extsize = 0;
	jtrunc->jt_size = length;
	/* Areas not being truncated record their current size instead. */
	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
		jtrunc->jt_extsize = ip->i_din2->di_extsize;
	if ((flags & IO_NORMAL) == 0)
		jtrunc->jt_size = DIP(ip, i_size);
	ACQUIRE_LOCK(&lk);
	add_to_journal(&jtrunc->jt_list);
	/* Wait until the record has been assigned to a journal segment. */
	while (jsegdep->jd_seg == NULL) {
		stat_jwait_freeblks++;
		jwait(&jtrunc->jt_list);
	}
	FREE_LOCK(&lk);

	return (jsegdep);
}

/*
 * After synchronous truncation is complete we free sync the vnode and
 * release the jsegdep so the journal space can be freed.
3600 */ 3601int 3602softdep_complete_trunc(vp, cookie) 3603 struct vnode *vp; 3604 void *cookie; 3605{ 3606 int error; 3607 3608 error = ffs_syncvnode(vp, MNT_WAIT); 3609 ACQUIRE_LOCK(&lk); 3610 free_jsegdep((struct jsegdep *)cookie); 3611 FREE_LOCK(&lk); 3612 3613 return (error); 3614} 3615 3616/* 3617 * Called prior to creating a new inode and linking it to a directory. The 3618 * jaddref structure must already be allocated by softdep_setup_inomapdep 3619 * and it is discovered here so we can initialize the mode and update 3620 * nlinkdelta. 3621 */ 3622void 3623softdep_setup_create(dp, ip) 3624 struct inode *dp; 3625 struct inode *ip; 3626{ 3627 struct inodedep *inodedep; 3628 struct jaddref *jaddref; 3629 struct vnode *dvp; 3630 3631 KASSERT(ip->i_nlink == 1, 3632 ("softdep_setup_create: Invalid link count.")); 3633 dvp = ITOV(dp); 3634 ACQUIRE_LOCK(&lk); 3635 inodedep = inodedep_lookup_ip(ip); 3636 if (DOINGSUJ(dvp)) { 3637 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 3638 inoreflst); 3639 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 3640 ("softdep_setup_create: No addref structure present.")); 3641 jaddref->ja_mode = ip->i_mode; 3642 } 3643 softdep_prelink(dvp, NULL); 3644 FREE_LOCK(&lk); 3645} 3646 3647/* 3648 * Create a jaddref structure to track the addition of a DOTDOT link when 3649 * we are reparenting an inode as part of a rename. This jaddref will be 3650 * found by softdep_setup_directory_change. Adjusts nlinkdelta for 3651 * non-journaling softdep. 3652 */ 3653void 3654softdep_setup_dotdot_link(dp, ip) 3655 struct inode *dp; 3656 struct inode *ip; 3657{ 3658 struct inodedep *inodedep; 3659 struct jaddref *jaddref; 3660 struct vnode *dvp; 3661 struct vnode *vp; 3662 3663 dvp = ITOV(dp); 3664 vp = ITOV(ip); 3665 jaddref = NULL; 3666 /* 3667 * We don't set MKDIR_PARENT as this is not tied to a mkdir and 3668 * is used as a normal link would be. 
3669 */ 3670 if (DOINGSUJ(dvp)) 3671 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 3672 dp->i_effnlink - 1, dp->i_mode); 3673 ACQUIRE_LOCK(&lk); 3674 inodedep = inodedep_lookup_ip(dp); 3675 if (jaddref) 3676 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 3677 if_deps); 3678 softdep_prelink(dvp, ITOV(ip)); 3679 FREE_LOCK(&lk); 3680} 3681 3682/* 3683 * Create a jaddref structure to track a new link to an inode. The directory 3684 * offset is not known until softdep_setup_directory_add or 3685 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling 3686 * softdep. 3687 */ 3688void 3689softdep_setup_link(dp, ip) 3690 struct inode *dp; 3691 struct inode *ip; 3692{ 3693 struct inodedep *inodedep; 3694 struct jaddref *jaddref; 3695 struct vnode *dvp; 3696 3697 dvp = ITOV(dp); 3698 jaddref = NULL; 3699 if (DOINGSUJ(dvp)) 3700 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, 3701 ip->i_mode); 3702 ACQUIRE_LOCK(&lk); 3703 inodedep = inodedep_lookup_ip(ip); 3704 if (jaddref) 3705 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 3706 if_deps); 3707 softdep_prelink(dvp, ITOV(ip)); 3708 FREE_LOCK(&lk); 3709} 3710 3711/* 3712 * Called to create the jaddref structures to track . and .. references as 3713 * well as lookup and further initialize the incomplete jaddref created 3714 * by softdep_setup_inomapdep when the inode was allocated. Adjusts 3715 * nlinkdelta for non-journaling softdep. 
 */
void
softdep_setup_mkdir(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* new directory */
{
	struct inodedep *inodedep;
	struct jaddref *dotdotaddref;
	struct jaddref *dotaddref;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	dotaddref = dotdotaddref = NULL;
	if (DOINGSUJ(dvp)) {
		/* "." starts the new directory at a link count of 1. */
		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
		    ip->i_mode);
		dotaddref->ja_state |= MKDIR_BODY;
		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
		    dp->i_effnlink - 1, dp->i_mode);
		dotdotaddref->ja_state |= MKDIR_PARENT;
	}
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		/*
		 * Finish initializing the jaddref left behind by
		 * softdep_setup_inomapdep and order the "." ref before it.
		 */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL,
		    ("softdep_setup_mkdir: No addref structure present."));
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_mkdir: bad parent %d",
		    jaddref->ja_parent));
		jaddref->ja_mode = ip->i_mode;
		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
		    if_deps);
	}
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp))
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
		    &dotdotaddref->ja_ref, if_deps);
	softdep_prelink(ITOV(dp), NULL);
	FREE_LOCK(&lk);
}

/*
 * Called to track nlinkdelta of the inode and parent directories prior to
 * unlinking a directory.
 */
void
softdep_setup_rmdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup_ip(ip);
	(void) inodedep_lookup_ip(dp);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Called to track nlinkdelta of the inode and parent directories prior to
 * unlink.
 */
void
softdep_setup_unlink(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup_ip(ip);
	(void) inodedep_lookup_ip(dp);
	softdep_prelink(dvp, ITOV(ip));
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed non-directory
 * creation.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_create(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		/* The most recent addref is the one being reverted. */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_create: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed dotdot link
 * creation.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_dotdot_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == ip->i_number,
		    ("softdep_revert_dotdot_link: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed link
 * addition.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_link(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);
	ACQUIRE_LOCK(&lk);
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_link: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to release the journal structures created by a failed mkdir
 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
 */
void
softdep_revert_mkdir(dp, ip)
	struct inode *dp;	/* parent directory */
	struct inode *ip;	/* directory whose creation failed */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct vnode *dvp;

	dvp = ITOV(dp);

	ACQUIRE_LOCK(&lk);
	/* Cancel the dotdot addref recorded against the parent. */
	inodedep = inodedep_lookup_ip(dp);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == ip->i_number,
		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	/* Then the directory-entry and "." addrefs on the new directory. */
	inodedep = inodedep_lookup_ip(ip);
	if (DOINGSUJ(dvp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == dp->i_number,
		    ("softdep_revert_mkdir: addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref->ja_parent == ip->i_number,
		    ("softdep_revert_mkdir: dot addref parent mismatch"));
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
	}
	FREE_LOCK(&lk);
}

/*
 * Called to correct nlinkdelta after a failed rmdir.
3918 */ 3919void 3920softdep_revert_rmdir(dp, ip) 3921 struct inode *dp; 3922 struct inode *ip; 3923{ 3924 3925 ACQUIRE_LOCK(&lk); 3926 (void) inodedep_lookup_ip(ip); 3927 (void) inodedep_lookup_ip(dp); 3928 FREE_LOCK(&lk); 3929} 3930 3931/* 3932 * Protecting the freemaps (or bitmaps). 3933 * 3934 * To eliminate the need to execute fsck before mounting a filesystem 3935 * after a power failure, one must (conservatively) guarantee that the 3936 * on-disk copy of the bitmaps never indicate that a live inode or block is 3937 * free. So, when a block or inode is allocated, the bitmap should be 3938 * updated (on disk) before any new pointers. When a block or inode is 3939 * freed, the bitmap should not be updated until all pointers have been 3940 * reset. The latter dependency is handled by the delayed de-allocation 3941 * approach described below for block and inode de-allocation. The former 3942 * dependency is handled by calling the following procedure when a block or 3943 * inode is allocated. When an inode is allocated an "inodedep" is created 3944 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 3945 * Each "inodedep" is also inserted into the hash indexing structure so 3946 * that any additional link additions can be made dependent on the inode 3947 * allocation. 3948 * 3949 * The ufs filesystem maintains a number of free block counts (e.g., per 3950 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 3951 * in addition to the bitmaps. These counts are used to improve efficiency 3952 * during allocation and therefore must be consistent with the bitmaps. 3953 * There is no convenient way to guarantee post-crash consistency of these 3954 * counts with simple update ordering, for two main reasons: (1) The counts 3955 * and bitmaps for a single cylinder group block are not in the same disk 3956 * sector. If a disk write is interrupted (e.g., by power failure), one may 3957 * be written and the other not. 
(2) Some of the counts are located in the 3958 * superblock rather than the cylinder group block. So, we focus our soft 3959 * updates implementation on protecting the bitmaps. When mounting a 3960 * filesystem, we recompute the auxiliary counts from the bitmaps. 3961 */ 3962 3963/* 3964 * Called just after updating the cylinder group block to allocate an inode. 3965 */ 3966void 3967softdep_setup_inomapdep(bp, ip, newinum) 3968 struct buf *bp; /* buffer for cylgroup block with inode map */ 3969 struct inode *ip; /* inode related to allocation */ 3970 ino_t newinum; /* new inode number being allocated */ 3971{ 3972 struct inodedep *inodedep; 3973 struct bmsafemap *bmsafemap; 3974 struct jaddref *jaddref; 3975 struct mount *mp; 3976 struct fs *fs; 3977 3978 mp = UFSTOVFS(ip->i_ump); 3979 fs = ip->i_ump->um_fs; 3980 jaddref = NULL; 3981 3982 /* 3983 * Allocate the journal reference add structure so that the bitmap 3984 * can be dependent on it. 3985 */ 3986 if (mp->mnt_kern_flag & MNTK_SUJ) { 3987 jaddref = newjaddref(ip, newinum, 0, 0, 0); 3988 jaddref->ja_state |= NEWBLOCK; 3989 } 3990 3991 /* 3992 * Create a dependency for the newly allocated inode. 3993 * Panic if it already exists as something is seriously wrong. 3994 * Otherwise add it to the dependency list for the buffer holding 3995 * the cylinder group map from which it was allocated. 
3996 */ 3997 ACQUIRE_LOCK(&lk); 3998 if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep))) 3999 panic("softdep_setup_inomapdep: dependency %p for new" 4000 "inode already exists", inodedep); 4001 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum)); 4002 if (jaddref) { 4003 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); 4004 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4005 if_deps); 4006 } else { 4007 inodedep->id_state |= ONDEPLIST; 4008 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 4009 } 4010 inodedep->id_bmsafemap = bmsafemap; 4011 inodedep->id_state &= ~DEPCOMPLETE; 4012 FREE_LOCK(&lk); 4013} 4014 4015/* 4016 * Called just after updating the cylinder group block to 4017 * allocate block or fragment. 4018 */ 4019void 4020softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 4021 struct buf *bp; /* buffer for cylgroup block with block map */ 4022 struct mount *mp; /* filesystem doing allocation */ 4023 ufs2_daddr_t newblkno; /* number of newly allocated block */ 4024 int frags; /* Number of fragments. */ 4025 int oldfrags; /* Previous number of fragments for extend. */ 4026{ 4027 struct newblk *newblk; 4028 struct bmsafemap *bmsafemap; 4029 struct jnewblk *jnewblk; 4030 struct fs *fs; 4031 4032 fs = VFSTOUFS(mp)->um_fs; 4033 jnewblk = NULL; 4034 /* 4035 * Create a dependency for the newly allocated block. 4036 * Add it to the dependency list for the buffer holding 4037 * the cylinder group map from which it was allocated. 
4038 */ 4039 if (mp->mnt_kern_flag & MNTK_SUJ) { 4040 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4041 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4042 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4043 jnewblk->jn_state = ATTACHED; 4044 jnewblk->jn_blkno = newblkno; 4045 jnewblk->jn_frags = frags; 4046 jnewblk->jn_oldfrags = oldfrags; 4047#ifdef SUJ_DEBUG 4048 { 4049 struct cg *cgp; 4050 uint8_t *blksfree; 4051 long bno; 4052 int i; 4053 4054 cgp = (struct cg *)bp->b_data; 4055 blksfree = cg_blksfree(cgp); 4056 bno = dtogd(fs, jnewblk->jn_blkno); 4057 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4058 i++) { 4059 if (isset(blksfree, bno + i)) 4060 panic("softdep_setup_blkmapdep: " 4061 "free fragment %d from %d-%d " 4062 "state 0x%X dep %p", i, 4063 jnewblk->jn_oldfrags, 4064 jnewblk->jn_frags, 4065 jnewblk->jn_state, 4066 jnewblk->jn_newblk); 4067 } 4068 } 4069#endif 4070 } 4071 ACQUIRE_LOCK(&lk); 4072 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4073 panic("softdep_setup_blkmapdep: found block"); 4074 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4075 dtog(fs, newblkno)); 4076 if (jnewblk) { 4077 jnewblk->jn_newblk = newblk; 4078 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4079 } else { 4080 newblk->nb_state |= ONDEPLIST; 4081 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4082 } 4083 newblk->nb_bmsafemap = bmsafemap; 4084 newblk->nb_jnewblk = jnewblk; 4085 FREE_LOCK(&lk); 4086} 4087 4088#define BMSAFEMAP_HASH(fs, cg) \ 4089 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4090 4091static int 4092bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4093 struct bmsafemap_hashhead *bmsafemaphd; 4094 struct mount *mp; 4095 int cg; 4096 struct bmsafemap **bmsafemapp; 4097{ 4098 struct bmsafemap *bmsafemap; 4099 4100 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4101 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4102 break; 4103 if 
(bmsafemap) { 4104 *bmsafemapp = bmsafemap; 4105 return (1); 4106 } 4107 *bmsafemapp = NULL; 4108 4109 return (0); 4110} 4111 4112/* 4113 * Find the bmsafemap associated with a cylinder group buffer. 4114 * If none exists, create one. The buffer must be locked when 4115 * this routine is called and this routine must be called with 4116 * splbio interrupts blocked. 4117 */ 4118static struct bmsafemap * 4119bmsafemap_lookup(mp, bp, cg) 4120 struct mount *mp; 4121 struct buf *bp; 4122 int cg; 4123{ 4124 struct bmsafemap_hashhead *bmsafemaphd; 4125 struct bmsafemap *bmsafemap, *collision; 4126 struct worklist *wk; 4127 struct fs *fs; 4128 4129 mtx_assert(&lk, MA_OWNED); 4130 if (bp) 4131 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4132 if (wk->wk_type == D_BMSAFEMAP) 4133 return (WK_BMSAFEMAP(wk)); 4134 fs = VFSTOUFS(mp)->um_fs; 4135 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4136 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4137 return (bmsafemap); 4138 FREE_LOCK(&lk); 4139 bmsafemap = malloc(sizeof(struct bmsafemap), 4140 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4141 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4142 bmsafemap->sm_buf = bp; 4143 LIST_INIT(&bmsafemap->sm_inodedephd); 4144 LIST_INIT(&bmsafemap->sm_inodedepwr); 4145 LIST_INIT(&bmsafemap->sm_newblkhd); 4146 LIST_INIT(&bmsafemap->sm_newblkwr); 4147 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4148 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4149 ACQUIRE_LOCK(&lk); 4150 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4151 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4152 return (collision); 4153 } 4154 bmsafemap->sm_cg = cg; 4155 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4156 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4157 return (bmsafemap); 4158} 4159 4160/* 4161 * Direct block allocation dependencies. 4162 * 4163 * When a new block is allocated, the corresponding disk locations must be 4164 * initialized (with zeros or new data) before the on-disk inode points to 4165 * them. 
Also, the freemap from which the block was allocated must be 4166 * updated (on disk) before the inode's pointer. These two dependencies are 4167 * independent of each other and are needed for all file blocks and indirect 4168 * blocks that are pointed to directly by the inode. Just before the 4169 * "in-core" version of the inode is updated with a newly allocated block 4170 * number, a procedure (below) is called to setup allocation dependency 4171 * structures. These structures are removed when the corresponding 4172 * dependencies are satisfied or when the block allocation becomes obsolete 4173 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4174 * fragment that gets upgraded). All of these cases are handled in 4175 * procedures described later. 4176 * 4177 * When a file extension causes a fragment to be upgraded, either to a larger 4178 * fragment or to a full block, the on-disk location may change (if the 4179 * previous fragment could not simply be extended). In this case, the old 4180 * fragment must be de-allocated, but not until after the inode's pointer has 4181 * been updated. In most cases, this is handled by later procedures, which 4182 * will construct a "freefrag" structure to be added to the workitem queue 4183 * when the inode update is complete (or obsolete). The main exception to 4184 * this is when an allocation occurs while a pending allocation dependency 4185 * (for the same block pointer) remains. This case is handled in the main 4186 * allocation dependency setup procedure by immediately freeing the 4187 * unreferenced fragments. 
 */
void
softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t off;		/* block pointer within inode */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block (the replaced frag) */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct freefrag *freefrag;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct jnewblk *jnewblk;
	struct newblk *newblk;
	struct mount *mp;
	ufs_lbn_t lbn;

	lbn = bp->b_lblkno;
	mp = UFSTOVFS(ip->i_ump);
	/*
	 * If a fragment is being replaced at a new disk location, set up
	 * a freefrag to release the old fragment once this allocation's
	 * dependencies are satisfied.  Done before taking lk: newfreefrag
	 * allocates memory.
	 */
	if (oldblkno && oldblkno != newblkno)
		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
	else
		freefrag = NULL;

	ACQUIRE_LOCK(&lk);
	if (off >= NDADDR) {
		if (lbn > 0)
			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
			    lbn, off);
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		if (off != lbn)
			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
			    lbn, off);
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
		    &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/* The newblk was created at bitmap-allocation time; find it. */
	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");
	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
	    ("softdep_setup_allocdirect: newblk already initialized"));
	/*
	 * Convert the newblk to an allocdirect.
	 */
	newblk->nb_list.wk_type = D_ALLOCDIRECT;
	adp = (struct allocdirect *)newblk;
	newblk->nb_freefrag = freefrag;
	adp->ad_offset = off;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;

	/*
	 * Finish initializing the journal.
	 */
	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
		jnewblk->jn_ino = ip->i_number;
		jnewblk->jn_lbn = lbn;
		add_to_journal(&jnewblk->jn_list);
	}
	if (freefrag && freefrag->ff_jfreefrag != NULL)
		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	adp->ad_inodedep = inodedep;

	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_offset <= off) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_offset == off)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
		if (oldadp->ad_offset >= off)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_offset == off)
		allocdirect_merge(adphead, adp, oldadp);

	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct worklist *wk;
	struct freefrag *freefrag;
	struct newdirblk *newdirblk;

	freefrag = NULL;
	mtx_assert(&lk, MA_OWNED);
	/* The new dep must be replacing exactly what the old one created. */
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_offset >= NDADDR)
		panic("%s %jd != new %jd || old size %ld != new %ld",
		    "allocdirect_merge: old blkno",
		    (intmax_t)newadp->ad_oldblkno,
		    (intmax_t)oldadp->ad_newblkno,
		    newadp->ad_oldsize, oldadp->ad_newsize);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_newblk. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	freefrag = newadp->ad_freefrag;
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	/*
	 * If we are tracking a new directory-block allocation,
	 * move it from the old allocdirect to the new allocdirect.
	 */
	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
		newdirblk = WK_NEWDIRBLK(wk);
		WORKLIST_REMOVE(&newdirblk->db_list);
		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
			panic("allocdirect_merge: extra newdirblk");
		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
	}
	TAILQ_REMOVE(adphead, oldadp, ad_next);
	/*
	 * We need to move any journal dependencies over to the freefrag
	 * that releases this block if it exists. Otherwise we are
	 * extending an existing block and we'll wait until that is
	 * complete to release the journal space and extend the
	 * new journal to cover this old space as well.
	 */
	if (freefrag == NULL) {
		struct jnewblk *jnewblk;
		struct jnewblk *njnewblk;

		if (oldadp->ad_newblkno != newadp->ad_newblkno)
			panic("allocdirect_merge: %jd != %jd",
			    oldadp->ad_newblkno, newadp->ad_newblkno);
		jnewblk = oldadp->ad_block.nb_jnewblk;
		cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
		/*
		 * We have an unwritten jnewblk, we need to merge the
		 * frag bits with our own. The newer adp's journal can not
		 * be written prior to the old one so no need to check for
		 * it here.
		 */
		if (jnewblk) {
			njnewblk = newadp->ad_block.nb_jnewblk;
			if (njnewblk == NULL)
				panic("allocdirect_merge: No jnewblk");
			if (jnewblk->jn_state & UNDONE) {
				njnewblk->jn_state |= UNDONE | NEWBLOCK;
				njnewblk->jn_state &= ~ATTACHED;
				jnewblk->jn_state &= ~UNDONE;
			}
			njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
			WORKLIST_REMOVE(&jnewblk->jn_list);
			jnewblk->jn_state |= ATTACHED | COMPLETE;
			free_jnewblk(jnewblk);
		}
	} else {
		/*
		 * We can skip journaling for this freefrag and just complete
		 * any pending journal work for the allocdirect that is being
		 * removed after the freefrag completes.
		 */
		if (freefrag->ff_jfreefrag)
			cancel_jfreefrag(freefrag->ff_jfreefrag);
		cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
	}
	free_newblk(&oldadp->ad_block);
}

/*
 * Allocate a jfreefrag structure to journal a single block free.
 */
static struct jfreefrag *
newjfreefrag(freefrag, ip, blkno, size, lbn)
	struct freefrag *freefrag;	/* freefrag this journal entry covers */
	struct inode *ip;		/* inode the fragment belonged to */
	ufs2_daddr_t blkno;		/* disk address of fragment to free */
	long size;			/* size in bytes of the fragment */
	ufs_lbn_t lbn;			/* logical block number of fragment */
{
	struct jfreefrag *jfreefrag;
	struct fs *fs;

	fs = ip->i_fs;
	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
	    M_SOFTDEP_FLAGS);
	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
	/* Each journal record gets a jsegdep to track its segment. */
	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
	jfreefrag->fr_ino = ip->i_number;
	jfreefrag->fr_lbn = lbn;
	jfreefrag->fr_blkno = blkno;
	jfreefrag->fr_frags = numfrags(fs, size);
	jfreefrag->fr_freefrag = freefrag;

	return (jfreefrag);
}

/*
 * Allocate a new freefrag structure.
 */
static struct freefrag *
newfreefrag(ip, blkno, size, lbn)
	struct inode *ip;	/* inode owning the replaced fragment */
	ufs2_daddr_t blkno;	/* disk address of fragment to free */
	long size;		/* size in bytes of the fragment */
	ufs_lbn_t lbn;		/* logical block number of fragment */
{
	struct freefrag *freefrag;
	struct fs *fs;

	fs = ip->i_fs;
	/* A fragment may never span a block boundary. */
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	freefrag = malloc(sizeof(struct freefrag),
	    M_FREEFRAG, M_SOFTDEP_FLAGS);
	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
	freefrag->ff_state = ATTACHED;
	LIST_INIT(&freefrag->ff_jwork);
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;

	/* Journal the free only when the filesystem runs with SU+J. */
	if (fs->fs_flags & FS_SUJ) {
		freefrag->ff_jfreefrag =
		    newjfreefrag(freefrag, ip, blkno, size, lbn);
	} else {
		freefrag->ff_state |= DEPCOMPLETE;
		freefrag->ff_jfreefrag = NULL;
	}

	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
	struct workhead wkhd;

	/*
	 * It would be illegal to add new completion items to the
	 * freefrag after it was scheduled to be done so it must be
	 * safe to modify the list head here.
	 */
	LIST_INIT(&wkhd);
	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
	/* Pending journal work is handed off to ffs_blkfree via wkhd. */
	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
	    freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
	ACQUIRE_LOCK(&lk);
	WORKITEM_FREE(freefrag, D_FREEFRAG);
	FREE_LOCK(&lk);
}

/*
 * Set up a dependency structure for an external attributes data block.
 * This routine follows much of the structure of softdep_setup_allocdirect.
 * See the description of softdep_setup_allocdirect above for details.
 */
void
softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which ext block is being added */
	ufs_lbn_t off;		/* ext block pointer within inode */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct freefrag *freefrag;
	struct inodedep *inodedep;
	struct jnewblk *jnewblk;
	struct newblk *newblk;
	struct mount *mp;
	ufs_lbn_t lbn;

	if (off >= NXADDR)
		panic("softdep_setup_allocext: lbn %lld > NXADDR",
		    (long long)off);

	lbn = bp->b_lblkno;
	mp = UFSTOVFS(ip->i_ump);
	/* Arrange to free a replaced fragment, as in allocdirect. */
	if (oldblkno && oldblkno != newblkno)
		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
	else
		freefrag = NULL;

	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocext: lost block");
	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
	    ("softdep_setup_allocext: newblk already initialized"));
	/*
	 * Convert the newblk to an allocdirect.
	 */
	newblk->nb_list.wk_type = D_ALLOCDIRECT;
	adp = (struct allocdirect *)newblk;
	newblk->nb_freefrag = freefrag;
	adp->ad_offset = off;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	/* EXTDATA distinguishes ext-attr deps from regular data deps. */
	adp->ad_state |= EXTDATA;

	/*
	 * Finish initializing the journal.
	 */
	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
		jnewblk->jn_ino = ip->i_number;
		jnewblk->jn_lbn = lbn;
		add_to_journal(&jnewblk->jn_list);
	}
	if (freefrag && freefrag->ff_jfreefrag != NULL)
		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	adp->ad_inodedep = inodedep;

	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newextupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_offset <= off) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_offset == off)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
		if (oldadp->ad_offset >= off)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocext: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_offset == off)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here. Because an indirect block contains many pointers that
 * may have dependencies, a second copy of the entire in-memory indirect
 * block is kept. The buffer cache copy is always completely up-to-date.
 * The second copy, which is used only as a source for disk writes,
 * contains only the safe pointers (i.e., those that have no remaining
 * update dependencies). The second copy is freed when all pointers
 * are safe. The cache is not allowed to replace indirect blocks with
 * pending update dependencies. If a buffer containing an indirect
 * block with dependencies is written, these routines will mark it
 * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine in conjunction with
 * softdep_sync_metadata work together to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
 * procedures are used when setting up indirect block pointer
 * dependencies. The division is necessary because of the organization
 * of the "balloc" routine and because of the distinction between file
 * pages and file metadata blocks.
 */

/*
 * Allocate a new allocindir structure.
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
	ufs_lbn_t lbn;
{
	struct newblk *newblk;
	struct allocindir *aip;
	struct freefrag *freefrag;
	struct jnewblk *jnewblk;

	/* A replaced block through an indirect is always a full block. */
	if (oldblkno)
		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
	else
		freefrag = NULL;
	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
		panic("new_allocindir: lost block");
	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
	    ("newallocindir: newblk already initialized"));
	/* Convert the newblk to an allocindir. */
	newblk->nb_list.wk_type = D_ALLOCINDIR;
	newblk->nb_freefrag = freefrag;
	aip = (struct allocindir *)newblk;
	aip->ai_offset = ptrno;
	aip->ai_oldblkno = oldblkno;
	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
		jnewblk->jn_ino = ip->i_number;
		jnewblk->jn_lbn = lbn;
		add_to_journal(&jnewblk->jn_list);
	}
	if (freefrag && freefrag->ff_jfreefrag != NULL)
		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
4672 */ 4673void 4674softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 4675 struct inode *ip; /* inode for file being extended */ 4676 ufs_lbn_t lbn; /* allocated block number within file */ 4677 struct buf *bp; /* buffer with indirect blk referencing page */ 4678 int ptrno; /* offset of pointer in indirect block */ 4679 ufs2_daddr_t newblkno; /* disk block number being added */ 4680 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 4681 struct buf *nbp; /* buffer holding allocated page */ 4682{ 4683 struct inodedep *inodedep; 4684 struct allocindir *aip; 4685 struct pagedep *pagedep; 4686 struct mount *mp; 4687 4688 if (lbn != nbp->b_lblkno) 4689 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 4690 lbn, bp->b_lblkno); 4691 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 4692 mp = UFSTOVFS(ip->i_ump); 4693 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 4694 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 4695 /* 4696 * If we are allocating a directory page, then we must 4697 * allocate an associated pagedep to track additions and 4698 * deletions. 4699 */ 4700 if ((ip->i_mode & IFMT) == IFDIR && 4701 pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0) 4702 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); 4703 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 4704 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 4705 FREE_LOCK(&lk); 4706} 4707 4708/* 4709 * Called just before setting an indirect block pointer to a 4710 * newly allocated indirect block. 
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs2_daddr_t newblkno;	/* disk block number being added */
{
	struct inodedep *inodedep;
	struct allocindir *aip;
	ufs_lbn_t lbn;

	lbn = nbp->b_lblkno;
	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
	/* Indirect blocks are never fragment upgrades: oldblkno is 0. */
	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
	FREE_LOCK(&lk);
}

/*
 * Release an indirdep whose gating dependency has completed:
 * take it off its newblk's list, free the allocindirs parked on
 * ir_completehd, and free the indirdep itself if it is no longer
 * attached to a buffer.
 */
static void
indirdep_complete(indirdep)
	struct indirdep *indirdep;
{
	struct allocindir *aip;

	LIST_REMOVE(indirdep, ir_next);
	indirdep->ir_state &= ~ONDEPLIST;

	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
		LIST_REMOVE(aip, ai_next);
		free_newblk(&aip->ai_block);
	}
	/*
	 * If this indirdep is not attached to a buf it was simply waiting
	 * on completion to clear completehd. free_indirdep() asserts
	 * that nothing is dangling.
	 */
	if ((indirdep->ir_state & ONWORKLIST) == 0)
		free_indirdep(indirdep);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
 */
static void
setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
	struct buf *bp;		/* in-memory copy of the indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct inodedep *inodedep; /* Inodedep for ip */
	struct allocindir *aip;	/* allocindir allocated by the above routines */
	ufs_lbn_t lbn;		/* Logical block number for this block. */
{
	struct worklist *wk;
	struct fs *fs;
	struct newblk *newblk;
	struct indirdep *indirdep, *newindirdep;
	struct allocindir *oldaip;
	struct freefrag *freefrag;
	struct mount *mp;
	ufs2_daddr_t blkno;

	mp = UFSTOVFS(ip->i_ump);
	fs = ip->i_fs;
	mtx_assert(&lk, MA_OWNED);
	/* Indirect blocks live at negative logical block numbers. */
	if (bp->b_lblkno >= 0)
		panic("setup_allocindir_phase2: not indir blk");
	/*
	 * Retry loop: find (or create) the indirdep for bp. Creating
	 * one requires dropping lk, so after re-acquiring we loop back
	 * to recheck for a racing creation before committing ours.
	 */
	for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
			if (wk->wk_type != D_INDIRDEP)
				continue;
			indirdep = WK_INDIRDEP(wk);
			break;
		}
		if (indirdep == NULL && newindirdep) {
			/* No race: install the indirdep we built below. */
			indirdep = newindirdep;
			newindirdep = NULL;
			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
			if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
			    &newblk)) {
				indirdep->ir_state |= ONDEPLIST;
				LIST_INSERT_HEAD(&newblk->nb_indirdeps,
				    indirdep, ir_next);
			} else
				indirdep->ir_state |= DEPCOMPLETE;
		}
		if (indirdep) {
			aip->ai_indirdep = indirdep;
			/*
			 * Check to see if there is an existing dependency
			 * for this block. If there is, merge the old
			 * dependency into the new one. This happens
			 * as a result of reallocblk only.
			 */
			if (aip->ai_oldblkno == 0)
				oldaip = NULL;
			else

				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
				    ai_next)
					if (oldaip->ai_offset == aip->ai_offset)
						break;
			if (oldaip != NULL)
				freefrag = allocindir_merge(aip, oldaip);
			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
			KASSERT(aip->ai_offset >= 0 &&
			    aip->ai_offset < NINDIR(ip->i_ump->um_fs),
			    ("setup_allocindir_phase2: Bad offset %d",
			    aip->ai_offset));
			KASSERT(indirdep->ir_savebp != NULL,
			    ("setup_allocindir_phase2 NULL ir_savebp"));
			/*
			 * Roll back the pointer in the save area so the
			 * on-disk copy only ever shows the old (safe) value.
			 */
			if (ip->i_ump->um_fstype == UFS1)
				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
				    [aip->ai_offset] = aip->ai_oldblkno;
			else
				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
				    [aip->ai_offset] = aip->ai_oldblkno;
			FREE_LOCK(&lk);
			if (freefrag != NULL)
				handle_workitem_freefrag(freefrag);
		} else
			FREE_LOCK(&lk);
		if (newindirdep) {
			/* A racing thread won; discard our spare indirdep. */
			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
			brelse(newindirdep->ir_savebp);
			ACQUIRE_LOCK(&lk);
			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
			if (indirdep)
				break;
			FREE_LOCK(&lk);
		}
		if (indirdep) {
			ACQUIRE_LOCK(&lk);
			break;
		}
		/* Build a fresh indirdep with lk dropped (may sleep). */
		newindirdep = malloc(sizeof(struct indirdep),
		    M_INDIRDEP, M_SOFTDEP_FLAGS);
		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
		newindirdep->ir_state = ATTACHED;
		if (ip->i_ump->um_fstype == UFS1)
			newindirdep->ir_state |= UFS1FMT;
		newindirdep->ir_saveddata = NULL;
		LIST_INIT(&newindirdep->ir_deplisthd);
		LIST_INIT(&newindirdep->ir_donehd);
		LIST_INIT(&newindirdep->ir_writehd);
		LIST_INIT(&newindirdep->ir_completehd);
		LIST_INIT(&newindirdep->ir_jwork);
		if (bp->b_blkno == bp->b_lblkno) {
			/* Buffer not yet mapped; resolve its disk address. */
			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
			    NULL, NULL);
			bp->b_blkno = blkno;
		}
		newindirdep->ir_savebp =
		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
		BUF_KERNPROC(newindirdep->ir_savebp);
		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Merge two allocindirs which refer to the same block. Move newblock
 * dependencies and setup the freefrags appropriately.
 */
static struct freefrag *
allocindir_merge(aip, oldaip)
	struct allocindir *aip;
	struct allocindir *oldaip;
{
	struct newdirblk *newdirblk;
	struct freefrag *freefrag;
	struct worklist *wk;

	if (oldaip->ai_newblkno != aip->ai_oldblkno)
		panic("allocindir_merge: blkno");
	aip->ai_oldblkno = oldaip->ai_oldblkno;
	/* Swap freefrags: the new aip adopts the old one's. */
	freefrag = aip->ai_freefrag;
	aip->ai_freefrag = oldaip->ai_freefrag;
	oldaip->ai_freefrag = NULL;
	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
	/*
	 * If we are tracking a new directory-block allocation,
	 * move it from the old allocindir to the new allocindir.
	 */
	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
		newdirblk = WK_NEWDIRBLK(wk);
		WORKLIST_REMOVE(&newdirblk->db_list);
		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
			panic("allocindir_merge: extra newdirblk");
		WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
	}
	/*
	 * We can skip journaling for this freefrag and just complete
	 * any pending journal work for the allocindir that is being
	 * removed after the freefrag completes.
	 */
	if (freefrag->ff_jfreefrag)
		cancel_jfreefrag(freefrag->ff_jfreefrag);
	LIST_REMOVE(oldaip, ai_next);
	cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork);
	free_newblk(&oldaip->ai_block);

	return (freefrag);
}

/*
 * Block de-allocation dependencies.
 *
 * When blocks are de-allocated, the on-disk pointers must be nullified before
 * the blocks are made available for use by other files.
(The true 4924 * requirement is that old pointers must be nullified before new on-disk 4925 * pointers are set. We chose this slightly more stringent requirement to 4926 * reduce complexity.) Our implementation handles this dependency by updating 4927 * the inode (or indirect block) appropriately but delaying the actual block 4928 * de-allocation (i.e., freemap and free space count manipulation) until 4929 * after the updated versions reach stable storage. After the disk is 4930 * updated, the blocks can be safely de-allocated whenever it is convenient. 4931 * This implementation handles only the common case of reducing a file's 4932 * length to zero. Other cases are handled by the conventional synchronous 4933 * write approach. 4934 * 4935 * The ffs implementation with which we worked double-checks 4936 * the state of the block pointers and file size as it reduces 4937 * a file's length. Some of this code is replicated here in our 4938 * soft updates implementation. The freeblks->fb_chkcnt field is 4939 * used to transfer a part of this information to the procedure 4940 * that eventually de-allocates the blocks. 4941 * 4942 * This routine should be called from the routine that shortens 4943 * a file's length, before the inode's size or block pointers 4944 * are modified. It will save the block pointer information for 4945 * later release and zero the inode so that the calling routine 4946 * can release it. 
 */
void
softdep_setup_freeblocks(ip, length, flags)
	struct inode *ip;	/* The inode whose length is to be reduced */
	off_t length;		/* The new length for the file */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct ufs1_dinode *dp1;
	struct ufs2_dinode *dp2;
	struct freeblks *freeblks;
	struct inodedep *inodedep;
	struct allocdirect *adp;
	struct jfreeblk *jfreeblk;
	struct bufobj *bo;
	struct vnode *vp;
	struct buf *bp;
	struct fs *fs;
	ufs2_daddr_t extblocks, datablocks;
	struct mount *mp;
	int i, delay, error;
	ufs2_daddr_t blkno;
	ufs_lbn_t tmpval;
	ufs_lbn_t lbn;
	long oldextsize;
	long oldsize;
	int frags;
	int needj;

	fs = ip->i_fs;
	mp = UFSTOVFS(ip->i_ump);
	/* Only truncation to zero is handled by soft updates. */
	if (length != 0)
		panic("softdep_setup_freeblocks: non-zero length");
	freeblks = malloc(sizeof(struct freeblks),
	    M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
	LIST_INIT(&freeblks->fb_jfreeblkhd);
	LIST_INIT(&freeblks->fb_jwork);
	freeblks->fb_state = ATTACHED;
	freeblks->fb_uid = ip->i_uid;
	freeblks->fb_previousinum = ip->i_number;
	freeblks->fb_devvp = ip->i_devvp;
	freeblks->fb_chkcnt = 0;
	ACQUIRE_LOCK(&lk);
	/*
	 * If we're truncating a removed file that will never be written
	 * we don't need to journal the block frees. The canceled journals
	 * for the allocations will suffice.
	 */
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
	    (fs->fs_flags & FS_SUJ) == 0)
		needj = 0;
	else
		needj = 1;
	num_freeblkdep++;
	FREE_LOCK(&lk);
	extblocks = 0;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	datablocks = DIP(ip, i_blocks) - extblocks;
	if ((flags & IO_NORMAL) != 0) {
		oldsize = ip->i_size;
		ip->i_size = 0;
		DIP_SET(ip, i_size, 0);
		freeblks->fb_chkcnt = datablocks;
		/* Capture and clear the direct block pointers. */
		for (i = 0; i < NDADDR; i++) {
			blkno = DIP(ip, i_db[i]);
			DIP_SET(ip, i_db[i], 0);
			if (blkno == 0)
				continue;
			frags = sblksize(fs, oldsize, i);
			frags = numfrags(fs, frags);
			newfreework(freeblks, NULL, i, blkno, frags, needj);
		}
		/*
		 * Capture and clear the indirect block pointers;
		 * indirects are keyed by negative lbns.
		 */
		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
		    i++, tmpval *= NINDIR(fs)) {
			blkno = DIP(ip, i_ib[i]);
			DIP_SET(ip, i_ib[i], 0);
			if (blkno)
				newfreework(freeblks, NULL, -lbn - i, blkno,
				    fs->fs_frag, needj);
			lbn += tmpval;
		}
		/*
		 * If the file was removed, then the space being freed was
		 * accounted for then (see softdep_releasefile()). If the
		 * file is merely being truncated, then we account for it now.
		 */
		if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
			UFS_LOCK(ip->i_ump);
			fs->fs_pendingblocks += datablocks;
			UFS_UNLOCK(ip->i_ump);
		}
	}
	if ((flags & IO_EXT) != 0) {
		/* External attribute area: UFS2 only (di_extb). */
		oldextsize = ip->i_din2->di_extsize;
		ip->i_din2->di_extsize = 0;
		freeblks->fb_chkcnt += extblocks;
		for (i = 0; i < NXADDR; i++) {
			blkno = ip->i_din2->di_extb[i];
			ip->i_din2->di_extb[i] = 0;
			if (blkno == 0)
				continue;
			frags = sblksize(fs, oldextsize, i);
			frags = numfrags(fs, frags);
			newfreework(freeblks, NULL, -1 - i, blkno, frags,
			    needj);
		}
	}
	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
		needj = 0;
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
	 * to delete its dependencies below. Once the dependencies are gone
	 * the buffer can be safely released.
	 */
	if ((error = bread(ip->i_devvp,
	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
		brelse(bp);
		/*
		 * NOTE(review): execution continues and dereferences bp
		 * below even on bread() failure — presumably softdep_error()
		 * never returns on a fatal error; verify its semantics.
		 */
		softdep_error("softdep_setup_freeblocks", error);
	}
	if (ip->i_ump->um_fstype == UFS1) {
		dp1 = ((struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number));
		ip->i_din1->di_freelink = dp1->di_freelink;
		*dp1 = *ip->i_din1;
	} else {
		dp2 = ((struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number));
		ip->i_din2->di_freelink = dp2->di_freelink;
		*dp2 = *ip->i_din2;
	}
	/*
	 * Find and eliminate any inode dependencies.
	 */
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & IOSTARTED) != 0)
		panic("softdep_setup_freeblocks: inode busy");
	/*
	 * Add the freeblks structure to the list of operations that
	 * must await the zero'ed inode being written to disk. If we
	 * still have a bitmap dependency (delay == 0), then the inode
	 * has never been written to disk, so we can process the
	 * freeblks below once we have deleted the dependencies.
	 */
	delay = (inodedep->id_state & DEPCOMPLETE);
	if (delay)
		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
	else if (needj)
		freeblks->fb_state |= DEPCOMPLETE | COMPLETE;
	/*
	 * Because the file length has been truncated to zero, any
	 * pending block allocation dependency structures associated
	 * with this inode are obsolete and can simply be de-allocated.
	 * We must first merge the two dependency lists to get rid of
	 * any duplicate freefrag structures, then purge the merged list.
	 * If we still have a bitmap dependency, then the inode has never
	 * been written to disk, so we can free any fragments without delay.
	 */
	if (flags & IO_NORMAL) {
		merge_inode_lists(&inodedep->id_newinoupdt,
		    &inodedep->id_inoupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
			cancel_allocdirect(&inodedep->id_inoupdt, adp,
			    freeblks, delay);
	}
	if (flags & IO_EXT) {
		merge_inode_lists(&inodedep->id_newextupdt,
		    &inodedep->id_extupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
			cancel_allocdirect(&inodedep->id_extupdt, adp,
			    freeblks, delay);
	}
	LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
		add_to_journal(&jfreeblk->jf_list);

	FREE_LOCK(&lk);
	bdwrite(bp);
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 * Once they are all there, walk the list and get rid of
	 * any dependencies.
	 */
	vp = ITOV(ip);
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	drain_output(vp);
restart:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
		/* Skip buffers for the data class not being truncated. */
		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & IO_NORMAL) == 0 &&
		    (bp->b_xflags & BX_ALTDATA) == 0))
			continue;
		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
			goto restart;
		BO_UNLOCK(bo);
		ACQUIRE_LOCK(&lk);
		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
		if (deallocate_dependencies(bp, inodedep, freeblks))
			bp->b_flags |= B_INVAL | B_NOCACHE;
		FREE_LOCK(&lk);
		brelse(bp);
		BO_LOCK(bo);
		/* List may have changed while unlocked; rescan. */
		goto restart;
	}
	BO_UNLOCK(bo);
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);

	if (delay) {
		freeblks->fb_state |= DEPCOMPLETE;
		/*
		 * If the inode with zeroed block pointers is now on disk
		 * we can start freeing blocks. Add freeblks to the worklist
		 * instead of calling handle_workitem_freeblocks directly as
		 * it is more likely that additional IO is needed to complete
		 * the request here than in the !delay case.
		 */
		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
			add_to_worklist(&freeblks->fb_list, 1);
	}

	FREE_LOCK(&lk);
	/*
	 * If the inode has never been written to disk (delay == 0) and
	 * we're not waiting on any journal writes, then we can process the
	 * freeblks now that we have deleted the dependencies.
	 */
	if (!delay && !needj)
		handle_workitem_freeblocks(freeblks, 0);
}

/*
 * Reclaim any dependency structures from a buffer that is about to
 * be reallocated to a new vnode. The buffer must be locked, thus,
 * no I/O completion operations can occur while we are manipulating
 * its associated dependencies.
 * The mutex is held so that other I/O's
 * associated with related dependencies do not occur.  Returns 1 if
 * all dependencies were cleared, 0 otherwise.
 */
static int
deallocate_dependencies(bp, inodedep, freeblks)
	struct buf *bp;		/* locked buffer whose dependencies are reclaimed */
	struct inodedep *inodedep;	/* owning inode dependency, may be NULL */
	struct freeblks *freeblks;	/* freeblks this truncation belongs to */
{
	struct worklist *wk;
	struct indirdep *indirdep;
	struct newdirblk *newdirblk;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct jremref *jremref;
	struct jmvref *jmvref;
	struct dirrem *dirrem;
	int i;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Walk the dependency list until it is empty.  Each case either
	 * consumes the head item or returns 0 after blocking in jwait()
	 * (which drops lk), telling the caller to rescan.
	 */
	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		switch (wk->wk_type) {

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			/*
			 * Sanity check: an indirdep must hang off a
			 * device-address buffer that matches its saved copy.
			 */
			if (bp->b_lblkno >= 0 ||
			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
				panic("deallocate_dependencies: not indir");
			cancel_indirdep(indirdep, bp, inodedep, freeblks);
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			/*
			 * There should be no directory add dependencies present
			 * as the directory could not be truncated until all
			 * children were removed.
			 */
			KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
			    ("deallocate_dependencies: pendinghd != NULL"));
			for (i = 0; i < DAHASHSZ; i++)
				KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
				    ("deallocate_dependencies: diraddhd != NULL"));
			/*
			 * Copy any directory remove dependencies to the list
			 * to be processed after the zero'ed inode is written.
			 * If the inode has already been written, then they
			 * can be dumped directly onto the work list.
			 */
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				/*
				 * If there are any dirrems we wait for
				 * the journal write to complete and
				 * then restart the buf scan as the lock
				 * has been dropped.
				 */
				while ((jremref =
				    LIST_FIRST(&dirrem->dm_jremrefhd))
				    != NULL) {
					stat_jwait_filepage++;
					jwait(&jremref->jr_list);
					return (0);
				}
				LIST_REMOVE(dirrem, dm_next);
				dirrem->dm_dirinum = pagedep->pd_ino;
				if (inodedep == NULL ||
				    (inodedep->id_state & ALLCOMPLETE) ==
				    ALLCOMPLETE) {
					dirrem->dm_state |= COMPLETE;
					add_to_worklist(&dirrem->dm_list, 0);
				} else
					WORKLIST_INSERT(&inodedep->id_bufwait,
					    &dirrem->dm_list);
			}
			if ((pagedep->pd_state & NEWBLOCK) != 0) {
				newdirblk = pagedep->pd_newdirblk;
				WORKLIST_REMOVE(&newdirblk->db_list);
				free_newdirblk(newdirblk);
			}
			/*
			 * Any pending journal move refs force a wait and a
			 * rescan as well, since jwait() drops lk.
			 */
			while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
			    != NULL) {
				stat_jwait_filepage++;
				jwait(&jmvref->jm_list);
				return (0);
			}
			WORKLIST_REMOVE(&pagedep->pd_list);
			LIST_REMOVE(pagedep, pd_hash);
			WORKITEM_FREE(pagedep, D_PAGEDEP);
			continue;

		case D_ALLOCINDIR:
			aip = WK_ALLOCINDIR(wk);
			cancel_allocindir(aip, inodedep, freeblks);
			continue;

		case D_ALLOCDIRECT:
		case D_INODEDEP:
			panic("deallocate_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */

		default:
			panic("deallocate_dependencies: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}

	return (1);
}

/*
 * An allocdirect is being canceled due to a truncate.  We must make sure
 * the journal entry is released in concert with the blkfree that releases
 * the storage.  Completed journal entries must not be released until the
 * space is no longer pointed to by the inode or in the bitmap.
 */
static void
cancel_allocdirect(adphead, adp, freeblks, delay)
	struct allocdirectlst *adphead;	/* list the allocdirect is removed from */
	struct allocdirect *adp;	/* allocdirect being canceled */
	struct freeblks *freeblks;	/* freeblks for this truncation */
	int delay;	/* non-zero: defer the newblk free to id_bufwait */
{
	struct freework *freework;
	struct newblk *newblk;
	struct worklist *wk;
	ufs_lbn_t lbn;

	TAILQ_REMOVE(adphead, adp, ad_next);
	newblk = (struct newblk *)adp;
	/*
	 * If the journal hasn't been written the jnewblk must be passed
	 * to the call to ffs_freeblk that reclaims the space.  We accomplish
	 * this by linking the journal dependency into the freework to be
	 * freed when freework_freeblock() is called.  If the journal has
	 * been written we can simply reclaim the journal space when the
	 * freeblks work is complete.
	 */
	if (newblk->nb_jnewblk == NULL) {
		cancel_newblk(newblk, &freeblks->fb_jwork);
		goto found;
	}
	lbn = newblk->nb_jnewblk->jn_lbn;
	/*
	 * Find the correct freework structure so it releases the canceled
	 * journal when the bitmap is cleared.  This preserves rollback
	 * until the allocation is reverted.
	 */
	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
		freework = WK_FREEWORK(wk);
		if (freework->fw_lbn != lbn)
			continue;
		cancel_newblk(newblk, &freework->fw_jwork);
		goto found;
	}
	panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
found:
	if (delay)
		WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
		    &newblk->nb_list);
	else
		free_newblk(newblk);
	return;
}


/*
 * Undo a block allocation that is being reverted.  Dismantle any indirect
 * dependencies hanging off the newblk, detach it from its dependency and
 * work lists, and either cancel its unwritten journal entry into wkhd or
 * move its completed journal work there.
 */
static void
cancel_newblk(newblk, wkhd)
	struct newblk *newblk;
	struct workhead *wkhd;	/* where canceled/pending journal work goes */
{
	struct indirdep *indirdep;
	struct allocindir *aip;

	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
		indirdep->ir_state &= ~ONDEPLIST;
		LIST_REMOVE(indirdep, ir_next);
		/*
		 * If an indirdep is not on the buf worklist we need to
		 * free it here as deallocate_dependencies() will never
		 * find it.  These pointers were never visible on disk and
		 * can be discarded immediately.
		 */
		while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
			LIST_REMOVE(aip, ai_next);
			cancel_newblk(&aip->ai_block, wkhd);
			free_newblk(&aip->ai_block);
		}
		/*
		 * If this indirdep is not attached to a buf it was simply
		 * waiting on completion to clear completehd.  free_indirdep()
		 * asserts that nothing is dangling.
		 */
		if ((indirdep->ir_state & ONWORKLIST) == 0)
			free_indirdep(indirdep);
	}
	if (newblk->nb_state & ONDEPLIST) {
		newblk->nb_state &= ~ONDEPLIST;
		LIST_REMOVE(newblk, nb_deps);
	}
	if (newblk->nb_state & ONWORKLIST)
		WORKLIST_REMOVE(&newblk->nb_list);
	/*
	 * If the journal entry hasn't been written we hold onto the dep
	 * until it is safe to free along with the other journal work.
	 */
	if (newblk->nb_jnewblk != NULL) {
		cancel_jnewblk(newblk->nb_jnewblk, wkhd);
		newblk->nb_jnewblk = NULL;
	}
	if (!LIST_EMPTY(&newblk->nb_jwork))
		jwork_move(wkhd, &newblk->nb_jwork);
}

/*
 * Free a newblk.  Generate a new freefrag work request if appropriate.
 * This must be called after the inode pointer and any direct block pointers
 * are valid or fully removed via truncate or frag extension.
 */
static void
free_newblk(newblk)
	struct newblk *newblk;
{
	struct indirdep *indirdep;
	struct newdirblk *newdirblk;
	struct freefrag *freefrag;
	struct worklist *wk;

	mtx_assert(&lk, MA_OWNED);
	if (newblk->nb_state & ONDEPLIST)
		LIST_REMOVE(newblk, nb_deps);
	if (newblk->nb_state & ONWORKLIST)
		WORKLIST_REMOVE(&newblk->nb_list);
	LIST_REMOVE(newblk, nb_hash);
	/* A pending freefrag can now be scheduled once fully complete. */
	if ((freefrag = newblk->nb_freefrag) != NULL) {
		freefrag->ff_state |= COMPLETE;
		if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
			add_to_worklist(&freefrag->ff_list, 0);
	}
	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
		newdirblk = WK_NEWDIRBLK(wk);
		WORKLIST_REMOVE(&newdirblk->db_list);
		if (!LIST_EMPTY(&newblk->nb_newdirblk))
			panic("free_newblk: extra newdirblk");
		free_newdirblk(newdirblk);
	}
	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
		indirdep->ir_state |= DEPCOMPLETE;
		indirdep_complete(indirdep);
	}
	KASSERT(newblk->nb_jnewblk == NULL,
	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
	handle_jwork(&newblk->nb_jwork);
	newblk->nb_list.wk_type = D_NEWBLK;
	WORKITEM_FREE(newblk, D_NEWBLK);
}

/*
 * Free a newdirblk.  Clear the NEWBLOCK flag on its associated pagedep.
 * This routine must be called with splbio interrupts blocked.
5450 */ 5451static void 5452free_newdirblk(newdirblk) 5453 struct newdirblk *newdirblk; 5454{ 5455 struct pagedep *pagedep; 5456 struct diradd *dap; 5457 struct worklist *wk; 5458 int i; 5459 5460 mtx_assert(&lk, MA_OWNED); 5461 /* 5462 * If the pagedep is still linked onto the directory buffer 5463 * dependency chain, then some of the entries on the 5464 * pd_pendinghd list may not be committed to disk yet. In 5465 * this case, we will simply clear the NEWBLOCK flag and 5466 * let the pd_pendinghd list be processed when the pagedep 5467 * is next written. If the pagedep is no longer on the buffer 5468 * dependency chain, then all the entries on the pd_pending 5469 * list are committed to disk and we can free them here. 5470 */ 5471 pagedep = newdirblk->db_pagedep; 5472 pagedep->pd_state &= ~NEWBLOCK; 5473 if ((pagedep->pd_state & ONWORKLIST) == 0) 5474 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 5475 free_diradd(dap, NULL); 5476 /* 5477 * If no dependencies remain, the pagedep will be freed. 5478 */ 5479 for (i = 0; i < DAHASHSZ; i++) 5480 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 5481 break; 5482 if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 && 5483 LIST_EMPTY(&pagedep->pd_jmvrefhd)) { 5484 KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL, 5485 ("free_newdirblk: Freeing non-free pagedep %p", pagedep)); 5486 LIST_REMOVE(pagedep, pd_hash); 5487 WORKITEM_FREE(pagedep, D_PAGEDEP); 5488 } 5489 /* Should only ever be one item in the list. */ 5490 while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { 5491 WORKLIST_REMOVE(wk); 5492 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 5493 } 5494 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 5495} 5496 5497/* 5498 * Prepare an inode to be freed. The actual free operation is not 5499 * done until the zero'ed inode has been written to disk. 
 */
void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;	/* parent directory vnode */
	ino_t ino;		/* inode number being freed */
	int mode;		/* mode of the freed inode */
{
	struct inode *ip = VTOI(pvp);
	struct inodedep *inodedep;
	struct freefile *freefile;

	/*
	 * This sets up the inode de-allocation dependency.
	 */
	freefile = malloc(sizeof(struct freefile),
		M_FREEFILE, M_SOFTDEP_FLAGS);
	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
	freefile->fx_mode = mode;
	freefile->fx_oldinum = ino;
	freefile->fx_devvp = ip->i_devvp;
	LIST_INIT(&freefile->fx_jwork);
	if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
		UFS_LOCK(ip->i_ump);
		ip->i_fs->fs_pendinginodes += 1;
		UFS_UNLOCK(ip->i_ump);
	}

	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk.  If the allocated inode has never been
	 * written to disk, then the on-disk inode is zero'ed.  In either
	 * case we can free the file immediately.  If the journal was
	 * canceled before being written the inode will never make it to
	 * disk and we must send the canceled journal entries to
	 * ffs_freefile() to be cleared in conjunction with the bitmap.
	 * Any blocks waiting on the inode to write can be safely freed
	 * here as it will never been written.
	 */
	ACQUIRE_LOCK(&lk);
	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
	/*
	 * Remove this inode from the unlinked list and set
	 * GOINGAWAY as appropriate to indicate that this inode
	 * will never be written.
	 */
	if (inodedep && inodedep->id_state & UNLINKED) {
		/*
		 * Save the journal work to be freed with the bitmap
		 * before we clear UNLINKED.  Otherwise it can be lost
		 * if the inode block is written.
		 */
		handle_bufwait(inodedep, &freefile->fx_jwork);
		clear_unlinked_inodedep(inodedep);
		/* Re-acquire inodedep as we've dropped lk. */
		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
		if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
			inodedep->id_state |= GOINGAWAY;
	}
	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
		FREE_LOCK(&lk);
		handle_workitem_freefile(freefile);
		return;
	}
	/* Otherwise the free must wait until the inode block is written. */
	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
	FREE_LOCK(&lk);
	if (ip->i_number == ino)
		ip->i_flag |= IN_MODIFIED;
}

/*
 * Check to see if an inode has never been written to disk.  If
 * so free the inodedep and return success, otherwise return failure.
 * This routine must be called with splbio interrupts blocked.
 *
 * If we still have a bitmap dependency, then the inode has never
 * been written to disk.  Drop the dependency as it is no longer
 * necessary since the inode is being deallocated.  We set the
 * ALLCOMPLETE flags since the bitmap now properly shows that the
 * inode is not allocated.  Even if the inode is actively being
 * written, it has been rolled back to its zero'ed state, so we
 * are ensured that a zero inode is what is on the disk.  For short
 * lived files, this change will usually result in removing all the
 * dependencies from the inode so that it can be freed immediately.
5583 */ 5584static int 5585check_inode_unwritten(inodedep) 5586 struct inodedep *inodedep; 5587{ 5588 5589 mtx_assert(&lk, MA_OWNED); 5590 5591 if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || 5592 !LIST_EMPTY(&inodedep->id_pendinghd) || 5593 !LIST_EMPTY(&inodedep->id_bufwait) || 5594 !LIST_EMPTY(&inodedep->id_inowait) || 5595 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 5596 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 5597 !TAILQ_EMPTY(&inodedep->id_extupdt) || 5598 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 5599 inodedep->id_mkdiradd != NULL || 5600 inodedep->id_nlinkdelta != 0) 5601 return (0); 5602 /* 5603 * Another process might be in initiate_write_inodeblock_ufs[12] 5604 * trying to allocate memory without holding "Softdep Lock". 5605 */ 5606 if ((inodedep->id_state & IOSTARTED) != 0 && 5607 inodedep->id_savedino1 == NULL) 5608 return (0); 5609 5610 if (inodedep->id_state & ONDEPLIST) 5611 LIST_REMOVE(inodedep, id_deps); 5612 inodedep->id_state &= ~ONDEPLIST; 5613 inodedep->id_state |= ALLCOMPLETE; 5614 inodedep->id_bmsafemap = NULL; 5615 if (inodedep->id_state & ONWORKLIST) 5616 WORKLIST_REMOVE(&inodedep->id_list); 5617 if (inodedep->id_savedino1 != NULL) { 5618 free(inodedep->id_savedino1, M_SAVEDINO); 5619 inodedep->id_savedino1 = NULL; 5620 } 5621 if (free_inodedep(inodedep) == 0) 5622 panic("check_inode_unwritten: busy inode"); 5623 return (1); 5624} 5625 5626/* 5627 * Try to free an inodedep structure. Return 1 if it could be freed. 
 */
static int
free_inodedep(inodedep)
	struct inodedep *inodedep;
{

	mtx_assert(&lk, MA_OWNED);
	/* Only a fully complete, fully drained inodedep may be freed. */
	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
	    !LIST_EMPTY(&inodedep->id_inowait) ||
	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
	    inodedep->id_mkdiradd != NULL ||
	    inodedep->id_nlinkdelta != 0 ||
	    inodedep->id_savedino1 != NULL)
		return (0);
	if (inodedep->id_state & ONDEPLIST)
		LIST_REMOVE(inodedep, id_deps);
	LIST_REMOVE(inodedep, id_hash);
	WORKITEM_FREE(inodedep, D_INODEDEP);
	num_inodedep -= 1;
	return (1);
}

/*
 * Free the block referenced by a freework structure.  The parent freeblks
 * structure is released and completed when the final cg bitmap reaches
 * the disk.  This routine may be freeing a jnewblk which never made it to
 * disk in which case we do not have to wait as the operation is undone
 * in memory immediately.
 */
static void
freework_freeblock(freework)
	struct freework *freework;
{
	struct freeblks *freeblks;
	struct ufsmount *ump;
	struct workhead wkhd;
	struct fs *fs;
	int complete;
	int pending;
	int bsize;
	int needj;

	freeblks = freework->fw_freeblks;
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
	complete = 0;
	LIST_INIT(&wkhd);
	/*
	 * If we are canceling an existing jnewblk pass it to the free
	 * routine, otherwise pass the freeblk which will ultimately
	 * release the freeblks.  If we're not journaling, we can just
	 * free the freeblks immediately.
	 */
	if (!LIST_EMPTY(&freework->fw_jwork)) {
		LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
		complete = 1;
	} else if (needj)
		WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
	bsize = lfragtosize(fs, freework->fw_frags);
	pending = btodb(bsize);
	ACQUIRE_LOCK(&lk);
	freeblks->fb_chkcnt -= pending;
	FREE_LOCK(&lk);
	/*
	 * extattr blocks don't show up in pending blocks.  XXX why?
	 */
	if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
		UFS_LOCK(ump);
		fs->fs_pendingblocks -= pending;
		UFS_UNLOCK(ump);
	}
	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
	    bsize, freeblks->fb_previousinum, &wkhd);
	/* When journaling, completion waits for the cg bitmap write. */
	if (complete == 0 && needj)
		return;
	/*
	 * The jnewblk will be discarded and the bits in the map never
	 * made it to disk.  We can immediately free the freeblk.
	 */
	ACQUIRE_LOCK(&lk);
	handle_written_freework(freework);
	FREE_LOCK(&lk);
}

/*
 * Start, continue, or finish the process of freeing an indirect block tree.
 * The free operation may be paused at any point with fw_off containing the
 * offset to restart from.  This enables us to implement some flow control
 * for large truncates which may fan out and generate a huge number of
 * dependencies.
 */
static void
handle_workitem_indirblk(freework)
	struct freework *freework;
{
	struct freeblks *freeblks;
	struct ufsmount *ump;
	struct fs *fs;


	freeblks = freework->fw_freeblks;
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	/* fw_off == NINDIR means every child is done; free the indirect. */
	if (freework->fw_off == NINDIR(fs))
		freework_freeblock(freework);
	else
		indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
		    freework->fw_lbn);
}

/*
 * Called when a freework structure attached to a cg buf is written.
The 5749 * ref on either the parent or the freeblks structure is released and 5750 * either may be added to the worklist if it is the final ref. 5751 */ 5752static void 5753handle_written_freework(freework) 5754 struct freework *freework; 5755{ 5756 struct freeblks *freeblks; 5757 struct freework *parent; 5758 5759 freeblks = freework->fw_freeblks; 5760 parent = freework->fw_parent; 5761 if (parent) { 5762 if (--parent->fw_ref != 0) 5763 parent = NULL; 5764 freeblks = NULL; 5765 } else if (--freeblks->fb_ref != 0) 5766 freeblks = NULL; 5767 WORKITEM_FREE(freework, D_FREEWORK); 5768 /* 5769 * Don't delay these block frees or it takes an intolerable amount 5770 * of time to process truncates and free their journal entries. 5771 */ 5772 if (freeblks) 5773 add_to_worklist(&freeblks->fb_list, 1); 5774 if (parent) 5775 add_to_worklist(&parent->fw_list, 1); 5776} 5777 5778/* 5779 * This workitem routine performs the block de-allocation. 5780 * The workitem is added to the pending list after the updated 5781 * inode block has been written to disk. As mentioned above, 5782 * checks regarding the number of blocks de-allocated (compared 5783 * to the number of blocks allocated for the file) are also 5784 * performed in this function. 
 */
static void
handle_workitem_freeblocks(freeblks, flags)
	struct freeblks *freeblks;
	int flags;
{
	struct freework *freework;
	struct worklist *wk;

	KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
	    ("handle_workitem_freeblocks: Journal entries not written."));
	if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
		handle_complete_freeblocks(freeblks);
		return;
	}
	/* Hold a ref so freeblks survives until all freework is dispatched. */
	freeblks->fb_ref++;
	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
		KASSERT(wk->wk_type == D_FREEWORK,
		    ("handle_workitem_freeblocks: Unknown type %s",
		    TYPENAME(wk->wk_type)));
		WORKLIST_REMOVE_UNLOCKED(wk);
		freework = WK_FREEWORK(wk);
		/* Indirect-tree lbns are <= -NDADDR; the rest are direct. */
		if (freework->fw_lbn <= -NDADDR)
			handle_workitem_indirblk(freework);
		else
			freework_freeblock(freework);
	}
	ACQUIRE_LOCK(&lk);
	if (--freeblks->fb_ref != 0)
		freeblks = NULL;
	FREE_LOCK(&lk);
	if (freeblks)
		handle_complete_freeblocks(freeblks);
}

/*
 * Once all of the freework workitems are complete we can retire the
 * freeblocks dependency and any journal work awaiting completion.  This
 * can not be called until all other dependencies are stable on disk.
 */
static void
handle_complete_freeblocks(freeblks)
	struct freeblks *freeblks;
{
	struct inode *ip;
	struct vnode *vp;
	struct fs *fs;
	struct ufsmount *ump;
	int flags;

	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
	fs = ump->um_fs;
	flags = LK_NOWAIT;

	/*
	 * If we still have not finished background cleanup, then check
	 * to see if the block count needs to be adjusted.
	 */
	if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
	    ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
		ip = VTOI(vp);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
		ip->i_flag |= IN_CHANGE;
		vput(vp);
	}

#ifdef INVARIANTS
	if (freeblks->fb_chkcnt != 0 &&
	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
		printf("handle_workitem_freeblocks: block count\n");
#endif /* INVARIANTS */

	ACQUIRE_LOCK(&lk);
	/*
	 * All of the freeblock deps must be complete prior to this call
	 * so it's now safe to complete earlier outstanding journal entries.
	 */
	handle_jwork(&freeblks->fb_jwork);
	WORKITEM_FREE(freeblks, D_FREEBLKS);
	num_freeblkdep--;
	FREE_LOCK(&lk);
}

/*
 * Release blocks associated with the inode ip and stored in the indirect
 * block dbn.  If level is greater than SINGLE, the block is an indirect block
 * and recursive calls to indirtrunc must be used to cleanse other indirect
 * blocks.
5874 */ 5875static void 5876indir_trunc(freework, dbn, lbn) 5877 struct freework *freework; 5878 ufs2_daddr_t dbn; 5879 ufs_lbn_t lbn; 5880{ 5881 struct freework *nfreework; 5882 struct workhead wkhd; 5883 struct jnewblk *jnewblk; 5884 struct freeblks *freeblks; 5885 struct buf *bp; 5886 struct fs *fs; 5887 struct worklist *wkn; 5888 struct worklist *wk; 5889 struct indirdep *indirdep; 5890 struct ufsmount *ump; 5891 ufs1_daddr_t *bap1 = 0; 5892 ufs2_daddr_t nb, nnb, *bap2 = 0; 5893 ufs_lbn_t lbnadd; 5894 int i, nblocks, ufs1fmt; 5895 int fs_pendingblocks; 5896 int freedeps; 5897 int needj; 5898 int level; 5899 int cnt; 5900 5901 LIST_INIT(&wkhd); 5902 level = lbn_level(lbn); 5903 if (level == -1) 5904 panic("indir_trunc: Invalid lbn %jd\n", lbn); 5905 freeblks = freework->fw_freeblks; 5906 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 5907 fs = ump->um_fs; 5908 fs_pendingblocks = 0; 5909 freedeps = 0; 5910 needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ; 5911 lbnadd = 1; 5912 for (i = level; i > 0; i--) 5913 lbnadd *= NINDIR(fs); 5914 /* 5915 * Get buffer of block pointers to be freed. This routine is not 5916 * called until the zero'ed inode has been written, so it is safe 5917 * to free blocks as they are encountered. Because the inode has 5918 * been zero'ed, calls to bmap on these blocks will fail. So, we 5919 * have to use the on-disk address and the block device for the 5920 * filesystem to look them up. If the file was deleted before its 5921 * indirect blocks were all written to disk, the routine that set 5922 * us up (deallocate_dependencies) will have arranged to leave 5923 * a complete copy of the indirect block in memory for our use. 5924 * Otherwise we have to read the blocks in from the disk. 
5925 */ 5926#ifdef notyet 5927 bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0, 5928 GB_NOCREAT); 5929#else 5930 bp = incore(&freeblks->fb_devvp->v_bufobj, dbn); 5931#endif 5932 ACQUIRE_LOCK(&lk); 5933 if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { 5934 if (wk->wk_type != D_INDIRDEP || 5935 (wk->wk_state & GOINGAWAY) == 0) 5936 panic("indir_trunc: lost indirdep %p", wk); 5937 indirdep = WK_INDIRDEP(wk); 5938 LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list); 5939 free_indirdep(indirdep); 5940 if (!LIST_EMPTY(&bp->b_dep)) 5941 panic("indir_trunc: dangling dep %p", 5942 LIST_FIRST(&bp->b_dep)); 5943 ump->um_numindirdeps -= 1; 5944 FREE_LOCK(&lk); 5945 } else { 5946#ifdef notyet 5947 if (bp) 5948 brelse(bp); 5949#endif 5950 FREE_LOCK(&lk); 5951 if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 5952 NOCRED, &bp) != 0) { 5953 brelse(bp); 5954 return; 5955 } 5956 } 5957 /* 5958 * Recursively free indirect blocks. 5959 */ 5960 if (ump->um_fstype == UFS1) { 5961 ufs1fmt = 1; 5962 bap1 = (ufs1_daddr_t *)bp->b_data; 5963 } else { 5964 ufs1fmt = 0; 5965 bap2 = (ufs2_daddr_t *)bp->b_data; 5966 } 5967 /* 5968 * Reclaim indirect blocks which never made it to disk. 5969 */ 5970 cnt = 0; 5971 LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) { 5972 struct workhead freewk; 5973 if (wk->wk_type != D_JNEWBLK) 5974 continue; 5975 WORKLIST_REMOVE_UNLOCKED(wk); 5976 LIST_INIT(&freewk); 5977 WORKLIST_INSERT_UNLOCKED(&freewk, wk); 5978 jnewblk = WK_JNEWBLK(wk); 5979 if (jnewblk->jn_lbn > 0) 5980 i = (jnewblk->jn_lbn - -lbn) / lbnadd; 5981 else 5982 i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd; 5983 KASSERT(i >= 0 && i < NINDIR(fs), 5984 ("indir_trunc: Index out of range %d parent %jd lbn %jd", 5985 i, lbn, jnewblk->jn_lbn)); 5986 /* Clear the pointer so it isn't found below. 
*/ 5987 if (ufs1fmt) { 5988 nb = bap1[i]; 5989 bap1[i] = 0; 5990 } else { 5991 nb = bap2[i]; 5992 bap2[i] = 0; 5993 } 5994 KASSERT(nb == jnewblk->jn_blkno, 5995 ("indir_trunc: Block mismatch %jd != %jd", 5996 nb, jnewblk->jn_blkno)); 5997 ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno, 5998 fs->fs_bsize, freeblks->fb_previousinum, &freewk); 5999 cnt++; 6000 } 6001 ACQUIRE_LOCK(&lk); 6002 if (needj) 6003 freework->fw_ref += NINDIR(fs) + 1; 6004 /* Any remaining journal work can be completed with freeblks. */ 6005 jwork_move(&freeblks->fb_jwork, &wkhd); 6006 FREE_LOCK(&lk); 6007 nblocks = btodb(fs->fs_bsize); 6008 if (ufs1fmt) 6009 nb = bap1[0]; 6010 else 6011 nb = bap2[0]; 6012 nfreework = freework; 6013 /* 6014 * Reclaim on disk blocks. 6015 */ 6016 for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { 6017 if (i != NINDIR(fs) - 1) { 6018 if (ufs1fmt) 6019 nnb = bap1[i+1]; 6020 else 6021 nnb = bap2[i+1]; 6022 } else 6023 nnb = 0; 6024 if (nb == 0) 6025 continue; 6026 cnt++; 6027 if (level != 0) { 6028 ufs_lbn_t nlbn; 6029 6030 nlbn = (lbn + 1) - (i * lbnadd); 6031 if (needj != 0) { 6032 nfreework = newfreework(freeblks, freework, 6033 nlbn, nb, fs->fs_frag, 0); 6034 freedeps++; 6035 } 6036 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 6037 } else { 6038 struct freedep *freedep; 6039 6040 /* 6041 * Attempt to aggregate freedep dependencies for 6042 * all blocks being released to the same CG. 6043 */ 6044 LIST_INIT(&wkhd); 6045 if (needj != 0 && 6046 (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { 6047 freedep = newfreedep(freework); 6048 WORKLIST_INSERT_UNLOCKED(&wkhd, 6049 &freedep->fd_list); 6050 freedeps++; 6051 } 6052 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, 6053 fs->fs_bsize, freeblks->fb_previousinum, &wkhd); 6054 } 6055 } 6056 if (level == 0) 6057 fs_pendingblocks = (nblocks * cnt); 6058 /* 6059 * If we're not journaling we can free the indirect now. 
Otherwise
	 * setup the ref counts and offset so this indirect can be completed
	 * when its children are free.
	 */
	if (needj == 0) {
		fs_pendingblocks += nblocks;
		dbn = dbtofsb(fs, dbn);
		ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
		    freeblks->fb_previousinum, NULL);
		ACQUIRE_LOCK(&lk);
		freeblks->fb_chkcnt -= fs_pendingblocks;
		if (freework->fw_blkno == dbn)
			handle_written_freework(freework);
		FREE_LOCK(&lk);
		freework = NULL;
	} else {
		ACQUIRE_LOCK(&lk);
		freework->fw_off = i;
		freework->fw_ref += freedeps;
		freework->fw_ref -= NINDIR(fs) + 1;
		if (freework->fw_ref != 0)
			freework = NULL;
		freeblks->fb_chkcnt -= fs_pendingblocks;
		FREE_LOCK(&lk);
	}
	if (fs_pendingblocks) {
		UFS_LOCK(ump);
		fs->fs_pendingblocks -= fs_pendingblocks;
		UFS_UNLOCK(ump);
	}
	/* The indirect block's contents are now stale; discard the buffer. */
	bp->b_flags |= B_INVAL | B_NOCACHE;
	brelse(bp);
	if (freework)
		handle_workitem_indirblk(freework);
	return;
}

/*
 * Cancel an allocindir when it is removed via truncation.  The newblk
 * underlying the allocindir is either queued on the inodedep's bufwait
 * list or freed immediately, depending on whether the inode write has
 * any remaining dependencies (DEPCOMPLETE).
 */
static void
cancel_allocindir(aip, inodedep, freeblks)
	struct allocindir *aip;		/* dependency being canceled */
	struct inodedep *inodedep;	/* may be NULL */
	struct freeblks *freeblks;	/* freeblks that reclaims the space */
{
	struct newblk *newblk;

	/*
	 * If the journal hasn't been written the jnewblk must be passed
	 * to the call to ffs_freeblk that reclaims the space.  We accomplish
	 * this by linking the journal dependency into the indirdep to be
	 * freed when indir_trunc() is called.  If the journal has already
	 * been written we can simply reclaim the journal space when the
	 * freeblks work is complete.
	 */
	LIST_REMOVE(aip, ai_next);
	/* The allocindir begins with an embedded newblk. */
	newblk = (struct newblk *)aip;
	if (newblk->nb_jnewblk == NULL)
		cancel_newblk(newblk, &freeblks->fb_jwork);
	else
		cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
	if (inodedep && inodedep->id_state & DEPCOMPLETE)
		WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
	else
		free_newblk(newblk);
}

/*
 * Create the mkdir dependencies for . and .. in a new directory.  Link them
 * in to a newdirblk so any subsequent additions are tracked properly.  The
 * caller is responsible for adding the mkdir1 dependency to the journal
 * and updating id_mkdiradd.  This function returns with lk held.
 *
 * Returns mkdir1 (the MKDIR_BODY dependency on the new directory's
 * first block) and stores mkdir2 (the MKDIR_PARENT dependency on the
 * parent's link count) through *mkdirp.
 */
static struct mkdir *
setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
	struct diradd *dap;	/* diradd for the new directory's entry */
	ino_t newinum;		/* inode number of the new directory */
	ino_t dinum;		/* inode number of the parent directory */
	struct buf *newdirbp;	/* buffer holding the new directory block */
	struct mkdir **mkdirp;	/* out: the MKDIR_PARENT mkdir */
{
	struct newblk *newblk;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct newdirblk *newdirblk = 0;
	struct mkdir *mkdir1, *mkdir2;
	struct worklist *wk;
	struct jaddref *jaddref;
	struct mount *mp;

	mp = dap->da_list.wk_mp;
	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
	    M_SOFTDEP_FLAGS);
	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
	LIST_INIT(&newdirblk->db_mkdir);
	/* mkdir1 depends on the new directory block ("." / "..") being written. */
	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
	mkdir1->md_state = ATTACHED | MKDIR_BODY;
	mkdir1->md_diradd = dap;
	mkdir1->md_jaddref = NULL;
	/* mkdir2 depends on the parent inode's updated link count. */
	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
	mkdir2->md_diradd = dap;
	mkdir2->md_jaddref = NULL;
	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
		/* No journal: no jaddref will ever satisfy these. */
		mkdir1->md_state |= DEPCOMPLETE;
		mkdir2->md_state |= DEPCOMPLETE;
	}
	/*
	 * Dependency on "." and ".." being written to disk.
	 */
	mkdir1->md_buf = newdirbp;
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
	/*
	 * We must link the pagedep, allocdirect, and newdirblk for
	 * the initial file page so the pointer to the new directory
	 * is not written until the directory contents are live and
	 * any subsequent additions are not marked live until the
	 * block is reachable via the inode.
	 */
	if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
		panic("setup_newdir: lost pagedep");
	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
		if (wk->wk_type == D_ALLOCDIRECT)
			break;
	if (wk == NULL)
		panic("setup_newdir: lost allocdirect");
	newblk = WK_NEWBLK(wk);
	pagedep->pd_state |= NEWBLOCK;
	pagedep->pd_newdirblk = newdirblk;
	newdirblk->db_pagedep = pagedep;
	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
	/*
	 * Look up the inodedep for the parent directory so that we
	 * can link mkdir2 into the pending dotdot jaddref or
	 * the inode write if there is none.  If the inode is
	 * ALLCOMPLETE and no jaddref is present all dependencies have
	 * been satisfied and mkdir2 can be freed.
	 */
	inodedep_lookup(mp, dinum, 0, &inodedep);
	if (mp->mnt_kern_flag & MNTK_SUJ) {
		if (inodedep == NULL)
			panic("setup_newdir: Lost parent.");
		/* The most recent reference must be the dotdot addref. */
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
		    (jaddref->ja_state & MKDIR_PARENT),
		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
		mkdir2->md_jaddref = jaddref;
		jaddref->ja_mkdir = mkdir2;
	} else if (inodedep == NULL ||
	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/* Parent inode already stable; mkdir2 is unnecessary. */
		dap->da_state &= ~MKDIR_PARENT;
		WORKITEM_FREE(mkdir2, D_MKDIR);
	} else {
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
		WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
	}
	*mkdirp = mkdir2;

	return (mkdir1);
}

/*
 * Directory entry addition dependencies.
 *
 * When adding a new directory entry, the inode (with its incremented link
 * count) must be written to disk before the directory entry's pointer to it.
 * Also, if the inode is newly allocated, the corresponding freemap must be
 * updated (on disk) before the directory entry's pointer.  These requirements
 * are met via undo/redo on the directory entry's pointer, which consists
 * simply of the inode number.
 *
 * As directory entries are added and deleted, the free space within a
 * directory block can become fragmented.  The ufs filesystem will compact
 * a fragmented directory block to make space for a new entry.  When this
 * occurs, the offsets of previously added entries change.  Any "diradd"
 * dependency structures corresponding to these entries must be updated with
 * the new offsets.
 */

/*
 * This routine is called after the in-memory inode's link
 * count has been incremented, but before the directory entry's
 * pointer to the inode has been set.
 *
 * Returns 1 when the entry extended the directory into an indirect
 * block (the caller must sync); 0 otherwise.
 */
int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for directory */
	off_t diroffset;	/* offset of new entry in directory */
	ino_t newinum;		/* inode referenced by new directory entry */
	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
	int isnewblk;		/* entry is in a newly allocated block */
{
	int offset;		/* offset of new entry within directory block */
	ufs_lbn_t lbn;		/* block in directory containing new entry */
	struct fs *fs;
	struct diradd *dap;
	struct newblk *newblk;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct newdirblk *newdirblk = 0;
	struct mkdir *mkdir1, *mkdir2;
	struct jaddref *jaddref;
	struct mount *mp;
	int isindir;

	/*
	 * Whiteouts have no dependencies.
	 */
	if (newinum == WINO) {
		if (newdirbp != NULL)
			bdwrite(newdirbp);
		return (0);
	}
	jaddref = NULL;
	mkdir1 = mkdir2 = NULL;
	mp = UFSTOVFS(dp->i_ump);
	fs = dp->i_fs;
	lbn = lblkno(fs, diroffset);
	offset = blkoff(fs, diroffset);
	dap = malloc(sizeof(struct diradd), M_DIRADD,
	    M_SOFTDEP_FLAGS|M_ZERO);
	workitem_alloc(&dap->da_list, D_DIRADD, mp);
	dap->da_offset = offset;
	dap->da_newinum = newinum;
	dap->da_state = ATTACHED;
	LIST_INIT(&dap->da_jwork);
	/* Logical blocks at or beyond NDADDR live behind an indirect. */
	isindir = bp->b_lblkno >= NDADDR;
	if (isnewblk &&
	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
		newdirblk = malloc(sizeof(struct newdirblk),
		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
		LIST_INIT(&newdirblk->db_mkdir);
	}
	/*
	 * If we're creating a new directory setup the dependencies and set
	 * the dap state to wait for them.  Otherwise it's COMPLETE and
	 * we can move on.
	 */
	if (newdirbp == NULL) {
		dap->da_state |= DEPCOMPLETE;
		ACQUIRE_LOCK(&lk);
	} else {
		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
		/* setup_newdir() returns with lk held. */
		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
		    &mkdir2);
	}
	/*
	 * Link into parent directory pagedep to await its being written.
	 */
	if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
#ifdef DEBUG
	if (diradd_lookup(pagedep, offset) != NULL)
		panic("softdep_setup_directory_add: %p already at off %d\n",
		    diradd_lookup(pagedep, offset), offset);
#endif
	dap->da_pagedep = pagedep;
	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	    da_pdlist);
	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
	/*
	 * If we're journaling, link the diradd into the jaddref so it
	 * may be completed after the journal entry is written.  Otherwise,
	 * link the diradd into its inodedep.  If the inode is not yet
	 * written place it on the bufwait list, otherwise do the post-inode
	 * write processing to put it on the id_pendinghd list.
	 */
	if (mp->mnt_kern_flag & MNTK_SUJ) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
		jaddref->ja_diroff = diroffset;
		jaddref->ja_diradd = dap;
		add_to_journal(&jaddref->ja_list);
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
		diradd_inode_written(dap, inodedep);
	else
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	/*
	 * Add the journal entries for . and .. links now that the primary
	 * link is written.
	 */
	if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
		    inoreflst, if_deps);
		KASSERT(jaddref != NULL &&
		    jaddref->ja_ino == jaddref->ja_parent &&
		    (jaddref->ja_state & MKDIR_BODY),
		    ("softdep_setup_directory_add: bad dot jaddref %p",
		    jaddref));
		mkdir1->md_jaddref = jaddref;
		jaddref->ja_mkdir = mkdir1;
		/*
		 * It is important that the dotdot journal entry
		 * is added prior to the dot entry since dot writes
		 * both the dot and dotdot links.  These both must
		 * be added after the primary link for the journal
		 * to remain consistent.
		 */
		add_to_journal(&mkdir2->md_jaddref->ja_list);
		add_to_journal(&jaddref->ja_list);
	}
	/*
	 * If we are adding a new directory remember this diradd so that if
	 * we rename it we can keep the dot and dotdot dependencies.  If
	 * we are adding a new name for an inode that has a mkdiradd we
	 * must be in rename and we have to move the dot and dotdot
	 * dependencies to this new name.  The old name is being orphaned
	 * soon.
	 */
	if (mkdir1 != NULL) {
		if (inodedep->id_mkdiradd != NULL)
			panic("softdep_setup_directory_add: Existing mkdir");
		inodedep->id_mkdiradd = dap;
	} else if (inodedep->id_mkdiradd)
		merge_diradd(inodedep, dap);
	if (newdirblk) {
		/*
		 * There is nothing to do if we are already tracking
		 * this block.
		 */
		if ((pagedep->pd_state & NEWBLOCK) != 0) {
			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
			FREE_LOCK(&lk);
			return (0);
		}
		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
		    == 0)
			panic("softdep_setup_directory_add: lost entry");
		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
		pagedep->pd_state |= NEWBLOCK;
		pagedep->pd_newdirblk = newdirblk;
		newdirblk->db_pagedep = pagedep;
		FREE_LOCK(&lk);
		/*
		 * If we extended into an indirect signal direnter to sync.
		 */
		if (isindir)
			return (1);
		return (0);
	}
	FREE_LOCK(&lk);
	return (0);
}

/*
 * This procedure is called to change the offset of a directory
 * entry when compacting a directory block which must be owned
 * exclusively by the caller.  Note that the actual entry movement
 * must be done in this procedure to ensure that no I/O completions
 * occur while the move is in progress.
 */
void
softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
	struct buf *bp;		/* Buffer holding directory block. */
	struct inode *dp;	/* inode for directory */
	caddr_t base;		/* address of dp->i_offset */
	caddr_t oldloc;		/* address of old directory location */
	caddr_t newloc;		/* address of new directory location */
	int entrysize;		/* size of directory entry */
{
	int offset, oldoffset, newoffset;
	struct pagedep *pagedep;
	struct jmvref *jmvref;
	struct diradd *dap;
	struct direct *de;
	struct mount *mp;
	ufs_lbn_t lbn;
	int flags;

	mp = UFSTOVFS(dp->i_ump);
	de = (struct direct *)oldloc;
	jmvref = NULL;
	flags = 0;
	/*
	 * Moves are always journaled as it would be too complex to
	 * determine if any affected adds or removes are present in the
	 * journal.
	 */
	if (mp->mnt_kern_flag & MNTK_SUJ) {
		flags = DEPALLOC;
		jmvref = newjmvref(dp, de->d_ino,
		    dp->i_offset + (oldloc - base),
		    dp->i_offset + (newloc - base));
	}
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	oldoffset = offset + (oldloc - base);
	newoffset = offset + (newloc - base);
	ACQUIRE_LOCK(&lk);
	if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
		if (pagedep)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
		goto done;
	}
	/* Rehash the diradd under its new offset if it is still pending. */
	dap = diradd_lookup(pagedep, oldoffset);
	if (dap) {
		dap->da_offset = newoffset;
		newoffset = DIRADDHASH(newoffset);
		oldoffset = DIRADDHASH(oldoffset);
		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
		    newoffset != oldoffset) {
			LIST_REMOVE(dap, da_pdlist);
			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
			    dap, da_pdlist);
		}
	}
done:
	if (jmvref) {
		jmvref->jm_pagedep = pagedep;
		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
		add_to_journal(&jmvref->jm_list);
	}
	/* Perform the actual entry move while still holding lk. */
	bcopy(oldloc, newloc, entrysize);
	FREE_LOCK(&lk);
}

/*
 * Move the mkdir dependencies and journal work
from one diradd to another
 * when renaming a directory.  The new name must depend on the mkdir deps
 * completing as the old name did.  Directories can only have one valid link
 * at a time so one must be canonical.
 */
static void
merge_diradd(inodedep, newdap)
	struct inodedep *inodedep;	/* inodedep of the directory inode */
	struct diradd *newdap;		/* diradd for the new name */
{
	struct diradd *olddap;
	struct mkdir *mkdir, *nextmd;
	short state;

	olddap = inodedep->id_mkdiradd;
	inodedep->id_mkdiradd = newdap;
	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		newdap->da_state &= ~DEPCOMPLETE;
		/* Re-point every mkdir of the old name at the new diradd. */
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != olddap)
				continue;
			mkdir->md_diradd = newdap;
			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
			newdap->da_state |= state;
			olddap->da_state &= ~state;
			if ((olddap->da_state &
			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
				break;
		}
		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("merge_diradd: unfound ref");
	}
	/*
	 * Any mkdir related journal items are not safe to be freed until
	 * the new name is stable.
	 */
	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
	olddap->da_state |= DEPCOMPLETE;
	complete_diradd(olddap);
}

/*
 * Move the diradd to the pending list when all diradd dependencies are
 * complete.
 */
static void
complete_diradd(dap)
	struct diradd *dap;
{
	struct pagedep *pagedep;

	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	}
}

/*
 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
 * add entries and conditionally journal the remove.
 */
static void
cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
	struct diradd *dap;		/* diradd being canceled */
	struct dirrem *dirrem;		/* overlapping dirrem */
	struct jremref *jremref;	/* remove ref for the primary name */
	struct jremref *dotremref;	/* remove ref for "." (may be NULL) */
	struct jremref *dotdotremref;	/* remove ref for ".." (may be NULL) */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct inoref *inoref;
	struct mkdir *mkdir;

	/*
	 * If no remove references were allocated we're on a non-journaled
	 * filesystem and can skip the cancel step.
	 */
	if (jremref == NULL) {
		free_diradd(dap, NULL);
		return;
	}
	/*
	 * Cancel the primary name and free it if it does not require
	 * journaling.
	 */
	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
	    0, &inodedep) != 0) {
		/* Abort the addref that references this diradd. */
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if (inoref->if_list.wk_type != D_JADDREF)
				continue;
			jaddref = (struct jaddref *)inoref;
			if (jaddref->ja_diradd != dap)
				continue;
			if (cancel_jaddref(jaddref, inodedep,
			    &dirrem->dm_jwork) == 0) {
				free_jremref(jremref);
				jremref = NULL;
			}
			break;
		}
	}
	/*
	 * Cancel subordinate names and free them if they do not require
	 * journaling.
	 */
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
			if (mkdir->md_diradd != dap)
				continue;
			if ((jaddref = mkdir->md_jaddref) == NULL)
				continue;
			mkdir->md_jaddref = NULL;
			if (mkdir->md_state & MKDIR_PARENT) {
				if (cancel_jaddref(jaddref, NULL,
				    &dirrem->dm_jwork) == 0) {
					free_jremref(dotdotremref);
					dotdotremref = NULL;
				}
			} else {
				if (cancel_jaddref(jaddref, inodedep,
				    &dirrem->dm_jwork) == 0) {
					free_jremref(dotremref);
					dotremref = NULL;
				}
			}
		}
	}

	/* Journal whichever remove references survived cancellation. */
	if (jremref)
		journal_jremref(dirrem, jremref, inodedep);
	if (dotremref)
		journal_jremref(dirrem, dotremref, inodedep);
	if (dotdotremref)
		journal_jremref(dirrem, dotdotremref, NULL);
	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
	free_diradd(dap, &dirrem->dm_jwork);
}

/*
 * Free a diradd dependency structure.  This routine must be called
 * with splbio interrupts blocked.
 */
static void
free_diradd(dap, wkhd)
	struct diradd *dap;
	struct workhead *wkhd;	/* NOTE(review): unused in this body — confirm */
{
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mkdir *mkdir, *nextmd;

	/* lk must be held by the caller. */
	mtx_assert(&lk, MA_OWNED);
	LIST_REMOVE(dap, da_pdlist);
	if (dap->da_state & ONWORKLIST)
		WORKLIST_REMOVE(&dap->da_list);
	if ((dap->da_state & DIRCHG) == 0) {
		pagedep = dap->da_pagedep;
	} else {
		/* A DIRCHG diradd carries a previous dirrem to complete. */
		dirrem = dap->da_previous;
		pagedep = dirrem->dm_pagedep;
		dirrem->dm_dirinum = pagedep->pd_ino;
		dirrem->dm_state |= COMPLETE;
		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
			add_to_worklist(&dirrem->dm_list, 0);
	}
	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
	    0, &inodedep) != 0)
		if (inodedep->id_mkdiradd == dap)
			inodedep->id_mkdiradd = NULL;
	/* Tear down any mkdir dependencies still attached to this diradd. */
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != dap)
				continue;
			dap->da_state &=
			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
			LIST_REMOVE(mkdir, md_mkdirs);
			if (mkdir->md_state & ONWORKLIST)
				WORKLIST_REMOVE(&mkdir->md_list);
			if (mkdir->md_jaddref != NULL)
				panic("free_diradd: Unexpected jaddref");
			WORKITEM_FREE(mkdir, D_MKDIR);
			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
				break;
		}
		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("free_diradd: unfound ref");
	}
	if (inodedep)
		free_inodedep(inodedep);
	/*
	 * Free any journal segments waiting for the directory write.
	 */
	handle_jwork(&dap->da_jwork);
	WORKITEM_FREE(dap, D_DIRADD);
}

/*
 * Directory entry removal dependencies.
 *
 * When removing a directory entry, the entry's inode pointer must be
 * zero'ed on disk before the corresponding inode's link count is decremented
 * (possibly freeing the inode for re-use).  This dependency is handled by
 * updating the directory entry but delaying the inode count reduction until
 * after the directory block has been written to disk.  After this point, the
 * inode count can be decremented whenever it is convenient.
 */

/*
 * This routine should be called immediately after removing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will do this task when it is safe.
 */
void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
{
	struct dirrem *dirrem, *prevdirrem;
	struct inodedep *inodedep;
	int direct;

	/*
	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
	 * newdirrem() to setup the full directory remove which requires
	 * isrmdir > 1.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem);
	/*
	 * Add the dirrem to the inodedep's pending remove list for quick
	 * discovery later.
	 */
	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
	    &inodedep) == 0)
		panic("softdep_setup_remove: Lost inodedep.");
	dirrem->dm_state |= ONDEPLIST;
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to a zeroed entry until
	 * the new inode is committed to disk.  If the COMPLETE flag is
	 * set then we have deleted an entry that never made it to
	 * disk.  If the entry we deleted resulted from a name change,
	 * then the old name still resides on disk.  We cannot delete
	 * its inode (returned to us in prevdirrem) until the zeroed
	 * directory entry gets to disk.  The new inode has never been
	 * referenced on the disk, so can be deleted immediately.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
		    dm_next);
		FREE_LOCK(&lk);
	} else {
		if (prevdirrem != NULL)
			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
			    prevdirrem, dm_next);
		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
		/* With no journal refs pending we may remove immediately. */
		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
		FREE_LOCK(&lk);
		if (direct)
			handle_workitem_remove(dirrem, NULL);
	}
}

/*
 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
 * pd_pendinghd list of a pagedep.
 */
static struct diradd *
diradd_lookup(pagedep, offset)
	struct pagedep *pagedep;
	int offset;
{
	struct diradd *dap;

	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
		if (dap->da_offset == offset)
			return (dap);
	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
		if (dap->da_offset == offset)
			return (dap);
	return (NULL);
}

/*
 * Search for a .. diradd dependency in a directory that is being removed.
 * If the directory was renamed to a new parent we have a diradd rather
 * than a mkdir for the .. entry.  We need to cancel it now before
 * it is found in truncate().
 */
static struct jremref *
cancel_diradd_dotdot(ip, dirrem, jremref)
	struct inode *ip;	/* directory being removed */
	struct dirrem *dirrem;	/* dirrem for the removal */
	struct jremref *jremref;	/* consumed if the diradd is found */
{
	struct pagedep *pagedep;
	struct diradd *dap;
	struct worklist *wk;

	/* Returns jremref unchanged when there is no .. diradd to cancel. */
	if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
	    &pagedep) == 0)
		return (jremref);
	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
	if (dap == NULL)
		return (jremref);
	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
	/*
	 * Mark any journal work as belonging to the parent so it is freed
	 * with the .. reference.
	 */
	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
		wk->wk_state |= MKDIR_PARENT;
	return (NULL);
}

/*
 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
 * replace it with a dirrem/diradd pair as a result of re-parenting a
 * directory.  This ensures that we don't simultaneously have a mkdir and
 * a diradd for the same .. entry.
 */
static struct jremref *
cancel_mkdir_dotdot(ip, dirrem, jremref)
	struct inode *ip;	/* directory being re-parented */
	struct dirrem *dirrem;	/* dirrem for the .. change */
	struct jremref *jremref;	/* consumed if journaled here */
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct mkdir *mkdir;
	struct diradd *dap;

	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
	    &inodedep) == 0)
		panic("cancel_mkdir_dotdot: Lost inodedep");
	dap = inodedep->id_mkdiradd;
	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
		return (jremref);
	/* Find the MKDIR_PARENT mkdir belonging to this diradd. */
	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
			break;
	if (mkdir == NULL)
		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
	if ((jaddref = mkdir->md_jaddref) != NULL) {
		mkdir->md_jaddref = NULL;
		jaddref->ja_state &= ~MKDIR_PARENT;
		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
		    &inodedep) == 0)
			panic("cancel_mkdir_dotdot: Lost parent inodedep");
		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
			journal_jremref(dirrem, jremref, inodedep);
			jremref = NULL;
		}
	}
	if (mkdir->md_state & ONWORKLIST)
		WORKLIST_REMOVE(&mkdir->md_list);
	mkdir->md_state |= ALLCOMPLETE;
	complete_mkdir(mkdir);
	return (jremref);
}

/*
 * Link a jremref into its dirrem and the inode's reference list, then
 * add it to the journal.  When inodedep is NULL it is looked up from
 * the jremref's inode number.
 */
static void
journal_jremref(dirrem, jremref, inodedep)
	struct dirrem *dirrem;
	struct jremref *jremref;
	struct inodedep *inodedep;
{

	if (inodedep == NULL)
		if (inodedep_lookup(jremref->jr_list.wk_mp,
		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
			panic("journal_jremref: Lost inodedep");
	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
	add_to_journal(&jremref->jr_list);
}

/*
 * Journal the primary remove reference and any dot/dotdot remove
 * references attached to a dirrem.
 */
static void
dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
	struct dirrem *dirrem;
	struct jremref *jremref;	/* primary name remove ref */
	struct jremref *dotremref;	/* "." remove ref (may be NULL) */
	struct jremref *dotdotremref;	/* ".." remove ref (may be NULL) */
{
	struct inodedep *inodedep;

	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
	    &inodedep) == 0)
		panic("dirrem_journal: Lost inodedep");
	journal_jremref(dirrem, jremref, inodedep);
	if (dotremref)
		journal_jremref(dirrem, dotremref, inodedep);
	if (dotdotremref)
		journal_jremref(dirrem, dotdotremref, NULL);
}

/*
 * Allocate a new dirrem if appropriate and return it along with
 * its associated pagedep.  Called without a lock, returns with lock.
 */
static long num_dirrem;		/* number of dirrem allocated */
static struct dirrem *
newdirrem(bp, dp, ip, isrmdir, prevdirremp)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
	struct dirrem **prevdirremp; /* previously referenced inode, if any */
{
	int offset;
	ufs_lbn_t lbn;
	struct diradd *dap;
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct jremref *jremref;
	struct jremref *dotremref;
	struct jremref *dotdotremref;
	struct vnode *dvp;

	/*
	 * Whiteouts have no deletion dependencies.
	 */
	if (ip == NULL)
		panic("newdirrem: whiteout");
	dvp = ITOV(dp);
	/*
	 * If we are over our limit, try to improve the situation.
	 * Limiting the number of dirrem structures will also limit
	 * the number of freefile and freeblks structures.
6931 */ 6932 ACQUIRE_LOCK(&lk); 6933 if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2) 6934 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE); 6935 num_dirrem += 1; 6936 FREE_LOCK(&lk); 6937 dirrem = malloc(sizeof(struct dirrem), 6938 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 6939 workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); 6940 LIST_INIT(&dirrem->dm_jremrefhd); 6941 LIST_INIT(&dirrem->dm_jwork); 6942 dirrem->dm_state = isrmdir ? RMDIR : 0; 6943 dirrem->dm_oldinum = ip->i_number; 6944 *prevdirremp = NULL; 6945 /* 6946 * Allocate remove reference structures to track journal write 6947 * dependencies. We will always have one for the link and 6948 * when doing directories we will always have one more for dot. 6949 * When renaming a directory we skip the dotdot link change so 6950 * this is not needed. 6951 */ 6952 jremref = dotremref = dotdotremref = NULL; 6953 if (DOINGSUJ(dvp)) { 6954 if (isrmdir) { 6955 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 6956 ip->i_effnlink + 2); 6957 dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 6958 ip->i_effnlink + 1); 6959 } else 6960 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 6961 ip->i_effnlink + 1); 6962 if (isrmdir > 1) { 6963 dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, 6964 dp->i_effnlink + 1); 6965 dotdotremref->jr_state |= MKDIR_PARENT; 6966 } 6967 } 6968 ACQUIRE_LOCK(&lk); 6969 lbn = lblkno(dp->i_fs, dp->i_offset); 6970 offset = blkoff(dp->i_fs, dp->i_offset); 6971 if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC, 6972 &pagedep) == 0) 6973 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 6974 dirrem->dm_pagedep = pagedep; 6975 /* 6976 * If we're renaming a .. link to a new directory, cancel any 6977 * existing MKDIR_PARENT mkdir. If it has already been canceled 6978 * the jremref is preserved for any potential diradd in this 6979 * location. This can not coincide with a rmdir. 
6980 */ 6981 if (dp->i_offset == DOTDOT_OFFSET) { 6982 if (isrmdir) 6983 panic("newdirrem: .. directory change during remove?"); 6984 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); 6985 } 6986 /* 6987 * If we're removing a directory search for the .. dependency now and 6988 * cancel it. Any pending journal work will be added to the dirrem 6989 * to be completed when the workitem remove completes. 6990 */ 6991 if (isrmdir > 1) 6992 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); 6993 /* 6994 * Check for a diradd dependency for the same directory entry. 6995 * If present, then both dependencies become obsolete and can 6996 * be de-allocated. 6997 */ 6998 dap = diradd_lookup(pagedep, offset); 6999 if (dap == NULL) { 7000 /* 7001 * Link the jremref structures into the dirrem so they are 7002 * written prior to the pagedep. 7003 */ 7004 if (jremref) 7005 dirrem_journal(dirrem, jremref, dotremref, 7006 dotdotremref); 7007 return (dirrem); 7008 } 7009 /* 7010 * Must be ATTACHED at this point. 7011 */ 7012 if ((dap->da_state & ATTACHED) == 0) 7013 panic("newdirrem: not ATTACHED"); 7014 if (dap->da_newinum != ip->i_number) 7015 panic("newdirrem: inum %d should be %d", 7016 ip->i_number, dap->da_newinum); 7017 /* 7018 * If we are deleting a changed name that never made it to disk, 7019 * then return the dirrem describing the previous inode (which 7020 * represents the inode currently referenced from this entry on disk). 7021 */ 7022 if ((dap->da_state & DIRCHG) != 0) { 7023 *prevdirremp = dap->da_previous; 7024 dap->da_state &= ~DIRCHG; 7025 dap->da_pagedep = pagedep; 7026 } 7027 /* 7028 * We are deleting an entry that never made it to disk. 7029 * Mark it COMPLETE so we can delete its inode immediately. 
7030 */ 7031 dirrem->dm_state |= COMPLETE; 7032 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); 7033#ifdef SUJ_DEBUG 7034 if (isrmdir == 0) { 7035 struct worklist *wk; 7036 7037 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 7038 if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) 7039 panic("bad wk %p (0x%X)\n", wk, wk->wk_state); 7040 } 7041#endif 7042 7043 return (dirrem); 7044} 7045 7046/* 7047 * Directory entry change dependencies. 7048 * 7049 * Changing an existing directory entry requires that an add operation 7050 * be completed first followed by a deletion. The semantics for the addition 7051 * are identical to the description of adding a new entry above except 7052 * that the rollback is to the old inode number rather than zero. Once 7053 * the addition dependency is completed, the removal is done as described 7054 * in the removal routine above. 7055 */ 7056 7057/* 7058 * This routine should be called immediately after changing 7059 * a directory entry. The inode's link count should not be 7060 * decremented by the calling procedure -- the soft updates 7061 * code will perform this task when it is safe. 7062 */ 7063void 7064softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 7065 struct buf *bp; /* buffer containing directory block */ 7066 struct inode *dp; /* inode for the directory being modified */ 7067 struct inode *ip; /* inode for directory entry being removed */ 7068 ino_t newinum; /* new inode number for changed entry */ 7069 int isrmdir; /* indicates if doing RMDIR */ 7070{ 7071 int offset; 7072 struct diradd *dap = NULL; 7073 struct dirrem *dirrem, *prevdirrem; 7074 struct pagedep *pagedep; 7075 struct inodedep *inodedep; 7076 struct jaddref *jaddref; 7077 struct mount *mp; 7078 7079 offset = blkoff(dp->i_fs, dp->i_offset); 7080 mp = UFSTOVFS(dp->i_ump); 7081 7082 /* 7083 * Whiteouts do not need diradd dependencies. 
7084 */ 7085 if (newinum != WINO) { 7086 dap = malloc(sizeof(struct diradd), 7087 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 7088 workitem_alloc(&dap->da_list, D_DIRADD, mp); 7089 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 7090 dap->da_offset = offset; 7091 dap->da_newinum = newinum; 7092 LIST_INIT(&dap->da_jwork); 7093 } 7094 7095 /* 7096 * Allocate a new dirrem and ACQUIRE_LOCK. 7097 */ 7098 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 7099 pagedep = dirrem->dm_pagedep; 7100 /* 7101 * The possible values for isrmdir: 7102 * 0 - non-directory file rename 7103 * 1 - directory rename within same directory 7104 * inum - directory rename to new directory of given inode number 7105 * When renaming to a new directory, we are both deleting and 7106 * creating a new directory entry, so the link count on the new 7107 * directory should not change. Thus we do not need the followup 7108 * dirrem which is usually done in handle_workitem_remove. We set 7109 * the DIRCHG flag to tell handle_workitem_remove to skip the 7110 * followup dirrem. 7111 */ 7112 if (isrmdir > 1) 7113 dirrem->dm_state |= DIRCHG; 7114 7115 /* 7116 * Whiteouts have no additional dependencies, 7117 * so just put the dirrem on the correct list. 7118 */ 7119 if (newinum == WINO) { 7120 if ((dirrem->dm_state & COMPLETE) == 0) { 7121 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 7122 dm_next); 7123 } else { 7124 dirrem->dm_dirinum = pagedep->pd_ino; 7125 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 7126 add_to_worklist(&dirrem->dm_list, 0); 7127 } 7128 FREE_LOCK(&lk); 7129 return; 7130 } 7131 /* 7132 * Add the dirrem to the inodedep's pending remove list for quick 7133 * discovery later. A valid nlinkdelta ensures that this lookup 7134 * will not fail. 
7135 */ 7136 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 7137 panic("softdep_setup_directory_change: Lost inodedep."); 7138 dirrem->dm_state |= ONDEPLIST; 7139 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 7140 7141 /* 7142 * If the COMPLETE flag is clear, then there were no active 7143 * entries and we want to roll back to the previous inode until 7144 * the new inode is committed to disk. If the COMPLETE flag is 7145 * set, then we have deleted an entry that never made it to disk. 7146 * If the entry we deleted resulted from a name change, then the old 7147 * inode reference still resides on disk. Any rollback that we do 7148 * needs to be to that old inode (returned to us in prevdirrem). If 7149 * the entry we deleted resulted from a create, then there is 7150 * no entry on the disk, so we want to roll back to zero rather 7151 * than the uncommitted inode. In either of the COMPLETE cases we 7152 * want to immediately free the unwritten and unreferenced inode. 7153 */ 7154 if ((dirrem->dm_state & COMPLETE) == 0) { 7155 dap->da_previous = dirrem; 7156 } else { 7157 if (prevdirrem != NULL) { 7158 dap->da_previous = prevdirrem; 7159 } else { 7160 dap->da_state &= ~DIRCHG; 7161 dap->da_pagedep = pagedep; 7162 } 7163 dirrem->dm_dirinum = pagedep->pd_ino; 7164 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 7165 add_to_worklist(&dirrem->dm_list, 0); 7166 } 7167 /* 7168 * Lookup the jaddref for this journal entry. We must finish 7169 * initializing it and make the diradd write dependent on it. 7170 * If we're not journaling Put it on the id_bufwait list if the inode 7171 * is not yet written. If it is written, do the post-inode write 7172 * processing to put it on the id_pendinghd list. 
7173 */ 7174 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 7175 if (mp->mnt_kern_flag & MNTK_SUJ) { 7176 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 7177 inoreflst); 7178 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 7179 ("softdep_setup_directory_change: bad jaddref %p", 7180 jaddref)); 7181 jaddref->ja_diroff = dp->i_offset; 7182 jaddref->ja_diradd = dap; 7183 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 7184 dap, da_pdlist); 7185 add_to_journal(&jaddref->ja_list); 7186 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 7187 dap->da_state |= COMPLETE; 7188 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 7189 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 7190 } else { 7191 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 7192 dap, da_pdlist); 7193 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 7194 } 7195 /* 7196 * If we're making a new name for a directory that has not been 7197 * committed when need to move the dot and dotdot references to 7198 * this new name. 7199 */ 7200 if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) 7201 merge_diradd(inodedep, dap); 7202 FREE_LOCK(&lk); 7203} 7204 7205/* 7206 * Called whenever the link count on an inode is changed. 7207 * It creates an inode dependency so that the new reference(s) 7208 * to the inode cannot be committed to disk until the updated 7209 * inode has been written. 
 */
void
softdep_change_linkcnt(ip)
	struct inode *ip;	/* the inode with the increased link count */
{
	struct inodedep *inodedep;

	ACQUIRE_LOCK(&lk);
	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
	/* i_effnlink may lag i_nlink but must never exceed it. */
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_change_linkcnt: bad delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
}

/*
 * Called when the effective link count and the reference count
 * on an inode drops to zero. At this point there are no names
 * referencing the file in the filesystem and no active file
 * references. The space associated with the file will be freed
 * as soon as the necessary soft dependencies are cleared.
 */
void
softdep_releasefile(ip)
	struct inode *ip;	/* inode with the zero effective link count */
{
	struct inodedep *inodedep;
	struct fs *fs;
	int extblocks;

	if (ip->i_effnlink > 0)
		panic("softdep_releasefile: file still referenced");
	/*
	 * We may be called several times as the on-disk link count
	 * drops to zero. We only want to account for the space once.
	 */
	if (ip->i_flag & IN_SPACECOUNTED)
		return;
	/*
	 * We have to deactivate a snapshot otherwise copyonwrites may
	 * add blocks and the cleanup may remove blocks after we have
	 * tried to account for them.
	 */
	if ((ip->i_flags & SF_SNAPSHOT) != 0)
		ffs_snapremove(ITOV(ip));
	/*
	 * If we are tracking an nlinkdelta, we have to also remember
	 * whether we accounted for the freed space yet.
	 */
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
		inodedep->id_state |= SPACECOUNTED;
	FREE_LOCK(&lk);
	fs = ip->i_fs;
	extblocks = 0;
	/* Extended attribute blocks exist only on UFS2. */
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	UFS_LOCK(ip->i_ump);
	ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
	ip->i_fs->fs_pendinginodes += 1;
	UFS_UNLOCK(ip->i_ump);
	ip->i_flag |= IN_SPACECOUNTED;
}

/*
 * Attach a sbdep dependency to the superblock buf so that we can keep
 * track of the head of the linked list of referenced but unlinked inodes.
 */
void
softdep_setup_sbupdate(ump, fs, bp)
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;		/* buffer holding the superblock image */
{
	struct sbdep *sbdep;
	struct worklist *wk;

	/* Only journaled (SU+J) filesystems track the unlinked list. */
	if ((fs->fs_flags & FS_SUJ) == 0)
		return;
	/* Avoid attaching a second sbdep to the same buffer. */
	LIST_FOREACH(wk, &bp->b_dep, wk_list)
		if (wk->wk_type == D_SBDEP)
			break;
	if (wk != NULL)
		return;
	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
	sbdep->sb_fs = fs;
	sbdep->sb_ump = ump;
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
	FREE_LOCK(&lk);
}

/*
 * Return the first unlinked inodedep which is ready to be the head of the
 * list.  The inodedep and all those after it must have valid next pointers.
 */
static struct inodedep *
first_unlinked_inodedep(ump)
	struct ufsmount *ump;
{
	struct inodedep *inodedep;
	struct inodedep *idp;

	/*
	 * Walk backwards from the tail looking for the earliest element
	 * whose on-disk next pointer (UNLINKNEXT) is valid; everything
	 * after the returned element has already been linked on disk.
	 */
	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
	    inodedep; inodedep = idp) {
		if ((inodedep->id_state & UNLINKNEXT) == 0)
			return (NULL);
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
			break;
		/* Interior elements must be fully linked in both ways. */
		if ((inodedep->id_state & UNLINKPREV) == 0)
			panic("first_unlinked_inodedep: prev != next");
	}
	if (inodedep == NULL)
		return (NULL);

	return (inodedep);
}

/*
 * Set the sujfree unlinked head pointer prior to writing a superblock.
 */
static void
initiate_write_sbdep(sbdep)
	struct sbdep *sbdep;
{
	struct inodedep *inodedep;
	struct fs *bpfs;	/* fs image in the superblock buffer */
	struct fs *fs;		/* in-core fs */

	bpfs = sbdep->sb_fs;
	fs = sbdep->sb_ump->um_fs;
	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
	if (inodedep) {
		fs->fs_sujfree = inodedep->id_ino;
		/* The superblock now acts as this element's predecessor. */
		inodedep->id_state |= UNLINKPREV;
	} else
		fs->fs_sujfree = 0;
	/* Propagate the head pointer into the buffer being written. */
	bpfs->fs_sujfree = fs->fs_sujfree;
}

/*
 * After a superblock is written determine whether it must be written again
 * due to a changing unlinked list head.
 */
static int
handle_written_sbdep(sbdep, bp)
	struct sbdep *sbdep;
	struct buf *bp;
{
	struct inodedep *inodedep;
	struct mount *mp;
	struct fs *fs;

	fs = sbdep->sb_fs;
	mp = UFSTOVFS(sbdep->sb_ump);
	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
	/*
	 * If the unlinked head changed while the write was in flight,
	 * redirty the buffer and ask for another write (return 1).
	 */
	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
	    (inodedep == NULL && fs->fs_sujfree != 0)) {
		bdirty(bp);
		return (1);
	}
	WORKITEM_FREE(sbdep, D_SBDEP);
	if (fs->fs_sujfree == 0)
		return (0);
	if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
		panic("handle_written_sbdep: lost inodedep");
	/*
	 * Now that we have a record of this inode in stable store allow it
	 * to be written to free up pending work.  Inodes may see a lot of
	 * write activity after they are unlinked which we must not hold up.
	 */
	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
			    inodedep, inodedep->id_state);
		if (inodedep->id_state & UNLINKONLIST)
			break;
		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
	}

	return (0);
}

/*
 * Mark an inodedep as unlinked and insert it into the in-memory unlinked
 * list.
 */
static void
unlinked_inodedep(mp, inodedep)
	struct mount *mp;
	struct inodedep *inodedep;
{
	struct ufsmount *ump;

	/* The unlinked list is only maintained when journaling (SU+J). */
	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
		return;
	/* Mark the superblock modified; the head pointer will change. */
	ump = VFSTOUFS(mp);
	ump->um_fs->fs_fmod = 1;
	inodedep->id_state |= UNLINKED;
	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
}

/*
 * Remove an inodedep from the unlinked inodedep list.  This may require
 * disk writes if the inode has made it that far.
 */
static void
clear_unlinked_inodedep(inodedep)
	struct inodedep *inodedep;
{
	struct ufsmount *ump;
	struct inodedep *idp;	/* predecessor on the unlinked list */
	struct inodedep *idn;	/* successor on the unlinked list */
	struct fs *fs;
	struct buf *bp;
	ino_t ino;		/* inode number being cleared */
	ino_t nino;		/* successor's inode number */
	ino_t pino;		/* predecessor's inode number, 0 == sb */
	int error;

	/* Called with lk held; lk is dropped around buffer I/O below. */
	ump = VFSTOUFS(inodedep->id_list.wk_mp);
	fs = ump->um_fs;
	ino = inodedep->id_ino;
	error = 0;
	for (;;) {
		/*
		 * If nothing has yet been written simply remove us from
		 * the in memory list and return.  This is the most common
		 * case where handle_workitem_remove() loses the final
		 * reference.
		 */
		if ((inodedep->id_state & UNLINKLINKS) == 0)
			break;
		/*
		 * If we have a NEXT pointer and no PREV pointer we can simply
		 * clear NEXT's PREV and remove ourselves from the list.  Be
		 * careful not to clear PREV if the superblock points at
		 * next as well.
		 */
		idn = TAILQ_NEXT(inodedep, id_unlinked);
		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
			if (idn && fs->fs_sujfree != idn->id_ino)
				idn->id_state &= ~UNLINKPREV;
			break;
		}
		/*
		 * Here we have an inodedep which is actually linked into
		 * the list.  We must remove it by forcing a write to the
		 * link before us, whether it be the superblock or an inode.
		 * Unfortunately the list may change while we're waiting
		 * on the buf lock for either resource so we must loop until
		 * we lock the right one.  If both the superblock and an
		 * inode point to this inode we must clear the inode first
		 * followed by the superblock.
		 */
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		pino = 0;
		if (idp && (idp->id_state & UNLINKNEXT))
			pino = idp->id_ino;
		FREE_LOCK(&lk);
		if (pino == 0)
			/* Predecessor is the superblock itself. */
			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
			    (int)fs->fs_sbsize, 0, 0, 0);
		else
			error = bread(ump->um_devvp,
			    fsbtodb(fs, ino_to_fsba(fs, pino)),
			    (int)fs->fs_bsize, NOCRED, &bp);
		ACQUIRE_LOCK(&lk);
		if (error)
			break;
		/* If the list has changed restart the loop. */
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
		nino = 0;
		if (idp && (idp->id_state & UNLINKNEXT))
			nino = idp->id_ino;
		if (nino != pino ||
		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
			FREE_LOCK(&lk);
			brelse(bp);
			ACQUIRE_LOCK(&lk);
			continue;
		}
		/*
		 * Remove us from the in memory list.  After this we cannot
		 * access the inodedep.
		 */
		idn = TAILQ_NEXT(inodedep, id_unlinked);
		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
		/*
		 * Determine the next inode number.
		 */
		nino = 0;
		if (idn) {
			/*
			 * If next isn't on the list we can just clear prev's
			 * state and schedule it to be fixed later.  No need
			 * to synchronously write if we're not in the real
			 * list.
			 */
			if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
				idp->id_state &= ~UNLINKNEXT;
				if ((idp->id_state & ONWORKLIST) == 0)
					WORKLIST_INSERT(&bp->b_dep,
					    &idp->id_list);
				FREE_LOCK(&lk);
				bawrite(bp);
				ACQUIRE_LOCK(&lk);
				return;
			}
			nino = idn->id_ino;
		}
		FREE_LOCK(&lk);
		/*
		 * The predecessor's next pointer is manually updated here
		 * so that the NEXT flag is never cleared for an element
		 * that is in the list.
		 */
		if (pino == 0) {
			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
			    bp);
		} else if (fs->fs_magic == FS_UFS1_MAGIC)
			((struct ufs1_dinode *)bp->b_data +
			    ino_to_fsbo(fs, pino))->di_freelink = nino;
		else
			((struct ufs2_dinode *)bp->b_data +
			    ino_to_fsbo(fs, pino))->di_freelink = nino;
		/*
		 * If the bwrite fails we have no recourse to recover.  The
		 * filesystem is corrupted already.
		 */
		bwrite(bp);
		ACQUIRE_LOCK(&lk);
		/*
		 * If the superblock pointer still needs to be cleared force
		 * a write here.
		 */
		if (fs->fs_sujfree == ino) {
			FREE_LOCK(&lk);
			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
			    (int)fs->fs_sbsize, 0, 0, 0);
			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
			    bp);
			bwrite(bp);
			ACQUIRE_LOCK(&lk);
		}
		if (fs->fs_sujfree != ino)
			return;
		panic("clear_unlinked_inodedep: Failed to clear free head");
	}
	if (inodedep->id_ino == fs->fs_sujfree)
		panic("clear_unlinked_inodedep: Freeing head of free list");
	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
	return;
}

/*
 * This workitem decrements the inode's link count.
 * If the link count reaches zero, the file is removed.
 */
static void
handle_workitem_remove(dirrem, xp)
	struct dirrem *dirrem;
	struct vnode *xp;	/* vnode if caller already holds it, else NULL */
{
	struct thread *td = curthread;
	struct inodedep *inodedep;
	struct workhead dotdotwk;
	struct worklist *wk;
	struct ufsmount *ump;
	struct mount *mp;
	struct vnode *vp;
	struct inode *ip;
	ino_t oldinum;
	int error;

	if (dirrem->dm_state & ONWORKLIST)
		panic("handle_workitem_remove: dirrem %p still on worklist",
		    dirrem);
	oldinum = dirrem->dm_oldinum;
	mp = dirrem->dm_list.wk_mp;
	ump = VFSTOUFS(mp);
	/* Get the vnode ourselves unless the caller supplied it in xp. */
	if ((vp = xp) == NULL &&
	    (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
	    FFSV_FORCEINSMQ)) != 0) {
		softdep_error("handle_workitem_remove: vget", error);
		return;
	}
	ip = VTOI(vp);
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
		panic("handle_workitem_remove: lost inodedep");
	if (dirrem->dm_state & ONDEPLIST)
		LIST_REMOVE(dirrem, dm_inonext);
	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
	    ("handle_workitem_remove: Journal entries not written."));

	/*
	 * Move all dependencies waiting on the remove to complete
	 * from the dirrem to the inode inowait list to be completed
	 * after the inode has been updated and written to disk.  Any
	 * marked MKDIR_PARENT are saved to be completed when the .. ref
	 * is removed.
	 */
	LIST_INIT(&dotdotwk);
	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
		WORKLIST_REMOVE(wk);
		if (wk->wk_state & MKDIR_PARENT) {
			wk->wk_state &= ~MKDIR_PARENT;
			WORKLIST_INSERT(&dotdotwk, wk);
			continue;
		}
		WORKLIST_INSERT(&inodedep->id_inowait, wk);
	}
	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
	/*
	 * Normal file deletion.
	 */
	if ((dirrem->dm_state & RMDIR) == 0) {
		ip->i_nlink--;
		DIP_SET(ip, i_nlink, ip->i_nlink);
		ip->i_flag |= IN_CHANGE;
		if (ip->i_nlink < ip->i_effnlink)
			panic("handle_workitem_remove: bad file delta");
		if (ip->i_nlink == 0)
			unlinked_inodedep(mp, inodedep);
		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
		num_dirrem -= 1;
		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
		    ("handle_workitem_remove: worklist not empty. %s",
		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
		WORKITEM_FREE(dirrem, D_DIRREM);
		FREE_LOCK(&lk);
		goto out;
	}
	/*
	 * Directory deletion. Decrement reference count for both the
	 * just deleted parent directory entry and the reference for ".".
	 * Next truncate the directory to length zero. When the
	 * truncation completes, arrange to have the reference count on
	 * the parent decremented to account for the loss of "..".
	 */
	ip->i_nlink -= 2;
	DIP_SET(ip, i_nlink, ip->i_nlink);
	ip->i_flag |= IN_CHANGE;
	if (ip->i_nlink < ip->i_effnlink)
		panic("handle_workitem_remove: bad dir delta");
	if (ip->i_nlink == 0)
		unlinked_inodedep(mp, inodedep);
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
	if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
		softdep_error("handle_workitem_remove: truncate", error);
	ACQUIRE_LOCK(&lk);
	/*
	 * Rename a directory to a new parent.  Since, we are both deleting
	 * and creating a new directory entry, the link count on the new
	 * directory should not change.  Thus we skip the followup dirrem.
	 */
	if (dirrem->dm_state & DIRCHG) {
		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
		num_dirrem -= 1;
		WORKITEM_FREE(dirrem, D_DIRREM);
		FREE_LOCK(&lk);
		goto out;
	}
	/* Re-target the dirrem at the parent directory for the ".." drop. */
	dirrem->dm_state = ONDEPLIST;
	dirrem->dm_oldinum = dirrem->dm_dirinum;
	/*
	 * Place the dirrem on the parent's diremhd list.
	 */
	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
		panic("handle_workitem_remove: lost dir inodedep");
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
	/*
	 * If the allocated inode has never been written to disk, then
	 * the on-disk inode is zero'ed and we can remove the file
	 * immediately.  When journaling if the inode has been marked
	 * unlinked and not DEPCOMPLETE we know it can never be written.
	 */
	inodedep_lookup(mp, oldinum, 0, &inodedep);
	if (inodedep == NULL ||
	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
	    check_inode_unwritten(inodedep)) {
		if (xp != NULL)
			add_to_worklist(&dirrem->dm_list, 0);
		FREE_LOCK(&lk);
		if (xp == NULL) {
			/* Recurse to process the parent's followup dirrem. */
			vput(vp);
			handle_workitem_remove(dirrem, NULL);
		}
		return;
	}
	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
	FREE_LOCK(&lk);
	ip->i_flag |= IN_CHANGE;
out:
	ffs_update(vp, 0);
	/* Only drop the vnode reference if we acquired it ourselves. */
	if (xp == NULL)
		vput(vp);
}

/*
 * Inode de-allocation dependencies.
 *
 * When an inode's link count is reduced to zero, it can be de-allocated. We
 * found it convenient to postpone de-allocation until after the inode is
 * written to disk with its new link count (zero).  At this point, all of the
 * on-disk inode's block pointers are nullified and, with careful dependency
 * list ordering, all dependencies related to the inode will be satisfied and
 * the corresponding dependency structures de-allocated.
 * So, if/when the
 * inode is reused, there will be no mixing of old dependencies with new
 * ones.  This artificial dependency is set up by the block de-allocation
 * procedure above (softdep_setup_freeblocks) and completed by the
 * following procedure.
 */
static void
handle_workitem_freefile(freefile)
	struct freefile *freefile;
{
	struct workhead wkhd;
	struct fs *fs;
	struct inodedep *idp;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(freefile->fx_list.wk_mp);
	fs = ump->um_fs;
#ifdef DEBUG
	/* By now every dependency on this inode must have been retired. */
	ACQUIRE_LOCK(&lk);
	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
	FREE_LOCK(&lk);
	if (error)
		panic("handle_workitem_freefile: inodedep %p survived", idp);
#endif
	UFS_LOCK(ump);
	fs->fs_pendinginodes -= 1;
	UFS_UNLOCK(ump);
	LIST_INIT(&wkhd);
	/* Hand any remaining journal work to ffs_freefile(). */
	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
		softdep_error("handle_workitem_freefile", error);
	ACQUIRE_LOCK(&lk);
	WORKITEM_FREE(freefile, D_FREEFILE);
	FREE_LOCK(&lk);
}


/*
 * Helper function which unlinks marker element from work list and returns
 * the next element on the list.
 */
static __inline struct worklist *
markernext(struct worklist *marker)
{
	struct worklist *next;

	next = LIST_NEXT(marker, wk_list);
	LIST_REMOVE(marker, wk_list);
	return next;
}

/*
 * Disk writes.
 *
 * The dependency structures constructed above are most actively used when file
 * system blocks are written to disk.  No constraints are placed on when a
 * block can be written, but unsatisfied update dependencies are made safe by
 * modifying (or replacing) the source memory for the duration of the disk
 * write.
When the disk write completes, the memory block is again brought
 * up-to-date.
 *
 * In-core inode structure reclamation.
 *
 * Because there are a finite number of "in-core" inode structures, they are
 * reused regularly. By transferring all inode-related dependencies to the
 * in-memory inode block and indexing them separately (via "inodedep"s), we
 * can allow "in-core" inode structures to be reused at any time and avoid
 * any increase in contention.
 *
 * Called just before entering the device driver to initiate a new disk I/O.
 * The buffer must be locked, thus, no I/O completion operations can occur
 * while we are manipulating its associated dependencies.
 */
static void 
softdep_disk_io_initiation(bp)
	struct buf *bp;		/* structure describing disk write to occur */
{
	struct worklist *wk;
	struct worklist marker;
	struct inodedep *inodedep;
	struct freeblks *freeblks;
	struct jfreeblk *jfreeblk;
	struct newblk *newblk;

	/*
	 * We only care about write operations. There should never
	 * be dependencies for reads.
	 */
	if (bp->b_iocmd != BIO_WRITE)
		panic("softdep_disk_io_initiation: not write");

	if (bp->b_vflags & BV_BKGRDINPROG)
		panic("softdep_disk_io_initiation: Writing buffer with "
		    "background write in progress: %p", bp);

	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
	PHOLD(curproc);			/* Don't swap out kernel stack */

	ACQUIRE_LOCK(&lk);
	/*
	 * Do any necessary pre-I/O processing.
	 *
	 * The on-stack marker is linked after each item before its handler
	 * runs, so the walk survives handlers that drop and re-acquire lk
	 * (e.g. via jwait()) and may reshape the b_dep list meanwhile.
	 */
	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
	     wk = markernext(&marker)) {
		LIST_INSERT_AFTER(wk, &marker, wk_list);
		switch (wk->wk_type) {

		case D_PAGEDEP:
			initiate_write_filepage(WK_PAGEDEP(wk), bp);
			continue;

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
				initiate_write_inodeblock_ufs1(inodedep, bp);
			else
				initiate_write_inodeblock_ufs2(inodedep, bp);
			continue;

		case D_INDIRDEP:
			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
			continue;

		case D_BMSAFEMAP:
			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
			continue;

		case D_JSEG:
			WK_JSEG(wk)->js_buf = NULL;
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
			/*
			 * We have to wait for the jfreeblks to be journaled
			 * before we can write an inodeblock with updated
			 * pointers.  Be careful to arrange the marker so
			 * we revisit the jfreeblk if it's not removed by
			 * the first jwait().
			 */
			if (jfreeblk != NULL) {
				LIST_REMOVE(&marker, wk_list);
				LIST_INSERT_BEFORE(wk, &marker, wk_list);
				jwait(&jfreeblk->jf_list);
			}
			continue;
		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			/*
			 * We have to wait for the jnewblk to be journaled
			 * before we can write to a block otherwise the
			 * contents may be confused with an earlier file
			 * at recovery time.  Handle the marker as described
			 * above.
			 */
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk != NULL) {
				LIST_REMOVE(&marker, wk_list);
				LIST_INSERT_BEFORE(wk, &marker, wk_list);
				jwait(&newblk->nb_jnewblk->jn_list);
			}
			continue;

		case D_SBDEP:
			initiate_write_sbdep(WK_SBDEP(wk));
			continue;

		case D_MKDIR:
		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
			/* No pre-I/O processing needed for these types. */
			continue;

		default:
			panic("handle_disk_io_initiation: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	FREE_LOCK(&lk);
	PRELE(curproc);		/* Allow swapout of kernel stack */
}

/*
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in a directory. The buffer must be locked,
 * thus, no I/O completion operations can occur while we are
 * manipulating its associated dependencies.
 */
static void 
initiate_write_filepage(pagedep, bp)
	struct pagedep *pagedep;
	struct buf *bp;
{
	struct jremref *jremref;
	struct jmvref *jmvref;
	struct dirrem *dirrem;
	struct diradd *dap;
	struct direct *ep;
	int i;

	if (pagedep->pd_state & IOSTARTED) {
		/*
		 * This can only happen if there is a driver that does not
		 * understand chaining. Here biodone will reissue the call
		 * to strategy for the incomplete buffers.
		 */
		printf("initiate_write_filepage: already started\n");
		return;
	}
	pagedep->pd_state |= IOSTARTED;
	/*
	 * Wait for all journal remove dependencies to hit the disk.
	 * We can not allow any potentially conflicting directory adds
	 * to be visible before removes and rollback is too difficult.
	 * lk may be dropped and re-acquired, however we hold the buf
	 * locked so the dependency can not go away.
	 */
	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
			stat_jwait_filepage++;
			jwait(&jremref->jr_list);
		}
	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
		stat_jwait_filepage++;
		jwait(&jmvref->jm_list);
	}
	for (i = 0; i < DAHASHSZ; i++) {
		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			/* NOTE(review): %d formats here assume ino_t is int-sized. */
			if (ep->d_ino != dap->da_newinum)
				panic("%s: dir inum %d != new %d",
				    "initiate_write_filepage",
				    ep->d_ino, dap->da_newinum);
			/*
			 * Roll the entry back for the duration of the write:
			 * a rename-in-progress (DIRCHG) reverts to the inum
			 * being replaced, otherwise the slot is emptied.
			 */
			if (dap->da_state & DIRCHG)
				ep->d_ino = dap->da_previous->dm_oldinum;
			else
				ep->d_ino = 0;
			dap->da_state &= ~ATTACHED;
			dap->da_state |= UNDONE;
		}
	}
}

/*
 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
 * Note that any bug fixes made to this routine must be done in the
 * version found below.
 *
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock. The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 */
static void 
initiate_write_inodeblock_ufs1(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;			/* The inode block */
{
	struct allocdirect *adp, *lastadp;
	struct ufs1_dinode *dp;
	struct ufs1_dinode *sip;
	struct inoref *inoref;
	struct fs *fs;
	ufs_lbn_t i;
#ifdef INVARIANTS
	ufs_lbn_t prevlbn = 0;
#endif
	int deplist;

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock_ufs1: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	dp = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);

	/*
	 * If we're on the unlinked list but have not yet written our
	 * next pointer initialize it here.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
		struct inodedep *inon;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		dp->di_freelink = inon ? inon->id_ino : 0;
	}
	/*
	 * If the bitmap is not yet written, then the allocated
	 * inode cannot be written to disk.  Stash the real contents
	 * in id_savedino1 and write a zeroed copy in its place,
	 * preserving only di_gen and di_freelink.  lk is dropped
	 * around the (possibly sleeping) allocation.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		if (inodedep->id_savedino1 != NULL)
			panic("initiate_write_inodeblock_ufs1: I/O underway");
		FREE_LOCK(&lk);
		sip = malloc(sizeof(struct ufs1_dinode),
		    M_SAVEDINO, M_SOFTDEP_FLAGS);
		ACQUIRE_LOCK(&lk);
		inodedep->id_savedino1 = sip;
		*inodedep->id_savedino1 = *dp;
		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
		dp->di_gen = inodedep->id_savedino1->di_gen;
		dp->di_freelink = inodedep->id_savedino1->di_freelink;
		return;
	}
	/*
	 * If no dependencies, then there is nothing to roll back.
	 */
	inodedep->id_savedsize = dp->di_size;
	inodedep->id_savedextsize = 0;
	inodedep->id_savednlink = dp->di_nlink;
	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
	    TAILQ_EMPTY(&inodedep->id_inoreflst))
		return;
	/*
	 * Revert the link count to that of the first unwritten journal entry.
	 */
	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
	if (inoref)
		dp->di_nlink = inoref->if_nlink;
	/*
	 * Set the dependencies to busy.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
		if (deplist != 0 && prevlbn >= adp->ad_offset)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_offset;
		if (adp->ad_offset < NDADDR &&
		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %d != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset,
			    dp->di_db[adp->ad_offset],
			    (intmax_t)adp->ad_newblkno);
		if (adp->ad_offset >= NDADDR &&
		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
			panic("%s: indirect pointer #%jd mismatch %d != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset - NDADDR,
			    dp->di_ib[adp->ad_offset - NDADDR],
			    (intmax_t)adp->ad_newblkno);
		/* deplist is a per-lbn bitmask; direct lbns then indirects. */
		deplist |= 1 << adp->ad_offset;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* INVARIANTS */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		if (adp->ad_offset >= NDADDR)
			break;
		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep1");
#endif /* INVARIANTS */
			dp->di_db[i] = 0;
		}
		for (i = 0; i < NIADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_ib[i] != 0 &&
			    (deplist & ((1 << NDADDR) << i)) == 0)
				panic("softdep_write_inodeblock: lost dep2");
#endif /* INVARIANTS */
			dp->di_ib[i] = 0;
		}
		return;
	}
	/*
	 * If we have zero'ed out the last allocated block of the file,
	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_offset; i >= 0; i--)
			if (dp->di_db[i] != 0)
				break;
		dp->di_size = (i + 1) * fs->fs_bsize;
	}
	/*
	 * The only dependencies are for indirect blocks.
	 *
	 * The file size for indirect block additions is not guaranteed.
	 * Such a guarantee would be non-trivial to achieve. The conventional
	 * synchronous write implementation also does not make this guarantee.
	 * Fsck should catch and fix discrepancies. Arguably, the file size
	 * can be over-estimated without destroying integrity when the file
	 * moves into the indirect blocks (i.e., is large). If we want to
	 * postpone fsck, we are stuck with this argument.
	 */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_offset - NDADDR] = 0;
}

/*
 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
 * Note that any bug fixes made to this routine must be done in the
 * version found above.
 *
 * Called from within the procedure above to deal with unsatisfied
 * allocation dependencies in an inodeblock. The buffer must be
 * locked, thus, no I/O completion operations can occur while we
 * are manipulating its associated dependencies.
 */
static void 
initiate_write_inodeblock_ufs2(inodedep, bp)
	struct inodedep *inodedep;
	struct buf *bp;			/* The inode block */
{
	struct allocdirect *adp, *lastadp;
	struct ufs2_dinode *dp;
	struct ufs2_dinode *sip;
	struct inoref *inoref;
	struct fs *fs;
	ufs_lbn_t i;
#ifdef INVARIANTS
	ufs_lbn_t prevlbn = 0;
#endif
	int deplist;

	if (inodedep->id_state & IOSTARTED)
		panic("initiate_write_inodeblock_ufs2: already started");
	inodedep->id_state |= IOSTARTED;
	fs = inodedep->id_fs;
	dp = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, inodedep->id_ino);

	/*
	 * If we're on the unlinked list but have not yet written our
	 * next pointer initialize it here.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
		struct inodedep *inon;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		dp->di_freelink = inon ? inon->id_ino : 0;
	}
	/*
	 * If the next pointer has already been written, verify that the
	 * on-disk freelink still matches the in-memory unlinked list.
	 */
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
	    (UNLINKED | UNLINKNEXT)) {
		struct inodedep *inon;
		ino_t freelink;

		inon = TAILQ_NEXT(inodedep, id_unlinked);
		freelink = inon ? inon->id_ino : 0;
		if (freelink != dp->di_freelink)
			panic("ino %p(0x%X) %d, %d != %d",
			    inodedep, inodedep->id_state, inodedep->id_ino,
			    freelink, dp->di_freelink);
	}
	/*
	 * If the bitmap is not yet written, then the allocated
	 * inode cannot be written to disk.  As in the UFS1 version, save
	 * the real inode in id_savedino2 and substitute a zeroed copy
	 * keeping only di_gen and di_freelink.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		if (inodedep->id_savedino2 != NULL)
			panic("initiate_write_inodeblock_ufs2: I/O underway");
		FREE_LOCK(&lk);
		sip = malloc(sizeof(struct ufs2_dinode),
		    M_SAVEDINO, M_SOFTDEP_FLAGS);
		ACQUIRE_LOCK(&lk);
		inodedep->id_savedino2 = sip;
		*inodedep->id_savedino2 = *dp;
		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
		dp->di_gen = inodedep->id_savedino2->di_gen;
		dp->di_freelink = inodedep->id_savedino2->di_freelink;
		return;
	}
	/*
	 * If no dependencies, then there is nothing to roll back.
	 */
	inodedep->id_savedsize = dp->di_size;
	inodedep->id_savedextsize = dp->di_extsize;
	inodedep->id_savednlink = dp->di_nlink;
	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
	    TAILQ_EMPTY(&inodedep->id_inoreflst))
		return;
	/*
	 * Revert the link count to that of the first unwritten journal entry.
	 */
	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
	if (inoref)
		dp->di_nlink = inoref->if_nlink;

	/*
	 * Set the ext data dependencies to busy.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
		if (deplist != 0 && prevlbn >= adp->ad_offset)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_offset;
		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset,
			    (intmax_t)dp->di_extb[adp->ad_offset],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_offset;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* INVARIANTS */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the ext
	 * data which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep1");
#endif /* INVARIANTS */
			dp->di_extb[i] = 0;
		}
		/*
		 * A frag rollback already fixed di_extsize; clearing lastadp
		 * skips the full-block size rollback below.
		 */
		lastadp = NULL;
		break;
	}
	/*
	 * If we have zero'ed out the last allocated block of the ext
	 * data, roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_offset; i >= 0; i--)
			if (dp->di_extb[i] != 0)
				break;
		dp->di_extsize = (i + 1) * fs->fs_bsize;
	}
	/*
	 * Set the file data dependencies to busy.
	 */
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef INVARIANTS
		if (deplist != 0 && prevlbn >= adp->ad_offset)
			panic("softdep_write_inodeblock: lbn order");
		prevlbn = adp->ad_offset;
		if (adp->ad_offset < NDADDR &&
		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
			panic("%s: direct pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock",
			    (intmax_t)adp->ad_offset,
			    (intmax_t)dp->di_db[adp->ad_offset],
			    (intmax_t)adp->ad_newblkno);
		if (adp->ad_offset >= NDADDR &&
		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
			panic("%s indirect pointer #%jd mismatch %jd != %jd",
			    "softdep_write_inodeblock:",
			    (intmax_t)adp->ad_offset - NDADDR,
			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
			    (intmax_t)adp->ad_newblkno);
		deplist |= 1 << adp->ad_offset;
		if ((adp->ad_state & ATTACHED) == 0)
			panic("softdep_write_inodeblock: Unknown state 0x%x",
			    adp->ad_state);
#endif /* INVARIANTS */
		adp->ad_state &= ~ATTACHED;
		adp->ad_state |= UNDONE;
	}
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem.
	 */
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
		if (adp->ad_offset >= NDADDR)
			break;
		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
		/* keep going until hitting a rollback to a frag */
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
			continue;
		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
				panic("softdep_write_inodeblock: lost dep2");
#endif /* INVARIANTS */
			dp->di_db[i] = 0;
		}
		for (i = 0; i < NIADDR; i++) {
#ifdef INVARIANTS
			if (dp->di_ib[i] != 0 &&
			    (deplist & ((1 << NDADDR) << i)) == 0)
				panic("softdep_write_inodeblock: lost dep3");
#endif /* INVARIANTS */
			dp->di_ib[i] = 0;
		}
		return;
	}
	/*
	 * If we have zero'ed out the last allocated block of the file,
	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized as
	 * we already checked for fragments in the loop above.
	 */
	if (lastadp != NULL &&
	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
		for (i = lastadp->ad_offset; i >= 0; i--)
			if (dp->di_db[i] != 0)
				break;
		dp->di_size = (i + 1) * fs->fs_bsize;
	}
	/*
	 * The only dependencies are for indirect blocks.
	 *
	 * The file size for indirect block additions is not guaranteed.
	 * Such a guarantee would be non-trivial to achieve. The conventional
	 * synchronous write implementation also does not make this guarantee.
	 * Fsck should catch and fix discrepancies. Arguably, the file size
	 * can be over-estimated without destroying integrity when the file
	 * moves into the indirect blocks (i.e., is large). If we want to
	 * postpone fsck, we are stuck with this argument.
	 */
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
		dp->di_ib[adp->ad_offset - NDADDR] = 0;
}

/*
 * Cancel an indirdep as a result of truncation.  Release all of the
 * children allocindirs and place their journal work on the appropriate
 * list.
 */
static void
cancel_indirdep(indirdep, bp, inodedep, freeblks)
	struct indirdep *indirdep;
	struct buf *bp;
	struct inodedep *inodedep;
	struct freeblks *freeblks;
{
	struct allocindir *aip;

	/*
	 * None of the indirect pointers will ever be visible,
	 * so they can simply be tossed. GOINGAWAY ensures
	 * that allocated pointers will be saved in the buffer
	 * cache until they are freed. Note that they will
	 * only be able to be found by their physical address
	 * since the inode mapping the logical address will
	 * be gone. The save buffer used for the safe copy
	 * was allocated in setup_allocindir_phase2 using
	 * the physical address so it could be used for this
	 * purpose. Hence we swap the safe copy with the real
	 * copy, allowing the safe copy to be freed and holding
	 * on to the real copy for later use in indir_trunc.
	 */
	if (indirdep->ir_state & GOINGAWAY)
		panic("cancel_indirdep: already gone");
	if (indirdep->ir_state & ONDEPLIST) {
		indirdep->ir_state &= ~ONDEPLIST;
		LIST_REMOVE(indirdep, ir_next);
	}
	indirdep->ir_state |= GOINGAWAY;
	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
	/* Cancel every allocindir, whichever state list it is on. */
	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
		cancel_allocindir(aip, inodedep, freeblks);
	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
		cancel_allocindir(aip, inodedep, freeblks);
	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
		cancel_allocindir(aip, inodedep, freeblks);
	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
		cancel_allocindir(aip, inodedep, freeblks);
	bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
	WORKLIST_REMOVE(&indirdep->ir_list);
	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
	indirdep->ir_savebp = NULL;
}

/*
 * Free an indirdep once it no longer has new pointers to track.
 */
static void
free_indirdep(indirdep)
	struct indirdep *indirdep;
{

	/* All dependency and journal lists must already be drained. */
	KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
	    ("free_indirdep: Journal work not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
	    ("free_indirdep: Complete head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
	    ("free_indirdep: write head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
	    ("free_indirdep: done head not empty."));
	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
	    ("free_indirdep: deplist head not empty."));
	KASSERT(indirdep->ir_savebp == NULL,
	    ("free_indirdep: %p ir_savebp != NULL", indirdep));
	KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
	    ("free_indirdep: %p still on deplist.", indirdep));
	if (indirdep->ir_state & ONWORKLIST)
		WORKLIST_REMOVE(&indirdep->ir_list);
	WORKITEM_FREE(indirdep, D_INDIRDEP);
}

/*
 * Called before a write to an indirdep. This routine is responsible for
 * rolling back pointers to a safe state which includes only those
 * allocindirs which have been completed.
 */
static void
initiate_write_indirdep(indirdep, bp)
	struct indirdep *indirdep;
	struct buf *bp;
{

	if (indirdep->ir_state & GOINGAWAY)
		panic("disk_io_initiation: indirdep gone");

	/*
	 * If there are no remaining dependencies, this will be writing
	 * the real pointers.
	 */
	if (LIST_EMPTY(&indirdep->ir_deplisthd))
		return;
	/*
	 * Replace up-to-date version with safe version.  lk is dropped
	 * around the allocation; the buffer is locked by our caller so
	 * the indirdep cannot be freed out from under us meanwhile.
	 */
	FREE_LOCK(&lk);
	indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
	    M_SOFTDEP_FLAGS);
	ACQUIRE_LOCK(&lk);
	indirdep->ir_state &= ~ATTACHED;
	indirdep->ir_state |= UNDONE;
	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
	    bp->b_bcount);
}

/*
 * Called when an inode has been cleared in a cg bitmap. This finally
 * eliminates any canceled jaddrefs
 */
void
softdep_setup_inofree(mp, bp, ino, wkhd)
	struct mount *mp;
	struct buf *bp;
	ino_t ino;
	struct workhead *wkhd;
{
	struct worklist *wk, *wkn;
	struct inodedep *inodedep;
	uint8_t *inosused;
	struct cg *cgp;
	struct fs *fs;

	ACQUIRE_LOCK(&lk);
	fs = VFSTOUFS(mp)->um_fs;
	cgp = (struct cg *)bp->b_data;
	inosused = cg_inosused(cgp);
	/* The caller must already have cleared the inode in the cg bitmap. */
	if (isset(inosused, ino % fs->fs_ipg))
		panic("softdep_setup_inofree: inode %d not freed.", ino);
	if (inodedep_lookup(mp, ino, 0, &inodedep))
		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
		    ino, inodedep);
	if (wkhd) {
		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
			if (wk->wk_type != D_JADDREF)
				continue;
			WORKLIST_REMOVE(wk);
			/*
			 * We can free immediately even if the jaddref
			 * isn't attached in a background write as now
			 * the bitmaps are reconciled.
			 */
			wk->wk_state |= COMPLETE | ATTACHED;
			free_jaddref(WK_JADDREF(wk));
		}
		/* Remaining work waits for this bitmap buffer to be written. */
		jwork_move(&bp->b_dep, wkhd);
	}
	FREE_LOCK(&lk);
}


/*
 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
 * map.  Any dependencies waiting for the write to clear are added to the
 * buf's list and any jnewblks that are being canceled are discarded
 * immediately.
 */
void
softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
	struct mount *mp;
	struct buf *bp;
	ufs2_daddr_t blkno;
	int frags;
	struct workhead *wkhd;
{
	struct jnewblk *jnewblk;
	struct worklist *wk, *wkn;
#ifdef SUJ_DEBUG
	struct bmsafemap *bmsafemap;
	struct fs *fs;
	uint8_t *blksfree;
	struct cg *cgp;
	ufs2_daddr_t jstart;
	ufs2_daddr_t jend;
	ufs2_daddr_t end;
	long bno;
	int i;
#endif

	ACQUIRE_LOCK(&lk);
	/*
	 * Detach any jnewblks which have been canceled. They must linger
	 * until the bitmap is cleared again by ffs_blkfree() to prevent
	 * an unjournaled allocation from hitting the disk.
	 */
	if (wkhd) {
		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
			if (wk->wk_type != D_JNEWBLK)
				continue;
			jnewblk = WK_JNEWBLK(wk);
			KASSERT(jnewblk->jn_state & GOINGAWAY,
			    ("softdep_setup_blkfree: jnewblk not canceled."));
			WORKLIST_REMOVE(wk);
#ifdef SUJ_DEBUG
			/*
			 * Assert that this block is free in the bitmap
			 * before we discard the jnewblk.
			 */
			fs = VFSTOUFS(mp)->um_fs;
			cgp = (struct cg *)bp->b_data;
			blksfree = cg_blksfree(cgp);
			bno = dtogd(fs, jnewblk->jn_blkno);
			for (i = jnewblk->jn_oldfrags;
			    i < jnewblk->jn_frags; i++) {
				if (isset(blksfree, bno + i))
					continue;
				panic("softdep_setup_blkfree: not free");
			}
#endif
			/*
			 * Even if it's not attached we can free immediately
			 * as the new bitmap is correct.
			 */
			wk->wk_state |= COMPLETE | ATTACHED;
			free_jnewblk(jnewblk);
		}
		/*
		 * The buf must be locked by the caller otherwise these could
		 * be added while it's being written and the write would
		 * complete them before they made it to disk.
		 */
		jwork_move(&bp->b_dep, wkhd);
	}

#ifdef SUJ_DEBUG
	/*
	 * Assert that we are not freeing a block which has an outstanding
	 * allocation dependency.
	 */
	fs = VFSTOUFS(mp)->um_fs;
	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
	end = blkno + frags;
	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
		/*
		 * Don't match against blocks that will be freed when the
		 * background write is done.
		 */
		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
		    (COMPLETE | DEPCOMPLETE))
			continue;
		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
		if ((blkno >= jstart && blkno < jend) ||
		    (end > jstart && end <= jend)) {
			printf("state 0x%X %jd - %d %d dep %p\n",
			    jnewblk->jn_state, jnewblk->jn_blkno,
			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
			    jnewblk->jn_newblk);
			panic("softdep_setup_blkfree: "
			    "%jd-%jd(%d) overlaps with %jd-%jd",
			    blkno, end, frags, jstart, jend);
		}
	}
#endif
	FREE_LOCK(&lk);
}

/*
 * Called just before the cg bitmap buffer is written.  Roll back any
 * allocations whose journal records have not yet hit the disk so the
 * on-disk bitmap never shows an unjournaled allocation.
 */
static void
initiate_write_bmsafemap(bmsafemap, bp)
	struct bmsafemap *bmsafemap;
	struct buf *bp;			/* The cg block. */
{
	struct jaddref *jaddref;
	struct jnewblk *jnewblk;
	uint8_t *inosused;
	uint8_t *blksfree;
	struct cg *cgp;
	struct fs *fs;
	int cleared;
	ino_t ino;
	long bno;
	int i;

	if (bmsafemap->sm_state & IOSTARTED)
		panic("initiate_write_bmsafemap: Already started\n");
	bmsafemap->sm_state |= IOSTARTED;
	/*
	 * Clear any inode allocations which are pending journal writes.
	 */
	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
		cgp = (struct cg *)bp->b_data;
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
		inosused = cg_inosused(cgp);
		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
			ino = jaddref->ja_ino % fs->fs_ipg;
			/*
			 * If this is a background copy the inode may not
			 * be marked used yet.
			 */
			if (isset(inosused, ino)) {
				/* Undo the allocation in the cg counters. */
				if ((jaddref->ja_mode & IFMT) == IFDIR)
					cgp->cg_cs.cs_ndir--;
				cgp->cg_cs.cs_nifree++;
				clrbit(inosused, ino);
				jaddref->ja_state &= ~ATTACHED;
				jaddref->ja_state |= UNDONE;
				stat_jaddref++;
			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
				panic("initiate_write_bmsafemap: inode %d "
				    "marked free", jaddref->ja_ino);
		}
	}
	/*
	 * Clear any block allocations which are pending journal writes.
	 */
	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
		cgp = (struct cg *)bp->b_data;
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
		blksfree = cg_blksfree(cgp);
		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
			bno = dtogd(fs, jnewblk->jn_blkno);
			cleared = 0;
			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
			    i++) {
				if (isclr(blksfree, bno + i)) {
					cleared = 1;
					setbit(blksfree, bno + i);
				}
			}
			/*
			 * We may not clear the block if it's a background
			 * copy. In that case there is no reason to detach
			 * it.
			 */
			if (cleared) {
				stat_jnewblk++;
				jnewblk->jn_state &= ~ATTACHED;
				jnewblk->jn_state |= UNDONE;
			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
				panic("initiate_write_bmsafemap: block %jd "
				    "marked free", jnewblk->jn_blkno);
		}
	}
	/*
	 * Move allocation lists to the written lists so they can be
	 * cleared once the block write is complete.
	 */
	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
	    inodedep, id_deps);
	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
	    newblk, nb_deps);
}

/*
 * This routine is called during the completion interrupt
 * service routine for a disk write (from the procedure called
 * by the device driver to inform the filesystem caches of
 * a request completion). It should be called early in this
 * procedure, before the block is made available to other
 * processes or other routines are called.
 *
 */
static void 
softdep_disk_write_complete(bp)
	struct buf *bp;		/* describes the completed disk write */
{
	struct worklist *wk;
	struct worklist *owk;
	struct workhead reattach;
	struct buf *sbp;

	/*
	 * If an error occurred while doing the write, then the data
	 * has not hit the disk and the dependencies cannot be unrolled.
	 */
	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
		return;
	LIST_INIT(&reattach);
	/*
	 * This lock must not be released anywhere in this code segment.
	 */
	sbp = NULL;
	owk = NULL;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		WORKLIST_REMOVE(wk);
		/*
		 * owk catches a handler that re-inserted the same item at
		 * the head, which would otherwise spin this loop forever.
		 */
		if (wk == owk)
			panic("duplicate worklist: %p\n", wk);
		owk = wk;
		switch (wk->wk_type) {

		case D_PAGEDEP:
			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_INODEDEP:
			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_BMSAFEMAP:
			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_MKDIR:
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
			continue;

		case D_ALLOCDIRECT:
			wk->wk_state |= COMPLETE;
			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
			continue;

		case D_ALLOCINDIR:
			wk->wk_state |= COMPLETE;
			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
			continue;

		case D_INDIRDEP:
			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		case D_FREEBLKS:
			wk->wk_state |= COMPLETE;
			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
				add_to_worklist(wk, 1);
			continue;

		case D_FREEWORK:
			handle_written_freework(WK_FREEWORK(wk));
			break;	/* nothing follows the switch; same as continue */

		case D_FREEDEP:
			free_freedep(WK_FREEDEP(wk));
			continue;

		case D_JSEGDEP:
			free_jsegdep(WK_JSEGDEP(wk));
			continue;

		case D_JSEG:
			handle_written_jseg(WK_JSEG(wk), bp);
			continue;

		case D_SBDEP:
			if (handle_written_sbdep(WK_SBDEP(wk), bp))
				WORKLIST_INSERT(&reattach, wk);
			continue;

		default:
			panic("handle_disk_write_complete: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
	/*
	 * Reattach any requests that must be redone.
	 */
	while ((wk = LIST_FIRST(&reattach)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&bp->b_dep, wk);
	}
	FREE_LOCK(&lk);
	if (sbp)
		brelse(sbp);
}

/*
 * Called from within softdep_disk_write_complete above. Note that
 * this routine is always called from interrupt level with further
 * splbio interrupts blocked.
 */
static void 
handle_allocdirect_partdone(adp, wkhd)
	struct allocdirect *adp;	/* the completed allocdirect */
	struct workhead *wkhd;		/* Work to do when inode is written. */
{
	struct allocdirectlst *listhead;
	struct allocdirect *listadp;
	struct inodedep *inodedep;
	long bsize;

	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might have fragments that were not the last block in the file
	 * which would corrupt the filesystem. Thus, we cannot free any
	 * allocdirects after one whose ad_oldblkno claims a fragment as
	 * these blocks must be rolled back to zero before writing the inode.
	 * We check the currently active set of allocdirects in id_inoupdt
	 * or id_extupdt as appropriate.
	 */
	inodedep = adp->ad_inodedep;
	bsize = inodedep->id_fs->fs_bsize;
	if (adp->ad_state & EXTDATA)
		listhead = &inodedep->id_extupdt;
	else
		listhead = &inodedep->id_inoupdt;
	TAILQ_FOREACH(listadp, listhead, ad_next) {
		/* found our block */
		if (listadp == adp)
			break;
		/* continue if ad_oldlbn is not a fragment */
		if (listadp->ad_oldsize == 0 ||
		    listadp->ad_oldsize == bsize)
			continue;
		/* hit a fragment */
		return;
	}
	/*
	 * If we have reached the end of the current list without
	 * finding the just finished dependency, then it must be
	 * on the future dependency list. Future dependencies cannot
	 * be freed until they are moved to the current list.
	 */
	if (listadp == NULL) {
#ifdef DEBUG
		/* DEBUG kernels verify it really is on the "new" list. */
		if (adp->ad_state & EXTDATA)
			listhead = &inodedep->id_newextupdt;
		else
			listhead = &inodedep->id_newinoupdt;
		TAILQ_FOREACH(listadp, listhead, ad_next)
			/* found our block */
			if (listadp == adp)
				break;
		if (listadp == NULL)
			panic("handle_allocdirect_partdone: lost dep");
#endif /* DEBUG */
		return;
	}
	/*
	 * If we have found the just finished dependency, then queue
	 * it along with anything that follows it that is complete.
	 * Since the pointer has not yet been written in the inode
	 * as the dependency prevents it, place the allocdirect on the
	 * bufwait list where it will be freed once the pointer is
	 * valid.
	 */
	if (wkhd == NULL)
		wkhd = &inodedep->id_bufwait;
	for (; adp; adp = listadp) {
		listadp = TAILQ_NEXT(adp, ad_next);
		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
			return;
		TAILQ_REMOVE(listhead, adp, ad_next);
		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
	}
}

/*
 * Called from within softdep_disk_write_complete above. This routine
 * completes successfully written allocindirs.
 */
static void
handle_allocindir_partdone(aip)
	struct allocindir *aip;		/* the completed allocindir */
{
	struct indirdep *indirdep;

	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	indirdep = aip->ai_indirdep;
	LIST_REMOVE(aip, ai_next);
	/*
	 * If the indirect block is rolled back (UNDONE), defer the
	 * pointer update by parking the allocindir on ir_donehd.
	 */
	if (indirdep->ir_state & UNDONE) {
		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
		return;
	}
	/* Commit the new block pointer into the saved indirect block. */
	if (indirdep->ir_state & UFS1FMT)
		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
		    aip->ai_newblkno;
	else
		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
		    aip->ai_newblkno;
	/*
	 * Await the pointer write before freeing the allocindir.
	 */
	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
}

/*
 * Release segments held on a jwork list.  Only D_JSEGDEP items are
 * expected here; anything else is a panic.
 */
static void
handle_jwork(wkhd)
	struct workhead *wkhd;
{
	struct worklist *wk;

	while ((wk = LIST_FIRST(wkhd)) != NULL) {
		WORKLIST_REMOVE(wk);
		switch (wk->wk_type) {
		case D_JSEGDEP:
			free_jsegdep(WK_JSEGDEP(wk));
			continue;
		default:
			panic("handle_jwork: Unknown type %s\n",
			    TYPENAME(wk->wk_type));
		}
	}
}

/*
 * Handle the bufwait list on an inode when it is safe to release items
 * held there.  This normally happens after an inode block is written but
 * may be delayed and handled later if there are pending journal items that
 * are not yet safe to be released.
8989 */ 8990static struct freefile * 8991handle_bufwait(inodedep, refhd) 8992 struct inodedep *inodedep; 8993 struct workhead *refhd; 8994{ 8995 struct jaddref *jaddref; 8996 struct freefile *freefile; 8997 struct worklist *wk; 8998 8999 freefile = NULL; 9000 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 9001 WORKLIST_REMOVE(wk); 9002 switch (wk->wk_type) { 9003 case D_FREEFILE: 9004 /* 9005 * We defer adding freefile to the worklist 9006 * until all other additions have been made to 9007 * ensure that it will be done after all the 9008 * old blocks have been freed. 9009 */ 9010 if (freefile != NULL) 9011 panic("handle_bufwait: freefile"); 9012 freefile = WK_FREEFILE(wk); 9013 continue; 9014 9015 case D_MKDIR: 9016 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 9017 continue; 9018 9019 case D_DIRADD: 9020 diradd_inode_written(WK_DIRADD(wk), inodedep); 9021 continue; 9022 9023 case D_FREEFRAG: 9024 wk->wk_state |= COMPLETE; 9025 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 9026 add_to_worklist(wk, 0); 9027 continue; 9028 9029 case D_DIRREM: 9030 wk->wk_state |= COMPLETE; 9031 add_to_worklist(wk, 0); 9032 continue; 9033 9034 case D_ALLOCDIRECT: 9035 case D_ALLOCINDIR: 9036 free_newblk(WK_NEWBLK(wk)); 9037 continue; 9038 9039 case D_JNEWBLK: 9040 wk->wk_state |= COMPLETE; 9041 free_jnewblk(WK_JNEWBLK(wk)); 9042 continue; 9043 9044 /* 9045 * Save freed journal segments and add references on 9046 * the supplied list which will delay their release 9047 * until the cg bitmap is cleared on disk. 9048 */ 9049 case D_JSEGDEP: 9050 if (refhd == NULL) 9051 free_jsegdep(WK_JSEGDEP(wk)); 9052 else 9053 WORKLIST_INSERT(refhd, wk); 9054 continue; 9055 9056 case D_JADDREF: 9057 jaddref = WK_JADDREF(wk); 9058 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 9059 if_deps); 9060 /* 9061 * Transfer any jaddrefs to the list to be freed with 9062 * the bitmap if we're handling a removed file. 
9063 */ 9064 if (refhd == NULL) { 9065 wk->wk_state |= COMPLETE; 9066 free_jaddref(jaddref); 9067 } else 9068 WORKLIST_INSERT(refhd, wk); 9069 continue; 9070 9071 default: 9072 panic("handle_bufwait: Unknown type %p(%s)", 9073 wk, TYPENAME(wk->wk_type)); 9074 /* NOTREACHED */ 9075 } 9076 } 9077 return (freefile); 9078} 9079/* 9080 * Called from within softdep_disk_write_complete above to restore 9081 * in-memory inode block contents to their most up-to-date state. Note 9082 * that this routine is always called from interrupt level with further 9083 * splbio interrupts blocked. 9084 */ 9085static int 9086handle_written_inodeblock(inodedep, bp) 9087 struct inodedep *inodedep; 9088 struct buf *bp; /* buffer containing the inode block */ 9089{ 9090 struct freefile *freefile; 9091 struct allocdirect *adp, *nextadp; 9092 struct ufs1_dinode *dp1 = NULL; 9093 struct ufs2_dinode *dp2 = NULL; 9094 struct workhead wkhd; 9095 int hadchanges, fstype; 9096 ino_t freelink; 9097 9098 LIST_INIT(&wkhd); 9099 hadchanges = 0; 9100 freefile = NULL; 9101 if ((inodedep->id_state & IOSTARTED) == 0) 9102 panic("handle_written_inodeblock: not started"); 9103 inodedep->id_state &= ~IOSTARTED; 9104 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 9105 fstype = UFS1; 9106 dp1 = (struct ufs1_dinode *)bp->b_data + 9107 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 9108 freelink = dp1->di_freelink; 9109 } else { 9110 fstype = UFS2; 9111 dp2 = (struct ufs2_dinode *)bp->b_data + 9112 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 9113 freelink = dp2->di_freelink; 9114 } 9115 /* 9116 * If we wrote a valid freelink pointer during the last write 9117 * record it here. 
9118 */ 9119 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9120 struct inodedep *inon; 9121 9122 inon = TAILQ_NEXT(inodedep, id_unlinked); 9123 if ((inon == NULL && freelink == 0) || 9124 (inon && inon->id_ino == freelink)) { 9125 if (inon) 9126 inon->id_state |= UNLINKPREV; 9127 inodedep->id_state |= UNLINKNEXT; 9128 } else 9129 hadchanges = 1; 9130 } 9131 /* Leave this inodeblock dirty until it's in the list. */ 9132 if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED) 9133 hadchanges = 1; 9134 /* 9135 * If we had to rollback the inode allocation because of 9136 * bitmaps being incomplete, then simply restore it. 9137 * Keep the block dirty so that it will not be reclaimed until 9138 * all associated dependencies have been cleared and the 9139 * corresponding updates written to disk. 9140 */ 9141 if (inodedep->id_savedino1 != NULL) { 9142 hadchanges = 1; 9143 if (fstype == UFS1) 9144 *dp1 = *inodedep->id_savedino1; 9145 else 9146 *dp2 = *inodedep->id_savedino2; 9147 free(inodedep->id_savedino1, M_SAVEDINO); 9148 inodedep->id_savedino1 = NULL; 9149 if ((bp->b_flags & B_DELWRI) == 0) 9150 stat_inode_bitmap++; 9151 bdirty(bp); 9152 /* 9153 * If the inode is clear here and GOINGAWAY it will never 9154 * be written. Process the bufwait and clear any pending 9155 * work which may include the freefile. 9156 */ 9157 if (inodedep->id_state & GOINGAWAY) 9158 goto bufwait; 9159 return (1); 9160 } 9161 inodedep->id_state |= COMPLETE; 9162 /* 9163 * Roll forward anything that had to be rolled back before 9164 * the inode could be updated. 
9165 */ 9166 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 9167 nextadp = TAILQ_NEXT(adp, ad_next); 9168 if (adp->ad_state & ATTACHED) 9169 panic("handle_written_inodeblock: new entry"); 9170 if (fstype == UFS1) { 9171 if (adp->ad_offset < NDADDR) { 9172 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 9173 panic("%s %s #%jd mismatch %d != %jd", 9174 "handle_written_inodeblock:", 9175 "direct pointer", 9176 (intmax_t)adp->ad_offset, 9177 dp1->di_db[adp->ad_offset], 9178 (intmax_t)adp->ad_oldblkno); 9179 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 9180 } else { 9181 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) 9182 panic("%s: %s #%jd allocated as %d", 9183 "handle_written_inodeblock", 9184 "indirect pointer", 9185 (intmax_t)adp->ad_offset - NDADDR, 9186 dp1->di_ib[adp->ad_offset - NDADDR]); 9187 dp1->di_ib[adp->ad_offset - NDADDR] = 9188 adp->ad_newblkno; 9189 } 9190 } else { 9191 if (adp->ad_offset < NDADDR) { 9192 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 9193 panic("%s: %s #%jd %s %jd != %jd", 9194 "handle_written_inodeblock", 9195 "direct pointer", 9196 (intmax_t)adp->ad_offset, "mismatch", 9197 (intmax_t)dp2->di_db[adp->ad_offset], 9198 (intmax_t)adp->ad_oldblkno); 9199 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 9200 } else { 9201 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) 9202 panic("%s: %s #%jd allocated as %jd", 9203 "handle_written_inodeblock", 9204 "indirect pointer", 9205 (intmax_t)adp->ad_offset - NDADDR, 9206 (intmax_t) 9207 dp2->di_ib[adp->ad_offset - NDADDR]); 9208 dp2->di_ib[adp->ad_offset - NDADDR] = 9209 adp->ad_newblkno; 9210 } 9211 } 9212 adp->ad_state &= ~UNDONE; 9213 adp->ad_state |= ATTACHED; 9214 hadchanges = 1; 9215 } 9216 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 9217 nextadp = TAILQ_NEXT(adp, ad_next); 9218 if (adp->ad_state & ATTACHED) 9219 panic("handle_written_inodeblock: new entry"); 9220 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 9221 panic("%s: direct 
pointers #%jd %s %jd != %jd", 9222 "handle_written_inodeblock", 9223 (intmax_t)adp->ad_offset, "mismatch", 9224 (intmax_t)dp2->di_extb[adp->ad_offset], 9225 (intmax_t)adp->ad_oldblkno); 9226 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 9227 adp->ad_state &= ~UNDONE; 9228 adp->ad_state |= ATTACHED; 9229 hadchanges = 1; 9230 } 9231 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 9232 stat_direct_blk_ptrs++; 9233 /* 9234 * Reset the file size to its most up-to-date value. 9235 */ 9236 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) 9237 panic("handle_written_inodeblock: bad size"); 9238 if (inodedep->id_savednlink > LINK_MAX) 9239 panic("handle_written_inodeblock: Invalid link count " 9240 "%d for inodedep %p", inodedep->id_savednlink, inodedep); 9241 if (fstype == UFS1) { 9242 if (dp1->di_nlink != inodedep->id_savednlink) { 9243 dp1->di_nlink = inodedep->id_savednlink; 9244 hadchanges = 1; 9245 } 9246 if (dp1->di_size != inodedep->id_savedsize) { 9247 dp1->di_size = inodedep->id_savedsize; 9248 hadchanges = 1; 9249 } 9250 } else { 9251 if (dp2->di_nlink != inodedep->id_savednlink) { 9252 dp2->di_nlink = inodedep->id_savednlink; 9253 hadchanges = 1; 9254 } 9255 if (dp2->di_size != inodedep->id_savedsize) { 9256 dp2->di_size = inodedep->id_savedsize; 9257 hadchanges = 1; 9258 } 9259 if (dp2->di_extsize != inodedep->id_savedextsize) { 9260 dp2->di_extsize = inodedep->id_savedextsize; 9261 hadchanges = 1; 9262 } 9263 } 9264 inodedep->id_savedsize = -1; 9265 inodedep->id_savedextsize = -1; 9266 inodedep->id_savednlink = -1; 9267 /* 9268 * If there were any rollbacks in the inode block, then it must be 9269 * marked dirty so that its will eventually get written back in 9270 * its correct form. 9271 */ 9272 if (hadchanges) 9273 bdirty(bp); 9274bufwait: 9275 /* 9276 * Process any allocdirects that completed during the update. 
9277 */ 9278 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 9279 handle_allocdirect_partdone(adp, &wkhd); 9280 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 9281 handle_allocdirect_partdone(adp, &wkhd); 9282 /* 9283 * Process deallocations that were held pending until the 9284 * inode had been written to disk. Freeing of the inode 9285 * is delayed until after all blocks have been freed to 9286 * avoid creation of new <vfsid, inum, lbn> triples 9287 * before the old ones have been deleted. Completely 9288 * unlinked inodes are not processed until the unlinked 9289 * inode list is written or the last reference is removed. 9290 */ 9291 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { 9292 freefile = handle_bufwait(inodedep, NULL); 9293 if (freefile && !LIST_EMPTY(&wkhd)) { 9294 WORKLIST_INSERT(&wkhd, &freefile->fx_list); 9295 freefile = NULL; 9296 } 9297 } 9298 /* 9299 * Move rolled forward dependency completions to the bufwait list 9300 * now that those that were already written have been processed. 9301 */ 9302 if (!LIST_EMPTY(&wkhd) && hadchanges == 0) 9303 panic("handle_written_inodeblock: bufwait but no changes"); 9304 jwork_move(&inodedep->id_bufwait, &wkhd); 9305 9306 if (freefile != NULL) { 9307 /* 9308 * If the inode is goingaway it was never written. Fake up 9309 * the state here so free_inodedep() can succeed. 9310 */ 9311 if (inodedep->id_state & GOINGAWAY) 9312 inodedep->id_state |= COMPLETE | DEPCOMPLETE; 9313 if (free_inodedep(inodedep) == 0) 9314 panic("handle_written_inodeblock: live inodedep %p", 9315 inodedep); 9316 add_to_worklist(&freefile->fx_list, 0); 9317 return (0); 9318 } 9319 9320 /* 9321 * If no outstanding dependencies, free it. 
9322 */ 9323 if (free_inodedep(inodedep) || 9324 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && 9325 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 9326 TAILQ_FIRST(&inodedep->id_extupdt) == 0 && 9327 LIST_FIRST(&inodedep->id_bufwait) == 0)) 9328 return (0); 9329 return (hadchanges); 9330} 9331 9332static int 9333handle_written_indirdep(indirdep, bp, bpp) 9334 struct indirdep *indirdep; 9335 struct buf *bp; 9336 struct buf **bpp; 9337{ 9338 struct allocindir *aip; 9339 int chgs; 9340 9341 if (indirdep->ir_state & GOINGAWAY) 9342 panic("disk_write_complete: indirdep gone"); 9343 chgs = 0; 9344 /* 9345 * If there were rollbacks revert them here. 9346 */ 9347 if (indirdep->ir_saveddata) { 9348 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 9349 free(indirdep->ir_saveddata, M_INDIRDEP); 9350 indirdep->ir_saveddata = 0; 9351 chgs = 1; 9352 } 9353 indirdep->ir_state &= ~UNDONE; 9354 indirdep->ir_state |= ATTACHED; 9355 /* 9356 * Move allocindirs with written pointers to the completehd if 9357 * the the indirdep's pointer is not yet written. Otherwise 9358 * free them here. 9359 */ 9360 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { 9361 LIST_REMOVE(aip, ai_next); 9362 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 9363 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, 9364 ai_next); 9365 continue; 9366 } 9367 free_newblk(&aip->ai_block); 9368 } 9369 /* 9370 * Move allocindirs that have finished dependency processing from 9371 * the done list to the write list after updating the pointers. 9372 */ 9373 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 9374 handle_allocindir_partdone(aip); 9375 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 9376 panic("disk_write_complete: not gone"); 9377 chgs = 1; 9378 } 9379 /* 9380 * If this indirdep has been detached from its newblk during 9381 * I/O we need to keep this dep attached to the buffer so 9382 * deallocate_dependencies can find it and properly resolve 9383 * any outstanding dependencies. 
9384 */ 9385 if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0) 9386 chgs = 1; 9387 if ((bp->b_flags & B_DELWRI) == 0) 9388 stat_indir_blk_ptrs++; 9389 /* 9390 * If there were no changes we can discard the savedbp and detach 9391 * ourselves from the buf. We are only carrying completed pointers 9392 * in this case. 9393 */ 9394 if (chgs == 0) { 9395 struct buf *sbp; 9396 9397 sbp = indirdep->ir_savebp; 9398 sbp->b_flags |= B_INVAL | B_NOCACHE; 9399 indirdep->ir_savebp = NULL; 9400 if (*bpp != NULL) 9401 panic("handle_written_indirdep: bp already exists."); 9402 *bpp = sbp; 9403 } else 9404 bdirty(bp); 9405 /* 9406 * If there are no fresh dependencies and none waiting on writes 9407 * we can free the indirdep. 9408 */ 9409 if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) { 9410 if (indirdep->ir_state & ONDEPLIST) 9411 LIST_REMOVE(indirdep, ir_next); 9412 free_indirdep(indirdep); 9413 return (0); 9414 } 9415 9416 return (chgs); 9417} 9418 9419/* 9420 * Process a diradd entry after its dependent inode has been written. 9421 * This routine must be called with splbio interrupts blocked. 9422 */ 9423static void 9424diradd_inode_written(dap, inodedep) 9425 struct diradd *dap; 9426 struct inodedep *inodedep; 9427{ 9428 9429 dap->da_state |= COMPLETE; 9430 complete_diradd(dap); 9431 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 9432} 9433 9434/* 9435 * Returns true if the bmsafemap will have rollbacks when written. Must 9436 * only be called with lk and the buf lock on the cg held. 9437 */ 9438static int 9439bmsafemap_rollbacks(bmsafemap) 9440 struct bmsafemap *bmsafemap; 9441{ 9442 9443 return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 9444 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); 9445} 9446 9447/* 9448 * Complete a write to a bmsafemap structure. Roll forward any bitmap 9449 * changes if it's not a background write. Set all written dependencies 9450 * to DEPCOMPLETE and free the structure if possible. 
9451 */ 9452static int 9453handle_written_bmsafemap(bmsafemap, bp) 9454 struct bmsafemap *bmsafemap; 9455 struct buf *bp; 9456{ 9457 struct newblk *newblk; 9458 struct inodedep *inodedep; 9459 struct jaddref *jaddref, *jatmp; 9460 struct jnewblk *jnewblk, *jntmp; 9461 uint8_t *inosused; 9462 uint8_t *blksfree; 9463 struct cg *cgp; 9464 struct fs *fs; 9465 ino_t ino; 9466 long bno; 9467 int chgs; 9468 int i; 9469 9470 if ((bmsafemap->sm_state & IOSTARTED) == 0) 9471 panic("initiate_write_bmsafemap: Not started\n"); 9472 chgs = 0; 9473 bmsafemap->sm_state &= ~IOSTARTED; 9474 /* 9475 * Restore unwritten inode allocation pending jaddref writes. 9476 */ 9477 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 9478 cgp = (struct cg *)bp->b_data; 9479 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 9480 inosused = cg_inosused(cgp); 9481 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 9482 ja_bmdeps, jatmp) { 9483 if ((jaddref->ja_state & UNDONE) == 0) 9484 continue; 9485 ino = jaddref->ja_ino % fs->fs_ipg; 9486 if (isset(inosused, ino)) 9487 panic("handle_written_bmsafemap: " 9488 "re-allocated inode"); 9489 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { 9490 if ((jaddref->ja_mode & IFMT) == IFDIR) 9491 cgp->cg_cs.cs_ndir++; 9492 cgp->cg_cs.cs_nifree--; 9493 setbit(inosused, ino); 9494 chgs = 1; 9495 } 9496 jaddref->ja_state &= ~UNDONE; 9497 jaddref->ja_state |= ATTACHED; 9498 free_jaddref(jaddref); 9499 } 9500 } 9501 /* 9502 * Restore any block allocations which are pending journal writes. 
9503 */ 9504 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 9505 cgp = (struct cg *)bp->b_data; 9506 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 9507 blksfree = cg_blksfree(cgp); 9508 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 9509 jntmp) { 9510 if ((jnewblk->jn_state & UNDONE) == 0) 9511 continue; 9512 bno = dtogd(fs, jnewblk->jn_blkno); 9513 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 9514 i++) { 9515 if (bp->b_xflags & BX_BKGRDMARKER) 9516 break; 9517 if ((jnewblk->jn_state & NEWBLOCK) == 0 && 9518 isclr(blksfree, bno + i)) 9519 panic("handle_written_bmsafemap: " 9520 "re-allocated fragment"); 9521 clrbit(blksfree, bno + i); 9522 chgs = 1; 9523 } 9524 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 9525 jnewblk->jn_state |= ATTACHED; 9526 free_jnewblk(jnewblk); 9527 } 9528 } 9529 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 9530 newblk->nb_state |= DEPCOMPLETE; 9531 newblk->nb_state &= ~ONDEPLIST; 9532 newblk->nb_bmsafemap = NULL; 9533 LIST_REMOVE(newblk, nb_deps); 9534 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 9535 handle_allocdirect_partdone( 9536 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 9537 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 9538 handle_allocindir_partdone( 9539 WK_ALLOCINDIR(&newblk->nb_list)); 9540 else if (newblk->nb_list.wk_type != D_NEWBLK) 9541 panic("handle_written_bmsafemap: Unexpected type: %s", 9542 TYPENAME(newblk->nb_list.wk_type)); 9543 } 9544 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 9545 inodedep->id_state |= DEPCOMPLETE; 9546 inodedep->id_state &= ~ONDEPLIST; 9547 LIST_REMOVE(inodedep, id_deps); 9548 inodedep->id_bmsafemap = NULL; 9549 } 9550 if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 9551 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 9552 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 9553 LIST_EMPTY(&bmsafemap->sm_inodedephd)) { 9554 if (chgs) 9555 bdirty(bp); 9556 LIST_REMOVE(bmsafemap, sm_hash); 9557 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 9558 return (0); 9559 } 
9560 bdirty(bp); 9561 return (1); 9562} 9563 9564/* 9565 * Try to free a mkdir dependency. 9566 */ 9567static void 9568complete_mkdir(mkdir) 9569 struct mkdir *mkdir; 9570{ 9571 struct diradd *dap; 9572 9573 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 9574 return; 9575 LIST_REMOVE(mkdir, md_mkdirs); 9576 dap = mkdir->md_diradd; 9577 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 9578 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 9579 dap->da_state |= DEPCOMPLETE; 9580 complete_diradd(dap); 9581 } 9582 WORKITEM_FREE(mkdir, D_MKDIR); 9583} 9584 9585/* 9586 * Handle the completion of a mkdir dependency. 9587 */ 9588static void 9589handle_written_mkdir(mkdir, type) 9590 struct mkdir *mkdir; 9591 int type; 9592{ 9593 9594 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 9595 panic("handle_written_mkdir: bad type"); 9596 mkdir->md_state |= COMPLETE; 9597 complete_mkdir(mkdir); 9598} 9599 9600static void 9601free_pagedep(pagedep) 9602 struct pagedep *pagedep; 9603{ 9604 int i; 9605 9606 if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST)) 9607 return; 9608 for (i = 0; i < DAHASHSZ; i++) 9609 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 9610 return; 9611 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 9612 return; 9613 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 9614 return; 9615 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 9616 return; 9617 LIST_REMOVE(pagedep, pd_hash); 9618 WORKITEM_FREE(pagedep, D_PAGEDEP); 9619} 9620 9621/* 9622 * Called from within softdep_disk_write_complete above. 9623 * A write operation was just completed. Removed inodes can 9624 * now be freed and associated block pointers may be committed. 9625 * Note that this routine is always called from interrupt level 9626 * with further splbio interrupts blocked. 
 */
static int
handle_written_filepage(pagedep, bp)
	struct pagedep *pagedep;
	struct buf *bp;		/* buffer containing the written page */
{
	struct dirrem *dirrem;
	struct diradd *dap, *nextdap;
	struct direct *ep;
	int i, chgs;

	if ((pagedep->pd_state & IOSTARTED) == 0)
		panic("handle_written_filepage: not started");
	pagedep->pd_state &= ~IOSTARTED;
	/*
	 * Process any directory removals that have been committed.
	 */
	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
		LIST_REMOVE(dirrem, dm_next);
		dirrem->dm_state |= COMPLETE;
		dirrem->dm_dirinum = pagedep->pd_ino;
		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
		    ("handle_written_filepage: Journal entries not written."));
		add_to_worklist(&dirrem->dm_list, 0);
	}
	/*
	 * Free any directory additions that have been committed.
	 * If it is a newly allocated block, we have to wait until
	 * the on-disk directory inode claims the new block.
	 */
	if ((pagedep->pd_state & NEWBLOCK) == 0)
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
			free_diradd(dap, NULL);
	/*
	 * Uncommitted directory entries must be restored.
	 */
	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
		     dap = nextdap) {
			nextdap = LIST_NEXT(dap, da_pdlist);
			if (dap->da_state & ATTACHED)
				panic("handle_written_filepage: attached");
			/* Roll the new inode number forward in the page. */
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			ep->d_ino = dap->da_newinum;
			dap->da_state &= ~UNDONE;
			dap->da_state |= ATTACHED;
			chgs = 1;
			/*
			 * If the inode referenced by the directory has
			 * been written out, then the dependency can be
			 * moved to the pending list.
			 */
			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
				LIST_REMOVE(dap, da_pdlist);
				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
				    da_pdlist);
			}
		}
	}
	/*
	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
	 * its correct form.
	 */
	if (chgs) {
		if ((bp->b_flags & B_DELWRI) == 0)
			stat_dir_entry++;
		bdirty(bp);
		return (1);
	}
	/*
	 * If we are not waiting for a new directory block to be
	 * claimed by its inode, then the pagedep will be freed.
	 * Otherwise it will remain to track any new entries on
	 * the page in case they are fsync'ed.
	 */
	if ((pagedep->pd_state & NEWBLOCK) == 0 &&
	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
		LIST_REMOVE(pagedep, pd_hash);
		WORKITEM_FREE(pagedep, D_PAGEDEP);
	}
	return (0);
}

/*
 * Writing back in-core inode structures.
 *
 * The filesystem only accesses an inode's contents when it occupies an
 * "in-core" inode structure. These "in-core" structures are separate from
 * the page frames used to cache inode blocks. Only the latter are
 * transferred to/from the disk. So, when the updated contents of the
 * "in-core" inode structure are copied to the corresponding in-memory inode
 * block, the dependencies are also transferred. The following procedure is
 * called when copying a dirty "in-core" inode to a cached inode block.
 */

/*
 * Called when an inode is loaded from disk. If the effective link count
 * differed from the actual link count when it was last flushed, then we
 * need to ensure that the correct effective link count is put back.
 */
void
softdep_load_inodeblock(ip)
	struct inode *ip;	/* the "in_core" copy of the inode */
{
	struct inodedep *inodedep;

	/*
	 * Check for alternate nlink count.
	 */
	ip->i_effnlink = ip->i_nlink;
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
	    &inodedep) == 0) {
		FREE_LOCK(&lk);
		return;
	}
	/* Re-apply the pending link-count delta tracked by the inodedep. */
	ip->i_effnlink -= inodedep->id_nlinkdelta;
	if (inodedep->id_state & SPACECOUNTED)
		ip->i_flag |= IN_SPACECOUNTED;
	FREE_LOCK(&lk);
}

/*
 * This routine is called just before the "in-core" inode
 * information is to be copied to the in-memory inode block.
 * Recall that an inode block contains several inodes. If
 * the force flag is set, then the dependencies will be
 * cleared so that the update can always be made. Note that
 * the buffer is locked when this routine is called, so we
 * will never be in the middle of writing the inode block
 * to disk.
 */
void
softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;	/* the "in_core" copy of the inode */
	struct buf *bp;		/* the buffer containing the inode block */
	int waitfor;		/* nonzero => update must be allowed */
{
	struct inodedep *inodedep;
	struct inoref *inoref;
	struct worklist *wk;
	struct mount *mp;
	struct buf *ibp;
	struct fs *fs;
	int error;

	mp = UFSTOVFS(ip->i_ump);
	fs = ip->i_fs;
	/*
	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
	 * does not have access to the in-core ip so must write directly into
	 * the inode block buffer when setting freelink.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC)
		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
	else
		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
	/*
	 * If the effective link count is not equal to the actual link
	 * count, then we must track the difference in an inodedep while
	 * the inode is (potentially) tossed out of the cache. Otherwise,
	 * if there is no existing inodedep, then there are no dependencies
	 * to track.
	 */
	ACQUIRE_LOCK(&lk);
again:
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		if (ip->i_effnlink != ip->i_nlink)
			panic("softdep_update_inodeblock: bad link count");
		return;
	}
	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
		panic("softdep_update_inodeblock: bad delta");
	/*
	 * If we're flushing all dependencies we must also move any waiting
	 * for journal writes onto the bufwait list prior to I/O.
	 */
	if (waitfor) {
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
			    == DEPCOMPLETE) {
				stat_jwait_inode++;
				/* jwait() sleeps; restart the scan after. */
				jwait(&inoref->if_list);
				goto again;
			}
		}
	}
	/*
	 * Changes have been initiated. Anything depending on these
	 * changes cannot occur until this inode has been written.
	 */
	inodedep->id_state &= ~COMPLETE;
	if ((inodedep->id_state & ONWORKLIST) == 0)
		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
	/*
	 * Any new dependencies associated with the incore inode must
	 * now be moved to the list associated with the buffer holding
	 * the in-memory copy of the inode. Once merged process any
	 * allocdirects that are completed by the merger.
	 */
	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
		    NULL);
	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
		    NULL);
	/*
	 * Now that the inode has been pushed into the buffer, the
	 * operations dependent on the inode being written to disk
	 * can be moved to the id_bufwait so that they will be
	 * processed when the buffer I/O completes.
	 */
	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	}
	/*
	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them have been written (indicated by
	 * DEPCOMPLETE being set in id_state). If we are doing a
	 * forced sync (e.g., an fsync on a file), we force the bitmap
	 * to be written so that the update can be done.
	 */
	if (waitfor == 0) {
		FREE_LOCK(&lk);
		return;
	}
retry:
	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
		FREE_LOCK(&lk);
		return;
	}
	ibp = inodedep->id_bmsafemap->sm_buf;
	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
	if (ibp == NULL) {
		/*
		 * If ibp came back as NULL, the dependency could have been
		 * freed while we slept.  Look it up again, and check to see
		 * that it has completed.
		 */
		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
			goto retry;
		FREE_LOCK(&lk);
		return;
	}
	FREE_LOCK(&lk);
	if ((error = bwrite(ibp)) != 0)
		softdep_error("softdep_update_inodeblock: bwrite", error);
}

/*
 * Merge a new inode dependency list (such as id_newinoupdt) into an
 * old inode dependency list (such as id_inoupdt).
This routine must be 9887 * called with splbio interrupts blocked. 9888 */ 9889static void 9890merge_inode_lists(newlisthead, oldlisthead) 9891 struct allocdirectlst *newlisthead; 9892 struct allocdirectlst *oldlisthead; 9893{ 9894 struct allocdirect *listadp, *newadp; 9895 9896 newadp = TAILQ_FIRST(newlisthead); 9897 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 9898 if (listadp->ad_offset < newadp->ad_offset) { 9899 listadp = TAILQ_NEXT(listadp, ad_next); 9900 continue; 9901 } 9902 TAILQ_REMOVE(newlisthead, newadp, ad_next); 9903 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 9904 if (listadp->ad_offset == newadp->ad_offset) { 9905 allocdirect_merge(oldlisthead, newadp, 9906 listadp); 9907 listadp = newadp; 9908 } 9909 newadp = TAILQ_FIRST(newlisthead); 9910 } 9911 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 9912 TAILQ_REMOVE(newlisthead, newadp, ad_next); 9913 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 9914 } 9915} 9916 9917/* 9918 * If we are doing an fsync, then we must ensure that any directory 9919 * entries for the inode have been written after the inode gets to disk. 
 */
int
softdep_fsync(vp)
	struct vnode *vp;	/* the "in_core" copy of the inode */
{
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct inoref *inoref;
	struct worklist *wk;
	struct diradd *dap;
	struct mount *mp;
	struct vnode *pvp;
	struct inode *ip;
	struct buf *bp;
	struct fs *fs;
	struct thread *td = curthread;
	int error, flushparent, pagedep_new_block;
	ino_t parentino;
	ufs_lbn_t lbn;

	ip = VTOI(vp);
	fs = ip->i_fs;
	mp = vp->v_mount;
	ACQUIRE_LOCK(&lk);
restart:
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
		/* No inodedep means no outstanding dependencies. */
		FREE_LOCK(&lk);
		return (0);
	}
	/*
	 * Wait out pending journal writes; jwait() sleeps so the lookup
	 * must be redone from "restart:".
	 */
	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
		    == DEPCOMPLETE) {
			stat_jwait_inode++;
			jwait(&inoref->if_list);
			goto restart;
		}
	}
	if (!LIST_EMPTY(&inodedep->id_inowait) ||
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
		panic("softdep_fsync: pending ops %p", inodedep);
	for (error = 0, flushparent = 0; ; ) {
		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
			break;
		if (wk->wk_type != D_DIRADD)
			panic("softdep_fsync: Unexpected type %s",
			    TYPENAME(wk->wk_type));
		dap = WK_DIRADD(wk);
		/*
		 * Flush our parent if this directory entry has a MKDIR_PARENT
		 * dependency or is contained in a newly allocated block.
		 */
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		parentino = pagedep->pd_ino;
		lbn = pagedep->pd_lbn;
		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
			panic("softdep_fsync: dirty");
		if ((dap->da_state & MKDIR_PARENT) ||
		    (pagedep->pd_state & NEWBLOCK))
			flushparent = 1;
		else
			flushparent = 0;
		/*
		 * If we are being fsync'ed as part of vgone'ing this vnode,
		 * then we will not be able to release and recover the
		 * vnode below, so we just have to give up on writing its
		 * directory entry out. It will eventually be written, just
		 * not now, but then the user was not asking to have it
		 * written, so we are not breaking any promises.
		 */
		if (vp->v_iflag & VI_DOOMED)
			break;
		/*
		 * We prevent deadlock by always fetching inodes from the
		 * root, moving down the directory tree. Thus, when fetching
		 * our parent directory, we first try to get the lock. If
		 * that fails, we must unlock ourselves before requesting
		 * the lock on our parent. See the comment in ufs_lookup
		 * for details on possible races.
		 */
		FREE_LOCK(&lk);
		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
		    FFSV_FORCEINSMQ)) {
			/*
			 * The non-blocking attempt failed: drop our own
			 * vnode lock, busy the mount so it cannot go away,
			 * then retry the parent fetch blocking.  vp must be
			 * re-locked and re-checked for VI_DOOMED afterwards.
			 */
			error = vfs_busy(mp, MBF_NOWAIT);
			if (error != 0) {
				vfs_ref(mp);
				VOP_UNLOCK(vp, 0);
				error = vfs_busy(mp, 0);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				vfs_rel(mp);
				if (error != 0)
					return (ENOENT);
				if (vp->v_iflag & VI_DOOMED) {
					vfs_unbusy(mp);
					return (ENOENT);
				}
			}
			VOP_UNLOCK(vp, 0);
			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
			    &pvp, FFSV_FORCEINSMQ);
			vfs_unbusy(mp);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			if (vp->v_iflag & VI_DOOMED) {
				if (error == 0)
					vput(pvp);
				error = ENOENT;
			}
			if (error != 0)
				return (error);
		}
		/*
		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
		 * that are contained in direct blocks will be resolved by
		 * doing a ffs_update. Pagedeps contained in indirect blocks
		 * may require a complete sync'ing of the directory. So, we
		 * try the cheap and fast ffs_update first, and if that fails,
		 * then we do the slower ffs_syncvnode of the directory.
		 */
		if (flushparent) {
			int locked;

			if ((error = ffs_update(pvp, 1)) != 0) {
				vput(pvp);
				return (error);
			}
			ACQUIRE_LOCK(&lk);
			locked = 1;
			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
					if (wk->wk_type != D_DIRADD)
						panic("softdep_fsync: Unexpected type %s",
						    TYPENAME(wk->wk_type));
					dap = WK_DIRADD(wk);
					if (dap->da_state & DIRCHG)
						pagedep = dap->da_previous->dm_pagedep;
					else
						pagedep = dap->da_pagedep;
					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
					FREE_LOCK(&lk);
					locked = 0;
					if (pagedep_new_block &&
					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
						vput(pvp);
						return (error);
					}
				}
			}
			if (locked)
				FREE_LOCK(&lk);
		}
		/*
		 * Flush directory page containing the inode's name.
		 */
		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
		    &bp);
		if (error == 0)
			error = bwrite(bp);
		else
			brelse(bp);
		vput(pvp);
		if (error != 0)
			return (error);
		ACQUIRE_LOCK(&lk);
		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
			break;
	}
	FREE_LOCK(&lk);
	return (0);
}

/*
 * Flush all the dirty bitmaps associated with the block device
 * before flushing the rest of the dirty blocks so as to reduce
 * the number of dependencies that will have to be rolled back.
 */
void
softdep_fsync_mountdev(vp)
	struct vnode *vp;
{
	struct buf *bp, *nbp;
	struct worklist *wk;
	struct bufobj *bo;

	if (!vn_isdisk(vp, NULL))
		panic("softdep_fsync_mountdev: vnode not a disk");
	bo = &vp->v_bufobj;
restart:
	BO_LOCK(bo);
	ACQUIRE_LOCK(&lk);
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * If it is already scheduled, skip to the next buffer.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;

		if ((bp->b_flags & B_DELWRI) == 0)
			panic("softdep_fsync_mountdev: not dirty");
		/*
		 * We are only interested in bitmaps with outstanding
		 * dependencies.
		 */
		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
		    wk->wk_type != D_BMSAFEMAP ||
		    (bp->b_vflags & BV_BKGRDINPROG)) {
			BUF_UNLOCK(bp);
			continue;
		}
		/*
		 * Both locks are dropped for the write; the dirty list may
		 * have changed by then, so rescan from the top.
		 */
		FREE_LOCK(&lk);
		BO_UNLOCK(bo);
		bremfree(bp);
		(void) bawrite(bp);
		goto restart;
	}
	FREE_LOCK(&lk);
	drain_output(vp);
	BO_UNLOCK(bo);
}

/*
 * This routine is called when we are trying to synchronously flush a
 * file. This routine must eliminate any filesystem metadata dependencies
 * so that the syncing routine can succeed by pushing the dirty blocks
 * associated with the file. If any I/O errors occur, they are returned.
 */
int
softdep_sync_metadata(struct vnode *vp)
{
	struct pagedep *pagedep;
	struct allocindir *aip;
	struct newblk *newblk;
	struct buf *bp, *nbp;
	struct worklist *wk;
	struct bufobj *bo;
	int i, error, waitfor;

	if (!DOINGSOFTDEP(vp))
		return (0);
	/*
	 * Ensure that any direct block dependencies have been cleared.
	 */
	ACQUIRE_LOCK(&lk);
	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
		FREE_LOCK(&lk);
		return (error);
	}
	FREE_LOCK(&lk);
	/*
	 * For most files, the only metadata dependencies are the
	 * cylinder group maps that allocate their inode or blocks.
	 * The block allocation dependencies can be found by traversing
	 * the dependency lists for any buffers that remain on their
	 * dirty buffer list. The inode allocation dependency will
	 * be resolved when the inode is updated with MNT_WAIT.
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 */
	waitfor = MNT_NOWAIT;	/* pass 1; pass 2 reruns with MNT_WAIT */
	bo = &vp->v_bufobj;

top:
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 */
	BO_LOCK(bo);
	drain_output(vp);
	while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) {
		bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT);
		if (bp)
			break;
	}
	BO_UNLOCK(bo);
	if (bp == NULL)
		return (0);
loop:
	/* While syncing snapshots, we must allow recursive lookups */
	BUF_AREC(bp);
	ACQUIRE_LOCK(&lk);
	/*
	 * As we hold the buffer locked, none of its dependencies
	 * will disappear.
	 */
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk != NULL) {
				stat_jwait_newblk++;
				jwait(&newblk->nb_jnewblk->jn_list);
				/*
				 * NOTE(review): this jumps to the "restart:"
				 * label inside the D_INDIRDEP case while wk
				 * still refers to a D_ALLOCDIRECT/ALLOCINDIR
				 * item, so WK_INDIRDEP(wk) there would
				 * reinterpret it (and panic under DEBUG).
				 * Looks suspicious -- confirm intent.
				 */
				goto restart;
			}
			if (newblk->nb_state & DEPCOMPLETE)
				continue;
			nbp = newblk->nb_bmsafemap->sm_buf;
			nbp = getdirtybuf(nbp, &lk, waitfor);
			if (nbp == NULL)
				continue;
			FREE_LOCK(&lk);
			if (waitfor == MNT_NOWAIT) {
				bawrite(nbp);
			} else if ((error = bwrite(nbp)) != 0) {
				/* break exits the switch to loop_end below */
				break;
			}
			ACQUIRE_LOCK(&lk);
			continue;

		case D_INDIRDEP:
		restart:

			LIST_FOREACH(aip,
			    &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
				newblk = (struct newblk *)aip;
				if (newblk->nb_jnewblk != NULL) {
					stat_jwait_newblk++;
					jwait(&newblk->nb_jnewblk->jn_list);
					goto restart;
				}
				if (newblk->nb_state & DEPCOMPLETE)
					continue;
				nbp = newblk->nb_bmsafemap->sm_buf;
				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
				if (nbp == NULL)
					goto restart;
				FREE_LOCK(&lk);
				if ((error = bwrite(nbp)) != 0) {
					goto loop_end;
				}
				ACQUIRE_LOCK(&lk);
				goto restart;
			}
			continue;

		case D_PAGEDEP:
			/*
			 * We are trying to sync a directory that may
			 * have dependencies on both its own metadata
			 * and/or dependencies on the inodes of any
			 * recently allocated files. We walk its diradd
			 * lists pushing out the associated inode.
			 */
			pagedep = WK_PAGEDEP(wk);
			for (i = 0; i < DAHASHSZ; i++) {
				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
					continue;
				if ((error =
				    flush_pagedep_deps(vp, wk->wk_mp,
					&pagedep->pd_diraddhd[i]))) {
					FREE_LOCK(&lk);
					goto loop_end;
				}
			}
			continue;

		default:
			panic("softdep_sync_metadata: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	loop_end:
		/* We reach here only in error and unlocked */
		if (error == 0)
			panic("softdep_sync_metadata: zero error");
		BUF_NOREC(bp);
		bawrite(bp);
		return (error);
	}
	FREE_LOCK(&lk);
	BO_LOCK(bo);
	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
		nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT);
		if (nbp)
			break;
	}
	BO_UNLOCK(bo);
	BUF_NOREC(bp);
	bawrite(bp);
	if (nbp != NULL) {
		bp = nbp;
		goto loop;
	}
	/*
	 * The brief unlock is to allow any pent up dependency
	 * processing to be done. Then proceed with the second pass.
	 */
	if (waitfor == MNT_NOWAIT) {
		waitfor = MNT_WAIT;
		goto top;
	}

	/*
	 * If we have managed to get rid of all the dirty buffers,
	 * then we are done. For certain directories and block
	 * devices, we may need to do further work.
	 *
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 */
	BO_LOCK(bo);
	drain_output(vp);
	BO_UNLOCK(bo);
	return ffs_update(vp, 1);
	/* return (0); */
}

/*
 * Flush the dependencies associated with an inodedep.
 * Called with splbio blocked.
 */
static int
flush_inodedep_deps(mp, ino)
	struct mount *mp;
	ino_t ino;
{
	struct inodedep *inodedep;
	struct inoref *inoref;
	int error, waitfor;

	/*
	 * This work is done in two passes. The first pass grabs most
	 * of the buffers and begins asynchronously writing them. The
	 * only way to wait for these asynchronous writes is to sleep
	 * on the filesystem vnode which may stay busy for a long time
	 * if the filesystem is active. So, instead, we make a second
	 * pass over the dependencies blocking on each write. In the
	 * usual case we will be blocking against a write that we
	 * initiated, so when it is done the dependency will have been
	 * resolved. Thus the second pass is expected to end quickly.
	 * We give a brief window at the top of the loop to allow
	 * any pending I/O to complete.
	 */
	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
		if (error)
			return (error);
		/* Brief unlock window; lk is held again on loop entry. */
		FREE_LOCK(&lk);
		ACQUIRE_LOCK(&lk);
restart:
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			return (0);
		/* Wait out journal writes; jwait() sleeps, so re-lookup. */
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
			    == DEPCOMPLETE) {
				stat_jwait_inode++;
				jwait(&inoref->if_list);
				goto restart;
			}
		}
		/* A nonzero return means flush_deplist slept; rescan. */
		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
			continue;
		/*
		 * If pass2, we are done, otherwise do pass 2.
		 */
		if (waitfor == MNT_WAIT)
			break;
		waitfor = MNT_WAIT;
	}
	/*
	 * Try freeing inodedep in case all dependencies have been removed.
	 */
	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	return (0);
}

/*
 * Flush an inode dependency list.
 * Called with splbio blocked.
 * Returns nonzero when it slept or initiated I/O (the caller must
 * rescan), zero when the list required no further work; a write
 * failure is reported through *errorp.
 */
static int
flush_deplist(listhead, waitfor, errorp)
	struct allocdirectlst *listhead;
	int waitfor;
	int *errorp;
{
	struct allocdirect *adp;
	struct newblk *newblk;
	struct buf *bp;

	mtx_assert(&lk, MA_OWNED);
	TAILQ_FOREACH(adp, listhead, ad_next) {
		newblk = (struct newblk *)adp;
		if (newblk->nb_jnewblk != NULL) {
			stat_jwait_newblk++;
			jwait(&newblk->nb_jnewblk->jn_list);
			return (1);
		}
		if (newblk->nb_state & DEPCOMPLETE)
			continue;
		bp = newblk->nb_bmsafemap->sm_buf;
		bp = getdirtybuf(bp, &lk, waitfor);
		if (bp == NULL) {
			if (waitfor == MNT_NOWAIT)
				continue;
			return (1);
		}
		FREE_LOCK(&lk);
		if (waitfor == MNT_NOWAIT) {
			bawrite(bp);
		} else if ((*errorp = bwrite(bp)) != 0) {
			ACQUIRE_LOCK(&lk);
			return (1);
		}
		ACQUIRE_LOCK(&lk);
		return (1);
	}
	return (0);
}

/*
 * Flush dependencies associated with an allocdirect block.
 */
static int
flush_newblk_dep(vp, mp, lbn)
	struct vnode *vp;
	struct mount *mp;
	ufs_lbn_t lbn;
{
	struct newblk *newblk;
	struct bufobj *bo;
	struct inode *ip;
	struct buf *bp;
	ufs2_daddr_t blkno;
	int error;

	error = 0;
	bo = &vp->v_bufobj;
	ip = VTOI(vp);
	blkno = DIP(ip, i_db[lbn]);
	if (blkno == 0)
		panic("flush_newblk_dep: Missing block");
	ACQUIRE_LOCK(&lk);
	/*
	 * Loop until all dependencies related to this block are satisfied.
	 * We must be careful to restart after each sleep in case a write
	 * completes some part of this process for us.
	 */
	for (;;) {
		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
			FREE_LOCK(&lk);
			break;
		}
		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
			panic("flush_newblk_deps: Bad newblk %p", newblk);
		/*
		 * Flush the journal.
		 */
		if (newblk->nb_jnewblk != NULL) {
			stat_jwait_newblk++;
			jwait(&newblk->nb_jnewblk->jn_list);
			continue;
		}
		/*
		 * Write the bitmap dependency.
		 */
		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
			bp = newblk->nb_bmsafemap->sm_buf;
			bp = getdirtybuf(bp, &lk, MNT_WAIT);
			if (bp == NULL)
				continue;
			FREE_LOCK(&lk);
			error = bwrite(bp);
			if (error)
				break;
			ACQUIRE_LOCK(&lk);
			continue;
		}
		/*
		 * Write the buffer.
		 */
		FREE_LOCK(&lk);
		BO_LOCK(bo);
		bp = gbincore(bo, lbn);
		if (bp != NULL) {
			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
			    LK_INTERLOCK, BO_MTX(bo));
			if (error == ENOLCK) {
				ACQUIRE_LOCK(&lk);
				continue; /* Slept, retry */
			}
			if (error != 0)
				break;	/* Failed */
			if (bp->b_flags & B_DELWRI) {
				bremfree(bp);
				error = bwrite(bp);
				if (error)
					break;
			} else
				BUF_UNLOCK(bp);
		} else
			BO_UNLOCK(bo);
		/*
		 * We have to wait for the direct pointers to
		 * point at the newdirblk before the dependency
		 * will go away.
		 */
		error = ffs_update(vp, MNT_WAIT);
		if (error)
			break;
		ACQUIRE_LOCK(&lk);
	}
	return (error);
}

/*
 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 * Called with splbio blocked.
 */
static int
flush_pagedep_deps(pvp, mp, diraddhdp)
	struct vnode *pvp;
	struct mount *mp;
	struct diraddhd *diraddhdp;
{
	struct inodedep *inodedep;
	struct inoref *inoref;
	struct ufsmount *ump;
	struct diradd *dap;
	struct vnode *vp;
	int error = 0;
	struct buf *bp;
	ino_t inum;

	ump = VFSTOUFS(mp);
restart:
	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
		/*
		 * Flush ourselves if this directory entry
		 * has a MKDIR_PARENT dependency.
		 */
		if (dap->da_state & MKDIR_PARENT) {
			FREE_LOCK(&lk);
			if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
				break;
			ACQUIRE_LOCK(&lk);
			/*
			 * If that cleared dependencies, go on to next.
			 */
			if (dap != LIST_FIRST(diraddhdp))
				continue;
			if (dap->da_state & MKDIR_PARENT)
				panic("flush_pagedep_deps: MKDIR_PARENT");
		}
		/*
		 * A newly allocated directory must have its "." and
		 * ".." entries written out before its name can be
		 * committed in its parent.
		 */
		inum = dap->da_newinum;
		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
			panic("flush_pagedep_deps: lost inode1");
		/*
		 * Wait for any pending journal adds to complete so we don't
		 * cause rollbacks while syncing.
		 */
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
			    == DEPCOMPLETE) {
				stat_jwait_inode++;
				jwait(&inoref->if_list);
				goto restart;
			}
		}
		if (dap->da_state & MKDIR_BODY) {
			FREE_LOCK(&lk);
			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ)))
				break;
			error = flush_newblk_dep(vp, mp, 0);
			/*
			 * If we still have the dependency we might need to
			 * update the vnode to sync the new link count to
			 * disk.
			 */
			if (error == 0 && dap == LIST_FIRST(diraddhdp))
				error = ffs_update(vp, MNT_WAIT);
			vput(vp);
			if (error != 0)
				break;
			ACQUIRE_LOCK(&lk);
			/*
			 * If that cleared dependencies, go on to next.
			 */
			if (dap != LIST_FIRST(diraddhdp))
				continue;
			if (dap->da_state & MKDIR_BODY) {
				inodedep_lookup(UFSTOVFS(ump), inum, 0,
				    &inodedep);
				panic("flush_pagedep_deps: MKDIR_BODY "
				    "inodedep %p dap %p vp %p",
				    inodedep, dap, vp);
			}
		}
		/*
		 * Flush the inode on which the directory entry depends.
		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
		 * the only remaining dependency is that the updated inode
		 * count must get pushed to disk. The inode has already
		 * been pushed into its inode buffer (via VOP_UPDATE) at
		 * the time of the reference count change. So we need only
		 * locate that buffer, ensure that there will be no rollback
		 * caused by a bitmap dependency, then write the inode buffer.
		 */
retry:
		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
			panic("flush_pagedep_deps: lost inode");
		/*
		 * If the inode still has bitmap dependencies,
		 * push them to disk.
		 */
		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
			bp = inodedep->id_bmsafemap->sm_buf;
			bp = getdirtybuf(bp, &lk, MNT_WAIT);
			if (bp == NULL)
				goto retry;
			FREE_LOCK(&lk);
			if ((error = bwrite(bp)) != 0)
				break;
			ACQUIRE_LOCK(&lk);
			if (dap != LIST_FIRST(diraddhdp))
				continue;
		}
		/*
		 * If the inode is still sitting in a buffer waiting
		 * to be written or waiting for the link count to be
		 * adjusted update it here to flush it to disk.
		 */
		if (dap == LIST_FIRST(diraddhdp)) {
			FREE_LOCK(&lk);
			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ)))
				break;
			error = ffs_update(vp, MNT_WAIT);
			vput(vp);
			if (error)
				break;
			ACQUIRE_LOCK(&lk);
		}
		/*
		 * If we have failed to get rid of all the dependencies
		 * then something is seriously wrong.
		 *
		 * NOTE(review): inum is an ino_t printed with %d here --
		 * verify the format matches ino_t's width/signedness.
		 */
		if (dap == LIST_FIRST(diraddhdp)) {
			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
			panic("flush_pagedep_deps: failed to flush "
			    "inodedep %p ino %d dap %p", inodedep, inum, dap);
		}
	}
	/* The error exits above leave lk dropped; restore it for the caller. */
	if (error)
		ACQUIRE_LOCK(&lk);
	return (error);
}

/*
 * A large burst of file addition or deletion activity can drive the
 * memory load excessively high. First attempt to slow things down
 * using the techniques below. If that fails, this routine requests
 * the offending operations to fall back to running synchronously
 * until the memory load returns to a reasonable level.
 */
int
softdep_slowdown(vp)
	struct vnode *vp;
{
	int max_softdeps_hard;

	ACQUIRE_LOCK(&lk);
	/* Allow a 10% overshoot before throttling kicks in. */
	max_softdeps_hard = max_softdeps * 11 / 10;
	if (num_dirrem < max_softdeps_hard / 2 &&
	    num_inodedep < max_softdeps_hard &&
	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
	    num_freeblkdep < max_softdeps_hard) {
		FREE_LOCK(&lk);
		return (0);
	}
	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
		softdep_speedup();
	stat_sync_limit_hit += 1;
	FREE_LOCK(&lk);
	/* Nonzero tells the caller to fall back to synchronous operation. */
	return (1);
}

/*
 * Called by the allocation routines when they are about to fail
 * in the hope that we can free up some disk space.
 *
 * First check to see if the work list has anything on it. If it has,
 * clean up entries until we successfully free some space.
 * Because this
 * process holds inodes locked, we cannot handle any remove requests
 * that might block on a locked inode as that could lead to deadlock.
 * If the worklist yields no free space, encourage the syncer daemon
 * to help us. In no event will we try for longer than tickdelay seconds.
 * Returns 1 when the pending-block backlog was drained (or enough blocks
 * freed), 0 on timeout or when the initial update fails.
 */
int
softdep_request_cleanup(fs, vp)
	struct fs *fs;
	struct vnode *vp;
{
	struct ufsmount *ump;
	long starttime;
	ufs2_daddr_t needed;
	int error;

	ump = VTOI(vp)->i_ump;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
	starttime = time_second + tickdelay;
	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to update the vnode
	 * as we may recurse into the copy-on-write routine.
	 */
	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
		UFS_UNLOCK(ump);
		error = ffs_update(vp, 1);
		UFS_LOCK(ump);
		if (error != 0)
			return (0);
	}
	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
		if (time_second > starttime)
			return (0);
		/* Swap the UFS mount lock for the softdep lock while working. */
		UFS_UNLOCK(ump);
		ACQUIRE_LOCK(&lk);
		process_removes(vp);
		if (ump->softdep_on_worklist > 0 &&
		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
			stat_worklist_push += 1;
			FREE_LOCK(&lk);
			UFS_LOCK(ump);
			continue;
		}
		request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
		FREE_LOCK(&lk);
		UFS_LOCK(ump);
	}
	return (1);
}

/*
 * If memory utilization has gotten too high, deliberately slow things
 * down and speed up the I/O processing.
 */
extern struct thread *syncertd;
static int
request_cleanup(mp, resource)
	struct mount *mp;
	int resource;
{
	struct thread *td = curthread;
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * We never hold up the filesystem syncer or buf daemon.
	 */
	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
		return (0);
	ump = VFSTOUFS(mp);
	/*
	 * First check to see if the work list has gotten backlogged.
	 * If it has, co-opt this process to help clean up two entries.
	 * Because this process may hold inodes locked, we cannot
	 * handle any remove requests that might block on a locked
	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
	 * to avoid recursively processing the worklist.
	 */
	if (ump->softdep_on_worklist > max_softdeps / 10) {
		td->td_pflags |= TDP_SOFTDEP;
		process_worklist_item(mp, LK_NOWAIT);
		process_worklist_item(mp, LK_NOWAIT);
		td->td_pflags &= ~TDP_SOFTDEP;
		stat_worklist_push += 2;
		return(1);
	}
	/*
	 * Next, we attempt to speed up the syncer process. If that
	 * is successful, then we allow the process to continue.
	 */
	if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
		return(0);
	/*
	 * If we are resource constrained on inode dependencies, try
	 * flushing some dirty inodes. Otherwise, we are constrained
	 * by file deletions, so try accelerating flushes of directories
	 * with removal dependencies. We would like to do the cleanup
	 * here, but we probably hold an inode locked at this point and
	 * that might deadlock against one that we try to clean. So,
	 * the best that we can do is request the syncer daemon to do
	 * the cleanup for us.
	 */
	switch (resource) {

	case FLUSH_INODES:
		stat_ino_limit_push += 1;
		req_clear_inodedeps += 1;
		stat_countp = &stat_ino_limit_hit;
		break;

	case FLUSH_REMOVE:
	case FLUSH_REMOVE_WAIT:
		stat_blk_limit_push += 1;
		req_clear_remove += 1;
		stat_countp = &stat_blk_limit_hit;
		break;

	default:
		panic("request_cleanup: unknown type");
	}
	/*
	 * Hopefully the syncer daemon will catch up and awaken us.
	 * We wait at most tickdelay before proceeding in any case.
	 */
	proc_waiting += 1;
	if (callout_pending(&softdep_callout) == FALSE)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);

	/* msleep() atomically releases and re-acquires lk while sleeping. */
	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
	proc_waiting -= 1;
	return (1);
}

/*
 * Awaken processes pausing in request_cleanup and clear proc_waiting
 * to indicate that there is no longer a timer running.
 */
static void
pause_timer(arg)
	void *arg;	/* unused; stat_countp is set by request_cleanup */
{

	/*
	 * The callout_ API has acquired mtx and will hold it around this
	 * function call.
	 */
	*stat_countp += 1;
	wakeup_one(&proc_waiting);
	/* Re-arm while any waiters remain so each gets woken in turn. */
	if (proc_waiting > 0)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);
}

/*
 * Flush out a directory with at least one removal dependency in an effort to
 * reduce the number of dirrem, freefile, and freeblks dependency structures.
 */
static void
clear_remove(td)
	struct thread *td;
{
	struct pagedep_hashhead *pagedephd;
	struct pagedep *pagedep;
	/* Rotating start bucket so successive calls scan different chains. */
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct bufobj *bo;
	int error, cnt;
	ino_t ino;

	mtx_assert(&lk, MA_OWNED);

	/*
	 * Scan the pagedep hash table, starting where the previous call
	 * left off, for a directory with a pending removal dependency.
	 * At most one directory vnode is flushed per call (we return
	 * after the first one found).
	 */
	for (cnt = 0; cnt < pagedep_hash; cnt++) {
		pagedephd = &pagedep_hashtbl[next++];
		if (next >= pagedep_hash)
			next = 0;
		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
			if (LIST_EMPTY(&pagedep->pd_dirremhd))
				continue;
			mp = pagedep->pd_list.wk_mp;
			ino = pagedep->pd_ino;
			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
				continue;
			FREE_LOCK(&lk);

			/*
			 * Let unmount clear deps
			 */
			error = vfs_busy(mp, MBF_NOWAIT);
			if (error != 0)
				goto finish_write;
			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ);
			vfs_unbusy(mp);
			if (error != 0) {
				softdep_error("clear_remove: vget", error);
				goto finish_write;
			}
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_remove: fsync", error);
			bo = &vp->v_bufobj;
			BO_LOCK(bo);
			drain_output(vp);
			BO_UNLOCK(bo);
			vput(vp);
		finish_write:
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
	}
}

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 */
static void
clear_inodedeps(td)
	struct thread *td;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	/* Rotating start bucket so successive calls scan different chains. */
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Pick a random inode dependency to be cleared.
	 * We will then gather up all the inodes in its block
	 * that have dependencies and flush them out.
	 */
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[next++];
		if (next >= inodedep_hash)
			next = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL)
		return;
	fs = inodedep->id_fs;
	mp = inodedep->id_list.wk_mp;
	/*
	 * Find the last inode in the block with dependencies.
	 */
	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			continue;
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
			continue;
		FREE_LOCK(&lk);
		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
		if (error != 0) {
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
		    FFSV_FORCEINSMQ)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			vfs_unbusy(mp);
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		vfs_unbusy(mp);
		if (ino == lastino) {
			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_inodedeps: fsync2", error);
			BO_LOCK(&vp->v_bufobj);
			drain_output(vp);
			BO_UNLOCK(&vp->v_bufobj);
		}
		vput(vp);
		vn_finished_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written.  If wantcount
 * is set, return number of dependencies, otherwise just yes or no
 * (returns at the first dependency found when wantcount is zero).
 * Acquires and releases the softdep lock (lk) internally.
 */
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;
{
	struct worklist *wk;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct freeblks *freeblks;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct newblk *newblk;
	struct mkdir *mkdir;
	struct diradd *dap;
	int i, retval;

	retval = 0;
	ACQUIRE_LOCK(&lk);
	/* Walk every dependency hung off this buffer and classify it. */
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
				/* Add reference dependency.
				 */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);

			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
					/* Journal remove ref dependency. */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			for (i = 0; i < DAHASHSZ; i++) {

				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
			bmsafemap = WK_BMSAFEMAP(wk);
			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
				/* Allocate block dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
				/* Freeblk journal dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk) {
				/* Journal allocate dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_MKDIR:
			mkdir = WK_MKDIR(wk);
			if (mkdir->md_jaddref) {
				/* Journal reference dependency.
				 */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JSEG:
		case D_SBDEP:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(&lk);
	return retval;
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with a locked mtx parameter.
 * Return acquired buffer or NULL on failure.  On success the buffer is
 * removed from the free list (bremfree); mtx remains held on all paths,
 * though it may have been dropped and re-acquired internally.
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;
	struct mtx *mtx;
	int waitfor;
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * mtx, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		/*
		 * A background write is in progress.  When called with the
		 * softdep lock and MNT_WAIT, wait for it to complete under
		 * the buffer object lock before giving up.
		 */
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_LOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	/* Only delayed-write (dirty) buffers are of interest to callers. */
	if ((bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);
	return (bp);
}


/*
 * Check if it is safe to suspend the file system now.  On entry,
 * the vnode interlock for devvp should be held.  Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	/*
	 * Loop until we hold the softdep lock, the mount interlock, and
	 * the bufobj lock together with no secondary writes in progress.
	 * Each retry path drops locks in a deadlock-safe order and
	 * re-acquires the bufobj lock before continuing.
	 */
	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			FREE_LOCK(&lk);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_deps != 0 ||
	    ump->softdep_deps != 0 ||
	    softdep_accdeps != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	FREE_LOCK(&lk);
	BO_UNLOCK(bo);
	return (error);
}


/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated.  These will
 * later be used to detect that softdep processing has occurred.
 * The snapshot is taken atomically under the softdep lock.
 */
void
softdep_get_depcounts(struct mount *mp,
		      int *softdep_depsp,
		      int *softdep_accdepsp)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	*softdep_depsp = ump->softdep_deps;
	*softdep_accdepsp = ump->softdep_accdeps;
	FREE_LOCK(&lk);
}

/*
 * Wait for pending output on a vnode to complete.
 * Must be called with vnode lock and interlock locked.
 *
 * XXX: Should just be a call to bufobj_wwait().
 */
static void
drain_output(vp)
	struct vnode *vp;
{
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	ASSERT_VOP_LOCKED(vp, "drain_output");
	ASSERT_BO_LOCKED(bo);

	/* Sleep until all writes in flight on this bufobj have finished. */
	while (bo->bo_numoutput) {
		bo->bo_flag |= BO_WWAIT;
		msleep((caddr_t)&bo->bo_numoutput,
		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
	}
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies.  This should only happen if an I/O error has
 * occurred.  The routine is called with the buffer locked.
 */
static void
softdep_deallocate_dependencies(bp)
	struct buf *bp;
{

	/* Losing dependencies without an I/O error is a fatal bug. */
	if ((bp->b_ioflags & BIO_ERROR) == 0)
		panic("softdep_deallocate_dependencies: dangling deps");
	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	/* An unrecovered I/O error leaves the filesystem inconsistent. */
	panic("softdep_deallocate_dependencies: unrecovered I/O error");
}

/*
 * Function to handle asynchronous write errors in the filesystem.
 * Currently just logs the failing function and errno value.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}

#ifdef DDB

/*
 * Print a one-line summary of an inodedep; with verbose != 0 also dump
 * the heads of its pending-work queues.
 */
static void
inodedep_print(struct inodedep *inodedep, int verbose)
{
	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
	    " saveino %p\n",
	    inodedep, inodedep->id_fs, inodedep->id_state,
	    (intmax_t)inodedep->id_ino,
	    (intmax_t)fsbtodb(inodedep->id_fs,
	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
	    inodedep->id_savedino1);

	if (verbose == 0)
		return;

	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
	    "mkdiradd %p\n",
	    LIST_FIRST(&inodedep->id_pendinghd),
	    LIST_FIRST(&inodedep->id_bufwait),
	    LIST_FIRST(&inodedep->id_inowait),
	    TAILQ_FIRST(&inodedep->id_inoreflst),
	    inodedep->id_mkdiradd);
	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
	    TAILQ_FIRST(&inodedep->id_inoupdt),
	    TAILQ_FIRST(&inodedep->id_newinoupdt),
	    TAILQ_FIRST(&inodedep->id_extupdt),
	    TAILQ_FIRST(&inodedep->id_newextupdt));
}

/* DDB command: verbosely print the inodedep at the given address. */
DB_SHOW_COMMAND(inodedep, db_show_inodedep)
{

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	inodedep_print((struct inodedep*)addr, 1);
}
11421DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) 11422{ 11423 struct inodedep_hashhead *inodedephd; 11424 struct inodedep *inodedep; 11425 struct fs *fs; 11426 int cnt; 11427 11428 fs = have_addr ? (struct fs *)addr : NULL; 11429 for (cnt = 0; cnt < inodedep_hash; cnt++) { 11430 inodedephd = &inodedep_hashtbl[cnt]; 11431 LIST_FOREACH(inodedep, inodedephd, id_hash) { 11432 if (fs != NULL && fs != inodedep->id_fs) 11433 continue; 11434 inodedep_print(inodedep, 0); 11435 } 11436 } 11437} 11438 11439DB_SHOW_COMMAND(worklist, db_show_worklist) 11440{ 11441 struct worklist *wk; 11442 11443 if (have_addr == 0) { 11444 db_printf("Address required\n"); 11445 return; 11446 } 11447 wk = (struct worklist *)addr; 11448 printf("worklist: %p type %s state 0x%X\n", 11449 wk, TYPENAME(wk->wk_type), wk->wk_state); 11450} 11451 11452DB_SHOW_COMMAND(workhead, db_show_workhead) 11453{ 11454 struct workhead *wkhd; 11455 struct worklist *wk; 11456 int i; 11457 11458 if (have_addr == 0) { 11459 db_printf("Address required\n"); 11460 return; 11461 } 11462 wkhd = (struct workhead *)addr; 11463 wk = LIST_FIRST(wkhd); 11464 for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) 11465 db_printf("worklist: %p type %s state 0x%X", 11466 wk, TYPENAME(wk->wk_type), wk->wk_state); 11467 if (i == 100) 11468 db_printf("workhead overflow"); 11469 printf("\n"); 11470} 11471 11472 11473DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) 11474{ 11475 struct jaddref *jaddref; 11476 struct diradd *diradd; 11477 struct mkdir *mkdir; 11478 11479 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { 11480 diradd = mkdir->md_diradd; 11481 db_printf("mkdir: %p state 0x%X dap %p state 0x%X", 11482 mkdir, mkdir->md_state, diradd, diradd->da_state); 11483 if ((jaddref = mkdir->md_jaddref) != NULL) 11484 db_printf(" jaddref %p jaddref state 0x%X", 11485 jaddref, jaddref->ja_state); 11486 db_printf("\n"); 11487 } 11488} 11489 11490#endif /* DDB */ 11491 11492#endif /* SOFTUPDATES */ 11493