ffs_softdep.c revision 219895
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 219895 2011-03-23 05:13:54Z mckusick $");
44
45#include "opt_ffs.h"
46#include "opt_ddb.h"
47
48/*
49 * For now we want the safety net that the DEBUG flag provides.
50 */
51#ifndef DEBUG
52#define DEBUG
53#endif
54
55#include <sys/param.h>
56#include <sys/kernel.h>
57#include <sys/systm.h>
58#include <sys/bio.h>
59#include <sys/buf.h>
60#include <sys/kdb.h>
61#include <sys/kthread.h>
62#include <sys/lock.h>
63#include <sys/malloc.h>
64#include <sys/mount.h>
65#include <sys/mutex.h>
66#include <sys/namei.h>
67#include <sys/proc.h>
68#include <sys/stat.h>
69#include <sys/sysctl.h>
70#include <sys/syslog.h>
71#include <sys/vnode.h>
72#include <sys/conf.h>
73#include <ufs/ufs/dir.h>
74#include <ufs/ufs/extattr.h>
75#include <ufs/ufs/quota.h>
76#include <ufs/ufs/inode.h>
77#include <ufs/ufs/ufsmount.h>
78#include <ufs/ffs/fs.h>
79#include <ufs/ffs/softdep.h>
80#include <ufs/ffs/ffs_extern.h>
81#include <ufs/ufs/ufs_extern.h>
82
83#include <vm/vm.h>
84
85#include <ddb/ddb.h>
86
87#ifndef SOFTUPDATES
88
89int
90softdep_flushfiles(oldmnt, flags, td)
91	struct mount *oldmnt;
92	int flags;
93	struct thread *td;
94{
95
96	panic("softdep_flushfiles called");
97}
98
99int
100softdep_mount(devvp, mp, fs, cred)
101	struct vnode *devvp;
102	struct mount *mp;
103	struct fs *fs;
104	struct ucred *cred;
105{
106
107	return (0);
108}
109
110void
111softdep_initialize()
112{
113
114	return;
115}
116
117void
118softdep_uninitialize()
119{
120
121	return;
122}
123
124void
125softdep_unmount(mp)
126	struct mount *mp;
127{
128
129}
130
131void
132softdep_setup_sbupdate(ump, fs, bp)
133	struct ufsmount *ump;
134	struct fs *fs;
135	struct buf *bp;
136{
137}
138
139void
140softdep_setup_inomapdep(bp, ip, newinum)
141	struct buf *bp;
142	struct inode *ip;
143	ino_t newinum;
144{
145
146	panic("softdep_setup_inomapdep called");
147}
148
149void
150softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
151	struct buf *bp;
152	struct mount *mp;
153	ufs2_daddr_t newblkno;
154	int frags;
155	int oldfrags;
156{
157
158	panic("softdep_setup_blkmapdep called");
159}
160
161void
162softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
163	struct inode *ip;
164	ufs_lbn_t lbn;
165	ufs2_daddr_t newblkno;
166	ufs2_daddr_t oldblkno;
167	long newsize;
168	long oldsize;
169	struct buf *bp;
170{
171
172	panic("softdep_setup_allocdirect called");
173}
174
175void
176softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
177	struct inode *ip;
178	ufs_lbn_t lbn;
179	ufs2_daddr_t newblkno;
180	ufs2_daddr_t oldblkno;
181	long newsize;
182	long oldsize;
183	struct buf *bp;
184{
185
186	panic("softdep_setup_allocext called");
187}
188
189void
190softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
191	struct inode *ip;
192	ufs_lbn_t lbn;
193	struct buf *bp;
194	int ptrno;
195	ufs2_daddr_t newblkno;
196	ufs2_daddr_t oldblkno;
197	struct buf *nbp;
198{
199
200	panic("softdep_setup_allocindir_page called");
201}
202
203void
204softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
205	struct buf *nbp;
206	struct inode *ip;
207	struct buf *bp;
208	int ptrno;
209	ufs2_daddr_t newblkno;
210{
211
212	panic("softdep_setup_allocindir_meta called");
213}
214
215void
216softdep_setup_freeblocks(ip, length, flags)
217	struct inode *ip;
218	off_t length;
219	int flags;
220{
221
222	panic("softdep_setup_freeblocks called");
223}
224
225void
226softdep_freefile(pvp, ino, mode)
227		struct vnode *pvp;
228		ino_t ino;
229		int mode;
230{
231
232	panic("softdep_freefile called");
233}
234
235int
236softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
237	struct buf *bp;
238	struct inode *dp;
239	off_t diroffset;
240	ino_t newinum;
241	struct buf *newdirbp;
242	int isnewblk;
243{
244
245	panic("softdep_setup_directory_add called");
246}
247
248void
249softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
250	struct buf *bp;
251	struct inode *dp;
252	caddr_t base;
253	caddr_t oldloc;
254	caddr_t newloc;
255	int entrysize;
256{
257
258	panic("softdep_change_directoryentry_offset called");
259}
260
261void
262softdep_setup_remove(bp, dp, ip, isrmdir)
263	struct buf *bp;
264	struct inode *dp;
265	struct inode *ip;
266	int isrmdir;
267{
268
269	panic("softdep_setup_remove called");
270}
271
272void
273softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
274	struct buf *bp;
275	struct inode *dp;
276	struct inode *ip;
277	ino_t newinum;
278	int isrmdir;
279{
280
281	panic("softdep_setup_directory_change called");
282}
283
284void *
285softdep_setup_trunc(vp, length, flags)
286	struct vnode *vp;
287	off_t length;
288	int flags;
289{
290
291	panic("%s called", __FUNCTION__);
292
293	return (NULL);
294}
295
296int
297softdep_complete_trunc(vp, cookie)
298	struct vnode *vp;
299	void *cookie;
300{
301
302	panic("%s called", __FUNCTION__);
303
304	return (0);
305}
306
307void
308softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
309	struct mount *mp;
310	struct buf *bp;
311	ufs2_daddr_t blkno;
312	int frags;
313	struct workhead *wkhd;
314{
315
316	panic("%s called", __FUNCTION__);
317}
318
319void
320softdep_setup_inofree(mp, bp, ino, wkhd)
321	struct mount *mp;
322	struct buf *bp;
323	ino_t ino;
324	struct workhead *wkhd;
325{
326
327	panic("%s called", __FUNCTION__);
328}
329
330void
331softdep_setup_unlink(dp, ip)
332	struct inode *dp;
333	struct inode *ip;
334{
335
336	panic("%s called", __FUNCTION__);
337}
338
339void
340softdep_setup_link(dp, ip)
341	struct inode *dp;
342	struct inode *ip;
343{
344
345	panic("%s called", __FUNCTION__);
346}
347
348void
349softdep_revert_link(dp, ip)
350	struct inode *dp;
351	struct inode *ip;
352{
353
354	panic("%s called", __FUNCTION__);
355}
356
357void
358softdep_setup_rmdir(dp, ip)
359	struct inode *dp;
360	struct inode *ip;
361{
362
363	panic("%s called", __FUNCTION__);
364}
365
366void
367softdep_revert_rmdir(dp, ip)
368	struct inode *dp;
369	struct inode *ip;
370{
371
372	panic("%s called", __FUNCTION__);
373}
374
375void
376softdep_setup_create(dp, ip)
377	struct inode *dp;
378	struct inode *ip;
379{
380
381	panic("%s called", __FUNCTION__);
382}
383
384void
385softdep_revert_create(dp, ip)
386	struct inode *dp;
387	struct inode *ip;
388{
389
390	panic("%s called", __FUNCTION__);
391}
392
393void
394softdep_setup_mkdir(dp, ip)
395	struct inode *dp;
396	struct inode *ip;
397{
398
399	panic("%s called", __FUNCTION__);
400}
401
402void
403softdep_revert_mkdir(dp, ip)
404	struct inode *dp;
405	struct inode *ip;
406{
407
408	panic("%s called", __FUNCTION__);
409}
410
411void
412softdep_setup_dotdot_link(dp, ip)
413	struct inode *dp;
414	struct inode *ip;
415{
416
417	panic("%s called", __FUNCTION__);
418}
419
420int
421softdep_prealloc(vp, waitok)
422	struct vnode *vp;
423	int waitok;
424{
425
426	panic("%s called", __FUNCTION__);
427
428	return (0);
429}
430
431int
432softdep_journal_lookup(mp, vpp)
433	struct mount *mp;
434	struct vnode **vpp;
435{
436
437	return (ENOENT);
438}
439
440void
441softdep_change_linkcnt(ip)
442	struct inode *ip;
443{
444
445	panic("softdep_change_linkcnt called");
446}
447
448void
449softdep_load_inodeblock(ip)
450	struct inode *ip;
451{
452
453	panic("softdep_load_inodeblock called");
454}
455
456void
457softdep_update_inodeblock(ip, bp, waitfor)
458	struct inode *ip;
459	struct buf *bp;
460	int waitfor;
461{
462
463	panic("softdep_update_inodeblock called");
464}
465
466int
467softdep_fsync(vp)
468	struct vnode *vp;	/* the "in_core" copy of the inode */
469{
470
471	return (0);
472}
473
474void
475softdep_fsync_mountdev(vp)
476	struct vnode *vp;
477{
478
479	return;
480}
481
482int
483softdep_flushworklist(oldmnt, countp, td)
484	struct mount *oldmnt;
485	int *countp;
486	struct thread *td;
487{
488
489	*countp = 0;
490	return (0);
491}
492
493int
494softdep_sync_metadata(struct vnode *vp)
495{
496
497	return (0);
498}
499
500int
501softdep_slowdown(vp)
502	struct vnode *vp;
503{
504
505	panic("softdep_slowdown called");
506}
507
508void
509softdep_releasefile(ip)
510	struct inode *ip;	/* inode with the zero effective link count */
511{
512
513	panic("softdep_releasefile called");
514}
515
516int
517softdep_request_cleanup(fs, vp)
518	struct fs *fs;
519	struct vnode *vp;
520{
521
522	return (0);
523}
524
525int
526softdep_check_suspend(struct mount *mp,
527		      struct vnode *devvp,
528		      int softdep_deps,
529		      int softdep_accdeps,
530		      int secondary_writes,
531		      int secondary_accwrites)
532{
533	struct bufobj *bo;
534	int error;
535
536	(void) softdep_deps,
537	(void) softdep_accdeps;
538
539	bo = &devvp->v_bufobj;
540	ASSERT_BO_LOCKED(bo);
541
542	MNT_ILOCK(mp);
543	while (mp->mnt_secondary_writes != 0) {
544		BO_UNLOCK(bo);
545		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
546		    (PUSER - 1) | PDROP, "secwr", 0);
547		BO_LOCK(bo);
548		MNT_ILOCK(mp);
549	}
550
551	/*
552	 * Reasons for needing more work before suspend:
553	 * - Dirty buffers on devvp.
554	 * - Secondary writes occurred after start of vnode sync loop
555	 */
556	error = 0;
557	if (bo->bo_numoutput > 0 ||
558	    bo->bo_dirty.bv_cnt > 0 ||
559	    secondary_writes != 0 ||
560	    mp->mnt_secondary_writes != 0 ||
561	    secondary_accwrites != mp->mnt_secondary_accwrites)
562		error = EAGAIN;
563	BO_UNLOCK(bo);
564	return (error);
565}
566
567void
568softdep_get_depcounts(struct mount *mp,
569		      int *softdepactivep,
570		      int *softdepactiveaccp)
571{
572	(void) mp;
573	*softdepactivep = 0;
574	*softdepactiveaccp = 0;
575}
576
577#else
578
579FEATURE(softupdates, "FFS soft-updates support");
580
581/*
582 * These definitions need to be adapted to the system to which
583 * this file is being ported.
584 */
585
586#define M_SOFTDEP_FLAGS	(M_WAITOK)
587
588#define	D_PAGEDEP	0
589#define	D_INODEDEP	1
590#define	D_BMSAFEMAP	2
591#define	D_NEWBLK	3
592#define	D_ALLOCDIRECT	4
593#define	D_INDIRDEP	5
594#define	D_ALLOCINDIR	6
595#define	D_FREEFRAG	7
596#define	D_FREEBLKS	8
597#define	D_FREEFILE	9
598#define	D_DIRADD	10
599#define	D_MKDIR		11
600#define	D_DIRREM	12
601#define	D_NEWDIRBLK	13
602#define	D_FREEWORK	14
603#define	D_FREEDEP	15
604#define	D_JADDREF	16
605#define	D_JREMREF	17
606#define	D_JMVREF	18
607#define	D_JNEWBLK	19
608#define	D_JFREEBLK	20
609#define	D_JFREEFRAG	21
610#define	D_JSEG		22
611#define	D_JSEGDEP	23
612#define	D_SBDEP		24
613#define	D_JTRUNC	25
614#define	D_LAST		D_JTRUNC
615
616unsigned long dep_current[D_LAST + 1];
617unsigned long dep_total[D_LAST + 1];
618
619
620SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
621SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
622    "total dependencies allocated");
623SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
624    "current dependencies allocated");
625
626#define	SOFTDEP_TYPE(type, str, long)					\
627    static MALLOC_DEFINE(M_ ## type, #str, long);			\
628    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
629	&dep_total[D_ ## type], 0, "");					\
630    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
631	&dep_current[D_ ## type], 0, "");
632
633SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
634SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
635SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
636    "Block or frag allocated from cyl group map");
637SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
638SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
639SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
640SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
641SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
642SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
643SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
644SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
645SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
646SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
647SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
648SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
649SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
650SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
651SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
652SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
653SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
654SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
655SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
656SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
657SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
658SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
659SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
660
661static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
662static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
663
664/*
665 * translate from workitem type to memory type
666 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
667 */
668static struct malloc_type *memtype[] = {
669	M_PAGEDEP,
670	M_INODEDEP,
671	M_BMSAFEMAP,
672	M_NEWBLK,
673	M_ALLOCDIRECT,
674	M_INDIRDEP,
675	M_ALLOCINDIR,
676	M_FREEFRAG,
677	M_FREEBLKS,
678	M_FREEFILE,
679	M_DIRADD,
680	M_MKDIR,
681	M_DIRREM,
682	M_NEWDIRBLK,
683	M_FREEWORK,
684	M_FREEDEP,
685	M_JADDREF,
686	M_JREMREF,
687	M_JMVREF,
688	M_JNEWBLK,
689	M_JFREEBLK,
690	M_JFREEFRAG,
691	M_JSEG,
692	M_JSEGDEP,
693	M_SBDEP,
694	M_JTRUNC
695};
696
697static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
698
699#define DtoM(type) (memtype[type])
700
701/*
702 * Names of malloc types.
703 */
704#define TYPENAME(type)  \
705	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
706/*
707 * End system adaptation definitions.
708 */
709
710#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
711#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
712
713/*
714 * Forward declarations.
715 */
716struct inodedep_hashhead;
717struct newblk_hashhead;
718struct pagedep_hashhead;
719struct bmsafemap_hashhead;
720
721/*
722 * Internal function prototypes.
723 */
724static	void softdep_error(char *, int);
725static	void drain_output(struct vnode *);
726static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
727static	void clear_remove(struct thread *);
728static	void clear_inodedeps(struct thread *);
729static	void unlinked_inodedep(struct mount *, struct inodedep *);
730static	void clear_unlinked_inodedep(struct inodedep *);
731static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
732static	int flush_pagedep_deps(struct vnode *, struct mount *,
733	    struct diraddhd *);
734static	void free_pagedep(struct pagedep *);
735static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
736static	int flush_inodedep_deps(struct mount *, ino_t);
737static	int flush_deplist(struct allocdirectlst *, int, int *);
738static	int handle_written_filepage(struct pagedep *, struct buf *);
739static	int handle_written_sbdep(struct sbdep *, struct buf *);
740static	void initiate_write_sbdep(struct sbdep *);
741static  void diradd_inode_written(struct diradd *, struct inodedep *);
742static	int handle_written_indirdep(struct indirdep *, struct buf *,
743	    struct buf**);
744static	int handle_written_inodeblock(struct inodedep *, struct buf *);
745static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
746static	void handle_written_jaddref(struct jaddref *);
747static	void handle_written_jremref(struct jremref *);
748static	void handle_written_jseg(struct jseg *, struct buf *);
749static	void handle_written_jnewblk(struct jnewblk *);
750static	void handle_written_jfreeblk(struct jfreeblk *);
751static	void handle_written_jfreefrag(struct jfreefrag *);
752static	void complete_jseg(struct jseg *);
753static	void jseg_write(struct ufsmount *ump, struct jblocks *, struct jseg *,
754	    uint8_t *);
755static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
756static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
757static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
758static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
759static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
760static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
761static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
762static	inline void inoref_write(struct inoref *, struct jseg *,
763	    struct jrefrec *);
764static	void handle_allocdirect_partdone(struct allocdirect *,
765	    struct workhead *);
766static	void cancel_newblk(struct newblk *, struct workhead *);
767static	void indirdep_complete(struct indirdep *);
768static	void handle_allocindir_partdone(struct allocindir *);
769static	void initiate_write_filepage(struct pagedep *, struct buf *);
770static	void initiate_write_indirdep(struct indirdep*, struct buf *);
771static	void handle_written_mkdir(struct mkdir *, int);
772static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
773static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
774static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
775static	void handle_workitem_freefile(struct freefile *);
776static	void handle_workitem_remove(struct dirrem *, struct vnode *);
777static	struct dirrem *newdirrem(struct buf *, struct inode *,
778	    struct inode *, int, struct dirrem **);
779static	void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
780	    struct freeblks *);
781static	void free_indirdep(struct indirdep *);
782static	void free_diradd(struct diradd *, struct workhead *);
783static	void merge_diradd(struct inodedep *, struct diradd *);
784static	void complete_diradd(struct diradd *);
785static	struct diradd *diradd_lookup(struct pagedep *, int);
786static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
787	    struct jremref *);
788static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
789	    struct jremref *);
790static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
791	    struct jremref *, struct jremref *);
792static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
793	    struct jremref *);
794static	void cancel_allocindir(struct allocindir *, struct inodedep *,
795	    struct freeblks *);
796static	void complete_mkdir(struct mkdir *);
797static	void free_newdirblk(struct newdirblk *);
798static	void free_jremref(struct jremref *);
799static	void free_jaddref(struct jaddref *);
800static	void free_jsegdep(struct jsegdep *);
801static	void free_jseg(struct jseg *);
802static	void free_jnewblk(struct jnewblk *);
803static	void free_jfreeblk(struct jfreeblk *);
804static	void free_jfreefrag(struct jfreefrag *);
805static	void free_freedep(struct freedep *);
806static	void journal_jremref(struct dirrem *, struct jremref *,
807	    struct inodedep *);
808static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
809static	int cancel_jaddref(struct jaddref *, struct inodedep *,
810	    struct workhead *);
811static	void cancel_jfreefrag(struct jfreefrag *);
812static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
813static	int deallocate_dependencies(struct buf *, struct inodedep *,
814	    struct freeblks *);
815static	void free_newblk(struct newblk *);
816static	void cancel_allocdirect(struct allocdirectlst *,
817	    struct allocdirect *, struct freeblks *, int);
818static	int check_inode_unwritten(struct inodedep *);
819static	int free_inodedep(struct inodedep *);
820static	void freework_freeblock(struct freework *);
821static	void handle_workitem_freeblocks(struct freeblks *, int);
822static	void handle_complete_freeblocks(struct freeblks *);
823static	void handle_workitem_indirblk(struct freework *);
824static	void handle_written_freework(struct freework *);
825static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
826static	void setup_allocindir_phase2(struct buf *, struct inode *,
827	    struct inodedep *, struct allocindir *, ufs_lbn_t);
828static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
829	    ufs2_daddr_t, ufs_lbn_t);
830static	void handle_workitem_freefrag(struct freefrag *);
831static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
832	    ufs_lbn_t);
833static	void allocdirect_merge(struct allocdirectlst *,
834	    struct allocdirect *, struct allocdirect *);
835static	struct freefrag *allocindir_merge(struct allocindir *,
836	    struct allocindir *);
837static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
838	    struct bmsafemap **);
839static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
840	    int cg);
841static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
842	    int, struct newblk **);
843static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
844static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
845	    struct inodedep **);
846static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
847static	int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
848	    struct pagedep **);
849static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
850	    struct mount *mp, int, struct pagedep **);
851static	void pause_timer(void *);
852static	int request_cleanup(struct mount *, int);
853static	int process_worklist_item(struct mount *, int);
854static	void process_removes(struct vnode *);
855static	void jwork_move(struct workhead *, struct workhead *);
856static	void add_to_worklist(struct worklist *, int);
857static	void remove_from_worklist(struct worklist *);
858static	void softdep_flush(void);
859static	int softdep_speedup(void);
860static	void worklist_speedup(void);
861static	int journal_mount(struct mount *, struct fs *, struct ucred *);
862static	void journal_unmount(struct mount *);
863static	int journal_space(struct ufsmount *, int);
864static	void journal_suspend(struct ufsmount *);
865static	int journal_unsuspend(struct ufsmount *ump);
866static	void softdep_prelink(struct vnode *, struct vnode *);
867static	void add_to_journal(struct worklist *);
868static	void remove_from_journal(struct worklist *);
869static	void softdep_process_journal(struct mount *, int);
870static	struct jremref *newjremref(struct dirrem *, struct inode *,
871	    struct inode *ip, off_t, nlink_t);
872static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
873	    uint16_t);
874static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
875	    uint16_t);
876static inline struct jsegdep *inoref_jseg(struct inoref *);
877static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
878static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
879	    ufs2_daddr_t, int);
880static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
881	    ufs2_daddr_t, long, ufs_lbn_t);
882static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
883	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int);
884static	void jwait(struct worklist *wk);
885static	struct inodedep *inodedep_lookup_ip(struct inode *);
886static	int bmsafemap_rollbacks(struct bmsafemap *);
887static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
888static	void handle_jwork(struct workhead *);
889static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
890	    struct mkdir **);
891static	struct jblocks *jblocks_create(void);
892static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
893static	void jblocks_free(struct jblocks *, struct mount *, int);
894static	void jblocks_destroy(struct jblocks *);
895static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
896
897/*
898 * Exported softdep operations.
899 */
900static	void softdep_disk_io_initiation(struct buf *);
901static	void softdep_disk_write_complete(struct buf *);
902static	void softdep_deallocate_dependencies(struct buf *);
903static	int softdep_count_dependencies(struct buf *bp, int);
904
905static struct mtx lk;
906MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
907
908#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
909#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
910#define FREE_LOCK(lk)			mtx_unlock(lk)
911
912#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
913#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)
914
915/*
916 * Worklist queue management.
917 * These routines require that the lock be held.
918 */
919#ifndef /* NOT */ DEBUG
920#define WORKLIST_INSERT(head, item) do {	\
921	(item)->wk_state |= ONWORKLIST;		\
922	LIST_INSERT_HEAD(head, item, wk_list);	\
923} while (0)
924#define WORKLIST_REMOVE(item) do {		\
925	(item)->wk_state &= ~ONWORKLIST;	\
926	LIST_REMOVE(item, wk_list);		\
927} while (0)
928#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
929#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
930
931#else /* DEBUG */
932static	void worklist_insert(struct workhead *, struct worklist *, int);
933static	void worklist_remove(struct worklist *, int);
934
935#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
936#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
937#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
938#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
939
940static void
941worklist_insert(head, item, locked)
942	struct workhead *head;
943	struct worklist *item;
944	int locked;
945{
946
947	if (locked)
948		mtx_assert(&lk, MA_OWNED);
949	if (item->wk_state & ONWORKLIST)
950		panic("worklist_insert: %p %s(0x%X) already on list",
951		    item, TYPENAME(item->wk_type), item->wk_state);
952	item->wk_state |= ONWORKLIST;
953	LIST_INSERT_HEAD(head, item, wk_list);
954}
955
956static void
957worklist_remove(item, locked)
958	struct worklist *item;
959	int locked;
960{
961
962	if (locked)
963		mtx_assert(&lk, MA_OWNED);
964	if ((item->wk_state & ONWORKLIST) == 0)
965		panic("worklist_remove: %p %s(0x%X) not on list",
966		    item, TYPENAME(item->wk_type), item->wk_state);
967	item->wk_state &= ~ONWORKLIST;
968	LIST_REMOVE(item, wk_list);
969}
970#endif /* DEBUG */
971
972/*
973 * Merge two jsegdeps keeping only the oldest one as newer references
974 * can't be discarded until after older references.
975 */
976static inline struct jsegdep *
977jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
978{
979	struct jsegdep *swp;
980
981	if (two == NULL)
982		return (one);
983
984	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
985		swp = one;
986		one = two;
987		two = swp;
988	}
989	WORKLIST_REMOVE(&two->jd_list);
990	free_jsegdep(two);
991
992	return (one);
993}
994
995/*
996 * If two freedeps are compatible free one to reduce list size.
997 */
998static inline struct freedep *
999freedep_merge(struct freedep *one, struct freedep *two)
1000{
1001	if (two == NULL)
1002		return (one);
1003
1004	if (one->fd_freework == two->fd_freework) {
1005		WORKLIST_REMOVE(&two->fd_list);
1006		free_freedep(two);
1007	}
1008	return (one);
1009}
1010
1011/*
1012 * Move journal work from one list to another.  Duplicate freedeps and
1013 * jsegdeps are coalesced to keep the lists as small as possible.
1014 */
1015static void
1016jwork_move(dst, src)
1017	struct workhead *dst;
1018	struct workhead *src;
1019{
1020	struct freedep *freedep;
1021	struct jsegdep *jsegdep;
1022	struct worklist *wkn;
1023	struct worklist *wk;
1024
1025	KASSERT(dst != src,
1026	    ("jwork_move: dst == src"));
1027	freedep = NULL;
1028	jsegdep = NULL;
1029	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1030		if (wk->wk_type == D_JSEGDEP)
1031			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1032		if (wk->wk_type == D_FREEDEP)
1033			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1034	}
1035
1036	mtx_assert(&lk, MA_OWNED);
1037	while ((wk = LIST_FIRST(src)) != NULL) {
1038		WORKLIST_REMOVE(wk);
1039		WORKLIST_INSERT(dst, wk);
1040		if (wk->wk_type == D_JSEGDEP) {
1041			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1042			continue;
1043		}
1044		if (wk->wk_type == D_FREEDEP)
1045			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1046	}
1047}
1048
1049/*
1050 * Routines for tracking and managing workitems.
1051 */
1052static	void workitem_free(struct worklist *, int);
1053static	void workitem_alloc(struct worklist *, int, struct mount *);
1054
1055#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
1056
1057static void
1058workitem_free(item, type)
1059	struct worklist *item;
1060	int type;
1061{
1062	struct ufsmount *ump;
1063	mtx_assert(&lk, MA_OWNED);
1064
1065#ifdef DEBUG
1066	if (item->wk_state & ONWORKLIST)
1067		panic("workitem_free: %s(0x%X) still on list",
1068		    TYPENAME(item->wk_type), item->wk_state);
1069	if (item->wk_type != type)
1070		panic("workitem_free: type mismatch %s != %s",
1071		    TYPENAME(item->wk_type), TYPENAME(type));
1072#endif
1073	ump = VFSTOUFS(item->wk_mp);
1074	if (--ump->softdep_deps == 0 && ump->softdep_req)
1075		wakeup(&ump->softdep_deps);
1076	dep_current[type]--;
1077	free(item, DtoM(type));
1078}
1079
1080static void
1081workitem_alloc(item, type, mp)
1082	struct worklist *item;
1083	int type;
1084	struct mount *mp;
1085{
1086	item->wk_type = type;
1087	item->wk_mp = mp;
1088	item->wk_state = 0;
1089	ACQUIRE_LOCK(&lk);
1090	dep_current[type]++;
1091	dep_total[type]++;
1092	VFSTOUFS(mp)->softdep_deps++;
1093	VFSTOUFS(mp)->softdep_accdeps++;
1094	FREE_LOCK(&lk);
1095}
1096
1097/*
1098 * Workitem queue management
1099 */
1100static int max_softdeps;	/* maximum number of structs before slowdown */
1101static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1102static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1103static int proc_waiting;	/* tracks whether we have a timeout posted */
1104static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1105static struct callout softdep_callout;
1106static int req_pending;
1107static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1108static int req_clear_remove;	/* syncer process flush some freeblks */
1109static long num_freeblkdep;	/* number of freeblks workitems allocated */
1110
1111/*
1112 * runtime statistics
1113 */
1114static int stat_worklist_push;	/* number of worklist cleanups */
1115static int stat_blk_limit_push;	/* number of times block limit neared */
1116static int stat_ino_limit_push;	/* number of times inode limit neared */
1117static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1118static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1119static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1120static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1121static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1122static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1123static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1124static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1125static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1126static int stat_journal_min;	/* Times hit journal min threshold */
1127static int stat_journal_low;	/* Times hit journal low threshold */
1128static int stat_journal_wait;	/* Times blocked in jwait(). */
1129static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1130static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1131static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1132static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1133
1134SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1135    &max_softdeps, 0, "");
1136SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1137    &tickdelay, 0, "");
1138SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1139    &maxindirdeps, 0, "");
1140SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1141    &stat_worklist_push, 0,"");
1142SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1143    &stat_blk_limit_push, 0,"");
1144SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1145    &stat_ino_limit_push, 0,"");
1146SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1147    &stat_blk_limit_hit, 0, "");
1148SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1149    &stat_ino_limit_hit, 0, "");
1150SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1151    &stat_sync_limit_hit, 0, "");
1152SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1153    &stat_indir_blk_ptrs, 0, "");
1154SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1155    &stat_inode_bitmap, 0, "");
1156SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1157    &stat_direct_blk_ptrs, 0, "");
1158SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1159    &stat_dir_entry, 0, "");
1160SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1161    &stat_jaddref, 0, "");
1162SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1163    &stat_jnewblk, 0, "");
1164SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1165    &stat_journal_low, 0, "");
1166SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1167    &stat_journal_min, 0, "");
1168SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1169    &stat_journal_wait, 0, "");
1170SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1171    &stat_jwait_filepage, 0, "");
1172SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1173    &stat_jwait_freeblks, 0, "");
1174SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1175    &stat_jwait_inode, 0, "");
1176SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1177    &stat_jwait_newblk, 0, "");
1178
1179SYSCTL_DECL(_vfs_ffs);
1180
1181LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1182static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1183
1184static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1185SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1186	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1187
1188static struct proc *softdepproc;
1189static struct kproc_desc softdep_kp = {
1190	"softdepflush",
1191	softdep_flush,
1192	&softdepproc
1193};
1194SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1195    &softdep_kp);
1196
1197static void
1198softdep_flush(void)
1199{
1200	struct mount *nmp;
1201	struct mount *mp;
1202	struct ufsmount *ump;
1203	struct thread *td;
1204	int remaining;
1205	int progress;
1206	int vfslocked;
1207
1208	td = curthread;
1209	td->td_pflags |= TDP_NORUNNINGBUF;
1210
1211	for (;;) {
1212		kproc_suspend_check(softdepproc);
1213		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
1214		ACQUIRE_LOCK(&lk);
1215		/*
1216		 * If requested, try removing inode or removal dependencies.
1217		 */
1218		if (req_clear_inodedeps) {
1219			clear_inodedeps(td);
1220			req_clear_inodedeps -= 1;
1221			wakeup_one(&proc_waiting);
1222		}
1223		if (req_clear_remove) {
1224			clear_remove(td);
1225			req_clear_remove -= 1;
1226			wakeup_one(&proc_waiting);
1227		}
1228		FREE_LOCK(&lk);
1229		VFS_UNLOCK_GIANT(vfslocked);
1230		remaining = progress = 0;
1231		mtx_lock(&mountlist_mtx);
1232		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1233			nmp = TAILQ_NEXT(mp, mnt_list);
1234			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
1235				continue;
1236			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1237				continue;
1238			vfslocked = VFS_LOCK_GIANT(mp);
1239			progress += softdep_process_worklist(mp, 0);
1240			ump = VFSTOUFS(mp);
1241			remaining += ump->softdep_on_worklist -
1242				ump->softdep_on_worklist_inprogress;
1243			VFS_UNLOCK_GIANT(vfslocked);
1244			mtx_lock(&mountlist_mtx);
1245			nmp = TAILQ_NEXT(mp, mnt_list);
1246			vfs_unbusy(mp);
1247		}
1248		mtx_unlock(&mountlist_mtx);
1249		if (remaining && progress)
1250			continue;
1251		ACQUIRE_LOCK(&lk);
1252		if (!req_pending)
1253			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1254		req_pending = 0;
1255		FREE_LOCK(&lk);
1256	}
1257}
1258
1259static void
1260worklist_speedup(void)
1261{
1262	mtx_assert(&lk, MA_OWNED);
1263	if (req_pending == 0) {
1264		req_pending = 1;
1265		wakeup(&req_pending);
1266	}
1267}
1268
1269static int
1270softdep_speedup(void)
1271{
1272
1273	worklist_speedup();
1274	bd_speedup();
1275	return speedup_syncer();
1276}
1277
1278/*
1279 * Add an item to the end of the work queue.
1280 * This routine requires that the lock be held.
1281 * This is the only routine that adds items to the list.
1282 * The following routine is the only one that removes items
1283 * and does so in order from first to last.
1284 */
1285static void
1286add_to_worklist(wk, nodelay)
1287	struct worklist *wk;
1288	int nodelay;
1289{
1290	struct ufsmount *ump;
1291
1292	mtx_assert(&lk, MA_OWNED);
1293	ump = VFSTOUFS(wk->wk_mp);
1294	if (wk->wk_state & ONWORKLIST)
1295		panic("add_to_worklist: %s(0x%X) already on list",
1296		    TYPENAME(wk->wk_type), wk->wk_state);
1297	wk->wk_state |= ONWORKLIST;
1298	if (LIST_EMPTY(&ump->softdep_workitem_pending))
1299		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1300	else
1301		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1302	ump->softdep_worklist_tail = wk;
1303	ump->softdep_on_worklist += 1;
1304	if (nodelay)
1305		worklist_speedup();
1306}
1307
1308/*
1309 * Remove the item to be processed. If we are removing the last
1310 * item on the list, we need to recalculate the tail pointer.
1311 */
1312static void
1313remove_from_worklist(wk)
1314	struct worklist *wk;
1315{
1316	struct ufsmount *ump;
1317	struct worklist *wkend;
1318
1319	ump = VFSTOUFS(wk->wk_mp);
1320	WORKLIST_REMOVE(wk);
1321	if (wk == ump->softdep_worklist_tail) {
1322		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
1323			if (LIST_NEXT(wkend, wk_list) == NULL)
1324				break;
1325		ump->softdep_worklist_tail = wkend;
1326	}
1327	ump->softdep_on_worklist -= 1;
1328}
1329
1330/*
1331 * Process that runs once per second to handle items in the background queue.
1332 *
1333 * Note that we ensure that everything is done in the order in which they
1334 * appear in the queue. The code below depends on this property to ensure
1335 * that blocks of a file are freed before the inode itself is freed. This
1336 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1337 * until all the old ones have been purged from the dependency lists.
1338 */
1339int
1340softdep_process_worklist(mp, full)
1341	struct mount *mp;
1342	int full;
1343{
1344	struct thread *td = curthread;
1345	int cnt, matchcnt;
1346	struct ufsmount *ump;
1347	long starttime;
1348
1349	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1350	/*
1351	 * Record the process identifier of our caller so that we can give
1352	 * this process preferential treatment in request_cleanup below.
1353	 */
1354	matchcnt = 0;
1355	ump = VFSTOUFS(mp);
1356	ACQUIRE_LOCK(&lk);
1357	starttime = time_second;
1358	softdep_process_journal(mp, full?MNT_WAIT:0);
1359	while (ump->softdep_on_worklist > 0) {
1360		if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
1361			break;
1362		else
1363			matchcnt += cnt;
1364		/*
1365		 * If requested, try removing inode or removal dependencies.
1366		 */
1367		if (req_clear_inodedeps) {
1368			clear_inodedeps(td);
1369			req_clear_inodedeps -= 1;
1370			wakeup_one(&proc_waiting);
1371		}
1372		if (req_clear_remove) {
1373			clear_remove(td);
1374			req_clear_remove -= 1;
1375			wakeup_one(&proc_waiting);
1376		}
1377		/*
1378		 * We do not generally want to stop for buffer space, but if
1379		 * we are really being a buffer hog, we will stop and wait.
1380		 */
1381		if (should_yield()) {
1382			FREE_LOCK(&lk);
1383			kern_yield(-1);
1384			bwillwrite();
1385			ACQUIRE_LOCK(&lk);
1386		}
1387		/*
1388		 * Never allow processing to run for more than one
1389		 * second. Otherwise the other mountpoints may get
1390		 * excessively backlogged.
1391		 */
1392		if (!full && starttime != time_second)
1393			break;
1394	}
1395	if (full == 0)
1396		journal_unsuspend(ump);
1397	FREE_LOCK(&lk);
1398	return (matchcnt);
1399}
1400
1401/*
1402 * Process all removes associated with a vnode if we are running out of
1403 * journal space.  Any other process which attempts to flush these will
1404 * be unable as we have the vnodes locked.
1405 */
1406static void
1407process_removes(vp)
1408	struct vnode *vp;
1409{
1410	struct inodedep *inodedep;
1411	struct dirrem *dirrem;
1412	struct mount *mp;
1413	ino_t inum;
1414
1415	mtx_assert(&lk, MA_OWNED);
1416
1417	mp = vp->v_mount;
1418	inum = VTOI(vp)->i_number;
1419	for (;;) {
1420		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1421			return;
1422		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
1423			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1424			    (COMPLETE | ONWORKLIST))
1425				break;
1426		if (dirrem == NULL)
1427			return;
1428		/*
1429		 * If another thread is trying to lock this vnode it will
1430		 * fail but we must wait for it to do so before we can
1431		 * proceed.
1432		 */
1433		if (dirrem->dm_state & INPROGRESS) {
1434			dirrem->dm_state |= IOWAITING;
1435			msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
1436			continue;
1437		}
1438		remove_from_worklist(&dirrem->dm_list);
1439		FREE_LOCK(&lk);
1440		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1441			panic("process_removes: suspended filesystem");
1442		handle_workitem_remove(dirrem, vp);
1443		vn_finished_secondary_write(mp);
1444		ACQUIRE_LOCK(&lk);
1445	}
1446}
1447
1448/*
1449 * Process one item on the worklist.
1450 */
1451static int
1452process_worklist_item(mp, flags)
1453	struct mount *mp;
1454	int flags;
1455{
1456	struct worklist *wk;
1457	struct ufsmount *ump;
1458	struct vnode *vp;
1459	int matchcnt = 0;
1460
1461	mtx_assert(&lk, MA_OWNED);
1462	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1463	/*
1464	 * If we are being called because of a process doing a
1465	 * copy-on-write, then it is not safe to write as we may
1466	 * recurse into the copy-on-write routine.
1467	 */
1468	if (curthread->td_pflags & TDP_COWINPROGRESS)
1469		return (-1);
1470	/*
1471	 * Normally we just process each item on the worklist in order.
1472	 * However, if we are in a situation where we cannot lock any
1473	 * inodes, we have to skip over any dirrem requests whose
1474	 * vnodes are resident and locked.
1475	 */
1476	vp = NULL;
1477	ump = VFSTOUFS(mp);
1478	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
1479		if (wk->wk_state & INPROGRESS)
1480			continue;
1481		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
1482			break;
1483		wk->wk_state |= INPROGRESS;
1484		ump->softdep_on_worklist_inprogress++;
1485		FREE_LOCK(&lk);
1486		ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
1487		    LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
1488		ACQUIRE_LOCK(&lk);
1489		if (wk->wk_state & IOWAITING) {
1490			wk->wk_state &= ~IOWAITING;
1491			wakeup(wk);
1492		}
1493		wk->wk_state &= ~INPROGRESS;
1494		ump->softdep_on_worklist_inprogress--;
1495		if (vp != NULL)
1496			break;
1497	}
1498	if (wk == 0)
1499		return (-1);
1500	remove_from_worklist(wk);
1501	FREE_LOCK(&lk);
1502	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1503		panic("process_worklist_item: suspended filesystem");
1504	matchcnt++;
1505	switch (wk->wk_type) {
1506
1507	case D_DIRREM:
1508		/* removal of a directory entry */
1509		handle_workitem_remove(WK_DIRREM(wk), vp);
1510		if (vp)
1511			vput(vp);
1512		break;
1513
1514	case D_FREEBLKS:
1515		/* releasing blocks and/or fragments from a file */
1516		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
1517		break;
1518
1519	case D_FREEFRAG:
1520		/* releasing a fragment when replaced as a file grows */
1521		handle_workitem_freefrag(WK_FREEFRAG(wk));
1522		break;
1523
1524	case D_FREEFILE:
1525		/* releasing an inode when its link count drops to 0 */
1526		handle_workitem_freefile(WK_FREEFILE(wk));
1527		break;
1528
1529	case D_FREEWORK:
1530		/* Final block in an indirect was freed. */
1531		handle_workitem_indirblk(WK_FREEWORK(wk));
1532		break;
1533
1534	default:
1535		panic("%s_process_worklist: Unknown type %s",
1536		    "softdep", TYPENAME(wk->wk_type));
1537		/* NOTREACHED */
1538	}
1539	vn_finished_secondary_write(mp);
1540	ACQUIRE_LOCK(&lk);
1541	return (matchcnt);
1542}
1543
1544/*
1545 * Move dependencies from one buffer to another.
1546 */
1547int
1548softdep_move_dependencies(oldbp, newbp)
1549	struct buf *oldbp;
1550	struct buf *newbp;
1551{
1552	struct worklist *wk, *wktail;
1553	int dirty;
1554
1555	dirty = 0;
1556	wktail = NULL;
1557	ACQUIRE_LOCK(&lk);
1558	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1559		LIST_REMOVE(wk, wk_list);
1560		if (wk->wk_type == D_BMSAFEMAP &&
1561		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
1562			dirty = 1;
1563		if (wktail == 0)
1564			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1565		else
1566			LIST_INSERT_AFTER(wktail, wk, wk_list);
1567		wktail = wk;
1568	}
1569	FREE_LOCK(&lk);
1570
1571	return (dirty);
1572}
1573
1574/*
1575 * Purge the work list of all items associated with a particular mount point.
1576 */
1577int
1578softdep_flushworklist(oldmnt, countp, td)
1579	struct mount *oldmnt;
1580	int *countp;
1581	struct thread *td;
1582{
1583	struct vnode *devvp;
1584	int count, error = 0;
1585	struct ufsmount *ump;
1586
1587	/*
1588	 * Alternately flush the block device associated with the mount
1589	 * point and process any dependencies that the flushing
1590	 * creates. We continue until no more worklist dependencies
1591	 * are found.
1592	 */
1593	*countp = 0;
1594	ump = VFSTOUFS(oldmnt);
1595	devvp = ump->um_devvp;
1596	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1597		*countp += count;
1598		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1599		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1600		VOP_UNLOCK(devvp, 0);
1601		if (error)
1602			break;
1603	}
1604	return (error);
1605}
1606
1607int
1608softdep_waitidle(struct mount *mp)
1609{
1610	struct ufsmount *ump;
1611	int error;
1612	int i;
1613
1614	ump = VFSTOUFS(mp);
1615	ACQUIRE_LOCK(&lk);
1616	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1617		ump->softdep_req = 1;
1618		if (ump->softdep_on_worklist)
1619			panic("softdep_waitidle: work added after flush.");
1620		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1621	}
1622	ump->softdep_req = 0;
1623	FREE_LOCK(&lk);
1624	error = 0;
1625	if (i == 10) {
1626		error = EBUSY;
1627		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1628		    mp);
1629	}
1630
1631	return (error);
1632}
1633
1634/*
1635 * Flush all vnodes and worklist items associated with a specified mount point.
1636 */
1637int
1638softdep_flushfiles(oldmnt, flags, td)
1639	struct mount *oldmnt;
1640	int flags;
1641	struct thread *td;
1642{
1643	int error, depcount, loopcnt, retry_flush_count, retry;
1644
1645	loopcnt = 10;
1646	retry_flush_count = 3;
1647retry_flush:
1648	error = 0;
1649
1650	/*
1651	 * Alternately flush the vnodes associated with the mount
1652	 * point and process any dependencies that the flushing
1653	 * creates. In theory, this loop can happen at most twice,
1654	 * but we give it a few extra just to be sure.
1655	 */
1656	for (; loopcnt > 0; loopcnt--) {
1657		/*
1658		 * Do another flush in case any vnodes were brought in
1659		 * as part of the cleanup operations.
1660		 */
1661		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1662			break;
1663		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1664		    depcount == 0)
1665			break;
1666	}
1667	/*
1668	 * If we are unmounting then it is an error to fail. If we
1669	 * are simply trying to downgrade to read-only, then filesystem
1670	 * activity can keep us busy forever, so we just fail with EBUSY.
1671	 */
1672	if (loopcnt == 0) {
1673		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1674			panic("softdep_flushfiles: looping");
1675		error = EBUSY;
1676	}
1677	if (!error)
1678		error = softdep_waitidle(oldmnt);
1679	if (!error) {
1680		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1681			retry = 0;
1682			MNT_ILOCK(oldmnt);
1683			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1684			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1685			if (oldmnt->mnt_nvnodelistsize > 0) {
1686				if (--retry_flush_count > 0) {
1687					retry = 1;
1688					loopcnt = 3;
1689				} else
1690					error = EBUSY;
1691			}
1692			MNT_IUNLOCK(oldmnt);
1693			if (retry)
1694				goto retry_flush;
1695		}
1696	}
1697	return (error);
1698}
1699
1700/*
1701 * Structure hashing.
1702 *
1703 * There are three types of structures that can be looked up:
1704 *	1) pagedep structures identified by mount point, inode number,
1705 *	   and logical block.
1706 *	2) inodedep structures identified by mount point and inode number.
1707 *	3) newblk structures identified by mount point and
1708 *	   physical block number.
1709 *
1710 * The "pagedep" and "inodedep" dependency structures are hashed
1711 * separately from the file blocks and inodes to which they correspond.
1712 * This separation helps when the in-memory copy of an inode or
1713 * file block must be replaced. It also obviates the need to access
1714 * an inode or file page when simply updating (or de-allocating)
1715 * dependency structures. Lookup of newblk structures is needed to
1716 * find newly allocated blocks when trying to associate them with
1717 * their allocdirect or allocindir structure.
1718 *
1719 * The lookup routines optionally create and hash a new instance when
1720 * an existing entry is not found.
1721 */
1722#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
1723#define NODELAY		0x0002	/* cannot do background work */
1724
1725/*
1726 * Structures and routines associated with pagedep caching.
1727 */
1728LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1729u_long	pagedep_hash;		/* size of hash table - 1 */
1730#define	PAGEDEP_HASH(mp, inum, lbn) \
1731	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1732	    pagedep_hash])
1733
1734static int
1735pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1736	struct pagedep_hashhead *pagedephd;
1737	ino_t ino;
1738	ufs_lbn_t lbn;
1739	struct mount *mp;
1740	int flags;
1741	struct pagedep **pagedeppp;
1742{
1743	struct pagedep *pagedep;
1744
1745	LIST_FOREACH(pagedep, pagedephd, pd_hash)
1746		if (ino == pagedep->pd_ino &&
1747		    lbn == pagedep->pd_lbn &&
1748		    mp == pagedep->pd_list.wk_mp)
1749			break;
1750	if (pagedep) {
1751		*pagedeppp = pagedep;
1752		if ((flags & DEPALLOC) != 0 &&
1753		    (pagedep->pd_state & ONWORKLIST) == 0)
1754			return (0);
1755		return (1);
1756	}
1757	*pagedeppp = NULL;
1758	return (0);
1759}
1760/*
1761 * Look up a pagedep. Return 1 if found, 0 if not found or found
1762 * when asked to allocate but not associated with any buffer.
1763 * If not found, allocate if DEPALLOC flag is passed.
1764 * Found or allocated entry is returned in pagedeppp.
1765 * This routine must be called with splbio interrupts blocked.
1766 */
1767static int
1768pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
1769	struct mount *mp;
1770	ino_t ino;
1771	ufs_lbn_t lbn;
1772	int flags;
1773	struct pagedep **pagedeppp;
1774{
1775	struct pagedep *pagedep;
1776	struct pagedep_hashhead *pagedephd;
1777	int ret;
1778	int i;
1779
1780	mtx_assert(&lk, MA_OWNED);
1781	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
1782
1783	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1784	if (*pagedeppp || (flags & DEPALLOC) == 0)
1785		return (ret);
1786	FREE_LOCK(&lk);
1787	pagedep = malloc(sizeof(struct pagedep),
1788	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1789	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1790	ACQUIRE_LOCK(&lk);
1791	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1792	if (*pagedeppp) {
1793		WORKITEM_FREE(pagedep, D_PAGEDEP);
1794		return (ret);
1795	}
1796	pagedep->pd_ino = ino;
1797	pagedep->pd_lbn = lbn;
1798	LIST_INIT(&pagedep->pd_dirremhd);
1799	LIST_INIT(&pagedep->pd_pendinghd);
1800	for (i = 0; i < DAHASHSZ; i++)
1801		LIST_INIT(&pagedep->pd_diraddhd[i]);
1802	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1803	*pagedeppp = pagedep;
1804	return (0);
1805}
1806
1807/*
1808 * Structures and routines associated with inodedep caching.
1809 */
1810LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1811static u_long	inodedep_hash;	/* size of hash table - 1 */
1812static long	num_inodedep;	/* number of inodedep allocated */
1813#define	INODEDEP_HASH(fs, inum) \
1814      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1815
1816static int
1817inodedep_find(inodedephd, fs, inum, inodedeppp)
1818	struct inodedep_hashhead *inodedephd;
1819	struct fs *fs;
1820	ino_t inum;
1821	struct inodedep **inodedeppp;
1822{
1823	struct inodedep *inodedep;
1824
1825	LIST_FOREACH(inodedep, inodedephd, id_hash)
1826		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1827			break;
1828	if (inodedep) {
1829		*inodedeppp = inodedep;
1830		return (1);
1831	}
1832	*inodedeppp = NULL;
1833
1834	return (0);
1835}
1836/*
1837 * Look up an inodedep. Return 1 if found, 0 if not found.
1838 * If not found, allocate if DEPALLOC flag is passed.
1839 * Found or allocated entry is returned in inodedeppp.
1840 * This routine must be called with splbio interrupts blocked.
1841 */
1842static int
1843inodedep_lookup(mp, inum, flags, inodedeppp)
1844	struct mount *mp;
1845	ino_t inum;
1846	int flags;
1847	struct inodedep **inodedeppp;
1848{
1849	struct inodedep *inodedep;
1850	struct inodedep_hashhead *inodedephd;
1851	struct fs *fs;
1852
1853	mtx_assert(&lk, MA_OWNED);
1854	fs = VFSTOUFS(mp)->um_fs;
1855	inodedephd = INODEDEP_HASH(fs, inum);
1856
1857	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1858		return (1);
1859	if ((flags & DEPALLOC) == 0)
1860		return (0);
1861	/*
1862	 * If we are over our limit, try to improve the situation.
1863	 */
1864	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
1865		request_cleanup(mp, FLUSH_INODES);
1866	FREE_LOCK(&lk);
1867	inodedep = malloc(sizeof(struct inodedep),
1868		M_INODEDEP, M_SOFTDEP_FLAGS);
1869	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1870	ACQUIRE_LOCK(&lk);
1871	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1872		WORKITEM_FREE(inodedep, D_INODEDEP);
1873		return (1);
1874	}
1875	num_inodedep += 1;
1876	inodedep->id_fs = fs;
1877	inodedep->id_ino = inum;
1878	inodedep->id_state = ALLCOMPLETE;
1879	inodedep->id_nlinkdelta = 0;
1880	inodedep->id_savedino1 = NULL;
1881	inodedep->id_savedsize = -1;
1882	inodedep->id_savedextsize = -1;
1883	inodedep->id_savednlink = -1;
1884	inodedep->id_bmsafemap = NULL;
1885	inodedep->id_mkdiradd = NULL;
1886	LIST_INIT(&inodedep->id_dirremhd);
1887	LIST_INIT(&inodedep->id_pendinghd);
1888	LIST_INIT(&inodedep->id_inowait);
1889	LIST_INIT(&inodedep->id_bufwait);
1890	TAILQ_INIT(&inodedep->id_inoreflst);
1891	TAILQ_INIT(&inodedep->id_inoupdt);
1892	TAILQ_INIT(&inodedep->id_newinoupdt);
1893	TAILQ_INIT(&inodedep->id_extupdt);
1894	TAILQ_INIT(&inodedep->id_newextupdt);
1895	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1896	*inodedeppp = inodedep;
1897	return (0);
1898}
1899
1900/*
1901 * Structures and routines associated with newblk caching.
1902 */
1903LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1904u_long	newblk_hash;		/* size of hash table - 1 */
1905#define	NEWBLK_HASH(fs, inum) \
1906	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1907
1908static int
1909newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
1910	struct newblk_hashhead *newblkhd;
1911	struct mount *mp;
1912	ufs2_daddr_t newblkno;
1913	int flags;
1914	struct newblk **newblkpp;
1915{
1916	struct newblk *newblk;
1917
1918	LIST_FOREACH(newblk, newblkhd, nb_hash) {
1919		if (newblkno != newblk->nb_newblkno)
1920			continue;
1921		if (mp != newblk->nb_list.wk_mp)
1922			continue;
1923		/*
1924		 * If we're creating a new dependency don't match those that
1925		 * have already been converted to allocdirects.  This is for
1926		 * a frag extend.
1927		 */
1928		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
1929			continue;
1930		break;
1931	}
1932	if (newblk) {
1933		*newblkpp = newblk;
1934		return (1);
1935	}
1936	*newblkpp = NULL;
1937	return (0);
1938}
1939
1940/*
1941 * Look up a newblk. Return 1 if found, 0 if not found.
1942 * If not found, allocate if DEPALLOC flag is passed.
1943 * Found or allocated entry is returned in newblkpp.
1944 */
1945static int
1946newblk_lookup(mp, newblkno, flags, newblkpp)
1947	struct mount *mp;
1948	ufs2_daddr_t newblkno;
1949	int flags;
1950	struct newblk **newblkpp;
1951{
1952	struct newblk *newblk;
1953	struct newblk_hashhead *newblkhd;
1954
1955	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
1956	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
1957		return (1);
1958	if ((flags & DEPALLOC) == 0)
1959		return (0);
1960	FREE_LOCK(&lk);
1961	newblk = malloc(sizeof(union allblk), M_NEWBLK,
1962	    M_SOFTDEP_FLAGS | M_ZERO);
1963	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
1964	ACQUIRE_LOCK(&lk);
1965	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
1966		WORKITEM_FREE(newblk, D_NEWBLK);
1967		return (1);
1968	}
1969	newblk->nb_freefrag = NULL;
1970	LIST_INIT(&newblk->nb_indirdeps);
1971	LIST_INIT(&newblk->nb_newdirblk);
1972	LIST_INIT(&newblk->nb_jwork);
1973	newblk->nb_state = ATTACHED;
1974	newblk->nb_newblkno = newblkno;
1975	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1976	*newblkpp = newblk;
1977	return (0);
1978}
1979
1980/*
1981 * Executed during filesystem system initialization before
1982 * mounting any filesystems.
1983 */
1984void
1985softdep_initialize()
1986{
1987
1988	LIST_INIT(&mkdirlisthd);
1989	max_softdeps = desiredvnodes * 4;
1990	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
1991	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1992	newblk_hashtbl = hashinit(desiredvnodes / 5,  M_NEWBLK, &newblk_hash);
1993	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
1994
1995	/* initialise bioops hack */
1996	bioops.io_start = softdep_disk_io_initiation;
1997	bioops.io_complete = softdep_disk_write_complete;
1998	bioops.io_deallocate = softdep_deallocate_dependencies;
1999	bioops.io_countdeps = softdep_count_dependencies;
2000
2001	/* Initialize the callout with an mtx. */
2002	callout_init_mtx(&softdep_callout, &lk, 0);
2003}
2004
2005/*
2006 * Executed after all filesystems have been unmounted during
2007 * filesystem module unload.
2008 */
2009void
2010softdep_uninitialize()
2011{
2012
2013	callout_drain(&softdep_callout);
2014	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2015	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2016	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2017	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2018}
2019
2020/*
2021 * Called at mount time to notify the dependency code that a
2022 * filesystem wishes to use it.
2023 */
2024int
2025softdep_mount(devvp, mp, fs, cred)
2026	struct vnode *devvp;
2027	struct mount *mp;
2028	struct fs *fs;
2029	struct ucred *cred;
2030{
2031	struct csum_total cstotal;
2032	struct ufsmount *ump;
2033	struct cg *cgp;
2034	struct buf *bp;
2035	int error, cyl;
2036
2037	MNT_ILOCK(mp);
2038	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2039	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2040		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2041			MNTK_SOFTDEP;
2042		mp->mnt_noasync++;
2043	}
2044	MNT_IUNLOCK(mp);
2045	ump = VFSTOUFS(mp);
2046	LIST_INIT(&ump->softdep_workitem_pending);
2047	LIST_INIT(&ump->softdep_journal_pending);
2048	TAILQ_INIT(&ump->softdep_unlinked);
2049	ump->softdep_worklist_tail = NULL;
2050	ump->softdep_on_worklist = 0;
2051	ump->softdep_deps = 0;
2052	if ((fs->fs_flags & FS_SUJ) &&
2053	    (error = journal_mount(mp, fs, cred)) != 0) {
2054		printf("Failed to start journal: %d\n", error);
2055		return (error);
2056	}
2057	/*
2058	 * When doing soft updates, the counters in the
2059	 * superblock may have gotten out of sync. Recomputation
2060	 * can take a long time and can be deferred for background
2061	 * fsck.  However, the old behavior of scanning the cylinder
2062	 * groups and recalculating them at mount time is available
2063	 * by setting vfs.ffs.compute_summary_at_mount to one.
2064	 */
2065	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2066		return (0);
2067	bzero(&cstotal, sizeof cstotal);
2068	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2069		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2070		    fs->fs_cgsize, cred, &bp)) != 0) {
2071			brelse(bp);
2072			return (error);
2073		}
2074		cgp = (struct cg *)bp->b_data;
2075		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2076		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2077		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2078		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2079		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2080		brelse(bp);
2081	}
2082#ifdef DEBUG
2083	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2084		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2085#endif
2086	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2087	return (0);
2088}
2089
2090void
2091softdep_unmount(mp)
2092	struct mount *mp;
2093{
2094
2095	if (mp->mnt_kern_flag & MNTK_SUJ)
2096		journal_unmount(mp);
2097}
2098
2099struct jblocks {
2100	struct jseglst	jb_segs;	/* TAILQ of current segments. */
2101	struct jseg	*jb_writeseg;	/* Next write to complete. */
2102	struct jextent	*jb_extent;	/* Extent array. */
2103	uint64_t	jb_nextseq;	/* Next sequence number. */
2104	uint64_t	jb_oldestseq;	/* Oldest active sequence number. */
2105	int		jb_avail;	/* Available extents. */
2106	int		jb_used;	/* Last used extent. */
2107	int		jb_head;	/* Allocator head. */
2108	int		jb_off;		/* Allocator extent offset. */
2109	int		jb_blocks;	/* Total disk blocks covered. */
2110	int		jb_free;	/* Total disk blocks free. */
2111	int		jb_min;		/* Minimum free space. */
2112	int		jb_low;		/* Low on space. */
2113	int		jb_age;		/* Insertion time of oldest rec. */
2114	int		jb_suspended;	/* Did journal suspend writes? */
2115};
2116
2117struct jextent {
2118	ufs2_daddr_t	je_daddr;	/* Disk block address. */
2119	int		je_blocks;	/* Disk block count. */
2120};
2121
2122static struct jblocks *
2123jblocks_create(void)
2124{
2125	struct jblocks *jblocks;
2126
2127	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2128	TAILQ_INIT(&jblocks->jb_segs);
2129	jblocks->jb_avail = 10;
2130	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2131	    M_JBLOCKS, M_WAITOK | M_ZERO);
2132
2133	return (jblocks);
2134}
2135
2136static ufs2_daddr_t
2137jblocks_alloc(jblocks, bytes, actual)
2138	struct jblocks *jblocks;
2139	int bytes;
2140	int *actual;
2141{
2142	ufs2_daddr_t daddr;
2143	struct jextent *jext;
2144	int freecnt;
2145	int blocks;
2146
2147	blocks = bytes / DEV_BSIZE;
2148	jext = &jblocks->jb_extent[jblocks->jb_head];
2149	freecnt = jext->je_blocks - jblocks->jb_off;
2150	if (freecnt == 0) {
2151		jblocks->jb_off = 0;
2152		if (++jblocks->jb_head > jblocks->jb_used)
2153			jblocks->jb_head = 0;
2154		jext = &jblocks->jb_extent[jblocks->jb_head];
2155		freecnt = jext->je_blocks;
2156	}
2157	if (freecnt > blocks)
2158		freecnt = blocks;
2159	*actual = freecnt * DEV_BSIZE;
2160	daddr = jext->je_daddr + jblocks->jb_off;
2161	jblocks->jb_off += freecnt;
2162	jblocks->jb_free -= freecnt;
2163
2164	return (daddr);
2165}
2166
2167static void
2168jblocks_free(jblocks, mp, bytes)
2169	struct jblocks *jblocks;
2170	struct mount *mp;
2171	int bytes;
2172{
2173
2174	jblocks->jb_free += bytes / DEV_BSIZE;
2175	if (jblocks->jb_suspended)
2176		worklist_speedup();
2177	wakeup(jblocks);
2178}
2179
2180static void
2181jblocks_destroy(jblocks)
2182	struct jblocks *jblocks;
2183{
2184
2185	if (jblocks->jb_extent)
2186		free(jblocks->jb_extent, M_JBLOCKS);
2187	free(jblocks, M_JBLOCKS);
2188}
2189
2190static void
2191jblocks_add(jblocks, daddr, blocks)
2192	struct jblocks *jblocks;
2193	ufs2_daddr_t daddr;
2194	int blocks;
2195{
2196	struct jextent *jext;
2197
2198	jblocks->jb_blocks += blocks;
2199	jblocks->jb_free += blocks;
2200	jext = &jblocks->jb_extent[jblocks->jb_used];
2201	/* Adding the first block. */
2202	if (jext->je_daddr == 0) {
2203		jext->je_daddr = daddr;
2204		jext->je_blocks = blocks;
2205		return;
2206	}
2207	/* Extending the last extent. */
2208	if (jext->je_daddr + jext->je_blocks == daddr) {
2209		jext->je_blocks += blocks;
2210		return;
2211	}
2212	/* Adding a new extent. */
2213	if (++jblocks->jb_used == jblocks->jb_avail) {
2214		jblocks->jb_avail *= 2;
2215		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2216		    M_JBLOCKS, M_WAITOK | M_ZERO);
2217		memcpy(jext, jblocks->jb_extent,
2218		    sizeof(struct jextent) * jblocks->jb_used);
2219		free(jblocks->jb_extent, M_JBLOCKS);
2220		jblocks->jb_extent = jext;
2221	}
2222	jext = &jblocks->jb_extent[jblocks->jb_used];
2223	jext->je_daddr = daddr;
2224	jext->je_blocks = blocks;
2225	return;
2226}
2227
2228int
2229softdep_journal_lookup(mp, vpp)
2230	struct mount *mp;
2231	struct vnode **vpp;
2232{
2233	struct componentname cnp;
2234	struct vnode *dvp;
2235	ino_t sujournal;
2236	int error;
2237
2238	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2239	if (error)
2240		return (error);
2241	bzero(&cnp, sizeof(cnp));
2242	cnp.cn_nameiop = LOOKUP;
2243	cnp.cn_flags = ISLASTCN;
2244	cnp.cn_thread = curthread;
2245	cnp.cn_cred = curthread->td_ucred;
2246	cnp.cn_pnbuf = SUJ_FILE;
2247	cnp.cn_nameptr = SUJ_FILE;
2248	cnp.cn_namelen = strlen(SUJ_FILE);
2249	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2250	vput(dvp);
2251	if (error != 0)
2252		return (error);
2253	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2254	return (error);
2255}
2256
2257/*
2258 * Open and verify the journal file.
2259 */
2260static int
2261journal_mount(mp, fs, cred)
2262	struct mount *mp;
2263	struct fs *fs;
2264	struct ucred *cred;
2265{
2266	struct jblocks *jblocks;
2267	struct vnode *vp;
2268	struct inode *ip;
2269	ufs2_daddr_t blkno;
2270	int bcount;
2271	int error;
2272	int i;
2273
2274	error = softdep_journal_lookup(mp, &vp);
2275	if (error != 0) {
2276		printf("Failed to find journal.  Use tunefs to create one\n");
2277		return (error);
2278	}
2279	ip = VTOI(vp);
2280	if (ip->i_size < SUJ_MIN) {
2281		error = ENOSPC;
2282		goto out;
2283	}
2284	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2285	jblocks = jblocks_create();
2286	for (i = 0; i < bcount; i++) {
2287		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2288		if (error)
2289			break;
2290		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2291	}
2292	if (error) {
2293		jblocks_destroy(jblocks);
2294		goto out;
2295	}
2296	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2297	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2298	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2299out:
2300	if (error == 0) {
2301		MNT_ILOCK(mp);
2302		mp->mnt_kern_flag |= MNTK_SUJ;
2303		MNT_IUNLOCK(mp);
2304		/*
2305		 * Only validate the journal contents if the
2306		 * filesystem is clean, otherwise we write the logs
2307		 * but they'll never be used.  If the filesystem was
2308		 * still dirty when we mounted it the journal is
2309		 * invalid and a new journal can only be valid if it
2310		 * starts from a clean mount.
2311		 */
2312		if (fs->fs_clean) {
2313			DIP_SET(ip, i_modrev, fs->fs_mtime);
2314			ip->i_flags |= IN_MODIFIED;
2315			ffs_update(vp, 1);
2316		}
2317	}
2318	vput(vp);
2319	return (error);
2320}
2321
2322static void
2323journal_unmount(mp)
2324	struct mount *mp;
2325{
2326	struct ufsmount *ump;
2327
2328	ump = VFSTOUFS(mp);
2329	if (ump->softdep_jblocks)
2330		jblocks_destroy(ump->softdep_jblocks);
2331	ump->softdep_jblocks = NULL;
2332}
2333
2334/*
2335 * Called when a journal record is ready to be written.  Space is allocated
2336 * and the journal entry is created when the journal is flushed to stable
2337 * store.
2338 */
2339static void
2340add_to_journal(wk)
2341	struct worklist *wk;
2342{
2343	struct ufsmount *ump;
2344
2345	mtx_assert(&lk, MA_OWNED);
2346	ump = VFSTOUFS(wk->wk_mp);
2347	if (wk->wk_state & ONWORKLIST)
2348		panic("add_to_journal: %s(0x%X) already on list",
2349		    TYPENAME(wk->wk_type), wk->wk_state);
2350	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2351	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2352		ump->softdep_jblocks->jb_age = ticks;
2353		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2354	} else
2355		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2356	ump->softdep_journal_tail = wk;
2357	ump->softdep_on_journal += 1;
2358}
2359
2360/*
2361 * Remove an arbitrary item for the journal worklist maintain the tail
2362 * pointer.  This happens when a new operation obviates the need to
2363 * journal an old operation.
2364 */
2365static void
2366remove_from_journal(wk)
2367	struct worklist *wk;
2368{
2369	struct ufsmount *ump;
2370
2371	mtx_assert(&lk, MA_OWNED);
2372	ump = VFSTOUFS(wk->wk_mp);
2373#ifdef SUJ_DEBUG
2374	{
2375		struct worklist *wkn;
2376
2377		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2378			if (wkn == wk)
2379				break;
2380		if (wkn == NULL)
2381			panic("remove_from_journal: %p is not in journal", wk);
2382	}
2383#endif
2384	/*
2385	 * We emulate a TAILQ to save space in most structures which do not
2386	 * require TAILQ semantics.  Here we must update the tail position
2387	 * when removing the tail which is not the final entry. This works
2388	 * only if the worklist linkage are at the beginning of the structure.
2389	 */
2390	if (ump->softdep_journal_tail == wk)
2391		ump->softdep_journal_tail =
2392		    (struct worklist *)wk->wk_list.le_prev;
2393
2394	WORKLIST_REMOVE(wk);
2395	ump->softdep_on_journal -= 1;
2396}
2397
2398/*
2399 * Check for journal space as well as dependency limits so the prelink
2400 * code can throttle both journaled and non-journaled filesystems.
2401 * Threshold is 0 for low and 1 for min.
2402 */
2403static int
2404journal_space(ump, thresh)
2405	struct ufsmount *ump;
2406	int thresh;
2407{
2408	struct jblocks *jblocks;
2409	int avail;
2410
2411	jblocks = ump->softdep_jblocks;
2412	if (jblocks == NULL)
2413		return (1);
2414	/*
2415	 * We use a tighter restriction here to prevent request_cleanup()
2416	 * running in threads from running into locks we currently hold.
2417	 */
2418	if (num_inodedep > (max_softdeps / 10) * 9)
2419		return (0);
2420	if (thresh)
2421		thresh = jblocks->jb_min;
2422	else
2423		thresh = jblocks->jb_low;
2424	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2425	avail = jblocks->jb_free - avail;
2426
2427	return (avail > thresh);
2428}
2429
2430static void
2431journal_suspend(ump)
2432	struct ufsmount *ump;
2433{
2434	struct jblocks *jblocks;
2435	struct mount *mp;
2436
2437	mp = UFSTOVFS(ump);
2438	jblocks = ump->softdep_jblocks;
2439	MNT_ILOCK(mp);
2440	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2441		stat_journal_min++;
2442		mp->mnt_kern_flag |= MNTK_SUSPEND;
2443		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2444	}
2445	jblocks->jb_suspended = 1;
2446	MNT_IUNLOCK(mp);
2447}
2448
2449static int
2450journal_unsuspend(struct ufsmount *ump)
2451{
2452	struct jblocks *jblocks;
2453	struct mount *mp;
2454
2455	mp = UFSTOVFS(ump);
2456	jblocks = ump->softdep_jblocks;
2457
2458	if (jblocks != NULL && jblocks->jb_suspended &&
2459	    journal_space(ump, jblocks->jb_min)) {
2460		jblocks->jb_suspended = 0;
2461		FREE_LOCK(&lk);
2462		mp->mnt_susp_owner = curthread;
2463		vfs_write_resume(mp);
2464		ACQUIRE_LOCK(&lk);
2465		return (1);
2466	}
2467	return (0);
2468}
2469
2470/*
2471 * Called before any allocation function to be certain that there is
2472 * sufficient space in the journal prior to creating any new records.
2473 * Since in the case of block allocation we may have multiple locked
2474 * buffers at the time of the actual allocation we can not block
2475 * when the journal records are created.  Doing so would create a deadlock
2476 * if any of these buffers needed to be flushed to reclaim space.  Instead
2477 * we require a sufficiently large amount of available space such that
2478 * each thread in the system could have passed this allocation check and
2479 * still have sufficient free space.  With 20% of a minimum journal size
2480 * of 1MB we have 6553 records available.
2481 */
2482int
2483softdep_prealloc(vp, waitok)
2484	struct vnode *vp;
2485	int waitok;
2486{
2487	struct ufsmount *ump;
2488
2489	if (DOINGSUJ(vp) == 0)
2490		return (0);
2491	ump = VFSTOUFS(vp->v_mount);
2492	ACQUIRE_LOCK(&lk);
2493	if (journal_space(ump, 0)) {
2494		FREE_LOCK(&lk);
2495		return (0);
2496	}
2497	stat_journal_low++;
2498	FREE_LOCK(&lk);
2499	if (waitok == MNT_NOWAIT)
2500		return (ENOSPC);
2501	/*
2502	 * Attempt to sync this vnode once to flush any journal
2503	 * work attached to it.
2504	 */
2505	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2506		ffs_syncvnode(vp, waitok);
2507	ACQUIRE_LOCK(&lk);
2508	process_removes(vp);
2509	if (journal_space(ump, 0) == 0) {
2510		softdep_speedup();
2511		if (journal_space(ump, 1) == 0)
2512			journal_suspend(ump);
2513	}
2514	FREE_LOCK(&lk);
2515
2516	return (0);
2517}
2518
2519/*
2520 * Before adjusting a link count on a vnode verify that we have sufficient
2521 * journal space.  If not, process operations that depend on the currently
2522 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2523 * and softdep flush threads can not acquire these locks to reclaim space.
2524 */
2525static void
2526softdep_prelink(dvp, vp)
2527	struct vnode *dvp;
2528	struct vnode *vp;
2529{
2530	struct ufsmount *ump;
2531
2532	ump = VFSTOUFS(dvp->v_mount);
2533	mtx_assert(&lk, MA_OWNED);
2534	if (journal_space(ump, 0))
2535		return;
2536	stat_journal_low++;
2537	FREE_LOCK(&lk);
2538	if (vp)
2539		ffs_syncvnode(vp, MNT_NOWAIT);
2540	ffs_syncvnode(dvp, MNT_WAIT);
2541	ACQUIRE_LOCK(&lk);
2542	/* Process vp before dvp as it may create .. removes. */
2543	if (vp)
2544		process_removes(vp);
2545	process_removes(dvp);
2546	softdep_speedup();
2547	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2548	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2549	if (journal_space(ump, 0) == 0) {
2550		softdep_speedup();
2551		if (journal_space(ump, 1) == 0)
2552			journal_suspend(ump);
2553	}
2554}
2555
2556static void
2557jseg_write(ump, jblocks, jseg, data)
2558	struct ufsmount *ump;
2559	struct jblocks *jblocks;
2560	struct jseg *jseg;
2561	uint8_t *data;
2562{
2563	struct jsegrec *rec;
2564
2565	rec = (struct jsegrec *)data;
2566	rec->jsr_seq = jseg->js_seq;
2567	rec->jsr_oldest = jblocks->jb_oldestseq;
2568	rec->jsr_cnt = jseg->js_cnt;
2569	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2570	rec->jsr_crc = 0;
2571	rec->jsr_time = ump->um_fs->fs_mtime;
2572}
2573
2574static inline void
2575inoref_write(inoref, jseg, rec)
2576	struct inoref *inoref;
2577	struct jseg *jseg;
2578	struct jrefrec *rec;
2579{
2580
2581	inoref->if_jsegdep->jd_seg = jseg;
2582	rec->jr_ino = inoref->if_ino;
2583	rec->jr_parent = inoref->if_parent;
2584	rec->jr_nlink = inoref->if_nlink;
2585	rec->jr_mode = inoref->if_mode;
2586	rec->jr_diroff = inoref->if_diroff;
2587}
2588
2589static void
2590jaddref_write(jaddref, jseg, data)
2591	struct jaddref *jaddref;
2592	struct jseg *jseg;
2593	uint8_t *data;
2594{
2595	struct jrefrec *rec;
2596
2597	rec = (struct jrefrec *)data;
2598	rec->jr_op = JOP_ADDREF;
2599	inoref_write(&jaddref->ja_ref, jseg, rec);
2600}
2601
2602static void
2603jremref_write(jremref, jseg, data)
2604	struct jremref *jremref;
2605	struct jseg *jseg;
2606	uint8_t *data;
2607{
2608	struct jrefrec *rec;
2609
2610	rec = (struct jrefrec *)data;
2611	rec->jr_op = JOP_REMREF;
2612	inoref_write(&jremref->jr_ref, jseg, rec);
2613}
2614
2615static void
2616jmvref_write(jmvref, jseg, data)
2617	struct jmvref *jmvref;
2618	struct jseg *jseg;
2619	uint8_t *data;
2620{
2621	struct jmvrec *rec;
2622
2623	rec = (struct jmvrec *)data;
2624	rec->jm_op = JOP_MVREF;
2625	rec->jm_ino = jmvref->jm_ino;
2626	rec->jm_parent = jmvref->jm_parent;
2627	rec->jm_oldoff = jmvref->jm_oldoff;
2628	rec->jm_newoff = jmvref->jm_newoff;
2629}
2630
2631static void
2632jnewblk_write(jnewblk, jseg, data)
2633	struct jnewblk *jnewblk;
2634	struct jseg *jseg;
2635	uint8_t *data;
2636{
2637	struct jblkrec *rec;
2638
2639	jnewblk->jn_jsegdep->jd_seg = jseg;
2640	rec = (struct jblkrec *)data;
2641	rec->jb_op = JOP_NEWBLK;
2642	rec->jb_ino = jnewblk->jn_ino;
2643	rec->jb_blkno = jnewblk->jn_blkno;
2644	rec->jb_lbn = jnewblk->jn_lbn;
2645	rec->jb_frags = jnewblk->jn_frags;
2646	rec->jb_oldfrags = jnewblk->jn_oldfrags;
2647}
2648
2649static void
2650jfreeblk_write(jfreeblk, jseg, data)
2651	struct jfreeblk *jfreeblk;
2652	struct jseg *jseg;
2653	uint8_t *data;
2654{
2655	struct jblkrec *rec;
2656
2657	jfreeblk->jf_jsegdep->jd_seg = jseg;
2658	rec = (struct jblkrec *)data;
2659	rec->jb_op = JOP_FREEBLK;
2660	rec->jb_ino = jfreeblk->jf_ino;
2661	rec->jb_blkno = jfreeblk->jf_blkno;
2662	rec->jb_lbn = jfreeblk->jf_lbn;
2663	rec->jb_frags = jfreeblk->jf_frags;
2664	rec->jb_oldfrags = 0;
2665}
2666
2667static void
2668jfreefrag_write(jfreefrag, jseg, data)
2669	struct jfreefrag *jfreefrag;
2670	struct jseg *jseg;
2671	uint8_t *data;
2672{
2673	struct jblkrec *rec;
2674
2675	jfreefrag->fr_jsegdep->jd_seg = jseg;
2676	rec = (struct jblkrec *)data;
2677	rec->jb_op = JOP_FREEBLK;
2678	rec->jb_ino = jfreefrag->fr_ino;
2679	rec->jb_blkno = jfreefrag->fr_blkno;
2680	rec->jb_lbn = jfreefrag->fr_lbn;
2681	rec->jb_frags = jfreefrag->fr_frags;
2682	rec->jb_oldfrags = 0;
2683}
2684
2685static void
2686jtrunc_write(jtrunc, jseg, data)
2687	struct jtrunc *jtrunc;
2688	struct jseg *jseg;
2689	uint8_t *data;
2690{
2691	struct jtrncrec *rec;
2692
2693	rec = (struct jtrncrec *)data;
2694	rec->jt_op = JOP_TRUNC;
2695	rec->jt_ino = jtrunc->jt_ino;
2696	rec->jt_size = jtrunc->jt_size;
2697	rec->jt_extsize = jtrunc->jt_extsize;
2698}
2699
2700/*
2701 * Flush some journal records to disk.
2702 */
2703static void
2704softdep_process_journal(mp, flags)
2705	struct mount *mp;
2706	int flags;
2707{
2708	struct jblocks *jblocks;
2709	struct ufsmount *ump;
2710	struct worklist *wk;
2711	struct jseg *jseg;
2712	struct buf *bp;
2713	uint8_t *data;
2714	struct fs *fs;
2715	int segwritten;
2716	int jrecmin;	/* Minimum records per block. */
2717	int jrecmax;	/* Maximum records per block. */
2718	int size;
2719	int cnt;
2720	int off;
2721	int devbsize;
2722
2723	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
2724		return;
2725	ump = VFSTOUFS(mp);
2726	fs = ump->um_fs;
2727	jblocks = ump->softdep_jblocks;
2728	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
2729	/*
2730	 * We write anywhere between a disk block and fs block.  The upper
2731	 * bound is picked to prevent buffer cache fragmentation and limit
2732	 * processing time per I/O.
2733	 */
2734	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
2735	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
2736	segwritten = 0;
2737	while ((cnt = ump->softdep_on_journal) != 0) {
2738		/*
2739		 * Create a new segment to hold as many as 'cnt' journal
2740		 * entries and add them to the segment.  Notice cnt is
2741		 * off by one to account for the space required by the
2742		 * jsegrec.  If we don't have a full block to log skip it
2743		 * unless we haven't written anything.
2744		 */
2745		cnt++;
2746		if (cnt < jrecmax && segwritten)
2747			break;
2748		/*
2749		 * Verify some free journal space.  softdep_prealloc() should
2750	 	 * guarantee that we don't run out so this is indicative of
2751		 * a problem with the flow control.  Try to recover
2752		 * gracefully in any event.
2753		 */
2754		while (jblocks->jb_free == 0) {
2755			if (flags != MNT_WAIT)
2756				break;
2757			printf("softdep: Out of journal space!\n");
2758			softdep_speedup();
2759			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
2760		}
2761		FREE_LOCK(&lk);
2762		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
2763		workitem_alloc(&jseg->js_list, D_JSEG, mp);
2764		LIST_INIT(&jseg->js_entries);
2765		jseg->js_state = ATTACHED;
2766		jseg->js_jblocks = jblocks;
2767		bp = geteblk(fs->fs_bsize, 0);
2768		ACQUIRE_LOCK(&lk);
2769		/*
2770		 * If there was a race while we were allocating the block
2771		 * and jseg the entry we care about was likely written.
2772		 * We bail out in both the WAIT and NOWAIT case and assume
2773		 * the caller will loop if the entry it cares about is
2774		 * not written.
2775		 */
2776		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
2777			bp->b_flags |= B_INVAL | B_NOCACHE;
2778			WORKITEM_FREE(jseg, D_JSEG);
2779			FREE_LOCK(&lk);
2780			brelse(bp);
2781			ACQUIRE_LOCK(&lk);
2782			break;
2783		}
2784		/*
2785		 * Calculate the disk block size required for the available
2786		 * records rounded to the min size.
2787		 */
2788		cnt = ump->softdep_on_journal;
2789		if (cnt < jrecmax)
2790			size = howmany(cnt, jrecmin) * devbsize;
2791		else
2792			size = fs->fs_bsize;
2793		/*
2794		 * Allocate a disk block for this journal data and account
2795		 * for truncation of the requested size if enough contiguous
2796		 * space was not available.
2797		 */
2798		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
2799		bp->b_lblkno = bp->b_blkno;
2800		bp->b_offset = bp->b_blkno * DEV_BSIZE;
2801		bp->b_bcount = size;
2802		bp->b_bufobj = &ump->um_devvp->v_bufobj;
2803		bp->b_flags &= ~B_INVAL;
2804		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
2805		/*
2806		 * Initialize our jseg with cnt records.  Assign the next
2807		 * sequence number to it and link it in-order.
2808		 */
2809		cnt = MIN(ump->softdep_on_journal,
2810		    (size / devbsize) * jrecmin);
2811		jseg->js_buf = bp;
2812		jseg->js_cnt = cnt;
2813		jseg->js_refs = cnt + 1;	/* Self ref. */
2814		jseg->js_size = size;
2815		jseg->js_seq = jblocks->jb_nextseq++;
2816		if (TAILQ_EMPTY(&jblocks->jb_segs))
2817			jblocks->jb_oldestseq = jseg->js_seq;
2818		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
2819		if (jblocks->jb_writeseg == NULL)
2820			jblocks->jb_writeseg = jseg;
2821		/*
2822		 * Start filling in records from the pending list.
2823		 */
2824		data = bp->b_data;
2825		off = 0;
2826		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
2827		    != NULL) {
2828			/* Place a segment header on every device block. */
2829			if ((off % devbsize) == 0) {
2830				jseg_write(ump, jblocks, jseg, data);
2831				off += JREC_SIZE;
2832				data = bp->b_data + off;
2833			}
2834			remove_from_journal(wk);
2835			wk->wk_state |= IOSTARTED;
2836			WORKLIST_INSERT(&jseg->js_entries, wk);
2837			switch (wk->wk_type) {
2838			case D_JADDREF:
2839				jaddref_write(WK_JADDREF(wk), jseg, data);
2840				break;
2841			case D_JREMREF:
2842				jremref_write(WK_JREMREF(wk), jseg, data);
2843				break;
2844			case D_JMVREF:
2845				jmvref_write(WK_JMVREF(wk), jseg, data);
2846				break;
2847			case D_JNEWBLK:
2848				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
2849				break;
2850			case D_JFREEBLK:
2851				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
2852				break;
2853			case D_JFREEFRAG:
2854				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
2855				break;
2856			case D_JTRUNC:
2857				jtrunc_write(WK_JTRUNC(wk), jseg, data);
2858				break;
2859			default:
2860				panic("process_journal: Unknown type %s",
2861				    TYPENAME(wk->wk_type));
2862				/* NOTREACHED */
2863			}
2864			if (--cnt == 0)
2865				break;
2866			off += JREC_SIZE;
2867			data = bp->b_data + off;
2868		}
2869		/*
2870		 * Write this one buffer and continue.
2871		 */
2872		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
2873		FREE_LOCK(&lk);
2874		BO_LOCK(bp->b_bufobj);
2875		bgetvp(ump->um_devvp, bp);
2876		BO_UNLOCK(bp->b_bufobj);
2877		if (flags == MNT_NOWAIT)
2878			bawrite(bp);
2879		else
2880			bwrite(bp);
2881		ACQUIRE_LOCK(&lk);
2882	}
2883	/*
2884	 * If we've suspended the filesystem because we ran out of journal
2885	 * space either try to sync it here to make some progress or
2886	 * unsuspend it if we already have.
2887	 */
2888	if (flags == 0 && jblocks->jb_suspended) {
2889		if (journal_unsuspend(ump))
2890			return;
2891		FREE_LOCK(&lk);
2892		VFS_SYNC(mp, MNT_NOWAIT);
2893		ffs_sbupdate(ump, MNT_WAIT, 0);
2894		ACQUIRE_LOCK(&lk);
2895	}
2896}
2897
2898/*
2899 * Complete a jseg, allowing all dependencies awaiting journal writes
2900 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
2901 * structures so that the journal segment can be freed to reclaim space.
2902 */
2903static void
2904complete_jseg(jseg)
2905	struct jseg *jseg;
2906{
2907	struct worklist *wk;
2908	struct jmvref *jmvref;
2909	int waiting;
2910#ifdef INVARIANTS
2911	int i = 0;
2912#endif
2913
2914	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
2915		WORKLIST_REMOVE(wk);
2916		waiting = wk->wk_state & IOWAITING;
2917		wk->wk_state &= ~(IOSTARTED | IOWAITING);
2918		wk->wk_state |= COMPLETE;
2919		KASSERT(i++ < jseg->js_cnt,
2920		    ("handle_written_jseg: overflow %d >= %d",
2921		    i - 1, jseg->js_cnt));
2922		switch (wk->wk_type) {
2923		case D_JADDREF:
2924			handle_written_jaddref(WK_JADDREF(wk));
2925			break;
2926		case D_JREMREF:
2927			handle_written_jremref(WK_JREMREF(wk));
2928			break;
2929		case D_JMVREF:
2930			/* No jsegdep here. */
2931			free_jseg(jseg);
2932			jmvref = WK_JMVREF(wk);
2933			LIST_REMOVE(jmvref, jm_deps);
2934			free_pagedep(jmvref->jm_pagedep);
2935			WORKITEM_FREE(jmvref, D_JMVREF);
2936			break;
2937		case D_JNEWBLK:
2938			handle_written_jnewblk(WK_JNEWBLK(wk));
2939			break;
2940		case D_JFREEBLK:
2941			handle_written_jfreeblk(WK_JFREEBLK(wk));
2942			break;
2943		case D_JFREEFRAG:
2944			handle_written_jfreefrag(WK_JFREEFRAG(wk));
2945			break;
2946		case D_JTRUNC:
2947			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
2948			WORKITEM_FREE(wk, D_JTRUNC);
2949			break;
2950		default:
2951			panic("handle_written_jseg: Unknown type %s",
2952			    TYPENAME(wk->wk_type));
2953			/* NOTREACHED */
2954		}
2955		if (waiting)
2956			wakeup(wk);
2957	}
2958	/* Release the self reference so the structure may be freed. */
2959	free_jseg(jseg);
2960}
2961
2962/*
2963 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
2964 * completions in order only.
2965 */
2966static void
2967handle_written_jseg(jseg, bp)
2968	struct jseg *jseg;
2969	struct buf *bp;
2970{
2971	struct jblocks *jblocks;
2972	struct jseg *jsegn;
2973
2974	if (jseg->js_refs == 0)
2975		panic("handle_written_jseg: No self-reference on %p", jseg);
2976	jseg->js_state |= DEPCOMPLETE;
2977	/*
2978	 * We'll never need this buffer again, set flags so it will be
2979	 * discarded.
2980	 */
2981	bp->b_flags |= B_INVAL | B_NOCACHE;
2982	jblocks = jseg->js_jblocks;
2983	/*
2984	 * Don't allow out of order completions.  If this isn't the first
2985	 * block wait for it to write before we're done.
2986	 */
2987	if (jseg != jblocks->jb_writeseg)
2988		return;
2989	/* Iterate through available jsegs processing their entries. */
2990	do {
2991		jsegn = TAILQ_NEXT(jseg, js_next);
2992		complete_jseg(jseg);
2993		jseg = jsegn;
2994	} while (jseg && jseg->js_state & DEPCOMPLETE);
2995	jblocks->jb_writeseg = jseg;
2996}
2997
2998static inline struct jsegdep *
2999inoref_jseg(inoref)
3000	struct inoref *inoref;
3001{
3002	struct jsegdep *jsegdep;
3003
3004	jsegdep = inoref->if_jsegdep;
3005	inoref->if_jsegdep = NULL;
3006
3007	return (jsegdep);
3008}
3009
3010/*
3011 * Called once a jremref has made it to stable store.  The jremref is marked
3012 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3013 * for the jremref to complete will be awoken by free_jremref.
3014 */
3015static void
3016handle_written_jremref(jremref)
3017	struct jremref *jremref;
3018{
3019	struct inodedep *inodedep;
3020	struct jsegdep *jsegdep;
3021	struct dirrem *dirrem;
3022
3023	/* Grab the jsegdep. */
3024	jsegdep = inoref_jseg(&jremref->jr_ref);
3025	/*
3026	 * Remove us from the inoref list.
3027	 */
3028	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3029	    0, &inodedep) == 0)
3030		panic("handle_written_jremref: Lost inodedep");
3031	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3032	/*
3033	 * Complete the dirrem.
3034	 */
3035	dirrem = jremref->jr_dirrem;
3036	jremref->jr_dirrem = NULL;
3037	LIST_REMOVE(jremref, jr_deps);
3038	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3039	WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
3040	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3041	    (dirrem->dm_state & COMPLETE) != 0)
3042		add_to_worklist(&dirrem->dm_list, 0);
3043	free_jremref(jremref);
3044}
3045
3046/*
3047 * Called once a jaddref has made it to stable store.  The dependency is
3048 * marked complete and any dependent structures are added to the inode
3049 * bufwait list to be completed as soon as it is written.  If a bitmap write
3050 * depends on this entry we move the inode into the inodedephd of the
3051 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3052 */
3053static void
3054handle_written_jaddref(jaddref)
3055	struct jaddref *jaddref;
3056{
3057	struct jsegdep *jsegdep;
3058	struct inodedep *inodedep;
3059	struct diradd *diradd;
3060	struct mkdir *mkdir;
3061
3062	/* Grab the jsegdep. */
3063	jsegdep = inoref_jseg(&jaddref->ja_ref);
3064	mkdir = NULL;
3065	diradd = NULL;
3066	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3067	    0, &inodedep) == 0)
3068		panic("handle_written_jaddref: Lost inodedep.");
3069	if (jaddref->ja_diradd == NULL)
3070		panic("handle_written_jaddref: No dependency");
3071	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3072		diradd = jaddref->ja_diradd;
3073		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3074	} else if (jaddref->ja_state & MKDIR_PARENT) {
3075		mkdir = jaddref->ja_mkdir;
3076		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3077	} else if (jaddref->ja_state & MKDIR_BODY)
3078		mkdir = jaddref->ja_mkdir;
3079	else
3080		panic("handle_written_jaddref: Unknown dependency %p",
3081		    jaddref->ja_diradd);
3082	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3083	/*
3084	 * Remove us from the inode list.
3085	 */
3086	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3087	/*
3088	 * The mkdir may be waiting on the jaddref to clear before freeing.
3089	 */
3090	if (mkdir) {
3091		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3092		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3093		    TYPENAME(mkdir->md_list.wk_type)));
3094		mkdir->md_jaddref = NULL;
3095		diradd = mkdir->md_diradd;
3096		mkdir->md_state |= DEPCOMPLETE;
3097		complete_mkdir(mkdir);
3098	}
3099	WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
3100	if (jaddref->ja_state & NEWBLOCK) {
3101		inodedep->id_state |= ONDEPLIST;
3102		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3103		    inodedep, id_deps);
3104	}
3105	free_jaddref(jaddref);
3106}
3107
3108/*
3109 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3110 * is placed in the bmsafemap to await notification of a written bitmap.
3111 */
3112static void
3113handle_written_jnewblk(jnewblk)
3114	struct jnewblk *jnewblk;
3115{
3116	struct bmsafemap *bmsafemap;
3117	struct jsegdep *jsegdep;
3118	struct newblk *newblk;
3119
3120	/* Grab the jsegdep. */
3121	jsegdep = jnewblk->jn_jsegdep;
3122	jnewblk->jn_jsegdep = NULL;
3123	/*
3124	 * Add the written block to the bmsafemap so it can be notified when
3125	 * the bitmap is on disk.
3126	 */
3127	newblk = jnewblk->jn_newblk;
3128	jnewblk->jn_newblk = NULL;
3129	if (newblk == NULL)
3130		panic("handle_written_jnewblk: No dependency for the segdep.");
3131
3132	newblk->nb_jnewblk = NULL;
3133	bmsafemap = newblk->nb_bmsafemap;
3134	WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
3135	newblk->nb_state |= ONDEPLIST;
3136	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
3137	free_jnewblk(jnewblk);
3138}
3139
3140/*
3141 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3142 * an in-flight allocation that has not yet been committed.  Divorce us
3143 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3144 * to the worklist.
3145 */
3146static void
3147cancel_jfreefrag(jfreefrag)
3148	struct jfreefrag *jfreefrag;
3149{
3150	struct freefrag *freefrag;
3151
3152	if (jfreefrag->fr_jsegdep) {
3153		free_jsegdep(jfreefrag->fr_jsegdep);
3154		jfreefrag->fr_jsegdep = NULL;
3155	}
3156	freefrag = jfreefrag->fr_freefrag;
3157	jfreefrag->fr_freefrag = NULL;
3158	freefrag->ff_jfreefrag = NULL;
3159	free_jfreefrag(jfreefrag);
3160	freefrag->ff_state |= DEPCOMPLETE;
3161}
3162
3163/*
3164 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3165 */
3166static void
3167free_jfreefrag(jfreefrag)
3168	struct jfreefrag *jfreefrag;
3169{
3170
3171	if (jfreefrag->fr_state & IOSTARTED)
3172		WORKLIST_REMOVE(&jfreefrag->fr_list);
3173	else if (jfreefrag->fr_state & ONWORKLIST)
3174		remove_from_journal(&jfreefrag->fr_list);
3175	if (jfreefrag->fr_freefrag != NULL)
3176		panic("free_jfreefrag:  Still attached to a freefrag.");
3177	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3178}
3179
3180/*
3181 * Called when the journal write for a jfreefrag completes.  The parent
3182 * freefrag is added to the worklist if this completes its dependencies.
3183 */
3184static void
3185handle_written_jfreefrag(jfreefrag)
3186	struct jfreefrag *jfreefrag;
3187{
3188	struct jsegdep *jsegdep;
3189	struct freefrag *freefrag;
3190
3191	/* Grab the jsegdep. */
3192	jsegdep = jfreefrag->fr_jsegdep;
3193	jfreefrag->fr_jsegdep = NULL;
3194	freefrag = jfreefrag->fr_freefrag;
3195	if (freefrag == NULL)
3196		panic("handle_written_jfreefrag: No freefrag.");
3197	freefrag->ff_state |= DEPCOMPLETE;
3198	freefrag->ff_jfreefrag = NULL;
3199	WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
3200	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3201		add_to_worklist(&freefrag->ff_list, 0);
3202	jfreefrag->fr_freefrag = NULL;
3203	free_jfreefrag(jfreefrag);
3204}
3205
3206/*
3207 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3208 * is removed from the freeblks list of pending journal writes and the
3209 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3210 * have been reclaimed.
3211 */
3212static void
3213handle_written_jfreeblk(jfreeblk)
3214	struct jfreeblk *jfreeblk;
3215{
3216	struct freeblks *freeblks;
3217	struct jsegdep *jsegdep;
3218
3219	/* Grab the jsegdep. */
3220	jsegdep = jfreeblk->jf_jsegdep;
3221	jfreeblk->jf_jsegdep = NULL;
3222	freeblks = jfreeblk->jf_freeblks;
3223	LIST_REMOVE(jfreeblk, jf_deps);
3224	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
3225	/*
3226	 * If the freeblks is all journaled, we can add it to the worklist.
3227	 */
3228	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
3229	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
3230		/* Remove from the b_dep that is waiting on this write. */
3231		if (freeblks->fb_state & ONWORKLIST)
3232			WORKLIST_REMOVE(&freeblks->fb_list);
3233		add_to_worklist(&freeblks->fb_list, 1);
3234	}
3235
3236	free_jfreeblk(jfreeblk);
3237}
3238
3239static struct jsegdep *
3240newjsegdep(struct worklist *wk)
3241{
3242	struct jsegdep *jsegdep;
3243
3244	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3245	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3246	jsegdep->jd_seg = NULL;
3247
3248	return (jsegdep);
3249}
3250
3251static struct jmvref *
3252newjmvref(dp, ino, oldoff, newoff)
3253	struct inode *dp;
3254	ino_t ino;
3255	off_t oldoff;
3256	off_t newoff;
3257{
3258	struct jmvref *jmvref;
3259
3260	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3261	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3262	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3263	jmvref->jm_parent = dp->i_number;
3264	jmvref->jm_ino = ino;
3265	jmvref->jm_oldoff = oldoff;
3266	jmvref->jm_newoff = newoff;
3267
3268	return (jmvref);
3269}
3270
3271/*
3272 * Allocate a new jremref that tracks the removal of ip from dp with the
3273 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3274 * DEPCOMPLETE as we have all the information required for the journal write
3275 * and the directory has already been removed from the buffer.  The caller
3276 * is responsible for linking the jremref into the pagedep and adding it
3277 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3278 * a DOTDOT addition so handle_workitem_remove() can properly assign
3279 * the jsegdep when we're done.
3280 */
3281static struct jremref *
3282newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3283    off_t diroff, nlink_t nlink)
3284{
3285	struct jremref *jremref;
3286
3287	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3288	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3289	jremref->jr_state = ATTACHED;
3290	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3291	   nlink, ip->i_mode);
3292	jremref->jr_dirrem = dirrem;
3293
3294	return (jremref);
3295}
3296
3297static inline void
3298newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3299    nlink_t nlink, uint16_t mode)
3300{
3301
3302	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3303	inoref->if_diroff = diroff;
3304	inoref->if_ino = ino;
3305	inoref->if_parent = parent;
3306	inoref->if_nlink = nlink;
3307	inoref->if_mode = mode;
3308}
3309
3310/*
3311 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3312 * directory offset may not be known until later.  The caller is responsible
3313 * adding the entry to the journal when this information is available.  nlink
3314 * should be the link count prior to the addition and mode is only required
3315 * to have the correct FMT.
3316 */
3317static struct jaddref *
3318newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3319    uint16_t mode)
3320{
3321	struct jaddref *jaddref;
3322
3323	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3324	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3325	jaddref->ja_state = ATTACHED;
3326	jaddref->ja_mkdir = NULL;
3327	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3328
3329	return (jaddref);
3330}
3331
3332/*
3333 * Create a new free dependency for a freework.  The caller is responsible
3334 * for adjusting the reference count when it has the lock held.  The freedep
3335 * will track an outstanding bitmap write that will ultimately clear the
3336 * freework to continue.
3337 */
3338static struct freedep *
3339newfreedep(struct freework *freework)
3340{
3341	struct freedep *freedep;
3342
3343	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3344	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3345	freedep->fd_freework = freework;
3346
3347	return (freedep);
3348}
3349
3350/*
3351 * Free a freedep structure once the buffer it is linked to is written.  If
3352 * this is the last reference to the freework schedule it for completion.
3353 */
3354static void
3355free_freedep(freedep)
3356	struct freedep *freedep;
3357{
3358
3359	if (--freedep->fd_freework->fw_ref == 0)
3360		add_to_worklist(&freedep->fd_freework->fw_list, 1);
3361	WORKITEM_FREE(freedep, D_FREEDEP);
3362}
3363
3364/*
3365 * Allocate a new freework structure that may be a level in an indirect
3366 * when parent is not NULL or a top level block when it is.  The top level
3367 * freework structures are allocated without lk held and before the freeblks
3368 * is visible outside of softdep_setup_freeblocks().
3369 */
3370static struct freework *
3371newfreework(ump, freeblks, parent, lbn, nb, frags, journal)
3372	struct ufsmount *ump;
3373	struct freeblks *freeblks;
3374	struct freework *parent;
3375	ufs_lbn_t lbn;
3376	ufs2_daddr_t nb;
3377	int frags;
3378	int journal;
3379{
3380	struct freework *freework;
3381
3382	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3383	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3384	freework->fw_freeblks = freeblks;
3385	freework->fw_parent = parent;
3386	freework->fw_lbn = lbn;
3387	freework->fw_blkno = nb;
3388	freework->fw_frags = frags;
3389	freework->fw_ref = ((UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ) == 0 ||
3390	    lbn >= -NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
3391	freework->fw_off = 0;
3392	LIST_INIT(&freework->fw_jwork);
3393
3394	if (parent == NULL) {
3395		WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
3396		    &freework->fw_list);
3397		freeblks->fb_ref++;
3398	}
3399	if (journal)
3400		newjfreeblk(freeblks, lbn, nb, frags);
3401
3402	return (freework);
3403}
3404
3405/*
3406 * Allocate a new jfreeblk to journal top level block pointer when truncating
3407 * a file.  The caller must add this to the worklist when lk is held.
3408 */
3409static struct jfreeblk *
3410newjfreeblk(freeblks, lbn, blkno, frags)
3411	struct freeblks *freeblks;
3412	ufs_lbn_t lbn;
3413	ufs2_daddr_t blkno;
3414	int frags;
3415{
3416	struct jfreeblk *jfreeblk;
3417
3418	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
3419	workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
3420	jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
3421	jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
3422	jfreeblk->jf_ino = freeblks->fb_previousinum;
3423	jfreeblk->jf_lbn = lbn;
3424	jfreeblk->jf_blkno = blkno;
3425	jfreeblk->jf_frags = frags;
3426	jfreeblk->jf_freeblks = freeblks;
3427	LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);
3428
3429	return (jfreeblk);
3430}
3431
3432static void move_newblock_dep(struct jaddref *, struct inodedep *);
3433/*
3434 * If we're canceling a new bitmap we have to search for another ref
3435 * to move into the bmsafemap dep.  This might be better expressed
3436 * with another structure.
3437 */
3438static void
3439move_newblock_dep(jaddref, inodedep)
3440	struct jaddref *jaddref;
3441	struct inodedep *inodedep;
3442{
3443	struct inoref *inoref;
3444	struct jaddref *jaddrefn;
3445
3446	jaddrefn = NULL;
3447	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3448	    inoref = TAILQ_NEXT(inoref, if_deps)) {
3449		if ((jaddref->ja_state & NEWBLOCK) &&
3450		    inoref->if_list.wk_type == D_JADDREF) {
3451			jaddrefn = (struct jaddref *)inoref;
3452			break;
3453		}
3454	}
3455	if (jaddrefn == NULL)
3456		return;
3457	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
3458	jaddrefn->ja_state |= jaddref->ja_state &
3459	    (ATTACHED | UNDONE | NEWBLOCK);
3460	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
3461	jaddref->ja_state |= ATTACHED;
3462	LIST_REMOVE(jaddref, ja_bmdeps);
3463	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
3464	    ja_bmdeps);
3465}
3466
3467/*
3468 * Cancel a jaddref either before it has been written or while it is being
3469 * written.  This happens when a link is removed before the add reaches
3470 * the disk.  The jaddref dependency is kept linked into the bmsafemap
3471 * and inode to prevent the link count or bitmap from reaching the disk
3472 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
3473 * required.
3474 *
3475 * Returns 1 if the canceled addref requires journaling of the remove and
3476 * 0 otherwise.
3477 */
3478static int
3479cancel_jaddref(jaddref, inodedep, wkhd)
3480	struct jaddref *jaddref;
3481	struct inodedep *inodedep;
3482	struct workhead *wkhd;
3483{
3484	struct inoref *inoref;
3485	struct jsegdep *jsegdep;
3486	int needsj;
3487
3488	KASSERT((jaddref->ja_state & COMPLETE) == 0,
3489	    ("cancel_jaddref: Canceling complete jaddref"));
3490	if (jaddref->ja_state & (IOSTARTED | COMPLETE))
3491		needsj = 1;
3492	else
3493		needsj = 0;
3494	if (inodedep == NULL)
3495		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3496		    0, &inodedep) == 0)
3497			panic("cancel_jaddref: Lost inodedep");
3498	/*
3499	 * We must adjust the nlink of any reference operation that follows
3500	 * us so that it is consistent with the in-memory reference.  This
3501	 * ensures that inode nlink rollbacks always have the correct link.
3502	 */
3503	if (needsj == 0)
3504		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3505		    inoref = TAILQ_NEXT(inoref, if_deps))
3506			inoref->if_nlink--;
3507	jsegdep = inoref_jseg(&jaddref->ja_ref);
3508	if (jaddref->ja_state & NEWBLOCK)
3509		move_newblock_dep(jaddref, inodedep);
3510	if (jaddref->ja_state & IOWAITING) {
3511		jaddref->ja_state &= ~IOWAITING;
3512		wakeup(&jaddref->ja_list);
3513	}
3514	jaddref->ja_mkdir = NULL;
3515	if (jaddref->ja_state & IOSTARTED) {
3516		jaddref->ja_state &= ~IOSTARTED;
3517		WORKLIST_REMOVE(&jaddref->ja_list);
3518		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
3519	} else {
3520		free_jsegdep(jsegdep);
3521		if (jaddref->ja_state & DEPCOMPLETE)
3522			remove_from_journal(&jaddref->ja_list);
3523	}
3524	/*
3525	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
3526	 * can arrange for them to be freed with the bitmap.  Otherwise we
3527	 * no longer need this addref attached to the inoreflst and it
3528	 * will incorrectly adjust nlink if we leave it.
3529	 */
3530	if ((jaddref->ja_state & NEWBLOCK) == 0) {
3531		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
3532		    if_deps);
3533		jaddref->ja_state |= COMPLETE;
3534		free_jaddref(jaddref);
3535		return (needsj);
3536	}
3537	jaddref->ja_state |= GOINGAWAY;
3538	/*
3539	 * Leave the head of the list for jsegdeps for fast merging.
3540	 */
3541	if (LIST_FIRST(wkhd) != NULL) {
3542		jaddref->ja_state |= ONWORKLIST;
3543		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
3544	} else
3545		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
3546
3547	return (needsj);
3548}
3549
3550/*
3551 * Attempt to free a jaddref structure when some work completes.  This
3552 * should only succeed once the entry is written and all dependencies have
3553 * been notified.
3554 */
3555static void
3556free_jaddref(jaddref)
3557	struct jaddref *jaddref;
3558{
3559
3560	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
3561		return;
3562	if (jaddref->ja_ref.if_jsegdep)
3563		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
3564		    jaddref, jaddref->ja_state);
3565	if (jaddref->ja_state & NEWBLOCK)
3566		LIST_REMOVE(jaddref, ja_bmdeps);
3567	if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
3568		panic("free_jaddref: Bad state %p(0x%X)",
3569		    jaddref, jaddref->ja_state);
3570	if (jaddref->ja_mkdir != NULL)
3571		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
3572	WORKITEM_FREE(jaddref, D_JADDREF);
3573}
3574
3575/*
3576 * Free a jremref structure once it has been written or discarded.
3577 */
3578static void
3579free_jremref(jremref)
3580	struct jremref *jremref;
3581{
3582
3583	if (jremref->jr_ref.if_jsegdep)
3584		free_jsegdep(jremref->jr_ref.if_jsegdep);
3585	if (jremref->jr_state & IOSTARTED)
3586		panic("free_jremref: IO still pending");
3587	WORKITEM_FREE(jremref, D_JREMREF);
3588}
3589
3590/*
3591 * Free a jnewblk structure.
3592 */
3593static void
3594free_jnewblk(jnewblk)
3595	struct jnewblk *jnewblk;
3596{
3597
3598	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
3599		return;
3600	LIST_REMOVE(jnewblk, jn_deps);
3601	if (jnewblk->jn_newblk != NULL)
3602		panic("free_jnewblk: Dependency still attached.");
3603	WORKITEM_FREE(jnewblk, D_JNEWBLK);
3604}
3605
3606/*
3607 * Cancel a jnewblk which has been superseded by a freeblk.  The jnewblk
3608 * is kept linked into the bmsafemap until the free completes, thus
3609 * preventing the modified state from ever reaching disk.  The free
3610 * routine must pass this structure via ffs_blkfree() to
3611 * softdep_setup_freeblks() so there is no race in releasing the space.
3612 */
3613static void
3614cancel_jnewblk(jnewblk, wkhd)
3615	struct jnewblk *jnewblk;
3616	struct workhead *wkhd;
3617{
3618	struct jsegdep *jsegdep;
3619
3620	jsegdep = jnewblk->jn_jsegdep;
3621	jnewblk->jn_jsegdep  = NULL;
3622	free_jsegdep(jsegdep);
3623	jnewblk->jn_newblk = NULL;
3624	jnewblk->jn_state |= GOINGAWAY;
3625	if (jnewblk->jn_state & IOSTARTED) {
3626		jnewblk->jn_state &= ~IOSTARTED;
3627		WORKLIST_REMOVE(&jnewblk->jn_list);
3628	} else
3629		remove_from_journal(&jnewblk->jn_list);
3630	/*
3631	 * Leave the head of the list for jsegdeps for fast merging.
3632	 */
3633	if (LIST_FIRST(wkhd) != NULL) {
3634		jnewblk->jn_state |= ONWORKLIST;
3635		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
3636	} else
3637		WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
3638	if (jnewblk->jn_state & IOWAITING) {
3639		jnewblk->jn_state &= ~IOWAITING;
3640		wakeup(&jnewblk->jn_list);
3641	}
3642}
3643
3644static void
3645free_jfreeblk(jfreeblk)
3646	struct jfreeblk *jfreeblk;
3647{
3648
3649	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
3650}
3651
3652/*
3653 * Release one reference to a jseg and free it if the count reaches 0.  This
3654 * should eventually reclaim journal space as well.
3655 */
3656static void
3657free_jseg(jseg)
3658	struct jseg *jseg;
3659{
3660	struct jblocks *jblocks;
3661
3662	KASSERT(jseg->js_refs > 0,
3663	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
3664	if (--jseg->js_refs != 0)
3665		return;
3666	/*
3667	 * Free only those jsegs which have none allocated before them to
3668	 * preserve the journal space ordering.
3669	 */
3670	jblocks = jseg->js_jblocks;
3671	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
3672		jblocks->jb_oldestseq = jseg->js_seq;
3673		if (jseg->js_refs != 0)
3674			break;
3675		TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
3676		jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
3677		KASSERT(LIST_EMPTY(&jseg->js_entries),
3678		    ("free_jseg: Freed jseg has valid entries."));
3679		WORKITEM_FREE(jseg, D_JSEG);
3680	}
3681}
3682
3683/*
3684 * Release a jsegdep and decrement the jseg count.
3685 */
3686static void
3687free_jsegdep(jsegdep)
3688	struct jsegdep *jsegdep;
3689{
3690
3691	if (jsegdep->jd_seg)
3692		free_jseg(jsegdep->jd_seg);
3693	WORKITEM_FREE(jsegdep, D_JSEGDEP);
3694}
3695
3696/*
3697 * Wait for a journal item to make it to disk.  Initiate journal processing
3698 * if required.
3699 */
3700static void
3701jwait(wk)
3702	struct worklist *wk;
3703{
3704
3705	stat_journal_wait++;
3706	/*
3707	 * If IO has not started we process the journal.  We can't mark the
3708	 * worklist item as IOWAITING because we drop the lock while
3709	 * processing the journal and the worklist entry may be freed after
3710	 * this point.  The caller may call back in and re-issue the request.
3711	 */
3712	if ((wk->wk_state & IOSTARTED) == 0) {
3713		softdep_process_journal(wk->wk_mp, MNT_WAIT);
3714		return;
3715	}
3716	wk->wk_state |= IOWAITING;
3717	msleep(wk, &lk, PRIBIO, "jwait", 0);
3718}
3719
3720/*
3721 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
3722 * appropriate.  This is a convenience function to reduce duplicate code
3723 * for the setup and revert functions below.
3724 */
3725static struct inodedep *
3726inodedep_lookup_ip(ip)
3727	struct inode *ip;
3728{
3729	struct inodedep *inodedep;
3730
3731	KASSERT(ip->i_nlink >= ip->i_effnlink,
3732	    ("inodedep_lookup_ip: bad delta"));
3733	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
3734	    DEPALLOC, &inodedep);
3735	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3736
3737	return (inodedep);
3738}
3739
3740/*
3741 * Create a journal entry that describes a truncate that we're about to
3742 * perform.  The inode allocations and frees between here and the completion
3743 * of the operation are done asynchronously and without journaling.  At
3744 * the end of the operation the vnode is sync'd and the journal space
3745 * is released.  Recovery will discover the partially completed truncate
3746 * and complete it.
3747 */
3748void *
3749softdep_setup_trunc(vp, length, flags)
3750	struct vnode *vp;
3751	off_t length;
3752	int flags;
3753{
3754	struct jsegdep *jsegdep;
3755	struct jtrunc *jtrunc;
3756	struct ufsmount *ump;
3757	struct inode *ip;
3758
3759	softdep_prealloc(vp, MNT_WAIT);
3760	ip = VTOI(vp);
3761	ump = VFSTOUFS(vp->v_mount);
3762	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
3763	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
3764	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
3765	jtrunc->jt_ino = ip->i_number;
3766	jtrunc->jt_extsize = 0;
3767	jtrunc->jt_size = length;
3768	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
3769		jtrunc->jt_extsize = ip->i_din2->di_extsize;
3770	if ((flags & IO_NORMAL) == 0)
3771		jtrunc->jt_size = DIP(ip, i_size);
3772	ACQUIRE_LOCK(&lk);
3773	add_to_journal(&jtrunc->jt_list);
3774	while (jsegdep->jd_seg == NULL) {
3775		stat_jwait_freeblks++;
3776		jwait(&jtrunc->jt_list);
3777	}
3778	FREE_LOCK(&lk);
3779
3780	return (jsegdep);
3781}
3782
3783/*
3784 * After synchronous truncation is complete we free sync the vnode and
3785 * release the jsegdep so the journal space can be freed.
3786 */
3787int
3788softdep_complete_trunc(vp, cookie)
3789	struct vnode *vp;
3790	void *cookie;
3791{
3792	int error;
3793
3794	error = ffs_syncvnode(vp, MNT_WAIT);
3795	ACQUIRE_LOCK(&lk);
3796	free_jsegdep((struct jsegdep *)cookie);
3797	FREE_LOCK(&lk);
3798
3799	return (error);
3800}
3801
3802/*
3803 * Called prior to creating a new inode and linking it to a directory.  The
3804 * jaddref structure must already be allocated by softdep_setup_inomapdep
3805 * and it is discovered here so we can initialize the mode and update
3806 * nlinkdelta.
3807 */
3808void
3809softdep_setup_create(dp, ip)
3810	struct inode *dp;
3811	struct inode *ip;
3812{
3813	struct inodedep *inodedep;
3814	struct jaddref *jaddref;
3815	struct vnode *dvp;
3816
3817	KASSERT(ip->i_nlink == 1,
3818	    ("softdep_setup_create: Invalid link count."));
3819	dvp = ITOV(dp);
3820	ACQUIRE_LOCK(&lk);
3821	inodedep = inodedep_lookup_ip(ip);
3822	if (DOINGSUJ(dvp)) {
3823		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3824		    inoreflst);
3825		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
3826		    ("softdep_setup_create: No addref structure present."));
3827		jaddref->ja_mode = ip->i_mode;
3828	}
3829	softdep_prelink(dvp, NULL);
3830	FREE_LOCK(&lk);
3831}
3832
3833/*
3834 * Create a jaddref structure to track the addition of a DOTDOT link when
3835 * we are reparenting an inode as part of a rename.  This jaddref will be
3836 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
3837 * non-journaling softdep.
3838 */
3839void
3840softdep_setup_dotdot_link(dp, ip)
3841	struct inode *dp;
3842	struct inode *ip;
3843{
3844	struct inodedep *inodedep;
3845	struct jaddref *jaddref;
3846	struct vnode *dvp;
3847	struct vnode *vp;
3848
3849	dvp = ITOV(dp);
3850	vp = ITOV(ip);
3851	jaddref = NULL;
3852	/*
3853	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
3854	 * is used as a normal link would be.
3855	 */
3856	if (DOINGSUJ(dvp))
3857		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
3858		    dp->i_effnlink - 1, dp->i_mode);
3859	ACQUIRE_LOCK(&lk);
3860	inodedep = inodedep_lookup_ip(dp);
3861	if (jaddref)
3862		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
3863		    if_deps);
3864	softdep_prelink(dvp, ITOV(ip));
3865	FREE_LOCK(&lk);
3866}
3867
3868/*
3869 * Create a jaddref structure to track a new link to an inode.  The directory
3870 * offset is not known until softdep_setup_directory_add or
3871 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
3872 * softdep.
3873 */
3874void
3875softdep_setup_link(dp, ip)
3876	struct inode *dp;
3877	struct inode *ip;
3878{
3879	struct inodedep *inodedep;
3880	struct jaddref *jaddref;
3881	struct vnode *dvp;
3882
3883	dvp = ITOV(dp);
3884	jaddref = NULL;
3885	if (DOINGSUJ(dvp))
3886		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
3887		    ip->i_mode);
3888	ACQUIRE_LOCK(&lk);
3889	inodedep = inodedep_lookup_ip(ip);
3890	if (jaddref)
3891		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
3892		    if_deps);
3893	softdep_prelink(dvp, ITOV(ip));
3894	FREE_LOCK(&lk);
3895}
3896
3897/*
3898 * Called to create the jaddref structures to track . and .. references as
3899 * well as lookup and further initialize the incomplete jaddref created
3900 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
3901 * nlinkdelta for non-journaling softdep.
3902 */
3903void
3904softdep_setup_mkdir(dp, ip)
3905	struct inode *dp;
3906	struct inode *ip;
3907{
3908	struct inodedep *inodedep;
3909	struct jaddref *dotdotaddref;
3910	struct jaddref *dotaddref;
3911	struct jaddref *jaddref;
3912	struct vnode *dvp;
3913
3914	dvp = ITOV(dp);
3915	dotaddref = dotdotaddref = NULL;
3916	if (DOINGSUJ(dvp)) {
3917		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
3918		    ip->i_mode);
3919		dotaddref->ja_state |= MKDIR_BODY;
3920		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
3921		    dp->i_effnlink - 1, dp->i_mode);
3922		dotdotaddref->ja_state |= MKDIR_PARENT;
3923	}
3924	ACQUIRE_LOCK(&lk);
3925	inodedep = inodedep_lookup_ip(ip);
3926	if (DOINGSUJ(dvp)) {
3927		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3928		    inoreflst);
3929		KASSERT(jaddref != NULL,
3930		    ("softdep_setup_mkdir: No addref structure present."));
3931		KASSERT(jaddref->ja_parent == dp->i_number,
3932		    ("softdep_setup_mkdir: bad parent %d",
3933		    jaddref->ja_parent));
3934		jaddref->ja_mode = ip->i_mode;
3935		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
3936		    if_deps);
3937	}
3938	inodedep = inodedep_lookup_ip(dp);
3939	if (DOINGSUJ(dvp))
3940		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
3941		    &dotdotaddref->ja_ref, if_deps);
3942	softdep_prelink(ITOV(dp), NULL);
3943	FREE_LOCK(&lk);
3944}
3945
3946/*
3947 * Called to track nlinkdelta of the inode and parent directories prior to
3948 * unlinking a directory.
3949 */
3950void
3951softdep_setup_rmdir(dp, ip)
3952	struct inode *dp;
3953	struct inode *ip;
3954{
3955	struct vnode *dvp;
3956
3957	dvp = ITOV(dp);
3958	ACQUIRE_LOCK(&lk);
3959	(void) inodedep_lookup_ip(ip);
3960	(void) inodedep_lookup_ip(dp);
3961	softdep_prelink(dvp, ITOV(ip));
3962	FREE_LOCK(&lk);
3963}
3964
3965/*
3966 * Called to track nlinkdelta of the inode and parent directories prior to
3967 * unlink.
3968 */
3969void
3970softdep_setup_unlink(dp, ip)
3971	struct inode *dp;
3972	struct inode *ip;
3973{
3974	struct vnode *dvp;
3975
3976	dvp = ITOV(dp);
3977	ACQUIRE_LOCK(&lk);
3978	(void) inodedep_lookup_ip(ip);
3979	(void) inodedep_lookup_ip(dp);
3980	softdep_prelink(dvp, ITOV(ip));
3981	FREE_LOCK(&lk);
3982}
3983
3984/*
3985 * Called to release the journal structures created by a failed non-directory
3986 * creation.  Adjusts nlinkdelta for non-journaling softdep.
3987 */
3988void
3989softdep_revert_create(dp, ip)
3990	struct inode *dp;
3991	struct inode *ip;
3992{
3993	struct inodedep *inodedep;
3994	struct jaddref *jaddref;
3995	struct vnode *dvp;
3996
3997	dvp = ITOV(dp);
3998	ACQUIRE_LOCK(&lk);
3999	inodedep = inodedep_lookup_ip(ip);
4000	if (DOINGSUJ(dvp)) {
4001		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4002		    inoreflst);
4003		KASSERT(jaddref->ja_parent == dp->i_number,
4004		    ("softdep_revert_create: addref parent mismatch"));
4005		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4006	}
4007	FREE_LOCK(&lk);
4008}
4009
4010/*
4011 * Called to release the journal structures created by a failed dotdot link
4012 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4013 */
4014void
4015softdep_revert_dotdot_link(dp, ip)
4016	struct inode *dp;
4017	struct inode *ip;
4018{
4019	struct inodedep *inodedep;
4020	struct jaddref *jaddref;
4021	struct vnode *dvp;
4022
4023	dvp = ITOV(dp);
4024	ACQUIRE_LOCK(&lk);
4025	inodedep = inodedep_lookup_ip(dp);
4026	if (DOINGSUJ(dvp)) {
4027		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4028		    inoreflst);
4029		KASSERT(jaddref->ja_parent == ip->i_number,
4030		    ("softdep_revert_dotdot_link: addref parent mismatch"));
4031		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4032	}
4033	FREE_LOCK(&lk);
4034}
4035
4036/*
4037 * Called to release the journal structures created by a failed link
4038 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4039 */
4040void
4041softdep_revert_link(dp, ip)
4042	struct inode *dp;
4043	struct inode *ip;
4044{
4045	struct inodedep *inodedep;
4046	struct jaddref *jaddref;
4047	struct vnode *dvp;
4048
4049	dvp = ITOV(dp);
4050	ACQUIRE_LOCK(&lk);
4051	inodedep = inodedep_lookup_ip(ip);
4052	if (DOINGSUJ(dvp)) {
4053		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4054		    inoreflst);
4055		KASSERT(jaddref->ja_parent == dp->i_number,
4056		    ("softdep_revert_link: addref parent mismatch"));
4057		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4058	}
4059	FREE_LOCK(&lk);
4060}
4061
4062/*
4063 * Called to release the journal structures created by a failed mkdir
4064 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4065 */
4066void
4067softdep_revert_mkdir(dp, ip)
4068	struct inode *dp;
4069	struct inode *ip;
4070{
4071	struct inodedep *inodedep;
4072	struct jaddref *jaddref;
4073	struct vnode *dvp;
4074
4075	dvp = ITOV(dp);
4076
4077	ACQUIRE_LOCK(&lk);
4078	inodedep = inodedep_lookup_ip(dp);
4079	if (DOINGSUJ(dvp)) {
4080		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4081		    inoreflst);
4082		KASSERT(jaddref->ja_parent == ip->i_number,
4083		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4084		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4085	}
4086	inodedep = inodedep_lookup_ip(ip);
4087	if (DOINGSUJ(dvp)) {
4088		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4089		    inoreflst);
4090		KASSERT(jaddref->ja_parent == dp->i_number,
4091		    ("softdep_revert_mkdir: addref parent mismatch"));
4092		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4093		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4094		    inoreflst);
4095		KASSERT(jaddref->ja_parent == ip->i_number,
4096		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4097		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4098	}
4099	FREE_LOCK(&lk);
4100}
4101
4102/*
4103 * Called to correct nlinkdelta after a failed rmdir.
4104 */
4105void
4106softdep_revert_rmdir(dp, ip)
4107	struct inode *dp;
4108	struct inode *ip;
4109{
4110
4111	ACQUIRE_LOCK(&lk);
4112	(void) inodedep_lookup_ip(ip);
4113	(void) inodedep_lookup_ip(dp);
4114	FREE_LOCK(&lk);
4115}
4116
4117/*
4118 * Protecting the freemaps (or bitmaps).
4119 *
4120 * To eliminate the need to execute fsck before mounting a filesystem
4121 * after a power failure, one must (conservatively) guarantee that the
4122 * on-disk copy of the bitmaps never indicate that a live inode or block is
4123 * free.  So, when a block or inode is allocated, the bitmap should be
4124 * updated (on disk) before any new pointers.  When a block or inode is
4125 * freed, the bitmap should not be updated until all pointers have been
4126 * reset.  The latter dependency is handled by the delayed de-allocation
4127 * approach described below for block and inode de-allocation.  The former
4128 * dependency is handled by calling the following procedure when a block or
4129 * inode is allocated. When an inode is allocated an "inodedep" is created
4130 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4131 * Each "inodedep" is also inserted into the hash indexing structure so
4132 * that any additional link additions can be made dependent on the inode
4133 * allocation.
4134 *
4135 * The ufs filesystem maintains a number of free block counts (e.g., per
4136 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4137 * in addition to the bitmaps.  These counts are used to improve efficiency
4138 * during allocation and therefore must be consistent with the bitmaps.
4139 * There is no convenient way to guarantee post-crash consistency of these
4140 * counts with simple update ordering, for two main reasons: (1) The counts
4141 * and bitmaps for a single cylinder group block are not in the same disk
4142 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4143 * be written and the other not.  (2) Some of the counts are located in the
4144 * superblock rather than the cylinder group block. So, we focus our soft
4145 * updates implementation on protecting the bitmaps. When mounting a
4146 * filesystem, we recompute the auxiliary counts from the bitmaps.
4147 */
4148
4149/*
4150 * Called just after updating the cylinder group block to allocate an inode.
4151 */
4152void
4153softdep_setup_inomapdep(bp, ip, newinum)
4154	struct buf *bp;		/* buffer for cylgroup block with inode map */
4155	struct inode *ip;	/* inode related to allocation */
4156	ino_t newinum;		/* new inode number being allocated */
4157{
4158	struct inodedep *inodedep;
4159	struct bmsafemap *bmsafemap;
4160	struct jaddref *jaddref;
4161	struct mount *mp;
4162	struct fs *fs;
4163
4164	mp = UFSTOVFS(ip->i_ump);
4165	fs = ip->i_ump->um_fs;
4166	jaddref = NULL;
4167
4168	/*
4169	 * Allocate the journal reference add structure so that the bitmap
4170	 * can be dependent on it.
4171	 */
4172	if (mp->mnt_kern_flag & MNTK_SUJ) {
4173		jaddref = newjaddref(ip, newinum, 0, 0, 0);
4174		jaddref->ja_state |= NEWBLOCK;
4175	}
4176
4177	/*
4178	 * Create a dependency for the newly allocated inode.
4179	 * Panic if it already exists as something is seriously wrong.
4180	 * Otherwise add it to the dependency list for the buffer holding
4181	 * the cylinder group map from which it was allocated.
4182	 */
4183	ACQUIRE_LOCK(&lk);
4184	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
4185		panic("softdep_setup_inomapdep: dependency %p for new"
4186		    "inode already exists", inodedep);
4187	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
4188	if (jaddref) {
4189		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4190		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4191		    if_deps);
4192	} else {
4193		inodedep->id_state |= ONDEPLIST;
4194		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4195	}
4196	inodedep->id_bmsafemap = bmsafemap;
4197	inodedep->id_state &= ~DEPCOMPLETE;
4198	FREE_LOCK(&lk);
4199}
4200
4201/*
4202 * Called just after updating the cylinder group block to
4203 * allocate block or fragment.
4204 */
4205void
4206softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4207	struct buf *bp;		/* buffer for cylgroup block with block map */
4208	struct mount *mp;	/* filesystem doing allocation */
4209	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4210	int frags;		/* Number of fragments. */
4211	int oldfrags;		/* Previous number of fragments for extend. */
4212{
4213	struct newblk *newblk;
4214	struct bmsafemap *bmsafemap;
4215	struct jnewblk *jnewblk;
4216	struct fs *fs;
4217
4218	fs = VFSTOUFS(mp)->um_fs;
4219	jnewblk = NULL;
4220	/*
4221	 * Create a dependency for the newly allocated block.
4222	 * Add it to the dependency list for the buffer holding
4223	 * the cylinder group map from which it was allocated.
4224	 */
4225	if (mp->mnt_kern_flag & MNTK_SUJ) {
4226		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4227		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4228		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4229		jnewblk->jn_state = ATTACHED;
4230		jnewblk->jn_blkno = newblkno;
4231		jnewblk->jn_frags = frags;
4232		jnewblk->jn_oldfrags = oldfrags;
4233#ifdef SUJ_DEBUG
4234		{
4235			struct cg *cgp;
4236			uint8_t *blksfree;
4237			long bno;
4238			int i;
4239
4240			cgp = (struct cg *)bp->b_data;
4241			blksfree = cg_blksfree(cgp);
4242			bno = dtogd(fs, jnewblk->jn_blkno);
4243			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4244			    i++) {
4245				if (isset(blksfree, bno + i))
4246					panic("softdep_setup_blkmapdep: "
4247					    "free fragment %d from %d-%d "
4248					    "state 0x%X dep %p", i,
4249					    jnewblk->jn_oldfrags,
4250					    jnewblk->jn_frags,
4251					    jnewblk->jn_state,
4252					    jnewblk->jn_newblk);
4253			}
4254		}
4255#endif
4256	}
4257	ACQUIRE_LOCK(&lk);
4258	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4259		panic("softdep_setup_blkmapdep: found block");
4260	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4261	    dtog(fs, newblkno));
4262	if (jnewblk) {
4263		jnewblk->jn_newblk = newblk;
4264		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4265	} else {
4266		newblk->nb_state |= ONDEPLIST;
4267		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4268	}
4269	newblk->nb_bmsafemap = bmsafemap;
4270	newblk->nb_jnewblk = jnewblk;
4271	FREE_LOCK(&lk);
4272}
4273
4274#define	BMSAFEMAP_HASH(fs, cg) \
4275      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4276
4277static int
4278bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4279	struct bmsafemap_hashhead *bmsafemaphd;
4280	struct mount *mp;
4281	int cg;
4282	struct bmsafemap **bmsafemapp;
4283{
4284	struct bmsafemap *bmsafemap;
4285
4286	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4287		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4288			break;
4289	if (bmsafemap) {
4290		*bmsafemapp = bmsafemap;
4291		return (1);
4292	}
4293	*bmsafemapp = NULL;
4294
4295	return (0);
4296}
4297
4298/*
4299 * Find the bmsafemap associated with a cylinder group buffer.
4300 * If none exists, create one. The buffer must be locked when
4301 * this routine is called and this routine must be called with
4302 * splbio interrupts blocked.
4303 */
4304static struct bmsafemap *
4305bmsafemap_lookup(mp, bp, cg)
4306	struct mount *mp;
4307	struct buf *bp;
4308	int cg;
4309{
4310	struct bmsafemap_hashhead *bmsafemaphd;
4311	struct bmsafemap *bmsafemap, *collision;
4312	struct worklist *wk;
4313	struct fs *fs;
4314
4315	mtx_assert(&lk, MA_OWNED);
4316	if (bp)
4317		LIST_FOREACH(wk, &bp->b_dep, wk_list)
4318			if (wk->wk_type == D_BMSAFEMAP)
4319				return (WK_BMSAFEMAP(wk));
4320	fs = VFSTOUFS(mp)->um_fs;
4321	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
4322	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1)
4323		return (bmsafemap);
4324	FREE_LOCK(&lk);
4325	bmsafemap = malloc(sizeof(struct bmsafemap),
4326		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4327	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4328	bmsafemap->sm_buf = bp;
4329	LIST_INIT(&bmsafemap->sm_inodedephd);
4330	LIST_INIT(&bmsafemap->sm_inodedepwr);
4331	LIST_INIT(&bmsafemap->sm_newblkhd);
4332	LIST_INIT(&bmsafemap->sm_newblkwr);
4333	LIST_INIT(&bmsafemap->sm_jaddrefhd);
4334	LIST_INIT(&bmsafemap->sm_jnewblkhd);
4335	ACQUIRE_LOCK(&lk);
4336	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
4337		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4338		return (collision);
4339	}
4340	bmsafemap->sm_cg = cg;
4341	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
4342	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
4343	return (bmsafemap);
4344}
4345
4346/*
4347 * Direct block allocation dependencies.
4348 *
4349 * When a new block is allocated, the corresponding disk locations must be
4350 * initialized (with zeros or new data) before the on-disk inode points to
4351 * them.  Also, the freemap from which the block was allocated must be
4352 * updated (on disk) before the inode's pointer. These two dependencies are
4353 * independent of each other and are needed for all file blocks and indirect
4354 * blocks that are pointed to directly by the inode.  Just before the
4355 * "in-core" version of the inode is updated with a newly allocated block
4356 * number, a procedure (below) is called to setup allocation dependency
4357 * structures.  These structures are removed when the corresponding
4358 * dependencies are satisfied or when the block allocation becomes obsolete
4359 * (i.e., the file is deleted, the block is de-allocated, or the block is a
4360 * fragment that gets upgraded).  All of these cases are handled in
4361 * procedures described later.
4362 *
4363 * When a file extension causes a fragment to be upgraded, either to a larger
4364 * fragment or to a full block, the on-disk location may change (if the
4365 * previous fragment could not simply be extended). In this case, the old
4366 * fragment must be de-allocated, but not until after the inode's pointer has
4367 * been updated. In most cases, this is handled by later procedures, which
4368 * will construct a "freefrag" structure to be added to the workitem queue
4369 * when the inode update is complete (or obsolete).  The main exception to
4370 * this is when an allocation occurs while a pending allocation dependency
4371 * (for the same block pointer) remains.  This case is handled in the main
4372 * allocation dependency setup procedure by immediately freeing the
4373 * unreferenced fragments.
4374 */
4375void
4376softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4377	struct inode *ip;	/* inode to which block is being added */
4378	ufs_lbn_t off;		/* block pointer within inode */
4379	ufs2_daddr_t newblkno;	/* disk block number being added */
4380	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
4381	long newsize;		/* size of new block */
4382	long oldsize;		/* size of new block */
4383	struct buf *bp;		/* bp for allocated block */
4384{
4385	struct allocdirect *adp, *oldadp;
4386	struct allocdirectlst *adphead;
4387	struct freefrag *freefrag;
4388	struct inodedep *inodedep;
4389	struct pagedep *pagedep;
4390	struct jnewblk *jnewblk;
4391	struct newblk *newblk;
4392	struct mount *mp;
4393	ufs_lbn_t lbn;
4394
4395	lbn = bp->b_lblkno;
4396	mp = UFSTOVFS(ip->i_ump);
4397	if (oldblkno && oldblkno != newblkno)
4398		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4399	else
4400		freefrag = NULL;
4401
4402	ACQUIRE_LOCK(&lk);
4403	if (off >= NDADDR) {
4404		if (lbn > 0)
4405			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4406			    lbn, off);
4407		/* allocating an indirect block */
4408		if (oldblkno != 0)
4409			panic("softdep_setup_allocdirect: non-zero indir");
4410	} else {
4411		if (off != lbn)
4412			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4413			    lbn, off);
4414		/*
4415		 * Allocating a direct block.
4416		 *
4417		 * If we are allocating a directory block, then we must
4418		 * allocate an associated pagedep to track additions and
4419		 * deletions.
4420		 */
4421		if ((ip->i_mode & IFMT) == IFDIR &&
4422		    pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
4423		    &pagedep) == 0)
4424			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
4425	}
4426	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4427		panic("softdep_setup_allocdirect: lost block");
4428	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4429	    ("softdep_setup_allocdirect: newblk already initialized"));
4430	/*
4431	 * Convert the newblk to an allocdirect.
4432	 */
4433	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4434	adp = (struct allocdirect *)newblk;
4435	newblk->nb_freefrag = freefrag;
4436	adp->ad_offset = off;
4437	adp->ad_oldblkno = oldblkno;
4438	adp->ad_newsize = newsize;
4439	adp->ad_oldsize = oldsize;
4440
4441	/*
4442	 * Finish initializing the journal.
4443	 */
4444	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4445		jnewblk->jn_ino = ip->i_number;
4446		jnewblk->jn_lbn = lbn;
4447		add_to_journal(&jnewblk->jn_list);
4448	}
4449	if (freefrag && freefrag->ff_jfreefrag != NULL)
4450		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4451	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4452	adp->ad_inodedep = inodedep;
4453
4454	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4455	/*
4456	 * The list of allocdirects must be kept in sorted and ascending
4457	 * order so that the rollback routines can quickly determine the
4458	 * first uncommitted block (the size of the file stored on disk
4459	 * ends at the end of the lowest committed fragment, or if there
4460	 * are no fragments, at the end of the highest committed block).
4461	 * Since files generally grow, the typical case is that the new
4462	 * block is to be added at the end of the list. We speed this
4463	 * special case by checking against the last allocdirect in the
4464	 * list before laboriously traversing the list looking for the
4465	 * insertion point.
4466	 */
4467	adphead = &inodedep->id_newinoupdt;
4468	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4469	if (oldadp == NULL || oldadp->ad_offset <= off) {
4470		/* insert at end of list */
4471		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4472		if (oldadp != NULL && oldadp->ad_offset == off)
4473			allocdirect_merge(adphead, adp, oldadp);
4474		FREE_LOCK(&lk);
4475		return;
4476	}
4477	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4478		if (oldadp->ad_offset >= off)
4479			break;
4480	}
4481	if (oldadp == NULL)
4482		panic("softdep_setup_allocdirect: lost entry");
4483	/* insert in middle of list */
4484	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4485	if (oldadp->ad_offset == off)
4486		allocdirect_merge(adphead, adp, oldadp);
4487
4488	FREE_LOCK(&lk);
4489}
4490
4491/*
4492 * Replace an old allocdirect dependency with a newer one.
4493 * This routine must be called with splbio interrupts blocked.
4494 */
4495static void
4496allocdirect_merge(adphead, newadp, oldadp)
4497	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
4498	struct allocdirect *newadp;	/* allocdirect being added */
4499	struct allocdirect *oldadp;	/* existing allocdirect being checked */
4500{
4501	struct worklist *wk;
4502	struct freefrag *freefrag;
4503	struct newdirblk *newdirblk;
4504
4505	freefrag = NULL;
4506	mtx_assert(&lk, MA_OWNED);
4507	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
4508	    newadp->ad_oldsize != oldadp->ad_newsize ||
4509	    newadp->ad_offset >= NDADDR)
4510		panic("%s %jd != new %jd || old size %ld != new %ld",
4511		    "allocdirect_merge: old blkno",
4512		    (intmax_t)newadp->ad_oldblkno,
4513		    (intmax_t)oldadp->ad_newblkno,
4514		    newadp->ad_oldsize, oldadp->ad_newsize);
4515	newadp->ad_oldblkno = oldadp->ad_oldblkno;
4516	newadp->ad_oldsize = oldadp->ad_oldsize;
4517	/*
4518	 * If the old dependency had a fragment to free or had never
4519	 * previously had a block allocated, then the new dependency
4520	 * can immediately post its freefrag and adopt the old freefrag.
4521	 * This action is done by swapping the freefrag dependencies.
4522	 * The new dependency gains the old one's freefrag, and the
4523	 * old one gets the new one and then immediately puts it on
4524	 * the worklist when it is freed by free_newblk. It is
4525	 * not possible to do this swap when the old dependency had a
4526	 * non-zero size but no previous fragment to free. This condition
4527	 * arises when the new block is an extension of the old block.
4528	 * Here, the first part of the fragment allocated to the new
4529	 * dependency is part of the block currently claimed on disk by
4530	 * the old dependency, so cannot legitimately be freed until the
4531	 * conditions for the new dependency are fulfilled.
4532	 */
4533	freefrag = newadp->ad_freefrag;
4534	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
4535		newadp->ad_freefrag = oldadp->ad_freefrag;
4536		oldadp->ad_freefrag = freefrag;
4537	}
4538	/*
4539	 * If we are tracking a new directory-block allocation,
4540	 * move it from the old allocdirect to the new allocdirect.
4541	 */
4542	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
4543		newdirblk = WK_NEWDIRBLK(wk);
4544		WORKLIST_REMOVE(&newdirblk->db_list);
4545		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
4546			panic("allocdirect_merge: extra newdirblk");
4547		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
4548	}
4549	TAILQ_REMOVE(adphead, oldadp, ad_next);
4550	/*
4551	 * We need to move any journal dependencies over to the freefrag
4552	 * that releases this block if it exists.  Otherwise we are
4553	 * extending an existing block and we'll wait until that is
4554	 * complete to release the journal space and extend the
4555	 * new journal to cover this old space as well.
4556	 */
4557	if (freefrag == NULL) {
4558		struct jnewblk *jnewblk;
4559		struct jnewblk *njnewblk;
4560
4561		if (oldadp->ad_newblkno != newadp->ad_newblkno)
4562			panic("allocdirect_merge: %jd != %jd",
4563			    oldadp->ad_newblkno, newadp->ad_newblkno);
4564		jnewblk = oldadp->ad_block.nb_jnewblk;
4565		cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
4566		/*
4567		 * We have an unwritten jnewblk, we need to merge the
4568		 * frag bits with our own.  The newer adp's journal can not
4569		 * be written prior to the old one so no need to check for
4570		 * it here.
4571		 */
4572		if (jnewblk) {
4573			njnewblk = newadp->ad_block.nb_jnewblk;
4574			if (njnewblk == NULL)
4575				panic("allocdirect_merge: No jnewblk");
4576			if (jnewblk->jn_state & UNDONE) {
4577				njnewblk->jn_state |= UNDONE | NEWBLOCK;
4578				njnewblk->jn_state &= ~ATTACHED;
4579				jnewblk->jn_state &= ~UNDONE;
4580			}
4581			njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
4582			WORKLIST_REMOVE(&jnewblk->jn_list);
4583			jnewblk->jn_state |= ATTACHED | COMPLETE;
4584			free_jnewblk(jnewblk);
4585		}
4586	} else {
4587		/*
4588		 * We can skip journaling for this freefrag and just complete
4589		 * any pending journal work for the allocdirect that is being
4590		 * removed after the freefrag completes.
4591		 */
4592		if (freefrag->ff_jfreefrag)
4593			cancel_jfreefrag(freefrag->ff_jfreefrag);
4594		cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
4595	}
4596	free_newblk(&oldadp->ad_block);
4597}
4598
4599/*
4600 * Allocate a jfreefrag structure to journal a single block free.
4601 */
4602static struct jfreefrag *
4603newjfreefrag(freefrag, ip, blkno, size, lbn)
4604	struct freefrag *freefrag;
4605	struct inode *ip;
4606	ufs2_daddr_t blkno;
4607	long size;
4608	ufs_lbn_t lbn;
4609{
4610	struct jfreefrag *jfreefrag;
4611	struct fs *fs;
4612
4613	fs = ip->i_fs;
4614	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
4615	    M_SOFTDEP_FLAGS);
4616	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
4617	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
4618	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
4619	jfreefrag->fr_ino = ip->i_number;
4620	jfreefrag->fr_lbn = lbn;
4621	jfreefrag->fr_blkno = blkno;
4622	jfreefrag->fr_frags = numfrags(fs, size);
4623	jfreefrag->fr_freefrag = freefrag;
4624
4625	return (jfreefrag);
4626}
4627
4628/*
4629 * Allocate a new freefrag structure.
4630 */
4631static struct freefrag *
4632newfreefrag(ip, blkno, size, lbn)
4633	struct inode *ip;
4634	ufs2_daddr_t blkno;
4635	long size;
4636	ufs_lbn_t lbn;
4637{
4638	struct freefrag *freefrag;
4639	struct fs *fs;
4640
4641	fs = ip->i_fs;
4642	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
4643		panic("newfreefrag: frag size");
4644	freefrag = malloc(sizeof(struct freefrag),
4645	    M_FREEFRAG, M_SOFTDEP_FLAGS);
4646	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
4647	freefrag->ff_state = ATTACHED;
4648	LIST_INIT(&freefrag->ff_jwork);
4649	freefrag->ff_inum = ip->i_number;
4650	freefrag->ff_blkno = blkno;
4651	freefrag->ff_fragsize = size;
4652
4653	if (fs->fs_flags & FS_SUJ) {
4654		freefrag->ff_jfreefrag =
4655		    newjfreefrag(freefrag, ip, blkno, size, lbn);
4656	} else {
4657		freefrag->ff_state |= DEPCOMPLETE;
4658		freefrag->ff_jfreefrag = NULL;
4659	}
4660
4661	return (freefrag);
4662}
4663
4664/*
4665 * This workitem de-allocates fragments that were replaced during
4666 * file block allocation.
4667 */
4668static void
4669handle_workitem_freefrag(freefrag)
4670	struct freefrag *freefrag;
4671{
4672	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
4673	struct workhead wkhd;
4674
4675	/*
4676	 * It would be illegal to add new completion items to the
4677	 * freefrag after it was schedule to be done so it must be
4678	 * safe to modify the list head here.
4679	 */
4680	LIST_INIT(&wkhd);
4681	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
4682	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
4683	    freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
4684	ACQUIRE_LOCK(&lk);
4685	WORKITEM_FREE(freefrag, D_FREEFRAG);
4686	FREE_LOCK(&lk);
4687}
4688
4689/*
4690 * Set up a dependency structure for an external attributes data block.
4691 * This routine follows much of the structure of softdep_setup_allocdirect.
4692 * See the description of softdep_setup_allocdirect above for details.
4693 */
4694void
4695softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4696	struct inode *ip;
4697	ufs_lbn_t off;
4698	ufs2_daddr_t newblkno;
4699	ufs2_daddr_t oldblkno;
4700	long newsize;
4701	long oldsize;
4702	struct buf *bp;
4703{
4704	struct allocdirect *adp, *oldadp;
4705	struct allocdirectlst *adphead;
4706	struct freefrag *freefrag;
4707	struct inodedep *inodedep;
4708	struct jnewblk *jnewblk;
4709	struct newblk *newblk;
4710	struct mount *mp;
4711	ufs_lbn_t lbn;
4712
4713	if (off >= NXADDR)
4714		panic("softdep_setup_allocext: lbn %lld > NXADDR",
4715		    (long long)off);
4716
4717	lbn = bp->b_lblkno;
4718	mp = UFSTOVFS(ip->i_ump);
4719	if (oldblkno && oldblkno != newblkno)
4720		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4721	else
4722		freefrag = NULL;
4723
4724	ACQUIRE_LOCK(&lk);
4725	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4726		panic("softdep_setup_allocext: lost block");
4727	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4728	    ("softdep_setup_allocext: newblk already initialized"));
4729	/*
4730	 * Convert the newblk to an allocdirect.
4731	 */
4732	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4733	adp = (struct allocdirect *)newblk;
4734	newblk->nb_freefrag = freefrag;
4735	adp->ad_offset = off;
4736	adp->ad_oldblkno = oldblkno;
4737	adp->ad_newsize = newsize;
4738	adp->ad_oldsize = oldsize;
4739	adp->ad_state |=  EXTDATA;
4740
4741	/*
4742	 * Finish initializing the journal.
4743	 */
4744	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4745		jnewblk->jn_ino = ip->i_number;
4746		jnewblk->jn_lbn = lbn;
4747		add_to_journal(&jnewblk->jn_list);
4748	}
4749	if (freefrag && freefrag->ff_jfreefrag != NULL)
4750		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4751	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4752	adp->ad_inodedep = inodedep;
4753
4754	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4755	/*
4756	 * The list of allocdirects must be kept in sorted and ascending
4757	 * order so that the rollback routines can quickly determine the
4758	 * first uncommitted block (the size of the file stored on disk
4759	 * ends at the end of the lowest committed fragment, or if there
4760	 * are no fragments, at the end of the highest committed block).
4761	 * Since files generally grow, the typical case is that the new
4762	 * block is to be added at the end of the list. We speed this
4763	 * special case by checking against the last allocdirect in the
4764	 * list before laboriously traversing the list looking for the
4765	 * insertion point.
4766	 */
4767	adphead = &inodedep->id_newextupdt;
4768	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4769	if (oldadp == NULL || oldadp->ad_offset <= off) {
4770		/* insert at end of list */
4771		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4772		if (oldadp != NULL && oldadp->ad_offset == off)
4773			allocdirect_merge(adphead, adp, oldadp);
4774		FREE_LOCK(&lk);
4775		return;
4776	}
4777	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4778		if (oldadp->ad_offset >= off)
4779			break;
4780	}
4781	if (oldadp == NULL)
4782		panic("softdep_setup_allocext: lost entry");
4783	/* insert in middle of list */
4784	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4785	if (oldadp->ad_offset == off)
4786		allocdirect_merge(adphead, adp, oldadp);
4787	FREE_LOCK(&lk);
4788}
4789
4790/*
4791 * Indirect block allocation dependencies.
4792 *
4793 * The same dependencies that exist for a direct block also exist when
4794 * a new block is allocated and pointed to by an entry in a block of
4795 * indirect pointers. The undo/redo states described above are also
4796 * used here. Because an indirect block contains many pointers that
4797 * may have dependencies, a second copy of the entire in-memory indirect
4798 * block is kept. The buffer cache copy is always completely up-to-date.
4799 * The second copy, which is used only as a source for disk writes,
4800 * contains only the safe pointers (i.e., those that have no remaining
4801 * update dependencies). The second copy is freed when all pointers
4802 * are safe. The cache is not allowed to replace indirect blocks with
4803 * pending update dependencies. If a buffer containing an indirect
4804 * block with dependencies is written, these routines will mark it
4805 * dirty again. It can only be successfully written once all the
4806 * dependencies are removed. The ffs_fsync routine in conjunction with
4807 * softdep_sync_metadata work together to get all the dependencies
4808 * removed so that a file can be successfully written to disk. Three
4809 * procedures are used when setting up indirect block pointer
4810 * dependencies. The division is necessary because of the organization
4811 * of the "balloc" routine and because of the distinction between file
4812 * pages and file metadata blocks.
4813 */
4814
4815/*
4816 * Allocate a new allocindir structure.
4817 */
4818static struct allocindir *
4819newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
4820	struct inode *ip;	/* inode for file being extended */
4821	int ptrno;		/* offset of pointer in indirect block */
4822	ufs2_daddr_t newblkno;	/* disk block number being added */
4823	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
4824	ufs_lbn_t lbn;
4825{
4826	struct newblk *newblk;
4827	struct allocindir *aip;
4828	struct freefrag *freefrag;
4829	struct jnewblk *jnewblk;
4830
4831	if (oldblkno)
4832		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
4833	else
4834		freefrag = NULL;
4835	ACQUIRE_LOCK(&lk);
4836	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
4837		panic("new_allocindir: lost block");
4838	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4839	    ("newallocindir: newblk already initialized"));
4840	newblk->nb_list.wk_type = D_ALLOCINDIR;
4841	newblk->nb_freefrag = freefrag;
4842	aip = (struct allocindir *)newblk;
4843	aip->ai_offset = ptrno;
4844	aip->ai_oldblkno = oldblkno;
4845	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4846		jnewblk->jn_ino = ip->i_number;
4847		jnewblk->jn_lbn = lbn;
4848		add_to_journal(&jnewblk->jn_list);
4849	}
4850	if (freefrag && freefrag->ff_jfreefrag != NULL)
4851		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4852	return (aip);
4853}
4854
4855/*
4856 * Called just before setting an indirect block pointer
4857 * to a newly allocated file page.
4858 */
4859void
4860softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
4861	struct inode *ip;	/* inode for file being extended */
4862	ufs_lbn_t lbn;		/* allocated block number within file */
4863	struct buf *bp;		/* buffer with indirect blk referencing page */
4864	int ptrno;		/* offset of pointer in indirect block */
4865	ufs2_daddr_t newblkno;	/* disk block number being added */
4866	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
4867	struct buf *nbp;	/* buffer holding allocated page */
4868{
4869	struct inodedep *inodedep;
4870	struct allocindir *aip;
4871	struct pagedep *pagedep;
4872	struct mount *mp;
4873
4874	if (lbn != nbp->b_lblkno)
4875		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
4876		    lbn, bp->b_lblkno);
4877	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
4878	mp = UFSTOVFS(ip->i_ump);
4879	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
4880	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
4881	/*
4882	 * If we are allocating a directory page, then we must
4883	 * allocate an associated pagedep to track additions and
4884	 * deletions.
4885	 */
4886	if ((ip->i_mode & IFMT) == IFDIR &&
4887	    pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
4888		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
4889	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4890	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4891	FREE_LOCK(&lk);
4892}
4893
4894/*
4895 * Called just before setting an indirect block pointer to a
4896 * newly allocated indirect block.
4897 */
4898void
4899softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
4900	struct buf *nbp;	/* newly allocated indirect block */
4901	struct inode *ip;	/* inode for file being extended */
4902	struct buf *bp;		/* indirect block referencing allocated block */
4903	int ptrno;		/* offset of pointer in indirect block */
4904	ufs2_daddr_t newblkno;	/* disk block number being added */
4905{
4906	struct inodedep *inodedep;
4907	struct allocindir *aip;
4908	ufs_lbn_t lbn;
4909
4910	lbn = nbp->b_lblkno;
4911	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
4912	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
4913	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
4914	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4915	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4916	FREE_LOCK(&lk);
4917}
4918
4919static void
4920indirdep_complete(indirdep)
4921	struct indirdep *indirdep;
4922{
4923	struct allocindir *aip;
4924
4925	LIST_REMOVE(indirdep, ir_next);
4926	indirdep->ir_state &= ~ONDEPLIST;
4927
4928	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
4929		LIST_REMOVE(aip, ai_next);
4930		free_newblk(&aip->ai_block);
4931	}
4932	/*
4933	 * If this indirdep is not attached to a buf it was simply waiting
4934	 * on completion to clear completehd.  free_indirdep() asserts
4935	 * that nothing is dangling.
4936	 */
4937	if ((indirdep->ir_state & ONWORKLIST) == 0)
4938		free_indirdep(indirdep);
4939}
4940
4941/*
4942 * Called to finish the allocation of the "aip" allocated
4943 * by one of the two routines above.
4944 */
4945static void
4946setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
4947	struct buf *bp;		/* in-memory copy of the indirect block */
4948	struct inode *ip;	/* inode for file being extended */
4949	struct inodedep *inodedep; /* Inodedep for ip */
4950	struct allocindir *aip;	/* allocindir allocated by the above routines */
4951	ufs_lbn_t lbn;		/* Logical block number for this block. */
4952{
4953	struct worklist *wk;
4954	struct fs *fs;
4955	struct newblk *newblk;
4956	struct indirdep *indirdep, *newindirdep;
4957	struct allocindir *oldaip;
4958	struct freefrag *freefrag;
4959	struct mount *mp;
4960	ufs2_daddr_t blkno;
4961
4962	mp = UFSTOVFS(ip->i_ump);
4963	fs = ip->i_fs;
4964	mtx_assert(&lk, MA_OWNED);
4965	if (bp->b_lblkno >= 0)
4966		panic("setup_allocindir_phase2: not indir blk");
4967	for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
4968		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4969			if (wk->wk_type != D_INDIRDEP)
4970				continue;
4971			indirdep = WK_INDIRDEP(wk);
4972			break;
4973		}
4974		if (indirdep == NULL && newindirdep) {
4975			indirdep = newindirdep;
4976			newindirdep = NULL;
4977			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
4978			if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
4979			    &newblk)) {
4980				indirdep->ir_state |= ONDEPLIST;
4981				LIST_INSERT_HEAD(&newblk->nb_indirdeps,
4982				    indirdep, ir_next);
4983			} else
4984				indirdep->ir_state |= DEPCOMPLETE;
4985		}
4986		if (indirdep) {
4987			aip->ai_indirdep = indirdep;
4988			/*
4989			 * Check to see if there is an existing dependency
4990			 * for this block. If there is, merge the old
4991			 * dependency into the new one.  This happens
4992			 * as a result of reallocblk only.
4993			 */
4994			if (aip->ai_oldblkno == 0)
4995				oldaip = NULL;
4996			else
4997
4998				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
4999				    ai_next)
5000					if (oldaip->ai_offset == aip->ai_offset)
5001						break;
5002			if (oldaip != NULL)
5003				freefrag = allocindir_merge(aip, oldaip);
5004			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5005			KASSERT(aip->ai_offset >= 0 &&
5006			    aip->ai_offset < NINDIR(ip->i_ump->um_fs),
5007			    ("setup_allocindir_phase2: Bad offset %d",
5008			    aip->ai_offset));
5009			KASSERT(indirdep->ir_savebp != NULL,
5010			    ("setup_allocindir_phase2 NULL ir_savebp"));
5011			if (ip->i_ump->um_fstype == UFS1)
5012				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
5013				    [aip->ai_offset] = aip->ai_oldblkno;
5014			else
5015				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
5016				    [aip->ai_offset] = aip->ai_oldblkno;
5017			FREE_LOCK(&lk);
5018			if (freefrag != NULL)
5019				handle_workitem_freefrag(freefrag);
5020		} else
5021			FREE_LOCK(&lk);
5022		if (newindirdep) {
5023			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
5024			brelse(newindirdep->ir_savebp);
5025			ACQUIRE_LOCK(&lk);
5026			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
5027			if (indirdep)
5028				break;
5029			FREE_LOCK(&lk);
5030		}
5031		if (indirdep) {
5032			ACQUIRE_LOCK(&lk);
5033			break;
5034		}
5035		newindirdep = malloc(sizeof(struct indirdep),
5036			M_INDIRDEP, M_SOFTDEP_FLAGS);
5037		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5038		newindirdep->ir_state = ATTACHED;
5039		if (ip->i_ump->um_fstype == UFS1)
5040			newindirdep->ir_state |= UFS1FMT;
5041		newindirdep->ir_saveddata = NULL;
5042		LIST_INIT(&newindirdep->ir_deplisthd);
5043		LIST_INIT(&newindirdep->ir_donehd);
5044		LIST_INIT(&newindirdep->ir_writehd);
5045		LIST_INIT(&newindirdep->ir_completehd);
5046		LIST_INIT(&newindirdep->ir_jwork);
5047		if (bp->b_blkno == bp->b_lblkno) {
5048			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5049			    NULL, NULL);
5050			bp->b_blkno = blkno;
5051		}
5052		newindirdep->ir_savebp =
5053		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5054		BUF_KERNPROC(newindirdep->ir_savebp);
5055		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5056		ACQUIRE_LOCK(&lk);
5057	}
5058}
5059
5060/*
5061 * Merge two allocindirs which refer to the same block.  Move newblock
5062 * dependencies and setup the freefrags appropriately.
5063 */
5064static struct freefrag *
5065allocindir_merge(aip, oldaip)
5066	struct allocindir *aip;
5067	struct allocindir *oldaip;
5068{
5069	struct newdirblk *newdirblk;
5070	struct freefrag *freefrag;
5071	struct worklist *wk;
5072
5073	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5074		panic("allocindir_merge: blkno");
5075	aip->ai_oldblkno = oldaip->ai_oldblkno;
5076	freefrag = aip->ai_freefrag;
5077	aip->ai_freefrag = oldaip->ai_freefrag;
5078	oldaip->ai_freefrag = NULL;
5079	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
5080	/*
5081	 * If we are tracking a new directory-block allocation,
5082	 * move it from the old allocindir to the new allocindir.
5083	 */
5084	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5085		newdirblk = WK_NEWDIRBLK(wk);
5086		WORKLIST_REMOVE(&newdirblk->db_list);
5087		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5088			panic("allocindir_merge: extra newdirblk");
5089		WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
5090	}
5091	/*
5092	 * We can skip journaling for this freefrag and just complete
5093	 * any pending journal work for the allocindir that is being
5094	 * removed after the freefrag completes.
5095	 */
5096	if (freefrag->ff_jfreefrag)
5097		cancel_jfreefrag(freefrag->ff_jfreefrag);
5098	LIST_REMOVE(oldaip, ai_next);
5099	cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork);
5100	free_newblk(&oldaip->ai_block);
5101
5102	return (freefrag);
5103}
5104
5105/*
5106 * Block de-allocation dependencies.
5107 *
5108 * When blocks are de-allocated, the on-disk pointers must be nullified before
5109 * the blocks are made available for use by other files.  (The true
5110 * requirement is that old pointers must be nullified before new on-disk
5111 * pointers are set.  We chose this slightly more stringent requirement to
5112 * reduce complexity.) Our implementation handles this dependency by updating
5113 * the inode (or indirect block) appropriately but delaying the actual block
5114 * de-allocation (i.e., freemap and free space count manipulation) until
5115 * after the updated versions reach stable storage.  After the disk is
5116 * updated, the blocks can be safely de-allocated whenever it is convenient.
5117 * This implementation handles only the common case of reducing a file's
5118 * length to zero. Other cases are handled by the conventional synchronous
5119 * write approach.
5120 *
5121 * The ffs implementation with which we worked double-checks
5122 * the state of the block pointers and file size as it reduces
5123 * a file's length.  Some of this code is replicated here in our
5124 * soft updates implementation.  The freeblks->fb_chkcnt field is
5125 * used to transfer a part of this information to the procedure
5126 * that eventually de-allocates the blocks.
5127 *
5128 * This routine should be called from the routine that shortens
5129 * a file's length, before the inode's size or block pointers
5130 * are modified. It will save the block pointer information for
5131 * later release and zero the inode so that the calling routine
5132 * can release it.
5133 */
5134void
5135softdep_setup_freeblocks(ip, length, flags)
5136	struct inode *ip;	/* The inode whose length is to be reduced */
5137	off_t length;		/* The new length for the file */
5138	int flags;		/* IO_EXT and/or IO_NORMAL */
5139{
5140	struct ufs1_dinode *dp1;
5141	struct ufs2_dinode *dp2;
5142	struct freeblks *freeblks;
5143	struct inodedep *inodedep;
5144	struct allocdirect *adp;
5145	struct jfreeblk *jfreeblk;
5146	struct bufobj *bo;
5147	struct vnode *vp;
5148	struct buf *bp;
5149	struct fs *fs;
5150	ufs2_daddr_t extblocks, datablocks;
5151	struct mount *mp;
5152	int i, delay, error;
5153	ufs2_daddr_t blkno;
5154	ufs_lbn_t tmpval;
5155	ufs_lbn_t lbn;
5156	long oldextsize;
5157	long oldsize;
5158	int frags;
5159	int needj;
5160
5161	fs = ip->i_fs;
5162	mp = UFSTOVFS(ip->i_ump);
5163	if (length != 0)
5164		panic("softdep_setup_freeblocks: non-zero length");
5165	freeblks = malloc(sizeof(struct freeblks),
5166		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5167	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5168	LIST_INIT(&freeblks->fb_jfreeblkhd);
5169	LIST_INIT(&freeblks->fb_jwork);
5170	freeblks->fb_state = ATTACHED;
5171	freeblks->fb_uid = ip->i_uid;
5172	freeblks->fb_previousinum = ip->i_number;
5173	freeblks->fb_devvp = ip->i_devvp;
5174	freeblks->fb_chkcnt = 0;
5175	ACQUIRE_LOCK(&lk);
5176	/*
5177	 * If we're truncating a removed file that will never be written
5178	 * we don't need to journal the block frees.  The canceled journals
5179	 * for the allocations will suffice.
5180	 */
5181	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5182	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
5183	    (fs->fs_flags & FS_SUJ) == 0)
5184		needj = 0;
5185	else
5186		needj = 1;
5187	num_freeblkdep++;
5188	FREE_LOCK(&lk);
5189	extblocks = 0;
5190	if (fs->fs_magic == FS_UFS2_MAGIC)
5191		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
5192	datablocks = DIP(ip, i_blocks) - extblocks;
5193	if ((flags & IO_NORMAL) != 0) {
5194		oldsize = ip->i_size;
5195		ip->i_size = 0;
5196		DIP_SET(ip, i_size, 0);
5197		freeblks->fb_chkcnt = datablocks;
5198		for (i = 0; i < NDADDR; i++) {
5199			blkno = DIP(ip, i_db[i]);
5200			DIP_SET(ip, i_db[i], 0);
5201			if (blkno == 0)
5202				continue;
5203			frags = sblksize(fs, oldsize, i);
5204			frags = numfrags(fs, frags);
5205			newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags,
5206			    needj);
5207		}
5208		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
5209		    i++, tmpval *= NINDIR(fs)) {
5210			blkno = DIP(ip, i_ib[i]);
5211			DIP_SET(ip, i_ib[i], 0);
5212			if (blkno)
5213				newfreework(ip->i_ump, freeblks, NULL, -lbn - i,
5214				    blkno, fs->fs_frag, needj);
5215			lbn += tmpval;
5216		}
5217		UFS_LOCK(ip->i_ump);
5218		fs->fs_pendingblocks += datablocks;
5219		UFS_UNLOCK(ip->i_ump);
5220	}
5221	if ((flags & IO_EXT) != 0) {
5222		oldextsize = ip->i_din2->di_extsize;
5223		ip->i_din2->di_extsize = 0;
5224		freeblks->fb_chkcnt += extblocks;
5225		for (i = 0; i < NXADDR; i++) {
5226			blkno = ip->i_din2->di_extb[i];
5227			ip->i_din2->di_extb[i] = 0;
5228			if (blkno == 0)
5229				continue;
5230			frags = sblksize(fs, oldextsize, i);
5231			frags = numfrags(fs, frags);
5232			newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno,
5233			    frags, needj);
5234		}
5235	}
5236	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
5237		needj = 0;
5238	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
5239	/*
5240	 * Push the zero'ed inode to to its disk buffer so that we are free
5241	 * to delete its dependencies below. Once the dependencies are gone
5242	 * the buffer can be safely released.
5243	 */
5244	if ((error = bread(ip->i_devvp,
5245	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
5246	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
5247		brelse(bp);
5248		softdep_error("softdep_setup_freeblocks", error);
5249	}
5250	if (ip->i_ump->um_fstype == UFS1) {
5251		dp1 = ((struct ufs1_dinode *)bp->b_data +
5252		    ino_to_fsbo(fs, ip->i_number));
5253		ip->i_din1->di_freelink = dp1->di_freelink;
5254		*dp1 = *ip->i_din1;
5255	} else {
5256		dp2 = ((struct ufs2_dinode *)bp->b_data +
5257		    ino_to_fsbo(fs, ip->i_number));
5258		ip->i_din2->di_freelink = dp2->di_freelink;
5259		*dp2 = *ip->i_din2;
5260	}
5261	/*
5262	 * Find and eliminate any inode dependencies.
5263	 */
5264	ACQUIRE_LOCK(&lk);
5265	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5266	if ((inodedep->id_state & IOSTARTED) != 0)
5267		panic("softdep_setup_freeblocks: inode busy");
5268	/*
5269	 * Add the freeblks structure to the list of operations that
5270	 * must await the zero'ed inode being written to disk. If we
5271	 * still have a bitmap dependency (delay == 0), then the inode
5272	 * has never been written to disk, so we can process the
5273	 * freeblks below once we have deleted the dependencies.
5274	 */
5275	delay = (inodedep->id_state & DEPCOMPLETE);
5276	if (delay)
5277		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
5278	else if (needj)
5279		freeblks->fb_state |= COMPLETE;
5280	/*
5281	 * Because the file length has been truncated to zero, any
5282	 * pending block allocation dependency structures associated
5283	 * with this inode are obsolete and can simply be de-allocated.
5284	 * We must first merge the two dependency lists to get rid of
5285	 * any duplicate freefrag structures, then purge the merged list.
5286	 * If we still have a bitmap dependency, then the inode has never
5287	 * been written to disk, so we can free any fragments without delay.
5288	 */
5289	if (flags & IO_NORMAL) {
5290		merge_inode_lists(&inodedep->id_newinoupdt,
5291		    &inodedep->id_inoupdt);
5292		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
5293			cancel_allocdirect(&inodedep->id_inoupdt, adp,
5294			    freeblks, delay);
5295	}
5296	if (flags & IO_EXT) {
5297		merge_inode_lists(&inodedep->id_newextupdt,
5298		    &inodedep->id_extupdt);
5299		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
5300			cancel_allocdirect(&inodedep->id_extupdt, adp,
5301			    freeblks, delay);
5302	}
5303	LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
5304		add_to_journal(&jfreeblk->jf_list);
5305
5306	FREE_LOCK(&lk);
5307	bdwrite(bp);
5308	/*
5309	 * We must wait for any I/O in progress to finish so that
5310	 * all potential buffers on the dirty list will be visible.
5311	 * Once they are all there, walk the list and get rid of
5312	 * any dependencies.
5313	 */
5314	vp = ITOV(ip);
5315	bo = &vp->v_bufobj;
5316	BO_LOCK(bo);
5317	drain_output(vp);
5318restart:
5319	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
5320		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
5321		    ((flags & IO_NORMAL) == 0 &&
5322		      (bp->b_xflags & BX_ALTDATA) == 0))
5323			continue;
5324		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
5325			goto restart;
5326		BO_UNLOCK(bo);
5327		ACQUIRE_LOCK(&lk);
5328		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
5329		if (deallocate_dependencies(bp, inodedep, freeblks))
5330			bp->b_flags |= B_INVAL | B_NOCACHE;
5331		FREE_LOCK(&lk);
5332		brelse(bp);
5333		BO_LOCK(bo);
5334		goto restart;
5335	}
5336	BO_UNLOCK(bo);
5337	ACQUIRE_LOCK(&lk);
5338	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
5339		(void) free_inodedep(inodedep);
5340
5341	if (delay || needj)
5342		freeblks->fb_state |= DEPCOMPLETE;
5343	if (delay) {
5344		/*
5345		 * If the inode with zeroed block pointers is now on disk
5346		 * we can start freeing blocks. Add freeblks to the worklist
5347		 * instead of calling  handle_workitem_freeblocks directly as
5348		 * it is more likely that additional IO is needed to complete
5349		 * the request here than in the !delay case.
5350		 */
5351		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
5352			add_to_worklist(&freeblks->fb_list, 1);
5353	}
5354	if (needj && LIST_EMPTY(&freeblks->fb_jfreeblkhd))
5355		needj = 0;
5356
5357	FREE_LOCK(&lk);
5358	/*
5359	 * If the inode has never been written to disk (delay == 0) and
5360	 * we're not waiting on any journal writes, then we can process the
5361	 * freeblks now that we have deleted the dependencies.
5362	 */
5363	if (!delay && !needj)
5364		handle_workitem_freeblocks(freeblks, 0);
5365}
5366
5367/*
5368 * Reclaim any dependency structures from a buffer that is about to
5369 * be reallocated to a new vnode. The buffer must be locked, thus,
5370 * no I/O completion operations can occur while we are manipulating
5371 * its associated dependencies. The mutex is held so that other I/O's
5372 * associated with related dependencies do not occur.  Returns 1 if
5373 * all dependencies were cleared, 0 otherwise.
5374 */
5375static int
5376deallocate_dependencies(bp, inodedep, freeblks)
5377	struct buf *bp;
5378	struct inodedep *inodedep;
5379	struct freeblks *freeblks;
5380{
5381	struct worklist *wk;
5382	struct indirdep *indirdep;
5383	struct newdirblk *newdirblk;
5384	struct allocindir *aip;
5385	struct pagedep *pagedep;
5386	struct jremref *jremref;
5387	struct jmvref *jmvref;
5388	struct dirrem *dirrem;
5389	int i;
5390
5391	mtx_assert(&lk, MA_OWNED);
5392	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
5393		switch (wk->wk_type) {
5394
5395		case D_INDIRDEP:
5396			indirdep = WK_INDIRDEP(wk);
5397			if (bp->b_lblkno >= 0 ||
5398			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
5399				panic("deallocate_dependencies: not indir");
5400			cancel_indirdep(indirdep, bp, inodedep, freeblks);
5401			continue;
5402
5403		case D_PAGEDEP:
5404			pagedep = WK_PAGEDEP(wk);
5405			/*
5406			 * There should be no directory add dependencies present
5407			 * as the directory could not be truncated until all
5408			 * children were removed.
5409			 */
5410			KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
5411			    ("deallocate_dependencies: pendinghd != NULL"));
5412			for (i = 0; i < DAHASHSZ; i++)
5413				KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
5414				    ("deallocate_dependencies: diraddhd != NULL"));
5415			/*
5416			 * Copy any directory remove dependencies to the list
5417			 * to be processed after the zero'ed inode is written.
5418			 * If the inode has already been written, then they
5419			 * can be dumped directly onto the work list.
5420			 */
5421			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
5422				/*
5423				 * If there are any dirrems we wait for
5424				 * the journal write to complete and
5425				 * then restart the buf scan as the lock
5426				 * has been dropped.
5427				 */
5428				while ((jremref =
5429				    LIST_FIRST(&dirrem->dm_jremrefhd))
5430				    != NULL) {
5431					stat_jwait_filepage++;
5432					jwait(&jremref->jr_list);
5433					return (0);
5434				}
5435				LIST_REMOVE(dirrem, dm_next);
5436				dirrem->dm_dirinum = pagedep->pd_ino;
5437				if (inodedep == NULL ||
5438				    (inodedep->id_state & ALLCOMPLETE) ==
5439				     ALLCOMPLETE) {
5440					dirrem->dm_state |= COMPLETE;
5441					add_to_worklist(&dirrem->dm_list, 0);
5442				} else
5443					WORKLIST_INSERT(&inodedep->id_bufwait,
5444					    &dirrem->dm_list);
5445			}
5446			if ((pagedep->pd_state & NEWBLOCK) != 0) {
5447				newdirblk = pagedep->pd_newdirblk;
5448				WORKLIST_REMOVE(&newdirblk->db_list);
5449				free_newdirblk(newdirblk);
5450			}
5451			while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
5452			    != NULL) {
5453				stat_jwait_filepage++;
5454				jwait(&jmvref->jm_list);
5455				return (0);
5456			}
5457			WORKLIST_REMOVE(&pagedep->pd_list);
5458			LIST_REMOVE(pagedep, pd_hash);
5459			WORKITEM_FREE(pagedep, D_PAGEDEP);
5460			continue;
5461
5462		case D_ALLOCINDIR:
5463			aip = WK_ALLOCINDIR(wk);
5464			cancel_allocindir(aip, inodedep, freeblks);
5465			continue;
5466
5467		case D_ALLOCDIRECT:
5468		case D_INODEDEP:
5469			panic("deallocate_dependencies: Unexpected type %s",
5470			    TYPENAME(wk->wk_type));
5471			/* NOTREACHED */
5472
5473		default:
5474			panic("deallocate_dependencies: Unknown type %s",
5475			    TYPENAME(wk->wk_type));
5476			/* NOTREACHED */
5477		}
5478	}
5479
5480	return (1);
5481}
5482
5483/*
5484 * An allocdirect is being canceled due to a truncate.  We must make sure
5485 * the journal entry is released in concert with the blkfree that releases
5486 * the storage.  Completed journal entries must not be released until the
5487 * space is no longer pointed to by the inode or in the bitmap.
5488 */
5489static void
5490cancel_allocdirect(adphead, adp, freeblks, delay)
5491	struct allocdirectlst *adphead;
5492	struct allocdirect *adp;
5493	struct freeblks *freeblks;
5494	int delay;
5495{
5496	struct freework *freework;
5497	struct newblk *newblk;
5498	struct worklist *wk;
5499	ufs_lbn_t lbn;
5500
5501	TAILQ_REMOVE(adphead, adp, ad_next);
5502	newblk = (struct newblk *)adp;
5503	/*
5504	 * If the journal hasn't been written the jnewblk must be passed
5505	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
5506	 * this by linking the journal dependency into the freework to be
5507	 * freed when freework_freeblock() is called.  If the journal has
5508	 * been written we can simply reclaim the journal space when the
5509	 * freeblks work is complete.
5510	 */
5511	if (newblk->nb_jnewblk == NULL) {
5512		cancel_newblk(newblk, &freeblks->fb_jwork);
5513		goto found;
5514	}
5515	lbn = newblk->nb_jnewblk->jn_lbn;
5516	/*
5517	 * Find the correct freework structure so it releases the canceled
5518	 * journal when the bitmap is cleared.  This preserves rollback
5519	 * until the allocation is reverted.
5520	 */
5521	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
5522		freework = WK_FREEWORK(wk);
5523		if (freework->fw_lbn != lbn)
5524			continue;
5525		cancel_newblk(newblk, &freework->fw_jwork);
5526		goto found;
5527	}
5528	panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
5529found:
5530	if (delay)
5531		WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
5532		    &newblk->nb_list);
5533	else
5534		free_newblk(newblk);
5535	return;
5536}
5537
5538
5539static void
5540cancel_newblk(newblk, wkhd)
5541	struct newblk *newblk;
5542	struct workhead *wkhd;
5543{
5544	struct indirdep *indirdep;
5545	struct allocindir *aip;
5546
5547	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5548		indirdep->ir_state &= ~ONDEPLIST;
5549		LIST_REMOVE(indirdep, ir_next);
5550		/*
5551		 * If an indirdep is not on the buf worklist we need to
5552		 * free it here as deallocate_dependencies() will never
5553		 * find it.  These pointers were never visible on disk and
5554		 * can be discarded immediately.
5555		 */
5556		while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5557			LIST_REMOVE(aip, ai_next);
5558			cancel_newblk(&aip->ai_block, wkhd);
5559			free_newblk(&aip->ai_block);
5560		}
5561		/*
5562		 * If this indirdep is not attached to a buf it was simply
5563		 * waiting on completion to clear completehd.  free_indirdep()
5564		 * asserts that nothing is dangling.
5565		 */
5566		if ((indirdep->ir_state & ONWORKLIST) == 0)
5567			free_indirdep(indirdep);
5568	}
5569	if (newblk->nb_state & ONDEPLIST) {
5570		newblk->nb_state &= ~ONDEPLIST;
5571		LIST_REMOVE(newblk, nb_deps);
5572	}
5573	if (newblk->nb_state & ONWORKLIST)
5574		WORKLIST_REMOVE(&newblk->nb_list);
5575	/*
5576	 * If the journal entry hasn't been written we hold onto the dep
5577	 * until it is safe to free along with the other journal work.
5578	 */
5579	if (newblk->nb_jnewblk != NULL) {
5580		cancel_jnewblk(newblk->nb_jnewblk, wkhd);
5581		newblk->nb_jnewblk = NULL;
5582	}
5583	if (!LIST_EMPTY(&newblk->nb_jwork))
5584		jwork_move(wkhd, &newblk->nb_jwork);
5585}
5586
5587/*
5588 * Free a newblk. Generate a new freefrag work request if appropriate.
5589 * This must be called after the inode pointer and any direct block pointers
5590 * are valid or fully removed via truncate or frag extension.
5591 */
5592static void
5593free_newblk(newblk)
5594	struct newblk *newblk;
5595{
5596	struct indirdep *indirdep;
5597	struct newdirblk *newdirblk;
5598	struct freefrag *freefrag;
5599	struct worklist *wk;
5600
5601	mtx_assert(&lk, MA_OWNED);
5602	if (newblk->nb_state & ONDEPLIST)
5603		LIST_REMOVE(newblk, nb_deps);
5604	if (newblk->nb_state & ONWORKLIST)
5605		WORKLIST_REMOVE(&newblk->nb_list);
5606	LIST_REMOVE(newblk, nb_hash);
5607	if ((freefrag = newblk->nb_freefrag) != NULL) {
5608		freefrag->ff_state |= COMPLETE;
5609		if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
5610			add_to_worklist(&freefrag->ff_list, 0);
5611	}
5612	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
5613		newdirblk = WK_NEWDIRBLK(wk);
5614		WORKLIST_REMOVE(&newdirblk->db_list);
5615		if (!LIST_EMPTY(&newblk->nb_newdirblk))
5616			panic("free_newblk: extra newdirblk");
5617		free_newdirblk(newdirblk);
5618	}
5619	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5620		indirdep->ir_state |= DEPCOMPLETE;
5621		indirdep_complete(indirdep);
5622	}
5623	KASSERT(newblk->nb_jnewblk == NULL,
5624	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
5625	handle_jwork(&newblk->nb_jwork);
5626	newblk->nb_list.wk_type = D_NEWBLK;
5627	WORKITEM_FREE(newblk, D_NEWBLK);
5628}
5629
5630/*
5631 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
5632 * This routine must be called with splbio interrupts blocked.
5633 */
5634static void
5635free_newdirblk(newdirblk)
5636	struct newdirblk *newdirblk;
5637{
5638	struct pagedep *pagedep;
5639	struct diradd *dap;
5640	struct worklist *wk;
5641	int i;
5642
5643	mtx_assert(&lk, MA_OWNED);
5644	/*
5645	 * If the pagedep is still linked onto the directory buffer
5646	 * dependency chain, then some of the entries on the
5647	 * pd_pendinghd list may not be committed to disk yet. In
5648	 * this case, we will simply clear the NEWBLOCK flag and
5649	 * let the pd_pendinghd list be processed when the pagedep
5650	 * is next written. If the pagedep is no longer on the buffer
5651	 * dependency chain, then all the entries on the pd_pending
5652	 * list are committed to disk and we can free them here.
5653	 */
5654	pagedep = newdirblk->db_pagedep;
5655	pagedep->pd_state &= ~NEWBLOCK;
5656	if ((pagedep->pd_state & ONWORKLIST) == 0)
5657		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
5658			free_diradd(dap, NULL);
5659	/*
5660	 * If no dependencies remain, the pagedep will be freed.
5661	 */
5662	for (i = 0; i < DAHASHSZ; i++)
5663		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
5664			break;
5665	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
5666	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
5667		KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
5668		    ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
5669		LIST_REMOVE(pagedep, pd_hash);
5670		WORKITEM_FREE(pagedep, D_PAGEDEP);
5671	}
5672	/* Should only ever be one item in the list. */
5673	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
5674		WORKLIST_REMOVE(wk);
5675		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
5676	}
5677	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
5678}
5679
5680/*
5681 * Prepare an inode to be freed. The actual free operation is not
5682 * done until the zero'ed inode has been written to disk.
5683 */
5684void
5685softdep_freefile(pvp, ino, mode)
5686	struct vnode *pvp;
5687	ino_t ino;
5688	int mode;
5689{
5690	struct inode *ip = VTOI(pvp);
5691	struct inodedep *inodedep;
5692	struct freefile *freefile;
5693
5694	/*
5695	 * This sets up the inode de-allocation dependency.
5696	 */
5697	freefile = malloc(sizeof(struct freefile),
5698		M_FREEFILE, M_SOFTDEP_FLAGS);
5699	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
5700	freefile->fx_mode = mode;
5701	freefile->fx_oldinum = ino;
5702	freefile->fx_devvp = ip->i_devvp;
5703	LIST_INIT(&freefile->fx_jwork);
5704	UFS_LOCK(ip->i_ump);
5705	ip->i_fs->fs_pendinginodes += 1;
5706	UFS_UNLOCK(ip->i_ump);
5707
5708	/*
5709	 * If the inodedep does not exist, then the zero'ed inode has
5710	 * been written to disk. If the allocated inode has never been
5711	 * written to disk, then the on-disk inode is zero'ed. In either
5712	 * case we can free the file immediately.  If the journal was
5713	 * canceled before being written the inode will never make it to
5714	 * disk and we must send the canceled journal entrys to
5715	 * ffs_freefile() to be cleared in conjunction with the bitmap.
5716	 * Any blocks waiting on the inode to write can be safely freed
5717	 * here as it will never been written.
5718	 */
5719	ACQUIRE_LOCK(&lk);
5720	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5721	/*
5722	 * Remove this inode from the unlinked list and set
5723	 * GOINGAWAY as appropriate to indicate that this inode
5724	 * will never be written.
5725	 */
5726	if (inodedep && inodedep->id_state & UNLINKED) {
5727		/*
5728		 * Save the journal work to be freed with the bitmap
5729		 * before we clear UNLINKED.  Otherwise it can be lost
5730		 * if the inode block is written.
5731		 */
5732		handle_bufwait(inodedep, &freefile->fx_jwork);
5733		clear_unlinked_inodedep(inodedep);
5734		/* Re-acquire inodedep as we've dropped lk. */
5735		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5736		if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
5737			inodedep->id_state |= GOINGAWAY;
5738	}
5739	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
5740		FREE_LOCK(&lk);
5741		handle_workitem_freefile(freefile);
5742		return;
5743	}
5744	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
5745	FREE_LOCK(&lk);
5746	if (ip->i_number == ino)
5747		ip->i_flag |= IN_MODIFIED;
5748}
5749
5750/*
5751 * Check to see if an inode has never been written to disk. If
5752 * so free the inodedep and return success, otherwise return failure.
5753 * This routine must be called with splbio interrupts blocked.
5754 *
5755 * If we still have a bitmap dependency, then the inode has never
5756 * been written to disk. Drop the dependency as it is no longer
5757 * necessary since the inode is being deallocated. We set the
5758 * ALLCOMPLETE flags since the bitmap now properly shows that the
5759 * inode is not allocated. Even if the inode is actively being
5760 * written, it has been rolled back to its zero'ed state, so we
5761 * are ensured that a zero inode is what is on the disk. For short
5762 * lived files, this change will usually result in removing all the
5763 * dependencies from the inode so that it can be freed immediately.
5764 */
5765static int
5766check_inode_unwritten(inodedep)
5767	struct inodedep *inodedep;
5768{
5769
5770	mtx_assert(&lk, MA_OWNED);
5771
5772	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
5773	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
5774	    !LIST_EMPTY(&inodedep->id_bufwait) ||
5775	    !LIST_EMPTY(&inodedep->id_inowait) ||
5776	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5777	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
5778	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5779	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5780	    inodedep->id_mkdiradd != NULL ||
5781	    inodedep->id_nlinkdelta != 0)
5782		return (0);
5783	/*
5784	 * Another process might be in initiate_write_inodeblock_ufs[12]
5785	 * trying to allocate memory without holding "Softdep Lock".
5786	 */
5787	if ((inodedep->id_state & IOSTARTED) != 0 &&
5788	    inodedep->id_savedino1 == NULL)
5789		return (0);
5790
5791	if (inodedep->id_state & ONDEPLIST)
5792		LIST_REMOVE(inodedep, id_deps);
5793	inodedep->id_state &= ~ONDEPLIST;
5794	inodedep->id_state |= ALLCOMPLETE;
5795	inodedep->id_bmsafemap = NULL;
5796	if (inodedep->id_state & ONWORKLIST)
5797		WORKLIST_REMOVE(&inodedep->id_list);
5798	if (inodedep->id_savedino1 != NULL) {
5799		free(inodedep->id_savedino1, M_SAVEDINO);
5800		inodedep->id_savedino1 = NULL;
5801	}
5802	if (free_inodedep(inodedep) == 0)
5803		panic("check_inode_unwritten: busy inode");
5804	return (1);
5805}
5806
5807/*
5808 * Try to free an inodedep structure. Return 1 if it could be freed.
5809 */
5810static int
5811free_inodedep(inodedep)
5812	struct inodedep *inodedep;
5813{
5814
5815	mtx_assert(&lk, MA_OWNED);
5816	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
5817	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
5818	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
5819	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
5820	    !LIST_EMPTY(&inodedep->id_bufwait) ||
5821	    !LIST_EMPTY(&inodedep->id_inowait) ||
5822	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
5823	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5824	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
5825	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5826	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5827	    inodedep->id_mkdiradd != NULL ||
5828	    inodedep->id_nlinkdelta != 0 ||
5829	    inodedep->id_savedino1 != NULL)
5830		return (0);
5831	if (inodedep->id_state & ONDEPLIST)
5832		LIST_REMOVE(inodedep, id_deps);
5833	LIST_REMOVE(inodedep, id_hash);
5834	WORKITEM_FREE(inodedep, D_INODEDEP);
5835	num_inodedep -= 1;
5836	return (1);
5837}
5838
5839/*
5840 * Free the block referenced by a freework structure.  The parent freeblks
5841 * structure is released and completed when the final cg bitmap reaches
5842 * the disk.  This routine may be freeing a jnewblk which never made it to
5843 * disk in which case we do not have to wait as the operation is undone
5844 * in memory immediately.
5845 */
5846static void
5847freework_freeblock(freework)
5848	struct freework *freework;
5849{
5850	struct freeblks *freeblks;
5851	struct ufsmount *ump;
5852	struct workhead wkhd;
5853	struct fs *fs;
5854	int complete;
5855	int pending;
5856	int bsize;
5857	int needj;
5858
5859	freeblks = freework->fw_freeblks;
5860	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5861	fs = ump->um_fs;
5862	needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
5863	complete = 0;
5864	LIST_INIT(&wkhd);
5865	/*
5866	 * If we are canceling an existing jnewblk pass it to the free
5867	 * routine, otherwise pass the freeblk which will ultimately
5868	 * release the freeblks.  If we're not journaling, we can just
5869	 * free the freeblks immediately.
5870	 */
5871	if (!LIST_EMPTY(&freework->fw_jwork)) {
5872		LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
5873		complete = 1;
5874	} else if (needj)
5875		WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
5876	bsize = lfragtosize(fs, freework->fw_frags);
5877	pending = btodb(bsize);
5878	ACQUIRE_LOCK(&lk);
5879	freeblks->fb_chkcnt -= pending;
5880	FREE_LOCK(&lk);
5881	/*
5882	 * extattr blocks don't show up in pending blocks.  XXX why?
5883	 */
5884	if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
5885		UFS_LOCK(ump);
5886		fs->fs_pendingblocks -= pending;
5887		UFS_UNLOCK(ump);
5888	}
5889	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
5890	    bsize, freeblks->fb_previousinum, &wkhd);
5891	if (complete == 0 && needj)
5892		return;
5893	/*
5894	 * The jnewblk will be discarded and the bits in the map never
5895	 * made it to disk.  We can immediately free the freeblk.
5896	 */
5897	ACQUIRE_LOCK(&lk);
5898	handle_written_freework(freework);
5899	FREE_LOCK(&lk);
5900}
5901
5902/*
5903 * Start, continue, or finish the process of freeing an indirect block tree.
5904 * The free operation may be paused at any point with fw_off containing the
5905 * offset to restart from.  This enables us to implement some flow control
5906 * for large truncates which may fan out and generate a huge number of
5907 * dependencies.
5908 */
5909static void
5910handle_workitem_indirblk(freework)
5911	struct freework *freework;
5912{
5913	struct freeblks *freeblks;
5914	struct ufsmount *ump;
5915	struct fs *fs;
5916
5917
5918	freeblks = freework->fw_freeblks;
5919	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5920	fs = ump->um_fs;
5921	if (freework->fw_off == NINDIR(fs))
5922		freework_freeblock(freework);
5923	else
5924		indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
5925		    freework->fw_lbn);
5926}
5927
5928/*
5929 * Called when a freework structure attached to a cg buf is written.  The
5930 * ref on either the parent or the freeblks structure is released and
5931 * either may be added to the worklist if it is the final ref.
5932 */
5933static void
5934handle_written_freework(freework)
5935	struct freework *freework;
5936{
5937	struct freeblks *freeblks;
5938	struct freework *parent;
5939
5940	freeblks = freework->fw_freeblks;
5941	parent = freework->fw_parent;
5942	if (parent) {
5943		if (--parent->fw_ref != 0)
5944			parent = NULL;
5945		freeblks = NULL;
5946	} else if (--freeblks->fb_ref != 0)
5947		freeblks = NULL;
5948	WORKITEM_FREE(freework, D_FREEWORK);
5949	/*
5950	 * Don't delay these block frees or it takes an intolerable amount
5951	 * of time to process truncates and free their journal entries.
5952	 */
5953	if (freeblks)
5954		add_to_worklist(&freeblks->fb_list, 1);
5955	if (parent)
5956		add_to_worklist(&parent->fw_list, 1);
5957}
5958
5959/*
5960 * This workitem routine performs the block de-allocation.
5961 * The workitem is added to the pending list after the updated
5962 * inode block has been written to disk.  As mentioned above,
5963 * checks regarding the number of blocks de-allocated (compared
5964 * to the number of blocks allocated for the file) are also
5965 * performed in this function.
5966 */
5967static void
5968handle_workitem_freeblocks(freeblks, flags)
5969	struct freeblks *freeblks;
5970	int flags;
5971{
5972	struct freework *freework;
5973	struct worklist *wk;
5974
5975	KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
5976	    ("handle_workitem_freeblocks: Journal entries not written."));
5977	if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
5978		handle_complete_freeblocks(freeblks);
5979		return;
5980	}
5981	freeblks->fb_ref++;
5982	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
5983		KASSERT(wk->wk_type == D_FREEWORK,
5984		    ("handle_workitem_freeblocks: Unknown type %s",
5985		    TYPENAME(wk->wk_type)));
5986		WORKLIST_REMOVE_UNLOCKED(wk);
5987		freework = WK_FREEWORK(wk);
5988		if (freework->fw_lbn <= -NDADDR)
5989			handle_workitem_indirblk(freework);
5990		else
5991			freework_freeblock(freework);
5992	}
5993	ACQUIRE_LOCK(&lk);
5994	if (--freeblks->fb_ref != 0)
5995		freeblks = NULL;
5996	FREE_LOCK(&lk);
5997	if (freeblks)
5998		handle_complete_freeblocks(freeblks);
5999}
6000
6001/*
6002 * Once all of the freework workitems are complete we can retire the
6003 * freeblocks dependency and any journal work awaiting completion.  This
6004 * can not be called until all other dependencies are stable on disk.
6005 */
6006static void
6007handle_complete_freeblocks(freeblks)
6008	struct freeblks *freeblks;
6009{
6010	struct inode *ip;
6011	struct vnode *vp;
6012	struct fs *fs;
6013	struct ufsmount *ump;
6014	int flags;
6015
6016	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6017	fs = ump->um_fs;
6018	flags = LK_NOWAIT;
6019
6020	/*
6021	 * If we still have not finished background cleanup, then check
6022	 * to see if the block count needs to be adjusted.
6023	 */
6024	if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
6025	    ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
6026	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
6027		ip = VTOI(vp);
6028		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
6029		ip->i_flag |= IN_CHANGE;
6030		vput(vp);
6031	}
6032
6033	if (!(freeblks->fb_chkcnt == 0 ||
6034	    ((fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0)))
6035	        printf(
6036	"handle_workitem_freeblocks: inode %ju block count %jd\n",
6037		   (uintmax_t)freeblks->fb_previousinum,
6038		   (intmax_t)freeblks->fb_chkcnt);
6039
6040	ACQUIRE_LOCK(&lk);
6041	/*
6042	 * All of the freeblock deps must be complete prior to this call
6043	 * so it's now safe to complete earlier outstanding journal entries.
6044	 */
6045	handle_jwork(&freeblks->fb_jwork);
6046	WORKITEM_FREE(freeblks, D_FREEBLKS);
6047	num_freeblkdep--;
6048	FREE_LOCK(&lk);
6049}
6050
6051/*
6052 * Release blocks associated with the inode ip and stored in the indirect
6053 * block dbn. If level is greater than SINGLE, the block is an indirect block
6054 * and recursive calls to indirtrunc must be used to cleanse other indirect
6055 * blocks.
6056 */
6057static void
6058indir_trunc(freework, dbn, lbn)
6059	struct freework *freework;
6060	ufs2_daddr_t dbn;
6061	ufs_lbn_t lbn;
6062{
6063	struct freework *nfreework;
6064	struct workhead wkhd;
6065	struct jnewblk *jnewblk;
6066	struct freeblks *freeblks;
6067	struct buf *bp;
6068	struct fs *fs;
6069	struct worklist *wkn;
6070	struct worklist *wk;
6071	struct indirdep *indirdep;
6072	struct ufsmount *ump;
6073	ufs1_daddr_t *bap1 = 0;
6074	ufs2_daddr_t nb, nnb, *bap2 = 0;
6075	ufs_lbn_t lbnadd;
6076	int i, nblocks, ufs1fmt;
6077	int fs_pendingblocks;
6078	int freedeps;
6079	int needj;
6080	int level;
6081	int cnt;
6082
6083	LIST_INIT(&wkhd);
6084	level = lbn_level(lbn);
6085	if (level == -1)
6086		panic("indir_trunc: Invalid lbn %jd\n", lbn);
6087	freeblks = freework->fw_freeblks;
6088	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6089	fs = ump->um_fs;
6090	fs_pendingblocks = 0;
6091	freedeps = 0;
6092	needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
6093	lbnadd = lbn_offset(fs, level);
6094	/*
6095	 * Get buffer of block pointers to be freed. This routine is not
6096	 * called until the zero'ed inode has been written, so it is safe
6097	 * to free blocks as they are encountered. Because the inode has
6098	 * been zero'ed, calls to bmap on these blocks will fail. So, we
6099	 * have to use the on-disk address and the block device for the
6100	 * filesystem to look them up. If the file was deleted before its
6101	 * indirect blocks were all written to disk, the routine that set
6102	 * us up (deallocate_dependencies) will have arranged to leave
6103	 * a complete copy of the indirect block in memory for our use.
6104	 * Otherwise we have to read the blocks in from the disk.
6105	 */
6106#ifdef notyet
6107	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
6108	    GB_NOCREAT);
6109#else
6110	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
6111#endif
6112	ACQUIRE_LOCK(&lk);
6113	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
6114		if (wk->wk_type != D_INDIRDEP ||
6115		    (wk->wk_state & GOINGAWAY) == 0)
6116			panic("indir_trunc: lost indirdep %p", wk);
6117		indirdep = WK_INDIRDEP(wk);
6118		LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
6119		free_indirdep(indirdep);
6120		if (!LIST_EMPTY(&bp->b_dep))
6121			panic("indir_trunc: dangling dep %p",
6122			    LIST_FIRST(&bp->b_dep));
6123		ump->um_numindirdeps -= 1;
6124		FREE_LOCK(&lk);
6125	} else {
6126#ifdef notyet
6127		if (bp)
6128			brelse(bp);
6129#endif
6130		FREE_LOCK(&lk);
6131		if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
6132		    NOCRED, &bp) != 0) {
6133			brelse(bp);
6134			return;
6135		}
6136	}
6137	/*
6138	 * Recursively free indirect blocks.
6139	 */
6140	if (ump->um_fstype == UFS1) {
6141		ufs1fmt = 1;
6142		bap1 = (ufs1_daddr_t *)bp->b_data;
6143	} else {
6144		ufs1fmt = 0;
6145		bap2 = (ufs2_daddr_t *)bp->b_data;
6146	}
6147
6148	/*
6149	 * Reclaim indirect blocks which never made it to disk.
6150	 */
6151	cnt = 0;
6152	LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
6153		if (wk->wk_type != D_JNEWBLK)
6154			continue;
6155		ACQUIRE_LOCK(&lk);
6156		WORKLIST_REMOVE(wk);
6157		FREE_LOCK(&lk);
6158		jnewblk = WK_JNEWBLK(wk);
6159		if (jnewblk->jn_lbn > 0)
6160			i = (jnewblk->jn_lbn - -lbn) / lbnadd;
6161		else
6162			i = (-(jnewblk->jn_lbn + level - 1) - -(lbn + level)) /
6163			    lbnadd;
6164		KASSERT(i >= 0 && i < NINDIR(fs),
6165		    ("indir_trunc: Index out of range %d parent %jd lbn %jd level %d",
6166		    i, lbn, jnewblk->jn_lbn, level));
6167		/* Clear the pointer so it isn't found below. */
6168		if (ufs1fmt) {
6169			nb = bap1[i];
6170			bap1[i] = 0;
6171		} else {
6172			nb = bap2[i];
6173			bap2[i] = 0;
6174		}
6175		KASSERT(nb == jnewblk->jn_blkno,
6176		    ("indir_trunc: Block mismatch %jd != %jd",
6177		    nb, jnewblk->jn_blkno));
6178		if (level != 0) {
6179			ufs_lbn_t nlbn;
6180
6181			nlbn = (lbn + 1) - (i * lbnadd);
6182			nfreework = newfreework(ump, freeblks, freework,
6183			    nlbn, nb, fs->fs_frag, 0);
6184			WORKLIST_INSERT_UNLOCKED(&nfreework->fw_jwork, wk);
6185			freedeps++;
6186			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
6187		} else {
6188			struct workhead freewk;
6189
6190			LIST_INIT(&freewk);
6191			ACQUIRE_LOCK(&lk);
6192			WORKLIST_INSERT(&freewk, wk);
6193			FREE_LOCK(&lk);
6194			ffs_blkfree(ump, fs, freeblks->fb_devvp,
6195			    jnewblk->jn_blkno, fs->fs_bsize,
6196			    freeblks->fb_previousinum, &freewk);
6197		}
6198		cnt++;
6199	}
6200	ACQUIRE_LOCK(&lk);
6201	/* Any remaining journal work can be completed with freeblks. */
6202	jwork_move(&freeblks->fb_jwork, &wkhd);
6203	FREE_LOCK(&lk);
6204	nblocks = btodb(fs->fs_bsize);
6205	if (ufs1fmt)
6206		nb = bap1[0];
6207	else
6208		nb = bap2[0];
6209	nfreework = freework;
6210	/*
6211	 * Reclaim on disk blocks.
6212	 */
6213	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
6214		if (i != NINDIR(fs) - 1) {
6215			if (ufs1fmt)
6216				nnb = bap1[i+1];
6217			else
6218				nnb = bap2[i+1];
6219		} else
6220			nnb = 0;
6221		if (nb == 0)
6222			continue;
6223		cnt++;
6224		if (level != 0) {
6225			ufs_lbn_t nlbn;
6226
6227			nlbn = (lbn + 1) - (i * lbnadd);
6228			if (needj != 0) {
6229				nfreework = newfreework(ump, freeblks, freework,
6230				    nlbn, nb, fs->fs_frag, 0);
6231				freedeps++;
6232			}
6233			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
6234		} else {
6235			struct freedep *freedep;
6236
6237			/*
6238			 * Attempt to aggregate freedep dependencies for
6239			 * all blocks being released to the same CG.
6240			 */
6241			LIST_INIT(&wkhd);
6242			if (needj != 0 &&
6243			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
6244				freedep = newfreedep(freework);
6245				WORKLIST_INSERT_UNLOCKED(&wkhd,
6246				    &freedep->fd_list);
6247				freedeps++;
6248			}
6249			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
6250			    fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
6251		}
6252	}
6253	if (level == 0)
6254		fs_pendingblocks = (nblocks * cnt);
6255	/*
6256	 * If we're not journaling we can free the indirect now.  Otherwise
6257	 * setup the ref counts and offset so this indirect can be completed
6258	 * when its children are free.
6259	 */
6260	if (needj == 0) {
6261		fs_pendingblocks += nblocks;
6262		dbn = dbtofsb(fs, dbn);
6263		ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
6264		    freeblks->fb_previousinum, NULL);
6265		ACQUIRE_LOCK(&lk);
6266		freeblks->fb_chkcnt -= fs_pendingblocks;
6267		if (freework->fw_blkno == dbn)
6268			handle_written_freework(freework);
6269		FREE_LOCK(&lk);
6270		freework = NULL;
6271	} else {
6272		ACQUIRE_LOCK(&lk);
6273		freework->fw_off = i;
6274		freework->fw_ref += freedeps;
6275		freework->fw_ref -= NINDIR(fs) + 1;
6276		if (freework->fw_ref != 0)
6277			freework = NULL;
6278		freeblks->fb_chkcnt -= fs_pendingblocks;
6279		FREE_LOCK(&lk);
6280	}
6281	if (fs_pendingblocks) {
6282		UFS_LOCK(ump);
6283		fs->fs_pendingblocks -= fs_pendingblocks;
6284		UFS_UNLOCK(ump);
6285	}
6286	bp->b_flags |= B_INVAL | B_NOCACHE;
6287	brelse(bp);
6288	if (freework)
6289		handle_workitem_indirblk(freework);
6290	return;
6291}
6292
6293/*
6294 * Cancel an allocindir when it is removed via truncation.
6295 */
6296static void
6297cancel_allocindir(aip, inodedep, freeblks)
6298	struct allocindir *aip;
6299	struct inodedep *inodedep;
6300	struct freeblks *freeblks;
6301{
6302	struct newblk *newblk;
6303
6304	/*
6305	 * If the journal hasn't been written the jnewblk must be passed
6306	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
6307	 * this by linking the journal dependency into the indirdep to be
6308	 * freed when indir_trunc() is called.  If the journal has already
6309	 * been written we can simply reclaim the journal space when the
6310	 * freeblks work is complete.
6311	 */
6312	LIST_REMOVE(aip, ai_next);
6313	newblk = (struct newblk *)aip;
6314	if (newblk->nb_jnewblk == NULL)
6315		cancel_newblk(newblk, &freeblks->fb_jwork);
6316	else
6317		cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
6318	if (inodedep && inodedep->id_state & DEPCOMPLETE)
6319		WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
6320	else
6321		free_newblk(newblk);
6322}
6323
6324/*
6325 * Create the mkdir dependencies for . and .. in a new directory.  Link them
6326 * in to a newdirblk so any subsequent additions are tracked properly.  The
6327 * caller is responsible for adding the mkdir1 dependency to the journal
6328 * and updating id_mkdiradd.  This function returns with lk held.
6329 */
6330static struct mkdir *
6331setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
6332	struct diradd *dap;
6333	ino_t newinum;
6334	ino_t dinum;
6335	struct buf *newdirbp;
6336	struct mkdir **mkdirp;
6337{
6338	struct newblk *newblk;
6339	struct pagedep *pagedep;
6340	struct inodedep *inodedep;
6341	struct newdirblk *newdirblk = 0;
6342	struct mkdir *mkdir1, *mkdir2;
6343	struct worklist *wk;
6344	struct jaddref *jaddref;
6345	struct mount *mp;
6346
6347	mp = dap->da_list.wk_mp;
6348	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
6349	    M_SOFTDEP_FLAGS);
6350	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6351	LIST_INIT(&newdirblk->db_mkdir);
6352	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6353	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
6354	mkdir1->md_state = ATTACHED | MKDIR_BODY;
6355	mkdir1->md_diradd = dap;
6356	mkdir1->md_jaddref = NULL;
6357	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6358	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
6359	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
6360	mkdir2->md_diradd = dap;
6361	mkdir2->md_jaddref = NULL;
6362	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
6363		mkdir1->md_state |= DEPCOMPLETE;
6364		mkdir2->md_state |= DEPCOMPLETE;
6365	}
6366	/*
6367	 * Dependency on "." and ".." being written to disk.
6368	 */
6369	mkdir1->md_buf = newdirbp;
6370	ACQUIRE_LOCK(&lk);
6371	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
6372	/*
6373	 * We must link the pagedep, allocdirect, and newdirblk for
6374	 * the initial file page so the pointer to the new directory
6375	 * is not written until the directory contents are live and
6376	 * any subsequent additions are not marked live until the
6377	 * block is reachable via the inode.
6378	 */
6379	if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
6380		panic("setup_newdir: lost pagedep");
6381	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
6382		if (wk->wk_type == D_ALLOCDIRECT)
6383			break;
6384	if (wk == NULL)
6385		panic("setup_newdir: lost allocdirect");
6386	newblk = WK_NEWBLK(wk);
6387	pagedep->pd_state |= NEWBLOCK;
6388	pagedep->pd_newdirblk = newdirblk;
6389	newdirblk->db_pagedep = pagedep;
6390	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6391	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
6392	/*
6393	 * Look up the inodedep for the parent directory so that we
6394	 * can link mkdir2 into the pending dotdot jaddref or
6395	 * the inode write if there is none.  If the inode is
6396	 * ALLCOMPLETE and no jaddref is present all dependencies have
6397	 * been satisfied and mkdir2 can be freed.
6398	 */
6399	inodedep_lookup(mp, dinum, 0, &inodedep);
6400	if (mp->mnt_kern_flag & MNTK_SUJ) {
6401		if (inodedep == NULL)
6402			panic("setup_newdir: Lost parent.");
6403		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6404		    inoreflst);
6405		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
6406		    (jaddref->ja_state & MKDIR_PARENT),
6407		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
6408		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6409		mkdir2->md_jaddref = jaddref;
6410		jaddref->ja_mkdir = mkdir2;
6411	} else if (inodedep == NULL ||
6412	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
6413		dap->da_state &= ~MKDIR_PARENT;
6414		WORKITEM_FREE(mkdir2, D_MKDIR);
6415	} else {
6416		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6417		WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
6418	}
6419	*mkdirp = mkdir2;
6420
6421	return (mkdir1);
6422}
6423
6424/*
6425 * Directory entry addition dependencies.
6426 *
6427 * When adding a new directory entry, the inode (with its incremented link
6428 * count) must be written to disk before the directory entry's pointer to it.
6429 * Also, if the inode is newly allocated, the corresponding freemap must be
6430 * updated (on disk) before the directory entry's pointer. These requirements
6431 * are met via undo/redo on the directory entry's pointer, which consists
6432 * simply of the inode number.
6433 *
6434 * As directory entries are added and deleted, the free space within a
6435 * directory block can become fragmented.  The ufs filesystem will compact
6436 * a fragmented directory block to make space for a new entry. When this
6437 * occurs, the offsets of previously added entries change. Any "diradd"
6438 * dependency structures corresponding to these entries must be updated with
6439 * the new offsets.
6440 */
6441
6442/*
6443 * This routine is called after the in-memory inode's link
6444 * count has been incremented, but before the directory entry's
6445 * pointer to the inode has been set.
6446 */
6447int
6448softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
6449	struct buf *bp;		/* buffer containing directory block */
6450	struct inode *dp;	/* inode for directory */
6451	off_t diroffset;	/* offset of new entry in directory */
6452	ino_t newinum;		/* inode referenced by new directory entry */
6453	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
6454	int isnewblk;		/* entry is in a newly allocated block */
6455{
6456	int offset;		/* offset of new entry within directory block */
6457	ufs_lbn_t lbn;		/* block in directory containing new entry */
6458	struct fs *fs;
6459	struct diradd *dap;
6460	struct newblk *newblk;
6461	struct pagedep *pagedep;
6462	struct inodedep *inodedep;
6463	struct newdirblk *newdirblk = 0;
6464	struct mkdir *mkdir1, *mkdir2;
6465	struct jaddref *jaddref;
6466	struct mount *mp;
6467	int isindir;
6468
6469	/*
6470	 * Whiteouts have no dependencies.
6471	 */
6472	if (newinum == WINO) {
6473		if (newdirbp != NULL)
6474			bdwrite(newdirbp);
6475		return (0);
6476	}
6477	jaddref = NULL;
6478	mkdir1 = mkdir2 = NULL;
6479	mp = UFSTOVFS(dp->i_ump);
6480	fs = dp->i_fs;
6481	lbn = lblkno(fs, diroffset);
6482	offset = blkoff(fs, diroffset);
6483	dap = malloc(sizeof(struct diradd), M_DIRADD,
6484		M_SOFTDEP_FLAGS|M_ZERO);
6485	workitem_alloc(&dap->da_list, D_DIRADD, mp);
6486	dap->da_offset = offset;
6487	dap->da_newinum = newinum;
6488	dap->da_state = ATTACHED;
6489	LIST_INIT(&dap->da_jwork);
6490	isindir = bp->b_lblkno >= NDADDR;
6491	if (isnewblk &&
6492	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
6493		newdirblk = malloc(sizeof(struct newdirblk),
6494		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
6495		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6496		LIST_INIT(&newdirblk->db_mkdir);
6497	}
6498	/*
6499	 * If we're creating a new directory setup the dependencies and set
6500	 * the dap state to wait for them.  Otherwise it's COMPLETE and
6501	 * we can move on.
6502	 */
6503	if (newdirbp == NULL) {
6504		dap->da_state |= DEPCOMPLETE;
6505		ACQUIRE_LOCK(&lk);
6506	} else {
6507		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
6508		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
6509		    &mkdir2);
6510	}
6511	/*
6512	 * Link into parent directory pagedep to await its being written.
6513	 */
6514	if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
6515		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6516#ifdef DEBUG
6517	if (diradd_lookup(pagedep, offset) != NULL)
6518		panic("softdep_setup_directory_add: %p already at off %d\n",
6519		    diradd_lookup(pagedep, offset), offset);
6520#endif
6521	dap->da_pagedep = pagedep;
6522	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
6523	    da_pdlist);
6524	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
6525	/*
6526	 * If we're journaling, link the diradd into the jaddref so it
6527	 * may be completed after the journal entry is written.  Otherwise,
6528	 * link the diradd into its inodedep.  If the inode is not yet
6529	 * written place it on the bufwait list, otherwise do the post-inode
6530	 * write processing to put it on the id_pendinghd list.
6531	 */
6532	if (mp->mnt_kern_flag & MNTK_SUJ) {
6533		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6534		    inoreflst);
6535		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
6536		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
6537		jaddref->ja_diroff = diroffset;
6538		jaddref->ja_diradd = dap;
6539		add_to_journal(&jaddref->ja_list);
6540	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
6541		diradd_inode_written(dap, inodedep);
6542	else
6543		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
6544	/*
6545	 * Add the journal entries for . and .. links now that the primary
6546	 * link is written.
6547	 */
6548	if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
6549		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
6550		    inoreflst, if_deps);
6551		KASSERT(jaddref != NULL &&
6552		    jaddref->ja_ino == jaddref->ja_parent &&
6553		    (jaddref->ja_state & MKDIR_BODY),
6554		    ("softdep_setup_directory_add: bad dot jaddref %p",
6555		    jaddref));
6556		mkdir1->md_jaddref = jaddref;
6557		jaddref->ja_mkdir = mkdir1;
6558		/*
6559		 * It is important that the dotdot journal entry
6560		 * is added prior to the dot entry since dot writes
6561		 * both the dot and dotdot links.  These both must
6562		 * be added after the primary link for the journal
6563		 * to remain consistent.
6564		 */
6565		add_to_journal(&mkdir2->md_jaddref->ja_list);
6566		add_to_journal(&jaddref->ja_list);
6567	}
6568	/*
6569	 * If we are adding a new directory remember this diradd so that if
6570	 * we rename it we can keep the dot and dotdot dependencies.  If
6571	 * we are adding a new name for an inode that has a mkdiradd we
6572	 * must be in rename and we have to move the dot and dotdot
6573	 * dependencies to this new name.  The old name is being orphaned
6574	 * soon.
6575	 */
6576	if (mkdir1 != NULL) {
6577		if (inodedep->id_mkdiradd != NULL)
6578			panic("softdep_setup_directory_add: Existing mkdir");
6579		inodedep->id_mkdiradd = dap;
6580	} else if (inodedep->id_mkdiradd)
6581		merge_diradd(inodedep, dap);
6582	if (newdirblk) {
6583		/*
6584		 * There is nothing to do if we are already tracking
6585		 * this block.
6586		 */
6587		if ((pagedep->pd_state & NEWBLOCK) != 0) {
6588			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
6589			FREE_LOCK(&lk);
6590			return (0);
6591		}
6592		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
6593		    == 0)
6594			panic("softdep_setup_directory_add: lost entry");
6595		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6596		pagedep->pd_state |= NEWBLOCK;
6597		pagedep->pd_newdirblk = newdirblk;
6598		newdirblk->db_pagedep = pagedep;
6599		FREE_LOCK(&lk);
6600		/*
6601		 * If we extended into an indirect signal direnter to sync.
6602		 */
6603		if (isindir)
6604			return (1);
6605		return (0);
6606	}
6607	FREE_LOCK(&lk);
6608	return (0);
6609}
6610
6611/*
6612 * This procedure is called to change the offset of a directory
6613 * entry when compacting a directory block which must be owned
6614 * exclusively by the caller. Note that the actual entry movement
6615 * must be done in this procedure to ensure that no I/O completions
6616 * occur while the move is in progress.
6617 */
6618void
6619softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
6620	struct buf *bp;		/* Buffer holding directory block. */
6621	struct inode *dp;	/* inode for directory */
6622	caddr_t base;		/* address of dp->i_offset */
6623	caddr_t oldloc;		/* address of old directory location */
6624	caddr_t newloc;		/* address of new directory location */
6625	int entrysize;		/* size of directory entry */
6626{
6627	int offset, oldoffset, newoffset;
6628	struct pagedep *pagedep;
6629	struct jmvref *jmvref;
6630	struct diradd *dap;
6631	struct direct *de;
6632	struct mount *mp;
6633	ufs_lbn_t lbn;
6634	int flags;
6635
6636	mp = UFSTOVFS(dp->i_ump);
6637	de = (struct direct *)oldloc;
6638	jmvref = NULL;
6639	flags = 0;
6640	/*
6641	 * Moves are always journaled as it would be too complex to
6642	 * determine if any affected adds or removes are present in the
6643	 * journal.
6644	 */
6645	if (mp->mnt_kern_flag & MNTK_SUJ)  {
6646		flags = DEPALLOC;
6647		jmvref = newjmvref(dp, de->d_ino,
6648		    dp->i_offset + (oldloc - base),
6649		    dp->i_offset + (newloc - base));
6650	}
6651	lbn = lblkno(dp->i_fs, dp->i_offset);
6652	offset = blkoff(dp->i_fs, dp->i_offset);
6653	oldoffset = offset + (oldloc - base);
6654	newoffset = offset + (newloc - base);
6655	ACQUIRE_LOCK(&lk);
6656	if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
6657		if (pagedep)
6658			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6659		goto done;
6660	}
6661	dap = diradd_lookup(pagedep, oldoffset);
6662	if (dap) {
6663		dap->da_offset = newoffset;
6664		newoffset = DIRADDHASH(newoffset);
6665		oldoffset = DIRADDHASH(oldoffset);
6666		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
6667		    newoffset != oldoffset) {
6668			LIST_REMOVE(dap, da_pdlist);
6669			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
6670			    dap, da_pdlist);
6671		}
6672	}
6673done:
6674	if (jmvref) {
6675		jmvref->jm_pagedep = pagedep;
6676		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
6677		add_to_journal(&jmvref->jm_list);
6678	}
6679	bcopy(oldloc, newloc, entrysize);
6680	FREE_LOCK(&lk);
6681}
6682
6683/*
6684 * Move the mkdir dependencies and journal work from one diradd to another
6685 * when renaming a directory.  The new name must depend on the mkdir deps
6686 * completing as the old name did.  Directories can only have one valid link
6687 * at a time so one must be canonical.
6688 */
6689static void
6690merge_diradd(inodedep, newdap)
6691	struct inodedep *inodedep;
6692	struct diradd *newdap;
6693{
6694	struct diradd *olddap;
6695	struct mkdir *mkdir, *nextmd;
6696	short state;
6697
6698	olddap = inodedep->id_mkdiradd;
6699	inodedep->id_mkdiradd = newdap;
6700	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6701		newdap->da_state &= ~DEPCOMPLETE;
6702		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
6703			nextmd = LIST_NEXT(mkdir, md_mkdirs);
6704			if (mkdir->md_diradd != olddap)
6705				continue;
6706			mkdir->md_diradd = newdap;
6707			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
6708			newdap->da_state |= state;
6709			olddap->da_state &= ~state;
6710			if ((olddap->da_state &
6711			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
6712				break;
6713		}
6714		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
6715			panic("merge_diradd: unfound ref");
6716	}
6717	/*
6718	 * Any mkdir related journal items are not safe to be freed until
6719	 * the new name is stable.
6720	 */
6721	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
6722	olddap->da_state |= DEPCOMPLETE;
6723	complete_diradd(olddap);
6724}
6725
6726/*
6727 * Move the diradd to the pending list when all diradd dependencies are
6728 * complete.
6729 */
6730static void
6731complete_diradd(dap)
6732	struct diradd *dap;
6733{
6734	struct pagedep *pagedep;
6735
6736	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
6737		if (dap->da_state & DIRCHG)
6738			pagedep = dap->da_previous->dm_pagedep;
6739		else
6740			pagedep = dap->da_pagedep;
6741		LIST_REMOVE(dap, da_pdlist);
6742		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
6743	}
6744}
6745
6746/*
6747 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
6748 * add entries and conditonally journal the remove.
6749 */
6750static void
6751cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
6752	struct diradd *dap;
6753	struct dirrem *dirrem;
6754	struct jremref *jremref;
6755	struct jremref *dotremref;
6756	struct jremref *dotdotremref;
6757{
6758	struct inodedep *inodedep;
6759	struct jaddref *jaddref;
6760	struct inoref *inoref;
6761	struct mkdir *mkdir;
6762
6763	/*
6764	 * If no remove references were allocated we're on a non-journaled
6765	 * filesystem and can skip the cancel step.
6766	 */
6767	if (jremref == NULL) {
6768		free_diradd(dap, NULL);
6769		return;
6770	}
6771	/*
6772	 * Cancel the primary name an free it if it does not require
6773	 * journaling.
6774	 */
6775	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
6776	    0, &inodedep) != 0) {
6777		/* Abort the addref that reference this diradd.  */
6778		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
6779			if (inoref->if_list.wk_type != D_JADDREF)
6780				continue;
6781			jaddref = (struct jaddref *)inoref;
6782			if (jaddref->ja_diradd != dap)
6783				continue;
6784			if (cancel_jaddref(jaddref, inodedep,
6785			    &dirrem->dm_jwork) == 0) {
6786				free_jremref(jremref);
6787				jremref = NULL;
6788			}
6789			break;
6790		}
6791	}
6792	/*
6793	 * Cancel subordinate names and free them if they do not require
6794	 * journaling.
6795	 */
6796	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6797		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
6798			if (mkdir->md_diradd != dap)
6799				continue;
6800			if ((jaddref = mkdir->md_jaddref) == NULL)
6801				continue;
6802			mkdir->md_jaddref = NULL;
6803			if (mkdir->md_state & MKDIR_PARENT) {
6804				if (cancel_jaddref(jaddref, NULL,
6805				    &dirrem->dm_jwork) == 0) {
6806					free_jremref(dotdotremref);
6807					dotdotremref = NULL;
6808				}
6809			} else {
6810				if (cancel_jaddref(jaddref, inodedep,
6811				    &dirrem->dm_jwork) == 0) {
6812					free_jremref(dotremref);
6813					dotremref = NULL;
6814				}
6815			}
6816		}
6817	}
6818
6819	if (jremref)
6820		journal_jremref(dirrem, jremref, inodedep);
6821	if (dotremref)
6822		journal_jremref(dirrem, dotremref, inodedep);
6823	if (dotdotremref)
6824		journal_jremref(dirrem, dotdotremref, NULL);
6825	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
6826	free_diradd(dap, &dirrem->dm_jwork);
6827}
6828
6829/*
6830 * Free a diradd dependency structure. This routine must be called
6831 * with splbio interrupts blocked.
6832 */
6833static void
6834free_diradd(dap, wkhd)
6835	struct diradd *dap;
6836	struct workhead *wkhd;
6837{
6838	struct dirrem *dirrem;
6839	struct pagedep *pagedep;
6840	struct inodedep *inodedep;
6841	struct mkdir *mkdir, *nextmd;
6842
6843	mtx_assert(&lk, MA_OWNED);
6844	LIST_REMOVE(dap, da_pdlist);
6845	if (dap->da_state & ONWORKLIST)
6846		WORKLIST_REMOVE(&dap->da_list);
6847	if ((dap->da_state & DIRCHG) == 0) {
6848		pagedep = dap->da_pagedep;
6849	} else {
6850		dirrem = dap->da_previous;
6851		pagedep = dirrem->dm_pagedep;
6852		dirrem->dm_dirinum = pagedep->pd_ino;
6853		dirrem->dm_state |= COMPLETE;
6854		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
6855			add_to_worklist(&dirrem->dm_list, 0);
6856	}
6857	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
6858	    0, &inodedep) != 0)
6859		if (inodedep->id_mkdiradd == dap)
6860			inodedep->id_mkdiradd = NULL;
6861	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6862		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
6863			nextmd = LIST_NEXT(mkdir, md_mkdirs);
6864			if (mkdir->md_diradd != dap)
6865				continue;
6866			dap->da_state &=
6867			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
6868			LIST_REMOVE(mkdir, md_mkdirs);
6869			if (mkdir->md_state & ONWORKLIST)
6870				WORKLIST_REMOVE(&mkdir->md_list);
6871			if (mkdir->md_jaddref != NULL)
6872				panic("free_diradd: Unexpected jaddref");
6873			WORKITEM_FREE(mkdir, D_MKDIR);
6874			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
6875				break;
6876		}
6877		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
6878			panic("free_diradd: unfound ref");
6879	}
6880	if (inodedep)
6881		free_inodedep(inodedep);
6882	/*
6883	 * Free any journal segments waiting for the directory write.
6884	 */
6885	handle_jwork(&dap->da_jwork);
6886	WORKITEM_FREE(dap, D_DIRADD);
6887}
6888
6889/*
6890 * Directory entry removal dependencies.
6891 *
6892 * When removing a directory entry, the entry's inode pointer must be
6893 * zero'ed on disk before the corresponding inode's link count is decremented
6894 * (possibly freeing the inode for re-use). This dependency is handled by
6895 * updating the directory entry but delaying the inode count reduction until
6896 * after the directory block has been written to disk. After this point, the
6897 * inode count can be decremented whenever it is convenient.
6898 */
6899
6900/*
6901 * This routine should be called immediately after removing
6902 * a directory entry.  The inode's link count should not be
6903 * decremented by the calling procedure -- the soft updates
6904 * code will do this task when it is safe.
6905 */
6906void
6907softdep_setup_remove(bp, dp, ip, isrmdir)
6908	struct buf *bp;		/* buffer containing directory block */
6909	struct inode *dp;	/* inode for the directory being modified */
6910	struct inode *ip;	/* inode for directory entry being removed */
6911	int isrmdir;		/* indicates if doing RMDIR */
6912{
6913	struct dirrem *dirrem, *prevdirrem;
6914	struct inodedep *inodedep;
6915	int direct;
6916
6917	/*
6918	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
6919	 * newdirrem() to setup the full directory remove which requires
6920	 * isrmdir > 1.
6921	 */
6922	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
6923	/*
6924	 * Add the dirrem to the inodedep's pending remove list for quick
6925	 * discovery later.
6926	 */
6927	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
6928	    &inodedep) == 0)
6929		panic("softdep_setup_remove: Lost inodedep.");
6930	dirrem->dm_state |= ONDEPLIST;
6931	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
6932
6933	/*
6934	 * If the COMPLETE flag is clear, then there were no active
6935	 * entries and we want to roll back to a zeroed entry until
6936	 * the new inode is committed to disk. If the COMPLETE flag is
6937	 * set then we have deleted an entry that never made it to
6938	 * disk. If the entry we deleted resulted from a name change,
6939	 * then the old name still resides on disk. We cannot delete
6940	 * its inode (returned to us in prevdirrem) until the zeroed
6941	 * directory entry gets to disk. The new inode has never been
6942	 * referenced on the disk, so can be deleted immediately.
6943	 */
6944	if ((dirrem->dm_state & COMPLETE) == 0) {
6945		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
6946		    dm_next);
6947		FREE_LOCK(&lk);
6948	} else {
6949		if (prevdirrem != NULL)
6950			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
6951			    prevdirrem, dm_next);
6952		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
6953		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
6954		FREE_LOCK(&lk);
6955		if (direct)
6956			handle_workitem_remove(dirrem, NULL);
6957	}
6958}
6959
6960/*
6961 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
6962 * pd_pendinghd list of a pagedep.
6963 */
6964static struct diradd *
6965diradd_lookup(pagedep, offset)
6966	struct pagedep *pagedep;
6967	int offset;
6968{
6969	struct diradd *dap;
6970
6971	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
6972		if (dap->da_offset == offset)
6973			return (dap);
6974	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
6975		if (dap->da_offset == offset)
6976			return (dap);
6977	return (NULL);
6978}
6979
6980/*
6981 * Search for a .. diradd dependency in a directory that is being removed.
6982 * If the directory was renamed to a new parent we have a diradd rather
6983 * than a mkdir for the .. entry.  We need to cancel it now before
6984 * it is found in truncate().
6985 */
6986static struct jremref *
6987cancel_diradd_dotdot(ip, dirrem, jremref)
6988	struct inode *ip;
6989	struct dirrem *dirrem;
6990	struct jremref *jremref;
6991{
6992	struct pagedep *pagedep;
6993	struct diradd *dap;
6994	struct worklist *wk;
6995
6996	if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
6997	    &pagedep) == 0)
6998		return (jremref);
6999	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
7000	if (dap == NULL)
7001		return (jremref);
7002	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
7003	/*
7004	 * Mark any journal work as belonging to the parent so it is freed
7005	 * with the .. reference.
7006	 */
7007	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
7008		wk->wk_state |= MKDIR_PARENT;
7009	return (NULL);
7010}
7011
7012/*
7013 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
7014 * replace it with a dirrem/diradd pair as a result of re-parenting a
7015 * directory.  This ensures that we don't simultaneously have a mkdir and
7016 * a diradd for the same .. entry.
7017 */
7018static struct jremref *
7019cancel_mkdir_dotdot(ip, dirrem, jremref)
7020	struct inode *ip;
7021	struct dirrem *dirrem;
7022	struct jremref *jremref;
7023{
7024	struct inodedep *inodedep;
7025	struct jaddref *jaddref;
7026	struct mkdir *mkdir;
7027	struct diradd *dap;
7028
7029	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
7030	    &inodedep) == 0)
7031		panic("cancel_mkdir_dotdot: Lost inodedep");
7032	dap = inodedep->id_mkdiradd;
7033	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
7034		return (jremref);
7035	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
7036	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
7037		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
7038			break;
7039	if (mkdir == NULL)
7040		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
7041	if ((jaddref = mkdir->md_jaddref) != NULL) {
7042		mkdir->md_jaddref = NULL;
7043		jaddref->ja_state &= ~MKDIR_PARENT;
7044		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
7045		    &inodedep) == 0)
7046			panic("cancel_mkdir_dotdot: Lost parent inodedep");
7047		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
7048			journal_jremref(dirrem, jremref, inodedep);
7049			jremref = NULL;
7050		}
7051	}
7052	if (mkdir->md_state & ONWORKLIST)
7053		WORKLIST_REMOVE(&mkdir->md_list);
7054	mkdir->md_state |= ALLCOMPLETE;
7055	complete_mkdir(mkdir);
7056	return (jremref);
7057}
7058
7059static void
7060journal_jremref(dirrem, jremref, inodedep)
7061	struct dirrem *dirrem;
7062	struct jremref *jremref;
7063	struct inodedep *inodedep;
7064{
7065
7066	if (inodedep == NULL)
7067		if (inodedep_lookup(jremref->jr_list.wk_mp,
7068		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
7069			panic("journal_jremref: Lost inodedep");
7070	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
7071	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
7072	add_to_journal(&jremref->jr_list);
7073}
7074
7075static void
7076dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
7077	struct dirrem *dirrem;
7078	struct jremref *jremref;
7079	struct jremref *dotremref;
7080	struct jremref *dotdotremref;
7081{
7082	struct inodedep *inodedep;
7083
7084
7085	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
7086	    &inodedep) == 0)
7087		panic("dirrem_journal: Lost inodedep");
7088	journal_jremref(dirrem, jremref, inodedep);
7089	if (dotremref)
7090		journal_jremref(dirrem, dotremref, inodedep);
7091	if (dotdotremref)
7092		journal_jremref(dirrem, dotdotremref, NULL);
7093}
7094
7095/*
7096 * Allocate a new dirrem if appropriate and return it along with
7097 * its associated pagedep. Called without a lock, returns with lock.
7098 */
7099static long num_dirrem;		/* number of dirrem allocated */
7100static struct dirrem *
7101newdirrem(bp, dp, ip, isrmdir, prevdirremp)
7102	struct buf *bp;		/* buffer containing directory block */
7103	struct inode *dp;	/* inode for the directory being modified */
7104	struct inode *ip;	/* inode for directory entry being removed */
7105	int isrmdir;		/* indicates if doing RMDIR */
7106	struct dirrem **prevdirremp; /* previously referenced inode, if any */
7107{
7108	int offset;
7109	ufs_lbn_t lbn;
7110	struct diradd *dap;
7111	struct dirrem *dirrem;
7112	struct pagedep *pagedep;
7113	struct jremref *jremref;
7114	struct jremref *dotremref;
7115	struct jremref *dotdotremref;
7116	struct vnode *dvp;
7117
7118	/*
7119	 * Whiteouts have no deletion dependencies.
7120	 */
7121	if (ip == NULL)
7122		panic("newdirrem: whiteout");
7123	dvp = ITOV(dp);
7124	/*
7125	 * If we are over our limit, try to improve the situation.
7126	 * Limiting the number of dirrem structures will also limit
7127	 * the number of freefile and freeblks structures.
7128	 */
7129	ACQUIRE_LOCK(&lk);
7130	if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2)
7131		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
7132	num_dirrem += 1;
7133	FREE_LOCK(&lk);
7134	dirrem = malloc(sizeof(struct dirrem),
7135		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
7136	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
7137	LIST_INIT(&dirrem->dm_jremrefhd);
7138	LIST_INIT(&dirrem->dm_jwork);
7139	dirrem->dm_state = isrmdir ? RMDIR : 0;
7140	dirrem->dm_oldinum = ip->i_number;
7141	*prevdirremp = NULL;
7142	/*
7143	 * Allocate remove reference structures to track journal write
7144	 * dependencies.  We will always have one for the link and
7145	 * when doing directories we will always have one more for dot.
7146	 * When renaming a directory we skip the dotdot link change so
7147	 * this is not needed.
7148	 */
7149	jremref = dotremref = dotdotremref = NULL;
7150	if (DOINGSUJ(dvp)) {
7151		if (isrmdir) {
7152			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
7153			    ip->i_effnlink + 2);
7154			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
7155			    ip->i_effnlink + 1);
7156			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
7157			    dp->i_effnlink + 1);
7158			dotdotremref->jr_state |= MKDIR_PARENT;
7159		} else
7160			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
7161			    ip->i_effnlink + 1);
7162	}
7163	ACQUIRE_LOCK(&lk);
7164	lbn = lblkno(dp->i_fs, dp->i_offset);
7165	offset = blkoff(dp->i_fs, dp->i_offset);
7166	if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
7167	    &pagedep) == 0)
7168		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
7169	dirrem->dm_pagedep = pagedep;
7170	/*
7171	 * If we're renaming a .. link to a new directory, cancel any
7172	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
7173	 * the jremref is preserved for any potential diradd in this
7174	 * location.  This can not coincide with a rmdir.
7175	 */
7176	if (dp->i_offset == DOTDOT_OFFSET) {
7177		if (isrmdir)
7178			panic("newdirrem: .. directory change during remove?");
7179		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
7180	}
7181	/*
7182	 * If we're removing a directory search for the .. dependency now and
7183	 * cancel it.  Any pending journal work will be added to the dirrem
7184	 * to be completed when the workitem remove completes.
7185	 */
7186	if (isrmdir)
7187		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
7188	/*
7189	 * Check for a diradd dependency for the same directory entry.
7190	 * If present, then both dependencies become obsolete and can
7191	 * be de-allocated.
7192	 */
7193	dap = diradd_lookup(pagedep, offset);
7194	if (dap == NULL) {
7195		/*
7196		 * Link the jremref structures into the dirrem so they are
7197		 * written prior to the pagedep.
7198		 */
7199		if (jremref)
7200			dirrem_journal(dirrem, jremref, dotremref,
7201			    dotdotremref);
7202		return (dirrem);
7203	}
7204	/*
7205	 * Must be ATTACHED at this point.
7206	 */
7207	if ((dap->da_state & ATTACHED) == 0)
7208		panic("newdirrem: not ATTACHED");
7209	if (dap->da_newinum != ip->i_number)
7210		panic("newdirrem: inum %d should be %d",
7211		    ip->i_number, dap->da_newinum);
7212	/*
7213	 * If we are deleting a changed name that never made it to disk,
7214	 * then return the dirrem describing the previous inode (which
7215	 * represents the inode currently referenced from this entry on disk).
7216	 */
7217	if ((dap->da_state & DIRCHG) != 0) {
7218		*prevdirremp = dap->da_previous;
7219		dap->da_state &= ~DIRCHG;
7220		dap->da_pagedep = pagedep;
7221	}
7222	/*
7223	 * We are deleting an entry that never made it to disk.
7224	 * Mark it COMPLETE so we can delete its inode immediately.
7225	 */
7226	dirrem->dm_state |= COMPLETE;
7227	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
7228#ifdef SUJ_DEBUG
7229	if (isrmdir == 0) {
7230		struct worklist *wk;
7231
7232		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
7233			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
7234				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
7235	}
7236#endif
7237
7238	return (dirrem);
7239}
7240
7241/*
7242 * Directory entry change dependencies.
7243 *
7244 * Changing an existing directory entry requires that an add operation
7245 * be completed first followed by a deletion. The semantics for the addition
7246 * are identical to the description of adding a new entry above except
7247 * that the rollback is to the old inode number rather than zero. Once
7248 * the addition dependency is completed, the removal is done as described
7249 * in the removal routine above.
7250 */
7251
7252/*
7253 * This routine should be called immediately after changing
7254 * a directory entry.  The inode's link count should not be
7255 * decremented by the calling procedure -- the soft updates
7256 * code will perform this task when it is safe.
7257 */
7258void
7259softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
7260	struct buf *bp;		/* buffer containing directory block */
7261	struct inode *dp;	/* inode for the directory being modified */
7262	struct inode *ip;	/* inode for directory entry being removed */
7263	ino_t newinum;		/* new inode number for changed entry */
7264	int isrmdir;		/* indicates if doing RMDIR */
7265{
7266	int offset;
7267	struct diradd *dap = NULL;
7268	struct dirrem *dirrem, *prevdirrem;
7269	struct pagedep *pagedep;
7270	struct inodedep *inodedep;
7271	struct jaddref *jaddref;
7272	struct mount *mp;
7273
7274	offset = blkoff(dp->i_fs, dp->i_offset);
7275	mp = UFSTOVFS(dp->i_ump);
7276
7277	/*
7278	 * Whiteouts do not need diradd dependencies.
7279	 */
7280	if (newinum != WINO) {
7281		dap = malloc(sizeof(struct diradd),
7282		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
7283		workitem_alloc(&dap->da_list, D_DIRADD, mp);
7284		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
7285		dap->da_offset = offset;
7286		dap->da_newinum = newinum;
7287		LIST_INIT(&dap->da_jwork);
7288	}
7289
7290	/*
7291	 * Allocate a new dirrem and ACQUIRE_LOCK.
7292	 */
7293	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
7294	pagedep = dirrem->dm_pagedep;
7295	/*
7296	 * The possible values for isrmdir:
7297	 *	0 - non-directory file rename
7298	 *	1 - directory rename within same directory
7299	 *   inum - directory rename to new directory of given inode number
7300	 * When renaming to a new directory, we are both deleting and
7301	 * creating a new directory entry, so the link count on the new
7302	 * directory should not change. Thus we do not need the followup
7303	 * dirrem which is usually done in handle_workitem_remove. We set
7304	 * the DIRCHG flag to tell handle_workitem_remove to skip the
7305	 * followup dirrem.
7306	 */
7307	if (isrmdir > 1)
7308		dirrem->dm_state |= DIRCHG;
7309
7310	/*
7311	 * Whiteouts have no additional dependencies,
7312	 * so just put the dirrem on the correct list.
7313	 */
7314	if (newinum == WINO) {
7315		if ((dirrem->dm_state & COMPLETE) == 0) {
7316			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
7317			    dm_next);
7318		} else {
7319			dirrem->dm_dirinum = pagedep->pd_ino;
7320			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7321				add_to_worklist(&dirrem->dm_list, 0);
7322		}
7323		FREE_LOCK(&lk);
7324		return;
7325	}
7326	/*
7327	 * Add the dirrem to the inodedep's pending remove list for quick
7328	 * discovery later.  A valid nlinkdelta ensures that this lookup
7329	 * will not fail.
7330	 */
7331	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
7332		panic("softdep_setup_directory_change: Lost inodedep.");
7333	dirrem->dm_state |= ONDEPLIST;
7334	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7335
7336	/*
7337	 * If the COMPLETE flag is clear, then there were no active
7338	 * entries and we want to roll back to the previous inode until
7339	 * the new inode is committed to disk. If the COMPLETE flag is
7340	 * set, then we have deleted an entry that never made it to disk.
7341	 * If the entry we deleted resulted from a name change, then the old
7342	 * inode reference still resides on disk. Any rollback that we do
7343	 * needs to be to that old inode (returned to us in prevdirrem). If
7344	 * the entry we deleted resulted from a create, then there is
7345	 * no entry on the disk, so we want to roll back to zero rather
7346	 * than the uncommitted inode. In either of the COMPLETE cases we
7347	 * want to immediately free the unwritten and unreferenced inode.
7348	 */
7349	if ((dirrem->dm_state & COMPLETE) == 0) {
7350		dap->da_previous = dirrem;
7351	} else {
7352		if (prevdirrem != NULL) {
7353			dap->da_previous = prevdirrem;
7354		} else {
7355			dap->da_state &= ~DIRCHG;
7356			dap->da_pagedep = pagedep;
7357		}
7358		dirrem->dm_dirinum = pagedep->pd_ino;
7359		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7360			add_to_worklist(&dirrem->dm_list, 0);
7361	}
7362	/*
7363	 * Lookup the jaddref for this journal entry.  We must finish
7364	 * initializing it and make the diradd write dependent on it.
7365	 * If we're not journaling Put it on the id_bufwait list if the inode
7366	 * is not yet written. If it is written, do the post-inode write
7367	 * processing to put it on the id_pendinghd list.
7368	 */
7369	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
7370	if (mp->mnt_kern_flag & MNTK_SUJ) {
7371		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
7372		    inoreflst);
7373		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
7374		    ("softdep_setup_directory_change: bad jaddref %p",
7375		    jaddref));
7376		jaddref->ja_diroff = dp->i_offset;
7377		jaddref->ja_diradd = dap;
7378		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7379		    dap, da_pdlist);
7380		add_to_journal(&jaddref->ja_list);
7381	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
7382		dap->da_state |= COMPLETE;
7383		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
7384		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
7385	} else {
7386		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7387		    dap, da_pdlist);
7388		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
7389	}
7390	/*
7391	 * If we're making a new name for a directory that has not been
7392	 * committed when need to move the dot and dotdot references to
7393	 * this new name.
7394	 */
7395	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
7396		merge_diradd(inodedep, dap);
7397	FREE_LOCK(&lk);
7398}
7399
7400/*
7401 * Called whenever the link count on an inode is changed.
7402 * It creates an inode dependency so that the new reference(s)
7403 * to the inode cannot be committed to disk until the updated
7404 * inode has been written.
7405 */
7406void
7407softdep_change_linkcnt(ip)
7408	struct inode *ip;	/* the inode with the increased link count */
7409{
7410	struct inodedep *inodedep;
7411
7412	ACQUIRE_LOCK(&lk);
7413	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
7414	if (ip->i_nlink < ip->i_effnlink)
7415		panic("softdep_change_linkcnt: bad delta");
7416	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7417	FREE_LOCK(&lk);
7418}
7419
7420/*
7421 * Attach a sbdep dependency to the superblock buf so that we can keep
7422 * track of the head of the linked list of referenced but unlinked inodes.
7423 */
7424void
7425softdep_setup_sbupdate(ump, fs, bp)
7426	struct ufsmount *ump;
7427	struct fs *fs;
7428	struct buf *bp;
7429{
7430	struct sbdep *sbdep;
7431	struct worklist *wk;
7432
7433	if ((fs->fs_flags & FS_SUJ) == 0)
7434		return;
7435	LIST_FOREACH(wk, &bp->b_dep, wk_list)
7436		if (wk->wk_type == D_SBDEP)
7437			break;
7438	if (wk != NULL)
7439		return;
7440	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
7441	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
7442	sbdep->sb_fs = fs;
7443	sbdep->sb_ump = ump;
7444	ACQUIRE_LOCK(&lk);
7445	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
7446	FREE_LOCK(&lk);
7447}
7448
7449/*
7450 * Return the first unlinked inodedep which is ready to be the head of the
7451 * list.  The inodedep and all those after it must have valid next pointers.
7452 */
7453static struct inodedep *
7454first_unlinked_inodedep(ump)
7455	struct ufsmount *ump;
7456{
7457	struct inodedep *inodedep;
7458	struct inodedep *idp;
7459
7460	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
7461	    inodedep; inodedep = idp) {
7462		if ((inodedep->id_state & UNLINKNEXT) == 0)
7463			return (NULL);
7464		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7465		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
7466			break;
7467		if ((inodedep->id_state & UNLINKPREV) == 0)
7468			panic("first_unlinked_inodedep: prev != next");
7469	}
7470	if (inodedep == NULL)
7471		return (NULL);
7472
7473	return (inodedep);
7474}
7475
7476/*
7477 * Set the sujfree unlinked head pointer prior to writing a superblock.
7478 */
7479static void
7480initiate_write_sbdep(sbdep)
7481	struct sbdep *sbdep;
7482{
7483	struct inodedep *inodedep;
7484	struct fs *bpfs;
7485	struct fs *fs;
7486
7487	bpfs = sbdep->sb_fs;
7488	fs = sbdep->sb_ump->um_fs;
7489	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7490	if (inodedep) {
7491		fs->fs_sujfree = inodedep->id_ino;
7492		inodedep->id_state |= UNLINKPREV;
7493	} else
7494		fs->fs_sujfree = 0;
7495	bpfs->fs_sujfree = fs->fs_sujfree;
7496}
7497
7498/*
7499 * After a superblock is written determine whether it must be written again
7500 * due to a changing unlinked list head.
7501 */
7502static int
7503handle_written_sbdep(sbdep, bp)
7504	struct sbdep *sbdep;
7505	struct buf *bp;
7506{
7507	struct inodedep *inodedep;
7508	struct mount *mp;
7509	struct fs *fs;
7510
7511	fs = sbdep->sb_fs;
7512	mp = UFSTOVFS(sbdep->sb_ump);
7513	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7514	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
7515	    (inodedep == NULL && fs->fs_sujfree != 0)) {
7516		bdirty(bp);
7517		return (1);
7518	}
7519	WORKITEM_FREE(sbdep, D_SBDEP);
7520	if (fs->fs_sujfree == 0)
7521		return (0);
7522	if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
7523		panic("handle_written_sbdep: lost inodedep");
7524	/*
7525	 * Now that we have a record of this inode in stable store allow it
7526	 * to be written to free up pending work.  Inodes may see a lot of
7527	 * write activity after they are unlinked which we must not hold up.
7528	 */
7529	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
7530		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
7531			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
7532			    inodedep, inodedep->id_state);
7533		if (inodedep->id_state & UNLINKONLIST)
7534			break;
7535		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
7536	}
7537
7538	return (0);
7539}
7540
7541/*
7542 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
7543 */
7544static void
7545unlinked_inodedep(mp, inodedep)
7546	struct mount *mp;
7547	struct inodedep *inodedep;
7548{
7549	struct ufsmount *ump;
7550
7551	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
7552		return;
7553	ump = VFSTOUFS(mp);
7554	ump->um_fs->fs_fmod = 1;
7555	inodedep->id_state |= UNLINKED;
7556	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
7557}
7558
7559/*
7560 * Remove an inodedep from the unlinked inodedep list.  This may require
7561 * disk writes if the inode has made it that far.
7562 */
7563static void
7564clear_unlinked_inodedep(inodedep)
7565	struct inodedep *inodedep;
7566{
7567	struct ufsmount *ump;
7568	struct inodedep *idp;
7569	struct inodedep *idn;
7570	struct fs *fs;
7571	struct buf *bp;
7572	ino_t ino;
7573	ino_t nino;
7574	ino_t pino;
7575	int error;
7576
7577	ump = VFSTOUFS(inodedep->id_list.wk_mp);
7578	fs = ump->um_fs;
7579	ino = inodedep->id_ino;
7580	error = 0;
7581	for (;;) {
7582		/*
7583		 * If nothing has yet been written simply remove us from
7584		 * the in memory list and return.  This is the most common
7585		 * case where handle_workitem_remove() loses the final
7586		 * reference.
7587		 */
7588		if ((inodedep->id_state & UNLINKLINKS) == 0)
7589			break;
7590		/*
7591		 * If we have a NEXT pointer and no PREV pointer we can simply
7592		 * clear NEXT's PREV and remove ourselves from the list.  Be
7593		 * careful not to clear PREV if the superblock points at
7594		 * next as well.
7595		 */
7596		idn = TAILQ_NEXT(inodedep, id_unlinked);
7597		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
7598			if (idn && fs->fs_sujfree != idn->id_ino)
7599				idn->id_state &= ~UNLINKPREV;
7600			break;
7601		}
7602		/*
7603		 * Here we have an inodedep which is actually linked into
7604		 * the list.  We must remove it by forcing a write to the
7605		 * link before us, whether it be the superblock or an inode.
7606		 * Unfortunately the list may change while we're waiting
7607		 * on the buf lock for either resource so we must loop until
7608		 * we lock the right one.  If both the superblock and an
7609		 * inode point to this inode we must clear the inode first
7610		 * followed by the superblock.
7611		 */
7612		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7613		pino = 0;
7614		if (idp && (idp->id_state & UNLINKNEXT))
7615			pino = idp->id_ino;
7616		FREE_LOCK(&lk);
7617		if (pino == 0)
7618			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7619			    (int)fs->fs_sbsize, 0, 0, 0);
7620		else
7621			error = bread(ump->um_devvp,
7622			    fsbtodb(fs, ino_to_fsba(fs, pino)),
7623			    (int)fs->fs_bsize, NOCRED, &bp);
7624		ACQUIRE_LOCK(&lk);
7625		if (error)
7626			break;
7627		/* If the list has changed restart the loop. */
7628		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7629		nino = 0;
7630		if (idp && (idp->id_state & UNLINKNEXT))
7631			nino = idp->id_ino;
7632		if (nino != pino ||
7633		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
7634			FREE_LOCK(&lk);
7635			brelse(bp);
7636			ACQUIRE_LOCK(&lk);
7637			continue;
7638		}
7639		/*
7640		 * Remove us from the in memory list.  After this we cannot
7641		 * access the inodedep.
7642		 */
7643		idn = TAILQ_NEXT(inodedep, id_unlinked);
7644		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
7645		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
7646		/*
7647		 * Determine the next inode number.
7648		 */
7649		nino = 0;
7650		if (idn) {
7651			/*
7652			 * If next isn't on the list we can just clear prev's
7653			 * state and schedule it to be fixed later.  No need
7654			 * to synchronously write if we're not in the real
7655			 * list.
7656			 */
7657			if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
7658				idp->id_state &= ~UNLINKNEXT;
7659				if ((idp->id_state & ONWORKLIST) == 0)
7660					WORKLIST_INSERT(&bp->b_dep,
7661					    &idp->id_list);
7662				FREE_LOCK(&lk);
7663				bawrite(bp);
7664				ACQUIRE_LOCK(&lk);
7665				return;
7666			}
7667			nino = idn->id_ino;
7668		}
7669		FREE_LOCK(&lk);
7670		/*
7671		 * The predecessor's next pointer is manually updated here
7672		 * so that the NEXT flag is never cleared for an element
7673		 * that is in the list.
7674		 */
7675		if (pino == 0) {
7676			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
7677			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
7678			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
7679			    bp);
7680		} else if (fs->fs_magic == FS_UFS1_MAGIC)
7681			((struct ufs1_dinode *)bp->b_data +
7682			    ino_to_fsbo(fs, pino))->di_freelink = nino;
7683		else
7684			((struct ufs2_dinode *)bp->b_data +
7685			    ino_to_fsbo(fs, pino))->di_freelink = nino;
7686		/*
7687		 * If the bwrite fails we have no recourse to recover.  The
7688		 * filesystem is corrupted already.
7689		 */
7690		bwrite(bp);
7691		ACQUIRE_LOCK(&lk);
7692		/*
7693		 * If the superblock pointer still needs to be cleared force
7694		 * a write here.
7695		 */
7696		if (fs->fs_sujfree == ino) {
7697			FREE_LOCK(&lk);
7698			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7699			    (int)fs->fs_sbsize, 0, 0, 0);
7700			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
7701			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
7702			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
7703			    bp);
7704			bwrite(bp);
7705			ACQUIRE_LOCK(&lk);
7706		}
7707		if (fs->fs_sujfree != ino)
7708			return;
7709		panic("clear_unlinked_inodedep: Failed to clear free head");
7710	}
7711	if (inodedep->id_ino == fs->fs_sujfree)
7712		panic("clear_unlinked_inodedep: Freeing head of free list");
7713	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
7714	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
7715	return;
7716}
7717
7718/*
7719 * This workitem decrements the inode's link count.
7720 * If the link count reaches zero, the file is removed.
7721 */
7722static void
7723handle_workitem_remove(dirrem, xp)
7724	struct dirrem *dirrem;
7725	struct vnode *xp;
7726{
7727	struct inodedep *inodedep;
7728	struct workhead dotdotwk;
7729	struct worklist *wk;
7730	struct ufsmount *ump;
7731	struct mount *mp;
7732	struct vnode *vp;
7733	struct inode *ip;
7734	ino_t oldinum;
7735	int error;
7736
7737	if (dirrem->dm_state & ONWORKLIST)
7738		panic("handle_workitem_remove: dirrem %p still on worklist",
7739		    dirrem);
7740	oldinum = dirrem->dm_oldinum;
7741	mp = dirrem->dm_list.wk_mp;
7742	ump = VFSTOUFS(mp);
7743	if ((vp = xp) == NULL &&
7744	    (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
7745	    FFSV_FORCEINSMQ)) != 0) {
7746		softdep_error("handle_workitem_remove: vget", error);
7747		return;
7748	}
7749	ip = VTOI(vp);
7750	ACQUIRE_LOCK(&lk);
7751	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
7752		panic("handle_workitem_remove: lost inodedep");
7753	if (dirrem->dm_state & ONDEPLIST)
7754		LIST_REMOVE(dirrem, dm_inonext);
7755	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
7756	    ("handle_workitem_remove:  Journal entries not written."));
7757
7758	/*
7759	 * Move all dependencies waiting on the remove to complete
7760	 * from the dirrem to the inode inowait list to be completed
7761	 * after the inode has been updated and written to disk.  Any
7762	 * marked MKDIR_PARENT are saved to be completed when the .. ref
7763	 * is removed.
7764	 */
7765	LIST_INIT(&dotdotwk);
7766	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
7767		WORKLIST_REMOVE(wk);
7768		if (wk->wk_state & MKDIR_PARENT) {
7769			wk->wk_state &= ~MKDIR_PARENT;
7770			WORKLIST_INSERT(&dotdotwk, wk);
7771			continue;
7772		}
7773		WORKLIST_INSERT(&inodedep->id_inowait, wk);
7774	}
7775	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
7776	/*
7777	 * Normal file deletion.
7778	 */
7779	if ((dirrem->dm_state & RMDIR) == 0) {
7780		ip->i_nlink--;
7781		DIP_SET(ip, i_nlink, ip->i_nlink);
7782		ip->i_flag |= IN_CHANGE;
7783		if (ip->i_nlink < ip->i_effnlink)
7784			panic("handle_workitem_remove: bad file delta");
7785		if (ip->i_nlink == 0)
7786			unlinked_inodedep(mp, inodedep);
7787		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7788		num_dirrem -= 1;
7789		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
7790		    ("handle_workitem_remove: worklist not empty. %s",
7791		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
7792		WORKITEM_FREE(dirrem, D_DIRREM);
7793		FREE_LOCK(&lk);
7794		goto out;
7795	}
7796	/*
7797	 * Directory deletion. Decrement reference count for both the
7798	 * just deleted parent directory entry and the reference for ".".
7799	 * Arrange to have the reference count on the parent decremented
7800	 * to account for the loss of "..".
7801	 */
7802	ip->i_nlink -= 2;
7803	DIP_SET(ip, i_nlink, ip->i_nlink);
7804	ip->i_flag |= IN_CHANGE;
7805	if (ip->i_nlink < ip->i_effnlink)
7806		panic("handle_workitem_remove: bad dir delta");
7807	if (ip->i_nlink == 0)
7808		unlinked_inodedep(mp, inodedep);
7809	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7810	/*
7811	 * Rename a directory to a new parent. Since, we are both deleting
7812	 * and creating a new directory entry, the link count on the new
7813	 * directory should not change. Thus we skip the followup dirrem.
7814	 */
7815	if (dirrem->dm_state & DIRCHG) {
7816		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
7817		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
7818		num_dirrem -= 1;
7819		WORKITEM_FREE(dirrem, D_DIRREM);
7820		FREE_LOCK(&lk);
7821		goto out;
7822	}
7823	dirrem->dm_state = ONDEPLIST;
7824	dirrem->dm_oldinum = dirrem->dm_dirinum;
7825	/*
7826	 * Place the dirrem on the parent's diremhd list.
7827	 */
7828	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
7829		panic("handle_workitem_remove: lost dir inodedep");
7830	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7831	/*
7832	 * If the allocated inode has never been written to disk, then
7833	 * the on-disk inode is zero'ed and we can remove the file
7834	 * immediately.  When journaling if the inode has been marked
7835	 * unlinked and not DEPCOMPLETE we know it can never be written.
7836	 */
7837	inodedep_lookup(mp, oldinum, 0, &inodedep);
7838	if (inodedep == NULL ||
7839	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
7840	    check_inode_unwritten(inodedep)) {
7841		if (xp != NULL)
7842			add_to_worklist(&dirrem->dm_list, 0);
7843		FREE_LOCK(&lk);
7844		if (xp == NULL) {
7845			vput(vp);
7846			handle_workitem_remove(dirrem, NULL);
7847		}
7848		return;
7849	}
7850	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
7851	FREE_LOCK(&lk);
7852	ip->i_flag |= IN_CHANGE;
7853out:
7854	ffs_update(vp, 0);
7855	if (xp == NULL)
7856		vput(vp);
7857}
7858
7859/*
7860 * Inode de-allocation dependencies.
7861 *
7862 * When an inode's link count is reduced to zero, it can be de-allocated. We
7863 * found it convenient to postpone de-allocation until after the inode is
7864 * written to disk with its new link count (zero).  At this point, all of the
7865 * on-disk inode's block pointers are nullified and, with careful dependency
7866 * list ordering, all dependencies related to the inode will be satisfied and
7867 * the corresponding dependency structures de-allocated.  So, if/when the
7868 * inode is reused, there will be no mixing of old dependencies with new
7869 * ones.  This artificial dependency is set up by the block de-allocation
7870 * procedure above (softdep_setup_freeblocks) and completed by the
7871 * following procedure.
7872 */
7873static void
7874handle_workitem_freefile(freefile)
7875	struct freefile *freefile;
7876{
7877	struct workhead wkhd;
7878	struct fs *fs;
7879	struct inodedep *idp;
7880	struct ufsmount *ump;
7881	int error;
7882
7883	ump = VFSTOUFS(freefile->fx_list.wk_mp);
7884	fs = ump->um_fs;
7885#ifdef DEBUG
7886	ACQUIRE_LOCK(&lk);
7887	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
7888	FREE_LOCK(&lk);
7889	if (error)
7890		panic("handle_workitem_freefile: inodedep %p survived", idp);
7891#endif
7892	UFS_LOCK(ump);
7893	fs->fs_pendinginodes -= 1;
7894	UFS_UNLOCK(ump);
7895	LIST_INIT(&wkhd);
7896	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
7897	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
7898	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
7899		softdep_error("handle_workitem_freefile", error);
7900	ACQUIRE_LOCK(&lk);
7901	WORKITEM_FREE(freefile, D_FREEFILE);
7902	FREE_LOCK(&lk);
7903}
7904
7905
7906/*
7907 * Helper function which unlinks marker element from work list and returns
7908 * the next element on the list.
7909 */
7910static __inline struct worklist *
7911markernext(struct worklist *marker)
7912{
7913	struct worklist *next;
7914
7915	next = LIST_NEXT(marker, wk_list);
7916	LIST_REMOVE(marker, wk_list);
7917	return next;
7918}
7919
7920/*
7921 * Disk writes.
7922 *
7923 * The dependency structures constructed above are most actively used when file
7924 * system blocks are written to disk.  No constraints are placed on when a
7925 * block can be written, but unsatisfied update dependencies are made safe by
7926 * modifying (or replacing) the source memory for the duration of the disk
7927 * write.  When the disk write completes, the memory block is again brought
7928 * up-to-date.
7929 *
7930 * In-core inode structure reclamation.
7931 *
7932 * Because there are a finite number of "in-core" inode structures, they are
7933 * reused regularly.  By transferring all inode-related dependencies to the
7934 * in-memory inode block and indexing them separately (via "inodedep"s), we
7935 * can allow "in-core" inode structures to be reused at any time and avoid
7936 * any increase in contention.
7937 *
7938 * Called just before entering the device driver to initiate a new disk I/O.
7939 * The buffer must be locked, thus, no I/O completion operations can occur
7940 * while we are manipulating its associated dependencies.
7941 */
7942static void
7943softdep_disk_io_initiation(bp)
7944	struct buf *bp;		/* structure describing disk write to occur */
7945{
7946	struct worklist *wk;
7947	struct worklist marker;
7948	struct inodedep *inodedep;
7949	struct freeblks *freeblks;
7950	struct jfreeblk *jfreeblk;
7951	struct newblk *newblk;
7952
7953	/*
7954	 * We only care about write operations. There should never
7955	 * be dependencies for reads.
7956	 */
7957	if (bp->b_iocmd != BIO_WRITE)
7958		panic("softdep_disk_io_initiation: not write");
7959
7960	if (bp->b_vflags & BV_BKGRDINPROG)
7961		panic("softdep_disk_io_initiation: Writing buffer with "
7962		    "background write in progress: %p", bp);
7963
7964	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
7965	PHOLD(curproc);			/* Don't swap out kernel stack */
7966
7967	ACQUIRE_LOCK(&lk);
7968	/*
7969	 * Do any necessary pre-I/O processing.
7970	 */
7971	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
7972	     wk = markernext(&marker)) {
7973		LIST_INSERT_AFTER(wk, &marker, wk_list);
7974		switch (wk->wk_type) {
7975
7976		case D_PAGEDEP:
7977			initiate_write_filepage(WK_PAGEDEP(wk), bp);
7978			continue;
7979
7980		case D_INODEDEP:
7981			inodedep = WK_INODEDEP(wk);
7982			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
7983				initiate_write_inodeblock_ufs1(inodedep, bp);
7984			else
7985				initiate_write_inodeblock_ufs2(inodedep, bp);
7986			continue;
7987
7988		case D_INDIRDEP:
7989			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
7990			continue;
7991
7992		case D_BMSAFEMAP:
7993			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
7994			continue;
7995
7996		case D_JSEG:
7997			WK_JSEG(wk)->js_buf = NULL;
7998			continue;
7999
8000		case D_FREEBLKS:
8001			freeblks = WK_FREEBLKS(wk);
8002			jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
8003			/*
8004			 * We have to wait for the jfreeblks to be journaled
8005			 * before we can write an inodeblock with updated
8006			 * pointers.  Be careful to arrange the marker so
8007			 * we revisit the jfreeblk if it's not removed by
8008			 * the first jwait().
8009			 */
8010			if (jfreeblk != NULL) {
8011				LIST_REMOVE(&marker, wk_list);
8012				LIST_INSERT_BEFORE(wk, &marker, wk_list);
8013				jwait(&jfreeblk->jf_list);
8014			}
8015			continue;
8016		case D_ALLOCDIRECT:
8017		case D_ALLOCINDIR:
8018			/*
8019			 * We have to wait for the jnewblk to be journaled
8020			 * before we can write to a block otherwise the
8021			 * contents may be confused with an earlier file
8022			 * at recovery time.  Handle the marker as described
8023			 * above.
8024			 */
8025			newblk = WK_NEWBLK(wk);
8026			if (newblk->nb_jnewblk != NULL) {
8027				LIST_REMOVE(&marker, wk_list);
8028				LIST_INSERT_BEFORE(wk, &marker, wk_list);
8029				jwait(&newblk->nb_jnewblk->jn_list);
8030			}
8031			continue;
8032
8033		case D_SBDEP:
8034			initiate_write_sbdep(WK_SBDEP(wk));
8035			continue;
8036
8037		case D_MKDIR:
8038		case D_FREEWORK:
8039		case D_FREEDEP:
8040		case D_JSEGDEP:
8041			continue;
8042
8043		default:
8044			panic("handle_disk_io_initiation: Unexpected type %s",
8045			    TYPENAME(wk->wk_type));
8046			/* NOTREACHED */
8047		}
8048	}
8049	FREE_LOCK(&lk);
8050	PRELE(curproc);			/* Allow swapout of kernel stack */
8051}
8052
8053/*
8054 * Called from within the procedure above to deal with unsatisfied
8055 * allocation dependencies in a directory. The buffer must be locked,
8056 * thus, no I/O completion operations can occur while we are
8057 * manipulating its associated dependencies.
8058 */
8059static void
8060initiate_write_filepage(pagedep, bp)
8061	struct pagedep *pagedep;
8062	struct buf *bp;
8063{
8064	struct jremref *jremref;
8065	struct jmvref *jmvref;
8066	struct dirrem *dirrem;
8067	struct diradd *dap;
8068	struct direct *ep;
8069	int i;
8070
8071	if (pagedep->pd_state & IOSTARTED) {
8072		/*
8073		 * This can only happen if there is a driver that does not
8074		 * understand chaining. Here biodone will reissue the call
8075		 * to strategy for the incomplete buffers.
8076		 */
8077		printf("initiate_write_filepage: already started\n");
8078		return;
8079	}
8080	pagedep->pd_state |= IOSTARTED;
8081	/*
8082	 * Wait for all journal remove dependencies to hit the disk.
8083	 * We can not allow any potentially conflicting directory adds
8084	 * to be visible before removes and rollback is too difficult.
8085	 * lk may be dropped and re-acquired, however we hold the buf
8086	 * locked so the dependency can not go away.
8087	 */
8088	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
8089		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
8090			stat_jwait_filepage++;
8091			jwait(&jremref->jr_list);
8092		}
8093	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
8094		stat_jwait_filepage++;
8095		jwait(&jmvref->jm_list);
8096	}
8097	for (i = 0; i < DAHASHSZ; i++) {
8098		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
8099			ep = (struct direct *)
8100			    ((char *)bp->b_data + dap->da_offset);
8101			if (ep->d_ino != dap->da_newinum)
8102				panic("%s: dir inum %d != new %d",
8103				    "initiate_write_filepage",
8104				    ep->d_ino, dap->da_newinum);
8105			if (dap->da_state & DIRCHG)
8106				ep->d_ino = dap->da_previous->dm_oldinum;
8107			else
8108				ep->d_ino = 0;
8109			dap->da_state &= ~ATTACHED;
8110			dap->da_state |= UNDONE;
8111		}
8112	}
8113}
8114
8115/*
8116 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
8117 * Note that any bug fixes made to this routine must be done in the
8118 * version found below.
8119 *
8120 * Called from within the procedure above to deal with unsatisfied
8121 * allocation dependencies in an inodeblock. The buffer must be
8122 * locked, thus, no I/O completion operations can occur while we
8123 * are manipulating its associated dependencies.
8124 */
8125static void
8126initiate_write_inodeblock_ufs1(inodedep, bp)
8127	struct inodedep *inodedep;
8128	struct buf *bp;			/* The inode block */
8129{
8130	struct allocdirect *adp, *lastadp;
8131	struct ufs1_dinode *dp;
8132	struct ufs1_dinode *sip;
8133	struct inoref *inoref;
8134	struct fs *fs;
8135	ufs_lbn_t i;
8136#ifdef INVARIANTS
8137	ufs_lbn_t prevlbn = 0;
8138#endif
8139	int deplist;
8140
8141	if (inodedep->id_state & IOSTARTED)
8142		panic("initiate_write_inodeblock_ufs1: already started");
8143	inodedep->id_state |= IOSTARTED;
8144	fs = inodedep->id_fs;
8145	dp = (struct ufs1_dinode *)bp->b_data +
8146	    ino_to_fsbo(fs, inodedep->id_ino);
8147
8148	/*
8149	 * If we're on the unlinked list but have not yet written our
8150	 * next pointer initialize it here.
8151	 */
8152	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8153		struct inodedep *inon;
8154
8155		inon = TAILQ_NEXT(inodedep, id_unlinked);
8156		dp->di_freelink = inon ? inon->id_ino : 0;
8157	}
8158	/*
8159	 * If the bitmap is not yet written, then the allocated
8160	 * inode cannot be written to disk.
8161	 */
8162	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8163		if (inodedep->id_savedino1 != NULL)
8164			panic("initiate_write_inodeblock_ufs1: I/O underway");
8165		FREE_LOCK(&lk);
8166		sip = malloc(sizeof(struct ufs1_dinode),
8167		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8168		ACQUIRE_LOCK(&lk);
8169		inodedep->id_savedino1 = sip;
8170		*inodedep->id_savedino1 = *dp;
8171		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
8172		dp->di_gen = inodedep->id_savedino1->di_gen;
8173		dp->di_freelink = inodedep->id_savedino1->di_freelink;
8174		return;
8175	}
8176	/*
8177	 * If no dependencies, then there is nothing to roll back.
8178	 */
8179	inodedep->id_savedsize = dp->di_size;
8180	inodedep->id_savedextsize = 0;
8181	inodedep->id_savednlink = dp->di_nlink;
8182	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8183	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8184		return;
8185	/*
8186	 * Revert the link count to that of the first unwritten journal entry.
8187	 */
8188	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8189	if (inoref)
8190		dp->di_nlink = inoref->if_nlink;
8191	/*
8192	 * Set the dependencies to busy.
8193	 */
8194	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8195	     adp = TAILQ_NEXT(adp, ad_next)) {
8196#ifdef INVARIANTS
8197		if (deplist != 0 && prevlbn >= adp->ad_offset)
8198			panic("softdep_write_inodeblock: lbn order");
8199		prevlbn = adp->ad_offset;
8200		if (adp->ad_offset < NDADDR &&
8201		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8202			panic("%s: direct pointer #%jd mismatch %d != %jd",
8203			    "softdep_write_inodeblock",
8204			    (intmax_t)adp->ad_offset,
8205			    dp->di_db[adp->ad_offset],
8206			    (intmax_t)adp->ad_newblkno);
8207		if (adp->ad_offset >= NDADDR &&
8208		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8209			panic("%s: indirect pointer #%jd mismatch %d != %jd",
8210			    "softdep_write_inodeblock",
8211			    (intmax_t)adp->ad_offset - NDADDR,
8212			    dp->di_ib[adp->ad_offset - NDADDR],
8213			    (intmax_t)adp->ad_newblkno);
8214		deplist |= 1 << adp->ad_offset;
8215		if ((adp->ad_state & ATTACHED) == 0)
8216			panic("softdep_write_inodeblock: Unknown state 0x%x",
8217			    adp->ad_state);
8218#endif /* INVARIANTS */
8219		adp->ad_state &= ~ATTACHED;
8220		adp->ad_state |= UNDONE;
8221	}
8222	/*
8223	 * The on-disk inode cannot claim to be any larger than the last
8224	 * fragment that has been written. Otherwise, the on-disk inode
8225	 * might have fragments that were not the last block in the file
8226	 * which would corrupt the filesystem.
8227	 */
8228	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8229	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8230		if (adp->ad_offset >= NDADDR)
8231			break;
8232		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8233		/* keep going until hitting a rollback to a frag */
8234		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8235			continue;
8236		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8237		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8238#ifdef INVARIANTS
8239			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8240				panic("softdep_write_inodeblock: lost dep1");
8241#endif /* INVARIANTS */
8242			dp->di_db[i] = 0;
8243		}
8244		for (i = 0; i < NIADDR; i++) {
8245#ifdef INVARIANTS
8246			if (dp->di_ib[i] != 0 &&
8247			    (deplist & ((1 << NDADDR) << i)) == 0)
8248				panic("softdep_write_inodeblock: lost dep2");
8249#endif /* INVARIANTS */
8250			dp->di_ib[i] = 0;
8251		}
8252		return;
8253	}
8254	/*
8255	 * If we have zero'ed out the last allocated block of the file,
8256	 * roll back the size to the last currently allocated block.
8257	 * We know that this last allocated block is a full-sized as
8258	 * we already checked for fragments in the loop above.
8259	 */
8260	if (lastadp != NULL &&
8261	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8262		for (i = lastadp->ad_offset; i >= 0; i--)
8263			if (dp->di_db[i] != 0)
8264				break;
8265		dp->di_size = (i + 1) * fs->fs_bsize;
8266	}
8267	/*
8268	 * The only dependencies are for indirect blocks.
8269	 *
8270	 * The file size for indirect block additions is not guaranteed.
8271	 * Such a guarantee would be non-trivial to achieve. The conventional
8272	 * synchronous write implementation also does not make this guarantee.
8273	 * Fsck should catch and fix discrepancies. Arguably, the file size
8274	 * can be over-estimated without destroying integrity when the file
8275	 * moves into the indirect blocks (i.e., is large). If we want to
8276	 * postpone fsck, we are stuck with this argument.
8277	 */
8278	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8279		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8280}
8281
8282/*
8283 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
8284 * Note that any bug fixes made to this routine must be done in the
8285 * version found above.
8286 *
8287 * Called from within the procedure above to deal with unsatisfied
8288 * allocation dependencies in an inodeblock. The buffer must be
8289 * locked, thus, no I/O completion operations can occur while we
8290 * are manipulating its associated dependencies.
8291 */
8292static void
8293initiate_write_inodeblock_ufs2(inodedep, bp)
8294	struct inodedep *inodedep;
8295	struct buf *bp;			/* The inode block */
8296{
8297	struct allocdirect *adp, *lastadp;
8298	struct ufs2_dinode *dp;
8299	struct ufs2_dinode *sip;
8300	struct inoref *inoref;
8301	struct fs *fs;
8302	ufs_lbn_t i;
8303#ifdef INVARIANTS
8304	ufs_lbn_t prevlbn = 0;
8305#endif
8306	int deplist;
8307
8308	if (inodedep->id_state & IOSTARTED)
8309		panic("initiate_write_inodeblock_ufs2: already started");
8310	inodedep->id_state |= IOSTARTED;
8311	fs = inodedep->id_fs;
8312	dp = (struct ufs2_dinode *)bp->b_data +
8313	    ino_to_fsbo(fs, inodedep->id_ino);
8314
8315	/*
8316	 * If we're on the unlinked list but have not yet written our
8317	 * next pointer initialize it here.
8318	 */
8319	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8320		struct inodedep *inon;
8321
8322		inon = TAILQ_NEXT(inodedep, id_unlinked);
8323		dp->di_freelink = inon ? inon->id_ino : 0;
8324	}
8325	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
8326	    (UNLINKED | UNLINKNEXT)) {
8327		struct inodedep *inon;
8328		ino_t freelink;
8329
8330		inon = TAILQ_NEXT(inodedep, id_unlinked);
8331		freelink = inon ? inon->id_ino : 0;
8332		if (freelink != dp->di_freelink)
8333			panic("ino %p(0x%X) %d, %d != %d",
8334			    inodedep, inodedep->id_state, inodedep->id_ino,
8335			    freelink, dp->di_freelink);
8336	}
8337	/*
8338	 * If the bitmap is not yet written, then the allocated
8339	 * inode cannot be written to disk.
8340	 */
8341	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8342		if (inodedep->id_savedino2 != NULL)
8343			panic("initiate_write_inodeblock_ufs2: I/O underway");
8344		FREE_LOCK(&lk);
8345		sip = malloc(sizeof(struct ufs2_dinode),
8346		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8347		ACQUIRE_LOCK(&lk);
8348		inodedep->id_savedino2 = sip;
8349		*inodedep->id_savedino2 = *dp;
8350		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
8351		dp->di_gen = inodedep->id_savedino2->di_gen;
8352		dp->di_freelink = inodedep->id_savedino2->di_freelink;
8353		return;
8354	}
8355	/*
8356	 * If no dependencies, then there is nothing to roll back.
8357	 */
8358	inodedep->id_savedsize = dp->di_size;
8359	inodedep->id_savedextsize = dp->di_extsize;
8360	inodedep->id_savednlink = dp->di_nlink;
8361	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8362	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
8363	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8364		return;
8365	/*
8366	 * Revert the link count to that of the first unwritten journal entry.
8367	 */
8368	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8369	if (inoref)
8370		dp->di_nlink = inoref->if_nlink;
8371
8372	/*
8373	 * Set the ext data dependencies to busy.
8374	 */
8375	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8376	     adp = TAILQ_NEXT(adp, ad_next)) {
8377#ifdef INVARIANTS
8378		if (deplist != 0 && prevlbn >= adp->ad_offset)
8379			panic("softdep_write_inodeblock: lbn order");
8380		prevlbn = adp->ad_offset;
8381		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
8382			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8383			    "softdep_write_inodeblock",
8384			    (intmax_t)adp->ad_offset,
8385			    (intmax_t)dp->di_extb[adp->ad_offset],
8386			    (intmax_t)adp->ad_newblkno);
8387		deplist |= 1 << adp->ad_offset;
8388		if ((adp->ad_state & ATTACHED) == 0)
8389			panic("softdep_write_inodeblock: Unknown state 0x%x",
8390			    adp->ad_state);
8391#endif /* INVARIANTS */
8392		adp->ad_state &= ~ATTACHED;
8393		adp->ad_state |= UNDONE;
8394	}
8395	/*
8396	 * The on-disk inode cannot claim to be any larger than the last
8397	 * fragment that has been written. Otherwise, the on-disk inode
8398	 * might have fragments that were not the last block in the ext
8399	 * data which would corrupt the filesystem.
8400	 */
8401	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8402	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8403		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
8404		/* keep going until hitting a rollback to a frag */
8405		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8406			continue;
8407		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8408		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
8409#ifdef INVARIANTS
8410			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
8411				panic("softdep_write_inodeblock: lost dep1");
8412#endif /* INVARIANTS */
8413			dp->di_extb[i] = 0;
8414		}
8415		lastadp = NULL;
8416		break;
8417	}
8418	/*
8419	 * If we have zero'ed out the last allocated block of the ext
8420	 * data, roll back the size to the last currently allocated block.
8421	 * We know that this last allocated block is a full-sized as
8422	 * we already checked for fragments in the loop above.
8423	 */
8424	if (lastadp != NULL &&
8425	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8426		for (i = lastadp->ad_offset; i >= 0; i--)
8427			if (dp->di_extb[i] != 0)
8428				break;
8429		dp->di_extsize = (i + 1) * fs->fs_bsize;
8430	}
8431	/*
8432	 * Set the file data dependencies to busy.
8433	 */
8434	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8435	     adp = TAILQ_NEXT(adp, ad_next)) {
8436#ifdef INVARIANTS
8437		if (deplist != 0 && prevlbn >= adp->ad_offset)
8438			panic("softdep_write_inodeblock: lbn order");
8439		prevlbn = adp->ad_offset;
8440		if (adp->ad_offset < NDADDR &&
8441		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8442			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8443			    "softdep_write_inodeblock",
8444			    (intmax_t)adp->ad_offset,
8445			    (intmax_t)dp->di_db[adp->ad_offset],
8446			    (intmax_t)adp->ad_newblkno);
8447		if (adp->ad_offset >= NDADDR &&
8448		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8449			panic("%s indirect pointer #%jd mismatch %jd != %jd",
8450			    "softdep_write_inodeblock:",
8451			    (intmax_t)adp->ad_offset - NDADDR,
8452			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
8453			    (intmax_t)adp->ad_newblkno);
8454		deplist |= 1 << adp->ad_offset;
8455		if ((adp->ad_state & ATTACHED) == 0)
8456			panic("softdep_write_inodeblock: Unknown state 0x%x",
8457			    adp->ad_state);
8458#endif /* INVARIANTS */
8459		adp->ad_state &= ~ATTACHED;
8460		adp->ad_state |= UNDONE;
8461	}
8462	/*
8463	 * The on-disk inode cannot claim to be any larger than the last
8464	 * fragment that has been written. Otherwise, the on-disk inode
8465	 * might have fragments that were not the last block in the file
8466	 * which would corrupt the filesystem.
8467	 */
8468	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8469	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8470		if (adp->ad_offset >= NDADDR)
8471			break;
8472		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8473		/* keep going until hitting a rollback to a frag */
8474		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8475			continue;
8476		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8477		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8478#ifdef INVARIANTS
8479			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8480				panic("softdep_write_inodeblock: lost dep2");
8481#endif /* INVARIANTS */
8482			dp->di_db[i] = 0;
8483		}
8484		for (i = 0; i < NIADDR; i++) {
8485#ifdef INVARIANTS
8486			if (dp->di_ib[i] != 0 &&
8487			    (deplist & ((1 << NDADDR) << i)) == 0)
8488				panic("softdep_write_inodeblock: lost dep3");
8489#endif /* INVARIANTS */
8490			dp->di_ib[i] = 0;
8491		}
8492		return;
8493	}
8494	/*
8495	 * If we have zero'ed out the last allocated block of the file,
8496	 * roll back the size to the last currently allocated block.
8497	 * We know that this last allocated block is a full-sized as
8498	 * we already checked for fragments in the loop above.
8499	 */
8500	if (lastadp != NULL &&
8501	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8502		for (i = lastadp->ad_offset; i >= 0; i--)
8503			if (dp->di_db[i] != 0)
8504				break;
8505		dp->di_size = (i + 1) * fs->fs_bsize;
8506	}
8507	/*
8508	 * The only dependencies are for indirect blocks.
8509	 *
8510	 * The file size for indirect block additions is not guaranteed.
8511	 * Such a guarantee would be non-trivial to achieve. The conventional
8512	 * synchronous write implementation also does not make this guarantee.
8513	 * Fsck should catch and fix discrepancies. Arguably, the file size
8514	 * can be over-estimated without destroying integrity when the file
8515	 * moves into the indirect blocks (i.e., is large). If we want to
8516	 * postpone fsck, we are stuck with this argument.
8517	 */
8518	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8519		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8520}
8521
8522/*
8523 * Cancel an indirdep as a result of truncation.  Release all of the
8524 * children allocindirs and place their journal work on the appropriate
8525 * list.
8526 */
8527static void
8528cancel_indirdep(indirdep, bp, inodedep, freeblks)
8529	struct indirdep *indirdep;
8530	struct buf *bp;
8531	struct inodedep *inodedep;
8532	struct freeblks *freeblks;
8533{
8534	struct allocindir *aip;
8535
8536	/*
8537	 * None of the indirect pointers will ever be visible,
8538	 * so they can simply be tossed. GOINGAWAY ensures
8539	 * that allocated pointers will be saved in the buffer
8540	 * cache until they are freed. Note that they will
8541	 * only be able to be found by their physical address
8542	 * since the inode mapping the logical address will
8543	 * be gone. The save buffer used for the safe copy
8544	 * was allocated in setup_allocindir_phase2 using
8545	 * the physical address so it could be used for this
8546	 * purpose. Hence we swap the safe copy with the real
8547	 * copy, allowing the safe copy to be freed and holding
8548	 * on to the real copy for later use in indir_trunc.
8549	 */
8550	if (indirdep->ir_state & GOINGAWAY)
8551		panic("cancel_indirdep: already gone");
8552	if (indirdep->ir_state & ONDEPLIST) {
8553		indirdep->ir_state &= ~ONDEPLIST;
8554		LIST_REMOVE(indirdep, ir_next);
8555	}
8556	indirdep->ir_state |= GOINGAWAY;
8557	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
8558	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
8559		cancel_allocindir(aip, inodedep, freeblks);
8560	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
8561		cancel_allocindir(aip, inodedep, freeblks);
8562	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
8563		cancel_allocindir(aip, inodedep, freeblks);
8564	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
8565		cancel_allocindir(aip, inodedep, freeblks);
8566	bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
8567	WORKLIST_REMOVE(&indirdep->ir_list);
8568	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
8569	indirdep->ir_savebp = NULL;
8570}
8571
8572/*
8573 * Free an indirdep once it no longer has new pointers to track.
8574 */
8575static void
8576free_indirdep(indirdep)
8577	struct indirdep *indirdep;
8578{
8579
8580	KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
8581	    ("free_indirdep: Journal work not empty."));
8582	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
8583	    ("free_indirdep: Complete head not empty."));
8584	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
8585	    ("free_indirdep: write head not empty."));
8586	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
8587	    ("free_indirdep: done head not empty."));
8588	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
8589	    ("free_indirdep: deplist head not empty."));
8590	KASSERT(indirdep->ir_savebp == NULL,
8591	    ("free_indirdep: %p ir_savebp != NULL", indirdep));
8592	KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
8593	    ("free_indirdep: %p still on deplist.", indirdep));
8594	if (indirdep->ir_state & ONWORKLIST)
8595		WORKLIST_REMOVE(&indirdep->ir_list);
8596	WORKITEM_FREE(indirdep, D_INDIRDEP);
8597}
8598
8599/*
8600 * Called before a write to an indirdep.  This routine is responsible for
8601 * rolling back pointers to a safe state which includes only those
8602 * allocindirs which have been completed.
8603 */
8604static void
8605initiate_write_indirdep(indirdep, bp)
8606	struct indirdep *indirdep;
8607	struct buf *bp;
8608{
8609
8610	if (indirdep->ir_state & GOINGAWAY)
8611		panic("disk_io_initiation: indirdep gone");
8612
8613	/*
8614	 * If there are no remaining dependencies, this will be writing
8615	 * the real pointers.
8616	 */
8617	if (LIST_EMPTY(&indirdep->ir_deplisthd))
8618		return;
8619	/*
8620	 * Replace up-to-date version with safe version.
8621	 */
8622	FREE_LOCK(&lk);
8623	indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
8624	    M_SOFTDEP_FLAGS);
8625	ACQUIRE_LOCK(&lk);
8626	indirdep->ir_state &= ~ATTACHED;
8627	indirdep->ir_state |= UNDONE;
8628	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
8629	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
8630	    bp->b_bcount);
8631}
8632
8633/*
8634 * Called when an inode has been cleared in a cg bitmap.  This finally
8635 * eliminates any canceled jaddrefs
8636 */
8637void
8638softdep_setup_inofree(mp, bp, ino, wkhd)
8639	struct mount *mp;
8640	struct buf *bp;
8641	ino_t ino;
8642	struct workhead *wkhd;
8643{
8644	struct worklist *wk, *wkn;
8645	struct inodedep *inodedep;
8646	uint8_t *inosused;
8647	struct cg *cgp;
8648	struct fs *fs;
8649
8650	ACQUIRE_LOCK(&lk);
8651	fs = VFSTOUFS(mp)->um_fs;
8652	cgp = (struct cg *)bp->b_data;
8653	inosused = cg_inosused(cgp);
8654	if (isset(inosused, ino % fs->fs_ipg))
8655		panic("softdep_setup_inofree: inode %d not freed.", ino);
8656	if (inodedep_lookup(mp, ino, 0, &inodedep))
8657		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
8658		    ino, inodedep);
8659	if (wkhd) {
8660		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
8661			if (wk->wk_type != D_JADDREF)
8662				continue;
8663			WORKLIST_REMOVE(wk);
8664			/*
8665			 * We can free immediately even if the jaddref
8666			 * isn't attached in a background write as now
8667			 * the bitmaps are reconciled.
8668		 	 */
8669			wk->wk_state |= COMPLETE | ATTACHED;
8670			free_jaddref(WK_JADDREF(wk));
8671		}
8672		jwork_move(&bp->b_dep, wkhd);
8673	}
8674	FREE_LOCK(&lk);
8675}
8676
8677
8678/*
8679 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
8680 * map.  Any dependencies waiting for the write to clear are added to the
8681 * buf's list and any jnewblks that are being canceled are discarded
8682 * immediately.
8683 */
8684void
8685softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
8686	struct mount *mp;
8687	struct buf *bp;
8688	ufs2_daddr_t blkno;
8689	int frags;
8690	struct workhead *wkhd;
8691{
8692	struct jnewblk *jnewblk;
8693	struct worklist *wk, *wkn;
8694#ifdef SUJ_DEBUG
8695	struct bmsafemap *bmsafemap;
8696	struct fs *fs;
8697	uint8_t *blksfree;
8698	struct cg *cgp;
8699	ufs2_daddr_t jstart;
8700	ufs2_daddr_t jend;
8701	ufs2_daddr_t end;
8702	long bno;
8703	int i;
8704#endif
8705
8706	ACQUIRE_LOCK(&lk);
8707	/*
8708	 * Detach any jnewblks which have been canceled.  They must linger
8709	 * until the bitmap is cleared again by ffs_blkfree() to prevent
8710	 * an unjournaled allocation from hitting the disk.
8711	 */
8712	if (wkhd) {
8713		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
8714			if (wk->wk_type != D_JNEWBLK)
8715				continue;
8716			jnewblk = WK_JNEWBLK(wk);
8717			KASSERT(jnewblk->jn_state & GOINGAWAY,
8718			    ("softdep_setup_blkfree: jnewblk not canceled."));
8719			WORKLIST_REMOVE(wk);
8720#ifdef SUJ_DEBUG
8721			/*
8722			 * Assert that this block is free in the bitmap
8723			 * before we discard the jnewblk.
8724			 */
8725			fs = VFSTOUFS(mp)->um_fs;
8726			cgp = (struct cg *)bp->b_data;
8727			blksfree = cg_blksfree(cgp);
8728			bno = dtogd(fs, jnewblk->jn_blkno);
8729			for (i = jnewblk->jn_oldfrags;
8730			    i < jnewblk->jn_frags; i++) {
8731				if (isset(blksfree, bno + i))
8732					continue;
8733				panic("softdep_setup_blkfree: not free");
8734			}
8735#endif
8736			/*
8737			 * Even if it's not attached we can free immediately
8738			 * as the new bitmap is correct.
8739			 */
8740			wk->wk_state |= COMPLETE | ATTACHED;
8741			free_jnewblk(jnewblk);
8742		}
8743		/*
8744		 * The buf must be locked by the caller otherwise these could
8745		 * be added while it's being written and the write would
8746		 * complete them before they made it to disk.
8747		 */
8748		jwork_move(&bp->b_dep, wkhd);
8749	}
8750
8751#ifdef SUJ_DEBUG
8752	/*
8753	 * Assert that we are not freeing a block which has an outstanding
8754	 * allocation dependency.
8755	 */
8756	fs = VFSTOUFS(mp)->um_fs;
8757	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
8758	end = blkno + frags;
8759	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
8760		/*
8761		 * Don't match against blocks that will be freed when the
8762		 * background write is done.
8763		 */
8764		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
8765		    (COMPLETE | DEPCOMPLETE))
8766			continue;
8767		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
8768		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
8769		if ((blkno >= jstart && blkno < jend) ||
8770		    (end > jstart && end <= jend)) {
8771			printf("state 0x%X %jd - %d %d dep %p\n",
8772			    jnewblk->jn_state, jnewblk->jn_blkno,
8773			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
8774			    jnewblk->jn_newblk);
8775			panic("softdep_setup_blkfree: "
8776			    "%jd-%jd(%d) overlaps with %jd-%jd",
8777			    blkno, end, frags, jstart, jend);
8778		}
8779	}
8780#endif
8781	FREE_LOCK(&lk);
8782}
8783
8784static void
8785initiate_write_bmsafemap(bmsafemap, bp)
8786	struct bmsafemap *bmsafemap;
8787	struct buf *bp;			/* The cg block. */
8788{
8789	struct jaddref *jaddref;
8790	struct jnewblk *jnewblk;
8791	uint8_t *inosused;
8792	uint8_t *blksfree;
8793	struct cg *cgp;
8794	struct fs *fs;
8795	int cleared;
8796	ino_t ino;
8797	long bno;
8798	int i;
8799
8800	if (bmsafemap->sm_state & IOSTARTED)
8801		panic("initiate_write_bmsafemap: Already started\n");
8802	bmsafemap->sm_state |= IOSTARTED;
8803	/*
8804	 * Clear any inode allocations which are pending journal writes.
8805	 */
8806	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
8807		cgp = (struct cg *)bp->b_data;
8808		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
8809		inosused = cg_inosused(cgp);
8810		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
8811			ino = jaddref->ja_ino % fs->fs_ipg;
8812			/*
8813			 * If this is a background copy the inode may not
8814			 * be marked used yet.
8815			 */
8816			if (isset(inosused, ino)) {
8817				if ((jaddref->ja_mode & IFMT) == IFDIR)
8818					cgp->cg_cs.cs_ndir--;
8819				cgp->cg_cs.cs_nifree++;
8820				clrbit(inosused, ino);
8821				jaddref->ja_state &= ~ATTACHED;
8822				jaddref->ja_state |= UNDONE;
8823				stat_jaddref++;
8824			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
8825				panic("initiate_write_bmsafemap: inode %d "
8826				    "marked free", jaddref->ja_ino);
8827		}
8828	}
8829	/*
8830	 * Clear any block allocations which are pending journal writes.
8831	 */
8832	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
8833		cgp = (struct cg *)bp->b_data;
8834		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
8835		blksfree = cg_blksfree(cgp);
8836		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
8837			bno = dtogd(fs, jnewblk->jn_blkno);
8838			cleared = 0;
8839			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
8840			    i++) {
8841				if (isclr(blksfree, bno + i)) {
8842					cleared = 1;
8843					setbit(blksfree, bno + i);
8844				}
8845			}
8846			/*
8847			 * We may not clear the block if it's a background
8848			 * copy.  In that case there is no reason to detach
8849			 * it.
8850			 */
8851			if (cleared) {
8852				stat_jnewblk++;
8853				jnewblk->jn_state &= ~ATTACHED;
8854				jnewblk->jn_state |= UNDONE;
8855			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
8856				panic("initiate_write_bmsafemap: block %jd "
8857				    "marked free", jnewblk->jn_blkno);
8858		}
8859	}
8860	/*
8861	 * Move allocation lists to the written lists so they can be
8862	 * cleared once the block write is complete.
8863	 */
8864	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
8865	    inodedep, id_deps);
8866	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
8867	    newblk, nb_deps);
8868}
8869
8870/*
8871 * This routine is called during the completion interrupt
8872 * service routine for a disk write (from the procedure called
8873 * by the device driver to inform the filesystem caches of
8874 * a request completion).  It should be called early in this
8875 * procedure, before the block is made available to other
8876 * processes or other routines are called.
8877 *
8878 */
8879static void
8880softdep_disk_write_complete(bp)
8881	struct buf *bp;		/* describes the completed disk write */
8882{
8883	struct worklist *wk;
8884	struct worklist *owk;
8885	struct workhead reattach;
8886	struct buf *sbp;
8887
8888	/*
8889	 * If an error occurred while doing the write, then the data
8890	 * has not hit the disk and the dependencies cannot be unrolled.
8891	 */
8892	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
8893		return;
8894	LIST_INIT(&reattach);
8895	/*
8896	 * This lock must not be released anywhere in this code segment.
8897	 */
8898	sbp = NULL;
8899	owk = NULL;
8900	ACQUIRE_LOCK(&lk);
8901	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
8902		WORKLIST_REMOVE(wk);
8903		if (wk == owk)
8904			panic("duplicate worklist: %p\n", wk);
8905		owk = wk;
8906		switch (wk->wk_type) {
8907
8908		case D_PAGEDEP:
8909			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
8910				WORKLIST_INSERT(&reattach, wk);
8911			continue;
8912
8913		case D_INODEDEP:
8914			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
8915				WORKLIST_INSERT(&reattach, wk);
8916			continue;
8917
8918		case D_BMSAFEMAP:
8919			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
8920				WORKLIST_INSERT(&reattach, wk);
8921			continue;
8922
8923		case D_MKDIR:
8924			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
8925			continue;
8926
8927		case D_ALLOCDIRECT:
8928			wk->wk_state |= COMPLETE;
8929			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
8930			continue;
8931
8932		case D_ALLOCINDIR:
8933			wk->wk_state |= COMPLETE;
8934			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
8935			continue;
8936
8937		case D_INDIRDEP:
8938			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
8939				WORKLIST_INSERT(&reattach, wk);
8940			continue;
8941
8942		case D_FREEBLKS:
8943			wk->wk_state |= COMPLETE;
8944			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
8945				add_to_worklist(wk, 1);
8946			continue;
8947
8948		case D_FREEWORK:
8949			handle_written_freework(WK_FREEWORK(wk));
8950			break;
8951
8952		case D_FREEDEP:
8953			free_freedep(WK_FREEDEP(wk));
8954			continue;
8955
8956		case D_JSEGDEP:
8957			free_jsegdep(WK_JSEGDEP(wk));
8958			continue;
8959
8960		case D_JSEG:
8961			handle_written_jseg(WK_JSEG(wk), bp);
8962			continue;
8963
8964		case D_SBDEP:
8965			if (handle_written_sbdep(WK_SBDEP(wk), bp))
8966				WORKLIST_INSERT(&reattach, wk);
8967			continue;
8968
8969		default:
8970			panic("handle_disk_write_complete: Unknown type %s",
8971			    TYPENAME(wk->wk_type));
8972			/* NOTREACHED */
8973		}
8974	}
8975	/*
8976	 * Reattach any requests that must be redone.
8977	 */
8978	while ((wk = LIST_FIRST(&reattach)) != NULL) {
8979		WORKLIST_REMOVE(wk);
8980		WORKLIST_INSERT(&bp->b_dep, wk);
8981	}
8982	FREE_LOCK(&lk);
8983	if (sbp)
8984		brelse(sbp);
8985}
8986
8987/*
8988 * Called from within softdep_disk_write_complete above. Note that
8989 * this routine is always called from interrupt level with further
8990 * splbio interrupts blocked.
8991 */
8992static void
8993handle_allocdirect_partdone(adp, wkhd)
8994	struct allocdirect *adp;	/* the completed allocdirect */
8995	struct workhead *wkhd;		/* Work to do when inode is writtne. */
8996{
8997	struct allocdirectlst *listhead;
8998	struct allocdirect *listadp;
8999	struct inodedep *inodedep;
9000	long bsize;
9001
9002	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9003		return;
9004	/*
9005	 * The on-disk inode cannot claim to be any larger than the last
9006	 * fragment that has been written. Otherwise, the on-disk inode
9007	 * might have fragments that were not the last block in the file
9008	 * which would corrupt the filesystem. Thus, we cannot free any
9009	 * allocdirects after one whose ad_oldblkno claims a fragment as
9010	 * these blocks must be rolled back to zero before writing the inode.
9011	 * We check the currently active set of allocdirects in id_inoupdt
9012	 * or id_extupdt as appropriate.
9013	 */
9014	inodedep = adp->ad_inodedep;
9015	bsize = inodedep->id_fs->fs_bsize;
9016	if (adp->ad_state & EXTDATA)
9017		listhead = &inodedep->id_extupdt;
9018	else
9019		listhead = &inodedep->id_inoupdt;
9020	TAILQ_FOREACH(listadp, listhead, ad_next) {
9021		/* found our block */
9022		if (listadp == adp)
9023			break;
9024		/* continue if ad_oldlbn is not a fragment */
9025		if (listadp->ad_oldsize == 0 ||
9026		    listadp->ad_oldsize == bsize)
9027			continue;
9028		/* hit a fragment */
9029		return;
9030	}
9031	/*
9032	 * If we have reached the end of the current list without
9033	 * finding the just finished dependency, then it must be
9034	 * on the future dependency list. Future dependencies cannot
9035	 * be freed until they are moved to the current list.
9036	 */
9037	if (listadp == NULL) {
9038#ifdef DEBUG
9039		if (adp->ad_state & EXTDATA)
9040			listhead = &inodedep->id_newextupdt;
9041		else
9042			listhead = &inodedep->id_newinoupdt;
9043		TAILQ_FOREACH(listadp, listhead, ad_next)
9044			/* found our block */
9045			if (listadp == adp)
9046				break;
9047		if (listadp == NULL)
9048			panic("handle_allocdirect_partdone: lost dep");
9049#endif /* DEBUG */
9050		return;
9051	}
9052	/*
9053	 * If we have found the just finished dependency, then queue
9054	 * it along with anything that follows it that is complete.
9055	 * Since the pointer has not yet been written in the inode
9056	 * as the dependency prevents it, place the allocdirect on the
9057	 * bufwait list where it will be freed once the pointer is
9058	 * valid.
9059	 */
9060	if (wkhd == NULL)
9061		wkhd = &inodedep->id_bufwait;
9062	for (; adp; adp = listadp) {
9063		listadp = TAILQ_NEXT(adp, ad_next);
9064		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9065			return;
9066		TAILQ_REMOVE(listhead, adp, ad_next);
9067		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
9068	}
9069}
9070
9071/*
9072 * Called from within softdep_disk_write_complete above.  This routine
9073 * completes successfully written allocindirs.
9074 */
9075static void
9076handle_allocindir_partdone(aip)
9077	struct allocindir *aip;		/* the completed allocindir */
9078{
9079	struct indirdep *indirdep;
9080
9081	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
9082		return;
9083	indirdep = aip->ai_indirdep;
9084	LIST_REMOVE(aip, ai_next);
9085	if (indirdep->ir_state & UNDONE) {
9086		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
9087		return;
9088	}
9089	if (indirdep->ir_state & UFS1FMT)
9090		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9091		    aip->ai_newblkno;
9092	else
9093		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9094		    aip->ai_newblkno;
9095	/*
9096	 * Await the pointer write before freeing the allocindir.
9097	 */
9098	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
9099}
9100
9101/*
9102 * Release segments held on a jwork list.
9103 */
9104static void
9105handle_jwork(wkhd)
9106	struct workhead *wkhd;
9107{
9108	struct worklist *wk;
9109
9110	while ((wk = LIST_FIRST(wkhd)) != NULL) {
9111		WORKLIST_REMOVE(wk);
9112		switch (wk->wk_type) {
9113		case D_JSEGDEP:
9114			free_jsegdep(WK_JSEGDEP(wk));
9115			continue;
9116		default:
9117			panic("handle_jwork: Unknown type %s\n",
9118			    TYPENAME(wk->wk_type));
9119		}
9120	}
9121}
9122
9123/*
9124 * Handle the bufwait list on an inode when it is safe to release items
9125 * held there.  This normally happens after an inode block is written but
9126 * may be delayed and handled later if there are pending journal items that
9127 * are not yet safe to be released.
9128 */
9129static struct freefile *
9130handle_bufwait(inodedep, refhd)
9131	struct inodedep *inodedep;
9132	struct workhead *refhd;
9133{
9134	struct jaddref *jaddref;
9135	struct freefile *freefile;
9136	struct worklist *wk;
9137
9138	freefile = NULL;
9139	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
9140		WORKLIST_REMOVE(wk);
9141		switch (wk->wk_type) {
9142		case D_FREEFILE:
9143			/*
9144			 * We defer adding freefile to the worklist
9145			 * until all other additions have been made to
9146			 * ensure that it will be done after all the
9147			 * old blocks have been freed.
9148			 */
9149			if (freefile != NULL)
9150				panic("handle_bufwait: freefile");
9151			freefile = WK_FREEFILE(wk);
9152			continue;
9153
9154		case D_MKDIR:
9155			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
9156			continue;
9157
9158		case D_DIRADD:
9159			diradd_inode_written(WK_DIRADD(wk), inodedep);
9160			continue;
9161
9162		case D_FREEFRAG:
9163			wk->wk_state |= COMPLETE;
9164			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
9165				add_to_worklist(wk, 0);
9166			continue;
9167
9168		case D_DIRREM:
9169			wk->wk_state |= COMPLETE;
9170			add_to_worklist(wk, 0);
9171			continue;
9172
9173		case D_ALLOCDIRECT:
9174		case D_ALLOCINDIR:
9175			free_newblk(WK_NEWBLK(wk));
9176			continue;
9177
9178		case D_JNEWBLK:
9179			wk->wk_state |= COMPLETE;
9180			free_jnewblk(WK_JNEWBLK(wk));
9181			continue;
9182
9183		/*
9184		 * Save freed journal segments and add references on
9185		 * the supplied list which will delay their release
9186		 * until the cg bitmap is cleared on disk.
9187		 */
9188		case D_JSEGDEP:
9189			if (refhd == NULL)
9190				free_jsegdep(WK_JSEGDEP(wk));
9191			else
9192				WORKLIST_INSERT(refhd, wk);
9193			continue;
9194
9195		case D_JADDREF:
9196			jaddref = WK_JADDREF(wk);
9197			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
9198			    if_deps);
9199			/*
9200			 * Transfer any jaddrefs to the list to be freed with
9201			 * the bitmap if we're handling a removed file.
9202			 */
9203			if (refhd == NULL) {
9204				wk->wk_state |= COMPLETE;
9205				free_jaddref(jaddref);
9206			} else
9207				WORKLIST_INSERT(refhd, wk);
9208			continue;
9209
9210		default:
9211			panic("handle_bufwait: Unknown type %p(%s)",
9212			    wk, TYPENAME(wk->wk_type));
9213			/* NOTREACHED */
9214		}
9215	}
9216	return (freefile);
9217}
9218/*
9219 * Called from within softdep_disk_write_complete above to restore
9220 * in-memory inode block contents to their most up-to-date state. Note
9221 * that this routine is always called from interrupt level with further
9222 * splbio interrupts blocked.
9223 */
9224static int
9225handle_written_inodeblock(inodedep, bp)
9226	struct inodedep *inodedep;
9227	struct buf *bp;		/* buffer containing the inode block */
9228{
9229	struct freefile *freefile;
9230	struct allocdirect *adp, *nextadp;
9231	struct ufs1_dinode *dp1 = NULL;
9232	struct ufs2_dinode *dp2 = NULL;
9233	struct workhead wkhd;
9234	int hadchanges, fstype;
9235	ino_t freelink;
9236
9237	LIST_INIT(&wkhd);
9238	hadchanges = 0;
9239	freefile = NULL;
9240	if ((inodedep->id_state & IOSTARTED) == 0)
9241		panic("handle_written_inodeblock: not started");
9242	inodedep->id_state &= ~IOSTARTED;
9243	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
9244		fstype = UFS1;
9245		dp1 = (struct ufs1_dinode *)bp->b_data +
9246		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9247		freelink = dp1->di_freelink;
9248	} else {
9249		fstype = UFS2;
9250		dp2 = (struct ufs2_dinode *)bp->b_data +
9251		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9252		freelink = dp2->di_freelink;
9253	}
9254	/*
9255	 * If we wrote a valid freelink pointer during the last write
9256	 * record it here.
9257	 */
9258	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9259		struct inodedep *inon;
9260
9261		inon = TAILQ_NEXT(inodedep, id_unlinked);
9262		if ((inon == NULL && freelink == 0) ||
9263		    (inon && inon->id_ino == freelink)) {
9264			if (inon)
9265				inon->id_state |= UNLINKPREV;
9266			inodedep->id_state |= UNLINKNEXT;
9267		} else
9268			hadchanges = 1;
9269	}
9270	/* Leave this inodeblock dirty until it's in the list. */
9271	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED)
9272		hadchanges = 1;
9273	/*
9274	 * If we had to rollback the inode allocation because of
9275	 * bitmaps being incomplete, then simply restore it.
9276	 * Keep the block dirty so that it will not be reclaimed until
9277	 * all associated dependencies have been cleared and the
9278	 * corresponding updates written to disk.
9279	 */
9280	if (inodedep->id_savedino1 != NULL) {
9281		hadchanges = 1;
9282		if (fstype == UFS1)
9283			*dp1 = *inodedep->id_savedino1;
9284		else
9285			*dp2 = *inodedep->id_savedino2;
9286		free(inodedep->id_savedino1, M_SAVEDINO);
9287		inodedep->id_savedino1 = NULL;
9288		if ((bp->b_flags & B_DELWRI) == 0)
9289			stat_inode_bitmap++;
9290		bdirty(bp);
9291		/*
9292		 * If the inode is clear here and GOINGAWAY it will never
9293		 * be written.  Process the bufwait and clear any pending
9294		 * work which may include the freefile.
9295		 */
9296		if (inodedep->id_state & GOINGAWAY)
9297			goto bufwait;
9298		return (1);
9299	}
9300	inodedep->id_state |= COMPLETE;
9301	/*
9302	 * Roll forward anything that had to be rolled back before
9303	 * the inode could be updated.
9304	 */
9305	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
9306		nextadp = TAILQ_NEXT(adp, ad_next);
9307		if (adp->ad_state & ATTACHED)
9308			panic("handle_written_inodeblock: new entry");
9309		if (fstype == UFS1) {
9310			if (adp->ad_offset < NDADDR) {
9311				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9312					panic("%s %s #%jd mismatch %d != %jd",
9313					    "handle_written_inodeblock:",
9314					    "direct pointer",
9315					    (intmax_t)adp->ad_offset,
9316					    dp1->di_db[adp->ad_offset],
9317					    (intmax_t)adp->ad_oldblkno);
9318				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
9319			} else {
9320				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
9321					panic("%s: %s #%jd allocated as %d",
9322					    "handle_written_inodeblock",
9323					    "indirect pointer",
9324					    (intmax_t)adp->ad_offset - NDADDR,
9325					    dp1->di_ib[adp->ad_offset - NDADDR]);
9326				dp1->di_ib[adp->ad_offset - NDADDR] =
9327				    adp->ad_newblkno;
9328			}
9329		} else {
9330			if (adp->ad_offset < NDADDR) {
9331				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9332					panic("%s: %s #%jd %s %jd != %jd",
9333					    "handle_written_inodeblock",
9334					    "direct pointer",
9335					    (intmax_t)adp->ad_offset, "mismatch",
9336					    (intmax_t)dp2->di_db[adp->ad_offset],
9337					    (intmax_t)adp->ad_oldblkno);
9338				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
9339			} else {
9340				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
9341					panic("%s: %s #%jd allocated as %jd",
9342					    "handle_written_inodeblock",
9343					    "indirect pointer",
9344					    (intmax_t)adp->ad_offset - NDADDR,
9345					    (intmax_t)
9346					    dp2->di_ib[adp->ad_offset - NDADDR]);
9347				dp2->di_ib[adp->ad_offset - NDADDR] =
9348				    adp->ad_newblkno;
9349			}
9350		}
9351		adp->ad_state &= ~UNDONE;
9352		adp->ad_state |= ATTACHED;
9353		hadchanges = 1;
9354	}
9355	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
9356		nextadp = TAILQ_NEXT(adp, ad_next);
9357		if (adp->ad_state & ATTACHED)
9358			panic("handle_written_inodeblock: new entry");
9359		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
9360			panic("%s: direct pointers #%jd %s %jd != %jd",
9361			    "handle_written_inodeblock",
9362			    (intmax_t)adp->ad_offset, "mismatch",
9363			    (intmax_t)dp2->di_extb[adp->ad_offset],
9364			    (intmax_t)adp->ad_oldblkno);
9365		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
9366		adp->ad_state &= ~UNDONE;
9367		adp->ad_state |= ATTACHED;
9368		hadchanges = 1;
9369	}
9370	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
9371		stat_direct_blk_ptrs++;
9372	/*
9373	 * Reset the file size to its most up-to-date value.
9374	 */
9375	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
9376		panic("handle_written_inodeblock: bad size");
9377	if (inodedep->id_savednlink > LINK_MAX)
9378		panic("handle_written_inodeblock: Invalid link count "
9379		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
9380	if (fstype == UFS1) {
9381		if (dp1->di_nlink != inodedep->id_savednlink) {
9382			dp1->di_nlink = inodedep->id_savednlink;
9383			hadchanges = 1;
9384		}
9385		if (dp1->di_size != inodedep->id_savedsize) {
9386			dp1->di_size = inodedep->id_savedsize;
9387			hadchanges = 1;
9388		}
9389	} else {
9390		if (dp2->di_nlink != inodedep->id_savednlink) {
9391			dp2->di_nlink = inodedep->id_savednlink;
9392			hadchanges = 1;
9393		}
9394		if (dp2->di_size != inodedep->id_savedsize) {
9395			dp2->di_size = inodedep->id_savedsize;
9396			hadchanges = 1;
9397		}
9398		if (dp2->di_extsize != inodedep->id_savedextsize) {
9399			dp2->di_extsize = inodedep->id_savedextsize;
9400			hadchanges = 1;
9401		}
9402	}
9403	inodedep->id_savedsize = -1;
9404	inodedep->id_savedextsize = -1;
9405	inodedep->id_savednlink = -1;
9406	/*
9407	 * If there were any rollbacks in the inode block, then it must be
9408	 * marked dirty so that its will eventually get written back in
9409	 * its correct form.
9410	 */
9411	if (hadchanges)
9412		bdirty(bp);
9413bufwait:
9414	/*
9415	 * Process any allocdirects that completed during the update.
9416	 */
9417	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
9418		handle_allocdirect_partdone(adp, &wkhd);
9419	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
9420		handle_allocdirect_partdone(adp, &wkhd);
9421	/*
9422	 * Process deallocations that were held pending until the
9423	 * inode had been written to disk. Freeing of the inode
9424	 * is delayed until after all blocks have been freed to
9425	 * avoid creation of new <vfsid, inum, lbn> triples
9426	 * before the old ones have been deleted.  Completely
9427	 * unlinked inodes are not processed until the unlinked
9428	 * inode list is written or the last reference is removed.
9429	 */
9430	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
9431		freefile = handle_bufwait(inodedep, NULL);
9432		if (freefile && !LIST_EMPTY(&wkhd)) {
9433			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
9434			freefile = NULL;
9435		}
9436	}
9437	/*
9438	 * Move rolled forward dependency completions to the bufwait list
9439	 * now that those that were already written have been processed.
9440	 */
9441	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
9442		panic("handle_written_inodeblock: bufwait but no changes");
9443	jwork_move(&inodedep->id_bufwait, &wkhd);
9444
9445	if (freefile != NULL) {
9446		/*
9447		 * If the inode is goingaway it was never written.  Fake up
9448		 * the state here so free_inodedep() can succeed.
9449		 */
9450		if (inodedep->id_state & GOINGAWAY)
9451			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
9452		if (free_inodedep(inodedep) == 0)
9453			panic("handle_written_inodeblock: live inodedep %p",
9454			    inodedep);
9455		add_to_worklist(&freefile->fx_list, 0);
9456		return (0);
9457	}
9458
9459	/*
9460	 * If no outstanding dependencies, free it.
9461	 */
9462	if (free_inodedep(inodedep) ||
9463	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
9464	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
9465	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
9466	     LIST_FIRST(&inodedep->id_bufwait) == 0))
9467		return (0);
9468	return (hadchanges);
9469}
9470
9471static int
9472handle_written_indirdep(indirdep, bp, bpp)
9473	struct indirdep *indirdep;
9474	struct buf *bp;
9475	struct buf **bpp;
9476{
9477	struct allocindir *aip;
9478	int chgs;
9479
9480	if (indirdep->ir_state & GOINGAWAY)
9481		panic("disk_write_complete: indirdep gone");
9482	chgs = 0;
9483	/*
9484	 * If there were rollbacks revert them here.
9485	 */
9486	if (indirdep->ir_saveddata) {
9487		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
9488		free(indirdep->ir_saveddata, M_INDIRDEP);
9489		indirdep->ir_saveddata = 0;
9490		chgs = 1;
9491	}
9492	indirdep->ir_state &= ~UNDONE;
9493	indirdep->ir_state |= ATTACHED;
9494	/*
9495	 * Move allocindirs with written pointers to the completehd if
9496	 * the indirdep's pointer is not yet written.  Otherwise
9497	 * free them here.
9498	 */
9499	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
9500		LIST_REMOVE(aip, ai_next);
9501		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
9502			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
9503			    ai_next);
9504			continue;
9505		}
9506		free_newblk(&aip->ai_block);
9507	}
9508	/*
9509	 * Move allocindirs that have finished dependency processing from
9510	 * the done list to the write list after updating the pointers.
9511	 */
9512	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
9513		handle_allocindir_partdone(aip);
9514		if (aip == LIST_FIRST(&indirdep->ir_donehd))
9515			panic("disk_write_complete: not gone");
9516		chgs = 1;
9517	}
9518	/*
9519	 * If this indirdep has been detached from its newblk during
9520	 * I/O we need to keep this dep attached to the buffer so
9521	 * deallocate_dependencies can find it and properly resolve
9522	 * any outstanding dependencies.
9523	 */
9524	if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
9525		chgs = 1;
9526	if ((bp->b_flags & B_DELWRI) == 0)
9527		stat_indir_blk_ptrs++;
9528	/*
9529	 * If there were no changes we can discard the savedbp and detach
9530	 * ourselves from the buf.  We are only carrying completed pointers
9531	 * in this case.
9532	 */
9533	if (chgs == 0) {
9534		struct buf *sbp;
9535
9536		sbp = indirdep->ir_savebp;
9537		sbp->b_flags |= B_INVAL | B_NOCACHE;
9538		indirdep->ir_savebp = NULL;
9539		if (*bpp != NULL)
9540			panic("handle_written_indirdep: bp already exists.");
9541		*bpp = sbp;
9542	} else
9543		bdirty(bp);
9544	/*
9545	 * If there are no fresh dependencies and none waiting on writes
9546	 * we can free the indirdep.
9547	 */
9548	if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
9549		if (indirdep->ir_state & ONDEPLIST)
9550			LIST_REMOVE(indirdep, ir_next);
9551		free_indirdep(indirdep);
9552		return (0);
9553	}
9554
9555	return (chgs);
9556}
9557
9558/*
9559 * Process a diradd entry after its dependent inode has been written.
9560 * This routine must be called with splbio interrupts blocked.
9561 */
9562static void
9563diradd_inode_written(dap, inodedep)
9564	struct diradd *dap;
9565	struct inodedep *inodedep;
9566{
9567
9568	dap->da_state |= COMPLETE;
9569	complete_diradd(dap);
9570	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9571}
9572
9573/*
9574 * Returns true if the bmsafemap will have rollbacks when written.  Must
9575 * only be called with lk and the buf lock on the cg held.
9576 */
9577static int
9578bmsafemap_rollbacks(bmsafemap)
9579	struct bmsafemap *bmsafemap;
9580{
9581
9582	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
9583	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
9584}
9585
9586/*
9587 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
9588 * changes if it's not a background write.  Set all written dependencies
9589 * to DEPCOMPLETE and free the structure if possible.
9590 */
9591static int
9592handle_written_bmsafemap(bmsafemap, bp)
9593	struct bmsafemap *bmsafemap;
9594	struct buf *bp;
9595{
9596	struct newblk *newblk;
9597	struct inodedep *inodedep;
9598	struct jaddref *jaddref, *jatmp;
9599	struct jnewblk *jnewblk, *jntmp;
9600	uint8_t *inosused;
9601	uint8_t *blksfree;
9602	struct cg *cgp;
9603	struct fs *fs;
9604	ino_t ino;
9605	long bno;
9606	int chgs;
9607	int i;
9608
9609	if ((bmsafemap->sm_state & IOSTARTED) == 0)
9610		panic("initiate_write_bmsafemap: Not started\n");
9611	chgs = 0;
9612	bmsafemap->sm_state &= ~IOSTARTED;
9613	/*
9614	 * Restore unwritten inode allocation pending jaddref writes.
9615	 */
9616	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
9617		cgp = (struct cg *)bp->b_data;
9618		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9619		inosused = cg_inosused(cgp);
9620		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
9621		    ja_bmdeps, jatmp) {
9622			if ((jaddref->ja_state & UNDONE) == 0)
9623				continue;
9624			ino = jaddref->ja_ino % fs->fs_ipg;
9625			if (isset(inosused, ino))
9626				panic("handle_written_bmsafemap: "
9627				    "re-allocated inode");
9628			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
9629				if ((jaddref->ja_mode & IFMT) == IFDIR)
9630					cgp->cg_cs.cs_ndir++;
9631				cgp->cg_cs.cs_nifree--;
9632				setbit(inosused, ino);
9633				chgs = 1;
9634			}
9635			jaddref->ja_state &= ~UNDONE;
9636			jaddref->ja_state |= ATTACHED;
9637			free_jaddref(jaddref);
9638		}
9639	}
9640	/*
9641	 * Restore any block allocations which are pending journal writes.
9642	 */
9643	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
9644		cgp = (struct cg *)bp->b_data;
9645		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9646		blksfree = cg_blksfree(cgp);
9647		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
9648		    jntmp) {
9649			if ((jnewblk->jn_state & UNDONE) == 0)
9650				continue;
9651			bno = dtogd(fs, jnewblk->jn_blkno);
9652			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
9653			    i++) {
9654				if (bp->b_xflags & BX_BKGRDMARKER)
9655					break;
9656				if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
9657				    isclr(blksfree, bno + i))
9658					panic("handle_written_bmsafemap: "
9659					    "re-allocated fragment");
9660				clrbit(blksfree, bno + i);
9661				chgs = 1;
9662			}
9663			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
9664			jnewblk->jn_state |= ATTACHED;
9665			free_jnewblk(jnewblk);
9666		}
9667	}
9668	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
9669		newblk->nb_state |= DEPCOMPLETE;
9670		newblk->nb_state &= ~ONDEPLIST;
9671		newblk->nb_bmsafemap = NULL;
9672		LIST_REMOVE(newblk, nb_deps);
9673		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
9674			handle_allocdirect_partdone(
9675			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
9676		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
9677			handle_allocindir_partdone(
9678			    WK_ALLOCINDIR(&newblk->nb_list));
9679		else if (newblk->nb_list.wk_type != D_NEWBLK)
9680			panic("handle_written_bmsafemap: Unexpected type: %s",
9681			    TYPENAME(newblk->nb_list.wk_type));
9682	}
9683	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
9684		inodedep->id_state |= DEPCOMPLETE;
9685		inodedep->id_state &= ~ONDEPLIST;
9686		LIST_REMOVE(inodedep, id_deps);
9687		inodedep->id_bmsafemap = NULL;
9688	}
9689	if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
9690	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
9691	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
9692	    LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
9693		if (chgs)
9694			bdirty(bp);
9695		LIST_REMOVE(bmsafemap, sm_hash);
9696		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
9697		return (0);
9698	}
9699	bdirty(bp);
9700	return (1);
9701}
9702
9703/*
9704 * Try to free a mkdir dependency.
9705 */
9706static void
9707complete_mkdir(mkdir)
9708	struct mkdir *mkdir;
9709{
9710	struct diradd *dap;
9711
9712	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
9713		return;
9714	LIST_REMOVE(mkdir, md_mkdirs);
9715	dap = mkdir->md_diradd;
9716	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
9717	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
9718		dap->da_state |= DEPCOMPLETE;
9719		complete_diradd(dap);
9720	}
9721	WORKITEM_FREE(mkdir, D_MKDIR);
9722}
9723
9724/*
9725 * Handle the completion of a mkdir dependency.
9726 */
9727static void
9728handle_written_mkdir(mkdir, type)
9729	struct mkdir *mkdir;
9730	int type;
9731{
9732
9733	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
9734		panic("handle_written_mkdir: bad type");
9735	mkdir->md_state |= COMPLETE;
9736	complete_mkdir(mkdir);
9737}
9738
9739static void
9740free_pagedep(pagedep)
9741	struct pagedep *pagedep;
9742{
9743	int i;
9744
9745	if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
9746		return;
9747	for (i = 0; i < DAHASHSZ; i++)
9748		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
9749			return;
9750	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
9751		return;
9752	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
9753		return;
9754	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
9755		return;
9756	LIST_REMOVE(pagedep, pd_hash);
9757	WORKITEM_FREE(pagedep, D_PAGEDEP);
9758}
9759
9760/*
9761 * Called from within softdep_disk_write_complete above.
9762 * A write operation was just completed. Removed inodes can
9763 * now be freed and associated block pointers may be committed.
9764 * Note that this routine is always called from interrupt level
9765 * with further splbio interrupts blocked.
9766 */
9767static int
9768handle_written_filepage(pagedep, bp)
9769	struct pagedep *pagedep;
9770	struct buf *bp;		/* buffer containing the written page */
9771{
9772	struct dirrem *dirrem;
9773	struct diradd *dap, *nextdap;
9774	struct direct *ep;
9775	int i, chgs;
9776
9777	if ((pagedep->pd_state & IOSTARTED) == 0)
9778		panic("handle_written_filepage: not started");
9779	pagedep->pd_state &= ~IOSTARTED;
9780	/*
9781	 * Process any directory removals that have been committed.
9782	 */
9783	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
9784		LIST_REMOVE(dirrem, dm_next);
9785		dirrem->dm_state |= COMPLETE;
9786		dirrem->dm_dirinum = pagedep->pd_ino;
9787		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9788		    ("handle_written_filepage: Journal entries not written."));
9789		add_to_worklist(&dirrem->dm_list, 0);
9790	}
9791	/*
9792	 * Free any directory additions that have been committed.
9793	 * If it is a newly allocated block, we have to wait until
9794	 * the on-disk directory inode claims the new block.
9795	 */
9796	if ((pagedep->pd_state & NEWBLOCK) == 0)
9797		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
9798			free_diradd(dap, NULL);
9799	/*
9800	 * Uncommitted directory entries must be restored.
9801	 */
9802	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
9803		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
9804		     dap = nextdap) {
9805			nextdap = LIST_NEXT(dap, da_pdlist);
9806			if (dap->da_state & ATTACHED)
9807				panic("handle_written_filepage: attached");
9808			ep = (struct direct *)
9809			    ((char *)bp->b_data + dap->da_offset);
9810			ep->d_ino = dap->da_newinum;
9811			dap->da_state &= ~UNDONE;
9812			dap->da_state |= ATTACHED;
9813			chgs = 1;
9814			/*
9815			 * If the inode referenced by the directory has
9816			 * been written out, then the dependency can be
9817			 * moved to the pending list.
9818			 */
9819			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
9820				LIST_REMOVE(dap, da_pdlist);
9821				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
9822				    da_pdlist);
9823			}
9824		}
9825	}
9826	/*
9827	 * If there were any rollbacks in the directory, then it must be
9828	 * marked dirty so that its will eventually get written back in
9829	 * its correct form.
9830	 */
9831	if (chgs) {
9832		if ((bp->b_flags & B_DELWRI) == 0)
9833			stat_dir_entry++;
9834		bdirty(bp);
9835		return (1);
9836	}
9837	/*
9838	 * If we are not waiting for a new directory block to be
9839	 * claimed by its inode, then the pagedep will be freed.
9840	 * Otherwise it will remain to track any new entries on
9841	 * the page in case they are fsync'ed.
9842	 */
9843	if ((pagedep->pd_state & NEWBLOCK) == 0 &&
9844	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
9845		LIST_REMOVE(pagedep, pd_hash);
9846		WORKITEM_FREE(pagedep, D_PAGEDEP);
9847	}
9848	return (0);
9849}
9850
9851/*
9852 * Writing back in-core inode structures.
9853 *
9854 * The filesystem only accesses an inode's contents when it occupies an
9855 * "in-core" inode structure.  These "in-core" structures are separate from
9856 * the page frames used to cache inode blocks.  Only the latter are
9857 * transferred to/from the disk.  So, when the updated contents of the
9858 * "in-core" inode structure are copied to the corresponding in-memory inode
9859 * block, the dependencies are also transferred.  The following procedure is
9860 * called when copying a dirty "in-core" inode to a cached inode block.
9861 */
9862
9863/*
9864 * Called when an inode is loaded from disk. If the effective link count
9865 * differed from the actual link count when it was last flushed, then we
9866 * need to ensure that the correct effective link count is put back.
9867 */
9868void
9869softdep_load_inodeblock(ip)
9870	struct inode *ip;	/* the "in_core" copy of the inode */
9871{
9872	struct inodedep *inodedep;
9873
9874	/*
9875	 * Check for alternate nlink count.
9876	 */
9877	ip->i_effnlink = ip->i_nlink;
9878	ACQUIRE_LOCK(&lk);
9879	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9880	    &inodedep) == 0) {
9881		FREE_LOCK(&lk);
9882		return;
9883	}
9884	ip->i_effnlink -= inodedep->id_nlinkdelta;
9885	FREE_LOCK(&lk);
9886}
9887
9888/*
9889 * This routine is called just before the "in-core" inode
9890 * information is to be copied to the in-memory inode block.
9891 * Recall that an inode block contains several inodes. If
9892 * the force flag is set, then the dependencies will be
9893 * cleared so that the update can always be made. Note that
9894 * the buffer is locked when this routine is called, so we
9895 * will never be in the middle of writing the inode block
9896 * to disk.
9897 */
9898void
9899softdep_update_inodeblock(ip, bp, waitfor)
9900	struct inode *ip;	/* the "in_core" copy of the inode */
9901	struct buf *bp;		/* the buffer containing the inode block */
9902	int waitfor;		/* nonzero => update must be allowed */
9903{
9904	struct inodedep *inodedep;
9905	struct inoref *inoref;
9906	struct worklist *wk;
9907	struct mount *mp;
9908	struct buf *ibp;
9909	struct fs *fs;
9910	int error;
9911
9912	mp = UFSTOVFS(ip->i_ump);
9913	fs = ip->i_fs;
9914	/*
9915	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
9916	 * does not have access to the in-core ip so must write directly into
9917	 * the inode block buffer when setting freelink.
9918	 */
9919	if (fs->fs_magic == FS_UFS1_MAGIC)
9920		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
9921		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
9922	else
9923		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
9924		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
9925	/*
9926	 * If the effective link count is not equal to the actual link
9927	 * count, then we must track the difference in an inodedep while
9928	 * the inode is (potentially) tossed out of the cache. Otherwise,
9929	 * if there is no existing inodedep, then there are no dependencies
9930	 * to track.
9931	 */
9932	ACQUIRE_LOCK(&lk);
9933again:
9934	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
9935		FREE_LOCK(&lk);
9936		if (ip->i_effnlink != ip->i_nlink)
9937			panic("softdep_update_inodeblock: bad link count");
9938		return;
9939	}
9940	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
9941		panic("softdep_update_inodeblock: bad delta");
9942	/*
9943	 * If we're flushing all dependencies we must also move any waiting
9944	 * for journal writes onto the bufwait list prior to I/O.
9945	 */
9946	if (waitfor) {
9947		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
9948			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
9949			    == DEPCOMPLETE) {
9950				stat_jwait_inode++;
9951				jwait(&inoref->if_list);
9952				goto again;
9953			}
9954		}
9955	}
9956	/*
9957	 * Changes have been initiated. Anything depending on these
9958	 * changes cannot occur until this inode has been written.
9959	 */
9960	inodedep->id_state &= ~COMPLETE;
9961	if ((inodedep->id_state & ONWORKLIST) == 0)
9962		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
9963	/*
9964	 * Any new dependencies associated with the incore inode must
9965	 * now be moved to the list associated with the buffer holding
9966	 * the in-memory copy of the inode. Once merged process any
9967	 * allocdirects that are completed by the merger.
9968	 */
9969	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
9970	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
9971		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
9972		    NULL);
9973	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
9974	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
9975		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
9976		    NULL);
9977	/*
9978	 * Now that the inode has been pushed into the buffer, the
9979	 * operations dependent on the inode being written to disk
9980	 * can be moved to the id_bufwait so that they will be
9981	 * processed when the buffer I/O completes.
9982	 */
9983	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
9984		WORKLIST_REMOVE(wk);
9985		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
9986	}
9987	/*
9988	 * Newly allocated inodes cannot be written until the bitmap
9989	 * that allocates them have been written (indicated by
9990	 * DEPCOMPLETE being set in id_state). If we are doing a
9991	 * forced sync (e.g., an fsync on a file), we force the bitmap
9992	 * to be written so that the update can be done.
9993	 */
9994	if (waitfor == 0) {
9995		FREE_LOCK(&lk);
9996		return;
9997	}
9998retry:
9999	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
10000		FREE_LOCK(&lk);
10001		return;
10002	}
10003	ibp = inodedep->id_bmsafemap->sm_buf;
10004	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
10005	if (ibp == NULL) {
10006		/*
10007		 * If ibp came back as NULL, the dependency could have been
10008		 * freed while we slept.  Look it up again, and check to see
10009		 * that it has completed.
10010		 */
10011		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
10012			goto retry;
10013		FREE_LOCK(&lk);
10014		return;
10015	}
10016	FREE_LOCK(&lk);
10017	if ((error = bwrite(ibp)) != 0)
10018		softdep_error("softdep_update_inodeblock: bwrite", error);
10019}
10020
10021/*
10022 * Merge the a new inode dependency list (such as id_newinoupdt) into an
10023 * old inode dependency list (such as id_inoupdt). This routine must be
10024 * called with splbio interrupts blocked.
10025 */
10026static void
10027merge_inode_lists(newlisthead, oldlisthead)
10028	struct allocdirectlst *newlisthead;
10029	struct allocdirectlst *oldlisthead;
10030{
10031	struct allocdirect *listadp, *newadp;
10032
10033	newadp = TAILQ_FIRST(newlisthead);
10034	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
10035		if (listadp->ad_offset < newadp->ad_offset) {
10036			listadp = TAILQ_NEXT(listadp, ad_next);
10037			continue;
10038		}
10039		TAILQ_REMOVE(newlisthead, newadp, ad_next);
10040		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
10041		if (listadp->ad_offset == newadp->ad_offset) {
10042			allocdirect_merge(oldlisthead, newadp,
10043			    listadp);
10044			listadp = newadp;
10045		}
10046		newadp = TAILQ_FIRST(newlisthead);
10047	}
10048	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
10049		TAILQ_REMOVE(newlisthead, newadp, ad_next);
10050		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
10051	}
10052}
10053
10054/*
10055 * If we are doing an fsync, then we must ensure that any directory
10056 * entries for the inode have been written after the inode gets to disk.
10057 */
10058int
10059softdep_fsync(vp)
10060	struct vnode *vp;	/* the "in_core" copy of the inode */
10061{
10062	struct inodedep *inodedep;
10063	struct pagedep *pagedep;
10064	struct inoref *inoref;
10065	struct worklist *wk;
10066	struct diradd *dap;
10067	struct mount *mp;
10068	struct vnode *pvp;
10069	struct inode *ip;
10070	struct buf *bp;
10071	struct fs *fs;
10072	struct thread *td = curthread;
10073	int error, flushparent, pagedep_new_block;
10074	ino_t parentino;
10075	ufs_lbn_t lbn;
10076
10077	ip = VTOI(vp);
10078	fs = ip->i_fs;
10079	mp = vp->v_mount;
10080	ACQUIRE_LOCK(&lk);
10081restart:
10082	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
10083		FREE_LOCK(&lk);
10084		return (0);
10085	}
10086	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10087		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10088		    == DEPCOMPLETE) {
10089			stat_jwait_inode++;
10090			jwait(&inoref->if_list);
10091			goto restart;
10092		}
10093	}
10094	if (!LIST_EMPTY(&inodedep->id_inowait) ||
10095	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
10096	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
10097	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
10098	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
10099		panic("softdep_fsync: pending ops %p", inodedep);
10100	for (error = 0, flushparent = 0; ; ) {
10101		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
10102			break;
10103		if (wk->wk_type != D_DIRADD)
10104			panic("softdep_fsync: Unexpected type %s",
10105			    TYPENAME(wk->wk_type));
10106		dap = WK_DIRADD(wk);
10107		/*
10108		 * Flush our parent if this directory entry has a MKDIR_PARENT
10109		 * dependency or is contained in a newly allocated block.
10110		 */
10111		if (dap->da_state & DIRCHG)
10112			pagedep = dap->da_previous->dm_pagedep;
10113		else
10114			pagedep = dap->da_pagedep;
10115		parentino = pagedep->pd_ino;
10116		lbn = pagedep->pd_lbn;
10117		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
10118			panic("softdep_fsync: dirty");
10119		if ((dap->da_state & MKDIR_PARENT) ||
10120		    (pagedep->pd_state & NEWBLOCK))
10121			flushparent = 1;
10122		else
10123			flushparent = 0;
10124		/*
10125		 * If we are being fsync'ed as part of vgone'ing this vnode,
10126		 * then we will not be able to release and recover the
10127		 * vnode below, so we just have to give up on writing its
10128		 * directory entry out. It will eventually be written, just
10129		 * not now, but then the user was not asking to have it
10130		 * written, so we are not breaking any promises.
10131		 */
10132		if (vp->v_iflag & VI_DOOMED)
10133			break;
10134		/*
10135		 * We prevent deadlock by always fetching inodes from the
10136		 * root, moving down the directory tree. Thus, when fetching
10137		 * our parent directory, we first try to get the lock. If
10138		 * that fails, we must unlock ourselves before requesting
10139		 * the lock on our parent. See the comment in ufs_lookup
10140		 * for details on possible races.
10141		 */
10142		FREE_LOCK(&lk);
10143		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
10144		    FFSV_FORCEINSMQ)) {
10145			error = vfs_busy(mp, MBF_NOWAIT);
10146			if (error != 0) {
10147				vfs_ref(mp);
10148				VOP_UNLOCK(vp, 0);
10149				error = vfs_busy(mp, 0);
10150				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10151				vfs_rel(mp);
10152				if (error != 0)
10153					return (ENOENT);
10154				if (vp->v_iflag & VI_DOOMED) {
10155					vfs_unbusy(mp);
10156					return (ENOENT);
10157				}
10158			}
10159			VOP_UNLOCK(vp, 0);
10160			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
10161			    &pvp, FFSV_FORCEINSMQ);
10162			vfs_unbusy(mp);
10163			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10164			if (vp->v_iflag & VI_DOOMED) {
10165				if (error == 0)
10166					vput(pvp);
10167				error = ENOENT;
10168			}
10169			if (error != 0)
10170				return (error);
10171		}
10172		/*
10173		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
10174		 * that are contained in direct blocks will be resolved by
10175		 * doing a ffs_update. Pagedeps contained in indirect blocks
10176		 * may require a complete sync'ing of the directory. So, we
10177		 * try the cheap and fast ffs_update first, and if that fails,
10178		 * then we do the slower ffs_syncvnode of the directory.
10179		 */
10180		if (flushparent) {
10181			int locked;
10182
10183			if ((error = ffs_update(pvp, 1)) != 0) {
10184				vput(pvp);
10185				return (error);
10186			}
10187			ACQUIRE_LOCK(&lk);
10188			locked = 1;
10189			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
10190				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
10191					if (wk->wk_type != D_DIRADD)
10192						panic("softdep_fsync: Unexpected type %s",
10193						      TYPENAME(wk->wk_type));
10194					dap = WK_DIRADD(wk);
10195					if (dap->da_state & DIRCHG)
10196						pagedep = dap->da_previous->dm_pagedep;
10197					else
10198						pagedep = dap->da_pagedep;
10199					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
10200					FREE_LOCK(&lk);
10201					locked = 0;
10202					if (pagedep_new_block &&
10203					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
10204						vput(pvp);
10205						return (error);
10206					}
10207				}
10208			}
10209			if (locked)
10210				FREE_LOCK(&lk);
10211		}
10212		/*
10213		 * Flush directory page containing the inode's name.
10214		 */
10215		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
10216		    &bp);
10217		if (error == 0)
10218			error = bwrite(bp);
10219		else
10220			brelse(bp);
10221		vput(pvp);
10222		if (error != 0)
10223			return (error);
10224		ACQUIRE_LOCK(&lk);
10225		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
10226			break;
10227	}
10228	FREE_LOCK(&lk);
10229	return (0);
10230}
10231
10232/*
10233 * Flush all the dirty bitmaps associated with the block device
10234 * before flushing the rest of the dirty blocks so as to reduce
10235 * the number of dependencies that will have to be rolled back.
10236 */
10237void
10238softdep_fsync_mountdev(vp)
10239	struct vnode *vp;
10240{
10241	struct buf *bp, *nbp;
10242	struct worklist *wk;
10243	struct bufobj *bo;
10244
10245	if (!vn_isdisk(vp, NULL))
10246		panic("softdep_fsync_mountdev: vnode not a disk");
10247	bo = &vp->v_bufobj;
10248restart:
10249	BO_LOCK(bo);
10250	ACQUIRE_LOCK(&lk);
10251	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
10252		/*
10253		 * If it is already scheduled, skip to the next buffer.
10254		 */
10255		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
10256			continue;
10257
10258		if ((bp->b_flags & B_DELWRI) == 0)
10259			panic("softdep_fsync_mountdev: not dirty");
10260		/*
10261		 * We are only interested in bitmaps with outstanding
10262		 * dependencies.
10263		 */
10264		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
10265		    wk->wk_type != D_BMSAFEMAP ||
10266		    (bp->b_vflags & BV_BKGRDINPROG)) {
10267			BUF_UNLOCK(bp);
10268			continue;
10269		}
10270		FREE_LOCK(&lk);
10271		BO_UNLOCK(bo);
10272		bremfree(bp);
10273		(void) bawrite(bp);
10274		goto restart;
10275	}
10276	FREE_LOCK(&lk);
10277	drain_output(vp);
10278	BO_UNLOCK(bo);
10279}
10280
10281/*
10282 * This routine is called when we are trying to synchronously flush a
10283 * file. This routine must eliminate any filesystem metadata dependencies
10284 * so that the syncing routine can succeed by pushing the dirty blocks
10285 * associated with the file. If any I/O errors occur, they are returned.
10286 */
10287int
10288softdep_sync_metadata(struct vnode *vp)
10289{
10290	struct pagedep *pagedep;
10291	struct allocindir *aip;
10292	struct newblk *newblk;
10293	struct buf *bp, *nbp;
10294	struct worklist *wk;
10295	struct bufobj *bo;
10296	int i, error, waitfor;
10297
10298	if (!DOINGSOFTDEP(vp))
10299		return (0);
10300	/*
10301	 * Ensure that any direct block dependencies have been cleared.
10302	 */
10303	ACQUIRE_LOCK(&lk);
10304	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
10305		FREE_LOCK(&lk);
10306		return (error);
10307	}
10308	FREE_LOCK(&lk);
10309	/*
10310	 * For most files, the only metadata dependencies are the
10311	 * cylinder group maps that allocate their inode or blocks.
10312	 * The block allocation dependencies can be found by traversing
10313	 * the dependency lists for any buffers that remain on their
10314	 * dirty buffer list. The inode allocation dependency will
10315	 * be resolved when the inode is updated with MNT_WAIT.
10316	 * This work is done in two passes. The first pass grabs most
10317	 * of the buffers and begins asynchronously writing them. The
10318	 * only way to wait for these asynchronous writes is to sleep
10319	 * on the filesystem vnode which may stay busy for a long time
10320	 * if the filesystem is active. So, instead, we make a second
10321	 * pass over the dependencies blocking on each write. In the
10322	 * usual case we will be blocking against a write that we
10323	 * initiated, so when it is done the dependency will have been
10324	 * resolved. Thus the second pass is expected to end quickly.
10325	 */
10326	waitfor = MNT_NOWAIT;
10327	bo = &vp->v_bufobj;
10328
10329top:
10330	/*
10331	 * We must wait for any I/O in progress to finish so that
10332	 * all potential buffers on the dirty list will be visible.
10333	 */
10334	BO_LOCK(bo);
10335	drain_output(vp);
10336	while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) {
10337		bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT);
10338		if (bp)
10339			break;
10340	}
10341	BO_UNLOCK(bo);
10342	if (bp == NULL)
10343		return (0);
10344loop:
10345	/* While syncing snapshots, we must allow recursive lookups */
10346	BUF_AREC(bp);
10347	ACQUIRE_LOCK(&lk);
10348	/*
10349	 * As we hold the buffer locked, none of its dependencies
10350	 * will disappear.
10351	 */
10352	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
10353		switch (wk->wk_type) {
10354
10355		case D_ALLOCDIRECT:
10356		case D_ALLOCINDIR:
10357			newblk = WK_NEWBLK(wk);
10358			if (newblk->nb_jnewblk != NULL) {
10359				stat_jwait_newblk++;
10360				jwait(&newblk->nb_jnewblk->jn_list);
10361				goto restart;
10362			}
10363			if (newblk->nb_state & DEPCOMPLETE)
10364				continue;
10365			nbp = newblk->nb_bmsafemap->sm_buf;
10366			nbp = getdirtybuf(nbp, &lk, waitfor);
10367			if (nbp == NULL)
10368				continue;
10369			FREE_LOCK(&lk);
10370			if (waitfor == MNT_NOWAIT) {
10371				bawrite(nbp);
10372			} else if ((error = bwrite(nbp)) != 0) {
10373				break;
10374			}
10375			ACQUIRE_LOCK(&lk);
10376			continue;
10377
10378		case D_INDIRDEP:
10379		restart:
10380
10381			LIST_FOREACH(aip,
10382			    &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
10383				newblk = (struct newblk *)aip;
10384				if (newblk->nb_jnewblk != NULL) {
10385					stat_jwait_newblk++;
10386					jwait(&newblk->nb_jnewblk->jn_list);
10387					goto restart;
10388				}
10389				if (newblk->nb_state & DEPCOMPLETE)
10390					continue;
10391				nbp = newblk->nb_bmsafemap->sm_buf;
10392				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
10393				if (nbp == NULL)
10394					goto restart;
10395				FREE_LOCK(&lk);
10396				if ((error = bwrite(nbp)) != 0) {
10397					goto loop_end;
10398				}
10399				ACQUIRE_LOCK(&lk);
10400				goto restart;
10401			}
10402			continue;
10403
10404		case D_PAGEDEP:
10405			/*
10406			 * We are trying to sync a directory that may
10407			 * have dependencies on both its own metadata
10408			 * and/or dependencies on the inodes of any
10409			 * recently allocated files. We walk its diradd
10410			 * lists pushing out the associated inode.
10411			 */
10412			pagedep = WK_PAGEDEP(wk);
10413			for (i = 0; i < DAHASHSZ; i++) {
10414				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
10415					continue;
10416				if ((error =
10417				    flush_pagedep_deps(vp, wk->wk_mp,
10418						&pagedep->pd_diraddhd[i]))) {
10419					FREE_LOCK(&lk);
10420					goto loop_end;
10421				}
10422			}
10423			continue;
10424
10425		default:
10426			panic("softdep_sync_metadata: Unknown type %s",
10427			    TYPENAME(wk->wk_type));
10428			/* NOTREACHED */
10429		}
10430	loop_end:
10431		/* We reach here only in error and unlocked */
10432		if (error == 0)
10433			panic("softdep_sync_metadata: zero error");
10434		BUF_NOREC(bp);
10435		bawrite(bp);
10436		return (error);
10437	}
10438	FREE_LOCK(&lk);
10439	BO_LOCK(bo);
10440	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
10441		nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT);
10442		if (nbp)
10443			break;
10444	}
10445	BO_UNLOCK(bo);
10446	BUF_NOREC(bp);
10447	bawrite(bp);
10448	if (nbp != NULL) {
10449		bp = nbp;
10450		goto loop;
10451	}
10452	/*
10453	 * The brief unlock is to allow any pent up dependency
10454	 * processing to be done. Then proceed with the second pass.
10455	 */
10456	if (waitfor == MNT_NOWAIT) {
10457		waitfor = MNT_WAIT;
10458		goto top;
10459	}
10460
10461	/*
10462	 * If we have managed to get rid of all the dirty buffers,
10463	 * then we are done. For certain directories and block
10464	 * devices, we may need to do further work.
10465	 *
10466	 * We must wait for any I/O in progress to finish so that
10467	 * all potential buffers on the dirty list will be visible.
10468	 */
10469	BO_LOCK(bo);
10470	drain_output(vp);
10471	BO_UNLOCK(bo);
10472	return ffs_update(vp, 1);
10473	/* return (0); */
10474}
10475
10476/*
10477 * Flush the dependencies associated with an inodedep.
10478 * Called with splbio blocked.
10479 */
10480static int
10481flush_inodedep_deps(mp, ino)
10482	struct mount *mp;
10483	ino_t ino;
10484{
10485	struct inodedep *inodedep;
10486	struct inoref *inoref;
10487	int error, waitfor;
10488
10489	/*
10490	 * This work is done in two passes. The first pass grabs most
10491	 * of the buffers and begins asynchronously writing them. The
10492	 * only way to wait for these asynchronous writes is to sleep
10493	 * on the filesystem vnode which may stay busy for a long time
10494	 * if the filesystem is active. So, instead, we make a second
10495	 * pass over the dependencies blocking on each write. In the
10496	 * usual case we will be blocking against a write that we
10497	 * initiated, so when it is done the dependency will have been
10498	 * resolved. Thus the second pass is expected to end quickly.
10499	 * We give a brief window at the top of the loop to allow
10500	 * any pending I/O to complete.
10501	 */
10502	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
10503		if (error)
10504			return (error);
10505		FREE_LOCK(&lk);
10506		ACQUIRE_LOCK(&lk);
10507restart:
10508		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
10509			return (0);
10510		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10511			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10512			    == DEPCOMPLETE) {
10513				stat_jwait_inode++;
10514				jwait(&inoref->if_list);
10515				goto restart;
10516			}
10517		}
10518		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
10519		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
10520		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
10521		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
10522			continue;
10523		/*
10524		 * If pass2, we are done, otherwise do pass 2.
10525		 */
10526		if (waitfor == MNT_WAIT)
10527			break;
10528		waitfor = MNT_WAIT;
10529	}
10530	/*
10531	 * Try freeing inodedep in case all dependencies have been removed.
10532	 */
10533	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
10534		(void) free_inodedep(inodedep);
10535	return (0);
10536}
10537
10538/*
10539 * Flush an inode dependency list.
10540 * Called with splbio blocked.
10541 */
10542static int
10543flush_deplist(listhead, waitfor, errorp)
10544	struct allocdirectlst *listhead;
10545	int waitfor;
10546	int *errorp;
10547{
10548	struct allocdirect *adp;
10549	struct newblk *newblk;
10550	struct buf *bp;
10551
10552	mtx_assert(&lk, MA_OWNED);
10553	TAILQ_FOREACH(adp, listhead, ad_next) {
10554		newblk = (struct newblk *)adp;
10555		if (newblk->nb_jnewblk != NULL) {
10556			stat_jwait_newblk++;
10557			jwait(&newblk->nb_jnewblk->jn_list);
10558			return (1);
10559		}
10560		if (newblk->nb_state & DEPCOMPLETE)
10561			continue;
10562		bp = newblk->nb_bmsafemap->sm_buf;
10563		bp = getdirtybuf(bp, &lk, waitfor);
10564		if (bp == NULL) {
10565			if (waitfor == MNT_NOWAIT)
10566				continue;
10567			return (1);
10568		}
10569		FREE_LOCK(&lk);
10570		if (waitfor == MNT_NOWAIT) {
10571			bawrite(bp);
10572		} else if ((*errorp = bwrite(bp)) != 0) {
10573			ACQUIRE_LOCK(&lk);
10574			return (1);
10575		}
10576		ACQUIRE_LOCK(&lk);
10577		return (1);
10578	}
10579	return (0);
10580}
10581
10582/*
10583 * Flush dependencies associated with an allocdirect block.
10584 */
10585static int
10586flush_newblk_dep(vp, mp, lbn)
10587	struct vnode *vp;
10588	struct mount *mp;
10589	ufs_lbn_t lbn;
10590{
10591	struct newblk *newblk;
10592	struct bufobj *bo;
10593	struct inode *ip;
10594	struct buf *bp;
10595	ufs2_daddr_t blkno;
10596	int error;
10597
10598	error = 0;
10599	bo = &vp->v_bufobj;
10600	ip = VTOI(vp);
10601	blkno = DIP(ip, i_db[lbn]);
10602	if (blkno == 0)
10603		panic("flush_newblk_dep: Missing block");
10604	ACQUIRE_LOCK(&lk);
10605	/*
10606	 * Loop until all dependencies related to this block are satisfied.
10607	 * We must be careful to restart after each sleep in case a write
10608	 * completes some part of this process for us.
10609	 */
10610	for (;;) {
10611		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
10612			FREE_LOCK(&lk);
10613			break;
10614		}
10615		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
10616			panic("flush_newblk_deps: Bad newblk %p", newblk);
10617		/*
10618		 * Flush the journal.
10619		 */
10620		if (newblk->nb_jnewblk != NULL) {
10621			stat_jwait_newblk++;
10622			jwait(&newblk->nb_jnewblk->jn_list);
10623			continue;
10624		}
10625		/*
10626		 * Write the bitmap dependency.
10627		 */
10628		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
10629			bp = newblk->nb_bmsafemap->sm_buf;
10630			bp = getdirtybuf(bp, &lk, MNT_WAIT);
10631			if (bp == NULL)
10632				continue;
10633			FREE_LOCK(&lk);
10634			error = bwrite(bp);
10635			if (error)
10636				break;
10637			ACQUIRE_LOCK(&lk);
10638			continue;
10639		}
10640		/*
10641		 * Write the buffer.
10642		 */
10643		FREE_LOCK(&lk);
10644		BO_LOCK(bo);
10645		bp = gbincore(bo, lbn);
10646		if (bp != NULL) {
10647			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
10648			    LK_INTERLOCK, BO_MTX(bo));
10649			if (error == ENOLCK) {
10650				ACQUIRE_LOCK(&lk);
10651				continue; /* Slept, retry */
10652			}
10653			if (error != 0)
10654				break;	/* Failed */
10655			if (bp->b_flags & B_DELWRI) {
10656				bremfree(bp);
10657				error = bwrite(bp);
10658				if (error)
10659					break;
10660			} else
10661				BUF_UNLOCK(bp);
10662		} else
10663			BO_UNLOCK(bo);
10664		/*
10665		 * We have to wait for the direct pointers to
10666		 * point at the newdirblk before the dependency
10667		 * will go away.
10668		 */
10669		error = ffs_update(vp, MNT_WAIT);
10670		if (error)
10671			break;
10672		ACQUIRE_LOCK(&lk);
10673	}
10674	return (error);
10675}
10676
10677/*
10678 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
10679 * Called with splbio blocked.
10680 */
10681static int
10682flush_pagedep_deps(pvp, mp, diraddhdp)
10683	struct vnode *pvp;
10684	struct mount *mp;
10685	struct diraddhd *diraddhdp;
10686{
10687	struct inodedep *inodedep;
10688	struct inoref *inoref;
10689	struct ufsmount *ump;
10690	struct diradd *dap;
10691	struct vnode *vp;
10692	int error = 0;
10693	struct buf *bp;
10694	ino_t inum;
10695
10696	ump = VFSTOUFS(mp);
10697restart:
10698	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
10699		/*
10700		 * Flush ourselves if this directory entry
10701		 * has a MKDIR_PARENT dependency.
10702		 */
10703		if (dap->da_state & MKDIR_PARENT) {
10704			FREE_LOCK(&lk);
10705			if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
10706				break;
10707			ACQUIRE_LOCK(&lk);
10708			/*
10709			 * If that cleared dependencies, go on to next.
10710			 */
10711			if (dap != LIST_FIRST(diraddhdp))
10712				continue;
10713			if (dap->da_state & MKDIR_PARENT)
10714				panic("flush_pagedep_deps: MKDIR_PARENT");
10715		}
10716		/*
10717		 * A newly allocated directory must have its "." and
10718		 * ".." entries written out before its name can be
10719		 * committed in its parent.
10720		 */
10721		inum = dap->da_newinum;
10722		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
10723			panic("flush_pagedep_deps: lost inode1");
10724		/*
10725		 * Wait for any pending journal adds to complete so we don't
10726		 * cause rollbacks while syncing.
10727		 */
10728		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10729			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10730			    == DEPCOMPLETE) {
10731				stat_jwait_inode++;
10732				jwait(&inoref->if_list);
10733				goto restart;
10734			}
10735		}
10736		if (dap->da_state & MKDIR_BODY) {
10737			FREE_LOCK(&lk);
10738			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
10739			    FFSV_FORCEINSMQ)))
10740				break;
10741			error = flush_newblk_dep(vp, mp, 0);
10742			/*
10743			 * If we still have the dependency we might need to
10744			 * update the vnode to sync the new link count to
10745			 * disk.
10746			 */
10747			if (error == 0 && dap == LIST_FIRST(diraddhdp))
10748				error = ffs_update(vp, MNT_WAIT);
10749			vput(vp);
10750			if (error != 0)
10751				break;
10752			ACQUIRE_LOCK(&lk);
10753			/*
10754			 * If that cleared dependencies, go on to next.
10755			 */
10756			if (dap != LIST_FIRST(diraddhdp))
10757				continue;
10758			if (dap->da_state & MKDIR_BODY) {
10759				inodedep_lookup(UFSTOVFS(ump), inum, 0,
10760				    &inodedep);
10761				panic("flush_pagedep_deps: MKDIR_BODY "
10762				    "inodedep %p dap %p vp %p",
10763				    inodedep, dap, vp);
10764			}
10765		}
10766		/*
10767		 * Flush the inode on which the directory entry depends.
10768		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
10769		 * the only remaining dependency is that the updated inode
10770		 * count must get pushed to disk. The inode has already
10771		 * been pushed into its inode buffer (via VOP_UPDATE) at
10772		 * the time of the reference count change. So we need only
10773		 * locate that buffer, ensure that there will be no rollback
10774		 * caused by a bitmap dependency, then write the inode buffer.
10775		 */
10776retry:
10777		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
10778			panic("flush_pagedep_deps: lost inode");
10779		/*
10780		 * If the inode still has bitmap dependencies,
10781		 * push them to disk.
10782		 */
10783		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
10784			bp = inodedep->id_bmsafemap->sm_buf;
10785			bp = getdirtybuf(bp, &lk, MNT_WAIT);
10786			if (bp == NULL)
10787				goto retry;
10788			FREE_LOCK(&lk);
10789			if ((error = bwrite(bp)) != 0)
10790				break;
10791			ACQUIRE_LOCK(&lk);
10792			if (dap != LIST_FIRST(diraddhdp))
10793				continue;
10794		}
10795		/*
10796		 * If the inode is still sitting in a buffer waiting
10797		 * to be written or waiting for the link count to be
10798		 * adjusted update it here to flush it to disk.
10799		 */
10800		if (dap == LIST_FIRST(diraddhdp)) {
10801			FREE_LOCK(&lk);
10802			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
10803			    FFSV_FORCEINSMQ)))
10804				break;
10805			error = ffs_update(vp, MNT_WAIT);
10806			vput(vp);
10807			if (error)
10808				break;
10809			ACQUIRE_LOCK(&lk);
10810		}
10811		/*
10812		 * If we have failed to get rid of all the dependencies
10813		 * then something is seriously wrong.
10814		 */
10815		if (dap == LIST_FIRST(diraddhdp)) {
10816			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
10817			panic("flush_pagedep_deps: failed to flush "
10818			    "inodedep %p ino %d dap %p", inodedep, inum, dap);
10819		}
10820	}
10821	if (error)
10822		ACQUIRE_LOCK(&lk);
10823	return (error);
10824}
10825
10826/*
10827 * A large burst of file addition or deletion activity can drive the
10828 * memory load excessively high. First attempt to slow things down
10829 * using the techniques below. If that fails, this routine requests
10830 * the offending operations to fall back to running synchronously
10831 * until the memory load returns to a reasonable level.
10832 */
10833int
10834softdep_slowdown(vp)
10835	struct vnode *vp;
10836{
10837	struct ufsmount *ump;
10838	int jlow;
10839	int max_softdeps_hard;
10840
10841	ACQUIRE_LOCK(&lk);
10842	jlow = 0;
10843	/*
10844	 * Check for journal space if needed.
10845	 */
10846	if (DOINGSUJ(vp)) {
10847		ump = VFSTOUFS(vp->v_mount);
10848		if (journal_space(ump, 0) == 0)
10849			jlow = 1;
10850	}
10851	max_softdeps_hard = max_softdeps * 11 / 10;
10852	if (num_dirrem < max_softdeps_hard / 2 &&
10853	    num_inodedep < max_softdeps_hard &&
10854	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
10855	    num_freeblkdep < max_softdeps_hard && jlow == 0) {
10856		FREE_LOCK(&lk);
10857  		return (0);
10858	}
10859	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
10860		softdep_speedup();
10861	stat_sync_limit_hit += 1;
10862	FREE_LOCK(&lk);
10863	return (1);
10864}
10865
10866/*
10867 * Called by the allocation routines when they are about to fail
10868 * in the hope that we can free up the requested resource (inodes
10869 * or disk space).
10870 *
10871 * First check to see if the work list has anything on it. If it has,
10872 * clean up entries until we successfully free the requested resource.
10873 * Because this process holds inodes locked, we cannot handle any remove
10874 * requests that might block on a locked inode as that could lead to
10875 * deadlock. If the worklist yields none of the requested resource,
10876 * encourage the syncer daemon to help us. In no event will we try for
10877 * longer than tickdelay seconds.
10878 */
10879int
10880softdep_request_cleanup(fs, vp, resource)
10881	struct fs *fs;
10882	struct vnode *vp;
10883	int resource;
10884{
10885	struct ufsmount *ump;
10886	long starttime;
10887	ufs2_daddr_t needed;
10888	int error;
10889
10890	ump = VTOI(vp)->i_ump;
10891	mtx_assert(UFS_MTX(ump), MA_OWNED);
10892	if (resource == FLUSH_BLOCKS_WAIT)
10893		needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
10894	else if (resource == FLUSH_INODES_WAIT)
10895		needed = fs->fs_cstotal.cs_nifree + 2;
10896	else
10897		return (0);
10898	starttime = time_second + tickdelay;
10899	/*
10900	 * If we are being called because of a process doing a
10901	 * copy-on-write, then it is not safe to update the vnode
10902	 * as we may recurse into the copy-on-write routine.
10903	 */
10904	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
10905		UFS_UNLOCK(ump);
10906		error = ffs_update(vp, 1);
10907		UFS_LOCK(ump);
10908		if (error != 0)
10909			return (0);
10910	}
10911	while ((resource == FLUSH_BLOCKS_WAIT && fs->fs_pendingblocks > 0 &&
10912		fs->fs_cstotal.cs_nbfree <= needed) ||
10913	       (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
10914		fs->fs_cstotal.cs_nifree <= needed)) {
10915		if (time_second > starttime)
10916			return (0);
10917		UFS_UNLOCK(ump);
10918		ACQUIRE_LOCK(&lk);
10919		process_removes(vp);
10920		if (ump->softdep_on_worklist > 0 &&
10921		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
10922			stat_worklist_push += 1;
10923			FREE_LOCK(&lk);
10924			UFS_LOCK(ump);
10925			continue;
10926		}
10927		request_cleanup(UFSTOVFS(ump), resource);
10928		FREE_LOCK(&lk);
10929		UFS_LOCK(ump);
10930	}
10931	return (1);
10932}
10933
10934/*
10935 * If memory utilization has gotten too high, deliberately slow things
10936 * down and speed up the I/O processing.
10937 */
10938extern struct thread *syncertd;
10939static int
10940request_cleanup(mp, resource)
10941	struct mount *mp;
10942	int resource;
10943{
10944	struct thread *td = curthread;
10945	struct ufsmount *ump;
10946
10947	mtx_assert(&lk, MA_OWNED);
10948	/*
10949	 * We never hold up the filesystem syncer or buf daemon.
10950	 */
10951	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
10952		return (0);
10953	ump = VFSTOUFS(mp);
10954	/*
10955	 * First check to see if the work list has gotten backlogged.
10956	 * If it has, co-opt this process to help clean up two entries.
10957	 * Because this process may hold inodes locked, we cannot
10958	 * handle any remove requests that might block on a locked
10959	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
10960	 * to avoid recursively processing the worklist.
10961	 */
10962	if (ump->softdep_on_worklist > max_softdeps / 10) {
10963		td->td_pflags |= TDP_SOFTDEP;
10964		process_worklist_item(mp, LK_NOWAIT);
10965		process_worklist_item(mp, LK_NOWAIT);
10966		td->td_pflags &= ~TDP_SOFTDEP;
10967		stat_worklist_push += 2;
10968		return(1);
10969	}
10970	/*
10971	 * Next, we attempt to speed up the syncer process. If that
10972	 * is successful, then we allow the process to continue.
10973	 */
10974	if (softdep_speedup() &&
10975	    resource != FLUSH_BLOCKS_WAIT &&
10976	    resource != FLUSH_INODES_WAIT)
10977		return(0);
10978	/*
10979	 * If we are resource constrained on inode dependencies, try
10980	 * flushing some dirty inodes. Otherwise, we are constrained
10981	 * by file deletions, so try accelerating flushes of directories
10982	 * with removal dependencies. We would like to do the cleanup
10983	 * here, but we probably hold an inode locked at this point and
10984	 * that might deadlock against one that we try to clean. So,
10985	 * the best that we can do is request the syncer daemon to do
10986	 * the cleanup for us.
10987	 */
10988	switch (resource) {
10989
10990	case FLUSH_INODES:
10991	case FLUSH_INODES_WAIT:
10992		stat_ino_limit_push += 1;
10993		req_clear_inodedeps += 1;
10994		stat_countp = &stat_ino_limit_hit;
10995		break;
10996
10997	case FLUSH_BLOCKS:
10998	case FLUSH_BLOCKS_WAIT:
10999		stat_blk_limit_push += 1;
11000		req_clear_remove += 1;
11001		stat_countp = &stat_blk_limit_hit;
11002		break;
11003
11004	default:
11005		panic("request_cleanup: unknown type");
11006	}
11007	/*
11008	 * Hopefully the syncer daemon will catch up and awaken us.
11009	 * We wait at most tickdelay before proceeding in any case.
11010	 */
11011	proc_waiting += 1;
11012	if (callout_pending(&softdep_callout) == FALSE)
11013		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
11014		    pause_timer, 0);
11015
11016	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
11017	proc_waiting -= 1;
11018	return (1);
11019}
11020
11021/*
11022 * Awaken processes pausing in request_cleanup and clear proc_waiting
11023 * to indicate that there is no longer a timer running.
11024 */
11025static void
11026pause_timer(arg)
11027	void *arg;
11028{
11029
11030	/*
11031	 * The callout_ API has acquired mtx and will hold it around this
11032	 * function call.
11033	 */
11034	*stat_countp += 1;
11035	wakeup_one(&proc_waiting);
11036	if (proc_waiting > 0)
11037		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
11038		    pause_timer, 0);
11039}
11040
11041/*
11042 * Flush out a directory with at least one removal dependency in an effort to
11043 * reduce the number of dirrem, freefile, and freeblks dependency structures.
11044 */
11045static void
11046clear_remove(td)
11047	struct thread *td;
11048{
11049	struct pagedep_hashhead *pagedephd;
11050	struct pagedep *pagedep;
11051	static int next = 0;
11052	struct mount *mp;
11053	struct vnode *vp;
11054	struct bufobj *bo;
11055	int error, cnt;
11056	ino_t ino;
11057
11058	mtx_assert(&lk, MA_OWNED);
11059
11060	for (cnt = 0; cnt < pagedep_hash; cnt++) {
11061		pagedephd = &pagedep_hashtbl[next++];
11062		if (next >= pagedep_hash)
11063			next = 0;
11064		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
11065			if (LIST_EMPTY(&pagedep->pd_dirremhd))
11066				continue;
11067			mp = pagedep->pd_list.wk_mp;
11068			ino = pagedep->pd_ino;
11069			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
11070				continue;
11071			FREE_LOCK(&lk);
11072
11073			/*
11074			 * Let unmount clear deps
11075			 */
11076			error = vfs_busy(mp, MBF_NOWAIT);
11077			if (error != 0)
11078				goto finish_write;
11079			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
11080			     FFSV_FORCEINSMQ);
11081			vfs_unbusy(mp);
11082			if (error != 0) {
11083				softdep_error("clear_remove: vget", error);
11084				goto finish_write;
11085			}
11086			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11087				softdep_error("clear_remove: fsync", error);
11088			bo = &vp->v_bufobj;
11089			BO_LOCK(bo);
11090			drain_output(vp);
11091			BO_UNLOCK(bo);
11092			vput(vp);
11093		finish_write:
11094			vn_finished_write(mp);
11095			ACQUIRE_LOCK(&lk);
11096			return;
11097		}
11098	}
11099}
11100
11101/*
11102 * Clear out a block of dirty inodes in an effort to reduce
11103 * the number of inodedep dependency structures.
11104 */
11105static void
11106clear_inodedeps(td)
11107	struct thread *td;
11108{
11109	struct inodedep_hashhead *inodedephd;
11110	struct inodedep *inodedep;
11111	static int next = 0;
11112	struct mount *mp;
11113	struct vnode *vp;
11114	struct fs *fs;
11115	int error, cnt;
11116	ino_t firstino, lastino, ino;
11117
11118	mtx_assert(&lk, MA_OWNED);
11119	/*
11120	 * Pick a random inode dependency to be cleared.
11121	 * We will then gather up all the inodes in its block
11122	 * that have dependencies and flush them out.
11123	 */
11124	for (cnt = 0; cnt < inodedep_hash; cnt++) {
11125		inodedephd = &inodedep_hashtbl[next++];
11126		if (next >= inodedep_hash)
11127			next = 0;
11128		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
11129			break;
11130	}
11131	if (inodedep == NULL)
11132		return;
11133	fs = inodedep->id_fs;
11134	mp = inodedep->id_list.wk_mp;
11135	/*
11136	 * Find the last inode in the block with dependencies.
11137	 */
11138	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
11139	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
11140		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
11141			break;
11142	/*
11143	 * Asynchronously push all but the last inode with dependencies.
11144	 * Synchronously push the last inode with dependencies to ensure
11145	 * that the inode block gets written to free up the inodedeps.
11146	 */
11147	for (ino = firstino; ino <= lastino; ino++) {
11148		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
11149			continue;
11150		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
11151			continue;
11152		FREE_LOCK(&lk);
11153		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
11154		if (error != 0) {
11155			vn_finished_write(mp);
11156			ACQUIRE_LOCK(&lk);
11157			return;
11158		}
11159		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
11160		    FFSV_FORCEINSMQ)) != 0) {
11161			softdep_error("clear_inodedeps: vget", error);
11162			vfs_unbusy(mp);
11163			vn_finished_write(mp);
11164			ACQUIRE_LOCK(&lk);
11165			return;
11166		}
11167		vfs_unbusy(mp);
11168		if (ino == lastino) {
11169			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
11170				softdep_error("clear_inodedeps: fsync1", error);
11171		} else {
11172			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11173				softdep_error("clear_inodedeps: fsync2", error);
11174			BO_LOCK(&vp->v_bufobj);
11175			drain_output(vp);
11176			BO_UNLOCK(&vp->v_bufobj);
11177		}
11178		vput(vp);
11179		vn_finished_write(mp);
11180		ACQUIRE_LOCK(&lk);
11181	}
11182}
11183
11184/*
11185 * Function to determine if the buffer has outstanding dependencies
11186 * that will cause a roll-back if the buffer is written. If wantcount
11187 * is set, return number of dependencies, otherwise just yes or no.
11188 */
11189static int
11190softdep_count_dependencies(bp, wantcount)
11191	struct buf *bp;
11192	int wantcount;
11193{
11194	struct worklist *wk;
11195	struct bmsafemap *bmsafemap;
11196	struct inodedep *inodedep;
11197	struct indirdep *indirdep;
11198	struct freeblks *freeblks;
11199	struct allocindir *aip;
11200	struct pagedep *pagedep;
11201	struct dirrem *dirrem;
11202	struct newblk *newblk;
11203	struct mkdir *mkdir;
11204	struct diradd *dap;
11205	int i, retval;
11206
11207	retval = 0;
11208	ACQUIRE_LOCK(&lk);
11209	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11210		switch (wk->wk_type) {
11211
11212		case D_INODEDEP:
11213			inodedep = WK_INODEDEP(wk);
11214			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
11215				/* bitmap allocation dependency */
11216				retval += 1;
11217				if (!wantcount)
11218					goto out;
11219			}
11220			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
11221				/* direct block pointer dependency */
11222				retval += 1;
11223				if (!wantcount)
11224					goto out;
11225			}
11226			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
11227				/* direct block pointer dependency */
11228				retval += 1;
11229				if (!wantcount)
11230					goto out;
11231			}
11232			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
11233				/* Add reference dependency. */
11234				retval += 1;
11235				if (!wantcount)
11236					goto out;
11237			}
11238			continue;
11239
11240		case D_INDIRDEP:
11241			indirdep = WK_INDIRDEP(wk);
11242
11243			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
11244				/* indirect block pointer dependency */
11245				retval += 1;
11246				if (!wantcount)
11247					goto out;
11248			}
11249			continue;
11250
11251		case D_PAGEDEP:
11252			pagedep = WK_PAGEDEP(wk);
11253			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
11254				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
11255					/* Journal remove ref dependency. */
11256					retval += 1;
11257					if (!wantcount)
11258						goto out;
11259				}
11260			}
11261			for (i = 0; i < DAHASHSZ; i++) {
11262
11263				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
11264					/* directory entry dependency */
11265					retval += 1;
11266					if (!wantcount)
11267						goto out;
11268				}
11269			}
11270			continue;
11271
11272		case D_BMSAFEMAP:
11273			bmsafemap = WK_BMSAFEMAP(wk);
11274			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
11275				/* Add reference dependency. */
11276				retval += 1;
11277				if (!wantcount)
11278					goto out;
11279			}
11280			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
11281				/* Allocate block dependency. */
11282				retval += 1;
11283				if (!wantcount)
11284					goto out;
11285			}
11286			continue;
11287
11288		case D_FREEBLKS:
11289			freeblks = WK_FREEBLKS(wk);
11290			if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
11291				/* Freeblk journal dependency. */
11292				retval += 1;
11293				if (!wantcount)
11294					goto out;
11295			}
11296			continue;
11297
11298		case D_ALLOCDIRECT:
11299		case D_ALLOCINDIR:
11300			newblk = WK_NEWBLK(wk);
11301			if (newblk->nb_jnewblk) {
11302				/* Journal allocate dependency. */
11303				retval += 1;
11304				if (!wantcount)
11305					goto out;
11306			}
11307			continue;
11308
11309		case D_MKDIR:
11310			mkdir = WK_MKDIR(wk);
11311			if (mkdir->md_jaddref) {
11312				/* Journal reference dependency. */
11313				retval += 1;
11314				if (!wantcount)
11315					goto out;
11316			}
11317			continue;
11318
11319		case D_FREEWORK:
11320		case D_FREEDEP:
11321		case D_JSEGDEP:
11322		case D_JSEG:
11323		case D_SBDEP:
11324			/* never a dependency on these blocks */
11325			continue;
11326
11327		default:
11328			panic("softdep_count_dependencies: Unexpected type %s",
11329			    TYPENAME(wk->wk_type));
11330			/* NOTREACHED */
11331		}
11332	}
11333out:
11334	FREE_LOCK(&lk);
11335	return retval;
11336}
11337
11338/*
11339 * Acquire exclusive access to a buffer.
11340 * Must be called with a locked mtx parameter.
11341 * Return acquired buffer or NULL on failure.
11342 */
11343static struct buf *
11344getdirtybuf(bp, mtx, waitfor)
11345	struct buf *bp;
11346	struct mtx *mtx;
11347	int waitfor;
11348{
11349	int error;
11350
11351	mtx_assert(mtx, MA_OWNED);
11352	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
11353		if (waitfor != MNT_WAIT)
11354			return (NULL);
11355		error = BUF_LOCK(bp,
11356		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
11357		/*
11358		 * Even if we sucessfully acquire bp here, we have dropped
11359		 * mtx, which may violates our guarantee.
11360		 */
11361		if (error == 0)
11362			BUF_UNLOCK(bp);
11363		else if (error != ENOLCK)
11364			panic("getdirtybuf: inconsistent lock: %d", error);
11365		mtx_lock(mtx);
11366		return (NULL);
11367	}
11368	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11369		if (mtx == &lk && waitfor == MNT_WAIT) {
11370			mtx_unlock(mtx);
11371			BO_LOCK(bp->b_bufobj);
11372			BUF_UNLOCK(bp);
11373			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11374				bp->b_vflags |= BV_BKGRDWAIT;
11375				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
11376				       PRIBIO | PDROP, "getbuf", 0);
11377			} else
11378				BO_UNLOCK(bp->b_bufobj);
11379			mtx_lock(mtx);
11380			return (NULL);
11381		}
11382		BUF_UNLOCK(bp);
11383		if (waitfor != MNT_WAIT)
11384			return (NULL);
11385		/*
11386		 * The mtx argument must be bp->b_vp's mutex in
11387		 * this case.
11388		 */
11389#ifdef	DEBUG_VFS_LOCKS
11390		if (bp->b_vp->v_type != VCHR)
11391			ASSERT_BO_LOCKED(bp->b_bufobj);
11392#endif
11393		bp->b_vflags |= BV_BKGRDWAIT;
11394		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
11395		return (NULL);
11396	}
11397	if ((bp->b_flags & B_DELWRI) == 0) {
11398		BUF_UNLOCK(bp);
11399		return (NULL);
11400	}
11401	bremfree(bp);
11402	return (bp);
11403}
11404
11405
11406/*
11407 * Check if it is safe to suspend the file system now.  On entry,
11408 * the vnode interlock for devvp should be held.  Return 0 with
11409 * the mount interlock held if the file system can be suspended now,
11410 * otherwise return EAGAIN with the mount interlock held.
11411 */
11412int
11413softdep_check_suspend(struct mount *mp,
11414		      struct vnode *devvp,
11415		      int softdep_deps,
11416		      int softdep_accdeps,
11417		      int secondary_writes,
11418		      int secondary_accwrites)
11419{
11420	struct bufobj *bo;
11421	struct ufsmount *ump;
11422	int error;
11423
11424	ump = VFSTOUFS(mp);
11425	bo = &devvp->v_bufobj;
11426	ASSERT_BO_LOCKED(bo);
11427
11428	for (;;) {
11429		if (!TRY_ACQUIRE_LOCK(&lk)) {
11430			BO_UNLOCK(bo);
11431			ACQUIRE_LOCK(&lk);
11432			FREE_LOCK(&lk);
11433			BO_LOCK(bo);
11434			continue;
11435		}
11436		MNT_ILOCK(mp);
11437		if (mp->mnt_secondary_writes != 0) {
11438			FREE_LOCK(&lk);
11439			BO_UNLOCK(bo);
11440			msleep(&mp->mnt_secondary_writes,
11441			       MNT_MTX(mp),
11442			       (PUSER - 1) | PDROP, "secwr", 0);
11443			BO_LOCK(bo);
11444			continue;
11445		}
11446		break;
11447	}
11448
11449	/*
11450	 * Reasons for needing more work before suspend:
11451	 * - Dirty buffers on devvp.
11452	 * - Softdep activity occurred after start of vnode sync loop
11453	 * - Secondary writes occurred after start of vnode sync loop
11454	 */
11455	error = 0;
11456	if (bo->bo_numoutput > 0 ||
11457	    bo->bo_dirty.bv_cnt > 0 ||
11458	    softdep_deps != 0 ||
11459	    ump->softdep_deps != 0 ||
11460	    softdep_accdeps != ump->softdep_accdeps ||
11461	    secondary_writes != 0 ||
11462	    mp->mnt_secondary_writes != 0 ||
11463	    secondary_accwrites != mp->mnt_secondary_accwrites)
11464		error = EAGAIN;
11465	FREE_LOCK(&lk);
11466	BO_UNLOCK(bo);
11467	return (error);
11468}
11469
11470
11471/*
11472 * Get the number of dependency structures for the file system, both
11473 * the current number and the total number allocated.  These will
11474 * later be used to detect that softdep processing has occurred.
11475 */
11476void
11477softdep_get_depcounts(struct mount *mp,
11478		      int *softdep_depsp,
11479		      int *softdep_accdepsp)
11480{
11481	struct ufsmount *ump;
11482
11483	ump = VFSTOUFS(mp);
11484	ACQUIRE_LOCK(&lk);
11485	*softdep_depsp = ump->softdep_deps;
11486	*softdep_accdepsp = ump->softdep_accdeps;
11487	FREE_LOCK(&lk);
11488}
11489
11490/*
11491 * Wait for pending output on a vnode to complete.
11492 * Must be called with vnode lock and interlock locked.
11493 *
11494 * XXX: Should just be a call to bufobj_wwait().
11495 */
11496static void
11497drain_output(vp)
11498	struct vnode *vp;
11499{
11500	struct bufobj *bo;
11501
11502	bo = &vp->v_bufobj;
11503	ASSERT_VOP_LOCKED(vp, "drain_output");
11504	ASSERT_BO_LOCKED(bo);
11505
11506	while (bo->bo_numoutput) {
11507		bo->bo_flag |= BO_WWAIT;
11508		msleep((caddr_t)&bo->bo_numoutput,
11509		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
11510	}
11511}
11512
11513/*
11514 * Called whenever a buffer that is being invalidated or reallocated
11515 * contains dependencies. This should only happen if an I/O error has
11516 * occurred. The routine is called with the buffer locked.
11517 */
11518static void
11519softdep_deallocate_dependencies(bp)
11520	struct buf *bp;
11521{
11522
11523	if ((bp->b_ioflags & BIO_ERROR) == 0)
11524		panic("softdep_deallocate_dependencies: dangling deps");
11525	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
11526	panic("softdep_deallocate_dependencies: unrecovered I/O error");
11527}
11528
11529/*
11530 * Function to handle asynchronous write errors in the filesystem.
11531 */
11532static void
11533softdep_error(func, error)
11534	char *func;
11535	int error;
11536{
11537
11538	/* XXX should do something better! */
11539	printf("%s: got error %d while accessing filesystem\n", func, error);
11540}
11541
11542#ifdef DDB
11543
11544static void
11545inodedep_print(struct inodedep *inodedep, int verbose)
11546{
11547	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
11548	    " saveino %p\n",
11549	    inodedep, inodedep->id_fs, inodedep->id_state,
11550	    (intmax_t)inodedep->id_ino,
11551	    (intmax_t)fsbtodb(inodedep->id_fs,
11552	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
11553	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
11554	    inodedep->id_savedino1);
11555
11556	if (verbose == 0)
11557		return;
11558
11559	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
11560	    "mkdiradd %p\n",
11561	    LIST_FIRST(&inodedep->id_pendinghd),
11562	    LIST_FIRST(&inodedep->id_bufwait),
11563	    LIST_FIRST(&inodedep->id_inowait),
11564	    TAILQ_FIRST(&inodedep->id_inoreflst),
11565	    inodedep->id_mkdiradd);
11566	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
11567	    TAILQ_FIRST(&inodedep->id_inoupdt),
11568	    TAILQ_FIRST(&inodedep->id_newinoupdt),
11569	    TAILQ_FIRST(&inodedep->id_extupdt),
11570	    TAILQ_FIRST(&inodedep->id_newextupdt));
11571}
11572
11573DB_SHOW_COMMAND(inodedep, db_show_inodedep)
11574{
11575
11576	if (have_addr == 0) {
11577		db_printf("Address required\n");
11578		return;
11579	}
11580	inodedep_print((struct inodedep*)addr, 1);
11581}
11582
11583DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
11584{
11585	struct inodedep_hashhead *inodedephd;
11586	struct inodedep *inodedep;
11587	struct fs *fs;
11588	int cnt;
11589
11590	fs = have_addr ? (struct fs *)addr : NULL;
11591	for (cnt = 0; cnt < inodedep_hash; cnt++) {
11592		inodedephd = &inodedep_hashtbl[cnt];
11593		LIST_FOREACH(inodedep, inodedephd, id_hash) {
11594			if (fs != NULL && fs != inodedep->id_fs)
11595				continue;
11596			inodedep_print(inodedep, 0);
11597		}
11598	}
11599}
11600
11601DB_SHOW_COMMAND(worklist, db_show_worklist)
11602{
11603	struct worklist *wk;
11604
11605	if (have_addr == 0) {
11606		db_printf("Address required\n");
11607		return;
11608	}
11609	wk = (struct worklist *)addr;
11610	printf("worklist: %p type %s state 0x%X\n",
11611	    wk, TYPENAME(wk->wk_type), wk->wk_state);
11612}
11613
11614DB_SHOW_COMMAND(workhead, db_show_workhead)
11615{
11616	struct workhead *wkhd;
11617	struct worklist *wk;
11618	int i;
11619
11620	if (have_addr == 0) {
11621		db_printf("Address required\n");
11622		return;
11623	}
11624	wkhd = (struct workhead *)addr;
11625	wk = LIST_FIRST(wkhd);
11626	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
11627		db_printf("worklist: %p type %s state 0x%X",
11628		    wk, TYPENAME(wk->wk_type), wk->wk_state);
11629	if (i == 100)
11630		db_printf("workhead overflow");
11631	printf("\n");
11632}
11633
11634
11635DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
11636{
11637	struct jaddref *jaddref;
11638	struct diradd *diradd;
11639	struct mkdir *mkdir;
11640
11641	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
11642		diradd = mkdir->md_diradd;
11643		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
11644		    mkdir, mkdir->md_state, diradd, diradd->da_state);
11645		if ((jaddref = mkdir->md_jaddref) != NULL)
11646			db_printf(" jaddref %p jaddref state 0x%X",
11647			    jaddref, jaddref->ja_state);
11648		db_printf("\n");
11649	}
11650}
11651
11652#endif /* DDB */
11653
11654#endif /* SOFTUPDATES */
11655