ffs_softdep.c revision 242259
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 242259 2012-10-28 18:53:28Z trasz $");
44
45#include "opt_ffs.h"
46#include "opt_quota.h"
47#include "opt_ddb.h"
48
49/*
50 * For now we want the safety net that the DEBUG flag provides.
51 */
52#ifndef DEBUG
53#define DEBUG
54#endif
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/limits.h>
64#include <sys/lock.h>
65#include <sys/malloc.h>
66#include <sys/mount.h>
67#include <sys/mutex.h>
68#include <sys/namei.h>
69#include <sys/priv.h>
70#include <sys/proc.h>
71#include <sys/stat.h>
72#include <sys/sysctl.h>
73#include <sys/syslog.h>
74#include <sys/vnode.h>
75#include <sys/conf.h>
76
77#include <ufs/ufs/dir.h>
78#include <ufs/ufs/extattr.h>
79#include <ufs/ufs/quota.h>
80#include <ufs/ufs/inode.h>
81#include <ufs/ufs/ufsmount.h>
82#include <ufs/ffs/fs.h>
83#include <ufs/ffs/softdep.h>
84#include <ufs/ffs/ffs_extern.h>
85#include <ufs/ufs/ufs_extern.h>
86
87#include <vm/vm.h>
88#include <vm/vm_extern.h>
89#include <vm/vm_object.h>
90
91#include <ddb/ddb.h>
92
93#ifndef SOFTUPDATES
94
95int
96softdep_flushfiles(oldmnt, flags, td)
97	struct mount *oldmnt;
98	int flags;
99	struct thread *td;
100{
101
102	panic("softdep_flushfiles called");
103}
104
105int
106softdep_mount(devvp, mp, fs, cred)
107	struct vnode *devvp;
108	struct mount *mp;
109	struct fs *fs;
110	struct ucred *cred;
111{
112
113	return (0);
114}
115
116void
117softdep_initialize()
118{
119
120	return;
121}
122
123void
124softdep_uninitialize()
125{
126
127	return;
128}
129
130void
131softdep_unmount(mp)
132	struct mount *mp;
133{
134
135}
136
137void
138softdep_setup_sbupdate(ump, fs, bp)
139	struct ufsmount *ump;
140	struct fs *fs;
141	struct buf *bp;
142{
143}
144
145void
146softdep_setup_inomapdep(bp, ip, newinum, mode)
147	struct buf *bp;
148	struct inode *ip;
149	ino_t newinum;
150	int mode;
151{
152
153	panic("softdep_setup_inomapdep called");
154}
155
156void
157softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
158	struct buf *bp;
159	struct mount *mp;
160	ufs2_daddr_t newblkno;
161	int frags;
162	int oldfrags;
163{
164
165	panic("softdep_setup_blkmapdep called");
166}
167
168void
169softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
170	struct inode *ip;
171	ufs_lbn_t lbn;
172	ufs2_daddr_t newblkno;
173	ufs2_daddr_t oldblkno;
174	long newsize;
175	long oldsize;
176	struct buf *bp;
177{
178
179	panic("softdep_setup_allocdirect called");
180}
181
182void
183softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
184	struct inode *ip;
185	ufs_lbn_t lbn;
186	ufs2_daddr_t newblkno;
187	ufs2_daddr_t oldblkno;
188	long newsize;
189	long oldsize;
190	struct buf *bp;
191{
192
193	panic("softdep_setup_allocext called");
194}
195
196void
197softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
198	struct inode *ip;
199	ufs_lbn_t lbn;
200	struct buf *bp;
201	int ptrno;
202	ufs2_daddr_t newblkno;
203	ufs2_daddr_t oldblkno;
204	struct buf *nbp;
205{
206
207	panic("softdep_setup_allocindir_page called");
208}
209
210void
211softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
212	struct buf *nbp;
213	struct inode *ip;
214	struct buf *bp;
215	int ptrno;
216	ufs2_daddr_t newblkno;
217{
218
219	panic("softdep_setup_allocindir_meta called");
220}
221
222void
223softdep_journal_freeblocks(ip, cred, length, flags)
224	struct inode *ip;
225	struct ucred *cred;
226	off_t length;
227	int flags;
228{
229
230	panic("softdep_journal_freeblocks called");
231}
232
233void
234softdep_journal_fsync(ip)
235	struct inode *ip;
236{
237
238	panic("softdep_journal_fsync called");
239}
240
241void
242softdep_setup_freeblocks(ip, length, flags)
243	struct inode *ip;
244	off_t length;
245	int flags;
246{
247
248	panic("softdep_setup_freeblocks called");
249}
250
251void
252softdep_freefile(pvp, ino, mode)
253		struct vnode *pvp;
254		ino_t ino;
255		int mode;
256{
257
258	panic("softdep_freefile called");
259}
260
261int
262softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
263	struct buf *bp;
264	struct inode *dp;
265	off_t diroffset;
266	ino_t newinum;
267	struct buf *newdirbp;
268	int isnewblk;
269{
270
271	panic("softdep_setup_directory_add called");
272}
273
274void
275softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
276	struct buf *bp;
277	struct inode *dp;
278	caddr_t base;
279	caddr_t oldloc;
280	caddr_t newloc;
281	int entrysize;
282{
283
284	panic("softdep_change_directoryentry_offset called");
285}
286
287void
288softdep_setup_remove(bp, dp, ip, isrmdir)
289	struct buf *bp;
290	struct inode *dp;
291	struct inode *ip;
292	int isrmdir;
293{
294
295	panic("softdep_setup_remove called");
296}
297
298void
299softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
300	struct buf *bp;
301	struct inode *dp;
302	struct inode *ip;
303	ino_t newinum;
304	int isrmdir;
305{
306
307	panic("softdep_setup_directory_change called");
308}
309
310void
311softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
312	struct mount *mp;
313	struct buf *bp;
314	ufs2_daddr_t blkno;
315	int frags;
316	struct workhead *wkhd;
317{
318
319	panic("%s called", __FUNCTION__);
320}
321
322void
323softdep_setup_inofree(mp, bp, ino, wkhd)
324	struct mount *mp;
325	struct buf *bp;
326	ino_t ino;
327	struct workhead *wkhd;
328{
329
330	panic("%s called", __FUNCTION__);
331}
332
333void
334softdep_setup_unlink(dp, ip)
335	struct inode *dp;
336	struct inode *ip;
337{
338
339	panic("%s called", __FUNCTION__);
340}
341
342void
343softdep_setup_link(dp, ip)
344	struct inode *dp;
345	struct inode *ip;
346{
347
348	panic("%s called", __FUNCTION__);
349}
350
351void
352softdep_revert_link(dp, ip)
353	struct inode *dp;
354	struct inode *ip;
355{
356
357	panic("%s called", __FUNCTION__);
358}
359
360void
361softdep_setup_rmdir(dp, ip)
362	struct inode *dp;
363	struct inode *ip;
364{
365
366	panic("%s called", __FUNCTION__);
367}
368
369void
370softdep_revert_rmdir(dp, ip)
371	struct inode *dp;
372	struct inode *ip;
373{
374
375	panic("%s called", __FUNCTION__);
376}
377
378void
379softdep_setup_create(dp, ip)
380	struct inode *dp;
381	struct inode *ip;
382{
383
384	panic("%s called", __FUNCTION__);
385}
386
387void
388softdep_revert_create(dp, ip)
389	struct inode *dp;
390	struct inode *ip;
391{
392
393	panic("%s called", __FUNCTION__);
394}
395
396void
397softdep_setup_mkdir(dp, ip)
398	struct inode *dp;
399	struct inode *ip;
400{
401
402	panic("%s called", __FUNCTION__);
403}
404
405void
406softdep_revert_mkdir(dp, ip)
407	struct inode *dp;
408	struct inode *ip;
409{
410
411	panic("%s called", __FUNCTION__);
412}
413
414void
415softdep_setup_dotdot_link(dp, ip)
416	struct inode *dp;
417	struct inode *ip;
418{
419
420	panic("%s called", __FUNCTION__);
421}
422
423int
424softdep_prealloc(vp, waitok)
425	struct vnode *vp;
426	int waitok;
427{
428
429	panic("%s called", __FUNCTION__);
430
431	return (0);
432}
433
434int
435softdep_journal_lookup(mp, vpp)
436	struct mount *mp;
437	struct vnode **vpp;
438{
439
440	return (ENOENT);
441}
442
443void
444softdep_change_linkcnt(ip)
445	struct inode *ip;
446{
447
448	panic("softdep_change_linkcnt called");
449}
450
451void
452softdep_load_inodeblock(ip)
453	struct inode *ip;
454{
455
456	panic("softdep_load_inodeblock called");
457}
458
459void
460softdep_update_inodeblock(ip, bp, waitfor)
461	struct inode *ip;
462	struct buf *bp;
463	int waitfor;
464{
465
466	panic("softdep_update_inodeblock called");
467}
468
469int
470softdep_fsync(vp)
471	struct vnode *vp;	/* the "in_core" copy of the inode */
472{
473
474	return (0);
475}
476
477void
478softdep_fsync_mountdev(vp)
479	struct vnode *vp;
480{
481
482	return;
483}
484
485int
486softdep_flushworklist(oldmnt, countp, td)
487	struct mount *oldmnt;
488	int *countp;
489	struct thread *td;
490{
491
492	*countp = 0;
493	return (0);
494}
495
496int
497softdep_sync_metadata(struct vnode *vp)
498{
499
500	return (0);
501}
502
503int
504softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
505{
506
507	return (0);
508}
509
510int
511softdep_slowdown(vp)
512	struct vnode *vp;
513{
514
515	panic("softdep_slowdown called");
516}
517
518void
519softdep_releasefile(ip)
520	struct inode *ip;	/* inode with the zero effective link count */
521{
522
523	panic("softdep_releasefile called");
524}
525
526int
527softdep_request_cleanup(fs, vp, cred, resource)
528	struct fs *fs;
529	struct vnode *vp;
530	struct ucred *cred;
531	int resource;
532{
533
534	return (0);
535}
536
537int
538softdep_check_suspend(struct mount *mp,
539		      struct vnode *devvp,
540		      int softdep_deps,
541		      int softdep_accdeps,
542		      int secondary_writes,
543		      int secondary_accwrites)
544{
545	struct bufobj *bo;
546	int error;
547
548	(void) softdep_deps;
549	(void) softdep_accdeps;
550
551	bo = &devvp->v_bufobj;
552	ASSERT_BO_LOCKED(bo);
553
554	MNT_ILOCK(mp);
555	while (mp->mnt_secondary_writes != 0) {
556		BO_UNLOCK(bo);
557		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
558		    (PUSER - 1) | PDROP, "secwr", 0);
559		BO_LOCK(bo);
560		MNT_ILOCK(mp);
561	}
562
563	/*
564	 * Reasons for needing more work before suspend:
565	 * - Dirty buffers on devvp.
566	 * - Secondary writes occurred after start of vnode sync loop
567	 */
568	error = 0;
569	if (bo->bo_numoutput > 0 ||
570	    bo->bo_dirty.bv_cnt > 0 ||
571	    secondary_writes != 0 ||
572	    mp->mnt_secondary_writes != 0 ||
573	    secondary_accwrites != mp->mnt_secondary_accwrites)
574		error = EAGAIN;
575	BO_UNLOCK(bo);
576	return (error);
577}
578
579void
580softdep_get_depcounts(struct mount *mp,
581		      int *softdepactivep,
582		      int *softdepactiveaccp)
583{
584	(void) mp;
585	*softdepactivep = 0;
586	*softdepactiveaccp = 0;
587}
588
589void
590softdep_buf_append(bp, wkhd)
591	struct buf *bp;
592	struct workhead *wkhd;
593{
594
595	panic("softdep_buf_append called");
596}
597
598void
599softdep_inode_append(ip, cred, wkhd)
600	struct inode *ip;
601	struct ucred *cred;
602	struct workhead *wkhd;
603{
604
605	panic("softdep_inode_append called");
606}
607
608void
609softdep_freework(wkhd)
610	struct workhead *wkhd;
611{
612
613	panic("softdep_freework called");
614}
615
616#else
617
618FEATURE(softupdates, "FFS soft-updates support");
619
620/*
621 * These definitions need to be adapted to the system to which
622 * this file is being ported.
623 */
624
625#define M_SOFTDEP_FLAGS	(M_WAITOK)
626
627#define	D_PAGEDEP	0
628#define	D_INODEDEP	1
629#define	D_BMSAFEMAP	2
630#define	D_NEWBLK	3
631#define	D_ALLOCDIRECT	4
632#define	D_INDIRDEP	5
633#define	D_ALLOCINDIR	6
634#define	D_FREEFRAG	7
635#define	D_FREEBLKS	8
636#define	D_FREEFILE	9
637#define	D_DIRADD	10
638#define	D_MKDIR		11
639#define	D_DIRREM	12
640#define	D_NEWDIRBLK	13
641#define	D_FREEWORK	14
642#define	D_FREEDEP	15
643#define	D_JADDREF	16
644#define	D_JREMREF	17
645#define	D_JMVREF	18
646#define	D_JNEWBLK	19
647#define	D_JFREEBLK	20
648#define	D_JFREEFRAG	21
649#define	D_JSEG		22
650#define	D_JSEGDEP	23
651#define	D_SBDEP		24
652#define	D_JTRUNC	25
653#define	D_JFSYNC	26
654#define	D_SENTINAL	27
655#define	D_LAST		D_SENTINAL
656
657unsigned long dep_current[D_LAST + 1];
658unsigned long dep_total[D_LAST + 1];
659unsigned long dep_write[D_LAST + 1];
660
661
662static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
663    "soft updates stats");
664static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
665    "total dependencies allocated");
666static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
667    "current dependencies allocated");
668static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
669    "current dependencies written");
670
671#define	SOFTDEP_TYPE(type, str, long)					\
672    static MALLOC_DEFINE(M_ ## type, #str, long);			\
673    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
674	&dep_total[D_ ## type], 0, "");					\
675    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
676	&dep_current[D_ ## type], 0, "");				\
677    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
678	&dep_write[D_ ## type], 0, "");
679
680SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
681SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
682SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
683    "Block or frag allocated from cyl group map");
684SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
685SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
686SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
687SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
688SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
689SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
690SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
691SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
692SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
693SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
694SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
695SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
696SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
697SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
698SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
699SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
700SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
701SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
702SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
703SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
704SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
705SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
706SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
707SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
708
709static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
710static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
711
712/*
713 * translate from workitem type to memory type
714 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
715 */
716static struct malloc_type *memtype[] = {
717	M_PAGEDEP,
718	M_INODEDEP,
719	M_BMSAFEMAP,
720	M_NEWBLK,
721	M_ALLOCDIRECT,
722	M_INDIRDEP,
723	M_ALLOCINDIR,
724	M_FREEFRAG,
725	M_FREEBLKS,
726	M_FREEFILE,
727	M_DIRADD,
728	M_MKDIR,
729	M_DIRREM,
730	M_NEWDIRBLK,
731	M_FREEWORK,
732	M_FREEDEP,
733	M_JADDREF,
734	M_JREMREF,
735	M_JMVREF,
736	M_JNEWBLK,
737	M_JFREEBLK,
738	M_JFREEFRAG,
739	M_JSEG,
740	M_JSEGDEP,
741	M_SBDEP,
742	M_JTRUNC,
743	M_JFSYNC
744};
745
746static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
747
748#define DtoM(type) (memtype[type])
749
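/*
 * For illustration only: since memtype[] must stay index-aligned with the
 * D_* defines so that DtoM(D_XXX) yields M_XXX, a hypothetical compile-time
 * sanity check of the table size could use the standard CTASSERT() macro:
 *
 *	CTASSERT(sizeof(memtype) / sizeof(memtype[0]) == D_LAST);
 *
 * (D_LAST is D_SENTINAL, one past the last type that owns a malloc type,
 * so the table holds exactly D_LAST entries.)
 */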
750/*
751 * Names of malloc types.
752 */
753#define TYPENAME(type)  \
754	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
755/*
756 * End system adaptation definitions.
757 */
758
759#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
760#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
761
762/*
763 * Forward declarations.
764 */
765struct inodedep_hashhead;
766struct newblk_hashhead;
767struct pagedep_hashhead;
768struct bmsafemap_hashhead;
769
770/*
771 * Internal function prototypes.
772 */
773static	void softdep_error(char *, int);
774static	void drain_output(struct vnode *);
775static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
776static	void clear_remove(void);
777static	void clear_inodedeps(void);
778static	void unlinked_inodedep(struct mount *, struct inodedep *);
779static	void clear_unlinked_inodedep(struct inodedep *);
780static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
781static	int flush_pagedep_deps(struct vnode *, struct mount *,
782	    struct diraddhd *);
783static	int free_pagedep(struct pagedep *);
784static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
785static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
786static	int flush_deplist(struct allocdirectlst *, int, int *);
787static	int sync_cgs(struct mount *, int);
788static	int handle_written_filepage(struct pagedep *, struct buf *);
789static	int handle_written_sbdep(struct sbdep *, struct buf *);
790static	void initiate_write_sbdep(struct sbdep *);
791static  void diradd_inode_written(struct diradd *, struct inodedep *);
792static	int handle_written_indirdep(struct indirdep *, struct buf *,
793	    struct buf**);
794static	int handle_written_inodeblock(struct inodedep *, struct buf *);
795static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
796	    uint8_t *);
797static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
798static	void handle_written_jaddref(struct jaddref *);
799static	void handle_written_jremref(struct jremref *);
800static	void handle_written_jseg(struct jseg *, struct buf *);
801static	void handle_written_jnewblk(struct jnewblk *);
802static	void handle_written_jblkdep(struct jblkdep *);
803static	void handle_written_jfreefrag(struct jfreefrag *);
804static	void complete_jseg(struct jseg *);
805static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
806static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
807static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
808static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
809static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
810static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
811static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
812static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
813static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
814static	inline void inoref_write(struct inoref *, struct jseg *,
815	    struct jrefrec *);
816static	void handle_allocdirect_partdone(struct allocdirect *,
817	    struct workhead *);
818static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
819	    struct workhead *);
820static	void indirdep_complete(struct indirdep *);
821static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
822static	void indirblk_insert(struct freework *);
823static	void indirblk_remove(struct freework *);
824static	void handle_allocindir_partdone(struct allocindir *);
825static	void initiate_write_filepage(struct pagedep *, struct buf *);
826static	void initiate_write_indirdep(struct indirdep*, struct buf *);
827static	void handle_written_mkdir(struct mkdir *, int);
828static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
829	    uint8_t *);
830static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
831static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
832static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
833static	void handle_workitem_freefile(struct freefile *);
834static	int handle_workitem_remove(struct dirrem *, int);
835static	struct dirrem *newdirrem(struct buf *, struct inode *,
836	    struct inode *, int, struct dirrem **);
837static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
838	    struct buf *);
839static	void cancel_indirdep(struct indirdep *, struct buf *,
840	    struct freeblks *);
841static	void free_indirdep(struct indirdep *);
842static	void free_diradd(struct diradd *, struct workhead *);
843static	void merge_diradd(struct inodedep *, struct diradd *);
844static	void complete_diradd(struct diradd *);
845static	struct diradd *diradd_lookup(struct pagedep *, int);
846static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
847	    struct jremref *);
848static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
849	    struct jremref *);
850static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
851	    struct jremref *, struct jremref *);
852static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
853	    struct jremref *);
854static	void cancel_allocindir(struct allocindir *, struct buf *bp,
855	    struct freeblks *, int);
856static	int setup_trunc_indir(struct freeblks *, struct inode *,
857	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
858static	void complete_trunc_indir(struct freework *);
859static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
860	    int);
861static	void complete_mkdir(struct mkdir *);
862static	void free_newdirblk(struct newdirblk *);
863static	void free_jremref(struct jremref *);
864static	void free_jaddref(struct jaddref *);
865static	void free_jsegdep(struct jsegdep *);
866static	void free_jsegs(struct jblocks *);
867static	void rele_jseg(struct jseg *);
868static	void free_jseg(struct jseg *, struct jblocks *);
869static	void free_jnewblk(struct jnewblk *);
870static	void free_jblkdep(struct jblkdep *);
871static	void free_jfreefrag(struct jfreefrag *);
872static	void free_freedep(struct freedep *);
873static	void journal_jremref(struct dirrem *, struct jremref *,
874	    struct inodedep *);
875static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
876static	int cancel_jaddref(struct jaddref *, struct inodedep *,
877	    struct workhead *);
878static	void cancel_jfreefrag(struct jfreefrag *);
879static	inline void setup_freedirect(struct freeblks *, struct inode *,
880	    int, int);
881static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
882static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
883	    ufs_lbn_t, int);
884static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
885static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
886static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
887ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
888static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
889static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
890	    int, int);
891static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
892static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
893static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
894static	void newblk_freefrag(struct newblk*);
895static	void free_newblk(struct newblk *);
896static	void cancel_allocdirect(struct allocdirectlst *,
897	    struct allocdirect *, struct freeblks *);
898static	int check_inode_unwritten(struct inodedep *);
899static	int free_inodedep(struct inodedep *);
900static	void freework_freeblock(struct freework *);
901static	void freework_enqueue(struct freework *);
902static	int handle_workitem_freeblocks(struct freeblks *, int);
903static	int handle_complete_freeblocks(struct freeblks *, int);
904static	void handle_workitem_indirblk(struct freework *);
905static	void handle_written_freework(struct freework *);
906static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
907static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
908	    struct workhead *);
909static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
910	    struct inodedep *, struct allocindir *, ufs_lbn_t);
911static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
912	    ufs2_daddr_t, ufs_lbn_t);
913static	void handle_workitem_freefrag(struct freefrag *);
914static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
915	    ufs_lbn_t);
916static	void allocdirect_merge(struct allocdirectlst *,
917	    struct allocdirect *, struct allocdirect *);
918static	struct freefrag *allocindir_merge(struct allocindir *,
919	    struct allocindir *);
920static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
921	    struct bmsafemap **);
922static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
923	    int cg, struct bmsafemap *);
924static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
925	    int, struct newblk **);
926static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
927static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
928	    struct inodedep **);
929static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
930static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
931	    int, struct pagedep **);
932static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
933	    struct mount *mp, int, struct pagedep **);
934static	void pause_timer(void *);
935static	int request_cleanup(struct mount *, int);
936static	int process_worklist_item(struct mount *, int, int);
937static	void process_removes(struct vnode *);
938static	void process_truncates(struct vnode *);
939static	void jwork_move(struct workhead *, struct workhead *);
940static	void jwork_insert(struct workhead *, struct jsegdep *);
941static	void add_to_worklist(struct worklist *, int);
942static	void wake_worklist(struct worklist *);
943static	void wait_worklist(struct worklist *, char *);
944static	void remove_from_worklist(struct worklist *);
945static	void softdep_flush(void);
946static	void softdep_flushjournal(struct mount *);
947static	int softdep_speedup(void);
948static	void worklist_speedup(void);
949static	int journal_mount(struct mount *, struct fs *, struct ucred *);
950static	void journal_unmount(struct mount *);
951static	int journal_space(struct ufsmount *, int);
952static	void journal_suspend(struct ufsmount *);
953static	int journal_unsuspend(struct ufsmount *ump);
954static	void softdep_prelink(struct vnode *, struct vnode *);
955static	void add_to_journal(struct worklist *);
956static	void remove_from_journal(struct worklist *);
957static	void softdep_process_journal(struct mount *, struct worklist *, int);
958static	struct jremref *newjremref(struct dirrem *, struct inode *,
959	    struct inode *ip, off_t, nlink_t);
960static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
961	    uint16_t);
962static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
963	    uint16_t);
964static	inline struct jsegdep *inoref_jseg(struct inoref *);
965static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
966static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
967	    ufs2_daddr_t, int);
968static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
969static	void move_newblock_dep(struct jaddref *, struct inodedep *);
970static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
971static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
972	    ufs2_daddr_t, long, ufs_lbn_t);
973static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
974	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
975static	int jwait(struct worklist *, int);
976static	struct inodedep *inodedep_lookup_ip(struct inode *);
977static	int bmsafemap_rollbacks(struct bmsafemap *);
978static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
979static	void handle_jwork(struct workhead *);
980static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
981	    struct mkdir **);
982static	struct jblocks *jblocks_create(void);
983static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
984static	void jblocks_free(struct jblocks *, struct mount *, int);
985static	void jblocks_destroy(struct jblocks *);
986static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
987
988/*
989 * Exported softdep operations.
990 */
991static	void softdep_disk_io_initiation(struct buf *);
992static	void softdep_disk_write_complete(struct buf *);
993static	void softdep_deallocate_dependencies(struct buf *);
994static	int softdep_count_dependencies(struct buf *bp, int);
995
996static struct mtx lk;
997MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
998
999#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
1000#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
1001#define FREE_LOCK(lk)			mtx_unlock(lk)
1002
1003#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
1004#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)
1005
1006/*
1007 * Worklist queue management.
1008 * These routines require that the lock be held.
1009 */
1010#ifndef /* NOT */ DEBUG
1011#define WORKLIST_INSERT(head, item) do {	\
1012	(item)->wk_state |= ONWORKLIST;		\
1013	LIST_INSERT_HEAD(head, item, wk_list);	\
1014} while (0)
1015#define WORKLIST_REMOVE(item) do {		\
1016	(item)->wk_state &= ~ONWORKLIST;	\
1017	LIST_REMOVE(item, wk_list);		\
1018} while (0)
1019#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1020#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1021
1022#else /* DEBUG */
1023static	void worklist_insert(struct workhead *, struct worklist *, int);
1024static	void worklist_remove(struct worklist *, int);
1025
1026#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1027#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1028#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1029#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1030
1031static void
1032worklist_insert(head, item, locked)
1033	struct workhead *head;
1034	struct worklist *item;
1035	int locked;
1036{
1037
1038	if (locked)
1039		mtx_assert(&lk, MA_OWNED);
1040	if (item->wk_state & ONWORKLIST)
1041		panic("worklist_insert: %p %s(0x%X) already on list",
1042		    item, TYPENAME(item->wk_type), item->wk_state);
1043	item->wk_state |= ONWORKLIST;
1044	LIST_INSERT_HEAD(head, item, wk_list);
1045}
1046
1047static void
1048worklist_remove(item, locked)
1049	struct worklist *item;
1050	int locked;
1051{
1052
1053	if (locked)
1054		mtx_assert(&lk, MA_OWNED);
1055	if ((item->wk_state & ONWORKLIST) == 0)
1056		panic("worklist_remove: %p %s(0x%X) not on list",
1057		    item, TYPENAME(item->wk_type), item->wk_state);
1058	item->wk_state &= ~ONWORKLIST;
1059	LIST_REMOVE(item, wk_list);
1060}
1061#endif /* DEBUG */
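/*
 * For illustration, a typical caller holds the softdep lock while linking
 * a dependency onto a buffer's dependency list and unlinks it again before
 * freeing it.  The freefrag names below are just an example; the same
 * pattern applies to any structure that embeds a struct worklist as its
 * first member:
 *
 *	ACQUIRE_LOCK(&lk);
 *	WORKLIST_INSERT(&bp->b_dep, &freefrag->ff_list);
 *	...
 *	WORKLIST_REMOVE(&freefrag->ff_list);
 *	FREE_LOCK(&lk);
 */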
1062
1063/*
1064 * Merge two jsegdeps, keeping only the oldest one, as newer references
1065 * cannot be discarded until after the older ones.
1066 */
1067static inline struct jsegdep *
1068jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1069{
1070	struct jsegdep *swp;
1071
1072	if (two == NULL)
1073		return (one);
1074
1075	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1076		swp = one;
1077		one = two;
1078		two = swp;
1079	}
1080	WORKLIST_REMOVE(&two->jd_list);
1081	free_jsegdep(two);
1082
1083	return (one);
1084}
1085
1086/*
1087 * If two freedeps are compatible free one to reduce list size.
1088 */
1089static inline struct freedep *
1090freedep_merge(struct freedep *one, struct freedep *two)
1091{
1092	if (two == NULL)
1093		return (one);
1094
1095	if (one->fd_freework == two->fd_freework) {
1096		WORKLIST_REMOVE(&two->fd_list);
1097		free_freedep(two);
1098	}
1099	return (one);
1100}
1101
1102/*
1103 * Move journal work from one list to another.  Duplicate freedeps and
1104 * jsegdeps are coalesced to keep the lists as small as possible.
1105 */
1106static void
1107jwork_move(dst, src)
1108	struct workhead *dst;
1109	struct workhead *src;
1110{
1111	struct freedep *freedep;
1112	struct jsegdep *jsegdep;
1113	struct worklist *wkn;
1114	struct worklist *wk;
1115
1116	KASSERT(dst != src,
1117	    ("jwork_move: dst == src"));
1118	freedep = NULL;
1119	jsegdep = NULL;
1120	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1121		if (wk->wk_type == D_JSEGDEP)
1122			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1123		if (wk->wk_type == D_FREEDEP)
1124			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1125	}
1126
1127	mtx_assert(&lk, MA_OWNED);
1128	while ((wk = LIST_FIRST(src)) != NULL) {
1129		WORKLIST_REMOVE(wk);
1130		WORKLIST_INSERT(dst, wk);
1131		if (wk->wk_type == D_JSEGDEP) {
1132			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1133			continue;
1134		}
1135		if (wk->wk_type == D_FREEDEP)
1136			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1137	}
1138}
1139
1140static void
1141jwork_insert(dst, jsegdep)
1142	struct workhead *dst;
1143	struct jsegdep *jsegdep;
1144{
1145	struct jsegdep *jsegdepn;
1146	struct worklist *wk;
1147
1148	LIST_FOREACH(wk, dst, wk_list)
1149		if (wk->wk_type == D_JSEGDEP)
1150			break;
1151	if (wk == NULL) {
1152		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1153		return;
1154	}
1155	jsegdepn = WK_JSEGDEP(wk);
1156	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1157		WORKLIST_REMOVE(wk);
1158		free_jsegdep(jsegdepn);
1159		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1160	} else
1161		free_jsegdep(jsegdep);
1162}
1163
1164/*
1165 * Routines for tracking and managing workitems.
1166 */
1167static	void workitem_free(struct worklist *, int);
1168static	void workitem_alloc(struct worklist *, int, struct mount *);
1169
1170#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
1171
1172static void
1173workitem_free(item, type)
1174	struct worklist *item;
1175	int type;
1176{
1177	struct ufsmount *ump;
1178	mtx_assert(&lk, MA_OWNED);
1179
1180#ifdef DEBUG
1181	if (item->wk_state & ONWORKLIST)
1182		panic("workitem_free: %s(0x%X) still on list",
1183		    TYPENAME(item->wk_type), item->wk_state);
1184	if (item->wk_type != type)
1185		panic("workitem_free: type mismatch %s != %s",
1186		    TYPENAME(item->wk_type), TYPENAME(type));
1187#endif
1188	if (item->wk_state & IOWAITING)
1189		wakeup(item);
1190	ump = VFSTOUFS(item->wk_mp);
1191	if (--ump->softdep_deps == 0 && ump->softdep_req)
1192		wakeup(&ump->softdep_deps);
1193	dep_current[type]--;
1194	free(item, DtoM(type));
1195}
1196
1197static void
1198workitem_alloc(item, type, mp)
1199	struct worklist *item;
1200	int type;
1201	struct mount *mp;
1202{
1203	struct ufsmount *ump;
1204
1205	item->wk_type = type;
1206	item->wk_mp = mp;
1207	item->wk_state = 0;
1208
1209	ump = VFSTOUFS(mp);
1210	ACQUIRE_LOCK(&lk);
1211	dep_current[type]++;
1212	dep_total[type]++;
1213	ump->softdep_deps++;
1214	ump->softdep_accdeps++;
1215	FREE_LOCK(&lk);
1216}
1217
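/*
 * For illustration, dependency structures throughout this file are created
 * and retired with the pattern sketched below (the freefrag names are just
 * an example; the same applies to every D_* type):
 *
 *	freefrag = malloc(sizeof(struct freefrag), M_FREEFRAG,
 *	    M_SOFTDEP_FLAGS);
 *	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
 *	...
 *	ACQUIRE_LOCK(&lk);
 *	WORKITEM_FREE(freefrag, D_FREEFRAG);
 *	FREE_LOCK(&lk);
 *
 * workitem_alloc() charges the dep_current/dep_total counters and the
 * per-mount softdep_deps count; workitem_free() drops dep_current and
 * softdep_deps again (it must be called with the softdep lock held) and
 * frees the memory.
 */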
1218/*
1219 * Workitem queue management
1220 */
1221static int max_softdeps;	/* maximum number of structs before slowdown */
1222static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1223static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1224static int proc_waiting;	/* tracks whether we have a timeout posted */
1225static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1226static struct callout softdep_callout;
1227static int req_pending;
1228static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1229static int req_clear_remove;	/* syncer process flush some freeblks */
1230
1231/*
1232 * runtime statistics
1233 */
1234static int stat_worklist_push;	/* number of worklist cleanups */
1235static int stat_blk_limit_push;	/* number of times block limit neared */
1236static int stat_ino_limit_push;	/* number of times inode limit neared */
1237static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1238static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1239static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1240static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1241static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1242static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1243static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1244static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1245static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1246static int stat_journal_min;	/* Times hit journal min threshold */
1247static int stat_journal_low;	/* Times hit journal low threshold */
1248static int stat_journal_wait;	/* Times blocked in jwait(). */
1249static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1250static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1251static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1252static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1253static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1254static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1255static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1256static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1257static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1258
1259SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1260    &max_softdeps, 0, "");
1261SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1262    &tickdelay, 0, "");
1263SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1264    &maxindirdeps, 0, "");
1265SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1266    &stat_worklist_push, 0,"");
1267SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1268    &stat_blk_limit_push, 0,"");
1269SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1270    &stat_ino_limit_push, 0,"");
1271SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1272    &stat_blk_limit_hit, 0, "");
1273SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1274    &stat_ino_limit_hit, 0, "");
1275SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1276    &stat_sync_limit_hit, 0, "");
1277SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1278    &stat_indir_blk_ptrs, 0, "");
1279SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1280    &stat_inode_bitmap, 0, "");
1281SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1282    &stat_direct_blk_ptrs, 0, "");
1283SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1284    &stat_dir_entry, 0, "");
1285SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1286    &stat_jaddref, 0, "");
1287SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1288    &stat_jnewblk, 0, "");
1289SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1290    &stat_journal_low, 0, "");
1291SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1292    &stat_journal_min, 0, "");
1293SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1294    &stat_journal_wait, 0, "");
1295SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1296    &stat_jwait_filepage, 0, "");
1297SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1298    &stat_jwait_freeblks, 0, "");
1299SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1300    &stat_jwait_inode, 0, "");
1301SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1302    &stat_jwait_newblk, 0, "");
1303SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1304    &stat_cleanup_blkrequests, 0, "");
1305SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1306    &stat_cleanup_inorequests, 0, "");
1307SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1308    &stat_cleanup_high_delay, 0, "");
1309SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1310    &stat_cleanup_retries, 0, "");
1311SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1312    &stat_cleanup_failures, 0, "");
1313
1314SYSCTL_DECL(_vfs_ffs);
1315
1316LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1317static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1318
1319static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1320SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1321	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1322
1323static struct proc *softdepproc;
1324static struct kproc_desc softdep_kp = {
1325	"softdepflush",
1326	softdep_flush,
1327	&softdepproc
1328};
1329SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1330    &softdep_kp);
1331
1332static void
1333softdep_flush(void)
1334{
1335	struct mount *nmp;
1336	struct mount *mp;
1337	struct ufsmount *ump;
1338	struct thread *td;
1339	int remaining;
1340	int progress;
1341
1342	td = curthread;
1343	td->td_pflags |= TDP_NORUNNINGBUF;
1344
1345	for (;;) {
1346		kproc_suspend_check(softdepproc);
1347		ACQUIRE_LOCK(&lk);
1348		/*
1349		 * If requested, try removing inode or removal dependencies.
1350		 */
1351		if (req_clear_inodedeps) {
1352			clear_inodedeps();
1353			req_clear_inodedeps -= 1;
1354			wakeup_one(&proc_waiting);
1355		}
1356		if (req_clear_remove) {
1357			clear_remove();
1358			req_clear_remove -= 1;
1359			wakeup_one(&proc_waiting);
1360		}
1361		FREE_LOCK(&lk);
1362		remaining = progress = 0;
1363		mtx_lock(&mountlist_mtx);
1364		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1365			nmp = TAILQ_NEXT(mp, mnt_list);
1366			if (MOUNTEDSOFTDEP(mp) == 0)
1367				continue;
1368			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1369				continue;
1370			progress += softdep_process_worklist(mp, 0);
1371			ump = VFSTOUFS(mp);
1372			remaining += ump->softdep_on_worklist;
1373			mtx_lock(&mountlist_mtx);
1374			nmp = TAILQ_NEXT(mp, mnt_list);
1375			vfs_unbusy(mp);
1376		}
1377		mtx_unlock(&mountlist_mtx);
1378		if (remaining && progress)
1379			continue;
1380		ACQUIRE_LOCK(&lk);
1381		if (!req_pending)
1382			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1383		req_pending = 0;
1384		FREE_LOCK(&lk);
1385	}
1386}
1387
1388static void
1389worklist_speedup(void)
1390{
1391	mtx_assert(&lk, MA_OWNED);
1392	if (req_pending == 0) {
1393		req_pending = 1;
1394		wakeup(&req_pending);
1395	}
1396}
1397
1398static int
1399softdep_speedup(void)
1400{
1401
1402	worklist_speedup();
1403	bd_speedup();
1404	return speedup_syncer();
1405}
1406
1407/*
1408 * Add an item to the end of the work queue.
1409 * This routine requires that the lock be held.
1410 * This is the only routine that adds items to the list.
1411 * The following routine is the only one that removes items
1412 * and does so in order from first to last.
1413 */
1414
1415#define	WK_HEAD		0x0001	/* Add to HEAD. */
1416#define	WK_NODELAY	0x0002	/* Process immediately. */
1417
1418static void
1419add_to_worklist(wk, flags)
1420	struct worklist *wk;
1421	int flags;
1422{
1423	struct ufsmount *ump;
1424
1425	mtx_assert(&lk, MA_OWNED);
1426	ump = VFSTOUFS(wk->wk_mp);
1427	if (wk->wk_state & ONWORKLIST)
1428		panic("add_to_worklist: %s(0x%X) already on list",
1429		    TYPENAME(wk->wk_type), wk->wk_state);
1430	wk->wk_state |= ONWORKLIST;
1431	if (ump->softdep_on_worklist == 0) {
1432		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1433		ump->softdep_worklist_tail = wk;
1434	} else if (flags & WK_HEAD) {
1435		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1436	} else {
1437		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1438		ump->softdep_worklist_tail = wk;
1439	}
1440	ump->softdep_on_worklist += 1;
1441	if (flags & WK_NODELAY)
1442		worklist_speedup();
1443}
1444
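/*
 * For illustration, a dependency whose processing should be kicked off
 * promptly might be queued as (names here are just an example):
 *
 *	add_to_worklist(&dirrem->dm_list, WK_NODELAY);
 *
 * whereas an item that must be retried is put back with WK_HEAD, ahead of
 * the sentinel used by process_worklist_item(), so the current scan does
 * not immediately pick it up again.
 */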
1445/*
1446 * Remove the item to be processed. If we are removing the last
1447 * item on the list, we need to recalculate the tail pointer.
1448 */
1449static void
1450remove_from_worklist(wk)
1451	struct worklist *wk;
1452{
1453	struct ufsmount *ump;
1454
1455	ump = VFSTOUFS(wk->wk_mp);
1456	WORKLIST_REMOVE(wk);
1457	if (ump->softdep_worklist_tail == wk)
1458		ump->softdep_worklist_tail =
1459		    (struct worklist *)wk->wk_list.le_prev;
1460	ump->softdep_on_worklist -= 1;
1461}
1462
1463static void
1464wake_worklist(wk)
1465	struct worklist *wk;
1466{
1467	if (wk->wk_state & IOWAITING) {
1468		wk->wk_state &= ~IOWAITING;
1469		wakeup(wk);
1470	}
1471}
1472
1473static void
1474wait_worklist(wk, wmesg)
1475	struct worklist *wk;
1476	char *wmesg;
1477{
1478
1479	wk->wk_state |= IOWAITING;
1480	msleep(wk, &lk, PVM, wmesg, 0);
1481}
1482
1483/*
1484 * Process that runs once per second to handle items in the background queue.
1485 *
1486 * Note that we ensure that everything is done in the order in which they
1487 * appear in the queue. The code below depends on this property to ensure
1488 * that blocks of a file are freed before the inode itself is freed. This
1489 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1490 * until all the old ones have been purged from the dependency lists.
1491 */
1492int
1493softdep_process_worklist(mp, full)
1494	struct mount *mp;
1495	int full;
1496{
1497	int cnt, matchcnt;
1498	struct ufsmount *ump;
1499	long starttime;
1500
1501	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1502	/*
1503	 * Record the process identifier of our caller so that we can give
1504	 * this process preferential treatment in request_cleanup below.
1505	 */
1506	matchcnt = 0;
1507	ump = VFSTOUFS(mp);
1508	ACQUIRE_LOCK(&lk);
1509	starttime = time_second;
1510	softdep_process_journal(mp, NULL, full?MNT_WAIT:0);
1511	while (ump->softdep_on_worklist > 0) {
1512		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1513			break;
1514		else
1515			matchcnt += cnt;
1516		/*
1517		 * If requested, try removing inode or removal dependencies.
1518		 */
1519		if (req_clear_inodedeps) {
1520			clear_inodedeps();
1521			req_clear_inodedeps -= 1;
1522			wakeup_one(&proc_waiting);
1523		}
1524		if (req_clear_remove) {
1525			clear_remove();
1526			req_clear_remove -= 1;
1527			wakeup_one(&proc_waiting);
1528		}
1529		/*
1530		 * We do not generally want to stop for buffer space, but if
1531		 * we are really being a buffer hog, we will stop and wait.
1532		 */
1533		if (should_yield()) {
1534			FREE_LOCK(&lk);
1535			kern_yield(PRI_UNCHANGED);
1536			bwillwrite();
1537			ACQUIRE_LOCK(&lk);
1538		}
1539		/*
1540		 * Never allow processing to run for more than one
1541		 * second. Otherwise the other mountpoints may get
1542		 * excessively backlogged.
1543		 */
1544		if (!full && starttime != time_second)
1545			break;
1546	}
1547	if (full == 0)
1548		journal_unsuspend(ump);
1549	FREE_LOCK(&lk);
1550	return (matchcnt);
1551}
1552
1553/*
1554 * Process all removes associated with a vnode if we are running out of
1555 * journal space.  Any other process which attempts to flush these will
1556 * be unable to do so, as we have the vnodes locked.
1557 */
1558static void
1559process_removes(vp)
1560	struct vnode *vp;
1561{
1562	struct inodedep *inodedep;
1563	struct dirrem *dirrem;
1564	struct mount *mp;
1565	ino_t inum;
1566
1567	mtx_assert(&lk, MA_OWNED);
1568
1569	mp = vp->v_mount;
1570	inum = VTOI(vp)->i_number;
1571	for (;;) {
1572top:
1573		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1574			return;
1575		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1576			/*
1577			 * If another thread is trying to lock this vnode
1578			 * it will fail but we must wait for it to do so
1579			 * before we can proceed.
1580			 */
1581			if (dirrem->dm_state & INPROGRESS) {
1582				wait_worklist(&dirrem->dm_list, "pwrwait");
1583				goto top;
1584			}
1585			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1586			    (COMPLETE | ONWORKLIST))
1587				break;
1588		}
1589		if (dirrem == NULL)
1590			return;
1591		remove_from_worklist(&dirrem->dm_list);
1592		FREE_LOCK(&lk);
1593		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1594			panic("process_removes: suspended filesystem");
1595		handle_workitem_remove(dirrem, 0);
1596		vn_finished_secondary_write(mp);
1597		ACQUIRE_LOCK(&lk);
1598	}
1599}
1600
1601/*
1602 * Process all truncations associated with a vnode if we are running out
1603 * of journal space.  This is called when the vnode lock is already held
1604 * and no other process can clear the truncation.  The function processes
1605 * as many of the vnode's pending truncations as it can before returning.
1606 */
1607static void
1608process_truncates(vp)
1609	struct vnode *vp;
1610{
1611	struct inodedep *inodedep;
1612	struct freeblks *freeblks;
1613	struct mount *mp;
1614	ino_t inum;
1615	int cgwait;
1616
1617	mtx_assert(&lk, MA_OWNED);
1618
1619	mp = vp->v_mount;
1620	inum = VTOI(vp)->i_number;
1621	for (;;) {
1622		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1623			return;
1624		cgwait = 0;
1625		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1626			/* Journal entries not yet written.  */
1627			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1628				jwait(&LIST_FIRST(
1629				    &freeblks->fb_jblkdephd)->jb_list,
1630				    MNT_WAIT);
1631				break;
1632			}
1633			/* Another thread is executing this item. */
1634			if (freeblks->fb_state & INPROGRESS) {
1635				wait_worklist(&freeblks->fb_list, "ptrwait");
1636				break;
1637			}
1638			/* Freeblks is waiting on an inode write. */
1639			if ((freeblks->fb_state & COMPLETE) == 0) {
1640				FREE_LOCK(&lk);
1641				ffs_update(vp, 1);
1642				ACQUIRE_LOCK(&lk);
1643				break;
1644			}
1645			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1646			    (ALLCOMPLETE | ONWORKLIST)) {
1647				remove_from_worklist(&freeblks->fb_list);
1648				freeblks->fb_state |= INPROGRESS;
1649				FREE_LOCK(&lk);
1650				if (vn_start_secondary_write(NULL, &mp,
1651				    V_NOWAIT))
1652					panic("process_truncates: "
1653					    "suspended filesystem");
1654				handle_workitem_freeblocks(freeblks, 0);
1655				vn_finished_secondary_write(mp);
1656				ACQUIRE_LOCK(&lk);
1657				break;
1658			}
1659			if (freeblks->fb_cgwait)
1660				cgwait++;
1661		}
1662		if (cgwait) {
1663			FREE_LOCK(&lk);
1664			sync_cgs(mp, MNT_WAIT);
1665			ffs_sync_snap(mp, MNT_WAIT);
1666			ACQUIRE_LOCK(&lk);
1667			continue;
1668		}
1669		if (freeblks == NULL)
1670			break;
1671	}
1672	return;
1673}
1674
1675/*
1676 * Process one item on the worklist.
1677 */
1678static int
1679process_worklist_item(mp, target, flags)
1680	struct mount *mp;
1681	int target;
1682	int flags;
1683{
1684	struct worklist sentinel;
1685	struct worklist *wk;
1686	struct ufsmount *ump;
1687	int matchcnt;
1688	int error;
1689
1690	mtx_assert(&lk, MA_OWNED);
1691	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1692	/*
1693	 * If we are being called because of a process doing a
1694	 * copy-on-write, then it is not safe to write as we may
1695	 * recurse into the copy-on-write routine.
1696	 */
1697	if (curthread->td_pflags & TDP_COWINPROGRESS)
1698		return (-1);
1699	PHOLD(curproc);	/* Don't let the stack go away. */
1700	ump = VFSTOUFS(mp);
1701	matchcnt = 0;
1702	sentinel.wk_mp = NULL;
1703	sentinel.wk_type = D_SENTINAL;
1704	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1705	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1706	    wk = LIST_NEXT(&sentinel, wk_list)) {
1707		if (wk->wk_type == D_SENTINAL) {
1708			LIST_REMOVE(&sentinel, wk_list);
1709			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1710			continue;
1711		}
1712		if (wk->wk_state & INPROGRESS)
1713			panic("process_worklist_item: %p already in progress.",
1714			    wk);
1715		wk->wk_state |= INPROGRESS;
1716		remove_from_worklist(wk);
1717		FREE_LOCK(&lk);
1718		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1719			panic("process_worklist_item: suspended filesystem");
1720		switch (wk->wk_type) {
1721		case D_DIRREM:
1722			/* removal of a directory entry */
1723			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1724			break;
1725
1726		case D_FREEBLKS:
1727			/* releasing blocks and/or fragments from a file */
1728			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1729			    flags);
1730			break;
1731
1732		case D_FREEFRAG:
1733			/* releasing a fragment when replaced as a file grows */
1734			handle_workitem_freefrag(WK_FREEFRAG(wk));
1735			error = 0;
1736			break;
1737
1738		case D_FREEFILE:
1739			/* releasing an inode when its link count drops to 0 */
1740			handle_workitem_freefile(WK_FREEFILE(wk));
1741			error = 0;
1742			break;
1743
1744		default:
1745			panic("%s_process_worklist: Unknown type %s",
1746			    "softdep", TYPENAME(wk->wk_type));
1747			/* NOTREACHED */
1748		}
1749		vn_finished_secondary_write(mp);
1750		ACQUIRE_LOCK(&lk);
1751		if (error == 0) {
1752			if (++matchcnt == target)
1753				break;
1754			continue;
1755		}
1756		/*
1757		 * We have to retry the worklist item later.  Wake up any
1758		 * waiters who may be able to complete it immediately and
1759		 * add the item back to the head so we don't try to execute
1760		 * it again.
1761		 */
1762		wk->wk_state &= ~INPROGRESS;
1763		wake_worklist(wk);
1764		add_to_worklist(wk, WK_HEAD);
1765	}
1766	LIST_REMOVE(&sentinel, wk_list);
1767	/* Sentinel could've become the tail from remove_from_worklist. */
1768	if (ump->softdep_worklist_tail == &sentinel)
1769		ump->softdep_worklist_tail =
1770		    (struct worklist *)sentinel.wk_list.le_prev;
1771	PRELE(curproc);
1772	return (matchcnt);
1773}
1774
1775/*
1776 * Move dependencies from one buffer to another.
1777 */
1778int
1779softdep_move_dependencies(oldbp, newbp)
1780	struct buf *oldbp;
1781	struct buf *newbp;
1782{
1783	struct worklist *wk, *wktail;
1784	int dirty;
1785
1786	dirty = 0;
1787	wktail = NULL;
1788	ACQUIRE_LOCK(&lk);
1789	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1790		LIST_REMOVE(wk, wk_list);
1791		if (wk->wk_type == D_BMSAFEMAP &&
1792		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
1793			dirty = 1;
1794		if (wktail == NULL)
1795			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1796		else
1797			LIST_INSERT_AFTER(wktail, wk, wk_list);
1798		wktail = wk;
1799	}
1800	FREE_LOCK(&lk);
1801
1802	return (dirty);
1803}
1804
1805/*
1806 * Purge the work list of all items associated with a particular mount point.
1807 */
1808int
1809softdep_flushworklist(oldmnt, countp, td)
1810	struct mount *oldmnt;
1811	int *countp;
1812	struct thread *td;
1813{
1814	struct vnode *devvp;
1815	int count, error = 0;
1816	struct ufsmount *ump;
1817
1818	/*
1819	 * Alternately flush the block device associated with the mount
1820	 * point and process any dependencies that the flushing
1821	 * creates. We continue until no more worklist dependencies
1822	 * are found.
1823	 */
1824	*countp = 0;
1825	ump = VFSTOUFS(oldmnt);
1826	devvp = ump->um_devvp;
1827	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1828		*countp += count;
1829		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1830		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1831		VOP_UNLOCK(devvp, 0);
1832		if (error)
1833			break;
1834	}
1835	return (error);
1836}
1837
1838int
1839softdep_waitidle(struct mount *mp)
1840{
1841	struct ufsmount *ump;
1842	int error;
1843	int i;
1844
1845	ump = VFSTOUFS(mp);
1846	ACQUIRE_LOCK(&lk);
1847	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1848		ump->softdep_req = 1;
1849		if (ump->softdep_on_worklist)
1850			panic("softdep_waitidle: work added after flush.");
1851		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1852	}
1853	ump->softdep_req = 0;
1854	FREE_LOCK(&lk);
1855	error = 0;
1856	if (i == 10) {
1857		error = EBUSY;
1858		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1859		    mp);
1860	}
1861
1862	return (error);
1863}
1864
1865/*
1866 * Flush all vnodes and worklist items associated with a specified mount point.
1867 */
1868int
1869softdep_flushfiles(oldmnt, flags, td)
1870	struct mount *oldmnt;
1871	int flags;
1872	struct thread *td;
1873{
1874	int error, depcount, loopcnt, retry_flush_count, retry;
1875
1876	loopcnt = 10;
1877	retry_flush_count = 3;
1878retry_flush:
1879	error = 0;
1880
1881	/*
1882	 * Alternately flush the vnodes associated with the mount
1883	 * point and process any dependencies that the flushing
1884	 * creates. In theory, this loop can happen at most twice,
1885	 * but we give it a few extra just to be sure.
1886	 */
1887	for (; loopcnt > 0; loopcnt--) {
1888		/*
1889		 * Do another flush in case any vnodes were brought in
1890		 * as part of the cleanup operations.
1891		 */
1892		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1893			break;
1894		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1895		    depcount == 0)
1896			break;
1897	}
1898	/*
1899	 * If we are unmounting then it is an error to fail. If we
1900	 * are simply trying to downgrade to read-only, then filesystem
1901	 * activity can keep us busy forever, so we just fail with EBUSY.
1902	 */
1903	if (loopcnt == 0) {
1904		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1905			panic("softdep_flushfiles: looping");
1906		error = EBUSY;
1907	}
1908	if (!error)
1909		error = softdep_waitidle(oldmnt);
1910	if (!error) {
1911		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1912			retry = 0;
1913			MNT_ILOCK(oldmnt);
1914			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1915			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1916			if (oldmnt->mnt_nvnodelistsize > 0) {
1917				if (--retry_flush_count > 0) {
1918					retry = 1;
1919					loopcnt = 3;
1920				} else
1921					error = EBUSY;
1922			}
1923			MNT_IUNLOCK(oldmnt);
1924			if (retry)
1925				goto retry_flush;
1926		}
1927	}
1928	return (error);
1929}
1930
1931/*
1932 * Structure hashing.
1933 *
1934 * There are three types of structures that can be looked up:
1935 *	1) pagedep structures identified by mount point, inode number,
1936 *	   and logical block.
1937 *	2) inodedep structures identified by mount point and inode number.
1938 *	3) newblk structures identified by mount point and
1939 *	   physical block number.
1940 *
1941 * The "pagedep" and "inodedep" dependency structures are hashed
1942 * separately from the file blocks and inodes to which they correspond.
1943 * This separation helps when the in-memory copy of an inode or
1944 * file block must be replaced. It also obviates the need to access
1945 * an inode or file page when simply updating (or de-allocating)
1946 * dependency structures. Lookup of newblk structures is needed to
1947 * find newly allocated blocks when trying to associate them with
1948 * their allocdirect or allocindir structure.
1949 *
1950 * The lookup routines optionally create and hash a new instance when
1951 * an existing entry is not found.
1952 */
1953#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
1954#define NODELAY		0x0002	/* cannot do background work */
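
/*
 * The lookup routines below share a common pattern when DEPALLOC is
 * requested: search the hash chain while holding the softdep lock, and
 * if the entry is missing, drop the lock, allocate and initialize a new
 * structure, reacquire the lock, and search again in case another thread
 * created the entry while the lock was released.
 */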
1955
1956/*
1957 * Structures and routines associated with pagedep caching.
1958 */
1959LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1960u_long	pagedep_hash;		/* size of hash table - 1 */
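/*
 * The hash mixes the mount point pointer (shifted to drop its low-order
 * bits), the inode number, and the logical block number; pagedep_hash is
 * the power-of-two table size minus one and serves as the bucket mask.
 */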
1961#define	PAGEDEP_HASH(mp, inum, lbn) \
1962	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1963	    pagedep_hash])
1964
1965static int
1966pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1967	struct pagedep_hashhead *pagedephd;
1968	ino_t ino;
1969	ufs_lbn_t lbn;
1970	struct mount *mp;
1971	int flags;
1972	struct pagedep **pagedeppp;
1973{
1974	struct pagedep *pagedep;
1975
1976	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
1977		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn &&
1978		    mp == pagedep->pd_list.wk_mp) {
1979			*pagedeppp = pagedep;
1980			return (1);
1981		}
1982	}
1983	*pagedeppp = NULL;
1984	return (0);
1985}
1986/*
1987 * Look up a pagedep. Return 1 if found, 0 otherwise.
1988 * If not found, allocate if DEPALLOC flag is passed.
1989 * Found or allocated entry is returned in pagedeppp.
1990 * This routine must be called with splbio interrupts blocked.
1991 */
1992static int
1993pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
1994	struct mount *mp;
1995	struct buf *bp;
1996	ino_t ino;
1997	ufs_lbn_t lbn;
1998	int flags;
1999	struct pagedep **pagedeppp;
2000{
2001	struct pagedep *pagedep;
2002	struct pagedep_hashhead *pagedephd;
2003	struct worklist *wk;
2004	int ret;
2005	int i;
2006
2007	mtx_assert(&lk, MA_OWNED);
2008	if (bp) {
2009		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2010			if (wk->wk_type == D_PAGEDEP) {
2011				*pagedeppp = WK_PAGEDEP(wk);
2012				return (1);
2013			}
2014		}
2015	}
2016	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
2017	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2018	if (ret) {
2019		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2020			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2021		return (1);
2022	}
2023	if ((flags & DEPALLOC) == 0)
2024		return (0);
2025	FREE_LOCK(&lk);
2026	pagedep = malloc(sizeof(struct pagedep),
2027	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2028	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2029	ACQUIRE_LOCK(&lk);
2030	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2031	if (*pagedeppp) {
2032		/*
2033		 * This should never happen since we only create pagedeps
2034		 * with the vnode lock held.  Could be an assert.
2035		 */
2036		WORKITEM_FREE(pagedep, D_PAGEDEP);
2037		return (ret);
2038	}
2039	pagedep->pd_ino = ino;
2040	pagedep->pd_lbn = lbn;
2041	LIST_INIT(&pagedep->pd_dirremhd);
2042	LIST_INIT(&pagedep->pd_pendinghd);
2043	for (i = 0; i < DAHASHSZ; i++)
2044		LIST_INIT(&pagedep->pd_diraddhd[i]);
2045	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2046	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2047	*pagedeppp = pagedep;
2048	return (0);
2049}
2050
2051/*
2052 * Structures and routines associated with inodedep caching.
2053 */
2054LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
2055static u_long	inodedep_hash;	/* size of hash table - 1 */
2056#define	INODEDEP_HASH(fs, inum) \
2057      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
2058
2059static int
2060inodedep_find(inodedephd, fs, inum, inodedeppp)
2061	struct inodedep_hashhead *inodedephd;
2062	struct fs *fs;
2063	ino_t inum;
2064	struct inodedep **inodedeppp;
2065{
2066	struct inodedep *inodedep;
2067
2068	LIST_FOREACH(inodedep, inodedephd, id_hash)
2069		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
2070			break;
2071	if (inodedep) {
2072		*inodedeppp = inodedep;
2073		return (1);
2074	}
2075	*inodedeppp = NULL;
2076
2077	return (0);
2078}
2079/*
2080 * Look up an inodedep. Return 1 if found, 0 if not found.
2081 * If not found, allocate if DEPALLOC flag is passed.
2082 * Found or allocated entry is returned in inodedeppp.
2083 * This routine must be called with splbio interrupts blocked.
2084 */
2085static int
2086inodedep_lookup(mp, inum, flags, inodedeppp)
2087	struct mount *mp;
2088	ino_t inum;
2089	int flags;
2090	struct inodedep **inodedeppp;
2091{
2092	struct inodedep *inodedep;
2093	struct inodedep_hashhead *inodedephd;
2094	struct fs *fs;
2095
2096	mtx_assert(&lk, MA_OWNED);
2097	fs = VFSTOUFS(mp)->um_fs;
2098	inodedephd = INODEDEP_HASH(fs, inum);
2099
2100	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
2101		return (1);
2102	if ((flags & DEPALLOC) == 0)
2103		return (0);
2104	/*
2105	 * If we are over our limit, try to improve the situation.
2106	 */
2107	if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
2108		request_cleanup(mp, FLUSH_INODES);
2109	FREE_LOCK(&lk);
2110	inodedep = malloc(sizeof(struct inodedep),
2111		M_INODEDEP, M_SOFTDEP_FLAGS);
2112	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2113	ACQUIRE_LOCK(&lk);
2114	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
2115		WORKITEM_FREE(inodedep, D_INODEDEP);
2116		return (1);
2117	}
2118	inodedep->id_fs = fs;
2119	inodedep->id_ino = inum;
2120	inodedep->id_state = ALLCOMPLETE;
2121	inodedep->id_nlinkdelta = 0;
2122	inodedep->id_savedino1 = NULL;
2123	inodedep->id_savedsize = -1;
2124	inodedep->id_savedextsize = -1;
2125	inodedep->id_savednlink = -1;
2126	inodedep->id_bmsafemap = NULL;
2127	inodedep->id_mkdiradd = NULL;
2128	LIST_INIT(&inodedep->id_dirremhd);
2129	LIST_INIT(&inodedep->id_pendinghd);
2130	LIST_INIT(&inodedep->id_inowait);
2131	LIST_INIT(&inodedep->id_bufwait);
2132	TAILQ_INIT(&inodedep->id_inoreflst);
2133	TAILQ_INIT(&inodedep->id_inoupdt);
2134	TAILQ_INIT(&inodedep->id_newinoupdt);
2135	TAILQ_INIT(&inodedep->id_extupdt);
2136	TAILQ_INIT(&inodedep->id_newextupdt);
2137	TAILQ_INIT(&inodedep->id_freeblklst);
2138	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2139	*inodedeppp = inodedep;
2140	return (0);
2141}
2142
2143/*
2144 * Structures and routines associated with newblk caching.
2145 */
2146LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
2147u_long	newblk_hash;		/* size of hash table - 1 */
2148#define	NEWBLK_HASH(fs, inum) \
2149	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
2150
2151static int
2152newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
2153	struct newblk_hashhead *newblkhd;
2154	struct mount *mp;
2155	ufs2_daddr_t newblkno;
2156	int flags;
2157	struct newblk **newblkpp;
2158{
2159	struct newblk *newblk;
2160
2161	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2162		if (newblkno != newblk->nb_newblkno)
2163			continue;
2164		if (mp != newblk->nb_list.wk_mp)
2165			continue;
2166		/*
2167		 * If we're creating a new dependency, don't match those that
2168		 * have already been converted to allocdirects.  This is for
2169		 * a frag extend.
2170		 */
2171		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2172			continue;
2173		break;
2174	}
2175	if (newblk) {
2176		*newblkpp = newblk;
2177		return (1);
2178	}
2179	*newblkpp = NULL;
2180	return (0);
2181}
2182
2183/*
2184 * Look up a newblk. Return 1 if found, 0 if not found.
2185 * If not found, allocate if DEPALLOC flag is passed.
2186 * Found or allocated entry is returned in newblkpp.
2187 */
2188static int
2189newblk_lookup(mp, newblkno, flags, newblkpp)
2190	struct mount *mp;
2191	ufs2_daddr_t newblkno;
2192	int flags;
2193	struct newblk **newblkpp;
2194{
2195	struct newblk *newblk;
2196	struct newblk_hashhead *newblkhd;
2197
2198	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
2199	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
2200		return (1);
2201	if ((flags & DEPALLOC) == 0)
2202		return (0);
2203	FREE_LOCK(&lk);
2204	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2205	    M_SOFTDEP_FLAGS | M_ZERO);
2206	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2207	ACQUIRE_LOCK(&lk);
2208	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
2209		WORKITEM_FREE(newblk, D_NEWBLK);
2210		return (1);
2211	}
2212	newblk->nb_freefrag = NULL;
2213	LIST_INIT(&newblk->nb_indirdeps);
2214	LIST_INIT(&newblk->nb_newdirblk);
2215	LIST_INIT(&newblk->nb_jwork);
2216	newblk->nb_state = ATTACHED;
2217	newblk->nb_newblkno = newblkno;
2218	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2219	*newblkpp = newblk;
2220	return (0);
2221}
2222
2223/*
2224 * Structures and routines associated with freed indirect block caching.
2225 */
2226struct freeworklst *indir_hashtbl;
2227u_long	indir_hash;		/* size of hash table - 1 */
2228#define	INDIR_HASH(mp, blkno) \
2229	(&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])
2230
2231/*
2232 * Look up an indirect block in the indir hash table.  The freework is
2233 * removed and potentially freed.  The caller must do a blocking journal
2234 * write before writing to the blkno.
2235 */
2236static int
2237indirblk_lookup(mp, blkno)
2238	struct mount *mp;
2239	ufs2_daddr_t blkno;
2240{
2241	struct freework *freework;
2242	struct freeworklst *wkhd;
2243
2244	wkhd = INDIR_HASH(mp, blkno);
2245	TAILQ_FOREACH(freework, wkhd, fw_next) {
2246		if (freework->fw_blkno != blkno)
2247			continue;
2248		if (freework->fw_list.wk_mp != mp)
2249			continue;
2250		indirblk_remove(freework);
2251		return (1);
2252	}
2253	return (0);
2254}
2255
2256/*
2257 * Insert an indirect block represented by freework into the indirblk
2258 * hash table so that it may prevent the block from being re-used prior
2259 * to the journal being written.
2260 */
2261static void
2262indirblk_insert(freework)
2263	struct freework *freework;
2264{
2265	struct freeblks *freeblks;
2266	struct jsegdep *jsegdep;
2267	struct worklist *wk;
2268
2269	freeblks = freework->fw_freeblks;
2270	LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list)
2271		if (wk->wk_type == D_JSEGDEP)
2272			break;
2273	if (wk == NULL)
2274		return;
2275
2276	jsegdep = WK_JSEGDEP(wk);
2277	LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs);
2278	TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp,
2279	    freework->fw_blkno), freework, fw_next);
2280	freework->fw_state &= ~DEPCOMPLETE;
2281}
2282
2283static void
2284indirblk_remove(freework)
2285	struct freework *freework;
2286{
2287
2288	LIST_REMOVE(freework, fw_segs);
2289	TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp,
2290	    freework->fw_blkno), freework, fw_next);
2291	freework->fw_state |= DEPCOMPLETE;
2292	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2293		WORKITEM_FREE(freework, D_FREEWORK);
2294}
2295
2296/*
2297 * Executed during system initialization, before
2298 * mounting any filesystems.
2299 */
2300void
2301softdep_initialize()
2302{
2303	int i;
2304
2305	LIST_INIT(&mkdirlisthd);
2306	max_softdeps = desiredvnodes * 4;
2307	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
2308	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
2309	newblk_hashtbl = hashinit(desiredvnodes / 5,  M_NEWBLK, &newblk_hash);
2310	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
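	/*
	 * The indir table buckets are TAILQs, which hashinit() does not
	 * provide, so this table is allocated by hand.  Its size must be a
	 * power of two because indir_hash is used as a mask in INDIR_HASH().
	 */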
2311	i = 1 << (ffs(desiredvnodes / 10) - 1);
2312	indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK,
2313	    M_WAITOK);
2314	indir_hash = i - 1;
2315	for (i = 0; i <= indir_hash; i++)
2316		TAILQ_INIT(&indir_hashtbl[i]);
2317
2318	/* Initialize the bioops hack. */
2319	bioops.io_start = softdep_disk_io_initiation;
2320	bioops.io_complete = softdep_disk_write_complete;
2321	bioops.io_deallocate = softdep_deallocate_dependencies;
2322	bioops.io_countdeps = softdep_count_dependencies;
2323
2324	/* Initialize the callout with an mtx. */
2325	callout_init_mtx(&softdep_callout, &lk, 0);
2326}
2327
2328/*
2329 * Executed after all filesystems have been unmounted during
2330 * filesystem module unload.
2331 */
2332void
2333softdep_uninitialize()
2334{
2335
2336	callout_drain(&softdep_callout);
2337	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2338	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2339	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2340	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2341	free(indir_hashtbl, M_FREEWORK);
2342}
2343
2344/*
2345 * Called at mount time to notify the dependency code that a
2346 * filesystem wishes to use it.
2347 */
2348int
2349softdep_mount(devvp, mp, fs, cred)
2350	struct vnode *devvp;
2351	struct mount *mp;
2352	struct fs *fs;
2353	struct ucred *cred;
2354{
2355	struct csum_total cstotal;
2356	struct ufsmount *ump;
2357	struct cg *cgp;
2358	struct buf *bp;
2359	int error, cyl;
2360
2361	MNT_ILOCK(mp);
2362	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2363	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2364		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2365			MNTK_SOFTDEP | MNTK_NOASYNC;
2366	}
2367	MNT_IUNLOCK(mp);
2368	ump = VFSTOUFS(mp);
2369	LIST_INIT(&ump->softdep_workitem_pending);
2370	LIST_INIT(&ump->softdep_journal_pending);
2371	TAILQ_INIT(&ump->softdep_unlinked);
2372	LIST_INIT(&ump->softdep_dirtycg);
2373	ump->softdep_worklist_tail = NULL;
2374	ump->softdep_on_worklist = 0;
2375	ump->softdep_deps = 0;
2376	if ((fs->fs_flags & FS_SUJ) &&
2377	    (error = journal_mount(mp, fs, cred)) != 0) {
2378		printf("Failed to start journal: %d\n", error);
2379		return (error);
2380	}
2381	/*
2382	 * When doing soft updates, the counters in the
2383	 * superblock may have gotten out of sync. Recomputation
2384	 * can take a long time and can be deferred for background
2385	 * fsck.  However, the old behavior of scanning the cylinder
2386	 * groups and recalculating them at mount time is available
2387	 * by setting vfs.ffs.compute_summary_at_mount to one.
2388	 */
2389	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2390		return (0);
2391	bzero(&cstotal, sizeof cstotal);
2392	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2393		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2394		    fs->fs_cgsize, cred, &bp)) != 0) {
2395			brelse(bp);
2396			return (error);
2397		}
2398		cgp = (struct cg *)bp->b_data;
2399		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2400		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2401		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2402		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2403		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2404		brelse(bp);
2405	}
2406#ifdef DEBUG
2407	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2408		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2409#endif
2410	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2411	return (0);
2412}
2413
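/*
 * Called when soft updates are being disabled on a mount point.  Clear
 * the soft updates mount flags and, if journaled soft updates were in
 * use, release the journal state as well.
 */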
2414void
2415softdep_unmount(mp)
2416	struct mount *mp;
2417{
2418
2419	MNT_ILOCK(mp);
2420	mp->mnt_flag &= ~MNT_SOFTDEP;
2421	if (MOUNTEDSUJ(mp) == 0) {
2422		MNT_IUNLOCK(mp);
2423		return;
2424	}
2425	mp->mnt_flag &= ~MNT_SUJ;
2426	MNT_IUNLOCK(mp);
2427	journal_unmount(mp);
2428}
2429
2430struct jblocks {
2431	struct jseglst	jb_segs;	/* TAILQ of current segments. */
2432	struct jseg	*jb_writeseg;	/* Next write to complete. */
2433	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
2434	struct jextent	*jb_extent;	/* Extent array. */
2435	uint64_t	jb_nextseq;	/* Next sequence number. */
2436	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
2437	uint8_t		jb_needseg;	/* Need a forced segment. */
2438	uint8_t		jb_suspended;	/* Did journal suspend writes? */
2439	int		jb_avail;	/* Available extents. */
2440	int		jb_used;	/* Last used extent. */
2441	int		jb_head;	/* Allocator head. */
2442	int		jb_off;		/* Allocator extent offset. */
2443	int		jb_blocks;	/* Total disk blocks covered. */
2444	int		jb_free;	/* Total disk blocks free. */
2445	int		jb_min;		/* Minimum free space. */
2446	int		jb_low;		/* Low on space. */
2447	int		jb_age;		/* Insertion time of oldest rec. */
2448};
2449
2450struct jextent {
2451	ufs2_daddr_t	je_daddr;	/* Disk block address. */
2452	int		je_blocks;	/* Disk block count. */
2453};
2454
2455static struct jblocks *
2456jblocks_create(void)
2457{
2458	struct jblocks *jblocks;
2459
2460	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2461	TAILQ_INIT(&jblocks->jb_segs);
2462	jblocks->jb_avail = 10;
2463	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2464	    M_JBLOCKS, M_WAITOK | M_ZERO);
2465
2466	return (jblocks);
2467}
2468
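/*
 * Allocate up to 'bytes' of contiguous space from the current journal
 * extent, advancing (and wrapping) the allocator head when the extent is
 * exhausted.  Less space than requested may be handed out if the
 * remaining contiguous run is short; the amount actually allocated is
 * returned in *actual.
 */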
2469static ufs2_daddr_t
2470jblocks_alloc(jblocks, bytes, actual)
2471	struct jblocks *jblocks;
2472	int bytes;
2473	int *actual;
2474{
2475	ufs2_daddr_t daddr;
2476	struct jextent *jext;
2477	int freecnt;
2478	int blocks;
2479
2480	blocks = bytes / DEV_BSIZE;
2481	jext = &jblocks->jb_extent[jblocks->jb_head];
2482	freecnt = jext->je_blocks - jblocks->jb_off;
2483	if (freecnt == 0) {
2484		jblocks->jb_off = 0;
2485		if (++jblocks->jb_head > jblocks->jb_used)
2486			jblocks->jb_head = 0;
2487		jext = &jblocks->jb_extent[jblocks->jb_head];
2488		freecnt = jext->je_blocks;
2489	}
2490	if (freecnt > blocks)
2491		freecnt = blocks;
2492	*actual = freecnt * DEV_BSIZE;
2493	daddr = jext->je_daddr + jblocks->jb_off;
2494	jblocks->jb_off += freecnt;
2495	jblocks->jb_free -= freecnt;
2496
2497	return (daddr);
2498}
2499
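/*
 * Return space to the journal free count and wake anyone sleeping on the
 * jblocks.  If the journal had suspended writes, kick the worklist so
 * that the suspension can be reevaluated.
 */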
2500static void
2501jblocks_free(jblocks, mp, bytes)
2502	struct jblocks *jblocks;
2503	struct mount *mp;
2504	int bytes;
2505{
2506
2507	jblocks->jb_free += bytes / DEV_BSIZE;
2508	if (jblocks->jb_suspended)
2509		worklist_speedup();
2510	wakeup(jblocks);
2511}
2512
2513static void
2514jblocks_destroy(jblocks)
2515	struct jblocks *jblocks;
2516{
2517
2518	if (jblocks->jb_extent)
2519		free(jblocks->jb_extent, M_JBLOCKS);
2520	free(jblocks, M_JBLOCKS);
2521}
2522
2523static void
2524jblocks_add(jblocks, daddr, blocks)
2525	struct jblocks *jblocks;
2526	ufs2_daddr_t daddr;
2527	int blocks;
2528{
2529	struct jextent *jext;
2530
2531	jblocks->jb_blocks += blocks;
2532	jblocks->jb_free += blocks;
2533	jext = &jblocks->jb_extent[jblocks->jb_used];
2534	/* Adding the first block. */
2535	if (jext->je_daddr == 0) {
2536		jext->je_daddr = daddr;
2537		jext->je_blocks = blocks;
2538		return;
2539	}
2540	/* Extending the last extent. */
2541	if (jext->je_daddr + jext->je_blocks == daddr) {
2542		jext->je_blocks += blocks;
2543		return;
2544	}
2545	/* Adding a new extent. */
2546	if (++jblocks->jb_used == jblocks->jb_avail) {
2547		jblocks->jb_avail *= 2;
2548		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2549		    M_JBLOCKS, M_WAITOK | M_ZERO);
2550		memcpy(jext, jblocks->jb_extent,
2551		    sizeof(struct jextent) * jblocks->jb_used);
2552		free(jblocks->jb_extent, M_JBLOCKS);
2553		jblocks->jb_extent = jext;
2554	}
2555	jext = &jblocks->jb_extent[jblocks->jb_used];
2556	jext->je_daddr = daddr;
2557	jext->je_blocks = blocks;
2558	return;
2559}
2560
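/*
 * Look up the journal file (SUJ_FILE) in the root directory of the
 * filesystem and return a locked vnode for it in *vpp.
 */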
2561int
2562softdep_journal_lookup(mp, vpp)
2563	struct mount *mp;
2564	struct vnode **vpp;
2565{
2566	struct componentname cnp;
2567	struct vnode *dvp;
2568	ino_t sujournal;
2569	int error;
2570
2571	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2572	if (error)
2573		return (error);
2574	bzero(&cnp, sizeof(cnp));
2575	cnp.cn_nameiop = LOOKUP;
2576	cnp.cn_flags = ISLASTCN;
2577	cnp.cn_thread = curthread;
2578	cnp.cn_cred = curthread->td_ucred;
2579	cnp.cn_pnbuf = SUJ_FILE;
2580	cnp.cn_nameptr = SUJ_FILE;
2581	cnp.cn_namelen = strlen(SUJ_FILE);
2582	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2583	vput(dvp);
2584	if (error != 0)
2585		return (error);
2586	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2587	return (error);
2588}
2589
2590/*
2591 * Open and verify the journal file.
2592 */
2593static int
2594journal_mount(mp, fs, cred)
2595	struct mount *mp;
2596	struct fs *fs;
2597	struct ucred *cred;
2598{
2599	struct jblocks *jblocks;
2600	struct vnode *vp;
2601	struct inode *ip;
2602	ufs2_daddr_t blkno;
2603	int bcount;
2604	int error;
2605	int i;
2606
2607	error = softdep_journal_lookup(mp, &vp);
2608	if (error != 0) {
2609		printf("Failed to find journal.  Use tunefs to create one\n");
2610		return (error);
2611	}
2612	ip = VTOI(vp);
2613	if (ip->i_size < SUJ_MIN) {
2614		error = ENOSPC;
2615		goto out;
2616	}
2617	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2618	jblocks = jblocks_create();
2619	for (i = 0; i < bcount; i++) {
2620		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2621		if (error)
2622			break;
2623		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2624	}
2625	if (error) {
2626		jblocks_destroy(jblocks);
2627		goto out;
2628	}
2629	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2630	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2631	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2632out:
2633	if (error == 0) {
2634		MNT_ILOCK(mp);
2635		mp->mnt_flag |= MNT_SUJ;
2636		mp->mnt_flag &= ~MNT_SOFTDEP;
2637		MNT_IUNLOCK(mp);
2638		/*
2639		 * Only validate the journal contents if the
2640		 * filesystem is clean; otherwise we write the logs
2641		 * but they'll never be used.  If the filesystem was
2642		 * still dirty when we mounted it, the journal is
2643		 * invalid and a new journal can only be valid if it
2644		 * starts from a clean mount.
2645		 */
2646		if (fs->fs_clean) {
2647			DIP_SET(ip, i_modrev, fs->fs_mtime);
2648			ip->i_flags |= IN_MODIFIED;
2649			ffs_update(vp, 1);
2650		}
2651	}
2652	vput(vp);
2653	return (error);
2654}
2655
2656static void
2657journal_unmount(mp)
2658	struct mount *mp;
2659{
2660	struct ufsmount *ump;
2661
2662	ump = VFSTOUFS(mp);
2663	if (ump->softdep_jblocks)
2664		jblocks_destroy(ump->softdep_jblocks);
2665	ump->softdep_jblocks = NULL;
2666}
2667
2668/*
2669 * Called when a journal record is ready to be written.  Space is allocated
2670 * and the journal entry is created when the journal is flushed to stable
2671 * store.
2672 */
2673static void
2674add_to_journal(wk)
2675	struct worklist *wk;
2676{
2677	struct ufsmount *ump;
2678
2679	mtx_assert(&lk, MA_OWNED);
2680	ump = VFSTOUFS(wk->wk_mp);
2681	if (wk->wk_state & ONWORKLIST)
2682		panic("add_to_journal: %s(0x%X) already on list",
2683		    TYPENAME(wk->wk_type), wk->wk_state);
2684	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2685	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2686		ump->softdep_jblocks->jb_age = ticks;
2687		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2688	} else
2689		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2690	ump->softdep_journal_tail = wk;
2691	ump->softdep_on_journal += 1;
2692}
2693
2694/*
2695 * Remove an arbitrary item from the journal worklist while maintaining
2696 * the tail pointer.  This happens when a new operation obviates the need to
2697 * journal an old operation.
2698 */
2699static void
2700remove_from_journal(wk)
2701	struct worklist *wk;
2702{
2703	struct ufsmount *ump;
2704
2705	mtx_assert(&lk, MA_OWNED);
2706	ump = VFSTOUFS(wk->wk_mp);
2707#ifdef SUJ_DEBUG
2708	{
2709		struct worklist *wkn;
2710
2711		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2712			if (wkn == wk)
2713				break;
2714		if (wkn == NULL)
2715			panic("remove_from_journal: %p is not in journal", wk);
2716	}
2717#endif
2718	/*
2719	 * We emulate a TAILQ to save space in most structures which do not
2720	 * require TAILQ semantics.  Here we must update the tail pointer
2721	 * when the item being removed is the one it points to.  This works
2722	 * only if the worklist linkage is at the beginning of the structure.
2723	 */
2724	if (ump->softdep_journal_tail == wk)
2725		ump->softdep_journal_tail =
2726		    (struct worklist *)wk->wk_list.le_prev;
2727
2728	WORKLIST_REMOVE(wk);
2729	ump->softdep_on_journal -= 1;
2730}
2731
2732/*
2733 * Check for journal space as well as dependency limits so the prelink
2734 * code can throttle both journaled and non-journaled filesystems.
2735 * Threshold is 0 for low and 1 for min.
2736 */
2737static int
2738journal_space(ump, thresh)
2739	struct ufsmount *ump;
2740	int thresh;
2741{
2742	struct jblocks *jblocks;
2743	int avail;
2744
2745	jblocks = ump->softdep_jblocks;
2746	if (jblocks == NULL)
2747		return (1);
2748	/*
2749	 * We use a tighter restriction here to prevent request_cleanup(),
2750	 * running in other threads, from running into locks we currently hold.
2751	 */
2752	if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9)
2753		return (0);
2754	if (thresh)
2755		thresh = jblocks->jb_min;
2756	else
2757		thresh = jblocks->jb_low;
2758	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2759	avail = jblocks->jb_free - avail;
2760
2761	return (avail > thresh);
2762}
2763
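/*
 * Suspend filesystem writes because the journal has dropped below its
 * minimum free space.  The suspension is recorded in the jblocks so that
 * journal_unsuspend() can resume writes once space is recovered.
 */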
2764static void
2765journal_suspend(ump)
2766	struct ufsmount *ump;
2767{
2768	struct jblocks *jblocks;
2769	struct mount *mp;
2770
2771	mp = UFSTOVFS(ump);
2772	jblocks = ump->softdep_jblocks;
2773	MNT_ILOCK(mp);
2774	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2775		stat_journal_min++;
2776		mp->mnt_kern_flag |= MNTK_SUSPEND;
2777		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2778	}
2779	jblocks->jb_suspended = 1;
2780	MNT_IUNLOCK(mp);
2781}
2782
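/*
 * Resume a filesystem suspended by journal_suspend() once sufficient
 * journal space is available again.  Returns 1 if writes were resumed.
 */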
2783static int
2784journal_unsuspend(struct ufsmount *ump)
2785{
2786	struct jblocks *jblocks;
2787	struct mount *mp;
2788
2789	mp = UFSTOVFS(ump);
2790	jblocks = ump->softdep_jblocks;
2791
2792	if (jblocks != NULL && jblocks->jb_suspended &&
2793	    journal_space(ump, jblocks->jb_min)) {
2794		jblocks->jb_suspended = 0;
2795		FREE_LOCK(&lk);
2796		mp->mnt_susp_owner = curthread;
2797		vfs_write_resume(mp);
2798		ACQUIRE_LOCK(&lk);
2799		return (1);
2800	}
2801	return (0);
2802}
2803
2804/*
2805 * Called before any allocation function to be certain that there is
2806 * sufficient space in the journal prior to creating any new records.
2807 * Since in the case of block allocation we may have multiple locked
2808 * buffers at the time of the actual allocation we can not block
2809 * when the journal records are created.  Doing so would create a deadlock
2810 * if any of these buffers needed to be flushed to reclaim space.  Instead
2811 * we require a sufficiently large amount of available space such that
2812 * each thread in the system could have passed this allocation check and
2813 * still have sufficient free space.  With 20% of a minimum journal size
2814 * of 1MB we have 6553 records available.
2815 */
2816int
2817softdep_prealloc(vp, waitok)
2818	struct vnode *vp;
2819	int waitok;
2820{
2821	struct ufsmount *ump;
2822
2823	/*
2824	 * Nothing to do if we are not running journaled soft updates.
2825	 * If we currently hold the snapshot lock, we must avoid handling
2826	 * other resources that could cause deadlock.
2827	 */
2828	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2829		return (0);
2830	ump = VFSTOUFS(vp->v_mount);
2831	ACQUIRE_LOCK(&lk);
2832	if (journal_space(ump, 0)) {
2833		FREE_LOCK(&lk);
2834		return (0);
2835	}
2836	stat_journal_low++;
2837	FREE_LOCK(&lk);
2838	if (waitok == MNT_NOWAIT)
2839		return (ENOSPC);
2840	/*
2841	 * Attempt to sync this vnode once to flush any journal
2842	 * work attached to it.
2843	 */
2844	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2845		ffs_syncvnode(vp, waitok, 0);
2846	ACQUIRE_LOCK(&lk);
2847	process_removes(vp);
2848	process_truncates(vp);
2849	if (journal_space(ump, 0) == 0) {
2850		softdep_speedup();
2851		if (journal_space(ump, 1) == 0)
2852			journal_suspend(ump);
2853	}
2854	FREE_LOCK(&lk);
2855
2856	return (0);
2857}
2858
2859/*
2860 * Before adjusting a link count on a vnode verify that we have sufficient
2861 * journal space.  If not, process operations that depend on the currently
2862 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2863 * and softdep flush threads can not acquire these locks to reclaim space.
2864 */
2865static void
2866softdep_prelink(dvp, vp)
2867	struct vnode *dvp;
2868	struct vnode *vp;
2869{
2870	struct ufsmount *ump;
2871
2872	ump = VFSTOUFS(dvp->v_mount);
2873	mtx_assert(&lk, MA_OWNED);
2874	/*
2875	 * Nothing to do if we have sufficient journal space.
2876	 * If we currently hold the snapshot lock, we must avoid
2877	 * handling other resources that could cause deadlock.
2878	 */
2879	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
2880		return;
2881	stat_journal_low++;
2882	FREE_LOCK(&lk);
2883	if (vp)
2884		ffs_syncvnode(vp, MNT_NOWAIT, 0);
2885	ffs_syncvnode(dvp, MNT_WAIT, 0);
2886	ACQUIRE_LOCK(&lk);
2887	/* Process vp before dvp as it may create .. removes. */
2888	if (vp) {
2889		process_removes(vp);
2890		process_truncates(vp);
2891	}
2892	process_removes(dvp);
2893	process_truncates(dvp);
2894	softdep_speedup();
2895	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
2896	if (journal_space(ump, 0) == 0) {
2897		softdep_speedup();
2898		if (journal_space(ump, 1) == 0)
2899			journal_suspend(ump);
2900	}
2901}
2902
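/*
 * The routines that follow convert in-memory journal dependencies into
 * their on-disk record formats.  jseg_write() fills in the segment
 * header record that leads off each device block of a journal segment.
 */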
2903static void
2904jseg_write(ump, jseg, data)
2905	struct ufsmount *ump;
2906	struct jseg *jseg;
2907	uint8_t *data;
2908{
2909	struct jsegrec *rec;
2910
2911	rec = (struct jsegrec *)data;
2912	rec->jsr_seq = jseg->js_seq;
2913	rec->jsr_oldest = jseg->js_oldseq;
2914	rec->jsr_cnt = jseg->js_cnt;
2915	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2916	rec->jsr_crc = 0;
2917	rec->jsr_time = ump->um_fs->fs_mtime;
2918}
2919
2920static inline void
2921inoref_write(inoref, jseg, rec)
2922	struct inoref *inoref;
2923	struct jseg *jseg;
2924	struct jrefrec *rec;
2925{
2926
2927	inoref->if_jsegdep->jd_seg = jseg;
2928	rec->jr_ino = inoref->if_ino;
2929	rec->jr_parent = inoref->if_parent;
2930	rec->jr_nlink = inoref->if_nlink;
2931	rec->jr_mode = inoref->if_mode;
2932	rec->jr_diroff = inoref->if_diroff;
2933}
2934
2935static void
2936jaddref_write(jaddref, jseg, data)
2937	struct jaddref *jaddref;
2938	struct jseg *jseg;
2939	uint8_t *data;
2940{
2941	struct jrefrec *rec;
2942
2943	rec = (struct jrefrec *)data;
2944	rec->jr_op = JOP_ADDREF;
2945	inoref_write(&jaddref->ja_ref, jseg, rec);
2946}
2947
2948static void
2949jremref_write(jremref, jseg, data)
2950	struct jremref *jremref;
2951	struct jseg *jseg;
2952	uint8_t *data;
2953{
2954	struct jrefrec *rec;
2955
2956	rec = (struct jrefrec *)data;
2957	rec->jr_op = JOP_REMREF;
2958	inoref_write(&jremref->jr_ref, jseg, rec);
2959}
2960
2961static void
2962jmvref_write(jmvref, jseg, data)
2963	struct jmvref *jmvref;
2964	struct jseg *jseg;
2965	uint8_t *data;
2966{
2967	struct jmvrec *rec;
2968
2969	rec = (struct jmvrec *)data;
2970	rec->jm_op = JOP_MVREF;
2971	rec->jm_ino = jmvref->jm_ino;
2972	rec->jm_parent = jmvref->jm_parent;
2973	rec->jm_oldoff = jmvref->jm_oldoff;
2974	rec->jm_newoff = jmvref->jm_newoff;
2975}
2976
2977static void
2978jnewblk_write(jnewblk, jseg, data)
2979	struct jnewblk *jnewblk;
2980	struct jseg *jseg;
2981	uint8_t *data;
2982{
2983	struct jblkrec *rec;
2984
2985	jnewblk->jn_jsegdep->jd_seg = jseg;
2986	rec = (struct jblkrec *)data;
2987	rec->jb_op = JOP_NEWBLK;
2988	rec->jb_ino = jnewblk->jn_ino;
2989	rec->jb_blkno = jnewblk->jn_blkno;
2990	rec->jb_lbn = jnewblk->jn_lbn;
2991	rec->jb_frags = jnewblk->jn_frags;
2992	rec->jb_oldfrags = jnewblk->jn_oldfrags;
2993}
2994
2995static void
2996jfreeblk_write(jfreeblk, jseg, data)
2997	struct jfreeblk *jfreeblk;
2998	struct jseg *jseg;
2999	uint8_t *data;
3000{
3001	struct jblkrec *rec;
3002
3003	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3004	rec = (struct jblkrec *)data;
3005	rec->jb_op = JOP_FREEBLK;
3006	rec->jb_ino = jfreeblk->jf_ino;
3007	rec->jb_blkno = jfreeblk->jf_blkno;
3008	rec->jb_lbn = jfreeblk->jf_lbn;
3009	rec->jb_frags = jfreeblk->jf_frags;
3010	rec->jb_oldfrags = 0;
3011}
3012
3013static void
3014jfreefrag_write(jfreefrag, jseg, data)
3015	struct jfreefrag *jfreefrag;
3016	struct jseg *jseg;
3017	uint8_t *data;
3018{
3019	struct jblkrec *rec;
3020
3021	jfreefrag->fr_jsegdep->jd_seg = jseg;
3022	rec = (struct jblkrec *)data;
3023	rec->jb_op = JOP_FREEBLK;
3024	rec->jb_ino = jfreefrag->fr_ino;
3025	rec->jb_blkno = jfreefrag->fr_blkno;
3026	rec->jb_lbn = jfreefrag->fr_lbn;
3027	rec->jb_frags = jfreefrag->fr_frags;
3028	rec->jb_oldfrags = 0;
3029}
3030
3031static void
3032jtrunc_write(jtrunc, jseg, data)
3033	struct jtrunc *jtrunc;
3034	struct jseg *jseg;
3035	uint8_t *data;
3036{
3037	struct jtrncrec *rec;
3038
3039	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3040	rec = (struct jtrncrec *)data;
3041	rec->jt_op = JOP_TRUNC;
3042	rec->jt_ino = jtrunc->jt_ino;
3043	rec->jt_size = jtrunc->jt_size;
3044	rec->jt_extsize = jtrunc->jt_extsize;
3045}
3046
3047static void
3048jfsync_write(jfsync, jseg, data)
3049	struct jfsync *jfsync;
3050	struct jseg *jseg;
3051	uint8_t *data;
3052{
3053	struct jtrncrec *rec;
3054
3055	rec = (struct jtrncrec *)data;
3056	rec->jt_op = JOP_SYNC;
3057	rec->jt_ino = jfsync->jfs_ino;
3058	rec->jt_size = jfsync->jfs_size;
3059	rec->jt_extsize = jfsync->jfs_extsize;
3060}
3061
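/*
 * Force out all pending journal records for a mount point, looping until
 * the journal worklist is empty.
 */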
3062static void
3063softdep_flushjournal(mp)
3064	struct mount *mp;
3065{
3066	struct jblocks *jblocks;
3067	struct ufsmount *ump;
3068
3069	if (MOUNTEDSUJ(mp) == 0)
3070		return;
3071	ump = VFSTOUFS(mp);
3072	jblocks = ump->softdep_jblocks;
3073	ACQUIRE_LOCK(&lk);
3074	while (ump->softdep_on_journal) {
3075		jblocks->jb_needseg = 1;
3076		softdep_process_journal(mp, NULL, MNT_WAIT);
3077	}
3078	FREE_LOCK(&lk);
3079}
3080
3081/*
3082 * Flush some journal records to disk.
3083 */
3084static void
3085softdep_process_journal(mp, needwk, flags)
3086	struct mount *mp;
3087	struct worklist *needwk;
3088	int flags;
3089{
3090	struct jblocks *jblocks;
3091	struct ufsmount *ump;
3092	struct worklist *wk;
3093	struct jseg *jseg;
3094	struct buf *bp;
3095	uint8_t *data;
3096	struct fs *fs;
3097	int segwritten;
3098	int jrecmin;	/* Minimum records per block. */
3099	int jrecmax;	/* Maximum records per block. */
3100	int size;
3101	int cnt;
3102	int off;
3103	int devbsize;
3104
3105	if (MOUNTEDSUJ(mp) == 0)
3106		return;
3107	ump = VFSTOUFS(mp);
3108	fs = ump->um_fs;
3109	jblocks = ump->softdep_jblocks;
3110	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3111	/*
3112	 * We write anywhere between a disk block and an fs block.  The upper
3113	 * bound is picked to prevent buffer cache fragmentation and limit
3114	 * processing time per I/O.
3115	 */
3116	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3117	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3118	segwritten = 0;
3119	for (;;) {
3120		cnt = ump->softdep_on_journal;
3121		/*
3122		 * Criteria for writing a segment:
3123		 * 1) We have a full block.
3124		 * 2) We're called from jwait() and haven't found the
3125		 *    journal item yet.
3126		 * 3) Always write if needseg is set.
3127		 * 4) If we are called from process_worklist and have
3128		 *    not yet written anything we write a partial block
3129		 *    to enforce a 1 second maximum latency on journal
3130		 *    entries.
3131		 */
3132		if (cnt < (jrecmax - 1) && needwk == NULL &&
3133		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3134			break;
3135		cnt++;
3136		/*
3137		 * Verify some free journal space.  softdep_prealloc() should
3138		 * guarantee that we don't run out, so this is indicative of
3139		 * a problem with the flow control.  Try to recover
3140		 * gracefully in any event.
3141		 */
3142		while (jblocks->jb_free == 0) {
3143			if (flags != MNT_WAIT)
3144				break;
3145			printf("softdep: Out of journal space!\n");
3146			softdep_speedup();
3147			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
3148		}
3149		FREE_LOCK(&lk);
3150		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3151		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3152		LIST_INIT(&jseg->js_entries);
3153		LIST_INIT(&jseg->js_indirs);
3154		jseg->js_state = ATTACHED;
3155		jseg->js_jblocks = jblocks;
3156		bp = geteblk(fs->fs_bsize, 0);
3157		ACQUIRE_LOCK(&lk);
3158		/*
3159		 * If there was a race while we were allocating the block
3160		 * and jseg, the entry we care about was likely written.
3161		 * We bail out in both the WAIT and NOWAIT case and assume
3162		 * the caller will loop if the entry it cares about is
3163		 * not written.
3164		 */
3165		cnt = ump->softdep_on_journal;
3166		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3167			bp->b_flags |= B_INVAL | B_NOCACHE;
3168			WORKITEM_FREE(jseg, D_JSEG);
3169			FREE_LOCK(&lk);
3170			brelse(bp);
3171			ACQUIRE_LOCK(&lk);
3172			break;
3173		}
3174		/*
3175		 * Calculate the disk block size required for the available
3176		 * records rounded to the min size.
3177		 */
3178		if (cnt == 0)
3179			size = devbsize;
3180		else if (cnt < jrecmax)
3181			size = howmany(cnt, jrecmin) * devbsize;
3182		else
3183			size = fs->fs_bsize;
3184		/*
3185		 * Allocate a disk block for this journal data and account
3186		 * for truncation of the requested size if enough contiguous
3187		 * space was not available.
3188		 */
3189		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3190		bp->b_lblkno = bp->b_blkno;
3191		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3192		bp->b_bcount = size;
3193		bp->b_bufobj = &ump->um_devvp->v_bufobj;
3194		bp->b_flags &= ~B_INVAL;
3195		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3196		/*
3197		 * Initialize our jseg with cnt records.  Assign the next
3198		 * sequence number to it and link it in-order.
3199		 */
3200		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3201		jseg->js_buf = bp;
3202		jseg->js_cnt = cnt;
3203		jseg->js_refs = cnt + 1;	/* Self ref. */
3204		jseg->js_size = size;
3205		jseg->js_seq = jblocks->jb_nextseq++;
3206		if (jblocks->jb_oldestseg == NULL)
3207			jblocks->jb_oldestseg = jseg;
3208		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3209		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3210		if (jblocks->jb_writeseg == NULL)
3211			jblocks->jb_writeseg = jseg;
3212		/*
3213		 * Start filling in records from the pending list.
3214		 */
3215		data = bp->b_data;
3216		off = 0;
3217		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3218		    != NULL) {
3219			if (cnt == 0)
3220				break;
3221			/* Place a segment header on every device block. */
3222			if ((off % devbsize) == 0) {
3223				jseg_write(ump, jseg, data);
3224				off += JREC_SIZE;
3225				data = bp->b_data + off;
3226			}
3227			if (wk == needwk)
3228				needwk = NULL;
3229			remove_from_journal(wk);
3230			wk->wk_state |= INPROGRESS;
3231			WORKLIST_INSERT(&jseg->js_entries, wk);
3232			switch (wk->wk_type) {
3233			case D_JADDREF:
3234				jaddref_write(WK_JADDREF(wk), jseg, data);
3235				break;
3236			case D_JREMREF:
3237				jremref_write(WK_JREMREF(wk), jseg, data);
3238				break;
3239			case D_JMVREF:
3240				jmvref_write(WK_JMVREF(wk), jseg, data);
3241				break;
3242			case D_JNEWBLK:
3243				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3244				break;
3245			case D_JFREEBLK:
3246				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3247				break;
3248			case D_JFREEFRAG:
3249				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3250				break;
3251			case D_JTRUNC:
3252				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3253				break;
3254			case D_JFSYNC:
3255				jfsync_write(WK_JFSYNC(wk), jseg, data);
3256				break;
3257			default:
3258				panic("process_journal: Unknown type %s",
3259				    TYPENAME(wk->wk_type));
3260				/* NOTREACHED */
3261			}
3262			off += JREC_SIZE;
3263			data = bp->b_data + off;
3264			cnt--;
3265		}
3266		/*
3267		 * Write this one buffer and continue.
3268		 */
3269		segwritten = 1;
3270		jblocks->jb_needseg = 0;
3271		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3272		FREE_LOCK(&lk);
3273		BO_LOCK(bp->b_bufobj);
3274		bgetvp(ump->um_devvp, bp);
3275		BO_UNLOCK(bp->b_bufobj);
3276		/*
3277		 * We only do the blocking wait once we find the journal
3278		 * entry we're looking for.
3279		 */
3280		if (needwk == NULL && flags == MNT_WAIT)
3281			bwrite(bp);
3282		else
3283			bawrite(bp);
3284		ACQUIRE_LOCK(&lk);
3285	}
3286	/*
3287	 * If we've suspended the filesystem because we ran out of journal
3288	 * space either try to sync it here to make some progress or
3289	 * unsuspend it if we already have.
3290	 */
3291	if (flags == 0 && jblocks->jb_suspended) {
3292		if (journal_unsuspend(ump))
3293			return;
3294		FREE_LOCK(&lk);
3295		VFS_SYNC(mp, MNT_NOWAIT);
3296		ffs_sbupdate(ump, MNT_WAIT, 0);
3297		ACQUIRE_LOCK(&lk);
3298	}
3299}
3300
3301/*
3302 * Complete a jseg, allowing all dependencies awaiting journal writes
3303 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3304 * structures so that the journal segment can be freed to reclaim space.
3305 */
3306static void
3307complete_jseg(jseg)
3308	struct jseg *jseg;
3309{
3310	struct worklist *wk;
3311	struct jmvref *jmvref;
3312	int waiting;
3313#ifdef INVARIANTS
3314	int i = 0;
3315#endif
3316
3317	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3318		WORKLIST_REMOVE(wk);
3319		waiting = wk->wk_state & IOWAITING;
3320		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3321		wk->wk_state |= COMPLETE;
3322		KASSERT(i++ < jseg->js_cnt,
3323		    ("handle_written_jseg: overflow %d >= %d",
3324		    i - 1, jseg->js_cnt));
3325		switch (wk->wk_type) {
3326		case D_JADDREF:
3327			handle_written_jaddref(WK_JADDREF(wk));
3328			break;
3329		case D_JREMREF:
3330			handle_written_jremref(WK_JREMREF(wk));
3331			break;
3332		case D_JMVREF:
3333			rele_jseg(jseg);	/* No jsegdep. */
3334			jmvref = WK_JMVREF(wk);
3335			LIST_REMOVE(jmvref, jm_deps);
3336			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3337				free_pagedep(jmvref->jm_pagedep);
3338			WORKITEM_FREE(jmvref, D_JMVREF);
3339			break;
3340		case D_JNEWBLK:
3341			handle_written_jnewblk(WK_JNEWBLK(wk));
3342			break;
3343		case D_JFREEBLK:
3344			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3345			break;
3346		case D_JTRUNC:
3347			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3348			break;
3349		case D_JFSYNC:
3350			rele_jseg(jseg);	/* No jsegdep. */
3351			WORKITEM_FREE(wk, D_JFSYNC);
3352			break;
3353		case D_JFREEFRAG:
3354			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3355			break;
3356		default:
3357			panic("handle_written_jseg: Unknown type %s",
3358			    TYPENAME(wk->wk_type));
3359			/* NOTREACHED */
3360		}
3361		if (waiting)
3362			wakeup(wk);
3363	}
3364	/* Release the self reference so the structure may be freed. */
3365	rele_jseg(jseg);
3366}
3367
3368/*
3369 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
3370 * completions in order only.
3371 */
3372static void
3373handle_written_jseg(jseg, bp)
3374	struct jseg *jseg;
3375	struct buf *bp;
3376{
3377	struct jblocks *jblocks;
3378	struct jseg *jsegn;
3379
3380	if (jseg->js_refs == 0)
3381		panic("handle_written_jseg: No self-reference on %p", jseg);
3382	jseg->js_state |= DEPCOMPLETE;
3383	/*
3384	 * We'll never need this buffer again, set flags so it will be
3385	 * discarded.
3386	 */
3387	bp->b_flags |= B_INVAL | B_NOCACHE;
3388	jblocks = jseg->js_jblocks;
3389	/*
3390	 * Don't allow out of order completions.  If this isn't the first
3391	 * block wait for it to write before we're done.
3392	 */
3393	if (jseg != jblocks->jb_writeseg)
3394		return;
3395	/* Iterate through available jsegs processing their entries. */
3396	do {
3397		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3398		jsegn = TAILQ_NEXT(jseg, js_next);
3399		complete_jseg(jseg);
3400		jseg = jsegn;
3401	} while (jseg && jseg->js_state & DEPCOMPLETE);
3402	jblocks->jb_writeseg = jseg;
3403	/*
3404	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3405	 */
3406	free_jsegs(jblocks);
3407}
3408
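/*
 * Detach and return the jsegdep from an inoref so it can be transferred
 * to the structure whose completion will release the journal segment.
 */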
3409static inline struct jsegdep *
3410inoref_jseg(inoref)
3411	struct inoref *inoref;
3412{
3413	struct jsegdep *jsegdep;
3414
3415	jsegdep = inoref->if_jsegdep;
3416	inoref->if_jsegdep = NULL;
3417
3418	return (jsegdep);
3419}
3420
3421/*
3422 * Called once a jremref has made it to stable store.  The jremref is marked
3423 * complete and we attempt to free it.  Any pagedep writes sleeping waiting
3424 * for the jremref to complete will be awoken by free_jremref.
3425 */
3426static void
3427handle_written_jremref(jremref)
3428	struct jremref *jremref;
3429{
3430	struct inodedep *inodedep;
3431	struct jsegdep *jsegdep;
3432	struct dirrem *dirrem;
3433
3434	/* Grab the jsegdep. */
3435	jsegdep = inoref_jseg(&jremref->jr_ref);
3436	/*
3437	 * Remove us from the inoref list.
3438	 */
3439	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3440	    0, &inodedep) == 0)
3441		panic("handle_written_jremref: Lost inodedep");
3442	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3443	/*
3444	 * Complete the dirrem.
3445	 */
3446	dirrem = jremref->jr_dirrem;
3447	jremref->jr_dirrem = NULL;
3448	LIST_REMOVE(jremref, jr_deps);
3449	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3450	jwork_insert(&dirrem->dm_jwork, jsegdep);
3451	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3452	    (dirrem->dm_state & COMPLETE) != 0)
3453		add_to_worklist(&dirrem->dm_list, 0);
3454	free_jremref(jremref);
3455}
3456
3457/*
3458 * Called once a jaddref has made it to stable store.  The dependency is
3459 * marked complete and any dependent structures are added to the inode
3460 * bufwait list to be completed as soon as it is written.  If a bitmap write
3461 * depends on this entry we move the inode into the inodedephd of the
3462 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3463 */
3464static void
3465handle_written_jaddref(jaddref)
3466	struct jaddref *jaddref;
3467{
3468	struct jsegdep *jsegdep;
3469	struct inodedep *inodedep;
3470	struct diradd *diradd;
3471	struct mkdir *mkdir;
3472
3473	/* Grab the jsegdep. */
3474	jsegdep = inoref_jseg(&jaddref->ja_ref);
3475	mkdir = NULL;
3476	diradd = NULL;
3477	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3478	    0, &inodedep) == 0)
3479		panic("handle_written_jaddref: Lost inodedep.");
3480	if (jaddref->ja_diradd == NULL)
3481		panic("handle_written_jaddref: No dependency");
3482	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3483		diradd = jaddref->ja_diradd;
3484		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3485	} else if (jaddref->ja_state & MKDIR_PARENT) {
3486		mkdir = jaddref->ja_mkdir;
3487		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3488	} else if (jaddref->ja_state & MKDIR_BODY)
3489		mkdir = jaddref->ja_mkdir;
3490	else
3491		panic("handle_written_jaddref: Unknown dependency %p",
3492		    jaddref->ja_diradd);
3493	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3494	/*
3495	 * Remove us from the inode list.
3496	 */
3497	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3498	/*
3499	 * The mkdir may be waiting on the jaddref to clear before freeing.
3500	 */
3501	if (mkdir) {
3502		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3503		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3504		    TYPENAME(mkdir->md_list.wk_type)));
3505		mkdir->md_jaddref = NULL;
3506		diradd = mkdir->md_diradd;
3507		mkdir->md_state |= DEPCOMPLETE;
3508		complete_mkdir(mkdir);
3509	}
3510	jwork_insert(&diradd->da_jwork, jsegdep);
3511	if (jaddref->ja_state & NEWBLOCK) {
3512		inodedep->id_state |= ONDEPLIST;
3513		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3514		    inodedep, id_deps);
3515	}
3516	free_jaddref(jaddref);
3517}
3518
3519/*
3520 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3521 * is placed in the bmsafemap to await notification of a written bitmap.  If
3522 * the operation was canceled we add the segdep to the appropriate
3523 * dependency to free the journal space once the canceling operation
3524 * completes.
3525 */
3526static void
3527handle_written_jnewblk(jnewblk)
3528	struct jnewblk *jnewblk;
3529{
3530	struct bmsafemap *bmsafemap;
3531	struct freefrag *freefrag;
3532	struct freework *freework;
3533	struct jsegdep *jsegdep;
3534	struct newblk *newblk;
3535
3536	/* Grab the jsegdep. */
3537	jsegdep = jnewblk->jn_jsegdep;
3538	jnewblk->jn_jsegdep = NULL;
3539	if (jnewblk->jn_dep == NULL)
3540		panic("handle_written_jnewblk: No dependency for the segdep.");
3541	switch (jnewblk->jn_dep->wk_type) {
3542	case D_NEWBLK:
3543	case D_ALLOCDIRECT:
3544	case D_ALLOCINDIR:
3545		/*
3546		 * Add the written block to the bmsafemap so it can
3547		 * be notified when the bitmap is on disk.
3548		 */
3549		newblk = WK_NEWBLK(jnewblk->jn_dep);
3550		newblk->nb_jnewblk = NULL;
3551		if ((newblk->nb_state & GOINGAWAY) == 0) {
3552			bmsafemap = newblk->nb_bmsafemap;
3553			newblk->nb_state |= ONDEPLIST;
3554			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3555			    nb_deps);
3556		}
3557		jwork_insert(&newblk->nb_jwork, jsegdep);
3558		break;
3559	case D_FREEFRAG:
3560		/*
3561		 * A new block being removed by a freefrag when it is replaced by
3562		 * a fragment extension.
3563		 */
3564		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3565		freefrag->ff_jdep = NULL;
3566		WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
3567		break;
3568	case D_FREEWORK:
3569		/*
3570		 * A direct block was removed by truncate.
3571		 */
3572		freework = WK_FREEWORK(jnewblk->jn_dep);
3573		freework->fw_jnewblk = NULL;
3574		WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork,
3575		    &jsegdep->jd_list);
3576		break;
3577	default:
3578		panic("handle_written_jnewblk: Unknown type %d.",
3579		    jnewblk->jn_dep->wk_type);
3580	}
3581	jnewblk->jn_dep = NULL;
3582	free_jnewblk(jnewblk);
3583}
3584
3585/*
3586 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3587 * an in-flight allocation that has not yet been committed.  Divorce us
3588 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3589 * to the worklist.
3590 */
3591static void
3592cancel_jfreefrag(jfreefrag)
3593	struct jfreefrag *jfreefrag;
3594{
3595	struct freefrag *freefrag;
3596
3597	if (jfreefrag->fr_jsegdep) {
3598		free_jsegdep(jfreefrag->fr_jsegdep);
3599		jfreefrag->fr_jsegdep = NULL;
3600	}
3601	freefrag = jfreefrag->fr_freefrag;
3602	jfreefrag->fr_freefrag = NULL;
3603	free_jfreefrag(jfreefrag);
3604	freefrag->ff_state |= DEPCOMPLETE;
3605}
3606
3607/*
3608 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3609 */
3610static void
3611free_jfreefrag(jfreefrag)
3612	struct jfreefrag *jfreefrag;
3613{
3614
3615	if (jfreefrag->fr_state & INPROGRESS)
3616		WORKLIST_REMOVE(&jfreefrag->fr_list);
3617	else if (jfreefrag->fr_state & ONWORKLIST)
3618		remove_from_journal(&jfreefrag->fr_list);
3619	if (jfreefrag->fr_freefrag != NULL)
3620		panic("free_jfreefrag:  Still attached to a freefrag.");
3621	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3622}
3623
3624/*
3625 * Called when the journal write for a jfreefrag completes.  The parent
3626 * freefrag is added to the worklist if this completes its dependencies.
3627 */
3628static void
3629handle_written_jfreefrag(jfreefrag)
3630	struct jfreefrag *jfreefrag;
3631{
3632	struct jsegdep *jsegdep;
3633	struct freefrag *freefrag;
3634
3635	/* Grab the jsegdep. */
3636	jsegdep = jfreefrag->fr_jsegdep;
3637	jfreefrag->fr_jsegdep = NULL;
3638	freefrag = jfreefrag->fr_freefrag;
3639	if (freefrag == NULL)
3640		panic("handle_written_jfreefrag: No freefrag.");
3641	freefrag->ff_state |= DEPCOMPLETE;
3642	freefrag->ff_jdep = NULL;
3643	jwork_insert(&freefrag->ff_jwork, jsegdep);
3644	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3645		add_to_worklist(&freefrag->ff_list, 0);
3646	jfreefrag->fr_freefrag = NULL;
3647	free_jfreefrag(jfreefrag);
3648}
3649
3650/*
3651 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3652 * is removed from the freeblks list of pending journal writes and the
3653 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3654 * have been reclaimed.
3655 */
3656static void
3657handle_written_jblkdep(jblkdep)
3658	struct jblkdep *jblkdep;
3659{
3660	struct freeblks *freeblks;
3661	struct jsegdep *jsegdep;
3662
3663	/* Grab the jsegdep. */
3664	jsegdep = jblkdep->jb_jsegdep;
3665	jblkdep->jb_jsegdep = NULL;
3666	freeblks = jblkdep->jb_freeblks;
3667	LIST_REMOVE(jblkdep, jb_deps);
3668	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
3669	/*
3670	 * If the freeblks is all journaled, we can add it to the worklist.
3671	 */
3672	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3673	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3674		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3675
3676	free_jblkdep(jblkdep);
3677}
3678
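/*
 * Allocate a jsegdep for a journal record.  Once the record is written
 * the jsegdep is pointed at its journal segment and attached to the
 * dependent structure so the segment is not reclaimed until that work
 * completes.
 */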
3679static struct jsegdep *
3680newjsegdep(struct worklist *wk)
3681{
3682	struct jsegdep *jsegdep;
3683
3684	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3685	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3686	jsegdep->jd_seg = NULL;
3687
3688	return (jsegdep);
3689}
3690
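/*
 * Allocate a new jmvref to record the move of the directory entry for
 * 'ino' within directory 'dp' from oldoff to newoff.
 */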
3691static struct jmvref *
3692newjmvref(dp, ino, oldoff, newoff)
3693	struct inode *dp;
3694	ino_t ino;
3695	off_t oldoff;
3696	off_t newoff;
3697{
3698	struct jmvref *jmvref;
3699
3700	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3701	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3702	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3703	jmvref->jm_parent = dp->i_number;
3704	jmvref->jm_ino = ino;
3705	jmvref->jm_oldoff = oldoff;
3706	jmvref->jm_newoff = newoff;
3707
3708	return (jmvref);
3709}
3710
3711/*
3712 * Allocate a new jremref that tracks the removal of ip from dp with the
3713 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3714 * DEPCOMPLETE as we have all the information required for the journal write
3715 * and the directory has already been removed from the buffer.  The caller
3716 * is responsible for linking the jremref into the pagedep and adding it
3717 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3718 * a DOTDOT addition so handle_workitem_remove() can properly assign
3719 * the jsegdep when we're done.
3720 */
3721static struct jremref *
3722newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3723    off_t diroff, nlink_t nlink)
3724{
3725	struct jremref *jremref;
3726
3727	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3728	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3729	jremref->jr_state = ATTACHED;
3730	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3731	   nlink, ip->i_mode);
3732	jremref->jr_dirrem = dirrem;
3733
3734	return (jremref);
3735}
3736
3737static inline void
3738newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3739    nlink_t nlink, uint16_t mode)
3740{
3741
3742	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3743	inoref->if_diroff = diroff;
3744	inoref->if_ino = ino;
3745	inoref->if_parent = parent;
3746	inoref->if_nlink = nlink;
3747	inoref->if_mode = mode;
3748}
3749
3750/*
3751 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3752 * directory offset may not be known until later.  The caller is responsible
3753 * for adding the entry to the journal when this information is available.  nlink
3754 * should be the link count prior to the addition and mode is only required
3755 * to have the correct FMT.
3756 */
3757static struct jaddref *
3758newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3759    uint16_t mode)
3760{
3761	struct jaddref *jaddref;
3762
3763	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3764	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3765	jaddref->ja_state = ATTACHED;
3766	jaddref->ja_mkdir = NULL;
3767	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3768
3769	return (jaddref);
3770}
3771
3772/*
3773 * Create a new free dependency for a freework.  The caller is responsible
3774 * for adjusting the reference count when it has the lock held.  The freedep
3775 * will track an outstanding bitmap write that must complete before the
3776 * freework can continue.
3777 */
3778static struct freedep *
3779newfreedep(struct freework *freework)
3780{
3781	struct freedep *freedep;
3782
3783	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3784	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3785	freedep->fd_freework = freework;
3786
3787	return (freedep);
3788}
3789
3790/*
3791 * Free a freedep structure once the buffer it is linked to is written.  If
3792 * this is the last reference to the freework schedule it for completion.
3793 */
3794static void
3795free_freedep(freedep)
3796	struct freedep *freedep;
3797{
3798	struct freework *freework;
3799
3800	freework = freedep->fd_freework;
3801	freework->fw_freeblks->fb_cgwait--;
3802	if (--freework->fw_ref == 0)
3803		freework_enqueue(freework);
3804	WORKITEM_FREE(freedep, D_FREEDEP);
3805}
3806
3807/*
3808 * Allocate a new freework structure that represents a level in an indirect
3809 * block chain when parent is not NULL, or a top level block when parent is
3810 * NULL.  Top level freework structures are allocated without lk held and
3811 * before the freeblks is visible outside of softdep_setup_freeblocks().
3812 */
3813static struct freework *
3814newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
3815	struct ufsmount *ump;
3816	struct freeblks *freeblks;
3817	struct freework *parent;
3818	ufs_lbn_t lbn;
3819	ufs2_daddr_t nb;
3820	int frags;
3821	int off;
3822	int journal;
3823{
3824	struct freework *freework;
3825
3826	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3827	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3828	freework->fw_state = ATTACHED;
3829	freework->fw_jnewblk = NULL;
3830	freework->fw_freeblks = freeblks;
3831	freework->fw_parent = parent;
3832	freework->fw_lbn = lbn;
3833	freework->fw_blkno = nb;
3834	freework->fw_frags = frags;
3835	freework->fw_indir = NULL;
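	/*
	 * On SUJ mounts an indirect level (a negative lbn below the ext
	 * attr range) is created holding NINDIR + 1 references; all other
	 * blocks need no reference count.
	 */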
3836	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
3837		? 0 : NINDIR(ump->um_fs) + 1;
3838	freework->fw_start = freework->fw_off = off;
3839	if (journal)
3840		newjfreeblk(freeblks, lbn, nb, frags);
3841	if (parent == NULL) {
3842		ACQUIRE_LOCK(&lk);
3843		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
3844		freeblks->fb_ref++;
3845		FREE_LOCK(&lk);
3846	}
3847
3848	return (freework);
3849}
3850
3851/*
3852 * Eliminate a jfreeblk for a block that does not need journaling.
3853 */
3854static void
3855cancel_jfreeblk(freeblks, blkno)
3856	struct freeblks *freeblks;
3857	ufs2_daddr_t blkno;
3858{
3859	struct jfreeblk *jfreeblk;
3860	struct jblkdep *jblkdep;
3861
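	/*
	 * Search the pending journal dependencies for a jfreeblk that
	 * matches this block number.
	 */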
3862	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
3863		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
3864			continue;
3865		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
3866		if (jfreeblk->jf_blkno == blkno)
3867			break;
3868	}
3869	if (jblkdep == NULL)
3870		return;
3871	free_jsegdep(jblkdep->jb_jsegdep);
3872	LIST_REMOVE(jblkdep, jb_deps);
3873	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
3874}
3875
3876/*
3877 * Allocate a new jfreeblk to journal a top level block pointer freed when
3878 * truncating a file.  The caller must add this to the worklist while lk is held.
3879 */
3880static struct jfreeblk *
3881newjfreeblk(freeblks, lbn, blkno, frags)
3882	struct freeblks *freeblks;
3883	ufs_lbn_t lbn;
3884	ufs2_daddr_t blkno;
3885	int frags;
3886{
3887	struct jfreeblk *jfreeblk;
3888
3889	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
3890	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
3891	    freeblks->fb_list.wk_mp);
3892	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
3893	jfreeblk->jf_dep.jb_freeblks = freeblks;
3894	jfreeblk->jf_ino = freeblks->fb_inum;
3895	jfreeblk->jf_lbn = lbn;
3896	jfreeblk->jf_blkno = blkno;
3897	jfreeblk->jf_frags = frags;
3898	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
3899
3900	return (jfreeblk);
3901}
3902
3903/*
3904 * Allocate a new jtrunc to track a partial truncation.
3905 */
3906static struct jtrunc *
3907newjtrunc(freeblks, size, extsize)
3908	struct freeblks *freeblks;
3909	off_t size;
3910	int extsize;
3911{
3912	struct jtrunc *jtrunc;
3913
3914	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
3915	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
3916	    freeblks->fb_list.wk_mp);
3917	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
3918	jtrunc->jt_dep.jb_freeblks = freeblks;
3919	jtrunc->jt_ino = freeblks->fb_inum;
3920	jtrunc->jt_size = size;
3921	jtrunc->jt_extsize = extsize;
3922	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
3923
3924	return (jtrunc);
3925}
3926
3927/*
3928 * If we're canceling a jaddref that holds a new-block bitmap dependency we
3929 * have to search for another ref to move into the bmsafemap dep.  This
3930 * might be better expressed with another structure.
3931 */
3932static void
3933move_newblock_dep(jaddref, inodedep)
3934	struct jaddref *jaddref;
3935	struct inodedep *inodedep;
3936{
3937	struct inoref *inoref;
3938	struct jaddref *jaddrefn;
3939
3940	jaddrefn = NULL;
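	/*
	 * Search the references that follow this jaddref for another
	 * jaddref that can inherit the bitmap (NEWBLOCK) dependency.
	 */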
3941	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3942	    inoref = TAILQ_NEXT(inoref, if_deps)) {
3943		if ((jaddref->ja_state & NEWBLOCK) &&
3944		    inoref->if_list.wk_type == D_JADDREF) {
3945			jaddrefn = (struct jaddref *)inoref;
3946			break;
3947		}
3948	}
3949	if (jaddrefn == NULL)
3950		return;
3951	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
3952	jaddrefn->ja_state |= jaddref->ja_state &
3953	    (ATTACHED | UNDONE | NEWBLOCK);
3954	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
3955	jaddref->ja_state |= ATTACHED;
3956	LIST_REMOVE(jaddref, ja_bmdeps);
3957	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
3958	    ja_bmdeps);
3959}
3960
3961/*
3962 * Cancel a jaddref either before it has been written or while it is being
3963 * written.  This happens when a link is removed before the add reaches
3964 * the disk.  The jaddref dependency is kept linked into the bmsafemap
3965 * and inode to prevent the link count or bitmap from reaching the disk
3966 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
3967 * required.
3968 *
3969 * Returns 1 if the canceled addref requires journaling of the remove and
3970 * 0 otherwise.
3971 */
3972static int
3973cancel_jaddref(jaddref, inodedep, wkhd)
3974	struct jaddref *jaddref;
3975	struct inodedep *inodedep;
3976	struct workhead *wkhd;
3977{
3978	struct inoref *inoref;
3979	struct jsegdep *jsegdep;
3980	int needsj;
3981
3982	KASSERT((jaddref->ja_state & COMPLETE) == 0,
3983	    ("cancel_jaddref: Canceling complete jaddref"));
3984	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
3985		needsj = 1;
3986	else
3987		needsj = 0;
3988	if (inodedep == NULL)
3989		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3990		    0, &inodedep) == 0)
3991			panic("cancel_jaddref: Lost inodedep");
3992	/*
3993	 * We must adjust the nlink of any reference operation that follows
3994	 * us so that it is consistent with the in-memory reference.  This
3995	 * ensures that inode nlink rollbacks always have the correct link.
3996	 */
3997	if (needsj == 0) {
3998		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3999		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4000			if (inoref->if_state & GOINGAWAY)
4001				break;
4002			inoref->if_nlink--;
4003		}
4004	}
4005	jsegdep = inoref_jseg(&jaddref->ja_ref);
4006	if (jaddref->ja_state & NEWBLOCK)
4007		move_newblock_dep(jaddref, inodedep);
4008	wake_worklist(&jaddref->ja_list);
4009	jaddref->ja_mkdir = NULL;
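	/*
	 * If the journal write is already in progress the jsegdep is moved
	 * to the caller's work head; otherwise the jsegdep is no longer
	 * needed and is released.
	 */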
4010	if (jaddref->ja_state & INPROGRESS) {
4011		jaddref->ja_state &= ~INPROGRESS;
4012		WORKLIST_REMOVE(&jaddref->ja_list);
4013		jwork_insert(wkhd, jsegdep);
4014	} else {
4015		free_jsegdep(jsegdep);
4016		if (jaddref->ja_state & DEPCOMPLETE)
4017			remove_from_journal(&jaddref->ja_list);
4018	}
4019	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4020	/*
4021	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4022	 * can arrange for them to be freed with the bitmap.  Otherwise we
4023	 * no longer need this addref attached to the inoreflst and it
4024	 * will incorrectly adjust nlink if we leave it.
4025	 */
4026	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4027		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4028		    if_deps);
4029		jaddref->ja_state |= COMPLETE;
4030		free_jaddref(jaddref);
4031		return (needsj);
4032	}
4033	/*
4034	 * Leave the head of the list for jsegdeps for fast merging.
4035	 */
4036	if (LIST_FIRST(wkhd) != NULL) {
4037		jaddref->ja_state |= ONWORKLIST;
4038		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4039	} else
4040		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4041
4042	return (needsj);
4043}
4044
4045/*
4046 * Attempt to free a jaddref structure when some work completes.  This
4047 * should only succeed once the entry is written and all dependencies have
4048 * been notified.
4049 */
4050static void
4051free_jaddref(jaddref)
4052	struct jaddref *jaddref;
4053{
4054
4055	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4056		return;
4057	if (jaddref->ja_ref.if_jsegdep)
4058		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4059		    jaddref, jaddref->ja_state);
4060	if (jaddref->ja_state & NEWBLOCK)
4061		LIST_REMOVE(jaddref, ja_bmdeps);
4062	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4063		panic("free_jaddref: Bad state %p(0x%X)",
4064		    jaddref, jaddref->ja_state);
4065	if (jaddref->ja_mkdir != NULL)
4066		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4067	WORKITEM_FREE(jaddref, D_JADDREF);
4068}
4069
4070/*
4071 * Free a jremref structure once it has been written or discarded.
4072 */
4073static void
4074free_jremref(jremref)
4075	struct jremref *jremref;
4076{
4077
4078	if (jremref->jr_ref.if_jsegdep)
4079		free_jsegdep(jremref->jr_ref.if_jsegdep);
4080	if (jremref->jr_state & INPROGRESS)
4081		panic("free_jremref: IO still pending");
4082	WORKITEM_FREE(jremref, D_JREMREF);
4083}
4084
4085/*
4086 * Free a jnewblk structure.
4087 */
4088static void
4089free_jnewblk(jnewblk)
4090	struct jnewblk *jnewblk;
4091{
4092
4093	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4094		return;
4095	LIST_REMOVE(jnewblk, jn_deps);
4096	if (jnewblk->jn_dep != NULL)
4097		panic("free_jnewblk: Dependency still attached.");
4098	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4099}
4100
4101/*
4102 * Cancel a jnewblk which has been made redundant by frag extension.
4103 */
4104static void
4105cancel_jnewblk(jnewblk, wkhd)
4106	struct jnewblk *jnewblk;
4107	struct workhead *wkhd;
4108{
4109	struct jsegdep *jsegdep;
4110
4111	jsegdep = jnewblk->jn_jsegdep;
4112	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4113		panic("cancel_jnewblk: Invalid state");
4114	jnewblk->jn_jsegdep  = NULL;
4115	jnewblk->jn_dep = NULL;
4116	jnewblk->jn_state |= GOINGAWAY;
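	/*
	 * If the journal write is in progress the jsegdep is moved to the
	 * caller's work head; otherwise it is released and the record is
	 * removed from the journal.
	 */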
4117	if (jnewblk->jn_state & INPROGRESS) {
4118		jnewblk->jn_state &= ~INPROGRESS;
4119		WORKLIST_REMOVE(&jnewblk->jn_list);
4120		jwork_insert(wkhd, jsegdep);
4121	} else {
4122		free_jsegdep(jsegdep);
4123		remove_from_journal(&jnewblk->jn_list);
4124	}
4125	wake_worklist(&jnewblk->jn_list);
4126	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4127}
4128
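/*
 * Free a jblkdep.  The underlying structure is either a jfreeblk or a
 * jtrunc and must be released with the matching type.
 */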
4129static void
4130free_jblkdep(jblkdep)
4131	struct jblkdep *jblkdep;
4132{
4133
4134	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4135		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4136	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4137		WORKITEM_FREE(jblkdep, D_JTRUNC);
4138	else
4139		panic("free_jblkdep: Unexpected type %s",
4140		    TYPENAME(jblkdep->jb_list.wk_type));
4141}
4142
4143/*
4144 * Free a single jseg once it is no longer referenced in memory or on
4145 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4146 * to disappear.
4147 */
4148static void
4149free_jseg(jseg, jblocks)
4150	struct jseg *jseg;
4151	struct jblocks *jblocks;
4152{
4153	struct freework *freework;
4154
4155	/*
4156	 * Free freework structures that were lingering to indicate freed
4157	 * indirect blocks that forced journal write ordering on reallocate.
4158	 */
4159	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4160		indirblk_remove(freework);
4161	if (jblocks->jb_oldestseg == jseg)
4162		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4163	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4164	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4165	KASSERT(LIST_EMPTY(&jseg->js_entries),
4166	    ("free_jseg: Freed jseg has valid entries."));
4167	WORKITEM_FREE(jseg, D_JSEG);
4168}
4169
4170/*
4171 * Free all jsegs that meet the criteria for being reclaimed and update
4172 * oldestseg.
4173 */
4174static void
4175free_jsegs(jblocks)
4176	struct jblocks *jblocks;
4177{
4178	struct jseg *jseg;
4179
4180	/*
4181	 * Free only those jsegs which have no allocated segments before them
4182	 * to preserve the journal space ordering.
4183	 */
4184	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4185		/*
4186		 * Only reclaim space when nothing depends on this journal
4187		 * set and another set has written that it is no longer
4188		 * valid.
4189		 */
4190		if (jseg->js_refs != 0) {
4191			jblocks->jb_oldestseg = jseg;
4192			return;
4193		}
4194		if (!LIST_EMPTY(&jseg->js_indirs) &&
4195		    jseg->js_seq >= jblocks->jb_oldestwrseq)
4196			break;
4197		free_jseg(jseg, jblocks);
4198	}
4199	/*
4200	 * If we exited the loop above we still must discover the
4201	 * oldest valid segment.
4202	 */
4203	if (jseg)
4204		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4205		     jseg = TAILQ_NEXT(jseg, js_next))
4206			if (jseg->js_refs != 0)
4207				break;
4208	jblocks->jb_oldestseg = jseg;
4209	/*
4210	 * The journal has no valid records but some jsegs may still be
4211	 * waiting on oldestwrseq to advance.  We force a small record
4212	 * out to permit these lingering records to be reclaimed.
4213	 */
4214	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4215		jblocks->jb_needseg = 1;
4216}
4217
4218/*
4219 * Release one reference to a jseg and free it if the count reaches 0.  This
4220 * should eventually reclaim journal space as well.
4221 */
4222static void
4223rele_jseg(jseg)
4224	struct jseg *jseg;
4225{
4226
4227	KASSERT(jseg->js_refs > 0,
4228	    ("rele_jseg: Invalid refcnt %d", jseg->js_refs));
4229	if (--jseg->js_refs != 0)
4230		return;
4231	free_jsegs(jseg->js_jblocks);
4232}
4233
4234/*
4235 * Release a jsegdep and decrement the jseg count.
4236 */
4237static void
4238free_jsegdep(jsegdep)
4239	struct jsegdep *jsegdep;
4240{
4241
4242	if (jsegdep->jd_seg)
4243		rele_jseg(jsegdep->jd_seg);
4244	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4245}
4246
4247/*
4248 * Wait for a journal item to make it to disk.  Initiate journal processing
4249 * if required.
4250 */
4251static int
4252jwait(wk, waitfor)
4253	struct worklist *wk;
4254	int waitfor;
4255{
4256
4257	/*
4258	 * Blocking journal waits cause slow synchronous behavior.  Record
4259	 * stats on the frequency of these blocking operations.
4260	 */
4261	if (waitfor == MNT_WAIT) {
4262		stat_journal_wait++;
4263		switch (wk->wk_type) {
4264		case D_JREMREF:
4265		case D_JMVREF:
4266			stat_jwait_filepage++;
4267			break;
4268		case D_JTRUNC:
4269		case D_JFREEBLK:
4270			stat_jwait_freeblks++;
4271			break;
4272		case D_JNEWBLK:
4273			stat_jwait_newblk++;
4274			break;
4275		case D_JADDREF:
4276			stat_jwait_inode++;
4277			break;
4278		default:
4279			break;
4280		}
4281	}
4282	/*
4283	 * If IO has not started we process the journal.  We can't mark the
4284	 * worklist item as IOWAITING because we drop the lock while
4285	 * processing the journal and the worklist entry may be freed after
4286	 * this point.  The caller may call back in and re-issue the request.
4287	 */
4288	if ((wk->wk_state & INPROGRESS) == 0) {
4289		softdep_process_journal(wk->wk_mp, wk, waitfor);
4290		if (waitfor != MNT_WAIT)
4291			return (EBUSY);
4292		return (0);
4293	}
4294	if (waitfor != MNT_WAIT)
4295		return (EBUSY);
4296	wait_worklist(wk, "jwait");
4297	return (0);
4298}
4299
4300/*
4301 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4302 * appropriate.  This is a convenience function to reduce duplicate code
4303 * for the setup and revert functions below.
4304 */
4305static struct inodedep *
4306inodedep_lookup_ip(ip)
4307	struct inode *ip;
4308{
4309	struct inodedep *inodedep;
4310	int dflags;
4311
4312	KASSERT(ip->i_nlink >= ip->i_effnlink,
4313	    ("inodedep_lookup_ip: bad delta"));
4314	dflags = DEPALLOC;
4315	if (IS_SNAPSHOT(ip))
4316		dflags |= NODELAY;
4317	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4318	    &inodedep);
4319	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4320	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4321
4322	return (inodedep);
4323}
4324
4325/*
4326 * Called prior to creating a new inode and linking it to a directory.  The
4327 * jaddref structure must already be allocated by softdep_setup_inomapdep
4328 * and it is discovered here so we can initialize the mode and update
4329 * nlinkdelta.
4330 */
4331void
4332softdep_setup_create(dp, ip)
4333	struct inode *dp;
4334	struct inode *ip;
4335{
4336	struct inodedep *inodedep;
4337	struct jaddref *jaddref;
4338	struct vnode *dvp;
4339
4340	KASSERT(ip->i_nlink == 1,
4341	    ("softdep_setup_create: Invalid link count."));
4342	dvp = ITOV(dp);
4343	ACQUIRE_LOCK(&lk);
4344	inodedep = inodedep_lookup_ip(ip);
4345	if (DOINGSUJ(dvp)) {
4346		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4347		    inoreflst);
4348		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4349		    ("softdep_setup_create: No addref structure present."));
4350	}
4351	softdep_prelink(dvp, NULL);
4352	FREE_LOCK(&lk);
4353}
4354
4355/*
4356 * Create a jaddref structure to track the addition of a DOTDOT link when
4357 * we are reparenting an inode as part of a rename.  This jaddref will be
4358 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4359 * non-journaling softdep.
4360 */
4361void
4362softdep_setup_dotdot_link(dp, ip)
4363	struct inode *dp;
4364	struct inode *ip;
4365{
4366	struct inodedep *inodedep;
4367	struct jaddref *jaddref;
4368	struct vnode *dvp;
4369	struct vnode *vp;
4370
4371	dvp = ITOV(dp);
4372	vp = ITOV(ip);
4373	jaddref = NULL;
4374	/*
4375	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4376	 * is used as a normal link would be.
4377	 */
4378	if (DOINGSUJ(dvp))
4379		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4380		    dp->i_effnlink - 1, dp->i_mode);
4381	ACQUIRE_LOCK(&lk);
4382	inodedep = inodedep_lookup_ip(dp);
4383	if (jaddref)
4384		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4385		    if_deps);
4386	softdep_prelink(dvp, ITOV(ip));
4387	FREE_LOCK(&lk);
4388}
4389
4390/*
4391 * Create a jaddref structure to track a new link to an inode.  The directory
4392 * offset is not known until softdep_setup_directory_add or
4393 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4394 * softdep.
4395 */
4396void
4397softdep_setup_link(dp, ip)
4398	struct inode *dp;
4399	struct inode *ip;
4400{
4401	struct inodedep *inodedep;
4402	struct jaddref *jaddref;
4403	struct vnode *dvp;
4404
4405	dvp = ITOV(dp);
4406	jaddref = NULL;
4407	if (DOINGSUJ(dvp))
4408		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4409		    ip->i_mode);
4410	ACQUIRE_LOCK(&lk);
4411	inodedep = inodedep_lookup_ip(ip);
4412	if (jaddref)
4413		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4414		    if_deps);
4415	softdep_prelink(dvp, ITOV(ip));
4416	FREE_LOCK(&lk);
4417}
4418
4419/*
4420 * Called to create the jaddref structures to track . and .. references as
4421 * well as to look up and further initialize the incomplete jaddref created
4422 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4423 * nlinkdelta for non-journaling softdep.
4424 */
4425void
4426softdep_setup_mkdir(dp, ip)
4427	struct inode *dp;
4428	struct inode *ip;
4429{
4430	struct inodedep *inodedep;
4431	struct jaddref *dotdotaddref;
4432	struct jaddref *dotaddref;
4433	struct jaddref *jaddref;
4434	struct vnode *dvp;
4435
4436	dvp = ITOV(dp);
4437	dotaddref = dotdotaddref = NULL;
4438	if (DOINGSUJ(dvp)) {
4439		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4440		    ip->i_mode);
4441		dotaddref->ja_state |= MKDIR_BODY;
4442		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4443		    dp->i_effnlink - 1, dp->i_mode);
4444		dotdotaddref->ja_state |= MKDIR_PARENT;
4445	}
4446	ACQUIRE_LOCK(&lk);
4447	inodedep = inodedep_lookup_ip(ip);
4448	if (DOINGSUJ(dvp)) {
4449		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4450		    inoreflst);
4451		KASSERT(jaddref != NULL,
4452		    ("softdep_setup_mkdir: No addref structure present."));
4453		KASSERT(jaddref->ja_parent == dp->i_number,
4454		    ("softdep_setup_mkdir: bad parent %ju",
4455		    (uintmax_t)jaddref->ja_parent));
4456		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4457		    if_deps);
4458	}
4459	inodedep = inodedep_lookup_ip(dp);
4460	if (DOINGSUJ(dvp))
4461		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4462		    &dotdotaddref->ja_ref, if_deps);
4463	softdep_prelink(ITOV(dp), NULL);
4464	FREE_LOCK(&lk);
4465}
4466
4467/*
4468 * Called to track nlinkdelta of the inode and parent directories prior to
4469 * unlinking a directory.
4470 */
4471void
4472softdep_setup_rmdir(dp, ip)
4473	struct inode *dp;
4474	struct inode *ip;
4475{
4476	struct vnode *dvp;
4477
4478	dvp = ITOV(dp);
4479	ACQUIRE_LOCK(&lk);
4480	(void) inodedep_lookup_ip(ip);
4481	(void) inodedep_lookup_ip(dp);
4482	softdep_prelink(dvp, ITOV(ip));
4483	FREE_LOCK(&lk);
4484}
4485
4486/*
4487 * Called to track nlinkdelta of the inode and parent directories prior to
4488 * unlink.
4489 */
4490void
4491softdep_setup_unlink(dp, ip)
4492	struct inode *dp;
4493	struct inode *ip;
4494{
4495	struct vnode *dvp;
4496
4497	dvp = ITOV(dp);
4498	ACQUIRE_LOCK(&lk);
4499	(void) inodedep_lookup_ip(ip);
4500	(void) inodedep_lookup_ip(dp);
4501	softdep_prelink(dvp, ITOV(ip));
4502	FREE_LOCK(&lk);
4503}
4504
4505/*
4506 * Called to release the journal structures created by a failed non-directory
4507 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4508 */
4509void
4510softdep_revert_create(dp, ip)
4511	struct inode *dp;
4512	struct inode *ip;
4513{
4514	struct inodedep *inodedep;
4515	struct jaddref *jaddref;
4516	struct vnode *dvp;
4517
4518	dvp = ITOV(dp);
4519	ACQUIRE_LOCK(&lk);
4520	inodedep = inodedep_lookup_ip(ip);
4521	if (DOINGSUJ(dvp)) {
4522		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4523		    inoreflst);
4524		KASSERT(jaddref->ja_parent == dp->i_number,
4525		    ("softdep_revert_create: addref parent mismatch"));
4526		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4527	}
4528	FREE_LOCK(&lk);
4529}
4530
4531/*
4532 * Called to release the journal structures created by a failed dotdot link
4533 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4534 */
4535void
4536softdep_revert_dotdot_link(dp, ip)
4537	struct inode *dp;
4538	struct inode *ip;
4539{
4540	struct inodedep *inodedep;
4541	struct jaddref *jaddref;
4542	struct vnode *dvp;
4543
4544	dvp = ITOV(dp);
4545	ACQUIRE_LOCK(&lk);
4546	inodedep = inodedep_lookup_ip(dp);
4547	if (DOINGSUJ(dvp)) {
4548		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4549		    inoreflst);
4550		KASSERT(jaddref->ja_parent == ip->i_number,
4551		    ("softdep_revert_dotdot_link: addref parent mismatch"));
4552		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4553	}
4554	FREE_LOCK(&lk);
4555}
4556
4557/*
4558 * Called to release the journal structures created by a failed link
4559 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4560 */
4561void
4562softdep_revert_link(dp, ip)
4563	struct inode *dp;
4564	struct inode *ip;
4565{
4566	struct inodedep *inodedep;
4567	struct jaddref *jaddref;
4568	struct vnode *dvp;
4569
4570	dvp = ITOV(dp);
4571	ACQUIRE_LOCK(&lk);
4572	inodedep = inodedep_lookup_ip(ip);
4573	if (DOINGSUJ(dvp)) {
4574		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4575		    inoreflst);
4576		KASSERT(jaddref->ja_parent == dp->i_number,
4577		    ("softdep_revert_link: addref parent mismatch"));
4578		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4579	}
4580	FREE_LOCK(&lk);
4581}
4582
4583/*
4584 * Called to release the journal structures created by a failed mkdir
4585 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4586 */
4587void
4588softdep_revert_mkdir(dp, ip)
4589	struct inode *dp;
4590	struct inode *ip;
4591{
4592	struct inodedep *inodedep;
4593	struct jaddref *jaddref;
4594	struct jaddref *dotaddref;
4595	struct vnode *dvp;
4596
4597	dvp = ITOV(dp);
4598
4599	ACQUIRE_LOCK(&lk);
4600	inodedep = inodedep_lookup_ip(dp);
4601	if (DOINGSUJ(dvp)) {
4602		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4603		    inoreflst);
4604		KASSERT(jaddref->ja_parent == ip->i_number,
4605		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4606		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4607	}
4608	inodedep = inodedep_lookup_ip(ip);
4609	if (DOINGSUJ(dvp)) {
4610		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4611		    inoreflst);
4612		KASSERT(jaddref->ja_parent == dp->i_number,
4613		    ("softdep_revert_mkdir: addref parent mismatch"));
4614		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4615		    inoreflst, if_deps);
4616		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4617		KASSERT(dotaddref->ja_parent == ip->i_number,
4618		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4619		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4620	}
4621	FREE_LOCK(&lk);
4622}
4623
4624/*
4625 * Called to correct nlinkdelta after a failed rmdir.
4626 */
4627void
4628softdep_revert_rmdir(dp, ip)
4629	struct inode *dp;
4630	struct inode *ip;
4631{
4632
4633	ACQUIRE_LOCK(&lk);
4634	(void) inodedep_lookup_ip(ip);
4635	(void) inodedep_lookup_ip(dp);
4636	FREE_LOCK(&lk);
4637}
4638
4639/*
4640 * Protecting the freemaps (or bitmaps).
4641 *
4642 * To eliminate the need to execute fsck before mounting a filesystem
4643 * after a power failure, one must (conservatively) guarantee that the
4644 * on-disk copy of the bitmaps never indicate that a live inode or block is
4645 * free.  So, when a block or inode is allocated, the bitmap should be
4646 * updated (on disk) before any new pointers.  When a block or inode is
4647 * freed, the bitmap should not be updated until all pointers have been
4648 * reset.  The latter dependency is handled by the delayed de-allocation
4649 * approach described below for block and inode de-allocation.  The former
4650 * dependency is handled by calling the following procedure when a block or
4651 * inode is allocated. When an inode is allocated an "inodedep" is created
4652 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4653 * Each "inodedep" is also inserted into the hash indexing structure so
4654 * that any additional link additions can be made dependent on the inode
4655 * allocation.
4656 *
4657 * The ufs filesystem maintains a number of free block counts (e.g., per
4658 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4659 * in addition to the bitmaps.  These counts are used to improve efficiency
4660 * during allocation and therefore must be consistent with the bitmaps.
4661 * There is no convenient way to guarantee post-crash consistency of these
4662 * counts with simple update ordering, for two main reasons: (1) The counts
4663 * and bitmaps for a single cylinder group block are not in the same disk
4664 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4665 * be written and the other not.  (2) Some of the counts are located in the
4666 * superblock rather than the cylinder group block. So, we focus our soft
4667 * updates implementation on protecting the bitmaps. When mounting a
4668 * filesystem, we recompute the auxiliary counts from the bitmaps.
4669 */
4670
4671/*
4672 * Called just after updating the cylinder group block to allocate an inode.
4673 */
4674void
4675softdep_setup_inomapdep(bp, ip, newinum, mode)
4676	struct buf *bp;		/* buffer for cylgroup block with inode map */
4677	struct inode *ip;	/* inode related to allocation */
4678	ino_t newinum;		/* new inode number being allocated */
4679	int mode;
4680{
4681	struct inodedep *inodedep;
4682	struct bmsafemap *bmsafemap;
4683	struct jaddref *jaddref;
4684	struct mount *mp;
4685	struct fs *fs;
4686
4687	mp = UFSTOVFS(ip->i_ump);
4688	fs = ip->i_ump->um_fs;
4689	jaddref = NULL;
4690
4691	/*
4692	 * Allocate the journal reference add structure so that the bitmap
4693	 * can be dependent on it.
4694	 */
4695	if (MOUNTEDSUJ(mp)) {
4696		jaddref = newjaddref(ip, newinum, 0, 0, mode);
4697		jaddref->ja_state |= NEWBLOCK;
4698	}
4699
4700	/*
4701	 * Create a dependency for the newly allocated inode.
4702	 * Panic if it already exists as something is seriously wrong.
4703	 * Otherwise add it to the dependency list for the buffer holding
4704	 * the cylinder group map from which it was allocated.
4705	 *
4706	 * We have to preallocate a bmsafemap entry in case it is needed
4707	 * in bmsafemap_lookup since once we allocate the inodedep, we
4708	 * have to finish initializing it before we can FREE_LOCK().
4709	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
4710	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
4711	 * creating the inodedep as it can be freed during the time
4712	 * that we FREE_LOCK() while allocating the inodedep. We must
4713	 * call workitem_alloc() before entering the locked section as
4714	 * it also acquires the lock and we must avoid trying to do so
4715	 * recursively.
4716	 */
4717	bmsafemap = malloc(sizeof(struct bmsafemap),
4718	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4719	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4720	ACQUIRE_LOCK(&lk);
4721	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
4722		panic("softdep_setup_inomapdep: dependency %p for new "
4723		    "inode already exists", inodedep);
4724	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
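	/*
	 * With journaling the jaddref carries the bitmap dependency;
	 * otherwise the inodedep itself is linked to the bmsafemap.
	 */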
4725	if (jaddref) {
4726		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4727		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4728		    if_deps);
4729	} else {
4730		inodedep->id_state |= ONDEPLIST;
4731		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4732	}
4733	inodedep->id_bmsafemap = bmsafemap;
4734	inodedep->id_state &= ~DEPCOMPLETE;
4735	FREE_LOCK(&lk);
4736}
4737
4738/*
4739 * Called just after updating the cylinder group block to
4740 * allocate block or fragment.
4741 */
4742void
4743softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4744	struct buf *bp;		/* buffer for cylgroup block with block map */
4745	struct mount *mp;	/* filesystem doing allocation */
4746	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4747	int frags;		/* Number of fragments. */
4748	int oldfrags;		/* Previous number of fragments for extend. */
4749{
4750	struct newblk *newblk;
4751	struct bmsafemap *bmsafemap;
4752	struct jnewblk *jnewblk;
4753	struct fs *fs;
4754
4755	fs = VFSTOUFS(mp)->um_fs;
4756	jnewblk = NULL;
4757	/*
4758	 * Create a dependency for the newly allocated block.
4759	 * Add it to the dependency list for the buffer holding
4760	 * the cylinder group map from which it was allocated.
4761	 */
4762	if (MOUNTEDSUJ(mp)) {
4763		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4764		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4765		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4766		jnewblk->jn_state = ATTACHED;
4767		jnewblk->jn_blkno = newblkno;
4768		jnewblk->jn_frags = frags;
4769		jnewblk->jn_oldfrags = oldfrags;
4770#ifdef SUJ_DEBUG
4771		{
4772			struct cg *cgp;
4773			uint8_t *blksfree;
4774			long bno;
4775			int i;
4776
4777			cgp = (struct cg *)bp->b_data;
4778			blksfree = cg_blksfree(cgp);
4779			bno = dtogd(fs, jnewblk->jn_blkno);
4780			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4781			    i++) {
4782				if (isset(blksfree, bno + i))
4783					panic("softdep_setup_blkmapdep: "
4784					    "free fragment %d from %d-%d "
4785					    "state 0x%X dep %p", i,
4786					    jnewblk->jn_oldfrags,
4787					    jnewblk->jn_frags,
4788					    jnewblk->jn_state,
4789					    jnewblk->jn_dep);
4790			}
4791		}
4792#endif
4793	}
4794	ACQUIRE_LOCK(&lk);
4795	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4796		panic("softdep_setup_blkmapdep: found block");
4797	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4798	    dtog(fs, newblkno), NULL);
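	/*
	 * With journaling the jnewblk carries the bitmap dependency;
	 * otherwise the newblk itself is linked to the bmsafemap.
	 */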
4799	if (jnewblk) {
4800		jnewblk->jn_dep = (struct worklist *)newblk;
4801		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4802	} else {
4803		newblk->nb_state |= ONDEPLIST;
4804		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4805	}
4806	newblk->nb_bmsafemap = bmsafemap;
4807	newblk->nb_jnewblk = jnewblk;
4808	FREE_LOCK(&lk);
4809}
4810
4811#define	BMSAFEMAP_HASH(fs, cg) \
4812      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4813
4814static int
4815bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4816	struct bmsafemap_hashhead *bmsafemaphd;
4817	struct mount *mp;
4818	int cg;
4819	struct bmsafemap **bmsafemapp;
4820{
4821	struct bmsafemap *bmsafemap;
4822
4823	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4824		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4825			break;
4826	if (bmsafemap) {
4827		*bmsafemapp = bmsafemap;
4828		return (1);
4829	}
4830	*bmsafemapp = NULL;
4831
4832	return (0);
4833}
4834
4835/*
4836 * Find the bmsafemap associated with a cylinder group buffer.
4837 * If none exists, create one. The buffer must be locked when
4838 * this routine is called and this routine must be called with
4839 * the softdep lock held. To avoid giving up the lock while
4840 * allocating a new bmsafemap, a preallocated bmsafemap may be
4841 * provided. If it is provided but not needed, it is freed.
4842 */
4843static struct bmsafemap *
4844bmsafemap_lookup(mp, bp, cg, newbmsafemap)
4845	struct mount *mp;
4846	struct buf *bp;
4847	int cg;
4848	struct bmsafemap *newbmsafemap;
4849{
4850	struct bmsafemap_hashhead *bmsafemaphd;
4851	struct bmsafemap *bmsafemap, *collision;
4852	struct worklist *wk;
4853	struct fs *fs;
4854
4855	mtx_assert(&lk, MA_OWNED);
4856	if (bp)
4857		LIST_FOREACH(wk, &bp->b_dep, wk_list)
4858			if (wk->wk_type == D_BMSAFEMAP) {
4859				if (newbmsafemap)
4860					WORKITEM_FREE(newbmsafemap,D_BMSAFEMAP);
4861				return (WK_BMSAFEMAP(wk));
4862			}
4863	fs = VFSTOUFS(mp)->um_fs;
4864	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
4865	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) {
4866		if (newbmsafemap)
4867			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
4868		return (bmsafemap);
4869	}
4870	if (newbmsafemap) {
4871		bmsafemap = newbmsafemap;
4872	} else {
4873		FREE_LOCK(&lk);
4874		bmsafemap = malloc(sizeof(struct bmsafemap),
4875			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4876		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4877		ACQUIRE_LOCK(&lk);
4878	}
4879	bmsafemap->sm_buf = bp;
4880	LIST_INIT(&bmsafemap->sm_inodedephd);
4881	LIST_INIT(&bmsafemap->sm_inodedepwr);
4882	LIST_INIT(&bmsafemap->sm_newblkhd);
4883	LIST_INIT(&bmsafemap->sm_newblkwr);
4884	LIST_INIT(&bmsafemap->sm_jaddrefhd);
4885	LIST_INIT(&bmsafemap->sm_jnewblkhd);
4886	LIST_INIT(&bmsafemap->sm_freehd);
4887	LIST_INIT(&bmsafemap->sm_freewr);
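	/*
	 * A racing thread may have created the bmsafemap while the lock
	 * was dropped for the allocation above; if so, use the existing one.
	 */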
4888	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
4889		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4890		return (collision);
4891	}
4892	bmsafemap->sm_cg = cg;
4893	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
4894	LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next);
4895	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
4896	return (bmsafemap);
4897}
4898
4899/*
4900 * Direct block allocation dependencies.
4901 *
4902 * When a new block is allocated, the corresponding disk locations must be
4903 * initialized (with zeros or new data) before the on-disk inode points to
4904 * them.  Also, the freemap from which the block was allocated must be
4905 * updated (on disk) before the inode's pointer. These two dependencies are
4906 * independent of each other and are needed for all file blocks and indirect
4907 * blocks that are pointed to directly by the inode.  Just before the
4908 * "in-core" version of the inode is updated with a newly allocated block
4909 * number, a procedure (below) is called to setup allocation dependency
4910 * structures.  These structures are removed when the corresponding
4911 * dependencies are satisfied or when the block allocation becomes obsolete
4912 * (i.e., the file is deleted, the block is de-allocated, or the block is a
4913 * fragment that gets upgraded).  All of these cases are handled in
4914 * procedures described later.
4915 *
4916 * When a file extension causes a fragment to be upgraded, either to a larger
4917 * fragment or to a full block, the on-disk location may change (if the
4918 * previous fragment could not simply be extended). In this case, the old
4919 * fragment must be de-allocated, but not until after the inode's pointer has
4920 * been updated. In most cases, this is handled by later procedures, which
4921 * will construct a "freefrag" structure to be added to the workitem queue
4922 * when the inode update is complete (or obsolete).  The main exception to
4923 * this is when an allocation occurs while a pending allocation dependency
4924 * (for the same block pointer) remains.  This case is handled in the main
4925 * allocation dependency setup procedure by immediately freeing the
4926 * unreferenced fragments.
4927 */
4928void
4929softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4930	struct inode *ip;	/* inode to which block is being added */
4931	ufs_lbn_t off;		/* block pointer within inode */
4932	ufs2_daddr_t newblkno;	/* disk block number being added */
4933	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
4934	long newsize;		/* size of new block */
4935	long oldsize;		/* size of old block */
4936	struct buf *bp;		/* bp for allocated block */
4937{
4938	struct allocdirect *adp, *oldadp;
4939	struct allocdirectlst *adphead;
4940	struct freefrag *freefrag;
4941	struct inodedep *inodedep;
4942	struct pagedep *pagedep;
4943	struct jnewblk *jnewblk;
4944	struct newblk *newblk;
4945	struct mount *mp;
4946	ufs_lbn_t lbn;
4947
4948	lbn = bp->b_lblkno;
4949	mp = UFSTOVFS(ip->i_ump);
4950	if (oldblkno && oldblkno != newblkno)
4951		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4952	else
4953		freefrag = NULL;
4954
4955	ACQUIRE_LOCK(&lk);
4956	if (off >= NDADDR) {
4957		if (lbn > 0)
4958			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4959			    lbn, off);
4960		/* allocating an indirect block */
4961		if (oldblkno != 0)
4962			panic("softdep_setup_allocdirect: non-zero indir");
4963	} else {
4964		if (off != lbn)
4965			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4966			    lbn, off);
4967		/*
4968		 * Allocating a direct block.
4969		 *
4970		 * If we are allocating a directory block, then we must
4971		 * allocate an associated pagedep to track additions and
4972		 * deletions.
4973		 */
4974		if ((ip->i_mode & IFMT) == IFDIR)
4975			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
4976			    &pagedep);
4977	}
4978	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4979		panic("softdep_setup_allocdirect: lost block");
4980	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4981	    ("softdep_setup_allocdirect: newblk already initialized"));
4982	/*
4983	 * Convert the newblk to an allocdirect.
4984	 */
4985	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4986	adp = (struct allocdirect *)newblk;
4987	newblk->nb_freefrag = freefrag;
4988	adp->ad_offset = off;
4989	adp->ad_oldblkno = oldblkno;
4990	adp->ad_newsize = newsize;
4991	adp->ad_oldsize = oldsize;
4992
4993	/*
4994	 * Finish initializing the journal.
4995	 */
4996	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4997		jnewblk->jn_ino = ip->i_number;
4998		jnewblk->jn_lbn = lbn;
4999		add_to_journal(&jnewblk->jn_list);
5000	}
5001	if (freefrag && freefrag->ff_jdep != NULL &&
5002	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5003		add_to_journal(freefrag->ff_jdep);
5004	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5005	adp->ad_inodedep = inodedep;
5006
5007	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5008	/*
5009	 * The list of allocdirects must be kept in sorted and ascending
5010	 * order so that the rollback routines can quickly determine the
5011	 * first uncommitted block (the size of the file stored on disk
5012	 * ends at the end of the lowest committed fragment, or if there
5013	 * are no fragments, at the end of the highest committed block).
5014	 * Since files generally grow, the typical case is that the new
5015	 * block is to be added at the end of the list. We speed this
5016	 * special case by checking against the last allocdirect in the
5017	 * list before laboriously traversing the list looking for the
5018	 * insertion point.
5019	 */
5020	adphead = &inodedep->id_newinoupdt;
5021	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5022	if (oldadp == NULL || oldadp->ad_offset <= off) {
5023		/* insert at end of list */
5024		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5025		if (oldadp != NULL && oldadp->ad_offset == off)
5026			allocdirect_merge(adphead, adp, oldadp);
5027		FREE_LOCK(&lk);
5028		return;
5029	}
5030	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5031		if (oldadp->ad_offset >= off)
5032			break;
5033	}
5034	if (oldadp == NULL)
5035		panic("softdep_setup_allocdirect: lost entry");
5036	/* insert in middle of list */
5037	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5038	if (oldadp->ad_offset == off)
5039		allocdirect_merge(adphead, adp, oldadp);
5040
5041	FREE_LOCK(&lk);
5042}
5043
5044/*
5045 * Merge a newer and older journal record to be stored either in a
5046 * newblock or freefrag.  This handles aggregating journal records for
5047 * fragment allocation into a second record as well as replacing a
5048 * journal free with an aborted journal allocation.  A segment for the
5049 * oldest record will be placed on wkhd if it has been written.  If not
5050 * the segment for the newer record will suffice.
5051 */
5052static struct worklist *
5053jnewblk_merge(new, old, wkhd)
5054	struct worklist *new;
5055	struct worklist *old;
5056	struct workhead *wkhd;
5057{
5058	struct jnewblk *njnewblk;
5059	struct jnewblk *jnewblk;
5060
5061	/* Handle NULLs to simplify callers. */
5062	if (new == NULL)
5063		return (old);
5064	if (old == NULL)
5065		return (new);
5066	/* Replace a jfreefrag with a jnewblk. */
5067	if (new->wk_type == D_JFREEFRAG) {
5068		cancel_jfreefrag(WK_JFREEFRAG(new));
5069		return (old);
5070	}
5071	/*
5072	 * Handle merging of two jnewblk records that describe
5073	 * different sets of fragments in the same block.
5074	 */
5075	jnewblk = WK_JNEWBLK(old);
5076	njnewblk = WK_JNEWBLK(new);
5077	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5078		panic("jnewblk_merge: Merging disparate blocks.");
5079	/*
5080	 * The record may be rolled back in the cg.
5081	 */
5082	if (jnewblk->jn_state & UNDONE) {
5083		jnewblk->jn_state &= ~UNDONE;
5084		njnewblk->jn_state |= UNDONE;
5085		njnewblk->jn_state &= ~ATTACHED;
5086	}
5087	/*
5088	 * We modify the newer addref and free the older so that if neither
5089	 * has been written the most up-to-date copy will be on disk.  If
5090	 * both have been written but rolled back we only temporarily need
5091	 * one of them to fix the bits when the cg write completes.
5092	 */
5093	jnewblk->jn_state |= ATTACHED | COMPLETE;
5094	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5095	cancel_jnewblk(jnewblk, wkhd);
5096	WORKLIST_REMOVE(&jnewblk->jn_list);
5097	free_jnewblk(jnewblk);
5098	return (new);
5099}
5100
5101/*
5102 * Replace an old allocdirect dependency with a newer one.
5103 * This routine must be called with splbio interrupts blocked.
5104 */
5105static void
5106allocdirect_merge(adphead, newadp, oldadp)
5107	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5108	struct allocdirect *newadp;	/* allocdirect being added */
5109	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5110{
5111	struct worklist *wk;
5112	struct freefrag *freefrag;
5113
5114	freefrag = NULL;
5115	mtx_assert(&lk, MA_OWNED);
5116	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5117	    newadp->ad_oldsize != oldadp->ad_newsize ||
5118	    newadp->ad_offset >= NDADDR)
5119		panic("%s %jd != new %jd || old size %ld != new %ld",
5120		    "allocdirect_merge: old blkno",
5121		    (intmax_t)newadp->ad_oldblkno,
5122		    (intmax_t)oldadp->ad_newblkno,
5123		    newadp->ad_oldsize, oldadp->ad_newsize);
5124	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5125	newadp->ad_oldsize = oldadp->ad_oldsize;
5126	/*
5127	 * If the old dependency had a fragment to free or had never
5128	 * previously had a block allocated, then the new dependency
5129	 * can immediately post its freefrag and adopt the old freefrag.
5130	 * This action is done by swapping the freefrag dependencies.
5131	 * The new dependency gains the old one's freefrag, and the
5132	 * old one gets the new one and then immediately puts it on
5133	 * the worklist when it is freed by free_newblk. It is
5134	 * not possible to do this swap when the old dependency had a
5135	 * non-zero size but no previous fragment to free. This condition
5136	 * arises when the new block is an extension of the old block.
5137	 * Here, the first part of the fragment allocated to the new
5138	 * dependency is part of the block currently claimed on disk by
5139	 * the old dependency, so cannot legitimately be freed until the
5140	 * conditions for the new dependency are fulfilled.
5141	 */
5142	freefrag = newadp->ad_freefrag;
5143	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5144		newadp->ad_freefrag = oldadp->ad_freefrag;
5145		oldadp->ad_freefrag = freefrag;
5146	}
5147	/*
5148	 * If we are tracking a new directory-block allocation,
5149	 * move it from the old allocdirect to the new allocdirect.
5150	 */
5151	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5152		WORKLIST_REMOVE(wk);
5153		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5154			panic("allocdirect_merge: extra newdirblk");
5155		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5156	}
5157	TAILQ_REMOVE(adphead, oldadp, ad_next);
5158	/*
5159	 * We need to move any journal dependencies over to the freefrag
5160	 * that releases this block if it exists.  Otherwise we are
5161	 * extending an existing block and we'll wait until that is
5162	 * complete to release the journal space and extend the
5163	 * new journal to cover this old space as well.
5164	 */
5165	if (freefrag == NULL) {
5166		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5167			panic("allocdirect_merge: %jd != %jd",
5168			    oldadp->ad_newblkno, newadp->ad_newblkno);
5169		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5170		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5171		    &oldadp->ad_block.nb_jnewblk->jn_list,
5172		    &newadp->ad_block.nb_jwork);
5173		oldadp->ad_block.nb_jnewblk = NULL;
5174		cancel_newblk(&oldadp->ad_block, NULL,
5175		    &newadp->ad_block.nb_jwork);
5176	} else {
5177		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5178		    &freefrag->ff_list, &freefrag->ff_jwork);
5179		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5180		    &freefrag->ff_jwork);
5181	}
5182	free_newblk(&oldadp->ad_block);
5183}
5184
5185/*
5186 * Allocate a jfreefrag structure to journal a single block free.
5187 */
5188static struct jfreefrag *
5189newjfreefrag(freefrag, ip, blkno, size, lbn)
5190	struct freefrag *freefrag;
5191	struct inode *ip;
5192	ufs2_daddr_t blkno;
5193	long size;
5194	ufs_lbn_t lbn;
5195{
5196	struct jfreefrag *jfreefrag;
5197	struct fs *fs;
5198
5199	fs = ip->i_fs;
5200	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5201	    M_SOFTDEP_FLAGS);
5202	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5203	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5204	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5205	jfreefrag->fr_ino = ip->i_number;
5206	jfreefrag->fr_lbn = lbn;
5207	jfreefrag->fr_blkno = blkno;
5208	jfreefrag->fr_frags = numfrags(fs, size);
5209	jfreefrag->fr_freefrag = freefrag;
5210
5211	return (jfreefrag);
5212}
5213
5214/*
5215 * Allocate a new freefrag structure.
5216 */
5217static struct freefrag *
5218newfreefrag(ip, blkno, size, lbn)
5219	struct inode *ip;
5220	ufs2_daddr_t blkno;
5221	long size;
5222	ufs_lbn_t lbn;
5223{
5224	struct freefrag *freefrag;
5225	struct fs *fs;
5226
5227	fs = ip->i_fs;
5228	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5229		panic("newfreefrag: frag size");
5230	freefrag = malloc(sizeof(struct freefrag),
5231	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5232	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5233	freefrag->ff_state = ATTACHED;
5234	LIST_INIT(&freefrag->ff_jwork);
5235	freefrag->ff_inum = ip->i_number;
5236	freefrag->ff_vtype = ITOV(ip)->v_type;
5237	freefrag->ff_blkno = blkno;
5238	freefrag->ff_fragsize = size;
5239
5240	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5241		freefrag->ff_jdep = (struct worklist *)
5242		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5243	} else {
5244		freefrag->ff_state |= DEPCOMPLETE;
5245		freefrag->ff_jdep = NULL;
5246	}
5247
5248	return (freefrag);
5249}
5250
5251/*
5252 * This workitem de-allocates fragments that were replaced during
5253 * file block allocation.
5254 */
5255static void
5256handle_workitem_freefrag(freefrag)
5257	struct freefrag *freefrag;
5258{
5259	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5260	struct workhead wkhd;
5261
5262	/*
5263	 * It would be illegal to add new completion items to the
5264	 * freefrag after it was scheduled to be done, so it must be
5265	 * safe to modify the list head here.
5266	 */
5267	LIST_INIT(&wkhd);
5268	ACQUIRE_LOCK(&lk);
5269	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5270	/*
5271	 * If the journal has not been written we must cancel it here.
5272	 */
5273	if (freefrag->ff_jdep) {
5274		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5275			panic("handle_workitem_freefrag: Unexpected type %d\n",
5276			    freefrag->ff_jdep->wk_type);
5277		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5278	}
5279	FREE_LOCK(&lk);
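	/*
	 * Release the block, passing along any journal work that must
	 * complete with the cylinder group update.
	 */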
5280	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5281	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5282	ACQUIRE_LOCK(&lk);
5283	WORKITEM_FREE(freefrag, D_FREEFRAG);
5284	FREE_LOCK(&lk);
5285}
5286
5287/*
5288 * Set up a dependency structure for an external attributes data block.
5289 * This routine follows much of the structure of softdep_setup_allocdirect.
5290 * See the description of softdep_setup_allocdirect above for details.
5291 */
5292void
5293softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5294	struct inode *ip;
5295	ufs_lbn_t off;
5296	ufs2_daddr_t newblkno;
5297	ufs2_daddr_t oldblkno;
5298	long newsize;
5299	long oldsize;
5300	struct buf *bp;
5301{
5302	struct allocdirect *adp, *oldadp;
5303	struct allocdirectlst *adphead;
5304	struct freefrag *freefrag;
5305	struct inodedep *inodedep;
5306	struct jnewblk *jnewblk;
5307	struct newblk *newblk;
5308	struct mount *mp;
5309	ufs_lbn_t lbn;
5310
5311	if (off >= NXADDR)
5312		panic("softdep_setup_allocext: lbn %lld > NXADDR",
5313		    (long long)off);
5314
5315	lbn = bp->b_lblkno;
5316	mp = UFSTOVFS(ip->i_ump);
5317	if (oldblkno && oldblkno != newblkno)
5318		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5319	else
5320		freefrag = NULL;
5321
5322	ACQUIRE_LOCK(&lk);
5323	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5324		panic("softdep_setup_allocext: lost block");
5325	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5326	    ("softdep_setup_allocext: newblk already initialized"));
5327	/*
5328	 * Convert the newblk to an allocdirect.
5329	 */
5330	newblk->nb_list.wk_type = D_ALLOCDIRECT;
5331	adp = (struct allocdirect *)newblk;
5332	newblk->nb_freefrag = freefrag;
5333	adp->ad_offset = off;
5334	adp->ad_oldblkno = oldblkno;
5335	adp->ad_newsize = newsize;
5336	adp->ad_oldsize = oldsize;
5337	adp->ad_state |=  EXTDATA;
5338
5339	/*
5340	 * Finish initializing the journal.
5341	 */
5342	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5343		jnewblk->jn_ino = ip->i_number;
5344		jnewblk->jn_lbn = lbn;
5345		add_to_journal(&jnewblk->jn_list);
5346	}
5347	if (freefrag && freefrag->ff_jdep != NULL &&
5348	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5349		add_to_journal(freefrag->ff_jdep);
5350	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5351	adp->ad_inodedep = inodedep;
5352
5353	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5354	/*
5355	 * The list of allocdirects must be kept in sorted and ascending
5356	 * order so that the rollback routines can quickly determine the
5357	 * first uncommitted block (the size of the file stored on disk
5358	 * ends at the end of the lowest committed fragment, or if there
5359	 * are no fragments, at the end of the highest committed block).
5360	 * Since files generally grow, the typical case is that the new
5361	 * block is to be added at the end of the list. We speed this
5362	 * special case by checking against the last allocdirect in the
5363	 * list before laboriously traversing the list looking for the
5364	 * insertion point.
5365	 */
5366	adphead = &inodedep->id_newextupdt;
5367	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5368	if (oldadp == NULL || oldadp->ad_offset <= off) {
5369		/* insert at end of list */
5370		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5371		if (oldadp != NULL && oldadp->ad_offset == off)
5372			allocdirect_merge(adphead, adp, oldadp);
5373		FREE_LOCK(&lk);
5374		return;
5375	}
5376	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5377		if (oldadp->ad_offset >= off)
5378			break;
5379	}
5380	if (oldadp == NULL)
5381		panic("softdep_setup_allocext: lost entry");
5382	/* insert in middle of list */
5383	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5384	if (oldadp->ad_offset == off)
5385		allocdirect_merge(adphead, adp, oldadp);
5386	FREE_LOCK(&lk);
5387}
5388
5389/*
5390 * Indirect block allocation dependencies.
5391 *
5392 * The same dependencies that exist for a direct block also exist when
5393 * a new block is allocated and pointed to by an entry in a block of
5394 * indirect pointers. The undo/redo states described above are also
5395 * used here. Because an indirect block contains many pointers that
5396 * may have dependencies, a second copy of the entire in-memory indirect
5397 * block is kept. The buffer cache copy is always completely up-to-date.
5398 * The second copy, which is used only as a source for disk writes,
5399 * contains only the safe pointers (i.e., those that have no remaining
5400 * update dependencies). The second copy is freed when all pointers
5401 * are safe. The cache is not allowed to replace indirect blocks with
5402 * pending update dependencies. If a buffer containing an indirect
5403 * block with dependencies is written, these routines will mark it
5404 * dirty again. It can only be successfully written once all the
5405 * dependencies are removed. The ffs_fsync routine in conjunction with
5406 * softdep_sync_metadata work together to get all the dependencies
5407 * removed so that a file can be successfully written to disk. Three
5408 * procedures are used when setting up indirect block pointer
5409 * dependencies. The division is necessary because of the organization
5410 * of the "balloc" routine and because of the distinction between file
5411 * pages and file metadata blocks.
5412 */
5413
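/*
 * A conceptual sketch of the two-copy scheme described above, for
 * illustration only: the real rollback is performed by the buffer
 * write paths elsewhere in this file, and the fragment below merely
 * mirrors the indirdep fields used in this section.
 */
#if 0	/* illustrative sketch, never compiled */
	/*
	 * At write initiation: stash the up-to-date pointers and send
	 * only the safe copy (ir_savebp) to disk.
	 */
	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
	bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount);
	/*
	 * At write completion: restore the up-to-date pointers and
	 * redirty the buffer while dependencies remain.
	 */
	bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
	bdirty(bp);
#endif
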
5414/*
5415 * Allocate a new allocindir structure.
5416 */
5417static struct allocindir *
5418newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5419	struct inode *ip;	/* inode for file being extended */
5420	int ptrno;		/* offset of pointer in indirect block */
5421	ufs2_daddr_t newblkno;	/* disk block number being added */
5422	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5423	ufs_lbn_t lbn;
5424{
5425	struct newblk *newblk;
5426	struct allocindir *aip;
5427	struct freefrag *freefrag;
5428	struct jnewblk *jnewblk;
5429
5430	if (oldblkno)
5431		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5432	else
5433		freefrag = NULL;
5434	ACQUIRE_LOCK(&lk);
5435	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5436		panic("new_allocindir: lost block");
5437	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5438	    ("newallocindir: newblk already initialized"));
5439	newblk->nb_list.wk_type = D_ALLOCINDIR;
5440	newblk->nb_freefrag = freefrag;
5441	aip = (struct allocindir *)newblk;
5442	aip->ai_offset = ptrno;
5443	aip->ai_oldblkno = oldblkno;
5444	aip->ai_lbn = lbn;
5445	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5446		jnewblk->jn_ino = ip->i_number;
5447		jnewblk->jn_lbn = lbn;
5448		add_to_journal(&jnewblk->jn_list);
5449	}
5450	if (freefrag && freefrag->ff_jdep != NULL &&
5451	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5452		add_to_journal(freefrag->ff_jdep);
5453	return (aip);
5454}
5455
5456/*
5457 * Called just before setting an indirect block pointer
5458 * to a newly allocated file page.
5459 */
5460void
5461softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5462	struct inode *ip;	/* inode for file being extended */
5463	ufs_lbn_t lbn;		/* allocated block number within file */
5464	struct buf *bp;		/* buffer with indirect blk referencing page */
5465	int ptrno;		/* offset of pointer in indirect block */
5466	ufs2_daddr_t newblkno;	/* disk block number being added */
5467	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5468	struct buf *nbp;	/* buffer holding allocated page */
5469{
5470	struct inodedep *inodedep;
5471	struct freefrag *freefrag;
5472	struct allocindir *aip;
5473	struct pagedep *pagedep;
5474	struct mount *mp;
5475	int dflags;
5476
5477	if (lbn != nbp->b_lblkno)
5478		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5479		    lbn, nbp->b_lblkno);
5480	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5481	mp = UFSTOVFS(ip->i_ump);
5482	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5483	dflags = DEPALLOC;
5484	if (IS_SNAPSHOT(ip))
5485		dflags |= NODELAY;
5486	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5487	/*
5488	 * If we are allocating a directory page, then we must
5489	 * allocate an associated pagedep to track additions and
5490	 * deletions.
5491	 */
5492	if ((ip->i_mode & IFMT) == IFDIR)
5493		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5494	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5495	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5496	FREE_LOCK(&lk);
5497	if (freefrag)
5498		handle_workitem_freefrag(freefrag);
5499}
5500
5501/*
5502 * Called just before setting an indirect block pointer to a
5503 * newly allocated indirect block.
5504 */
5505void
5506softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5507	struct buf *nbp;	/* newly allocated indirect block */
5508	struct inode *ip;	/* inode for file being extended */
5509	struct buf *bp;		/* indirect block referencing allocated block */
5510	int ptrno;		/* offset of pointer in indirect block */
5511	ufs2_daddr_t newblkno;	/* disk block number being added */
5512{
5513	struct inodedep *inodedep;
5514	struct allocindir *aip;
5515	ufs_lbn_t lbn;
5516	int dflags;
5517
5518	lbn = nbp->b_lblkno;
5519	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5520	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5521	dflags = DEPALLOC;
5522	if (IS_SNAPSHOT(ip))
5523		dflags |= NODELAY;
5524	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5525	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5526	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5527		panic("softdep_setup_allocindir_meta: Block already existed");
5528	FREE_LOCK(&lk);
5529}
5530
5531static void
5532indirdep_complete(indirdep)
5533	struct indirdep *indirdep;
5534{
5535	struct allocindir *aip;
5536
5537	LIST_REMOVE(indirdep, ir_next);
5538	indirdep->ir_state |= DEPCOMPLETE;
5539
5540	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5541		LIST_REMOVE(aip, ai_next);
5542		free_newblk(&aip->ai_block);
5543	}
5544	/*
5545	 * If this indirdep is not attached to a buf it was simply waiting
5546	 * on completion to clear completehd.  free_indirdep() asserts
5547	 * that nothing is dangling.
5548	 */
5549	if ((indirdep->ir_state & ONWORKLIST) == 0)
5550		free_indirdep(indirdep);
5551}
5552
5553static struct indirdep *
5554indirdep_lookup(mp, ip, bp)
5555	struct mount *mp;
5556	struct inode *ip;
5557	struct buf *bp;
5558{
5559	struct indirdep *indirdep, *newindirdep;
5560	struct newblk *newblk;
5561	struct worklist *wk;
5562	struct fs *fs;
5563	ufs2_daddr_t blkno;
5564
5565	mtx_assert(&lk, MA_OWNED);
5566	indirdep = NULL;
5567	newindirdep = NULL;
5568	fs = ip->i_fs;
5569	for (;;) {
5570		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5571			if (wk->wk_type != D_INDIRDEP)
5572				continue;
5573			indirdep = WK_INDIRDEP(wk);
5574			break;
5575		}
5576		/* Found on the buffer worklist, no new structure to free. */
5577		if (indirdep != NULL && newindirdep == NULL)
5578			return (indirdep);
5579		if (indirdep != NULL && newindirdep != NULL)
5580			panic("indirdep_lookup: simultaneous create");
5581		/* None found on the buffer and a new structure is ready. */
5582		if (indirdep == NULL && newindirdep != NULL)
5583			break;
5584		/* None found and no new structure available. */
5585		FREE_LOCK(&lk);
5586		newindirdep = malloc(sizeof(struct indirdep),
5587		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5588		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5589		newindirdep->ir_state = ATTACHED;
5590		if (ip->i_ump->um_fstype == UFS1)
5591			newindirdep->ir_state |= UFS1FMT;
5592		TAILQ_INIT(&newindirdep->ir_trunc);
5593		newindirdep->ir_saveddata = NULL;
5594		LIST_INIT(&newindirdep->ir_deplisthd);
5595		LIST_INIT(&newindirdep->ir_donehd);
5596		LIST_INIT(&newindirdep->ir_writehd);
5597		LIST_INIT(&newindirdep->ir_completehd);
5598		if (bp->b_blkno == bp->b_lblkno) {
5599			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5600			    NULL, NULL);
5601			bp->b_blkno = blkno;
5602		}
5603		newindirdep->ir_freeblks = NULL;
5604		newindirdep->ir_savebp =
5605		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5606		newindirdep->ir_bp = bp;
5607		BUF_KERNPROC(newindirdep->ir_savebp);
5608		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5609		ACQUIRE_LOCK(&lk);
5610	}
5611	indirdep = newindirdep;
5612	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5613	/*
5614	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5615	 * that we don't free dependencies until the pointers are valid.
5616	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5617	 * than using the hash.
5618	 */
5619	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5620		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5621	else
5622		indirdep->ir_state |= DEPCOMPLETE;
5623	return (indirdep);
5624}
5625
5626/*
5627 * Called to finish the allocation of the "aip" allocated
5628 * by one of the two routines above.
5629 */
5630static struct freefrag *
5631setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5632	struct buf *bp;		/* in-memory copy of the indirect block */
5633	struct inode *ip;	/* inode for file being extended */
5634	struct inodedep *inodedep; /* Inodedep for ip */
5635	struct allocindir *aip;	/* allocindir allocated by the above routines */
5636	ufs_lbn_t lbn;		/* Logical block number for this block. */
5637{
5638	struct fs *fs;
5639	struct indirdep *indirdep;
5640	struct allocindir *oldaip;
5641	struct freefrag *freefrag;
5642	struct mount *mp;
5643
5644	mtx_assert(&lk, MA_OWNED);
5645	mp = UFSTOVFS(ip->i_ump);
5646	fs = ip->i_fs;
5647	if (bp->b_lblkno >= 0)
5648		panic("setup_allocindir_phase2: not indir blk");
5649	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
5650	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
5651	indirdep = indirdep_lookup(mp, ip, bp);
5652	KASSERT(indirdep->ir_savebp != NULL,
5653	    ("setup_allocindir_phase2 NULL ir_savebp"));
5654	aip->ai_indirdep = indirdep;
5655	/*
5656	 * Check for an unwritten dependency for this indirect offset.  If
5657	 * there is, merge the old dependency into the new one.  This happens
5658	 * as a result of reallocblk only.
5659	 */
5660	freefrag = NULL;
5661	if (aip->ai_oldblkno != 0) {
5662		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
5663			if (oldaip->ai_offset == aip->ai_offset) {
5664				freefrag = allocindir_merge(aip, oldaip);
5665				goto done;
5666			}
5667		}
5668		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
5669			if (oldaip->ai_offset == aip->ai_offset) {
5670				freefrag = allocindir_merge(aip, oldaip);
5671				goto done;
5672			}
5673		}
5674	}
5675done:
5676	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5677	return (freefrag);
5678}
5679
5680/*
5681 * Merge two allocindirs which refer to the same block.  Move newblock
5682 * dependencies and setup the freefrags appropriately.
5683 */
5684static struct freefrag *
5685allocindir_merge(aip, oldaip)
5686	struct allocindir *aip;
5687	struct allocindir *oldaip;
5688{
5689	struct freefrag *freefrag;
5690	struct worklist *wk;
5691
5692	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5693		panic("allocindir_merge: blkno");
5694	aip->ai_oldblkno = oldaip->ai_oldblkno;
5695	freefrag = aip->ai_freefrag;
5696	aip->ai_freefrag = oldaip->ai_freefrag;
5697	oldaip->ai_freefrag = NULL;
5698	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
5699	/*
5700	 * If we are tracking a new directory-block allocation,
5701	 * move it from the old allocindir to the new allocindir.
5702	 */
5703	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5704		WORKLIST_REMOVE(wk);
5705		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5706			panic("allocindir_merge: extra newdirblk");
5707		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
5708	}
5709	/*
5710	 * We can skip journaling for this freefrag and just complete
5711	 * any pending journal work for the allocindir that is being
5712	 * removed after the freefrag completes.
5713	 */
5714	if (freefrag->ff_jdep)
5715		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
5716	LIST_REMOVE(oldaip, ai_next);
5717	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
5718	    &freefrag->ff_list, &freefrag->ff_jwork);
5719	free_newblk(&oldaip->ai_block);
5720
5721	return (freefrag);
5722}
5723
5724static inline void
5725setup_freedirect(freeblks, ip, i, needj)
5726	struct freeblks *freeblks;
5727	struct inode *ip;
5728	int i;
5729	int needj;
5730{
5731	ufs2_daddr_t blkno;
5732	int frags;
5733
5734	blkno = DIP(ip, i_db[i]);
5735	if (blkno == 0)
5736		return;
5737	DIP_SET(ip, i_db[i], 0);
5738	frags = sblksize(ip->i_fs, ip->i_size, i);
5739	frags = numfrags(ip->i_fs, frags);
5740	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
5741}
5742
5743static inline void
5744setup_freeext(freeblks, ip, i, needj)
5745	struct freeblks *freeblks;
5746	struct inode *ip;
5747	int i;
5748	int needj;
5749{
5750	ufs2_daddr_t blkno;
5751	int frags;
5752
5753	blkno = ip->i_din2->di_extb[i];
5754	if (blkno == 0)
5755		return;
5756	ip->i_din2->di_extb[i] = 0;
5757	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
5758	frags = numfrags(ip->i_fs, frags);
5759	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
5760}
5761
5762static inline void
5763setup_freeindir(freeblks, ip, i, lbn, needj)
5764	struct freeblks *freeblks;
5765	struct inode *ip;
5766	int i;
5767	ufs_lbn_t lbn;
5768	int needj;
5769{
5770	ufs2_daddr_t blkno;
5771
5772	blkno = DIP(ip, i_ib[i]);
5773	if (blkno == 0)
5774		return;
5775	DIP_SET(ip, i_ib[i], 0);
5776	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
5777	    0, needj);
5778}
5779
5780static inline struct freeblks *
5781newfreeblks(mp, ip)
5782	struct mount *mp;
5783	struct inode *ip;
5784{
5785	struct freeblks *freeblks;
5786
5787	freeblks = malloc(sizeof(struct freeblks),
5788		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5789	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5790	LIST_INIT(&freeblks->fb_jblkdephd);
5791	LIST_INIT(&freeblks->fb_jwork);
5792	freeblks->fb_ref = 0;
5793	freeblks->fb_cgwait = 0;
5794	freeblks->fb_state = ATTACHED;
5795	freeblks->fb_uid = ip->i_uid;
5796	freeblks->fb_inum = ip->i_number;
5797	freeblks->fb_vtype = ITOV(ip)->v_type;
5798	freeblks->fb_modrev = DIP(ip, i_modrev);
5799	freeblks->fb_devvp = ip->i_devvp;
5800	freeblks->fb_chkcnt = 0;
5801	freeblks->fb_len = 0;
5802
5803	return (freeblks);
5804}
5805
5806static void
5807trunc_indirdep(indirdep, freeblks, bp, off)
5808	struct indirdep *indirdep;
5809	struct freeblks *freeblks;
5810	struct buf *bp;
5811	int off;
5812{
5813	struct allocindir *aip, *aipn;
5814
5815	/*
5816	 * The first set of allocindirs won't be in savedbp.
5817	 */
5818	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
5819		if (aip->ai_offset > off)
5820			cancel_allocindir(aip, bp, freeblks, 1);
5821	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
5822		if (aip->ai_offset > off)
5823			cancel_allocindir(aip, bp, freeblks, 1);
5824	/*
5825	 * These will exist in savedbp.
5826	 */
5827	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
5828		if (aip->ai_offset > off)
5829			cancel_allocindir(aip, NULL, freeblks, 0);
5830	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
5831		if (aip->ai_offset > off)
5832			cancel_allocindir(aip, NULL, freeblks, 0);
5833}
5834
5835/*
5836 * Follow the chain of indirects down to lastlbn creating a freework
5837 * structure for each.  This will be used to start indir_trunc() at
5838 * the right offset and create the journal records for the partial
5839 * truncation.  A second step will handle the truncated dependencies.
5840 */
5841static int
5842setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
5843	struct freeblks *freeblks;
5844	struct inode *ip;
5845	ufs_lbn_t lbn;
5846	ufs_lbn_t lastlbn;
5847	ufs2_daddr_t blkno;
5848{
5849	struct indirdep *indirdep;
5850	struct indirdep *indirn;
5851	struct freework *freework;
5852	struct newblk *newblk;
5853	struct mount *mp;
5854	struct buf *bp;
5855	uint8_t *start;
5856	uint8_t *end;
5857	ufs_lbn_t lbnadd;
5858	int level;
5859	int error;
5860	int off;
5861
5862
5863	freework = NULL;
5864	if (blkno == 0)
5865		return (0);
5866	mp = freeblks->fb_list.wk_mp;
5867	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
5868	if ((bp->b_flags & B_CACHE) == 0) {
5869		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
5870		bp->b_iocmd = BIO_READ;
5871		bp->b_flags &= ~B_INVAL;
5872		bp->b_ioflags &= ~BIO_ERROR;
5873		vfs_busy_pages(bp, 0);
5874		bp->b_iooffset = dbtob(bp->b_blkno);
5875		bstrategy(bp);
5876		curthread->td_ru.ru_inblock++;
5877		error = bufwait(bp);
5878		if (error) {
5879			brelse(bp);
5880			return (error);
5881		}
5882	}
5883	level = lbn_level(lbn);
5884	lbnadd = lbn_offset(ip->i_fs, level);
5885	/*
5886	 * Compute the offset of the last block we want to keep.  Store
5887	 * in the freework the first block we want to completely free.
5888	 */
5889	off = (lastlbn - -(lbn + level)) / lbnadd;
5890	if (off + 1 == NINDIR(ip->i_fs))
5891		goto nowork;
5892	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
5893	    0);
5894	/*
5895	 * Link the freework into the indirdep.  This will prevent any new
5896	 * allocations from proceeding until we are finished with the
5897	 * truncate and the block is written.
5898	 */
5899	ACQUIRE_LOCK(&lk);
5900	indirdep = indirdep_lookup(mp, ip, bp);
5901	if (indirdep->ir_freeblks)
5902		panic("setup_trunc_indir: indirdep already truncated.");
5903	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
5904	freework->fw_indir = indirdep;
5905	/*
5906	 * Cancel any allocindirs that will not make it to disk.
5907	 * We have to do this for all copies of the indirdep that
5908	 * live on this newblk.
5909	 */
5910	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
5911		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
5912		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
5913			trunc_indirdep(indirn, freeblks, bp, off);
5914	} else
5915		trunc_indirdep(indirdep, freeblks, bp, off);
5916	FREE_LOCK(&lk);
5917	/*
5918	 * Creation is protected by the buf lock. The saveddata is only
5919	 * needed if a full truncation follows a partial truncation but it
5920	 * is difficult to allocate in that case so we fetch it anyway.
5921	 */
5922	if (indirdep->ir_saveddata == NULL)
5923		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
5924		    M_SOFTDEP_FLAGS);
5925nowork:
5926	/* Fetch the blkno of the child and the zero start offset. */
5927	if (ip->i_ump->um_fstype == UFS1) {
5928		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
5929		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
5930	} else {
5931		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
5932		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
5933	}
5934	if (freework) {
5935		/* Zero the truncated pointers. */
5936		end = bp->b_data + bp->b_bcount;
5937		bzero(start, end - start);
5938		bdwrite(bp);
5939	} else
5940		bqrelse(bp);
5941	if (level == 0)
5942		return (0);
5943	lbn++; /* adjust level */
5944	lbn -= (off * lbnadd);
5945	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
5946}
5947
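/*
 * A worked example of the offset arithmetic in setup_trunc_indir()
 * above (values assumed for illustration: NINDIR(fs) == 2048 and
 * NDADDR == 12): the double indirect root covers data lbns 2060
 * through 2060 + 2048*2048 - 1 and, by the -lbn - level convention
 * used when it was set up, lives at lbn -2061 with each of its
 * pointers covering 2048 data blocks.  Truncating to lastlbn 10000
 * gives off = (10000 - 2060) / 2048 = 3, so pointers 0-3 are kept
 * and the freework starts freeing at offset 4.  The recursion then
 * descends to lbn -2061 + 1 - 3*2048 = -8204, the single indirect
 * whose first data block is 8204 and which contains lastlbn.
 */
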
5948/*
5949 * Complete the partial truncation of an indirect block setup by
5950 * setup_trunc_indir().  This zeros the truncated pointers in the saved
5951 * copy and writes them to disk before the freeblks is allowed to complete.
5952 */
5953static void
5954complete_trunc_indir(freework)
5955	struct freework *freework;
5956{
5957	struct freework *fwn;
5958	struct indirdep *indirdep;
5959	struct buf *bp;
5960	uintptr_t start;
5961	int count;
5962
5963	indirdep = freework->fw_indir;
5964	for (;;) {
5965		bp = indirdep->ir_bp;
5966		/* See if the block was discarded. */
5967		if (bp == NULL)
5968			break;
5969		/* Inline part of getdirtybuf().  We don't want bremfree. */
5970		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
5971			break;
5972		if (BUF_LOCK(bp,
5973		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0)
5974			BUF_UNLOCK(bp);
5975		ACQUIRE_LOCK(&lk);
5976	}
5977	mtx_assert(&lk, MA_OWNED);
5978	freework->fw_state |= DEPCOMPLETE;
5979	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
5980	/*
5981	 * Zero the pointers in the saved copy.
5982	 */
5983	if (indirdep->ir_state & UFS1FMT)
5984		start = sizeof(ufs1_daddr_t);
5985	else
5986		start = sizeof(ufs2_daddr_t);
5987	start *= freework->fw_start;
5988	count = indirdep->ir_savebp->b_bcount - start;
5989	start += (uintptr_t)indirdep->ir_savebp->b_data;
5990	bzero((char *)start, count);
5991	/*
5992	 * We need to start the next truncation in the list if it has not
5993	 * been started yet.
5994	 */
5995	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
5996	if (fwn != NULL) {
5997		if (fwn->fw_freeblks == indirdep->ir_freeblks)
5998			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
5999		if ((fwn->fw_state & ONWORKLIST) == 0)
6000			freework_enqueue(fwn);
6001	}
6002	/*
6003	 * If bp is NULL the block was fully truncated, so restore the
6004	 * saved data into the save buffer; otherwise simply free the
6005	 * saved data, as it is no longer needed.
6006	 */
6007	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6008		if (bp == NULL)
6009			bcopy(indirdep->ir_saveddata,
6010			    indirdep->ir_savebp->b_data,
6011			    indirdep->ir_savebp->b_bcount);
6012		free(indirdep->ir_saveddata, M_INDIRDEP);
6013		indirdep->ir_saveddata = NULL;
6014	}
6015	/*
6016	 * When bp is NULL there is a full truncation pending.  We
6017	 * must wait for this full truncation to be journaled before
6018	 * we can release this freework because the disk pointers will
6019	 * never be written as zero.
6020	 */
6021	if (bp == NULL)  {
6022		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6023			handle_written_freework(freework);
6024		else
6025			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6026			   &freework->fw_list);
6027	} else {
6028		/* Complete when the real copy is written. */
6029		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6030		BUF_UNLOCK(bp);
6031	}
6032}
6033
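/*
 * For illustration (assumed geometry): on a UFS2 file system with
 * 16384-byte indirect blocks, a freework with fw_start == 4 keeps
 * pointers 0-3, so start in complete_trunc_indir() above becomes
 * 4 * sizeof(ufs2_daddr_t) = 32 and the bzero() clears bytes 32
 * through 16383 of the saved copy, i.e. pointers 4 through 2047.
 */
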
6034/*
6035 * Calculate the number of blocks we are going to release where datablocks
6036 * is the current total and length is the new file size.
6037 */
6038ufs2_daddr_t
6039blkcount(fs, datablocks, length)
6040	struct fs *fs;
6041	ufs2_daddr_t datablocks;
6042	off_t length;
6043{
6044	off_t totblks, numblks;
6045
6046	totblks = 0;
6047	numblks = howmany(length, fs->fs_bsize);
6048	if (numblks <= NDADDR) {
6049		totblks = howmany(length, fs->fs_fsize);
6050		goto out;
6051	}
6052	totblks = blkstofrags(fs, numblks);
6053	numblks -= NDADDR;
6054	/*
6055	 * Count all single, then double, then triple indirects required.
6056	 * Subtracting one indirect's worth of blocks for each pass
6057	 * acknowledges one of each pointed to by the inode.
6058	 */
6059	for (;;) {
6060		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6061		numblks -= NINDIR(fs);
6062		if (numblks <= 0)
6063			break;
6064		numblks = howmany(numblks, NINDIR(fs));
6065	}
6066out:
6067	totblks = fsbtodb(fs, totblks);
6068	/*
6069	 * Handle sparse files.  We can't reclaim more blocks than the inode
6070	 * references.  We will correct it later in handle_complete_freeblks()
6071	 * when we know the real count.
6072	 */
6073	if (totblks > datablocks)
6074		return (0);
6075	return (datablocks - totblks);
6076}
6077
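/*
 * A worked example of blkcount() (assumed geometry: fs_bsize 16384,
 * fs_fsize 2048, so fs_frag == 8 and NINDIR == 2048 on UFS2): a new
 * length of 1MB gives numblks = howmany(1048576, 16384) = 64, which
 * exceeds NDADDR, so totblks starts at blkstofrags(fs, 64) = 512
 * frags.  The 52 blocks beyond the direct pointers need one single
 * indirect, adding blkstofrags(fs, howmany(52, 2048)) = 8 frags for
 * a total of 520 frags, or fsbtodb(fs, 520) = 2080 DEV_BSIZE sectors
 * to subtract from datablocks.
 */
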
6078/*
6079 * Handle freeblocks for journaled softupdate filesystems.
6080 *
6081 * Contrary to normal softupdates, we must preserve the block pointers in
6082 * indirects until their subordinates are free.  This is to avoid journaling
6083 * every block that is freed which may consume more space than the journal
6084 * itself.  The recovery program will see the free block journals at the
6085 * base of the truncated area and traverse them to reclaim space.  The
6086 * pointers in the inode may be cleared immediately after the journal
6087 * records are written because each direct and indirect pointer in the
6088 * inode is recorded in a journal.  This permits full truncation to proceed
6089 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6090 *
6091 * The algorithm is as follows:
6092 * 1) Traverse the in-memory state and create journal entries to release
6093 *    the relevant blocks and full indirect trees.
6094 * 2) Traverse the indirect block chain adding partial truncation freework
6095 *    records to indirects in the path to lastlbn.  The freework will
6096 *    prevent new allocation dependencies from being satisfied in this
6097 *    indirect until the truncation completes.
6098 * 3) Read and lock the inode block, performing an update with the new size
6099 *    and pointers.  This prevents truncated data from becoming valid on
6100 *    disk through step 4.
6101 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6102 *    eliminate journal work for those records that do not require it.
6103 * 5) Schedule the journal records to be written followed by the inode block.
6104 * 6) Allocate any necessary frags for the end of file.
6105 * 7) Zero any partially truncated blocks.
6106 *
6107 * From this truncation proceeds asynchronously using the freework and
6108 * indir_trunc machinery.  The file will not be extended again into a
6109 * partially truncated indirect block until all work is completed but
6110 * the normal dependency mechanism ensures that it is rolled back/forward
6111 * as appropriate.  Further truncation may occur without delay and is
6112 * serialized in indir_trunc().
6113 */
6114void
6115softdep_journal_freeblocks(ip, cred, length, flags)
6116	struct inode *ip;	/* The inode whose length is to be reduced */
6117	struct ucred *cred;
6118	off_t length;		/* The new length for the file */
6119	int flags;		/* IO_EXT and/or IO_NORMAL */
6120{
6121	struct freeblks *freeblks, *fbn;
6122	struct inodedep *inodedep;
6123	struct jblkdep *jblkdep;
6124	struct allocdirect *adp, *adpn;
6125	struct fs *fs;
6126	struct buf *bp;
6127	struct vnode *vp;
6128	struct mount *mp;
6129	ufs2_daddr_t extblocks, datablocks;
6130	ufs_lbn_t tmpval, lbn, lastlbn;
6131	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6132
6133	fs = ip->i_fs;
6134	mp = UFSTOVFS(ip->i_ump);
6135	vp = ITOV(ip);
6136	needj = 1;
6137	iboff = -1;
6138	allocblock = 0;
6139	extblocks = 0;
6140	datablocks = 0;
6141	frags = 0;
6142	freeblks = newfreeblks(mp, ip);
6143	ACQUIRE_LOCK(&lk);
6144	/*
6145	 * If we're truncating a removed file that will never be written
6146	 * we don't need to journal the block frees.  The canceled journals
6147	 * for the allocations will suffice.
6148	 */
6149	dflags = DEPALLOC;
6150	if (IS_SNAPSHOT(ip))
6151		dflags |= NODELAY;
6152	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6153	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6154	    length == 0)
6155		needj = 0;
6156	FREE_LOCK(&lk);
6157	/*
6158	 * Calculate the lbn that we are truncating to.  This results in -1
6159	 * if we're truncating to 0 bytes.  So it is the last lbn we want
6160	 * to keep, not the first lbn we want to truncate.
6161	 */
6162	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6163	lastoff = blkoff(fs, length);
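	/*
	 * For example (illustrative, assuming fs_bsize == 16384): a new
	 * length of 0 gives lastlbn -1 and lastoff 0, i.e. keep nothing;
	 * a length of 16385 gives lastlbn 1 and lastoff 1, i.e. keep all
	 * of block 0 and a single byte of block 1.
	 */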
6164	/*
6165	 * Compute frags we are keeping in lastlbn.  0 means all.
6166	 */
6167	if (lastlbn >= 0 && lastlbn < NDADDR) {
6168		frags = fragroundup(fs, lastoff);
6169		/* adp offset of last valid allocdirect. */
6170		iboff = lastlbn;
6171	} else if (lastlbn > 0)
6172		iboff = NDADDR;
6173	if (fs->fs_magic == FS_UFS2_MAGIC)
6174		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6175	/*
6176	 * Handle normal data blocks and indirects.  This section saves
6177	 * values used after the inode update to complete frag and indirect
6178	 * truncation.
6179	 */
6180	if ((flags & IO_NORMAL) != 0) {
6181		/*
6182		 * Handle truncation of whole direct and indirect blocks.
6183		 */
6184		for (i = iboff + 1; i < NDADDR; i++)
6185			setup_freedirect(freeblks, ip, i, needj);
6186		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6187		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6188			/* Release a whole indirect tree. */
6189			if (lbn > lastlbn) {
6190				setup_freeindir(freeblks, ip, i, -lbn -i,
6191				    needj);
6192				continue;
6193			}
6194			iboff = i + NDADDR;
6195			/*
6196			 * Traverse partially truncated indirect tree.
6197			 */
6198			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6199				setup_trunc_indir(freeblks, ip, -lbn - i,
6200				    lastlbn, DIP(ip, i_ib[i]));
6201		}
6202		/*
6203		 * Handle partial truncation to a frag boundary.
6204		 */
6205		if (frags) {
6206			ufs2_daddr_t blkno;
6207			long oldfrags;
6208
6209			oldfrags = blksize(fs, ip, lastlbn);
6210			blkno = DIP(ip, i_db[lastlbn]);
6211			if (blkno && oldfrags != frags) {
6212				oldfrags -= frags;
6213				oldfrags = numfrags(ip->i_fs, oldfrags);
6214				blkno += numfrags(ip->i_fs, frags);
6215				newfreework(ip->i_ump, freeblks, NULL, lastlbn,
6216				    blkno, oldfrags, 0, needj);
6217			} else if (blkno == 0)
6218				allocblock = 1;
6219		}
6220		/*
6221		 * Add a journal record for partial truncate if we are
6222		 * handling indirect blocks.  Non-indirects need no extra
6223		 * journaling.
6224		 */
6225		if (length != 0 && lastlbn >= NDADDR) {
6226			ip->i_flag |= IN_TRUNCATED;
6227			newjtrunc(freeblks, length, 0);
6228		}
6229		ip->i_size = length;
6230		DIP_SET(ip, i_size, ip->i_size);
6231		datablocks = DIP(ip, i_blocks) - extblocks;
6232		if (length != 0)
6233			datablocks = blkcount(ip->i_fs, datablocks, length);
6234		freeblks->fb_len = length;
6235	}
6236	if ((flags & IO_EXT) != 0) {
6237		for (i = 0; i < NXADDR; i++)
6238			setup_freeext(freeblks, ip, i, needj);
6239		ip->i_din2->di_extsize = 0;
6240		datablocks += extblocks;
6241	}
6242#ifdef QUOTA
6243	/* Reference the quotas in case the block count is wrong in the end. */
6244	quotaref(vp, freeblks->fb_quota);
6245	(void) chkdq(ip, -datablocks, NOCRED, 0);
6246#endif
6247	freeblks->fb_chkcnt = -datablocks;
6248	UFS_LOCK(ip->i_ump);
6249	fs->fs_pendingblocks += datablocks;
6250	UFS_UNLOCK(ip->i_ump);
6251	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6252	/*
6253	 * Handle truncation of incomplete alloc direct dependencies.  We
6254	 * hold the inode block locked to prevent incomplete dependencies
6255	 * from reaching the disk while we are eliminating those that
6256	 * have been truncated.  This is a partially inlined ffs_update().
6257	 */
6258	ufs_itimes(vp);
6259	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6260	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6261	    (int)fs->fs_bsize, cred, &bp);
6262	if (error) {
6263		brelse(bp);
6264		softdep_error("softdep_journal_freeblocks", error);
6265		return;
6266	}
6267	if (bp->b_bufsize == fs->fs_bsize)
6268		bp->b_flags |= B_CLUSTEROK;
6269	softdep_update_inodeblock(ip, bp, 0);
6270	if (ip->i_ump->um_fstype == UFS1)
6271		*((struct ufs1_dinode *)bp->b_data +
6272		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6273	else
6274		*((struct ufs2_dinode *)bp->b_data +
6275		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6276	ACQUIRE_LOCK(&lk);
6277	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6278	if ((inodedep->id_state & IOSTARTED) != 0)
6279		panic("softdep_journal_freeblocks: inode busy");
6280	/*
6281	 * Add the freeblks structure to the list of operations that
6282	 * must await the zero'ed inode being written to disk. If we
6283	 * still have a bitmap dependency (needj), then the inode
6284	 * has never been written to disk, so we can process the
6285	 * freeblks below once we have deleted the dependencies.
6286	 */
6287	if (needj)
6288		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6289	else
6290		freeblks->fb_state |= COMPLETE;
6291	if ((flags & IO_NORMAL) != 0) {
6292		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6293			if (adp->ad_offset > iboff)
6294				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6295				    freeblks);
6296			/*
6297			 * Truncate the allocdirect.  We could eliminate
6298			 * or modify journal records as well.
6299			 */
6300			else if (adp->ad_offset == iboff && frags)
6301				adp->ad_newsize = frags;
6302		}
6303	}
6304	if ((flags & IO_EXT) != 0)
6305		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6306			cancel_allocdirect(&inodedep->id_extupdt, adp,
6307			    freeblks);
6308	/*
6309	 * Add journal work.
6310	 */
6311	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6312		add_to_journal(&jblkdep->jb_list);
6313	FREE_LOCK(&lk);
6314	bdwrite(bp);
6315	/*
6316	 * Truncate dependency structures beyond length.
6317	 */
6318	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6319	/*
6320	 * This is only set when we need to allocate a fragment because
6321	 * none existed at the end of a frag-sized file.  It handles only
6322	 * allocating a new, zero filled block.
6323	 */
6324	if (allocblock) {
6325		ip->i_size = length - lastoff;
6326		DIP_SET(ip, i_size, ip->i_size);
6327		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6328		if (error != 0) {
6329			softdep_error("softdep_journal_freeblks", error);
6330			return;
6331		}
6332		ip->i_size = length;
6333		DIP_SET(ip, i_size, length);
6334		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6335		allocbuf(bp, frags);
6336		ffs_update(vp, 0);
6337		bawrite(bp);
6338	} else if (lastoff != 0 && vp->v_type != VDIR) {
6339		int size;
6340
6341		/*
6342		 * Zero the end of a truncated frag or block.
6343		 */
6344		size = sblksize(fs, length, lastlbn);
6345		error = bread(vp, lastlbn, size, cred, &bp);
6346		if (error) {
6347			softdep_error("softdep_journal_freeblks", error);
6348			return;
6349		}
6350		bzero((char *)bp->b_data + lastoff, size - lastoff);
6351		bawrite(bp);
6352
6353	}
6354	ACQUIRE_LOCK(&lk);
6355	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6356	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6357	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6358	/*
6359	 * We zero earlier truncations so they don't erroneously
6360	 * update i_blocks.
6361	 */
6362	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6363		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6364			fbn->fb_len = 0;
6365	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6366	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6367		freeblks->fb_state |= INPROGRESS;
6368	else
6369		freeblks = NULL;
6370	FREE_LOCK(&lk);
6371	if (freeblks)
6372		handle_workitem_freeblocks(freeblks, 0);
6373	trunc_pages(ip, length, extblocks, flags);
6374
6375}
6376
6377/*
6378 * Flush a JOP_SYNC to the journal.
6379 */
6380void
6381softdep_journal_fsync(ip)
6382	struct inode *ip;
6383{
6384	struct jfsync *jfsync;
6385
6386	if ((ip->i_flag & IN_TRUNCATED) == 0)
6387		return;
6388	ip->i_flag &= ~IN_TRUNCATED;
6389	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6390	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6391	jfsync->jfs_size = ip->i_size;
6392	jfsync->jfs_ino = ip->i_number;
6393	ACQUIRE_LOCK(&lk);
6394	add_to_journal(&jfsync->jfs_list);
6395	jwait(&jfsync->jfs_list, MNT_WAIT);
6396	FREE_LOCK(&lk);
6397}
6398
6399/*
6400 * Block de-allocation dependencies.
6401 *
6402 * When blocks are de-allocated, the on-disk pointers must be nullified before
6403 * the blocks are made available for use by other files.  (The true
6404 * requirement is that old pointers must be nullified before new on-disk
6405 * pointers are set.  We chose this slightly more stringent requirement to
6406 * reduce complexity.) Our implementation handles this dependency by updating
6407 * the inode (or indirect block) appropriately but delaying the actual block
6408 * de-allocation (i.e., freemap and free space count manipulation) until
6409 * after the updated versions reach stable storage.  After the disk is
6410 * updated, the blocks can be safely de-allocated whenever it is convenient.
6411 * This implementation handles only the common case of reducing a file's
6412 * length to zero. Other cases are handled by the conventional synchronous
6413 * write approach.
6414 *
6415 * The ffs implementation with which we worked double-checks
6416 * the state of the block pointers and file size as it reduces
6417 * a file's length.  Some of this code is replicated here in our
6418 * soft updates implementation.  The freeblks->fb_chkcnt field is
6419 * used to transfer a part of this information to the procedure
6420 * that eventually de-allocates the blocks.
6421 *
6422 * This routine should be called from the routine that shortens
6423 * a file's length, before the inode's size or block pointers
6424 * are modified. It will save the block pointer information for
6425 * later release and zero the inode so that the calling routine
6426 * can release it.
6427 */
6428void
6429softdep_setup_freeblocks(ip, length, flags)
6430	struct inode *ip;	/* The inode whose length is to be reduced */
6431	off_t length;		/* The new length for the file */
6432	int flags;		/* IO_EXT and/or IO_NORMAL */
6433{
6434	struct ufs1_dinode *dp1;
6435	struct ufs2_dinode *dp2;
6436	struct freeblks *freeblks;
6437	struct inodedep *inodedep;
6438	struct allocdirect *adp;
6439	struct buf *bp;
6440	struct fs *fs;
6441	ufs2_daddr_t extblocks, datablocks;
6442	struct mount *mp;
6443	int i, delay, error, dflags;
6444	ufs_lbn_t tmpval;
6445	ufs_lbn_t lbn;
6446
6447	fs = ip->i_fs;
6448	mp = UFSTOVFS(ip->i_ump);
6449	if (length != 0)
6450		panic("softdep_setup_freeblocks: non-zero length");
6451	freeblks = newfreeblks(mp, ip);
6452	extblocks = 0;
6453	datablocks = 0;
6454	if (fs->fs_magic == FS_UFS2_MAGIC)
6455		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6456	if ((flags & IO_NORMAL) != 0) {
6457		for (i = 0; i < NDADDR; i++)
6458			setup_freedirect(freeblks, ip, i, 0);
6459		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6460		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6461			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6462		ip->i_size = 0;
6463		DIP_SET(ip, i_size, 0);
6464		datablocks = DIP(ip, i_blocks) - extblocks;
6465	}
6466	if ((flags & IO_EXT) != 0) {
6467		for (i = 0; i < NXADDR; i++)
6468			setup_freeext(freeblks, ip, i, 0);
6469		ip->i_din2->di_extsize = 0;
6470		datablocks += extblocks;
6471	}
6472#ifdef QUOTA
6473	/* Reference the quotas in case the block count is wrong in the end. */
6474	quotaref(ITOV(ip), freeblks->fb_quota);
6475	(void) chkdq(ip, -datablocks, NOCRED, 0);
6476#endif
6477	freeblks->fb_chkcnt = -datablocks;
6478	UFS_LOCK(ip->i_ump);
6479	fs->fs_pendingblocks += datablocks;
6480	UFS_UNLOCK(ip->i_ump);
6481	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6482	/*
6483	 * Push the zero'ed inode to its disk buffer so that we are free
6484	 * to delete its dependencies below. Once the dependencies are gone
6485	 * the buffer can be safely released.
6486	 */
6487	if ((error = bread(ip->i_devvp,
6488	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6489	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6490		brelse(bp);
6491		softdep_error("softdep_setup_freeblocks", error);
6492	}
6493	if (ip->i_ump->um_fstype == UFS1) {
6494		dp1 = ((struct ufs1_dinode *)bp->b_data +
6495		    ino_to_fsbo(fs, ip->i_number));
6496		ip->i_din1->di_freelink = dp1->di_freelink;
6497		*dp1 = *ip->i_din1;
6498	} else {
6499		dp2 = ((struct ufs2_dinode *)bp->b_data +
6500		    ino_to_fsbo(fs, ip->i_number));
6501		ip->i_din2->di_freelink = dp2->di_freelink;
6502		*dp2 = *ip->i_din2;
6503	}
6504	/*
6505	 * Find and eliminate any inode dependencies.
6506	 */
6507	ACQUIRE_LOCK(&lk);
6508	dflags = DEPALLOC;
6509	if (IS_SNAPSHOT(ip))
6510		dflags |= NODELAY;
6511	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6512	if ((inodedep->id_state & IOSTARTED) != 0)
6513		panic("softdep_setup_freeblocks: inode busy");
6514	/*
6515	 * Add the freeblks structure to the list of operations that
6516	 * must await the zero'ed inode being written to disk. If we
6517	 * still have a bitmap dependency (delay == 0), then the inode
6518	 * has never been written to disk, so we can process the
6519	 * freeblks below once we have deleted the dependencies.
6520	 */
6521	delay = (inodedep->id_state & DEPCOMPLETE);
6522	if (delay)
6523		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6524	else
6525		freeblks->fb_state |= COMPLETE;
6526	/*
6527	 * Because the file length has been truncated to zero, any
6528	 * pending block allocation dependency structures associated
6529	 * with this inode are obsolete and can simply be de-allocated.
6530	 * We must first merge the two dependency lists to get rid of
6531	 * any duplicate freefrag structures, then purge the merged list.
6532	 * If we still have a bitmap dependency, then the inode has never
6533	 * been written to disk, so we can free any fragments without delay.
6534	 */
6535	if (flags & IO_NORMAL) {
6536		merge_inode_lists(&inodedep->id_newinoupdt,
6537		    &inodedep->id_inoupdt);
6538		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6539			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6540			    freeblks);
6541	}
6542	if (flags & IO_EXT) {
6543		merge_inode_lists(&inodedep->id_newextupdt,
6544		    &inodedep->id_extupdt);
6545		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6546			cancel_allocdirect(&inodedep->id_extupdt, adp,
6547			    freeblks);
6548	}
6549	FREE_LOCK(&lk);
6550	bdwrite(bp);
6551	trunc_dependencies(ip, freeblks, -1, 0, flags);
6552	ACQUIRE_LOCK(&lk);
6553	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6554		(void) free_inodedep(inodedep);
6555	freeblks->fb_state |= DEPCOMPLETE;
6556	/*
6557	 * If the inode with zeroed block pointers is now on disk
6558	 * we can start freeing blocks.
6559	 */
6560	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6561		freeblks->fb_state |= INPROGRESS;
6562	else
6563		freeblks = NULL;
6564	FREE_LOCK(&lk);
6565	if (freeblks)
6566		handle_workitem_freeblocks(freeblks, 0);
6567	trunc_pages(ip, length, extblocks, flags);
6568}
6569
6570/*
6571 * Eliminate pages from the page cache that back parts of this inode and
6572 * adjust the vnode pager's idea of our size.  This prevents stale data
6573 * from hanging around in the page cache.
6574 */
6575static void
6576trunc_pages(ip, length, extblocks, flags)
6577	struct inode *ip;
6578	off_t length;
6579	ufs2_daddr_t extblocks;
6580	int flags;
6581{
6582	struct vnode *vp;
6583	struct fs *fs;
6584	ufs_lbn_t lbn;
6585	off_t end, extend;
6586
6587	vp = ITOV(ip);
6588	fs = ip->i_fs;
6589	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6590	if ((flags & IO_EXT) != 0)
6591		vn_pages_remove(vp, extend, 0);
6592	if ((flags & IO_NORMAL) == 0)
6593		return;
6594	BO_LOCK(&vp->v_bufobj);
6595	drain_output(vp);
6596	BO_UNLOCK(&vp->v_bufobj);
6597	/*
6598	 * The vnode pager eliminates file pages; we eliminate indirects
6599	 * below.
6600	 */
6601	vnode_pager_setsize(vp, length);
6602	/*
6603	 * Calculate the end based on the last indirect we want to keep.  If
6604	 * the block extends into indirects we can just use the negative of
6605	 * its lbn.  Doubles and triples exist at lower numbers so we must
6606	 * be careful not to remove those, if they exist.  Double and triple
6607	 * indirect lbns do not overlap with others so it is not important
6608	 * to verify how many levels are required.
6609	 */
6610	lbn = lblkno(fs, length);
6611	if (lbn >= NDADDR) {
6612		/* Calculate the virtual lbn of the triple indirect. */
6613		lbn = -lbn - (NIADDR - 1);
6614		end = OFF_TO_IDX(lblktosize(fs, lbn));
6615	} else
6616		end = extend;
6617	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
6618}
6619
6620/*
6621 * See if the buf bp is in the range eliminated by truncation.
6622 */
6623static int
6624trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
6625	struct buf *bp;
6626	int *blkoffp;
6627	ufs_lbn_t lastlbn;
6628	int lastoff;
6629	int flags;
6630{
6631	ufs_lbn_t lbn;
6632
6633	*blkoffp = 0;
6634	/* Only match ext/normal blocks as appropriate. */
6635	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
6636	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
6637		return (0);
6638	/* ALTDATA is always a full truncation. */
6639	if ((bp->b_xflags & BX_ALTDATA) != 0)
6640		return (1);
6641	/* -1 is full truncation. */
6642	if (lastlbn == -1)
6643		return (1);
6644	/*
6645	 * If this is a partial truncate we only want those
6646	 * blocks and indirect blocks that cover the range
6647	 * we're after.
6648	 */
6649	lbn = bp->b_lblkno;
6650	if (lbn < 0)
6651		lbn = -(lbn + lbn_level(lbn));
6652	if (lbn < lastlbn)
6653		return (0);
6654	/* Here we only truncate lblkno if it's partial. */
6655	if (lbn == lastlbn) {
6656		if (lastoff == 0)
6657			return (0);
6658		*blkoffp = lastoff;
6659	}
6660	return (1);
6661}
6662
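/*
 * An example of the mapping above (NDADDR == 12 assumed): a level-0
 * single indirect is stored at lbn -12, so -(lbn + lbn_level(lbn))
 * yields 12, the first data lbn it covers, and the buffer is then
 * compared against lastlbn just like a regular data block.
 */
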
6663/*
6664 * Eliminate any dependencies that exist in memory beyond lblkno:off
6665 */
6666static void
6667trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
6668	struct inode *ip;
6669	struct freeblks *freeblks;
6670	ufs_lbn_t lastlbn;
6671	int lastoff;
6672	int flags;
6673{
6674	struct bufobj *bo;
6675	struct vnode *vp;
6676	struct buf *bp;
6677	struct fs *fs;
6678	int blkoff;
6679
6680	/*
6681	 * We must wait for any I/O in progress to finish so that
6682	 * all potential buffers on the dirty list will be visible.
6683	 * Once they are all there, walk the list and get rid of
6684	 * any dependencies.
6685	 */
6686	fs = ip->i_fs;
6687	vp = ITOV(ip);
6688	bo = &vp->v_bufobj;
6689	BO_LOCK(bo);
6690	drain_output(vp);
6691	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
6692		bp->b_vflags &= ~BV_SCANNED;
6693restart:
6694	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
6695		if (bp->b_vflags & BV_SCANNED)
6696			continue;
6697		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6698			bp->b_vflags |= BV_SCANNED;
6699			continue;
6700		}
6701		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
6702			goto restart;
6703		BO_UNLOCK(bo);
6704		if (deallocate_dependencies(bp, freeblks, blkoff))
6705			bqrelse(bp);
6706		else
6707			brelse(bp);
6708		BO_LOCK(bo);
6709		goto restart;
6710	}
6711	/*
6712	 * Now do the work of vtruncbuf while also matching indirect blocks.
6713	 */
6714	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
6715		bp->b_vflags &= ~BV_SCANNED;
6716cleanrestart:
6717	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
6718		if (bp->b_vflags & BV_SCANNED)
6719			continue;
6720		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6721			bp->b_vflags |= BV_SCANNED;
6722			continue;
6723		}
6724		if (BUF_LOCK(bp,
6725		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6726		    BO_MTX(bo)) == ENOLCK) {
6727			BO_LOCK(bo);
6728			goto cleanrestart;
6729		}
6730		bp->b_vflags |= BV_SCANNED;
6731		BO_LOCK(bo);
6732		bremfree(bp);
6733		BO_UNLOCK(bo);
6734		if (blkoff != 0) {
6735			allocbuf(bp, blkoff);
6736			bqrelse(bp);
6737		} else {
6738			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
6739			brelse(bp);
6740		}
6741		BO_LOCK(bo);
6742		goto cleanrestart;
6743	}
6744	drain_output(vp);
6745	BO_UNLOCK(bo);
6746}
6747
6748static int
6749cancel_pagedep(pagedep, freeblks, blkoff)
6750	struct pagedep *pagedep;
6751	struct freeblks *freeblks;
6752	int blkoff;
6753{
6754	struct jremref *jremref;
6755	struct jmvref *jmvref;
6756	struct dirrem *dirrem, *tmp;
6757	int i;
6758
6759	/*
6760	 * Copy any directory remove dependencies to the list
6761	 * to be processed after the freeblks proceeds.  If
6762	 * the directory entries never made it to disk they
6763	 * can be dumped directly onto the work list.
6764	 */
6765	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
6766		/* Skip this directory removal if it is intended to remain. */
6767		if (dirrem->dm_offset < blkoff)
6768			continue;
6769		/*
6770		 * If there are any dirrems we wait for the journal write
6771		 * to complete and then restart the buf scan as the lock
6772		 * has been dropped.
6773		 */
6774		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
6775			jwait(&jremref->jr_list, MNT_WAIT);
6776			return (ERESTART);
6777		}
6778		LIST_REMOVE(dirrem, dm_next);
6779		dirrem->dm_dirinum = pagedep->pd_ino;
6780		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
6781	}
6782	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
6783		jwait(&jmvref->jm_list, MNT_WAIT);
6784		return (ERESTART);
6785	}
6786	/*
6787	 * When we're partially truncating a pagedep we just want to flush
6788	 * journal entries and return.  There can not be any adds in the
6789	 * truncated portion of the directory and newblk must remain if
6790	 * part of the block remains.
6791	 */
6792	if (blkoff != 0) {
6793		struct diradd *dap;
6794
6795		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
6796			if (dap->da_offset > blkoff)
6797				panic("cancel_pagedep: diradd %p off %d > %d",
6798				    dap, dap->da_offset, blkoff);
6799		for (i = 0; i < DAHASHSZ; i++)
6800			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
6801				if (dap->da_offset > blkoff)
6802					panic("cancel_pagedep: diradd %p off %d > %d",
6803					    dap, dap->da_offset, blkoff);
6804		return (0);
6805	}
6806	/*
6807	 * There should be no directory add dependencies present
6808	 * as the directory could not be truncated until all
6809	 * children were removed.
6810	 */
6811	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
6812	    ("deallocate_dependencies: pendinghd != NULL"));
6813	for (i = 0; i < DAHASHSZ; i++)
6814		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
6815		    ("deallocate_dependencies: diraddhd != NULL"));
6816	if ((pagedep->pd_state & NEWBLOCK) != 0)
6817		free_newdirblk(pagedep->pd_newdirblk);
6818	if (free_pagedep(pagedep) == 0)
6819		panic("Failed to free pagedep %p", pagedep);
6820	return (0);
6821}
6822
6823/*
6824 * Reclaim any dependency structures from a buffer that is about to
6825 * be reallocated to a new vnode. The buffer must be locked, thus,
6826 * no I/O completion operations can occur while we are manipulating
6827 * its associated dependencies. The mutex is held so that other I/O's
6828 * associated with related dependencies do not occur.
6829 */
6830static int
6831deallocate_dependencies(bp, freeblks, off)
6832	struct buf *bp;
6833	struct freeblks *freeblks;
6834	int off;
6835{
6836	struct indirdep *indirdep;
6837	struct pagedep *pagedep;
6838	struct allocdirect *adp;
6839	struct worklist *wk, *wkn;
6840
6841	ACQUIRE_LOCK(&lk);
6842	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
6843		switch (wk->wk_type) {
6844		case D_INDIRDEP:
6845			indirdep = WK_INDIRDEP(wk);
6846			if (bp->b_lblkno >= 0 ||
6847			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
6848				panic("deallocate_dependencies: not indir");
6849			cancel_indirdep(indirdep, bp, freeblks);
6850			continue;
6851
6852		case D_PAGEDEP:
6853			pagedep = WK_PAGEDEP(wk);
6854			if (cancel_pagedep(pagedep, freeblks, off)) {
6855				FREE_LOCK(&lk);
6856				return (ERESTART);
6857			}
6858			continue;
6859
6860		case D_ALLOCINDIR:
6861			/*
6862			 * Simply remove the allocindir, we'll find it via
6863			 * the indirdep where we can clear pointers if
6864			 * needed.
6865			 */
6866			WORKLIST_REMOVE(wk);
6867			continue;
6868
6869		case D_FREEWORK:
6870			/*
6871			 * A truncation is waiting for the zero'd pointers
6872			 * to be written.  It can be freed when the freeblks
6873			 * is journaled.
6874			 */
6875			WORKLIST_REMOVE(wk);
6876			wk->wk_state |= ONDEPLIST;
6877			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6878			break;
6879
6880		case D_ALLOCDIRECT:
6881			adp = WK_ALLOCDIRECT(wk);
6882			if (off != 0)
6883				continue;
6884			/* FALLTHROUGH */
6885		default:
6886			panic("deallocate_dependencies: Unexpected type %s",
6887			    TYPENAME(wk->wk_type));
6888			/* NOTREACHED */
6889		}
6890	}
6891	FREE_LOCK(&lk);
6892	/*
6893	 * Don't throw away this buf; we were partially truncating and
6894	 * some deps may still remain.
6895	 */
6896	if (off) {
6897		allocbuf(bp, off);
6898		bp->b_vflags |= BV_SCANNED;
6899		return (EBUSY);
6900	}
6901	bp->b_flags |= B_INVAL | B_NOCACHE;
6902
6903	return (0);
6904}
6905
6906/*
6907 * An allocdirect is being canceled due to a truncate.  We must make sure
6908 * the journal entry is released in concert with the blkfree that releases
6909 * the storage.  Completed journal entries must not be released until the
6910 * space is no longer pointed to by the inode or in the bitmap.
6911 */
6912static void
6913cancel_allocdirect(adphead, adp, freeblks)
6914	struct allocdirectlst *adphead;
6915	struct allocdirect *adp;
6916	struct freeblks *freeblks;
6917{
6918	struct freework *freework;
6919	struct newblk *newblk;
6920	struct worklist *wk;
6921
6922	TAILQ_REMOVE(adphead, adp, ad_next);
6923	newblk = (struct newblk *)adp;
6924	freework = NULL;
6925	/*
6926	 * Find the correct freework structure.
6927	 */
6928	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
6929		if (wk->wk_type != D_FREEWORK)
6930			continue;
6931		freework = WK_FREEWORK(wk);
6932		if (freework->fw_blkno == newblk->nb_newblkno)
6933			break;
6934	}
6935	if (freework == NULL)
6936		panic("cancel_allocdirect: Freework not found");
6937	/*
6938	 * If a newblk exists at all we still have the journal entry that
6939	 * initiated the allocation so we do not need to journal the free.
6940	 */
6941	cancel_jfreeblk(freeblks, freework->fw_blkno);
6942	/*
6943	 * If the journal hasn't been written the jnewblk must be passed
6944	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
6945	 * this by linking the journal dependency into the freework to be
6946	 * freed when freework_freeblock() is called.  If the journal has
6947	 * been written we can simply reclaim the journal space when the
6948	 * freeblks work is complete.
6949	 */
6950	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
6951	    &freeblks->fb_jwork);
6952	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
6953}
6954
6955
6956/*
6957 * Cancel a new block allocation.  May be an indirect or direct block.  We
6958 * remove it from various lists and return any journal record that needs to
6959 * be resolved by the caller.
6960 *
6961 * A special consideration is made for indirects which were never pointed
6962 * at on disk and will never be found once this block is released.
6963 */
6964static struct jnewblk *
6965cancel_newblk(newblk, wk, wkhd)
6966	struct newblk *newblk;
6967	struct worklist *wk;
6968	struct workhead *wkhd;
6969{
6970	struct jnewblk *jnewblk;
6971
6972	newblk->nb_state |= GOINGAWAY;
6973	/*
6974	 * Previously we traversed the completedhd on each indirdep
6975	 * attached to this newblk to cancel them and gather journal
6976	 * work.  Since we need only the oldest journal segment and
6977	 * the lowest point on the tree will always have the oldest
6978	 * journal segment we are free to release the segments
6979	 * of any subordinates and may leave the indirdep list to
6980	 * indirdep_complete() when this newblk is freed.
6981	 */
6982	if (newblk->nb_state & ONDEPLIST) {
6983		newblk->nb_state &= ~ONDEPLIST;
6984		LIST_REMOVE(newblk, nb_deps);
6985	}
6986	if (newblk->nb_state & ONWORKLIST)
6987		WORKLIST_REMOVE(&newblk->nb_list);
6988	/*
6989	 * If the journal entry hasn't been written we save a pointer to
6990	 * the dependency that frees it until it is written or the
6991	 * superseding operation completes.
6992	 */
6993	jnewblk = newblk->nb_jnewblk;
6994	if (jnewblk != NULL && wk != NULL) {
6995		newblk->nb_jnewblk = NULL;
6996		jnewblk->jn_dep = wk;
6997	}
6998	if (!LIST_EMPTY(&newblk->nb_jwork))
6999		jwork_move(wkhd, &newblk->nb_jwork);
7000	/*
7001	 * When truncating we must free the newdirblk early to remove
7002	 * the pagedep from the hash before returning.
7003	 */
7004	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7005		free_newdirblk(WK_NEWDIRBLK(wk));
7006	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7007		panic("cancel_newblk: extra newdirblk");
7008
7009	return (jnewblk);
7010}
7011
7012/*
7013 * Schedule the freefrag associated with a newblk to be released once
7014 * the pointers are written and the previous block is no longer needed.
7015 */
7016static void
7017newblk_freefrag(newblk)
7018	struct newblk *newblk;
7019{
7020	struct freefrag *freefrag;
7021
7022	if (newblk->nb_freefrag == NULL)
7023		return;
7024	freefrag = newblk->nb_freefrag;
7025	newblk->nb_freefrag = NULL;
7026	freefrag->ff_state |= COMPLETE;
7027	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7028		add_to_worklist(&freefrag->ff_list, 0);
7029}
7030
7031/*
7032 * Free a newblk. Generate a new freefrag work request if appropriate.
7033 * This must be called after the inode pointer and any direct block pointers
7034 * are valid or fully removed via truncate or frag extension.
7035 */
7036static void
7037free_newblk(newblk)
7038	struct newblk *newblk;
7039{
7040	struct indirdep *indirdep;
7041	struct worklist *wk;
7042
7043	KASSERT(newblk->nb_jnewblk == NULL,
7044	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7045	mtx_assert(&lk, MA_OWNED);
7046	newblk_freefrag(newblk);
7047	if (newblk->nb_state & ONDEPLIST)
7048		LIST_REMOVE(newblk, nb_deps);
7049	if (newblk->nb_state & ONWORKLIST)
7050		WORKLIST_REMOVE(&newblk->nb_list);
7051	LIST_REMOVE(newblk, nb_hash);
7052	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7053		free_newdirblk(WK_NEWDIRBLK(wk));
7054	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7055		panic("free_newblk: extra newdirblk");
7056	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7057		indirdep_complete(indirdep);
7058	handle_jwork(&newblk->nb_jwork);
7059	newblk->nb_list.wk_type = D_NEWBLK;
7060	WORKITEM_FREE(newblk, D_NEWBLK);
7061}
7062
7063/*
7064 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7065 * This routine must be called with the soft dependency lock (lk) held.
7066 */
7067static void
7068free_newdirblk(newdirblk)
7069	struct newdirblk *newdirblk;
7070{
7071	struct pagedep *pagedep;
7072	struct diradd *dap;
7073	struct worklist *wk;
7074
7075	mtx_assert(&lk, MA_OWNED);
7076	WORKLIST_REMOVE(&newdirblk->db_list);
7077	/*
7078	 * If the pagedep is still linked onto the directory buffer
7079	 * dependency chain, then some of the entries on the
7080	 * pd_pendinghd list may not be committed to disk yet. In
7081	 * this case, we will simply clear the NEWBLOCK flag and
7082	 * let the pd_pendinghd list be processed when the pagedep
7083	 * is next written. If the pagedep is no longer on the buffer
7084	 * dependency chain, then all the entries on the pd_pendinghd
7085	 * list are committed to disk and we can free them here.
7086	 */
7087	pagedep = newdirblk->db_pagedep;
7088	pagedep->pd_state &= ~NEWBLOCK;
7089	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7090		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7091			free_diradd(dap, NULL);
7092		/*
7093		 * If no dependencies remain, the pagedep will be freed.
7094		 */
7095		free_pagedep(pagedep);
7096	}
7097	/* Should only ever be one item in the list. */
7098	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7099		WORKLIST_REMOVE(wk);
7100		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7101	}
7102	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7103}
7104
7105/*
7106 * Prepare an inode to be freed. The actual free operation is not
7107 * done until the zero'ed inode has been written to disk.
7108 */
7109void
7110softdep_freefile(pvp, ino, mode)
7111	struct vnode *pvp;
7112	ino_t ino;
7113	int mode;
7114{
7115	struct inode *ip = VTOI(pvp);
7116	struct inodedep *inodedep;
7117	struct freefile *freefile;
7118	struct freeblks *freeblks;
7119
7120	/*
7121	 * This sets up the inode de-allocation dependency.
7122	 */
7123	freefile = malloc(sizeof(struct freefile),
7124		M_FREEFILE, M_SOFTDEP_FLAGS);
7125	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7126	freefile->fx_mode = mode;
7127	freefile->fx_oldinum = ino;
7128	freefile->fx_devvp = ip->i_devvp;
7129	LIST_INIT(&freefile->fx_jwork);
7130	UFS_LOCK(ip->i_ump);
7131	ip->i_fs->fs_pendinginodes += 1;
7132	UFS_UNLOCK(ip->i_ump);
7133
7134	/*
7135	 * If the inodedep does not exist, then the zero'ed inode has
7136	 * been written to disk. If the allocated inode has never been
7137	 * written to disk, then the on-disk inode is zero'ed. In either
7138	 * case we can free the file immediately.  If the journal was
7139	 * canceled before being written the inode will never make it to
7140	 * disk and we must send the canceled journal entries to
7141	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7142	 * Any blocks waiting on the inode to be written can be safely
7143	 * freed here as it will never be written.
7144	 */
7145	ACQUIRE_LOCK(&lk);
7146	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7147	if (inodedep) {
7148		/*
7149		 * Clear out freeblks that no longer need to reference
7150		 * this inode.
7151		 */
7152		while ((freeblks =
7153		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7154			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7155			    fb_next);
7156			freeblks->fb_state &= ~ONDEPLIST;
7157		}
7158		/*
7159		 * Remove this inode from the unlinked list.
7160		 */
7161		if (inodedep->id_state & UNLINKED) {
7162			/*
7163			 * Save the journal work to be freed with the bitmap
7164			 * before we clear UNLINKED.  Otherwise it can be lost
7165			 * if the inode block is written.
7166			 */
7167			handle_bufwait(inodedep, &freefile->fx_jwork);
7168			clear_unlinked_inodedep(inodedep);
7169			/* Re-acquire inodedep as we've dropped lk. */
7170			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7171		}
7172	}
7173	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7174		FREE_LOCK(&lk);
7175		handle_workitem_freefile(freefile);
7176		return;
7177	}
7178	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7179		inodedep->id_state |= GOINGAWAY;
7180	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7181	FREE_LOCK(&lk);
7182	if (ip->i_number == ino)
7183		ip->i_flag |= IN_MODIFIED;
7184}
7185
7186/*
7187 * Check to see if an inode has never been written to disk. If
7188 * so free the inodedep and return success, otherwise return failure.
7189 * This routine must be called with the soft dependency lock (lk) held.
7190 *
7191 * If we still have a bitmap dependency, then the inode has never
7192 * been written to disk. Drop the dependency as it is no longer
7193 * necessary since the inode is being deallocated. We set the
7194 * ALLCOMPLETE flags since the bitmap now properly shows that the
7195 * inode is not allocated. Even if the inode is actively being
7196 * written, it has been rolled back to its zero'ed state, so we
7197 * are ensured that a zero inode is what is on the disk. For short
7198 * lived files, this change will usually result in removing all the
7199 * dependencies from the inode so that it can be freed immediately.
7200 */
7201static int
7202check_inode_unwritten(inodedep)
7203	struct inodedep *inodedep;
7204{
7205
7206	mtx_assert(&lk, MA_OWNED);
7207
7208	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7209	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7210	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7211	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7212	    !LIST_EMPTY(&inodedep->id_inowait) ||
7213	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7214	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7215	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7216	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7217	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7218	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7219	    inodedep->id_mkdiradd != NULL ||
7220	    inodedep->id_nlinkdelta != 0)
7221		return (0);
7222	/*
7223	 * Another process might be in initiate_write_inodeblock_ufs[12]
7224	 * trying to allocate memory without holding "Softdep Lock".
7225	 */
7226	if ((inodedep->id_state & IOSTARTED) != 0 &&
7227	    inodedep->id_savedino1 == NULL)
7228		return (0);
7229
7230	if (inodedep->id_state & ONDEPLIST)
7231		LIST_REMOVE(inodedep, id_deps);
7232	inodedep->id_state &= ~ONDEPLIST;
7233	inodedep->id_state |= ALLCOMPLETE;
7234	inodedep->id_bmsafemap = NULL;
7235	if (inodedep->id_state & ONWORKLIST)
7236		WORKLIST_REMOVE(&inodedep->id_list);
7237	if (inodedep->id_savedino1 != NULL) {
7238		free(inodedep->id_savedino1, M_SAVEDINO);
7239		inodedep->id_savedino1 = NULL;
7240	}
7241	if (free_inodedep(inodedep) == 0)
7242		panic("check_inode_unwritten: busy inode");
7243	return (1);
7244}
7245
7246/*
7247 * Try to free an inodedep structure. Return 1 if it could be freed.
7248 */
7249static int
7250free_inodedep(inodedep)
7251	struct inodedep *inodedep;
7252{
7253
7254	mtx_assert(&lk, MA_OWNED);
7255	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7256	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7257	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7258	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7259	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7260	    !LIST_EMPTY(&inodedep->id_inowait) ||
7261	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7262	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7263	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7264	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7265	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7266	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7267	    inodedep->id_mkdiradd != NULL ||
7268	    inodedep->id_nlinkdelta != 0 ||
7269	    inodedep->id_savedino1 != NULL)
7270		return (0);
7271	if (inodedep->id_state & ONDEPLIST)
7272		LIST_REMOVE(inodedep, id_deps);
7273	LIST_REMOVE(inodedep, id_hash);
7274	WORKITEM_FREE(inodedep, D_INODEDEP);
7275	return (1);
7276}
7277
7278/*
7279 * Free the block referenced by a freework structure.  The parent freeblks
7280 * structure is released and completed when the final cg bitmap reaches
7281 * the disk.  This routine may be freeing a jnewblk which never made it to
7282 * disk in which case we do not have to wait as the operation is undone
7283 * in memory immediately.
7284 */
7285static void
7286freework_freeblock(freework)
7287	struct freework *freework;
7288{
7289	struct freeblks *freeblks;
7290	struct jnewblk *jnewblk;
7291	struct ufsmount *ump;
7292	struct workhead wkhd;
7293	struct fs *fs;
7294	int bsize;
7295	int needj;
7296
7297	mtx_assert(&lk, MA_OWNED);
7298	/*
7299	 * Handle partial truncate separately.
7300	 */
7301	if (freework->fw_indir) {
7302		complete_trunc_indir(freework);
7303		return;
7304	}
7305	freeblks = freework->fw_freeblks;
7306	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7307	fs = ump->um_fs;
7308	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7309	bsize = lfragtosize(fs, freework->fw_frags);
7310	LIST_INIT(&wkhd);
7311	/*
7312	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7313	 * on the indirblk hashtable and prevents premature freeing.
7314	 */
7315	freework->fw_state |= DEPCOMPLETE;
7316	/*
7317	 * SUJ needs to wait for the segment referencing freed indirect
7318	 * blocks to expire so that we know the checker will not confuse
7319	 * a re-allocated indirect block with its old contents.
7320	 */
7321	if (needj && freework->fw_lbn <= -NDADDR)
7322		indirblk_insert(freework);
7323	/*
7324	 * If we are canceling an existing jnewblk pass it to the free
7325	 * routine, otherwise pass the freeblk which will ultimately
7326	 * release the freeblks.  If we're not journaling, we can just
7327	 * free the freeblks immediately.
7328	 */
7329	jnewblk = freework->fw_jnewblk;
7330	if (jnewblk != NULL) {
7331		cancel_jnewblk(jnewblk, &wkhd);
7332		needj = 0;
7333	} else if (needj) {
7334		freework->fw_state |= DELAYEDFREE;
7335		freeblks->fb_cgwait++;
7336		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7337	}
7338	FREE_LOCK(&lk);
7339	freeblks_free(ump, freeblks, btodb(bsize));
7340	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7341	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7342	ACQUIRE_LOCK(&lk);
7343	/*
7344	 * The jnewblk will be discarded and the bits in the map never
7345	 * made it to disk.  We can immediately free the freeblk.
7346	 */
7347	if (needj == 0)
7348		handle_written_freework(freework);
7349}
7350
7351/*
7352 * We enqueue freework items that need processing back on the freeblks and
7353 * add the freeblks to the worklist.  This makes it easier to find all work
7354 * required to flush a truncation in process_truncates().
7355 */
7356static void
7357freework_enqueue(freework)
7358	struct freework *freework;
7359{
7360	struct freeblks *freeblks;
7361
7362	freeblks = freework->fw_freeblks;
7363	if ((freework->fw_state & INPROGRESS) == 0)
7364		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7365	if ((freeblks->fb_state &
7366	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7367	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7368		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7369}
7370
7371/*
7372 * Start, continue, or finish the process of freeing an indirect block tree.
7373 * The free operation may be paused at any point with fw_off containing the
7374 * offset to restart from.  This enables us to implement some flow control
7375 * for large truncates which may fan out and generate a huge number of
7376 * dependencies.
7377 */
7378static void
7379handle_workitem_indirblk(freework)
7380	struct freework *freework;
7381{
7382	struct freeblks *freeblks;
7383	struct ufsmount *ump;
7384	struct fs *fs;
7385
7386	freeblks = freework->fw_freeblks;
7387	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7388	fs = ump->um_fs;
7389	if (freework->fw_state & DEPCOMPLETE) {
7390		handle_written_freework(freework);
7391		return;
7392	}
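	/*
	 * All pointers in this indirect have been processed; release the
	 * indirect block itself.
	 */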
7393	if (freework->fw_off == NINDIR(fs)) {
7394		freework_freeblock(freework);
7395		return;
7396	}
7397	freework->fw_state |= INPROGRESS;
7398	FREE_LOCK(&lk);
7399	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7400	    freework->fw_lbn);
7401	ACQUIRE_LOCK(&lk);
7402}
7403
7404/*
7405 * Called when a freework structure attached to a cg buf is written.  The
7406 * ref on either the parent or the freeblks structure is released and
7407 * the freeblks is added back to the worklist if there is more work to do.
7408 */
7409static void
7410handle_written_freework(freework)
7411	struct freework *freework;
7412{
7413	struct freeblks *freeblks;
7414	struct freework *parent;
7415
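	/*
	 * Cache the freeblks and parent pointers; the freework itself may
	 * be freed below once it reaches ALLCOMPLETE.
	 */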
7416	freeblks = freework->fw_freeblks;
7417	parent = freework->fw_parent;
7418	if (freework->fw_state & DELAYEDFREE)
7419		freeblks->fb_cgwait--;
7420	freework->fw_state |= COMPLETE;
7421	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7422		WORKITEM_FREE(freework, D_FREEWORK);
7423	if (parent) {
7424		if (--parent->fw_ref == 0)
7425			freework_enqueue(parent);
7426		return;
7427	}
7428	if (--freeblks->fb_ref != 0)
7429		return;
7430	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7431	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7432		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7433}
7434
7435/*
7436 * This workitem routine performs the block de-allocation.
7437 * The workitem is added to the pending list after the updated
7438 * inode block has been written to disk.  As mentioned above,
7439 * checks regarding the number of blocks de-allocated (compared
7440 * to the number of blocks allocated for the file) are also
7441 * performed in this function.
7442 */
7443static int
7444handle_workitem_freeblocks(freeblks, flags)
7445	struct freeblks *freeblks;
7446	int flags;
7447{
7448	struct freework *freework;
7449	struct newblk *newblk;
7450	struct allocindir *aip;
7451	struct ufsmount *ump;
7452	struct worklist *wk;
7453
7454	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7455	    ("handle_workitem_freeblocks: Journal entries not written."));
7456	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7457	ACQUIRE_LOCK(&lk);
7458	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7459		WORKLIST_REMOVE(wk);
7460		switch (wk->wk_type) {
7461		case D_DIRREM:
7462			wk->wk_state |= COMPLETE;
7463			add_to_worklist(wk, 0);
7464			continue;
7465
7466		case D_ALLOCDIRECT:
7467			free_newblk(WK_NEWBLK(wk));
7468			continue;
7469
7470		case D_ALLOCINDIR:
7471			aip = WK_ALLOCINDIR(wk);
7472			freework = NULL;
7473			if (aip->ai_state & DELAYEDFREE) {
7474				FREE_LOCK(&lk);
7475				freework = newfreework(ump, freeblks, NULL,
7476				    aip->ai_lbn, aip->ai_newblkno,
7477				    ump->um_fs->fs_frag, 0, 0);
7478				ACQUIRE_LOCK(&lk);
7479			}
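			/*
			 * Hand any unwritten journal record off to the
			 * freework so it is canceled when the block is
			 * released.
			 */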
7480			newblk = WK_NEWBLK(wk);
7481			if (newblk->nb_jnewblk) {
7482				freework->fw_jnewblk = newblk->nb_jnewblk;
7483				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7484				newblk->nb_jnewblk = NULL;
7485			}
7486			free_newblk(newblk);
7487			continue;
7488
7489		case D_FREEWORK:
7490			freework = WK_FREEWORK(wk);
7491			if (freework->fw_lbn <= -NDADDR)
7492				handle_workitem_indirblk(freework);
7493			else
7494				freework_freeblock(freework);
7495			continue;
7496		default:
7497			panic("handle_workitem_freeblocks: Unknown type %s",
7498			    TYPENAME(wk->wk_type));
7499		}
7500	}
7501	if (freeblks->fb_ref != 0) {
7502		freeblks->fb_state &= ~INPROGRESS;
7503		wake_worklist(&freeblks->fb_list);
7504		freeblks = NULL;
7505	}
7506	FREE_LOCK(&lk);
7507	if (freeblks)
7508		return handle_complete_freeblocks(freeblks, flags);
7509	return (0);
7510}
7511
7512/*
7513 * Handle completion of block free via truncate.  This allows fs_pending
7514 * to track the actual free block count more closely than if we only updated
7515 * it at the end.  We must be careful to handle cases where the block count
7516 * on free was incorrect.
7517 */
7518static void
7519freeblks_free(ump, freeblks, blocks)
7520	struct ufsmount *ump;
7521	struct freeblks *freeblks;
7522	int blocks;
7523{
7524	struct fs *fs;
7525	ufs2_daddr_t remain;
7526
7527	UFS_LOCK(ump);
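	/*
	 * fb_chkcnt counts up toward zero from minus the expected number
	 * of blocks to free; only blocks within that expectation are
	 * deducted from fs_pendingblocks here.
	 */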
7528	remain = -freeblks->fb_chkcnt;
7529	freeblks->fb_chkcnt += blocks;
7530	if (remain > 0) {
7531		if (remain < blocks)
7532			blocks = remain;
7533		fs = ump->um_fs;
7534		fs->fs_pendingblocks -= blocks;
7535	}
7536	UFS_UNLOCK(ump);
7537}
7538
7539/*
7540 * Once all of the freework workitems are complete we can retire the
7541 * freeblocks dependency and any journal work awaiting completion.  This
7542 * can not be called until all other dependencies are stable on disk.
7543 */
7544static int
7545handle_complete_freeblocks(freeblks, flags)
7546	struct freeblks *freeblks;
7547	int flags;
7548{
7549	struct inodedep *inodedep;
7550	struct inode *ip;
7551	struct vnode *vp;
7552	struct fs *fs;
7553	struct ufsmount *ump;
7554	ufs2_daddr_t spare;
7555
7556	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7557	fs = ump->um_fs;
7558	flags = LK_EXCLUSIVE | flags;
7559	spare = freeblks->fb_chkcnt;
7560
7561	/*
7562	 * If we did not release the expected number of blocks we may have
7563	 * to adjust the inode block count here.  Only do so if it wasn't
7564	 * a truncation to zero and the modrev still matches.
7565	 */
7566	if (spare && freeblks->fb_len != 0) {
7567		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7568		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7569			return (EBUSY);
7570		ip = VTOI(vp);
7571		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7572			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7573			ip->i_flag |= IN_CHANGE;
7574			/*
7575			 * We must wait so this happens before the
7576			 * journal is reclaimed.
7577			 */
7578			ffs_update(vp, 1);
7579		}
7580		vput(vp);
7581	}
7582	if (spare < 0) {
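	/*
	 * A negative spare means fewer blocks were freed than expected;
	 * drop the leftover count from fs_pendingblocks.
	 */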
7583		UFS_LOCK(ump);
7584		fs->fs_pendingblocks += spare;
7585		UFS_UNLOCK(ump);
7586	}
7587#ifdef QUOTA
7588	/* Correct the quota for any discrepancy in the freed block count. */
7589	if (spare)
7590		quotaadj(freeblks->fb_quota, ump, -spare);
7591	quotarele(freeblks->fb_quota);
7592#endif
7593	ACQUIRE_LOCK(&lk);
7594	if (freeblks->fb_state & ONDEPLIST) {
7595		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7596		    0, &inodedep);
7597		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
7598		freeblks->fb_state &= ~ONDEPLIST;
7599		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
7600			free_inodedep(inodedep);
7601	}
7602	/*
7603	 * All of the freeblock deps must be complete prior to this call
7604	 * so it's now safe to complete earlier outstanding journal entries.
7605	 */
7606	handle_jwork(&freeblks->fb_jwork);
7607	WORKITEM_FREE(freeblks, D_FREEBLKS);
7608	FREE_LOCK(&lk);
7609	return (0);
7610}
7611
7612/*
7613 * Release blocks associated with the freeblks and stored in the indirect
7614 * block dbn. If this indirect is above the lowest level, recursive calls
7615 * to indir_trunc are used to release the nested indirect blocks that it
7616 * references.
7617 *
7618 * This handles partial and complete truncation of blocks.  Partial is noted
7619 * with goingaway == 0.  In this case the freework is completed after the
7620 * zero'd indirects are written to disk.  For full truncation the freework
7621 * is completed after the block is freed.
7622 */
7623static void
7624indir_trunc(freework, dbn, lbn)
7625	struct freework *freework;
7626	ufs2_daddr_t dbn;
7627	ufs_lbn_t lbn;
7628{
7629	struct freework *nfreework;
7630	struct workhead wkhd;
7631	struct freeblks *freeblks;
7632	struct buf *bp;
7633	struct fs *fs;
7634	struct indirdep *indirdep;
7635	struct ufsmount *ump;
7636	ufs1_daddr_t *bap1 = 0;
7637	ufs2_daddr_t nb, nnb, *bap2 = 0;
7638	ufs_lbn_t lbnadd, nlbn;
7639	int i, nblocks, ufs1fmt;
7640	int freedblocks;
7641	int goingaway;
7642	int freedeps;
7643	int needj;
7644	int level;
7645	int cnt;
7646
7647	freeblks = freework->fw_freeblks;
7648	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7649	fs = ump->um_fs;
7650	/*
7651	 * Get buffer of block pointers to be freed.  There are three cases:
7652	 *
7653	 * 1) Partial truncate caches the indirdep pointer in the freework
7654	 *    which gives us a path back to the saved bp that holds the
7655	 *    pointers we want to clear.  When this completes the zeroed
7656	 *    pointers are written to the real copy.
7657	 * 2) The indirect is being completely truncated, cancel_indirdep()
7658	 *    eliminated the real copy and placed the indirdep on the saved
7659	 *    copy.  The indirdep and buf are discarded when this completes.
7660	 * 3) The indirect was not in memory, we read a copy off of the disk
7661	 *    using the devvp and drop and invalidate the buffer when we're
7662	 *    done.
7663	 */
7664	goingaway = 1;
7665	indirdep = NULL;
7666	if (freework->fw_indir != NULL) {
7667		goingaway = 0;
7668		indirdep = freework->fw_indir;
7669		bp = indirdep->ir_savebp;
7670		if (bp == NULL || bp->b_blkno != dbn)
7671			panic("indir_trunc: Bad saved buf %p blkno %jd",
7672			    bp, (intmax_t)dbn);
7673	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
7674		/*
7675		 * The lock prevents the buf dep list from changing and
7676		 * indirects on devvp should only ever have one dependency.
7677		 */
7678		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
7679		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
7680			panic("indir_trunc: Bad indirdep %p from buf %p",
7681			    indirdep, bp);
7682	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
7683	    NOCRED, &bp) != 0) {
7684		brelse(bp);
7685		return;
7686	}
7687	ACQUIRE_LOCK(&lk);
7688	/* Protects against a race with complete_trunc_indir(). */
7689	freework->fw_state &= ~INPROGRESS;
7690	/*
7691	 * If we have an indirdep we need to enforce the truncation order
7692	 * and discard it when it is complete.
7693	 */
7694	if (indirdep) {
7695		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
7696		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
7697			/*
7698			 * Add the complete truncate to the list on the
7699			 * indirdep to enforce in-order processing.
7700			 */
7701			if (freework->fw_indir == NULL)
7702				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
7703				    freework, fw_next);
7704			FREE_LOCK(&lk);
7705			return;
7706		}
7707		/*
7708		 * If we're goingaway, free the indirdep.  Otherwise it will
7709		 * linger until the write completes.
7710		 */
7711		if (goingaway) {
7712			free_indirdep(indirdep);
7713			ump->um_numindirdeps -= 1;
7714		}
7715	}
7716	FREE_LOCK(&lk);
7717	/* Initialize pointers depending on block size. */
7718	if (ump->um_fstype == UFS1) {
7719		bap1 = (ufs1_daddr_t *)bp->b_data;
7720		nb = bap1[freework->fw_off];
7721		ufs1fmt = 1;
7722	} else {
7723		bap2 = (ufs2_daddr_t *)bp->b_data;
7724		nb = bap2[freework->fw_off];
7725		ufs1fmt = 0;
7726	}
7727	level = lbn_level(lbn);
7728	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
7729	lbnadd = lbn_offset(fs, level);
7730	nblocks = btodb(fs->fs_bsize);
7731	nfreework = freework;
7732	freedeps = 0;
7733	cnt = 0;
7734	/*
7735	 * Reclaim blocks.  Traverses into nested indirect levels and, when
7736	 * journaling, arranges for the current level to be freed only after
7737	 * its subordinates are free.
7738	 */
7739	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
7740		if (i != NINDIR(fs) - 1) {
7741			if (ufs1fmt)
7742				nnb = bap1[i+1];
7743			else
7744				nnb = bap2[i+1];
7745		} else
7746			nnb = 0;
7747		if (nb == 0)
7748			continue;
7749		cnt++;
7750		if (level != 0) {
7751			nlbn = (lbn + 1) - (i * lbnadd);
7752			if (needj != 0) {
7753				nfreework = newfreework(ump, freeblks, freework,
7754				    nlbn, nb, fs->fs_frag, 0, 0);
7755				freedeps++;
7756			}
7757			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
7758		} else {
7759			struct freedep *freedep;
7760
7761			/*
7762			 * Attempt to aggregate freedep dependencies for
7763			 * all blocks being released to the same CG.
7764			 */
7765			LIST_INIT(&wkhd);
7766			if (needj != 0 &&
7767			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
7768				freedep = newfreedep(freework);
7769				WORKLIST_INSERT_UNLOCKED(&wkhd,
7770				    &freedep->fd_list);
7771				freedeps++;
7772			}
7773			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
7774			    fs->fs_bsize, freeblks->fb_inum,
7775			    freeblks->fb_vtype, &wkhd);
7776		}
7777	}
7778	if (goingaway) {
7779		bp->b_flags |= B_INVAL | B_NOCACHE;
7780		brelse(bp);
7781	}
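	/*
	 * Only the lowest indirect level frees file data here.  The
	 * indirect block itself is counted only when we are not
	 * journaling, since it is freed immediately below.
	 */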
7782	freedblocks = 0;
7783	if (level == 0)
7784		freedblocks = (nblocks * cnt);
7785	if (needj == 0)
7786		freedblocks += nblocks;
7787	freeblks_free(ump, freeblks, freedblocks);
7788	/*
7789	 * If we are journaling set up the ref counts and offset so this
7790	 * indirect can be completed when its children are free.
7791	 */
7792	if (needj) {
7793		ACQUIRE_LOCK(&lk);
7794		freework->fw_off = i;
7795		freework->fw_ref += freedeps;
7796		freework->fw_ref -= NINDIR(fs) + 1;
7797		if (level == 0)
7798			freeblks->fb_cgwait += freedeps;
7799		if (freework->fw_ref == 0)
7800			freework_freeblock(freework);
7801		FREE_LOCK(&lk);
7802		return;
7803	}
7804	/*
7805	 * If we're not journaling we can free the indirect now.
7806	 */
7807	dbn = dbtofsb(fs, dbn);
7808	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
7809	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
7810	/* Non SUJ softdep does single-threaded truncations. */
7811	if (freework->fw_blkno == dbn) {
7812		freework->fw_state |= ALLCOMPLETE;
7813		ACQUIRE_LOCK(&lk);
7814		handle_written_freework(freework);
7815		FREE_LOCK(&lk);
7816	}
7817	return;
7818}
7819
7820/*
7821 * Cancel an allocindir when it is removed via truncation.  When bp is not
7822 * NULL the indirect never appeared on disk and is scheduled to be freed
7823 * independently of the indir so we can more easily track journal work.
7824 */
7825static void
7826cancel_allocindir(aip, bp, freeblks, trunc)
7827	struct allocindir *aip;
7828	struct buf *bp;
7829	struct freeblks *freeblks;
7830	int trunc;
7831{
7832	struct indirdep *indirdep;
7833	struct freefrag *freefrag;
7834	struct newblk *newblk;
7835
7836	newblk = (struct newblk *)aip;
7837	LIST_REMOVE(aip, ai_next);
7838	/*
7839	 * We must eliminate the pointer in bp if it must be freed on its
7840	 * own due to partial truncate or pending journal work.
7841	 */
7842	if (bp && (trunc || newblk->nb_jnewblk)) {
7843		/*
7844		 * Clear the pointer and mark the aip to be freed
7845		 * directly if it never existed on disk.
7846		 */
7847		aip->ai_state |= DELAYEDFREE;
7848		indirdep = aip->ai_indirdep;
7849		if (indirdep->ir_state & UFS1FMT)
7850			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
7851		else
7852			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
7853	}
7854	/*
7855	 * When truncating, the previous pointer will be freed via savedbp.
7856	 * Eliminate the freefrag, which would otherwise cause a duplicate free.
7857	 */
7858	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
7859		newblk->nb_freefrag = NULL;
7860		if (freefrag->ff_jdep)
7861			cancel_jfreefrag(
7862			    WK_JFREEFRAG(freefrag->ff_jdep));
7863		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
7864		WORKITEM_FREE(freefrag, D_FREEFRAG);
7865	}
7866	/*
7867	 * If the journal hasn't been written the jnewblk must be passed
7868	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7869	 * this by leaving the journal dependency on the newblk to be freed
7870	 * when a freework is created in handle_workitem_freeblocks().
7871	 */
7872	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
7873	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7874}
7875
7876/*
7877 * Create the mkdir dependencies for . and .. in a new directory.  Link them
7878 * in to a newdirblk so any subsequent additions are tracked properly.  The
7879 * caller is responsible for adding the mkdir1 dependency to the journal
7880 * and updating id_mkdiradd.  This function returns with lk held.
7881 */
7882static struct mkdir *
7883setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
7884	struct diradd *dap;
7885	ino_t newinum;
7886	ino_t dinum;
7887	struct buf *newdirbp;
7888	struct mkdir **mkdirp;
7889{
7890	struct newblk *newblk;
7891	struct pagedep *pagedep;
7892	struct inodedep *inodedep;
7893	struct newdirblk *newdirblk = 0;
7894	struct mkdir *mkdir1, *mkdir2;
7895	struct worklist *wk;
7896	struct jaddref *jaddref;
7897	struct mount *mp;
7898
7899	mp = dap->da_list.wk_mp;
7900	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
7901	    M_SOFTDEP_FLAGS);
7902	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
7903	LIST_INIT(&newdirblk->db_mkdir);
7904	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
7905	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
7906	mkdir1->md_state = ATTACHED | MKDIR_BODY;
7907	mkdir1->md_diradd = dap;
7908	mkdir1->md_jaddref = NULL;
7909	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
7910	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
7911	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
7912	mkdir2->md_diradd = dap;
7913	mkdir2->md_jaddref = NULL;
7914	if (MOUNTEDSUJ(mp) == 0) {
7915		mkdir1->md_state |= DEPCOMPLETE;
7916		mkdir2->md_state |= DEPCOMPLETE;
7917	}
7918	/*
7919	 * Dependency on "." and ".." being written to disk.
7920	 */
7921	mkdir1->md_buf = newdirbp;
7922	ACQUIRE_LOCK(&lk);
7923	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
7924	/*
7925	 * We must link the pagedep, allocdirect, and newdirblk for
7926	 * the initial file page so the pointer to the new directory
7927	 * is not written until the directory contents are live and
7928	 * any subsequent additions are not marked live until the
7929	 * block is reachable via the inode.
7930	 */
7931	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
7932		panic("setup_newdir: lost pagedep");
7933	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
7934		if (wk->wk_type == D_ALLOCDIRECT)
7935			break;
7936	if (wk == NULL)
7937		panic("setup_newdir: lost allocdirect");
7938	if (pagedep->pd_state & NEWBLOCK)
7939		panic("setup_newdir: NEWBLOCK already set");
7940	newblk = WK_NEWBLK(wk);
7941	pagedep->pd_state |= NEWBLOCK;
7942	pagedep->pd_newdirblk = newdirblk;
7943	newdirblk->db_pagedep = pagedep;
7944	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
7945	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
7946	/*
7947	 * Look up the inodedep for the parent directory so that we
7948	 * can link mkdir2 into the pending dotdot jaddref or
7949	 * the inode write if there is none.  If the inode is
7950	 * ALLCOMPLETE and no jaddref is present all dependencies have
7951	 * been satisfied and mkdir2 can be freed.
7952	 */
7953	inodedep_lookup(mp, dinum, 0, &inodedep);
7954	if (MOUNTEDSUJ(mp)) {
7955		if (inodedep == NULL)
7956			panic("setup_newdir: Lost parent.");
7957		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
7958		    inoreflst);
7959		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
7960		    (jaddref->ja_state & MKDIR_PARENT),
7961		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
7962		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
7963		mkdir2->md_jaddref = jaddref;
7964		jaddref->ja_mkdir = mkdir2;
7965	} else if (inodedep == NULL ||
7966	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
7967		dap->da_state &= ~MKDIR_PARENT;
7968		WORKITEM_FREE(mkdir2, D_MKDIR);
7969	} else {
7970		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
7971		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
7972	}
7973	*mkdirp = mkdir2;
7974
7975	return (mkdir1);
7976}
7977
7978/*
7979 * Directory entry addition dependencies.
7980 *
7981 * When adding a new directory entry, the inode (with its incremented link
7982 * count) must be written to disk before the directory entry's pointer to it.
7983 * Also, if the inode is newly allocated, the corresponding freemap must be
7984 * updated (on disk) before the directory entry's pointer. These requirements
7985 * are met via undo/redo on the directory entry's pointer, which consists
7986 * simply of the inode number.
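 * (The undo rolls the entry's on-disk inode number back to zero, or to the
 * previous inode number for a changed entry, until the new inode is stable;
 * the redo then restores it.)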
7987 *
7988 * As directory entries are added and deleted, the free space within a
7989 * directory block can become fragmented.  The ufs filesystem will compact
7990 * a fragmented directory block to make space for a new entry. When this
7991 * occurs, the offsets of previously added entries change. Any "diradd"
7992 * dependency structures corresponding to these entries must be updated with
7993 * the new offsets.
7994 */
7995
7996/*
7997 * This routine is called after the in-memory inode's link
7998 * count has been incremented, but before the directory entry's
7999 * pointer to the inode has been set.
8000 */
8001int
8002softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8003	struct buf *bp;		/* buffer containing directory block */
8004	struct inode *dp;	/* inode for directory */
8005	off_t diroffset;	/* offset of new entry in directory */
8006	ino_t newinum;		/* inode referenced by new directory entry */
8007	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8008	int isnewblk;		/* entry is in a newly allocated block */
8009{
8010	int offset;		/* offset of new entry within directory block */
8011	ufs_lbn_t lbn;		/* block in directory containing new entry */
8012	struct fs *fs;
8013	struct diradd *dap;
8014	struct newblk *newblk;
8015	struct pagedep *pagedep;
8016	struct inodedep *inodedep;
8017	struct newdirblk *newdirblk = 0;
8018	struct mkdir *mkdir1, *mkdir2;
8019	struct jaddref *jaddref;
8020	struct mount *mp;
8021	int isindir;
8022
8023	/*
8024	 * Whiteouts have no dependencies.
8025	 */
8026	if (newinum == WINO) {
8027		if (newdirbp != NULL)
8028			bdwrite(newdirbp);
8029		return (0);
8030	}
8031	jaddref = NULL;
8032	mkdir1 = mkdir2 = NULL;
8033	mp = UFSTOVFS(dp->i_ump);
8034	fs = dp->i_fs;
8035	lbn = lblkno(fs, diroffset);
8036	offset = blkoff(fs, diroffset);
8037	dap = malloc(sizeof(struct diradd), M_DIRADD,
8038		M_SOFTDEP_FLAGS|M_ZERO);
8039	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8040	dap->da_offset = offset;
8041	dap->da_newinum = newinum;
8042	dap->da_state = ATTACHED;
8043	LIST_INIT(&dap->da_jwork);
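	/*
	 * Directory blocks past the direct block area are always
	 * full-sized, so fragment offsets only matter for direct blocks.
	 */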
8044	isindir = bp->b_lblkno >= NDADDR;
8045	if (isnewblk &&
8046	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8047		newdirblk = malloc(sizeof(struct newdirblk),
8048		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8049		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8050		LIST_INIT(&newdirblk->db_mkdir);
8051	}
8052	/*
8053	 * If we're creating a new directory setup the dependencies and set
8054	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8055	 * we can move on.
8056	 */
8057	if (newdirbp == NULL) {
8058		dap->da_state |= DEPCOMPLETE;
8059		ACQUIRE_LOCK(&lk);
8060	} else {
8061		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8062		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8063		    &mkdir2);
8064	}
8065	/*
8066	 * Link into parent directory pagedep to await its being written.
8067	 */
8068	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8069#ifdef DEBUG
8070	if (diradd_lookup(pagedep, offset) != NULL)
8071		panic("softdep_setup_directory_add: %p already at off %d\n",
8072		    diradd_lookup(pagedep, offset), offset);
8073#endif
8074	dap->da_pagedep = pagedep;
8075	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8076	    da_pdlist);
8077	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8078	/*
8079	 * If we're journaling, link the diradd into the jaddref so it
8080	 * may be completed after the journal entry is written.  Otherwise,
8081	 * link the diradd into its inodedep.  If the inode is not yet
8082	 * written place it on the bufwait list, otherwise do the post-inode
8083	 * write processing to put it on the id_pendinghd list.
8084	 */
8085	if (MOUNTEDSUJ(mp)) {
8086		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8087		    inoreflst);
8088		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8089		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8090		jaddref->ja_diroff = diroffset;
8091		jaddref->ja_diradd = dap;
8092		add_to_journal(&jaddref->ja_list);
8093	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8094		diradd_inode_written(dap, inodedep);
8095	else
8096		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8097	/*
8098	 * Add the journal entries for . and .. links now that the primary
8099	 * link is written.
8100	 */
8101	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8102		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8103		    inoreflst, if_deps);
8104		KASSERT(jaddref != NULL &&
8105		    jaddref->ja_ino == jaddref->ja_parent &&
8106		    (jaddref->ja_state & MKDIR_BODY),
8107		    ("softdep_setup_directory_add: bad dot jaddref %p",
8108		    jaddref));
8109		mkdir1->md_jaddref = jaddref;
8110		jaddref->ja_mkdir = mkdir1;
8111		/*
8112		 * It is important that the dotdot journal entry
8113		 * is added prior to the dot entry since dot writes
8114		 * both the dot and dotdot links.  These both must
8115		 * be added after the primary link for the journal
8116		 * to remain consistent.
8117		 */
8118		add_to_journal(&mkdir2->md_jaddref->ja_list);
8119		add_to_journal(&jaddref->ja_list);
8120	}
8121	/*
8122	 * If we are adding a new directory remember this diradd so that if
8123	 * we rename it we can keep the dot and dotdot dependencies.  If
8124	 * we are adding a new name for an inode that has a mkdiradd we
8125	 * must be in rename and we have to move the dot and dotdot
8126	 * dependencies to this new name.  The old name is being orphaned
8127	 * soon.
8128	 */
8129	if (mkdir1 != NULL) {
8130		if (inodedep->id_mkdiradd != NULL)
8131			panic("softdep_setup_directory_add: Existing mkdir");
8132		inodedep->id_mkdiradd = dap;
8133	} else if (inodedep->id_mkdiradd)
8134		merge_diradd(inodedep, dap);
8135	if (newdirblk) {
8136		/*
8137		 * There is nothing to do if we are already tracking
8138		 * this block.
8139		 */
8140		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8141			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8142			FREE_LOCK(&lk);
8143			return (0);
8144		}
8145		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8146		    == 0)
8147			panic("softdep_setup_directory_add: lost entry");
8148		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8149		pagedep->pd_state |= NEWBLOCK;
8150		pagedep->pd_newdirblk = newdirblk;
8151		newdirblk->db_pagedep = pagedep;
8152		FREE_LOCK(&lk);
8153		/*
8154		 * If we extended into an indirect, signal direnter to sync.
8155		 */
8156		if (isindir)
8157			return (1);
8158		return (0);
8159	}
8160	FREE_LOCK(&lk);
8161	return (0);
8162}
8163
8164/*
8165 * This procedure is called to change the offset of a directory
8166 * entry when compacting a directory block which must be owned
8167 * exclusively by the caller. Note that the actual entry movement
8168 * must be done in this procedure to ensure that no I/O completions
8169 * occur while the move is in progress.
8170 */
8171void
8172softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8173	struct buf *bp;		/* Buffer holding directory block. */
8174	struct inode *dp;	/* inode for directory */
8175	caddr_t base;		/* address of dp->i_offset */
8176	caddr_t oldloc;		/* address of old directory location */
8177	caddr_t newloc;		/* address of new directory location */
8178	int entrysize;		/* size of directory entry */
8179{
8180	int offset, oldoffset, newoffset;
8181	struct pagedep *pagedep;
8182	struct jmvref *jmvref;
8183	struct diradd *dap;
8184	struct direct *de;
8185	struct mount *mp;
8186	ufs_lbn_t lbn;
8187	int flags;
8188
8189	mp = UFSTOVFS(dp->i_ump);
8190	de = (struct direct *)oldloc;
8191	jmvref = NULL;
8192	flags = 0;
8193	/*
8194	 * Moves are always journaled as it would be too complex to
8195	 * determine if any affected adds or removes are present in the
8196	 * journal.
8197	 */
8198	if (MOUNTEDSUJ(mp)) {
8199		flags = DEPALLOC;
8200		jmvref = newjmvref(dp, de->d_ino,
8201		    dp->i_offset + (oldloc - base),
8202		    dp->i_offset + (newloc - base));
8203	}
8204	lbn = lblkno(dp->i_fs, dp->i_offset);
8205	offset = blkoff(dp->i_fs, dp->i_offset);
8206	oldoffset = offset + (oldloc - base);
8207	newoffset = offset + (newloc - base);
8208	ACQUIRE_LOCK(&lk);
8209	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8210		goto done;
8211	dap = diradd_lookup(pagedep, oldoffset);
8212	if (dap) {
8213		dap->da_offset = newoffset;
8214		newoffset = DIRADDHASH(newoffset);
8215		oldoffset = DIRADDHASH(oldoffset);
8216		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8217		    newoffset != oldoffset) {
8218			LIST_REMOVE(dap, da_pdlist);
8219			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8220			    dap, da_pdlist);
8221		}
8222	}
8223done:
8224	if (jmvref) {
8225		jmvref->jm_pagedep = pagedep;
8226		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8227		add_to_journal(&jmvref->jm_list);
8228	}
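	/*
	 * Do the entry move itself while still holding lk so no I/O
	 * completion can observe the directory block mid-move.
	 */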
8229	bcopy(oldloc, newloc, entrysize);
8230	FREE_LOCK(&lk);
8231}
8232
8233/*
8234 * Move the mkdir dependencies and journal work from one diradd to another
8235 * when renaming a directory.  The new name must depend on the mkdir deps
8236 * completing as the old name did.  Directories can only have one valid link
8237 * at a time so one must be canonical.
8238 */
8239static void
8240merge_diradd(inodedep, newdap)
8241	struct inodedep *inodedep;
8242	struct diradd *newdap;
8243{
8244	struct diradd *olddap;
8245	struct mkdir *mkdir, *nextmd;
8246	short state;
8247
8248	olddap = inodedep->id_mkdiradd;
8249	inodedep->id_mkdiradd = newdap;
8250	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8251		newdap->da_state &= ~DEPCOMPLETE;
8252		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8253			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8254			if (mkdir->md_diradd != olddap)
8255				continue;
8256			mkdir->md_diradd = newdap;
8257			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8258			newdap->da_state |= state;
8259			olddap->da_state &= ~state;
8260			if ((olddap->da_state &
8261			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8262				break;
8263		}
8264		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8265			panic("merge_diradd: unfound ref");
8266	}
8267	/*
8268	 * Any mkdir related journal items are not safe to be freed until
8269	 * the new name is stable.
8270	 */
8271	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8272	olddap->da_state |= DEPCOMPLETE;
8273	complete_diradd(olddap);
8274}
8275
8276/*
8277 * Move the diradd to the pending list when all diradd dependencies are
8278 * complete.
8279 */
8280static void
8281complete_diradd(dap)
8282	struct diradd *dap;
8283{
8284	struct pagedep *pagedep;
8285
8286	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8287		if (dap->da_state & DIRCHG)
8288			pagedep = dap->da_previous->dm_pagedep;
8289		else
8290			pagedep = dap->da_pagedep;
8291		LIST_REMOVE(dap, da_pdlist);
8292		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8293	}
8294}
8295
8296/*
8297 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8298 * add entries and conditionally journal the remove.
8299 */
8300static void
8301cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8302	struct diradd *dap;
8303	struct dirrem *dirrem;
8304	struct jremref *jremref;
8305	struct jremref *dotremref;
8306	struct jremref *dotdotremref;
8307{
8308	struct inodedep *inodedep;
8309	struct jaddref *jaddref;
8310	struct inoref *inoref;
8311	struct mkdir *mkdir;
8312
8313	/*
8314	 * If no remove references were allocated we're on a non-journaled
8315	 * filesystem and can skip the cancel step.
8316	 */
8317	if (jremref == NULL) {
8318		free_diradd(dap, NULL);
8319		return;
8320	}
8321	/*
8322	 * Cancel the primary name and free it if it does not require
8323	 * journaling.
8324	 */
8325	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8326	    0, &inodedep) != 0) {
8327		/* Abort the addref that references this diradd.  */
8328		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8329			if (inoref->if_list.wk_type != D_JADDREF)
8330				continue;
8331			jaddref = (struct jaddref *)inoref;
8332			if (jaddref->ja_diradd != dap)
8333				continue;
8334			if (cancel_jaddref(jaddref, inodedep,
8335			    &dirrem->dm_jwork) == 0) {
8336				free_jremref(jremref);
8337				jremref = NULL;
8338			}
8339			break;
8340		}
8341	}
8342	/*
8343	 * Cancel subordinate names and free them if they do not require
8344	 * journaling.
8345	 */
8346	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8347		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
8348			if (mkdir->md_diradd != dap)
8349				continue;
8350			if ((jaddref = mkdir->md_jaddref) == NULL)
8351				continue;
8352			mkdir->md_jaddref = NULL;
8353			if (mkdir->md_state & MKDIR_PARENT) {
8354				if (cancel_jaddref(jaddref, NULL,
8355				    &dirrem->dm_jwork) == 0) {
8356					free_jremref(dotdotremref);
8357					dotdotremref = NULL;
8358				}
8359			} else {
8360				if (cancel_jaddref(jaddref, inodedep,
8361				    &dirrem->dm_jwork) == 0) {
8362					free_jremref(dotremref);
8363					dotremref = NULL;
8364				}
8365			}
8366		}
8367	}
8368
8369	if (jremref)
8370		journal_jremref(dirrem, jremref, inodedep);
8371	if (dotremref)
8372		journal_jremref(dirrem, dotremref, inodedep);
8373	if (dotdotremref)
8374		journal_jremref(dirrem, dotdotremref, NULL);
8375	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8376	free_diradd(dap, &dirrem->dm_jwork);
8377}
8378
8379/*
8380 * Free a diradd dependency structure. This routine must be called
8381 * with the soft dependency lock (lk) held.
8382 */
8383static void
8384free_diradd(dap, wkhd)
8385	struct diradd *dap;
8386	struct workhead *wkhd;
8387{
8388	struct dirrem *dirrem;
8389	struct pagedep *pagedep;
8390	struct inodedep *inodedep;
8391	struct mkdir *mkdir, *nextmd;
8392
8393	mtx_assert(&lk, MA_OWNED);
8394	LIST_REMOVE(dap, da_pdlist);
8395	if (dap->da_state & ONWORKLIST)
8396		WORKLIST_REMOVE(&dap->da_list);
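	/*
	 * If this add superseded a removal (DIRCHG), the saved dirrem can
	 * now be processed once its journal references are written.
	 */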
8397	if ((dap->da_state & DIRCHG) == 0) {
8398		pagedep = dap->da_pagedep;
8399	} else {
8400		dirrem = dap->da_previous;
8401		pagedep = dirrem->dm_pagedep;
8402		dirrem->dm_dirinum = pagedep->pd_ino;
8403		dirrem->dm_state |= COMPLETE;
8404		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8405			add_to_worklist(&dirrem->dm_list, 0);
8406	}
8407	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8408	    0, &inodedep) != 0)
8409		if (inodedep->id_mkdiradd == dap)
8410			inodedep->id_mkdiradd = NULL;
8411	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8412		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8413			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8414			if (mkdir->md_diradd != dap)
8415				continue;
8416			dap->da_state &=
8417			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8418			LIST_REMOVE(mkdir, md_mkdirs);
8419			if (mkdir->md_state & ONWORKLIST)
8420				WORKLIST_REMOVE(&mkdir->md_list);
8421			if (mkdir->md_jaddref != NULL)
8422				panic("free_diradd: Unexpected jaddref");
8423			WORKITEM_FREE(mkdir, D_MKDIR);
8424			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8425				break;
8426		}
8427		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8428			panic("free_diradd: unfound ref");
8429	}
8430	if (inodedep)
8431		free_inodedep(inodedep);
8432	/*
8433	 * Free any journal segments waiting for the directory write.
8434	 */
8435	handle_jwork(&dap->da_jwork);
8436	WORKITEM_FREE(dap, D_DIRADD);
8437}
8438
8439/*
8440 * Directory entry removal dependencies.
8441 *
8442 * When removing a directory entry, the entry's inode pointer must be
8443 * zero'ed on disk before the corresponding inode's link count is decremented
8444 * (possibly freeing the inode for re-use). This dependency is handled by
8445 * updating the directory entry but delaying the inode count reduction until
8446 * after the directory block has been written to disk. After this point, the
8447 * inode count can be decremented whenever it is convenient.
8448 */
8449
8450/*
8451 * This routine should be called immediately after removing
8452 * a directory entry.  The inode's link count should not be
8453 * decremented by the calling procedure -- the soft updates
8454 * code will do this task when it is safe.
8455 */
8456void
8457softdep_setup_remove(bp, dp, ip, isrmdir)
8458	struct buf *bp;		/* buffer containing directory block */
8459	struct inode *dp;	/* inode for the directory being modified */
8460	struct inode *ip;	/* inode for directory entry being removed */
8461	int isrmdir;		/* indicates if doing RMDIR */
8462{
8463	struct dirrem *dirrem, *prevdirrem;
8464	struct inodedep *inodedep;
8465	int direct;
8466
8467	/*
8468	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8469	 * newdirrem() to setup the full directory remove which requires
8470	 * isrmdir > 1.
8471	 */
8472	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8473	/*
8474	 * Add the dirrem to the inodedep's pending remove list for quick
8475	 * discovery later.
8476	 */
8477	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8478	    &inodedep) == 0)
8479		panic("softdep_setup_remove: Lost inodedep.");
8480	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8481	dirrem->dm_state |= ONDEPLIST;
8482	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8483
8484	/*
8485	 * If the COMPLETE flag is clear, then there were no active
8486	 * entries and we want to roll back to a zeroed entry until
8487	 * the new inode is committed to disk. If the COMPLETE flag is
8488	 * set then we have deleted an entry that never made it to
8489	 * disk. If the entry we deleted resulted from a name change,
8490	 * then the old name still resides on disk. We cannot delete
8491	 * its inode (returned to us in prevdirrem) until the zeroed
8492	 * directory entry gets to disk. The new inode has never been
8493	 * referenced on the disk, so can be deleted immediately.
8494	 */
8495	if ((dirrem->dm_state & COMPLETE) == 0) {
8496		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8497		    dm_next);
8498		FREE_LOCK(&lk);
8499	} else {
8500		if (prevdirrem != NULL)
8501			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8502			    prevdirrem, dm_next);
8503		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
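		/*
		 * The removal can only be processed now if no journal
		 * remove references remain to be written.
		 */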
8504		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8505		FREE_LOCK(&lk);
8506		if (direct)
8507			handle_workitem_remove(dirrem, 0);
8508	}
8509}
8510
8511/*
8512 * Check for an entry matching 'offset' on both the pd_diraddhd list and the
8513 * pd_pendinghd list of a pagedep.
8514 */
8515static struct diradd *
8516diradd_lookup(pagedep, offset)
8517	struct pagedep *pagedep;
8518	int offset;
8519{
8520	struct diradd *dap;
8521
8522	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8523		if (dap->da_offset == offset)
8524			return (dap);
8525	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8526		if (dap->da_offset == offset)
8527			return (dap);
8528	return (NULL);
8529}
8530
8531/*
8532 * Search for a .. diradd dependency in a directory that is being removed.
8533 * If the directory was renamed to a new parent we have a diradd rather
8534 * than a mkdir for the .. entry.  We need to cancel it now before
8535 * it is found in truncate().
8536 */
8537static struct jremref *
8538cancel_diradd_dotdot(ip, dirrem, jremref)
8539	struct inode *ip;
8540	struct dirrem *dirrem;
8541	struct jremref *jremref;
8542{
8543	struct pagedep *pagedep;
8544	struct diradd *dap;
8545	struct worklist *wk;
8546
8547	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8548	    &pagedep) == 0)
8549		return (jremref);
8550	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8551	if (dap == NULL)
8552		return (jremref);
8553	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8554	/*
8555	 * Mark any journal work as belonging to the parent so it is freed
8556	 * with the .. reference.
8557	 */
8558	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8559		wk->wk_state |= MKDIR_PARENT;
8560	return (NULL);
8561}
8562
8563/*
8564 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
8565 * replace it with a dirrem/diradd pair as a result of re-parenting a
8566 * directory.  This ensures that we don't simultaneously have a mkdir and
8567 * a diradd for the same .. entry.
8568 */
8569static struct jremref *
8570cancel_mkdir_dotdot(ip, dirrem, jremref)
8571	struct inode *ip;
8572	struct dirrem *dirrem;
8573	struct jremref *jremref;
8574{
8575	struct inodedep *inodedep;
8576	struct jaddref *jaddref;
8577	struct mkdir *mkdir;
8578	struct diradd *dap;
8579
8580	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8581	    &inodedep) == 0)
8582		panic("cancel_mkdir_dotdot: Lost inodedep");
8583	dap = inodedep->id_mkdiradd;
8584	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
8585		return (jremref);
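	/*
	 * Find the MKDIR_PARENT mkdir attached to this diradd so its
	 * journal work can be canceled and the mkdir completed.
	 */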
8586	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
8587	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
8588		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
8589			break;
8590	if (mkdir == NULL)
8591		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
8592	if ((jaddref = mkdir->md_jaddref) != NULL) {
8593		mkdir->md_jaddref = NULL;
8594		jaddref->ja_state &= ~MKDIR_PARENT;
8595		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
8596		    &inodedep) == 0)
8597			panic("cancel_mkdir_dotdot: Lost parent inodedep");
8598		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
8599			journal_jremref(dirrem, jremref, inodedep);
8600			jremref = NULL;
8601		}
8602	}
8603	if (mkdir->md_state & ONWORKLIST)
8604		WORKLIST_REMOVE(&mkdir->md_list);
8605	mkdir->md_state |= ALLCOMPLETE;
8606	complete_mkdir(mkdir);
8607	return (jremref);
8608}
8609
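/*
 * Record a journaled remove reference on both the dirrem and the inode's
 * reference list and queue it for the journal.
 */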
8610static void
8611journal_jremref(dirrem, jremref, inodedep)
8612	struct dirrem *dirrem;
8613	struct jremref *jremref;
8614	struct inodedep *inodedep;
8615{
8616
8617	if (inodedep == NULL)
8618		if (inodedep_lookup(jremref->jr_list.wk_mp,
8619		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
8620			panic("journal_jremref: Lost inodedep");
8621	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
8622	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
8623	add_to_journal(&jremref->jr_list);
8624}
8625
8626static void
8627dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
8628	struct dirrem *dirrem;
8629	struct jremref *jremref;
8630	struct jremref *dotremref;
8631	struct jremref *dotdotremref;
8632{
8633	struct inodedep *inodedep;
8634
8636	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
8637	    &inodedep) == 0)
8638		panic("dirrem_journal: Lost inodedep");
8639	journal_jremref(dirrem, jremref, inodedep);
8640	if (dotremref)
8641		journal_jremref(dirrem, dotremref, inodedep);
8642	if (dotdotremref)
8643		journal_jremref(dirrem, dotdotremref, NULL);
8644}
8645
8646/*
8647 * Allocate a new dirrem if appropriate and return it along with
8648 * its associated pagedep. Called without a lock, returns with lock.
8649 */
8650static struct dirrem *
8651newdirrem(bp, dp, ip, isrmdir, prevdirremp)
8652	struct buf *bp;		/* buffer containing directory block */
8653	struct inode *dp;	/* inode for the directory being modified */
8654	struct inode *ip;	/* inode for directory entry being removed */
8655	int isrmdir;		/* indicates if doing RMDIR */
8656	struct dirrem **prevdirremp; /* previously referenced inode, if any */
8657{
8658	int offset;
8659	ufs_lbn_t lbn;
8660	struct diradd *dap;
8661	struct dirrem *dirrem;
8662	struct pagedep *pagedep;
8663	struct jremref *jremref;
8664	struct jremref *dotremref;
8665	struct jremref *dotdotremref;
8666	struct vnode *dvp;
8667
8668	/*
8669	 * Whiteouts have no deletion dependencies.
8670	 */
8671	if (ip == NULL)
8672		panic("newdirrem: whiteout");
8673	dvp = ITOV(dp);
8674	/*
8675	 * If we are over our limit, try to improve the situation.
8676	 * Limiting the number of dirrem structures will also limit
8677	 * the number of freefile and freeblks structures.
8678	 */
8679	ACQUIRE_LOCK(&lk);
8680	if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2)
8681		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
8682	FREE_LOCK(&lk);
8683	dirrem = malloc(sizeof(struct dirrem),
8684		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
8685	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
8686	LIST_INIT(&dirrem->dm_jremrefhd);
8687	LIST_INIT(&dirrem->dm_jwork);
8688	dirrem->dm_state = isrmdir ? RMDIR : 0;
8689	dirrem->dm_oldinum = ip->i_number;
8690	*prevdirremp = NULL;
8691	/*
8692	 * Allocate remove reference structures to track journal write
8693	 * dependencies.  We will always have one for the link and
8694	 * when doing directories we will always have one more for dot.
8695	 * When renaming a directory we skip the dotdot link change so
8696	 * this is not needed.
8697	 */
8698	jremref = dotremref = dotdotremref = NULL;
8699	if (DOINGSUJ(dvp)) {
8700		if (isrmdir) {
8701			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8702			    ip->i_effnlink + 2);
8703			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
8704			    ip->i_effnlink + 1);
8705			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
8706			    dp->i_effnlink + 1);
8707			dotdotremref->jr_state |= MKDIR_PARENT;
8708		} else
8709			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8710			    ip->i_effnlink + 1);
8711	}
8712	ACQUIRE_LOCK(&lk);
8713	lbn = lblkno(dp->i_fs, dp->i_offset);
8714	offset = blkoff(dp->i_fs, dp->i_offset);
8715	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
8716	    &pagedep);
8717	dirrem->dm_pagedep = pagedep;
8718	dirrem->dm_offset = offset;
8719	/*
8720	 * If we're renaming a .. link to a new directory, cancel any
8721	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
8722	 * the jremref is preserved for any potential diradd in this
8723	 * location.  This can not coincide with a rmdir.
8724	 */
8725	if (dp->i_offset == DOTDOT_OFFSET) {
8726		if (isrmdir)
8727			panic("newdirrem: .. directory change during remove?");
8728		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
8729	}
8730	/*
8731	 * If we're removing a directory search for the .. dependency now and
8732	 * cancel it.  Any pending journal work will be added to the dirrem
8733	 * to be completed when the workitem remove completes.
8734	 */
8735	if (isrmdir)
8736		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
8737	/*
8738	 * Check for a diradd dependency for the same directory entry.
8739	 * If present, then both dependencies become obsolete and can
8740	 * be de-allocated.
8741	 */
8742	dap = diradd_lookup(pagedep, offset);
8743	if (dap == NULL) {
8744		/*
8745		 * Link the jremref structures into the dirrem so they are
8746		 * written prior to the pagedep.
8747		 */
8748		if (jremref)
8749			dirrem_journal(dirrem, jremref, dotremref,
8750			    dotdotremref);
8751		return (dirrem);
8752	}
8753	/*
8754	 * Must be ATTACHED at this point.
8755	 */
8756	if ((dap->da_state & ATTACHED) == 0)
8757		panic("newdirrem: not ATTACHED");
8758	if (dap->da_newinum != ip->i_number)
8759		panic("newdirrem: inum %ju should be %ju",
8760		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
8761	/*
8762	 * If we are deleting a changed name that never made it to disk,
8763	 * then return the dirrem describing the previous inode (which
8764	 * represents the inode currently referenced from this entry on disk).
8765	 */
8766	if ((dap->da_state & DIRCHG) != 0) {
8767		*prevdirremp = dap->da_previous;
8768		dap->da_state &= ~DIRCHG;
8769		dap->da_pagedep = pagedep;
8770	}
8771	/*
8772	 * We are deleting an entry that never made it to disk.
8773	 * Mark it COMPLETE so we can delete its inode immediately.
8774	 */
8775	dirrem->dm_state |= COMPLETE;
8776	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
8777#ifdef SUJ_DEBUG
8778	if (isrmdir == 0) {
8779		struct worklist *wk;
8780
8781		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8782			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
8783				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
8784	}
8785#endif
8786
8787	return (dirrem);
8788}
8789
8790/*
8791 * Directory entry change dependencies.
8792 *
8793 * Changing an existing directory entry requires that an add operation
8794 * be completed first followed by a deletion. The semantics for the addition
8795 * are identical to the description of adding a new entry above except
8796 * that the rollback is to the old inode number rather than zero. Once
8797 * the addition dependency is completed, the removal is done as described
8798 * in the removal routine above.
8799 */
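/*
 * Concretely, for a changed entry the ordering enforced below is roughly:
 * the new inode must reach the disk before the directory block may show
 * its inode number; until then initiate_write_filepage() rolls the entry
 * back to the previous inode number (DIRCHG) rather than to zero.  Once
 * the directory block is safely written, the dirrem for the displaced
 * inode is handled as an ordinary removal.
 */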
8800
8801/*
8802 * This routine should be called immediately after changing
8803 * a directory entry.  The inode's link count should not be
8804 * decremented by the calling procedure -- the soft updates
8805 * code will perform this task when it is safe.
8806 */
8807void
8808softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
8809	struct buf *bp;		/* buffer containing directory block */
8810	struct inode *dp;	/* inode for the directory being modified */
8811	struct inode *ip;	/* inode for directory entry being removed */
8812	ino_t newinum;		/* new inode number for changed entry */
8813	int isrmdir;		/* indicates if doing RMDIR */
8814{
8815	int offset;
8816	struct diradd *dap = NULL;
8817	struct dirrem *dirrem, *prevdirrem;
8818	struct pagedep *pagedep;
8819	struct inodedep *inodedep;
8820	struct jaddref *jaddref;
8821	struct mount *mp;
8822
8823	offset = blkoff(dp->i_fs, dp->i_offset);
8824	mp = UFSTOVFS(dp->i_ump);
8825
8826	/*
8827	 * Whiteouts do not need diradd dependencies.
8828	 */
8829	if (newinum != WINO) {
8830		dap = malloc(sizeof(struct diradd),
8831		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
8832		workitem_alloc(&dap->da_list, D_DIRADD, mp);
8833		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
8834		dap->da_offset = offset;
8835		dap->da_newinum = newinum;
8836		LIST_INIT(&dap->da_jwork);
8837	}
8838
8839	/*
8840	 * Allocate a new dirrem and ACQUIRE_LOCK.
8841	 */
8842	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8843	pagedep = dirrem->dm_pagedep;
8844	/*
8845	 * The possible values for isrmdir:
8846	 *	0 - non-directory file rename
8847	 *	1 - directory rename within same directory
8848	 *   inum - directory rename to new directory of given inode number
8849	 * When renaming to a new directory, we are both deleting and
8850	 * creating a new directory entry, so the link count on the new
8851	 * directory should not change. Thus we do not need the followup
8852	 * dirrem which is usually done in handle_workitem_remove. We set
8853	 * the DIRCHG flag to tell handle_workitem_remove to skip the
8854	 * followup dirrem.
8855	 */
8856	if (isrmdir > 1)
8857		dirrem->dm_state |= DIRCHG;
8858
8859	/*
8860	 * Whiteouts have no additional dependencies,
8861	 * so just put the dirrem on the correct list.
8862	 */
8863	if (newinum == WINO) {
8864		if ((dirrem->dm_state & COMPLETE) == 0) {
8865			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
8866			    dm_next);
8867		} else {
8868			dirrem->dm_dirinum = pagedep->pd_ino;
8869			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8870				add_to_worklist(&dirrem->dm_list, 0);
8871		}
8872		FREE_LOCK(&lk);
8873		return;
8874	}
8875	/*
8876	 * Add the dirrem to the inodedep's pending remove list for quick
8877	 * discovery later.  A valid nlinkdelta ensures that this lookup
8878	 * will not fail.
8879	 */
8880	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
8881		panic("softdep_setup_directory_change: Lost inodedep.");
8882	dirrem->dm_state |= ONDEPLIST;
8883	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8884
8885	/*
8886	 * If the COMPLETE flag is clear, then there were no active
8887	 * entries and we want to roll back to the previous inode until
8888	 * the new inode is committed to disk. If the COMPLETE flag is
8889	 * set, then we have deleted an entry that never made it to disk.
8890	 * If the entry we deleted resulted from a name change, then the old
8891	 * inode reference still resides on disk. Any rollback that we do
8892	 * needs to be to that old inode (returned to us in prevdirrem). If
8893	 * the entry we deleted resulted from a create, then there is
8894	 * no entry on the disk, so we want to roll back to zero rather
8895	 * than the uncommitted inode. In either of the COMPLETE cases we
8896	 * want to immediately free the unwritten and unreferenced inode.
8897	 */
8898	if ((dirrem->dm_state & COMPLETE) == 0) {
8899		dap->da_previous = dirrem;
8900	} else {
8901		if (prevdirrem != NULL) {
8902			dap->da_previous = prevdirrem;
8903		} else {
8904			dap->da_state &= ~DIRCHG;
8905			dap->da_pagedep = pagedep;
8906		}
8907		dirrem->dm_dirinum = pagedep->pd_ino;
8908		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8909			add_to_worklist(&dirrem->dm_list, 0);
8910	}
8911	/*
8912	 * Lookup the jaddref for this journal entry.  We must finish
8913	 * initializing it and make the diradd write dependent on it.
8914	 * If we're not journaling, put it on the id_bufwait list if the
8915	 * inode is not yet written. If it is written, do the post-inode
8916	 * write processing to put it on the id_pendinghd list.
8917	 */
8918	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8919	if (MOUNTEDSUJ(mp)) {
8920		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8921		    inoreflst);
8922		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8923		    ("softdep_setup_directory_change: bad jaddref %p",
8924		    jaddref));
8925		jaddref->ja_diroff = dp->i_offset;
8926		jaddref->ja_diradd = dap;
8927		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
8928		    dap, da_pdlist);
8929		add_to_journal(&jaddref->ja_list);
8930	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8931		dap->da_state |= COMPLETE;
8932		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8933		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
8934	} else {
8935		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
8936		    dap, da_pdlist);
8937		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8938	}
8939	/*
8940	 * If we're making a new name for a directory that has not been
8941	 * committed, we need to move the dot and dotdot references to
8942	 * this new name.
8943	 */
8944	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
8945		merge_diradd(inodedep, dap);
8946	FREE_LOCK(&lk);
8947}
8948
8949/*
8950 * Called whenever the link count on an inode is changed.
8951 * It creates an inode dependency so that the new reference(s)
8952 * to the inode cannot be committed to disk until the updated
8953 * inode has been written.
8954 */
8955void
8956softdep_change_linkcnt(ip)
8957	struct inode *ip;	/* the inode with the increased link count */
8958{
8959	struct inodedep *inodedep;
8960	int dflags;
8961
8962	ACQUIRE_LOCK(&lk);
8963	dflags = DEPALLOC;
8964	if (IS_SNAPSHOT(ip))
8965		dflags |= NODELAY;
8966	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
8967	if (ip->i_nlink < ip->i_effnlink)
8968		panic("softdep_change_linkcnt: bad delta");
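	/*
	 * i_effnlink reflects removals as soon as they are requested while
	 * i_nlink is only decremented once they are safe to commit, so the
	 * delta records the removals that are still pending.
	 */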
8969	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
8970	FREE_LOCK(&lk);
8971}
8972
8973/*
8974 * Attach a sbdep dependency to the superblock buf so that we can keep
8975 * track of the head of the linked list of referenced but unlinked inodes.
8976 */
8977void
8978softdep_setup_sbupdate(ump, fs, bp)
8979	struct ufsmount *ump;
8980	struct fs *fs;
8981	struct buf *bp;
8982{
8983	struct sbdep *sbdep;
8984	struct worklist *wk;
8985
8986	if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0)
8987		return;
8988	LIST_FOREACH(wk, &bp->b_dep, wk_list)
8989		if (wk->wk_type == D_SBDEP)
8990			break;
8991	if (wk != NULL)
8992		return;
8993	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
8994	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
8995	sbdep->sb_fs = fs;
8996	sbdep->sb_ump = ump;
8997	ACQUIRE_LOCK(&lk);
8998	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
8999	FREE_LOCK(&lk);
9000}
9001
9002/*
9003 * Return the first unlinked inodedep which is ready to be the head of the
9004 * list.  The inodedep and all those after it must have valid next pointers.
9005 */
9006static struct inodedep *
9007first_unlinked_inodedep(ump)
9008	struct ufsmount *ump;
9009{
9010	struct inodedep *inodedep;
9011	struct inodedep *idp;
9012
9013	mtx_assert(&lk, MA_OWNED);
9014	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9015	    inodedep; inodedep = idp) {
9016		if ((inodedep->id_state & UNLINKNEXT) == 0)
9017			return (NULL);
9018		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9019		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9020			break;
9021		if ((inodedep->id_state & UNLINKPREV) == 0)
9022			break;
9023	}
9024	return (inodedep);
9025}
9026
9027/*
9028 * Set the sujfree unlinked head pointer prior to writing a superblock.
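 * The new head is marked UNLINKPREV since the superblock now acts as its
 * on-disk predecessor in the unlinked chain.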
9029 */
9030static void
9031initiate_write_sbdep(sbdep)
9032	struct sbdep *sbdep;
9033{
9034	struct inodedep *inodedep;
9035	struct fs *bpfs;
9036	struct fs *fs;
9037
9038	bpfs = sbdep->sb_fs;
9039	fs = sbdep->sb_ump->um_fs;
9040	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9041	if (inodedep) {
9042		fs->fs_sujfree = inodedep->id_ino;
9043		inodedep->id_state |= UNLINKPREV;
9044	} else
9045		fs->fs_sujfree = 0;
9046	bpfs->fs_sujfree = fs->fs_sujfree;
9047}
9048
9049/*
9050 * After a superblock is written determine whether it must be written again
9051 * due to a changing unlinked list head.
9052 */
9053static int
9054handle_written_sbdep(sbdep, bp)
9055	struct sbdep *sbdep;
9056	struct buf *bp;
9057{
9058	struct inodedep *inodedep;
9059	struct mount *mp;
9060	struct fs *fs;
9061
9062	mtx_assert(&lk, MA_OWNED);
9063	fs = sbdep->sb_fs;
9064	mp = UFSTOVFS(sbdep->sb_ump);
9065	/*
9066	 * If the superblock doesn't match the in-memory list, start over.
9067	 */
9068	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9069	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9070	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9071		bdirty(bp);
9072		return (1);
9073	}
9074	WORKITEM_FREE(sbdep, D_SBDEP);
9075	if (fs->fs_sujfree == 0)
9076		return (0);
9077	/*
9078	 * Now that we have a record of this inode in stable store, allow it
9079	 * to be written to free up pending work.  Inodes may see a lot of
9080	 * write activity after they are unlinked, which we must not hold up.
9081	 */
9082	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9083		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9084			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9085			    inodedep, inodedep->id_state);
9086		if (inodedep->id_state & UNLINKONLIST)
9087			break;
9088		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9089	}
9090
9091	return (0);
9092}
9093
9094/*
9095 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9096 */
9097static void
9098unlinked_inodedep(mp, inodedep)
9099	struct mount *mp;
9100	struct inodedep *inodedep;
9101{
9102	struct ufsmount *ump;
9103
9104	mtx_assert(&lk, MA_OWNED);
9105	if (MOUNTEDSUJ(mp) == 0)
9106		return;
9107	ump = VFSTOUFS(mp);
9108	ump->um_fs->fs_fmod = 1;
9109	if (inodedep->id_state & UNLINKED)
9110		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9111	inodedep->id_state |= UNLINKED;
9112	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9113}
9114
9115/*
9116 * Remove an inodedep from the unlinked inodedep list.  This may require
9117 * disk writes if the inode has made it that far.
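 * On disk the list is a simple chain: fs_sujfree names the first unlinked
 * inode and each unlinked inode's di_freelink names the next, so unhooking
 * an element requires rewriting its predecessor (the superblock or the
 * previous inode).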
9118 */
9119static void
9120clear_unlinked_inodedep(inodedep)
9121	struct inodedep *inodedep;
9122{
9123	struct ufsmount *ump;
9124	struct inodedep *idp;
9125	struct inodedep *idn;
9126	struct fs *fs;
9127	struct buf *bp;
9128	ino_t ino;
9129	ino_t nino;
9130	ino_t pino;
9131	int error;
9132
9133	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9134	fs = ump->um_fs;
9135	ino = inodedep->id_ino;
9136	error = 0;
9137	for (;;) {
9138		mtx_assert(&lk, MA_OWNED);
9139		KASSERT((inodedep->id_state & UNLINKED) != 0,
9140		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9141		    inodedep));
9142		/*
9143		 * If nothing has yet been written simply remove us from
9144		 * the in memory list and return.  This is the most common
9145		 * case where handle_workitem_remove() loses the final
9146		 * reference.
9147		 */
9148		if ((inodedep->id_state & UNLINKLINKS) == 0)
9149			break;
9150		/*
9151		 * If we have a NEXT pointer and no PREV pointer we can simply
9152		 * clear NEXT's PREV and remove ourselves from the list.  Be
9153		 * careful not to clear PREV if the superblock points at
9154		 * next as well.
9155		 */
9156		idn = TAILQ_NEXT(inodedep, id_unlinked);
9157		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9158			if (idn && fs->fs_sujfree != idn->id_ino)
9159				idn->id_state &= ~UNLINKPREV;
9160			break;
9161		}
9162		/*
9163		 * Here we have an inodedep which is actually linked into
9164		 * the list.  We must remove it by forcing a write to the
9165		 * link before us, whether it be the superblock or an inode.
9166		 * Unfortunately the list may change while we're waiting
9167		 * on the buf lock for either resource so we must loop until
9168		 * we lock the right one.  If both the superblock and an
9169		 * inode point to this inode we must clear the inode first
9170		 * followed by the superblock.
9171		 */
9172		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9173		pino = 0;
9174		if (idp && (idp->id_state & UNLINKNEXT))
9175			pino = idp->id_ino;
9176		FREE_LOCK(&lk);
9177		if (pino == 0)
9178			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9179			    (int)fs->fs_sbsize, 0, 0, 0);
9180		else
9181			error = bread(ump->um_devvp,
9182			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9183			    (int)fs->fs_bsize, NOCRED, &bp);
9184		ACQUIRE_LOCK(&lk);
9185		if (error)
9186			break;
9187		/* If the list has changed restart the loop. */
9188		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9189		nino = 0;
9190		if (idp && (idp->id_state & UNLINKNEXT))
9191			nino = idp->id_ino;
9192		if (nino != pino ||
9193		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9194			FREE_LOCK(&lk);
9195			brelse(bp);
9196			ACQUIRE_LOCK(&lk);
9197			continue;
9198		}
9199		nino = 0;
9200		idn = TAILQ_NEXT(inodedep, id_unlinked);
9201		if (idn)
9202			nino = idn->id_ino;
9203		/*
9204		 * Remove us from the in memory list.  After this we cannot
9205		 * access the inodedep.
9206		 */
9207		KASSERT((inodedep->id_state & UNLINKED) != 0,
9208		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9209		    inodedep));
9210		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9211		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9212		FREE_LOCK(&lk);
9213		/*
9214		 * The predecessor's next pointer is manually updated here
9215		 * so that the NEXT flag is never cleared for an element
9216		 * that is in the list.
9217		 */
9218		if (pino == 0) {
9219			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9220			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9221			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9222			    bp);
9223		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9224			((struct ufs1_dinode *)bp->b_data +
9225			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9226		else
9227			((struct ufs2_dinode *)bp->b_data +
9228			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9229		/*
9230		 * If the bwrite fails we have no recourse to recover.  The
9231		 * filesystem is corrupted already.
9232		 */
9233		bwrite(bp);
9234		ACQUIRE_LOCK(&lk);
9235		/*
9236		 * If the superblock pointer still needs to be cleared force
9237		 * a write here.
9238		 */
9239		if (fs->fs_sujfree == ino) {
9240			FREE_LOCK(&lk);
9241			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9242			    (int)fs->fs_sbsize, 0, 0, 0);
9243			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9244			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9245			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9246			    bp);
9247			bwrite(bp);
9248			ACQUIRE_LOCK(&lk);
9249		}
9250
9251		if (fs->fs_sujfree != ino)
9252			return;
9253		panic("clear_unlinked_inodedep: Failed to clear free head");
9254	}
9255	if (inodedep->id_ino == fs->fs_sujfree)
9256		panic("clear_unlinked_inodedep: Freeing head of free list");
9257	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9258	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9259	return;
9260}
9261
9262/*
9263 * This workitem decrements the inode's link count.
9264 * If the link count reaches zero, the file is removed.
9265 */
9266static int
9267handle_workitem_remove(dirrem, flags)
9268	struct dirrem *dirrem;
9269	int flags;
9270{
9271	struct inodedep *inodedep;
9272	struct workhead dotdotwk;
9273	struct worklist *wk;
9274	struct ufsmount *ump;
9275	struct mount *mp;
9276	struct vnode *vp;
9277	struct inode *ip;
9278	ino_t oldinum;
9279
9280	if (dirrem->dm_state & ONWORKLIST)
9281		panic("handle_workitem_remove: dirrem %p still on worklist",
9282		    dirrem);
9283	oldinum = dirrem->dm_oldinum;
9284	mp = dirrem->dm_list.wk_mp;
9285	ump = VFSTOUFS(mp);
9286	flags |= LK_EXCLUSIVE;
9287	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9288		return (EBUSY);
9289	ip = VTOI(vp);
9290	ACQUIRE_LOCK(&lk);
9291	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9292		panic("handle_workitem_remove: lost inodedep");
9293	if (dirrem->dm_state & ONDEPLIST)
9294		LIST_REMOVE(dirrem, dm_inonext);
9295	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9296	    ("handle_workitem_remove:  Journal entries not written."));
9297
9298	/*
9299	 * Move all dependencies waiting on the remove to complete
9300	 * from the dirrem to the inode inowait list to be completed
9301	 * after the inode has been updated and written to disk.  Any
9302	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9303	 * is removed.
9304	 */
9305	LIST_INIT(&dotdotwk);
9306	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9307		WORKLIST_REMOVE(wk);
9308		if (wk->wk_state & MKDIR_PARENT) {
9309			wk->wk_state &= ~MKDIR_PARENT;
9310			WORKLIST_INSERT(&dotdotwk, wk);
9311			continue;
9312		}
9313		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9314	}
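	/*
	 * Put the saved MKDIR_PARENT items back on dm_jwork so that they
	 * travel with the dirrem if it is reused below for the ".." link
	 * removal on the parent.
	 */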
9315	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9316	/*
9317	 * Normal file deletion.
9318	 */
9319	if ((dirrem->dm_state & RMDIR) == 0) {
9320		ip->i_nlink--;
9321		DIP_SET(ip, i_nlink, ip->i_nlink);
9322		ip->i_flag |= IN_CHANGE;
9323		if (ip->i_nlink < ip->i_effnlink)
9324			panic("handle_workitem_remove: bad file delta");
9325		if (ip->i_nlink == 0)
9326			unlinked_inodedep(mp, inodedep);
9327		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9328		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9329		    ("handle_workitem_remove: worklist not empty. %s",
9330		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9331		WORKITEM_FREE(dirrem, D_DIRREM);
9332		FREE_LOCK(&lk);
9333		goto out;
9334	}
9335	/*
9336	 * Directory deletion. Decrement reference count for both the
9337	 * just deleted parent directory entry and the reference for ".".
9338	 * Arrange to have the reference count on the parent decremented
9339	 * to account for the loss of "..".
9340	 */
9341	ip->i_nlink -= 2;
9342	DIP_SET(ip, i_nlink, ip->i_nlink);
9343	ip->i_flag |= IN_CHANGE;
9344	if (ip->i_nlink < ip->i_effnlink)
9345		panic("handle_workitem_remove: bad dir delta");
9346	if (ip->i_nlink == 0)
9347		unlinked_inodedep(mp, inodedep);
9348	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9349	/*
9350	 * Rename a directory to a new parent. Since, we are both deleting
9351	 * and creating a new directory entry, the link count on the new
9352	 * directory should not change. Thus we skip the followup dirrem.
9353	 */
9354	if (dirrem->dm_state & DIRCHG) {
9355		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9356		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9357		WORKITEM_FREE(dirrem, D_DIRREM);
9358		FREE_LOCK(&lk);
9359		goto out;
9360	}
9361	dirrem->dm_state = ONDEPLIST;
9362	dirrem->dm_oldinum = dirrem->dm_dirinum;
9363	/*
9364	 * Place the dirrem on the parent's diremhd list.
9365	 */
9366	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9367		panic("handle_workitem_remove: lost dir inodedep");
9368	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9369	/*
9370	 * If the allocated inode has never been written to disk, then
9371	 * the on-disk inode is zero'ed and we can remove the file
9372	 * immediately.  When journaling if the inode has been marked
9373	 * unlinked and not DEPCOMPLETE we know it can never be written.
9374	 */
9375	inodedep_lookup(mp, oldinum, 0, &inodedep);
9376	if (inodedep == NULL ||
9377	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9378	    check_inode_unwritten(inodedep)) {
9379		FREE_LOCK(&lk);
9380		vput(vp);
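		/*
		 * The dirrem was retargeted at the parent's ".." reference
		 * above, so process that removal immediately instead of
		 * waiting for this zeroed inode to be written.
		 */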
9381		return (handle_workitem_remove(dirrem, flags));
9382	}
9383	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9384	FREE_LOCK(&lk);
9385	ip->i_flag |= IN_CHANGE;
9386out:
9387	ffs_update(vp, 0);
9388	vput(vp);
9389	return (0);
9390}
9391
9392/*
9393 * Inode de-allocation dependencies.
9394 *
9395 * When an inode's link count is reduced to zero, it can be de-allocated. We
9396 * found it convenient to postpone de-allocation until after the inode is
9397 * written to disk with its new link count (zero).  At this point, all of the
9398 * on-disk inode's block pointers are nullified and, with careful dependency
9399 * list ordering, all dependencies related to the inode will be satisfied and
9400 * the corresponding dependency structures de-allocated.  So, if/when the
9401 * inode is reused, there will be no mixing of old dependencies with new
9402 * ones.  This artificial dependency is set up by the block de-allocation
9403 * procedure above (softdep_setup_freeblocks) and completed by the
9404 * following procedure.
9405 */
9406static void
9407handle_workitem_freefile(freefile)
9408	struct freefile *freefile;
9409{
9410	struct workhead wkhd;
9411	struct fs *fs;
9412	struct inodedep *idp;
9413	struct ufsmount *ump;
9414	int error;
9415
9416	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9417	fs = ump->um_fs;
9418#ifdef DEBUG
9419	ACQUIRE_LOCK(&lk);
9420	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9421	FREE_LOCK(&lk);
9422	if (error)
9423		panic("handle_workitem_freefile: inodedep %p survived", idp);
9424#endif
9425	UFS_LOCK(ump);
9426	fs->fs_pendinginodes -= 1;
9427	UFS_UNLOCK(ump);
9428	LIST_INIT(&wkhd);
9429	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9430	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9431	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9432		softdep_error("handle_workitem_freefile", error);
9433	ACQUIRE_LOCK(&lk);
9434	WORKITEM_FREE(freefile, D_FREEFILE);
9435	FREE_LOCK(&lk);
9436}
9437
9439/*
9440 * Helper function which unlinks marker element from work list and returns
9441 * the next element on the list.
9442 */
9443static __inline struct worklist *
9444markernext(struct worklist *marker)
9445{
9446	struct worklist *next;
9447
9448	next = LIST_NEXT(marker, wk_list);
9449	LIST_REMOVE(marker, wk_list);
9450	return (next);
9451}
9452
9453/*
9454 * Disk writes.
9455 *
9456 * The dependency structures constructed above are most actively used when file
9457 * system blocks are written to disk.  No constraints are placed on when a
9458 * block can be written, but unsatisfied update dependencies are made safe by
9459 * modifying (or replacing) the source memory for the duration of the disk
9460 * write.  When the disk write completes, the memory block is again brought
9461 * up-to-date.
9462 *
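 * In rough outline: softdep_disk_io_initiation() rolls back any unsafe
 * dependencies in a buffer just before it is handed to the driver, and
 * softdep_disk_write_complete() later restores the rolled back data and
 * retires whatever dependencies the write has satisfied.
 *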
9463 * In-core inode structure reclamation.
9464 *
9465 * Because there are a finite number of "in-core" inode structures, they are
9466 * reused regularly.  By transferring all inode-related dependencies to the
9467 * in-memory inode block and indexing them separately (via "inodedep"s), we
9468 * can allow "in-core" inode structures to be reused at any time and avoid
9469 * any increase in contention.
9470 *
9471 * Called just before entering the device driver to initiate a new disk I/O.
9472 * The buffer must be locked, thus, no I/O completion operations can occur
9473 * while we are manipulating its associated dependencies.
9474 */
9475static void
9476softdep_disk_io_initiation(bp)
9477	struct buf *bp;		/* structure describing disk write to occur */
9478{
9479	struct worklist *wk;
9480	struct worklist marker;
9481	struct inodedep *inodedep;
9482	struct freeblks *freeblks;
9483	struct jblkdep *jblkdep;
9484	struct newblk *newblk;
9485
9486	/*
9487	 * We only care about write operations. There should never
9488	 * be dependencies for reads.
9489	 */
9490	if (bp->b_iocmd != BIO_WRITE)
9491		panic("softdep_disk_io_initiation: not write");
9492
9493	if (bp->b_vflags & BV_BKGRDINPROG)
9494		panic("softdep_disk_io_initiation: Writing buffer with "
9495		    "background write in progress: %p", bp);
9496
9497	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9498	PHOLD(curproc);			/* Don't swap out kernel stack */
9499
9500	ACQUIRE_LOCK(&lk);
9501	/*
9502	 * Do any necessary pre-I/O processing.
9503	 */
9504	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9505	     wk = markernext(&marker)) {
9506		LIST_INSERT_AFTER(wk, &marker, wk_list);
9507		switch (wk->wk_type) {
9508
9509		case D_PAGEDEP:
9510			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9511			continue;
9512
9513		case D_INODEDEP:
9514			inodedep = WK_INODEDEP(wk);
9515			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9516				initiate_write_inodeblock_ufs1(inodedep, bp);
9517			else
9518				initiate_write_inodeblock_ufs2(inodedep, bp);
9519			continue;
9520
9521		case D_INDIRDEP:
9522			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9523			continue;
9524
9525		case D_BMSAFEMAP:
9526			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9527			continue;
9528
9529		case D_JSEG:
9530			WK_JSEG(wk)->js_buf = NULL;
9531			continue;
9532
9533		case D_FREEBLKS:
9534			freeblks = WK_FREEBLKS(wk);
9535			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9536			/*
9537			 * We have to wait for the freeblks to be journaled
9538			 * before we can write an inodeblock with updated
9539			 * pointers.  Be careful to arrange the marker so
9540			 * we revisit the freeblks if it's not removed by
9541			 * the first jwait().
9542			 */
9543			if (jblkdep != NULL) {
9544				LIST_REMOVE(&marker, wk_list);
9545				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9546				jwait(&jblkdep->jb_list, MNT_WAIT);
9547			}
9548			continue;
9549		case D_ALLOCDIRECT:
9550		case D_ALLOCINDIR:
9551			/*
9552			 * We have to wait for the jnewblk to be journaled
9553			 * before we can write to a block if the contents
9554			 * may be confused with an earlier file's indirect
9555			 * at recovery time.  Handle the marker as described
9556			 * above.
9557			 */
9558			newblk = WK_NEWBLK(wk);
9559			if (newblk->nb_jnewblk != NULL &&
9560			    indirblk_lookup(newblk->nb_list.wk_mp,
9561			    newblk->nb_newblkno)) {
9562				LIST_REMOVE(&marker, wk_list);
9563				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9564				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
9565			}
9566			continue;
9567
9568		case D_SBDEP:
9569			initiate_write_sbdep(WK_SBDEP(wk));
9570			continue;
9571
9572		case D_MKDIR:
9573		case D_FREEWORK:
9574		case D_FREEDEP:
9575		case D_JSEGDEP:
9576			continue;
9577
9578		default:
9579			panic("softdep_disk_io_initiation: Unexpected type %s",
9580			    TYPENAME(wk->wk_type));
9581			/* NOTREACHED */
9582		}
9583	}
9584	FREE_LOCK(&lk);
9585	PRELE(curproc);			/* Allow swapout of kernel stack */
9586}
9587
9588/*
9589 * Called from within the procedure above to deal with unsatisfied
9590 * allocation dependencies in a directory. The buffer must be locked,
9591 * thus, no I/O completion operations can occur while we are
9592 * manipulating its associated dependencies.
9593 */
9594static void
9595initiate_write_filepage(pagedep, bp)
9596	struct pagedep *pagedep;
9597	struct buf *bp;
9598{
9599	struct jremref *jremref;
9600	struct jmvref *jmvref;
9601	struct dirrem *dirrem;
9602	struct diradd *dap;
9603	struct direct *ep;
9604	int i;
9605
9606	if (pagedep->pd_state & IOSTARTED) {
9607		/*
9608		 * This can only happen if there is a driver that does not
9609		 * understand chaining. Here biodone will reissue the call
9610		 * to strategy for the incomplete buffers.
9611		 */
9612		printf("initiate_write_filepage: already started\n");
9613		return;
9614	}
9615	pagedep->pd_state |= IOSTARTED;
9616	/*
9617	 * Wait for all journal remove dependencies to hit the disk.
9618	 * We cannot allow any potentially conflicting directory adds
9619	 * to be visible before removes, as rollback would be too difficult.
9620	 * lk may be dropped and re-acquired; however, we hold the buf
9621	 * locked so the dependency cannot go away.
9622	 */
9623	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
9624		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
9625			jwait(&jremref->jr_list, MNT_WAIT);
9626	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
9627		jwait(&jmvref->jm_list, MNT_WAIT);
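	/*
	 * Roll back any uncommitted directory additions: each pending
	 * diradd has its on-disk entry reverted to the previous inode
	 * number (DIRCHG) or to an empty entry, and is marked UNDONE.
	 */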
9628	for (i = 0; i < DAHASHSZ; i++) {
9629		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
9630			ep = (struct direct *)
9631			    ((char *)bp->b_data + dap->da_offset);
9632			if (ep->d_ino != dap->da_newinum)
9633				panic("%s: dir inum %ju != new %ju",
9634				    "initiate_write_filepage",
9635				    (uintmax_t)ep->d_ino,
9636				    (uintmax_t)dap->da_newinum);
9637			if (dap->da_state & DIRCHG)
9638				ep->d_ino = dap->da_previous->dm_oldinum;
9639			else
9640				ep->d_ino = 0;
9641			dap->da_state &= ~ATTACHED;
9642			dap->da_state |= UNDONE;
9643		}
9644	}
9645}
9646
9647/*
9648 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
9649 * Note that any bug fixes made to this routine must be done in the
9650 * version found below.
9651 *
9652 * Called from within the procedure above to deal with unsatisfied
9653 * allocation dependencies in an inodeblock. The buffer must be
9654 * locked, thus, no I/O completion operations can occur while we
9655 * are manipulating its associated dependencies.
9656 */
9657static void
9658initiate_write_inodeblock_ufs1(inodedep, bp)
9659	struct inodedep *inodedep;
9660	struct buf *bp;			/* The inode block */
9661{
9662	struct allocdirect *adp, *lastadp;
9663	struct ufs1_dinode *dp;
9664	struct ufs1_dinode *sip;
9665	struct inoref *inoref;
9666	struct fs *fs;
9667	ufs_lbn_t i;
9668#ifdef INVARIANTS
9669	ufs_lbn_t prevlbn = 0;
9670#endif
9671	int deplist;
9672
9673	if (inodedep->id_state & IOSTARTED)
9674		panic("initiate_write_inodeblock_ufs1: already started");
9675	inodedep->id_state |= IOSTARTED;
9676	fs = inodedep->id_fs;
9677	dp = (struct ufs1_dinode *)bp->b_data +
9678	    ino_to_fsbo(fs, inodedep->id_ino);
9679
9680	/*
9681	 * If we're on the unlinked list but have not yet written our
9682	 * next pointer initialize it here.
9683	 */
9684	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9685		struct inodedep *inon;
9686
9687		inon = TAILQ_NEXT(inodedep, id_unlinked);
9688		dp->di_freelink = inon ? inon->id_ino : 0;
9689	}
9690	/*
9691	 * If the bitmap is not yet written, then the allocated
9692	 * inode cannot be written to disk.
9693	 */
9694	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
9695		if (inodedep->id_savedino1 != NULL)
9696			panic("initiate_write_inodeblock_ufs1: I/O underway");
9697		FREE_LOCK(&lk);
9698		sip = malloc(sizeof(struct ufs1_dinode),
9699		    M_SAVEDINO, M_SOFTDEP_FLAGS);
9700		ACQUIRE_LOCK(&lk);
9701		inodedep->id_savedino1 = sip;
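		/*
		 * Save the current dinode and write an all-zero copy in its
		 * place, preserving only the generation number and the
		 * unlinked-list link.
		 */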
9702		*inodedep->id_savedino1 = *dp;
9703		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
9704		dp->di_gen = inodedep->id_savedino1->di_gen;
9705		dp->di_freelink = inodedep->id_savedino1->di_freelink;
9706		return;
9707	}
9708	/*
9709	 * If no dependencies, then there is nothing to roll back.
9710	 */
9711	inodedep->id_savedsize = dp->di_size;
9712	inodedep->id_savedextsize = 0;
9713	inodedep->id_savednlink = dp->di_nlink;
9714	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
9715	    TAILQ_EMPTY(&inodedep->id_inoreflst))
9716		return;
9717	/*
9718	 * Revert the link count to that of the first unwritten journal entry.
9719	 */
9720	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
9721	if (inoref)
9722		dp->di_nlink = inoref->if_nlink;
9723	/*
9724	 * Set the dependencies to busy.
9725	 */
9726	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9727	     adp = TAILQ_NEXT(adp, ad_next)) {
9728#ifdef INVARIANTS
9729		if (deplist != 0 && prevlbn >= adp->ad_offset)
9730			panic("softdep_write_inodeblock: lbn order");
9731		prevlbn = adp->ad_offset;
9732		if (adp->ad_offset < NDADDR &&
9733		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
9734			panic("%s: direct pointer #%jd mismatch %d != %jd",
9735			    "softdep_write_inodeblock",
9736			    (intmax_t)adp->ad_offset,
9737			    dp->di_db[adp->ad_offset],
9738			    (intmax_t)adp->ad_newblkno);
9739		if (adp->ad_offset >= NDADDR &&
9740		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
9741			panic("%s: indirect pointer #%jd mismatch %d != %jd",
9742			    "softdep_write_inodeblock",
9743			    (intmax_t)adp->ad_offset - NDADDR,
9744			    dp->di_ib[adp->ad_offset - NDADDR],
9745			    (intmax_t)adp->ad_newblkno);
9746		deplist |= 1 << adp->ad_offset;
9747		if ((adp->ad_state & ATTACHED) == 0)
9748			panic("softdep_write_inodeblock: Unknown state 0x%x",
9749			    adp->ad_state);
9750#endif /* INVARIANTS */
9751		adp->ad_state &= ~ATTACHED;
9752		adp->ad_state |= UNDONE;
9753	}
9754	/*
9755	 * The on-disk inode cannot claim to be any larger than the last
9756	 * fragment that has been written. Otherwise, the on-disk inode
9757	 * might have fragments that were not the last block in the file
9758	 * which would corrupt the filesystem.
9759	 */
9760	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9761	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9762		if (adp->ad_offset >= NDADDR)
9763			break;
9764		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
9765		/* keep going until hitting a rollback to a frag */
9766		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
9767			continue;
9768		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
9769		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
9770#ifdef INVARIANTS
9771			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
9772				panic("softdep_write_inodeblock: lost dep1");
9773#endif /* INVARIANTS */
9774			dp->di_db[i] = 0;
9775		}
9776		for (i = 0; i < NIADDR; i++) {
9777#ifdef INVARIANTS
9778			if (dp->di_ib[i] != 0 &&
9779			    (deplist & ((1 << NDADDR) << i)) == 0)
9780				panic("softdep_write_inodeblock: lost dep2");
9781#endif /* INVARIANTS */
9782			dp->di_ib[i] = 0;
9783		}
9784		return;
9785	}
9786	/*
9787	 * If we have zero'ed out the last allocated block of the file,
9788	 * roll back the size to the last currently allocated block.
9789	 * We know that this last allocated block is full-sized, as
9790	 * we already checked for fragments in the loop above.
9791	 */
9792	if (lastadp != NULL &&
9793	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
9794		for (i = lastadp->ad_offset; i >= 0; i--)
9795			if (dp->di_db[i] != 0)
9796				break;
9797		dp->di_size = (i + 1) * fs->fs_bsize;
9798	}
9799	/*
9800	 * The only dependencies are for indirect blocks.
9801	 *
9802	 * The file size for indirect block additions is not guaranteed.
9803	 * Such a guarantee would be non-trivial to achieve. The conventional
9804	 * synchronous write implementation also does not make this guarantee.
9805	 * Fsck should catch and fix discrepancies. Arguably, the file size
9806	 * can be over-estimated without destroying integrity when the file
9807	 * moves into the indirect blocks (i.e., is large). If we want to
9808	 * postpone fsck, we are stuck with this argument.
9809	 */
9810	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
9811		dp->di_ib[adp->ad_offset - NDADDR] = 0;
9812}
9813
9814/*
9815 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
9816 * Note that any bug fixes made to this routine must be done in the
9817 * version found above.
9818 *
9819 * Called from within the procedure above to deal with unsatisfied
9820 * allocation dependencies in an inodeblock. The buffer must be
9821 * locked, thus, no I/O completion operations can occur while we
9822 * are manipulating its associated dependencies.
9823 */
9824static void
9825initiate_write_inodeblock_ufs2(inodedep, bp)
9826	struct inodedep *inodedep;
9827	struct buf *bp;			/* The inode block */
9828{
9829	struct allocdirect *adp, *lastadp;
9830	struct ufs2_dinode *dp;
9831	struct ufs2_dinode *sip;
9832	struct inoref *inoref;
9833	struct fs *fs;
9834	ufs_lbn_t i;
9835#ifdef INVARIANTS
9836	ufs_lbn_t prevlbn = 0;
9837#endif
9838	int deplist;
9839
9840	if (inodedep->id_state & IOSTARTED)
9841		panic("initiate_write_inodeblock_ufs2: already started");
9842	inodedep->id_state |= IOSTARTED;
9843	fs = inodedep->id_fs;
9844	dp = (struct ufs2_dinode *)bp->b_data +
9845	    ino_to_fsbo(fs, inodedep->id_ino);
9846
9847	/*
9848	 * If we're on the unlinked list but have not yet written our
9849	 * next pointer initialize it here.
9850	 */
9851	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9852		struct inodedep *inon;
9853
9854		inon = TAILQ_NEXT(inodedep, id_unlinked);
9855		dp->di_freelink = inon ? inon->id_ino : 0;
9856	}
9857	/*
9858	 * If the bitmap is not yet written, then the allocated
9859	 * inode cannot be written to disk.
9860	 */
9861	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
9862		if (inodedep->id_savedino2 != NULL)
9863			panic("initiate_write_inodeblock_ufs2: I/O underway");
9864		FREE_LOCK(&lk);
9865		sip = malloc(sizeof(struct ufs2_dinode),
9866		    M_SAVEDINO, M_SOFTDEP_FLAGS);
9867		ACQUIRE_LOCK(&lk);
9868		inodedep->id_savedino2 = sip;
9869		*inodedep->id_savedino2 = *dp;
9870		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
9871		dp->di_gen = inodedep->id_savedino2->di_gen;
9872		dp->di_freelink = inodedep->id_savedino2->di_freelink;
9873		return;
9874	}
9875	/*
9876	 * If no dependencies, then there is nothing to roll back.
9877	 */
9878	inodedep->id_savedsize = dp->di_size;
9879	inodedep->id_savedextsize = dp->di_extsize;
9880	inodedep->id_savednlink = dp->di_nlink;
9881	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
9882	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
9883	    TAILQ_EMPTY(&inodedep->id_inoreflst))
9884		return;
9885	/*
9886	 * Revert the link count to that of the first unwritten journal entry.
9887	 */
9888	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
9889	if (inoref)
9890		dp->di_nlink = inoref->if_nlink;
9891
9892	/*
9893	 * Set the ext data dependencies to busy.
9894	 */
9895	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
9896	     adp = TAILQ_NEXT(adp, ad_next)) {
9897#ifdef INVARIANTS
9898		if (deplist != 0 && prevlbn >= adp->ad_offset)
9899			panic("softdep_write_inodeblock: lbn order");
9900		prevlbn = adp->ad_offset;
9901		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
9902			panic("%s: direct pointer #%jd mismatch %jd != %jd",
9903			    "softdep_write_inodeblock",
9904			    (intmax_t)adp->ad_offset,
9905			    (intmax_t)dp->di_extb[adp->ad_offset],
9906			    (intmax_t)adp->ad_newblkno);
9907		deplist |= 1 << adp->ad_offset;
9908		if ((adp->ad_state & ATTACHED) == 0)
9909			panic("softdep_write_inodeblock: Unknown state 0x%x",
9910			    adp->ad_state);
9911#endif /* INVARIANTS */
9912		adp->ad_state &= ~ATTACHED;
9913		adp->ad_state |= UNDONE;
9914	}
9915	/*
9916	 * The on-disk inode cannot claim to be any larger than the last
9917	 * fragment that has been written. Otherwise, the on-disk inode
9918	 * might have fragments that were not the last block in the ext
9919	 * data which would corrupt the filesystem.
9920	 */
9921	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
9922	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9923		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
9924		/* keep going until hitting a rollback to a frag */
9925		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
9926			continue;
9927		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
9928		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
9929#ifdef INVARIANTS
9930			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
9931				panic("softdep_write_inodeblock: lost dep1");
9932#endif /* INVARIANTS */
9933			dp->di_extb[i] = 0;
9934		}
9935		lastadp = NULL;
9936		break;
9937	}
9938	/*
9939	 * If we have zero'ed out the last allocated block of the ext
9940	 * data, roll back the size to the last currently allocated block.
9941	 * We know that this last allocated block is full-sized, as
9942	 * we already checked for fragments in the loop above.
9943	 */
9944	if (lastadp != NULL &&
9945	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
9946		for (i = lastadp->ad_offset; i >= 0; i--)
9947			if (dp->di_extb[i] != 0)
9948				break;
9949		dp->di_extsize = (i + 1) * fs->fs_bsize;
9950	}
9951	/*
9952	 * Set the file data dependencies to busy.
9953	 */
9954	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9955	     adp = TAILQ_NEXT(adp, ad_next)) {
9956#ifdef INVARIANTS
9957		if (deplist != 0 && prevlbn >= adp->ad_offset)
9958			panic("softdep_write_inodeblock: lbn order");
9959		if ((adp->ad_state & ATTACHED) == 0)
9960			panic("inodedep %p and adp %p not attached", inodedep, adp);
9961		prevlbn = adp->ad_offset;
9962		if (adp->ad_offset < NDADDR &&
9963		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
9964			panic("%s: direct pointer #%jd mismatch %jd != %jd",
9965			    "softdep_write_inodeblock",
9966			    (intmax_t)adp->ad_offset,
9967			    (intmax_t)dp->di_db[adp->ad_offset],
9968			    (intmax_t)adp->ad_newblkno);
9969		if (adp->ad_offset >= NDADDR &&
9970		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
9971			panic("%s indirect pointer #%jd mismatch %jd != %jd",
9972			    "softdep_write_inodeblock:",
9973			    (intmax_t)adp->ad_offset - NDADDR,
9974			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
9975			    (intmax_t)adp->ad_newblkno);
9976		deplist |= 1 << adp->ad_offset;
9977		if ((adp->ad_state & ATTACHED) == 0)
9978			panic("softdep_write_inodeblock: Unknown state 0x%x",
9979			    adp->ad_state);
9980#endif /* INVARIANTS */
9981		adp->ad_state &= ~ATTACHED;
9982		adp->ad_state |= UNDONE;
9983	}
9984	/*
9985	 * The on-disk inode cannot claim to be any larger than the last
9986	 * fragment that has been written. Otherwise, the on-disk inode
9987	 * might have fragments that were not the last block in the file
9988	 * which would corrupt the filesystem.
9989	 */
9990	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9991	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9992		if (adp->ad_offset >= NDADDR)
9993			break;
9994		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
9995		/* keep going until hitting a rollback to a frag */
9996		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
9997			continue;
9998		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
9999		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10000#ifdef INVARIANTS
10001			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10002				panic("softdep_write_inodeblock: lost dep2");
10003#endif /* INVARIANTS */
10004			dp->di_db[i] = 0;
10005		}
10006		for (i = 0; i < NIADDR; i++) {
10007#ifdef INVARIANTS
10008			if (dp->di_ib[i] != 0 &&
10009			    (deplist & ((1 << NDADDR) << i)) == 0)
10010				panic("softdep_write_inodeblock: lost dep3");
10011#endif /* INVARIANTS */
10012			dp->di_ib[i] = 0;
10013		}
10014		return;
10015	}
10016	/*
10017	 * If we have zero'ed out the last allocated block of the file,
10018	 * roll back the size to the last currently allocated block.
10019	 * We know that this last allocated block is full-sized, as
10020	 * we already checked for fragments in the loop above.
10021	 */
10022	if (lastadp != NULL &&
10023	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10024		for (i = lastadp->ad_offset; i >= 0; i--)
10025			if (dp->di_db[i] != 0)
10026				break;
10027		dp->di_size = (i + 1) * fs->fs_bsize;
10028	}
10029	/*
10030	 * The only dependencies are for indirect blocks.
10031	 *
10032	 * The file size for indirect block additions is not guaranteed.
10033	 * Such a guarantee would be non-trivial to achieve. The conventional
10034	 * synchronous write implementation also does not make this guarantee.
10035	 * Fsck should catch and fix discrepancies. Arguably, the file size
10036	 * can be over-estimated without destroying integrity when the file
10037	 * moves into the indirect blocks (i.e., is large). If we want to
10038	 * postpone fsck, we are stuck with this argument.
10039	 */
10040	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10041		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10042}
10043
10044/*
10045 * Cancel an indirdep as a result of truncation.  Release all of the
10046 * children allocindirs and place their journal work on the appropriate
10047 * list.
10048 */
10049static void
10050cancel_indirdep(indirdep, bp, freeblks)
10051	struct indirdep *indirdep;
10052	struct buf *bp;
10053	struct freeblks *freeblks;
10054{
10055	struct allocindir *aip;
10056
10057	/*
10058	 * None of the indirect pointers will ever be visible,
10059	 * so they can simply be tossed. GOINGAWAY ensures
10060	 * that allocated pointers will be saved in the buffer
10061	 * cache until they are freed. Note that they will
10062	 * only be able to be found by their physical address
10063	 * since the inode mapping the logical address will
10064	 * be gone. The save buffer used for the safe copy
10065	 * was allocated in setup_allocindir_phase2 using
10066	 * the physical address so it could be used for this
10067	 * purpose. Hence we swap the safe copy with the real
10068	 * copy, allowing the safe copy to be freed and holding
10069	 * on to the real copy for later use in indir_trunc.
10070	 */
10071	if (indirdep->ir_state & GOINGAWAY)
10072		panic("cancel_indirdep: already gone");
10073	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10074		indirdep->ir_state |= DEPCOMPLETE;
10075		LIST_REMOVE(indirdep, ir_next);
10076	}
10077	indirdep->ir_state |= GOINGAWAY;
10078	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
10079	/*
10080	 * Pass in bp for blocks that still have journal writes
10081	 * pending, so we can cancel them on their own.
10082	 */
10083	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10084		cancel_allocindir(aip, bp, freeblks, 0);
10085	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10086		cancel_allocindir(aip, NULL, freeblks, 0);
10087	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10088		cancel_allocindir(aip, NULL, freeblks, 0);
10089	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10090		cancel_allocindir(aip, NULL, freeblks, 0);
10091	/*
10092	 * If there are pending partial truncations we need to keep the
10093	 * old block copy around until they complete.  This is because
10094	 * the current b_data is not a perfect superset of the available
10095	 * blocks.
10096	 */
10097	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10098		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10099	else
10100		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10101	WORKLIST_REMOVE(&indirdep->ir_list);
10102	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10103	indirdep->ir_bp = NULL;
10104	indirdep->ir_freeblks = freeblks;
10105}
10106
10107/*
10108 * Free an indirdep once it no longer has new pointers to track.
10109 */
10110static void
10111free_indirdep(indirdep)
10112	struct indirdep *indirdep;
10113{
10114
10115	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10116	    ("free_indirdep: Indir trunc list not empty."));
10117	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10118	    ("free_indirdep: Complete head not empty."));
10119	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10120	    ("free_indirdep: write head not empty."));
10121	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10122	    ("free_indirdep: done head not empty."));
10123	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10124	    ("free_indirdep: deplist head not empty."));
10125	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10126	    ("free_indirdep: %p still on newblk list.", indirdep));
10127	KASSERT(indirdep->ir_saveddata == NULL,
10128	    ("free_indirdep: %p still has saved data.", indirdep));
10129	if (indirdep->ir_state & ONWORKLIST)
10130		WORKLIST_REMOVE(&indirdep->ir_list);
10131	WORKITEM_FREE(indirdep, D_INDIRDEP);
10132}
10133
10134/*
10135 * Called before a write to an indirdep.  This routine is responsible for
10136 * rolling back pointers to a safe state which includes only those
10137 * allocindirs which have been completed.
10138 */
10139static void
10140initiate_write_indirdep(indirdep, bp)
10141	struct indirdep *indirdep;
10142	struct buf *bp;
10143{
10144
10145	indirdep->ir_state |= IOSTARTED;
10146	if (indirdep->ir_state & GOINGAWAY)
10147		panic("initiate_write_indirdep: indirdep gone");
10148	/*
10149	 * If there are no remaining dependencies, this will be writing
10150	 * the real pointers.
10151	 */
10152	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10153	    TAILQ_EMPTY(&indirdep->ir_trunc))
10154		return;
10155	/*
10156	 * Replace up-to-date version with safe version.
10157	 */
10158	if (indirdep->ir_saveddata == NULL) {
10159		FREE_LOCK(&lk);
10160		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10161		    M_SOFTDEP_FLAGS);
10162		ACQUIRE_LOCK(&lk);
10163	}
10164	indirdep->ir_state &= ~ATTACHED;
10165	indirdep->ir_state |= UNDONE;
10166	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10167	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10168	    bp->b_bcount);
10169}
10170
10171/*
10172 * Called when an inode has been cleared in a cg bitmap.  This finally
10173 * eliminates any canceled jaddrefs.
10174 */
10175void
10176softdep_setup_inofree(mp, bp, ino, wkhd)
10177	struct mount *mp;
10178	struct buf *bp;
10179	ino_t ino;
10180	struct workhead *wkhd;
10181{
10182	struct worklist *wk, *wkn;
10183	struct inodedep *inodedep;
10184	uint8_t *inosused;
10185	struct cg *cgp;
10186	struct fs *fs;
10187
10188	ACQUIRE_LOCK(&lk);
10189	fs = VFSTOUFS(mp)->um_fs;
10190	cgp = (struct cg *)bp->b_data;
10191	inosused = cg_inosused(cgp);
10192	if (isset(inosused, ino % fs->fs_ipg))
10193		panic("softdep_setup_inofree: inode %ju not freed.",
10194		    (uintmax_t)ino);
10195	if (inodedep_lookup(mp, ino, 0, &inodedep))
10196		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10197		    (uintmax_t)ino, inodedep);
10198	if (wkhd) {
10199		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10200			if (wk->wk_type != D_JADDREF)
10201				continue;
10202			WORKLIST_REMOVE(wk);
10203			/*
10204			 * We can free immediately even if the jaddref
10205			 * isn't attached in a background write as now
10206			 * the bitmaps are reconciled.
10207			 */
10208			wk->wk_state |= COMPLETE | ATTACHED;
10209			free_jaddref(WK_JADDREF(wk));
10210		}
10211		jwork_move(&bp->b_dep, wkhd);
10212	}
10213	FREE_LOCK(&lk);
10214}
10215
10216
10217/*
10218 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10219 * map.  Any dependencies waiting for the write to clear are added to the
10220 * buf's list and any jnewblks that are being canceled are discarded
10221 * immediately.
10222 */
10223void
10224softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10225	struct mount *mp;
10226	struct buf *bp;
10227	ufs2_daddr_t blkno;
10228	int frags;
10229	struct workhead *wkhd;
10230{
10231	struct bmsafemap *bmsafemap;
10232	struct jnewblk *jnewblk;
10233	struct worklist *wk;
10234	struct fs *fs;
10235#ifdef SUJ_DEBUG
10236	uint8_t *blksfree;
10237	struct cg *cgp;
10238	ufs2_daddr_t jstart;
10239	ufs2_daddr_t jend;
10240	ufs2_daddr_t end;
10241	long bno;
10242	int i;
10243#endif
10244
10245	ACQUIRE_LOCK(&lk);
10246	/* Lookup the bmsafemap so we track when it is dirty. */
10247	fs = VFSTOUFS(mp)->um_fs;
10248	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10249	/*
10250	 * Detach any jnewblks which have been canceled.  They must linger
10251	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10252	 * an unjournaled allocation from hitting the disk.
10253	 */
10254	if (wkhd) {
10255		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10256			WORKLIST_REMOVE(wk);
10257			if (wk->wk_type != D_JNEWBLK) {
10258				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10259				continue;
10260			}
10261			jnewblk = WK_JNEWBLK(wk);
10262			KASSERT(jnewblk->jn_state & GOINGAWAY,
10263			    ("softdep_setup_blkfree: jnewblk not canceled."));
10264#ifdef SUJ_DEBUG
10265			/*
10266			 * Assert that this block is free in the bitmap
10267			 * before we discard the jnewblk.
10268			 */
10269			cgp = (struct cg *)bp->b_data;
10270			blksfree = cg_blksfree(cgp);
10271			bno = dtogd(fs, jnewblk->jn_blkno);
10272			for (i = jnewblk->jn_oldfrags;
10273			    i < jnewblk->jn_frags; i++) {
10274				if (isset(blksfree, bno + i))
10275					continue;
10276				panic("softdep_setup_blkfree: not free");
10277			}
10278#endif
10279			/*
10280			 * Even if it's not attached we can free immediately
10281			 * as the new bitmap is correct.
10282			 */
10283			wk->wk_state |= COMPLETE | ATTACHED;
10284			free_jnewblk(jnewblk);
10285		}
10286	}
10287
10288#ifdef SUJ_DEBUG
10289	/*
10290	 * Assert that we are not freeing a block which has an outstanding
10291	 * allocation dependency.
10292	 */
10293	fs = VFSTOUFS(mp)->um_fs;
10294	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10295	end = blkno + frags;
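	/*
	 * [blkno, end) and [jstart, jend) below are fragment ranges;
	 * any overlap means we are freeing fragments whose allocation
	 * still has an unwritten journal record.
	 */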
10296	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10297		/*
10298		 * Don't match against blocks that will be freed when the
10299		 * background write is done.
10300		 */
10301		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10302		    (COMPLETE | DEPCOMPLETE))
10303			continue;
10304		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10305		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10306		if ((blkno >= jstart && blkno < jend) ||
10307		    (end > jstart && end <= jend)) {
10308			printf("state 0x%X %jd - %d %d dep %p\n",
10309			    jnewblk->jn_state, jnewblk->jn_blkno,
10310			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10311			    jnewblk->jn_dep);
10312			panic("softdep_setup_blkfree: "
10313			    "%jd-%jd(%d) overlaps with %jd-%jd",
10314			    blkno, end, frags, jstart, jend);
10315		}
10316	}
10317#endif
10318	FREE_LOCK(&lk);
10319}
10320
10321/*
10322 * Revert a block allocation when the journal record that describes it
10323 * is not yet written.
10324 */
10325int
10326jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10327	struct jnewblk *jnewblk;
10328	struct fs *fs;
10329	struct cg *cgp;
10330	uint8_t *blksfree;
10331{
10332	ufs1_daddr_t fragno;
10333	long cgbno, bbase;
10334	int frags, blk;
10335	int i;
10336
10337	frags = 0;
10338	cgbno = dtogd(fs, jnewblk->jn_blkno);
10339	/*
10340	 * We have to test which frags need to be rolled back.  We may
10341	 * be operating on a stale copy when doing background writes.
10342	 */
10343	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10344		if (isclr(blksfree, cgbno + i))
10345			frags++;
10346	if (frags == 0)
10347		return (0);
10348	/*
10349	 * This is mostly ffs_blkfree() sans some validation and
10350	 * superblock updates.
10351	 */
10352	if (frags == fs->fs_frag) {
10353		fragno = fragstoblks(fs, cgbno);
10354		ffs_setblock(fs, blksfree, fragno);
10355		ffs_clusteracct(fs, cgp, fragno, 1);
10356		cgp->cg_cs.cs_nbfree++;
10357	} else {
10358		cgbno += jnewblk->jn_oldfrags;
10359		bbase = cgbno - fragnum(fs, cgbno);
10360		/* Decrement the old frags.  */
10361		blk = blkmap(fs, blksfree, bbase);
10362		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10363		/* Deallocate the fragment */
10364		for (i = 0; i < frags; i++)
10365			setbit(blksfree, cgbno + i);
10366		cgp->cg_cs.cs_nffree += frags;
10367		/* Add back in counts associated with the new frags */
10368		blk = blkmap(fs, blksfree, bbase);
10369		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10370		/* If a complete block has been reassembled, account for it. */
10371		fragno = fragstoblks(fs, bbase);
10372		if (ffs_isblock(fs, blksfree, fragno)) {
10373			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10374			ffs_clusteracct(fs, cgp, fragno, 1);
10375			cgp->cg_cs.cs_nbfree++;
10376		}
10377	}
10378	stat_jnewblk++;
10379	jnewblk->jn_state &= ~ATTACHED;
10380	jnewblk->jn_state |= UNDONE;
10381
10382	return (frags);
10383}
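/*
 * An illustrative example for jnewblk_rollback() above (numbers chosen for
 * exposition): with fs_frag == 8, jn_oldfrags == 3 and jn_frags == 6,
 * fragments 3, 4 and 5 of the block are the journaled allocation.  If all
 * three are still marked allocated, frags == 3 != fs_frag, so they are
 * returned individually: the old fragment totals at bbase are subtracted
 * from cg_frsum, the three bits are set free, cs_nffree is credited, and
 * the new totals are added back.  Only when jn_oldfrags == 0, jn_frags == 8
 * and all eight fragments are allocated does the full-block path run.
 */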
10384
10385static void
10386initiate_write_bmsafemap(bmsafemap, bp)
10387	struct bmsafemap *bmsafemap;
10388	struct buf *bp;			/* The cg block. */
10389{
10390	struct jaddref *jaddref;
10391	struct jnewblk *jnewblk;
10392	uint8_t *inosused;
10393	uint8_t *blksfree;
10394	struct cg *cgp;
10395	struct fs *fs;
10396	ino_t ino;
10397
10398	if (bmsafemap->sm_state & IOSTARTED)
10399		panic("initiate_write_bmsafemap: Already started\n");
10400	bmsafemap->sm_state |= IOSTARTED;
10401	/*
10402	 * Clear any inode allocations which are pending journal writes.
10403	 */
10404	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10405		cgp = (struct cg *)bp->b_data;
10406		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10407		inosused = cg_inosused(cgp);
10408		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10409			ino = jaddref->ja_ino % fs->fs_ipg;
10410			/*
10411			 * If this is a background copy the inode may not
10412			 * be marked used yet.
10413			 */
10414			if (isset(inosused, ino)) {
10415				if ((jaddref->ja_mode & IFMT) == IFDIR)
10416					cgp->cg_cs.cs_ndir--;
10417				cgp->cg_cs.cs_nifree++;
10418				clrbit(inosused, ino);
10419				jaddref->ja_state &= ~ATTACHED;
10420				jaddref->ja_state |= UNDONE;
10421				stat_jaddref++;
10422			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
10423				panic("initiate_write_bmsafemap: inode %ju "
10424				    "marked free", (uintmax_t)jaddref->ja_ino);
10425		}
10426	}
10427	/*
10428	 * Clear any block allocations which are pending journal writes.
10429	 */
10430	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10431		cgp = (struct cg *)bp->b_data;
10432		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10433		blksfree = cg_blksfree(cgp);
10434		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10435			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10436				continue;
10437			if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
10438				panic("initiate_write_bmsafemap: block %jd "
10439				    "marked free", jnewblk->jn_blkno);
10440		}
10441	}
10442	/*
10443	 * Move allocation lists to the written lists so they can be
10444	 * cleared once the block write is complete.
10445	 */
10446	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10447	    inodedep, id_deps);
10448	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10449	    newblk, nb_deps);
10450	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10451	    wk_list);
10452}
10453
10454/*
10455 * This routine is called during the completion interrupt
10456 * service routine for a disk write (from the procedure called
10457 * by the device driver to inform the filesystem caches of
10458 * a request completion).  It should be called early in this
10459 * procedure, before the block is made available to other
10460 * processes or other routines are called.
10461 *
10462 */
10463static void
10464softdep_disk_write_complete(bp)
10465	struct buf *bp;		/* describes the completed disk write */
10466{
10467	struct worklist *wk;
10468	struct worklist *owk;
10469	struct workhead reattach;
10470	struct freeblks *freeblks;
10471	struct buf *sbp;
10472
10473	/*
10474	 * If an error occurred while doing the write, then the data
10475	 * has not hit the disk and the dependencies cannot be unrolled.
10476	 */
10477	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10478		return;
10479	LIST_INIT(&reattach);
10480	/*
10481	 * This lock must not be released anywhere in this code segment.
10482	 */
10483	sbp = NULL;
10484	owk = NULL;
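	/*
	 * owk remembers the previously handled item; seeing the same
	 * worklist entry twice in a row means its handler put it back
	 * on b_dep without making progress and the loop would spin.
	 */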
10485	ACQUIRE_LOCK(&lk);
10486	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10487		WORKLIST_REMOVE(wk);
10488		dep_write[wk->wk_type]++;
10489		if (wk == owk)
10490			panic("duplicate worklist: %p\n", wk);
10491		owk = wk;
10492		switch (wk->wk_type) {
10493
10494		case D_PAGEDEP:
10495			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10496				WORKLIST_INSERT(&reattach, wk);
10497			continue;
10498
10499		case D_INODEDEP:
10500			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10501				WORKLIST_INSERT(&reattach, wk);
10502			continue;
10503
10504		case D_BMSAFEMAP:
10505			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10506				WORKLIST_INSERT(&reattach, wk);
10507			continue;
10508
10509		case D_MKDIR:
10510			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10511			continue;
10512
10513		case D_ALLOCDIRECT:
10514			wk->wk_state |= COMPLETE;
10515			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10516			continue;
10517
10518		case D_ALLOCINDIR:
10519			wk->wk_state |= COMPLETE;
10520			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
10521			continue;
10522
10523		case D_INDIRDEP:
10524			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
10525				WORKLIST_INSERT(&reattach, wk);
10526			continue;
10527
10528		case D_FREEBLKS:
10529			wk->wk_state |= COMPLETE;
10530			freeblks = WK_FREEBLKS(wk);
10531			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
10532			    LIST_EMPTY(&freeblks->fb_jblkdephd))
10533				add_to_worklist(wk, WK_NODELAY);
10534			continue;
10535
10536		case D_FREEWORK:
10537			handle_written_freework(WK_FREEWORK(wk));
10538			break;
10539
10540		case D_JSEGDEP:
10541			free_jsegdep(WK_JSEGDEP(wk));
10542			continue;
10543
10544		case D_JSEG:
10545			handle_written_jseg(WK_JSEG(wk), bp);
10546			continue;
10547
10548		case D_SBDEP:
10549			if (handle_written_sbdep(WK_SBDEP(wk), bp))
10550				WORKLIST_INSERT(&reattach, wk);
10551			continue;
10552
10553		case D_FREEDEP:
10554			free_freedep(WK_FREEDEP(wk));
10555			continue;
10556
10557		default:
10558			panic("softdep_disk_write_complete: Unknown type %s",
10559			    TYPENAME(wk->wk_type));
10560			/* NOTREACHED */
10561		}
10562	}
10563	/*
10564	 * Reattach any requests that must be redone.
10565	 */
10566	while ((wk = LIST_FIRST(&reattach)) != NULL) {
10567		WORKLIST_REMOVE(wk);
10568		WORKLIST_INSERT(&bp->b_dep, wk);
10569	}
10570	FREE_LOCK(&lk);
10571	if (sbp)
10572		brelse(sbp);
10573}
10574
10575/*
10576 * Called from within softdep_disk_write_complete above. Note that
10577 * this routine is always called from interrupt level with further
10578 * splbio interrupts blocked.
10579 */
10580static void
10581handle_allocdirect_partdone(adp, wkhd)
10582	struct allocdirect *adp;	/* the completed allocdirect */
10583	struct workhead *wkhd;		/* Work to do when inode is written. */
10584{
10585	struct allocdirectlst *listhead;
10586	struct allocdirect *listadp;
10587	struct inodedep *inodedep;
10588	long bsize;
10589
10590	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10591		return;
10592	/*
10593	 * The on-disk inode cannot claim to be any larger than the last
10594	 * fragment that has been written. Otherwise, the on-disk inode
10595	 * might have fragments that were not the last block in the file
10596	 * which would corrupt the filesystem. Thus, we cannot free any
10597	 * allocdirects after one whose ad_oldblkno claims a fragment as
10598	 * these blocks must be rolled back to zero before writing the inode.
10599	 * We check the currently active set of allocdirects in id_inoupdt
10600	 * or id_extupdt as appropriate.
10601	 */
10602	inodedep = adp->ad_inodedep;
10603	bsize = inodedep->id_fs->fs_bsize;
10604	if (adp->ad_state & EXTDATA)
10605		listhead = &inodedep->id_extupdt;
10606	else
10607		listhead = &inodedep->id_inoupdt;
10608	TAILQ_FOREACH(listadp, listhead, ad_next) {
10609		/* found our block */
10610		if (listadp == adp)
10611			break;
10612		/* continue if the old allocation was not a fragment */
10613		if (listadp->ad_oldsize == 0 ||
10614		    listadp->ad_oldsize == bsize)
10615			continue;
10616		/* hit a fragment */
10617		return;
10618	}
10619	/*
10620	 * If we have reached the end of the current list without
10621	 * finding the just finished dependency, then it must be
10622	 * on the future dependency list. Future dependencies cannot
10623	 * be freed until they are moved to the current list.
10624	 */
10625	if (listadp == NULL) {
10626#ifdef DEBUG
10627		if (adp->ad_state & EXTDATA)
10628			listhead = &inodedep->id_newextupdt;
10629		else
10630			listhead = &inodedep->id_newinoupdt;
10631		TAILQ_FOREACH(listadp, listhead, ad_next)
10632			/* found our block */
10633			if (listadp == adp)
10634				break;
10635		if (listadp == NULL)
10636			panic("handle_allocdirect_partdone: lost dep");
10637#endif /* DEBUG */
10638		return;
10639	}
10640	/*
10641	 * If we have found the just finished dependency, then queue
10642	 * it along with anything that follows it that is complete.
10643	 * Since the pointer has not yet been written in the inode
10644	 * as the dependency prevents it, place the allocdirect on the
10645	 * bufwait list where it will be freed once the pointer is
10646	 * valid.
10647	 */
10648	if (wkhd == NULL)
10649		wkhd = &inodedep->id_bufwait;
10650	for (; adp; adp = listadp) {
10651		listadp = TAILQ_NEXT(adp, ad_next);
10652		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10653			return;
10654		TAILQ_REMOVE(listhead, adp, ad_next);
10655		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
10656	}
10657}
10658
10659/*
10660 * Called from within softdep_disk_write_complete above.  This routine
10661 * completes successfully written allocindirs.
10662 */
10663static void
10664handle_allocindir_partdone(aip)
10665	struct allocindir *aip;		/* the completed allocindir */
10666{
10667	struct indirdep *indirdep;
10668
10669	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
10670		return;
10671	indirdep = aip->ai_indirdep;
10672	LIST_REMOVE(aip, ai_next);
10673	/*
10674	 * Don't set a pointer while the buffer is undergoing IO or while
10675	 * we have active truncations.
10676	 */
10677	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
10678		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
10679		return;
10680	}
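	/*
	 * The allocation is stable; record the new block number in the
	 * saved copy of the indirect block so that any future rollback
	 * write carries this pointer as well.
	 */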
10681	if (indirdep->ir_state & UFS1FMT)
10682		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10683		    aip->ai_newblkno;
10684	else
10685		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10686		    aip->ai_newblkno;
10687	/*
10688	 * Await the pointer write before freeing the allocindir.
10689	 */
10690	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
10691}
10692
10693/*
10694 * Release segments held on a jwork list.
10695 */
10696static void
10697handle_jwork(wkhd)
10698	struct workhead *wkhd;
10699{
10700	struct worklist *wk;
10701
10702	while ((wk = LIST_FIRST(wkhd)) != NULL) {
10703		WORKLIST_REMOVE(wk);
10704		switch (wk->wk_type) {
10705		case D_JSEGDEP:
10706			free_jsegdep(WK_JSEGDEP(wk));
10707			continue;
10708		case D_FREEDEP:
10709			free_freedep(WK_FREEDEP(wk));
10710			continue;
10711		case D_FREEFRAG:
10712			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
10713			WORKITEM_FREE(wk, D_FREEFRAG);
10714			continue;
10715		case D_FREEWORK:
10716			handle_written_freework(WK_FREEWORK(wk));
10717			continue;
10718		default:
10719			panic("handle_jwork: Unknown type %s\n",
10720			    TYPENAME(wk->wk_type));
10721		}
10722	}
10723}
10724
10725/*
10726 * Handle the bufwait list on an inode when it is safe to release items
10727 * held there.  This normally happens after an inode block is written but
10728 * may be delayed and handled later if there are pending journal items that
10729 * are not yet safe to be released.
10730 */
10731static struct freefile *
10732handle_bufwait(inodedep, refhd)
10733	struct inodedep *inodedep;
10734	struct workhead *refhd;
10735{
10736	struct jaddref *jaddref;
10737	struct freefile *freefile;
10738	struct worklist *wk;
10739
10740	freefile = NULL;
10741	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
10742		WORKLIST_REMOVE(wk);
10743		switch (wk->wk_type) {
10744		case D_FREEFILE:
10745			/*
10746			 * We defer adding freefile to the worklist
10747			 * until all other additions have been made to
10748			 * ensure that it will be done after all the
10749			 * old blocks have been freed.
10750			 */
10751			if (freefile != NULL)
10752				panic("handle_bufwait: freefile");
10753			freefile = WK_FREEFILE(wk);
10754			continue;
10755
10756		case D_MKDIR:
10757			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
10758			continue;
10759
10760		case D_DIRADD:
10761			diradd_inode_written(WK_DIRADD(wk), inodedep);
10762			continue;
10763
10764		case D_FREEFRAG:
10765			wk->wk_state |= COMPLETE;
10766			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
10767				add_to_worklist(wk, 0);
10768			continue;
10769
10770		case D_DIRREM:
10771			wk->wk_state |= COMPLETE;
10772			add_to_worklist(wk, 0);
10773			continue;
10774
10775		case D_ALLOCDIRECT:
10776		case D_ALLOCINDIR:
10777			free_newblk(WK_NEWBLK(wk));
10778			continue;
10779
10780		case D_JNEWBLK:
10781			wk->wk_state |= COMPLETE;
10782			free_jnewblk(WK_JNEWBLK(wk));
10783			continue;
10784
10785		/*
10786		 * Save freed journal segments and add references on
10787		 * the supplied list which will delay their release
10788		 * until the cg bitmap is cleared on disk.
10789		 */
10790		case D_JSEGDEP:
10791			if (refhd == NULL)
10792				free_jsegdep(WK_JSEGDEP(wk));
10793			else
10794				WORKLIST_INSERT(refhd, wk);
10795			continue;
10796
10797		case D_JADDREF:
10798			jaddref = WK_JADDREF(wk);
10799			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
10800			    if_deps);
10801			/*
10802			 * Transfer any jaddrefs to the list to be freed with
10803			 * the bitmap if we're handling a removed file.
10804			 */
10805			if (refhd == NULL) {
10806				wk->wk_state |= COMPLETE;
10807				free_jaddref(jaddref);
10808			} else
10809				WORKLIST_INSERT(refhd, wk);
10810			continue;
10811
10812		default:
10813			panic("handle_bufwait: Unknown type %p(%s)",
10814			    wk, TYPENAME(wk->wk_type));
10815			/* NOTREACHED */
10816		}
10817	}
10818	return (freefile);
10819}
10820/*
10821 * Called from within softdep_disk_write_complete above to restore
10822 * in-memory inode block contents to their most up-to-date state. Note
10823 * that this routine is always called from interrupt level with further
10824 * splbio interrupts blocked.
10825 */
10826static int
10827handle_written_inodeblock(inodedep, bp)
10828	struct inodedep *inodedep;
10829	struct buf *bp;		/* buffer containing the inode block */
10830{
10831	struct freefile *freefile;
10832	struct allocdirect *adp, *nextadp;
10833	struct ufs1_dinode *dp1 = NULL;
10834	struct ufs2_dinode *dp2 = NULL;
10835	struct workhead wkhd;
10836	int hadchanges, fstype;
10837	ino_t freelink;
10838
10839	LIST_INIT(&wkhd);
10840	hadchanges = 0;
10841	freefile = NULL;
10842	if ((inodedep->id_state & IOSTARTED) == 0)
10843		panic("handle_written_inodeblock: not started");
10844	inodedep->id_state &= ~IOSTARTED;
10845	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
10846		fstype = UFS1;
10847		dp1 = (struct ufs1_dinode *)bp->b_data +
10848		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
10849		freelink = dp1->di_freelink;
10850	} else {
10851		fstype = UFS2;
10852		dp2 = (struct ufs2_dinode *)bp->b_data +
10853		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
10854		freelink = dp2->di_freelink;
10855	}
10856	/*
10857	 * Leave this inode block dirty until the inode is on the on-disk unlinked list.
10858	 */
10859	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
10860		struct inodedep *inon;
10861
10862		inon = TAILQ_NEXT(inodedep, id_unlinked);
10863		if ((inon == NULL && freelink == 0) ||
10864		    (inon && inon->id_ino == freelink)) {
10865			if (inon)
10866				inon->id_state |= UNLINKPREV;
10867			inodedep->id_state |= UNLINKNEXT;
10868		}
10869		hadchanges = 1;
10870	}
10871	/*
10872	 * If we had to rollback the inode allocation because of
10873	 * bitmaps being incomplete, then simply restore it.
10874	 * Keep the block dirty so that it will not be reclaimed until
10875	 * all associated dependencies have been cleared and the
10876	 * corresponding updates written to disk.
10877	 */
10878	if (inodedep->id_savedino1 != NULL) {
10879		hadchanges = 1;
10880		if (fstype == UFS1)
10881			*dp1 = *inodedep->id_savedino1;
10882		else
10883			*dp2 = *inodedep->id_savedino2;
10884		free(inodedep->id_savedino1, M_SAVEDINO);
10885		inodedep->id_savedino1 = NULL;
10886		if ((bp->b_flags & B_DELWRI) == 0)
10887			stat_inode_bitmap++;
10888		bdirty(bp);
10889		/*
10890		 * If the inode is clear here and GOINGAWAY it will never
10891		 * be written.  Process the bufwait and clear any pending
10892		 * work which may include the freefile.
10893		 */
10894		if (inodedep->id_state & GOINGAWAY)
10895			goto bufwait;
10896		return (1);
10897	}
10898	inodedep->id_state |= COMPLETE;
10899	/*
10900	 * Roll forward anything that had to be rolled back before
10901	 * the inode could be updated.
10902	 */
10903	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
10904		nextadp = TAILQ_NEXT(adp, ad_next);
10905		if (adp->ad_state & ATTACHED)
10906			panic("handle_written_inodeblock: new entry");
10907		if (fstype == UFS1) {
10908			if (adp->ad_offset < NDADDR) {
10909				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
10910					panic("%s %s #%jd mismatch %d != %jd",
10911					    "handle_written_inodeblock:",
10912					    "direct pointer",
10913					    (intmax_t)adp->ad_offset,
10914					    dp1->di_db[adp->ad_offset],
10915					    (intmax_t)adp->ad_oldblkno);
10916				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
10917			} else {
10918				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
10919					panic("%s: %s #%jd allocated as %d",
10920					    "handle_written_inodeblock",
10921					    "indirect pointer",
10922					    (intmax_t)adp->ad_offset - NDADDR,
10923					    dp1->di_ib[adp->ad_offset - NDADDR]);
10924				dp1->di_ib[adp->ad_offset - NDADDR] =
10925				    adp->ad_newblkno;
10926			}
10927		} else {
10928			if (adp->ad_offset < NDADDR) {
10929				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
10930					panic("%s: %s #%jd %s %jd != %jd",
10931					    "handle_written_inodeblock",
10932					    "direct pointer",
10933					    (intmax_t)adp->ad_offset, "mismatch",
10934					    (intmax_t)dp2->di_db[adp->ad_offset],
10935					    (intmax_t)adp->ad_oldblkno);
10936				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
10937			} else {
10938				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
10939					panic("%s: %s #%jd allocated as %jd",
10940					    "handle_written_inodeblock",
10941					    "indirect pointer",
10942					    (intmax_t)adp->ad_offset - NDADDR,
10943					    (intmax_t)
10944					    dp2->di_ib[adp->ad_offset - NDADDR]);
10945				dp2->di_ib[adp->ad_offset - NDADDR] =
10946				    adp->ad_newblkno;
10947			}
10948		}
10949		adp->ad_state &= ~UNDONE;
10950		adp->ad_state |= ATTACHED;
10951		hadchanges = 1;
10952	}
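	/*
	 * Roll forward the extended attribute block pointers in the same
	 * way.  Only UFS2 inodes carry ext data, so this list should be
	 * empty for UFS1 file systems.
	 */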
10953	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
10954		nextadp = TAILQ_NEXT(adp, ad_next);
10955		if (adp->ad_state & ATTACHED)
10956			panic("handle_written_inodeblock: new entry");
10957		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
10958			panic("%s: direct pointers #%jd %s %jd != %jd",
10959			    "handle_written_inodeblock",
10960			    (intmax_t)adp->ad_offset, "mismatch",
10961			    (intmax_t)dp2->di_extb[adp->ad_offset],
10962			    (intmax_t)adp->ad_oldblkno);
10963		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
10964		adp->ad_state &= ~UNDONE;
10965		adp->ad_state |= ATTACHED;
10966		hadchanges = 1;
10967	}
10968	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
10969		stat_direct_blk_ptrs++;
10970	/*
10971	 * Reset the file size to its most up-to-date value.
10972	 */
10973	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
10974		panic("handle_written_inodeblock: bad size");
10975	if (inodedep->id_savednlink > LINK_MAX)
10976		panic("handle_written_inodeblock: Invalid link count "
10977		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
10978	if (fstype == UFS1) {
10979		if (dp1->di_nlink != inodedep->id_savednlink) {
10980			dp1->di_nlink = inodedep->id_savednlink;
10981			hadchanges = 1;
10982		}
10983		if (dp1->di_size != inodedep->id_savedsize) {
10984			dp1->di_size = inodedep->id_savedsize;
10985			hadchanges = 1;
10986		}
10987	} else {
10988		if (dp2->di_nlink != inodedep->id_savednlink) {
10989			dp2->di_nlink = inodedep->id_savednlink;
10990			hadchanges = 1;
10991		}
10992		if (dp2->di_size != inodedep->id_savedsize) {
10993			dp2->di_size = inodedep->id_savedsize;
10994			hadchanges = 1;
10995		}
10996		if (dp2->di_extsize != inodedep->id_savedextsize) {
10997			dp2->di_extsize = inodedep->id_savedextsize;
10998			hadchanges = 1;
10999		}
11000	}
11001	inodedep->id_savedsize = -1;
11002	inodedep->id_savedextsize = -1;
11003	inodedep->id_savednlink = -1;
11004	/*
11005	 * If there were any rollbacks in the inode block, then it must be
11006	 * marked dirty so that it will eventually get written back in
11007	 * its correct form.
11008	 */
11009	if (hadchanges)
11010		bdirty(bp);
11011bufwait:
11012	/*
11013	 * Process any allocdirects that completed during the update.
11014	 */
11015	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11016		handle_allocdirect_partdone(adp, &wkhd);
11017	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11018		handle_allocdirect_partdone(adp, &wkhd);
11019	/*
11020	 * Process deallocations that were held pending until the
11021	 * inode had been written to disk. Freeing of the inode
11022	 * is delayed until after all blocks have been freed to
11023	 * avoid creation of new <vfsid, inum, lbn> triples
11024	 * before the old ones have been deleted.  Completely
11025	 * unlinked inodes are not processed until the unlinked
11026	 * inode list is written or the last reference is removed.
11027	 */
11028	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11029		freefile = handle_bufwait(inodedep, NULL);
11030		if (freefile && !LIST_EMPTY(&wkhd)) {
11031			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11032			freefile = NULL;
11033		}
11034	}
11035	/*
11036	 * Move rolled forward dependency completions to the bufwait list
11037	 * now that those that were already written have been processed.
11038	 */
11039	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11040		panic("handle_written_inodeblock: bufwait but no changes");
11041	jwork_move(&inodedep->id_bufwait, &wkhd);
11042
11043	if (freefile != NULL) {
11044		/*
11045		 * If the inode is goingaway it was never written.  Fake up
11046		 * the state here so free_inodedep() can succeed.
11047		 */
11048		if (inodedep->id_state & GOINGAWAY)
11049			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11050		if (free_inodedep(inodedep) == 0)
11051			panic("handle_written_inodeblock: live inodedep %p",
11052			    inodedep);
11053		add_to_worklist(&freefile->fx_list, 0);
11054		return (0);
11055	}
11056
11057	/*
11058	 * If no outstanding dependencies, free it.
11059	 */
11060	if (free_inodedep(inodedep) ||
11061	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11062	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11063	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11064	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11065		return (0);
11066	return (hadchanges);
11067}
11068
11069static int
11070handle_written_indirdep(indirdep, bp, bpp)
11071	struct indirdep *indirdep;
11072	struct buf *bp;
11073	struct buf **bpp;
11074{
11075	struct allocindir *aip;
11076	struct buf *sbp;
11077	int chgs;
11078
11079	if (indirdep->ir_state & GOINGAWAY)
11080		panic("handle_written_indirdep: indirdep gone");
11081	if ((indirdep->ir_state & IOSTARTED) == 0)
11082		panic("handle_written_indirdep: IO not started");
11083	chgs = 0;
11084	/*
11085	 * If there were rollbacks revert them here.
11086	 */
11087	if (indirdep->ir_saveddata) {
11088		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11089		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11090			free(indirdep->ir_saveddata, M_INDIRDEP);
11091			indirdep->ir_saveddata = NULL;
11092		}
11093		chgs = 1;
11094	}
11095	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11096	indirdep->ir_state |= ATTACHED;
11097	/*
11098	 * Move allocindirs with written pointers to the completehd if
11099	 * the indirdep's pointer is not yet written.  Otherwise
11100	 * free them here.
11101	 */
11102	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11103		LIST_REMOVE(aip, ai_next);
11104		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11105			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11106			    ai_next);
11107			newblk_freefrag(&aip->ai_block);
11108			continue;
11109		}
11110		free_newblk(&aip->ai_block);
11111	}
11112	/*
11113	 * Move allocindirs that have finished dependency processing from
11114	 * the done list to the write list after updating the pointers.
11115	 */
11116	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11117		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11118			handle_allocindir_partdone(aip);
11119			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11120				panic("handle_written_indirdep: not gone");
11121			chgs = 1;
11122		}
11123	}
11124	/*
11125	 * Preserve the indirdep if there were any changes or if it is not
11126	 * yet valid on disk.
11127	 */
11128	if (chgs) {
11129		stat_indir_blk_ptrs++;
11130		bdirty(bp);
11131		return (1);
11132	}
11133	/*
11134	 * If there were no changes we can discard the savedbp and detach
11135	 * ourselves from the buf.  We are only carrying completed pointers
11136	 * in this case.
11137	 */
11138	sbp = indirdep->ir_savebp;
11139	sbp->b_flags |= B_INVAL | B_NOCACHE;
11140	indirdep->ir_savebp = NULL;
11141	indirdep->ir_bp = NULL;
11142	if (*bpp != NULL)
11143		panic("handle_written_indirdep: bp already exists.");
11144	*bpp = sbp;
11145	/*
11146	 * The indirdep may not be freed until its parent points at it.
11147	 */
11148	if (indirdep->ir_state & DEPCOMPLETE)
11149		free_indirdep(indirdep);
11150
11151	return (0);
11152}
11153
11154/*
11155 * Process a diradd entry after its dependent inode has been written.
11156 * This routine must be called with splbio interrupts blocked.
11157 */
11158static void
11159diradd_inode_written(dap, inodedep)
11160	struct diradd *dap;
11161	struct inodedep *inodedep;
11162{
11163
11164	dap->da_state |= COMPLETE;
11165	complete_diradd(dap);
11166	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11167}
11168
11169/*
11170 * Returns true if the bmsafemap will have rollbacks when written.  Must
11171 * only be called with lk and the buf lock on the cg held.
11172 */
11173static int
11174bmsafemap_rollbacks(bmsafemap)
11175	struct bmsafemap *bmsafemap;
11176{
11177
11178	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11179	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
11180}
11181
11182/*
11183 * Re-apply an allocation when a cg write is complete.
11184 */
11185static int
11186jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11187	struct jnewblk *jnewblk;
11188	struct fs *fs;
11189	struct cg *cgp;
11190	uint8_t *blksfree;
11191{
11192	ufs1_daddr_t fragno;
11193	ufs2_daddr_t blkno;
11194	long cgbno, bbase;
11195	int frags, blk;
11196	int i;
11197
11198	frags = 0;
11199	cgbno = dtogd(fs, jnewblk->jn_blkno);
11200	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11201		if (isclr(blksfree, cgbno + i))
11202			panic("jnewblk_rollforward: re-allocated fragment");
11203		frags++;
11204	}
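	/*
	 * Every fragment of the journaled range is currently free, so
	 * re-apply the allocation and redo the cg accounting; this is
	 * the mirror image of jnewblk_rollback() above.
	 */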
11205	if (frags == fs->fs_frag) {
11206		blkno = fragstoblks(fs, cgbno);
11207		ffs_clrblock(fs, blksfree, (long)blkno);
11208		ffs_clusteracct(fs, cgp, blkno, -1);
11209		cgp->cg_cs.cs_nbfree--;
11210	} else {
11211		bbase = cgbno - fragnum(fs, cgbno);
11212		cgbno += jnewblk->jn_oldfrags;
11213		/* If a complete block had been reassembled, account for it. */
11214		fragno = fragstoblks(fs, bbase);
11215		if (ffs_isblock(fs, blksfree, fragno)) {
11216			cgp->cg_cs.cs_nffree += fs->fs_frag;
11217			ffs_clusteracct(fs, cgp, fragno, -1);
11218			cgp->cg_cs.cs_nbfree--;
11219		}
11220		/* Decrement the old frags.  */
11221		blk = blkmap(fs, blksfree, bbase);
11222		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11223		/* Allocate the fragment */
11224		for (i = 0; i < frags; i++)
11225			clrbit(blksfree, cgbno + i);
11226		cgp->cg_cs.cs_nffree -= frags;
11227		/* Add back in counts associated with the new frags */
11228		blk = blkmap(fs, blksfree, bbase);
11229		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11230	}
11231	return (frags);
11232}
11233
11234/*
11235 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11236 * changes if it's not a background write.  Set all written dependencies
11237 * to DEPCOMPLETE and free the structure if possible.
11238 */
11239static int
11240handle_written_bmsafemap(bmsafemap, bp)
11241	struct bmsafemap *bmsafemap;
11242	struct buf *bp;
11243{
11244	struct newblk *newblk;
11245	struct inodedep *inodedep;
11246	struct jaddref *jaddref, *jatmp;
11247	struct jnewblk *jnewblk, *jntmp;
11248	struct ufsmount *ump;
11249	uint8_t *inosused;
11250	uint8_t *blksfree;
11251	struct cg *cgp;
11252	struct fs *fs;
11253	ino_t ino;
11254	int chgs;
11255
11256	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11257		panic("handle_written_bmsafemap: Not started\n");
11258	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11259	chgs = 0;
11260	bmsafemap->sm_state &= ~IOSTARTED;
11261	/*
11262	 * Release journal work that was waiting on the write.
11263	 */
11264	handle_jwork(&bmsafemap->sm_freewr);
11265
11266	/*
11267	 * Restore unwritten inode allocation pending jaddref writes.
11268	 */
11269	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11270		cgp = (struct cg *)bp->b_data;
11271		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11272		inosused = cg_inosused(cgp);
11273		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11274		    ja_bmdeps, jatmp) {
11275			if ((jaddref->ja_state & UNDONE) == 0)
11276				continue;
11277			ino = jaddref->ja_ino % fs->fs_ipg;
11278			if (isset(inosused, ino))
11279				panic("handle_written_bmsafemap: "
11280				    "re-allocated inode");
11281			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
11282				if ((jaddref->ja_mode & IFMT) == IFDIR)
11283					cgp->cg_cs.cs_ndir++;
11284				cgp->cg_cs.cs_nifree--;
11285				setbit(inosused, ino);
11286				chgs = 1;
11287			}
11288			jaddref->ja_state &= ~UNDONE;
11289			jaddref->ja_state |= ATTACHED;
11290			free_jaddref(jaddref);
11291		}
11292	}
11293	/*
11294	 * Restore any block allocations which are pending journal writes.
11295	 */
11296	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11297		cgp = (struct cg *)bp->b_data;
11298		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11299		blksfree = cg_blksfree(cgp);
11300		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11301		    jntmp) {
11302			if ((jnewblk->jn_state & UNDONE) == 0)
11303				continue;
11304			if ((bp->b_xflags & BX_BKGRDMARKER) == 0 &&
11305			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11306				chgs = 1;
11307			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11308			jnewblk->jn_state |= ATTACHED;
11309			free_jnewblk(jnewblk);
11310		}
11311	}
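	/*
	 * The cg bits covering these allocations are now safely on disk:
	 * mark the written newblks and inodedeps DEPCOMPLETE, detach them
	 * from the bmsafemap, and finish any allocdirect/allocindir work
	 * that this completes.
	 */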
11312	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11313		newblk->nb_state |= DEPCOMPLETE;
11314		newblk->nb_state &= ~ONDEPLIST;
11315		newblk->nb_bmsafemap = NULL;
11316		LIST_REMOVE(newblk, nb_deps);
11317		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11318			handle_allocdirect_partdone(
11319			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11320		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11321			handle_allocindir_partdone(
11322			    WK_ALLOCINDIR(&newblk->nb_list));
11323		else if (newblk->nb_list.wk_type != D_NEWBLK)
11324			panic("handle_written_bmsafemap: Unexpected type: %s",
11325			    TYPENAME(newblk->nb_list.wk_type));
11326	}
11327	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11328		inodedep->id_state |= DEPCOMPLETE;
11329		inodedep->id_state &= ~ONDEPLIST;
11330		LIST_REMOVE(inodedep, id_deps);
11331		inodedep->id_bmsafemap = NULL;
11332	}
11333	LIST_REMOVE(bmsafemap, sm_next);
11334	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11335	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11336	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11337	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11338	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11339		LIST_REMOVE(bmsafemap, sm_hash);
11340		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11341		return (0);
11342	}
11343	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11344	bdirty(bp);
11345	return (1);
11346}
11347
11348/*
11349 * Try to free a mkdir dependency.
11350 */
11351static void
11352complete_mkdir(mkdir)
11353	struct mkdir *mkdir;
11354{
11355	struct diradd *dap;
11356
11357	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11358		return;
11359	LIST_REMOVE(mkdir, md_mkdirs);
11360	dap = mkdir->md_diradd;
11361	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11362	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11363		dap->da_state |= DEPCOMPLETE;
11364		complete_diradd(dap);
11365	}
11366	WORKITEM_FREE(mkdir, D_MKDIR);
11367}
11368
11369/*
11370 * Handle the completion of a mkdir dependency.
11371 */
11372static void
11373handle_written_mkdir(mkdir, type)
11374	struct mkdir *mkdir;
11375	int type;
11376{
11377
11378	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11379		panic("handle_written_mkdir: bad type");
11380	mkdir->md_state |= COMPLETE;
11381	complete_mkdir(mkdir);
11382}
11383
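/*
 * Free a pagedep when no dependencies remain: it is not tied to a newly
 * allocated directory block and all of its dirrem, diradd, pending and
 * jmvref lists are empty.  Returns 1 if the pagedep was freed, 0 otherwise.
 */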
11384static int
11385free_pagedep(pagedep)
11386	struct pagedep *pagedep;
11387{
11388	int i;
11389
11390	if (pagedep->pd_state & NEWBLOCK)
11391		return (0);
11392	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11393		return (0);
11394	for (i = 0; i < DAHASHSZ; i++)
11395		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11396			return (0);
11397	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11398		return (0);
11399	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11400		return (0);
11401	if (pagedep->pd_state & ONWORKLIST)
11402		WORKLIST_REMOVE(&pagedep->pd_list);
11403	LIST_REMOVE(pagedep, pd_hash);
11404	WORKITEM_FREE(pagedep, D_PAGEDEP);
11405
11406	return (1);
11407}
11408
11409/*
11410 * Called from within softdep_disk_write_complete above.
11411 * A write operation was just completed. Removed inodes can
11412 * now be freed and associated block pointers may be committed.
11413 * Note that this routine is always called from interrupt level
11414 * with further splbio interrupts blocked.
11415 */
11416static int
11417handle_written_filepage(pagedep, bp)
11418	struct pagedep *pagedep;
11419	struct buf *bp;		/* buffer containing the written page */
11420{
11421	struct dirrem *dirrem;
11422	struct diradd *dap, *nextdap;
11423	struct direct *ep;
11424	int i, chgs;
11425
11426	if ((pagedep->pd_state & IOSTARTED) == 0)
11427		panic("handle_written_filepage: not started");
11428	pagedep->pd_state &= ~IOSTARTED;
11429	/*
11430	 * Process any directory removals that have been committed.
11431	 */
11432	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11433		LIST_REMOVE(dirrem, dm_next);
11434		dirrem->dm_state |= COMPLETE;
11435		dirrem->dm_dirinum = pagedep->pd_ino;
11436		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11437		    ("handle_written_filepage: Journal entries not written."));
11438		add_to_worklist(&dirrem->dm_list, 0);
11439	}
11440	/*
11441	 * Free any directory additions that have been committed.
11442	 * If it is a newly allocated block, we have to wait until
11443	 * the on-disk directory inode claims the new block.
11444	 */
11445	if ((pagedep->pd_state & NEWBLOCK) == 0)
11446		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11447			free_diradd(dap, NULL);
11448	/*
11449	 * Uncommitted directory entries must be restored.
11450	 */
11451	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11452		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11453		     dap = nextdap) {
11454			nextdap = LIST_NEXT(dap, da_pdlist);
11455			if (dap->da_state & ATTACHED)
11456				panic("handle_written_filepage: attached");
11457			ep = (struct direct *)
11458			    ((char *)bp->b_data + dap->da_offset);
11459			ep->d_ino = dap->da_newinum;
11460			dap->da_state &= ~UNDONE;
11461			dap->da_state |= ATTACHED;
11462			chgs = 1;
11463			/*
11464			 * If the inode referenced by the directory has
11465			 * been written out, then the dependency can be
11466			 * moved to the pending list.
11467			 */
11468			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11469				LIST_REMOVE(dap, da_pdlist);
11470				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11471				    da_pdlist);
11472			}
11473		}
11474	}
11475	/*
11476	 * If there were any rollbacks in the directory, then it must be
11477	 * marked dirty so that it will eventually get written back in
11478	 * its correct form.
11479	 */
11480	if (chgs) {
11481		if ((bp->b_flags & B_DELWRI) == 0)
11482			stat_dir_entry++;
11483		bdirty(bp);
11484		return (1);
11485	}
11486	/*
11487	 * If we are not waiting for a new directory block to be
11488	 * claimed by its inode, then the pagedep will be freed.
11489	 * Otherwise it will remain to track any new entries on
11490	 * the page in case they are fsync'ed.
11491	 */
11492	free_pagedep(pagedep);
11493	return (0);
11494}
11495
11496/*
11497 * Writing back in-core inode structures.
11498 *
11499 * The filesystem only accesses an inode's contents when it occupies an
11500 * "in-core" inode structure.  These "in-core" structures are separate from
11501 * the page frames used to cache inode blocks.  Only the latter are
11502 * transferred to/from the disk.  So, when the updated contents of the
11503 * "in-core" inode structure are copied to the corresponding in-memory inode
11504 * block, the dependencies are also transferred.  The following procedure is
11505 * called when copying a dirty "in-core" inode to a cached inode block.
11506 */
11507
11508/*
11509 * Called when an inode is loaded from disk. If the effective link count
11510 * differed from the actual link count when it was last flushed, then we
11511 * need to ensure that the correct effective link count is put back.
11512 */
11513void
11514softdep_load_inodeblock(ip)
11515	struct inode *ip;	/* the "in_core" copy of the inode */
11516{
11517	struct inodedep *inodedep;
11518
11519	/*
11520	 * Check for alternate nlink count.
11521	 */
11522	ip->i_effnlink = ip->i_nlink;
11523	ACQUIRE_LOCK(&lk);
11524	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
11525	    &inodedep) == 0) {
11526		FREE_LOCK(&lk);
11527		return;
11528	}
11529	ip->i_effnlink -= inodedep->id_nlinkdelta;
11530	FREE_LOCK(&lk);
11531}
11532
11533/*
11534 * This routine is called just before the "in-core" inode
11535 * information is to be copied to the in-memory inode block.
11536 * Recall that an inode block contains several inodes. If
11537 * the force flag is set, then the dependencies will be
11538 * cleared so that the update can always be made. Note that
11539 * the buffer is locked when this routine is called, so we
11540 * will never be in the middle of writing the inode block
11541 * to disk.
11542 */
11543void
11544softdep_update_inodeblock(ip, bp, waitfor)
11545	struct inode *ip;	/* the "in_core" copy of the inode */
11546	struct buf *bp;		/* the buffer containing the inode block */
11547	int waitfor;		/* nonzero => update must be allowed */
11548{
11549	struct inodedep *inodedep;
11550	struct inoref *inoref;
11551	struct worklist *wk;
11552	struct mount *mp;
11553	struct buf *ibp;
11554	struct fs *fs;
11555	int error;
11556
11557	mp = UFSTOVFS(ip->i_ump);
11558	fs = ip->i_fs;
11559	/*
11560	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
11561	 * does not have access to the in-core ip so must write directly into
11562	 * the inode block buffer when setting freelink.
11563	 */
11564	if (fs->fs_magic == FS_UFS1_MAGIC)
11565		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
11566		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11567	else
11568		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
11569		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11570	/*
11571	 * If the effective link count is not equal to the actual link
11572	 * count, then we must track the difference in an inodedep while
11573	 * the inode is (potentially) tossed out of the cache. Otherwise,
11574	 * if there is no existing inodedep, then there are no dependencies
11575	 * to track.
11576	 */
11577	ACQUIRE_LOCK(&lk);
11578again:
11579	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11580		FREE_LOCK(&lk);
11581		if (ip->i_effnlink != ip->i_nlink)
11582			panic("softdep_update_inodeblock: bad link count");
11583		return;
11584	}
11585	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
11586		panic("softdep_update_inodeblock: bad delta");
11587	/*
11588	 * If we're flushing all dependencies we must also move any waiting
11589	 * for journal writes onto the bufwait list prior to I/O.
11590	 */
11591	if (waitfor) {
11592		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11593			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11594			    == DEPCOMPLETE) {
11595				jwait(&inoref->if_list, MNT_WAIT);
11596				goto again;
11597			}
11598		}
11599	}
11600	/*
11601	 * Changes have been initiated. Anything depending on these
11602	 * changes cannot occur until this inode has been written.
11603	 */
11604	inodedep->id_state &= ~COMPLETE;
11605	if ((inodedep->id_state & ONWORKLIST) == 0)
11606		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
11607	/*
11608	 * Any new dependencies associated with the incore inode must
11609	 * now be moved to the list associated with the buffer holding
11610	 * the in-memory copy of the inode. Once merged process any
11611	 * allocdirects that are completed by the merger.
11612	 */
11613	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
11614	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
11615		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
11616		    NULL);
11617	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
11618	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
11619		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
11620		    NULL);
11621	/*
11622	 * Now that the inode has been pushed into the buffer, the
11623	 * operations dependent on the inode being written to disk
11624	 * can be moved to the id_bufwait so that they will be
11625	 * processed when the buffer I/O completes.
11626	 */
11627	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
11628		WORKLIST_REMOVE(wk);
11629		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
11630	}
11631	/*
11632	 * Newly allocated inodes cannot be written until the bitmap
11633	 * that allocates them has been written (indicated by
11634	 * DEPCOMPLETE being set in id_state). If we are doing a
11635	 * forced sync (e.g., an fsync on a file), we force the bitmap
11636	 * to be written so that the update can be done.
11637	 */
11638	if (waitfor == 0) {
11639		FREE_LOCK(&lk);
11640		return;
11641	}
11642retry:
11643	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
11644		FREE_LOCK(&lk);
11645		return;
11646	}
11647	ibp = inodedep->id_bmsafemap->sm_buf;
11648	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
11649	if (ibp == NULL) {
11650		/*
11651		 * If ibp came back as NULL, the dependency could have been
11652		 * freed while we slept.  Look it up again, and check to see
11653		 * that it has completed.
11654		 */
11655		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
11656			goto retry;
11657		FREE_LOCK(&lk);
11658		return;
11659	}
11660	FREE_LOCK(&lk);
11661	if ((error = bwrite(ibp)) != 0)
11662		softdep_error("softdep_update_inodeblock: bwrite", error);
11663}
11664
11665/*
11666 * Merge a new inode dependency list (such as id_newinoupdt) into an
11667 * old inode dependency list (such as id_inoupdt). This routine must be
11668 * called with splbio interrupts blocked.
11669 */
11670static void
11671merge_inode_lists(newlisthead, oldlisthead)
11672	struct allocdirectlst *newlisthead;
11673	struct allocdirectlst *oldlisthead;
11674{
11675	struct allocdirect *listadp, *newadp;
11676
11677	newadp = TAILQ_FIRST(newlisthead);
11678	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
11679		if (listadp->ad_offset < newadp->ad_offset) {
11680			listadp = TAILQ_NEXT(listadp, ad_next);
11681			continue;
11682		}
11683		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11684		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
11685		if (listadp->ad_offset == newadp->ad_offset) {
11686			allocdirect_merge(oldlisthead, newadp,
11687			    listadp);
11688			listadp = newadp;
11689		}
11690		newadp = TAILQ_FIRST(newlisthead);
11691	}
11692	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
11693		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11694		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
11695	}
11696}
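/*
 * Both lists are kept sorted by ad_offset, so the merge above is a simple
 * ordered insertion.  For example (illustrative only), merging new offsets
 * {2, 3} into old offsets {0, 2, 5} inserts the new entry for offset 2
 * before the old one and lets allocdirect_merge() reconcile the duplicate,
 * then places 3 before 5, leaving {0, 2, 3, 5}; any new entries beyond the
 * largest old offset are appended at the tail.
 */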
11697
11698/*
11699 * If we are doing an fsync, then we must ensure that any directory
11700 * entries for the inode have been written after the inode gets to disk.
11701 */
11702int
11703softdep_fsync(vp)
11704	struct vnode *vp;	/* the "in_core" copy of the inode */
11705{
11706	struct inodedep *inodedep;
11707	struct pagedep *pagedep;
11708	struct inoref *inoref;
11709	struct worklist *wk;
11710	struct diradd *dap;
11711	struct mount *mp;
11712	struct vnode *pvp;
11713	struct inode *ip;
11714	struct buf *bp;
11715	struct fs *fs;
11716	struct thread *td = curthread;
11717	int error, flushparent, pagedep_new_block;
11718	ino_t parentino;
11719	ufs_lbn_t lbn;
11720
11721	ip = VTOI(vp);
11722	fs = ip->i_fs;
11723	mp = vp->v_mount;
11724	ACQUIRE_LOCK(&lk);
11725restart:
11726	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11727		FREE_LOCK(&lk);
11728		return (0);
11729	}
11730	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11731		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11732		    == DEPCOMPLETE) {
11733			jwait(&inoref->if_list, MNT_WAIT);
11734			goto restart;
11735		}
11736	}
11737	if (!LIST_EMPTY(&inodedep->id_inowait) ||
11738	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
11739	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
11740	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
11741	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
11742		panic("softdep_fsync: pending ops %p", inodedep);
11743	for (error = 0, flushparent = 0; ; ) {
11744		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
11745			break;
11746		if (wk->wk_type != D_DIRADD)
11747			panic("softdep_fsync: Unexpected type %s",
11748			    TYPENAME(wk->wk_type));
11749		dap = WK_DIRADD(wk);
11750		/*
11751		 * Flush our parent if this directory entry has a MKDIR_PARENT
11752		 * dependency or is contained in a newly allocated block.
11753		 */
11754		if (dap->da_state & DIRCHG)
11755			pagedep = dap->da_previous->dm_pagedep;
11756		else
11757			pagedep = dap->da_pagedep;
11758		parentino = pagedep->pd_ino;
11759		lbn = pagedep->pd_lbn;
11760		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
11761			panic("softdep_fsync: dirty");
11762		if ((dap->da_state & MKDIR_PARENT) ||
11763		    (pagedep->pd_state & NEWBLOCK))
11764			flushparent = 1;
11765		else
11766			flushparent = 0;
11767		/*
11768		 * If we are being fsync'ed as part of vgone'ing this vnode,
11769		 * then we will not be able to release and recover the
11770		 * vnode below, so we just have to give up on writing its
11771		 * directory entry out. It will eventually be written, just
11772		 * not now, but then the user was not asking to have it
11773		 * written, so we are not breaking any promises.
11774		 */
11775		if (vp->v_iflag & VI_DOOMED)
11776			break;
11777		/*
11778		 * We prevent deadlock by always fetching inodes from the
11779		 * root, moving down the directory tree. Thus, when fetching
11780		 * our parent directory, we first try to get the lock. If
11781		 * that fails, we must unlock ourselves before requesting
11782		 * the lock on our parent. See the comment in ufs_lookup
11783		 * for details on possible races.
11784		 */
11785		FREE_LOCK(&lk);
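		/*
		 * Try a non-blocking exclusive vget of the parent first.
		 * If that fails, vfs_busy() the mount so it cannot go
		 * away, drop our own vnode lock, do a blocking vget of
		 * the parent, and then re-lock and re-check ourselves
		 * for VI_DOOMED.
		 */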
11786		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
11787		    FFSV_FORCEINSMQ)) {
11788			error = vfs_busy(mp, MBF_NOWAIT);
11789			if (error != 0) {
11790				vfs_ref(mp);
11791				VOP_UNLOCK(vp, 0);
11792				error = vfs_busy(mp, 0);
11793				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
11794				vfs_rel(mp);
11795				if (error != 0)
11796					return (ENOENT);
11797				if (vp->v_iflag & VI_DOOMED) {
11798					vfs_unbusy(mp);
11799					return (ENOENT);
11800				}
11801			}
11802			VOP_UNLOCK(vp, 0);
11803			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
11804			    &pvp, FFSV_FORCEINSMQ);
11805			vfs_unbusy(mp);
11806			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
11807			if (vp->v_iflag & VI_DOOMED) {
11808				if (error == 0)
11809					vput(pvp);
11810				error = ENOENT;
11811			}
11812			if (error != 0)
11813				return (error);
11814		}
11815		/*
11816		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
11817		 * that are contained in direct blocks will be resolved by
11818		 * doing a ffs_update. Pagedeps contained in indirect blocks
11819		 * may require a complete sync'ing of the directory. So, we
11820		 * try the cheap and fast ffs_update first, and if that fails,
11821		 * then we do the slower ffs_syncvnode of the directory.
11822		 */
11823		if (flushparent) {
11824			int locked;
11825
11826			if ((error = ffs_update(pvp, 1)) != 0) {
11827				vput(pvp);
11828				return (error);
11829			}
11830			ACQUIRE_LOCK(&lk);
11831			locked = 1;
11832			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
11833				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
11834					if (wk->wk_type != D_DIRADD)
11835						panic("softdep_fsync: Unexpected type %s",
11836						      TYPENAME(wk->wk_type));
11837					dap = WK_DIRADD(wk);
11838					if (dap->da_state & DIRCHG)
11839						pagedep = dap->da_previous->dm_pagedep;
11840					else
11841						pagedep = dap->da_pagedep;
11842					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
11843					FREE_LOCK(&lk);
11844					locked = 0;
11845					if (pagedep_new_block && (error =
11846					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
11847						vput(pvp);
11848						return (error);
11849					}
11850				}
11851			}
11852			if (locked)
11853				FREE_LOCK(&lk);
11854		}
11855		/*
11856		 * Flush directory page containing the inode's name.
11857		 */
11858		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
11859		    &bp);
11860		if (error == 0)
11861			error = bwrite(bp);
11862		else
11863			brelse(bp);
11864		vput(pvp);
11865		if (error != 0)
11866			return (error);
11867		ACQUIRE_LOCK(&lk);
11868		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
11869			break;
11870	}
11871	FREE_LOCK(&lk);
11872	return (0);
11873}
11874
11875/*
11876 * Flush all the dirty bitmaps associated with the block device
11877 * before flushing the rest of the dirty blocks so as to reduce
11878 * the number of dependencies that will have to be rolled back.
11879 *
11880 * XXX Unused?
11881 */
11882void
11883softdep_fsync_mountdev(vp)
11884	struct vnode *vp;
11885{
11886	struct buf *bp, *nbp;
11887	struct worklist *wk;
11888	struct bufobj *bo;
11889
11890	if (!vn_isdisk(vp, NULL))
11891		panic("softdep_fsync_mountdev: vnode not a disk");
11892	bo = &vp->v_bufobj;
11893restart:
11894	BO_LOCK(bo);
11895	ACQUIRE_LOCK(&lk);
11896	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
11897		/*
11898		 * If it is already scheduled, skip to the next buffer.
11899		 */
11900		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
11901			continue;
11902
11903		if ((bp->b_flags & B_DELWRI) == 0)
11904			panic("softdep_fsync_mountdev: not dirty");
11905		/*
11906		 * We are only interested in bitmaps with outstanding
11907		 * dependencies.
11908		 */
11909		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
11910		    wk->wk_type != D_BMSAFEMAP ||
11911		    (bp->b_vflags & BV_BKGRDINPROG)) {
11912			BUF_UNLOCK(bp);
11913			continue;
11914		}
11915		FREE_LOCK(&lk);
11916		BO_UNLOCK(bo);
11917		bremfree(bp);
11918		(void) bawrite(bp);
11919		goto restart;
11920	}
11921	FREE_LOCK(&lk);
11922	drain_output(vp);
11923	BO_UNLOCK(bo);
11924}
11925
11926/*
11927 * Sync all cylinder groups that were dirty at the time this function is
11928 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
11929 * is used to flush freedep activity that may be holding up writes to an
11930 * indirect block.
11931 */
11932static int
11933sync_cgs(mp, waitfor)
11934	struct mount *mp;
11935	int waitfor;
11936{
11937	struct bmsafemap *bmsafemap;
11938	struct bmsafemap *sintenel;
11939	struct ufsmount *ump;
11940	struct buf *bp;
11941	int error;
11942
11943	sintenel = malloc(sizeof(*sintenel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
11944	sintenel->sm_cg = -1;
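	/*
	 * The sentinel entry (sm_cg == -1) marks our place in
	 * softdep_dirtycg so that the scan can continue safely after lk
	 * is dropped around each buffer write; real cgs never use -1.
	 */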
11945	ump = VFSTOUFS(mp);
11946	error = 0;
11947	ACQUIRE_LOCK(&lk);
11948	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sintenel, sm_next);
11949	for (bmsafemap = LIST_NEXT(sintenel, sm_next); bmsafemap != NULL;
11950	    bmsafemap = LIST_NEXT(sintenel, sm_next)) {
11951		/* Skip sentinels and cgs with no work to release. */
11952		if (bmsafemap->sm_cg == -1 ||
11953		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
11954		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
11955			LIST_REMOVE(sintenel, sm_next);
11956			LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next);
11957			continue;
11958		}
11959		/*
11960		 * If we don't get the lock and we're waiting try again, if
11961		 * not move on to the next buf and try to sync it.
11962		 */
11963		bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor);
11964		if (bp == NULL && waitfor == MNT_WAIT)
11965			continue;
11966		LIST_REMOVE(sintenel, sm_next);
11967		LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next);
11968		if (bp == NULL)
11969			continue;
11970		FREE_LOCK(&lk);
11971		if (waitfor == MNT_NOWAIT)
11972			bawrite(bp);
11973		else
11974			error = bwrite(bp);
11975		ACQUIRE_LOCK(&lk);
11976		if (error)
11977			break;
11978	}
11979	LIST_REMOVE(sintenel, sm_next);
11980	FREE_LOCK(&lk);
11981	free(sintenel, M_BMSAFEMAP);
11982	return (error);
11983}
11984
11985/*
11986 * This routine is called when we are trying to synchronously flush a
11987 * file. This routine must eliminate any filesystem metadata dependencies
11988 * so that the syncing routine can succeed.
11989 */
11990int
11991softdep_sync_metadata(struct vnode *vp)
11992{
11993	int error;
11994
11995	/*
11996	 * Ensure that any direct block dependencies have been cleared,
11997	 * truncations are started, and inode references are journaled.
11998	 */
11999	ACQUIRE_LOCK(&lk);
12000	/*
12001	 * Write all journal records to prevent rollbacks on devvp.
12002	 */
12003	if (vp->v_type == VCHR)
12004		softdep_flushjournal(vp->v_mount);
12005	error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number);
12006	/*
12007	 * Ensure that all truncates are written so we won't find deps on
12008	 * indirect blocks.
12009	 */
12010	process_truncates(vp);
12011	FREE_LOCK(&lk);
12012
12013	return (error);
12014}
12015
12016/*
12017 * This routine is called when we are attempting to sync a buf with
12018 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12019 * other IO it can but returns EBUSY if the buffer is not yet able to
12020 * be written.  Dependencies which will not cause rollbacks will always
12021 * return 0.
12022 */
12023int
12024softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12025{
12026	struct indirdep *indirdep;
12027	struct pagedep *pagedep;
12028	struct allocindir *aip;
12029	struct newblk *newblk;
12030	struct buf *nbp;
12031	struct worklist *wk;
12032	int i, error;
12033
12034	/*
12035	 * For VCHR we just don't want to force flush any dependencies that
12036	 * will cause rollbacks.
12037	 */
12038	if (vp->v_type == VCHR) {
12039		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12040			return (EBUSY);
12041		return (0);
12042	}
12043	ACQUIRE_LOCK(&lk);
12044	/*
12045	 * As we hold the buffer locked, none of its dependencies
12046	 * will disappear.
12047	 */
12048	error = 0;
12049top:
12050	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12051		switch (wk->wk_type) {
12052
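		/*
		 * A newly allocated block must have its journal record
		 * written and its cylinder group bitmap on disk before
		 * this buffer can be written without a rollback, so wait
		 * for the journal entry and push the bitmap as needed.
		 */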
12053		case D_ALLOCDIRECT:
12054		case D_ALLOCINDIR:
12055			newblk = WK_NEWBLK(wk);
12056			if (newblk->nb_jnewblk != NULL) {
12057				if (waitfor == MNT_NOWAIT) {
12058					error = EBUSY;
12059					goto out_unlock;
12060				}
12061				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12062				goto top;
12063			}
12064			if (newblk->nb_state & DEPCOMPLETE ||
12065			    waitfor == MNT_NOWAIT)
12066				continue;
12067			nbp = newblk->nb_bmsafemap->sm_buf;
12068			nbp = getdirtybuf(nbp, &lk, waitfor);
12069			if (nbp == NULL)
12070				goto top;
12071			FREE_LOCK(&lk);
12072			if ((error = bwrite(nbp)) != 0)
12073				goto out;
12074			ACQUIRE_LOCK(&lk);
12075			continue;
12076
12077		case D_INDIRDEP:
12078			indirdep = WK_INDIRDEP(wk);
12079			if (waitfor == MNT_NOWAIT) {
12080				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12081				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12082					error = EBUSY;
12083					goto out_unlock;
12084				}
12085			}
12086			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12087				panic("softdep_sync_buf: truncation pending.");
12088		restart:
12089			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12090				newblk = (struct newblk *)aip;
12091				if (newblk->nb_jnewblk != NULL) {
12092					jwait(&newblk->nb_jnewblk->jn_list,
12093					    waitfor);
12094					goto restart;
12095				}
12096				if (newblk->nb_state & DEPCOMPLETE)
12097					continue;
12098				nbp = newblk->nb_bmsafemap->sm_buf;
12099				nbp = getdirtybuf(nbp, &lk, waitfor);
12100				if (nbp == NULL)
12101					goto restart;
12102				FREE_LOCK(&lk);
12103				if ((error = bwrite(nbp)) != 0)
12104					goto out;
12105				ACQUIRE_LOCK(&lk);
12106				goto restart;
12107			}
12108			continue;
12109
12110		case D_PAGEDEP:
12111			/*
12112			 * Only flush directory entries in synchronous passes.
12113			 */
12114			if (waitfor != MNT_WAIT) {
12115				error = EBUSY;
12116				goto out_unlock;
12117			}
12118			/*
12119			 * While syncing snapshots, we must allow recursive
12120			 * lookups.
12121			 */
12122			BUF_AREC(bp);
12123			/*
12124			 * We are trying to sync a directory that may
12125			 * have dependencies on both its own metadata
12126			 * and/or dependencies on the inodes of any
12127			 * recently allocated files. We walk its diradd
12128			 * lists pushing out the associated inode.
12129			 */
12130			pagedep = WK_PAGEDEP(wk);
12131			for (i = 0; i < DAHASHSZ; i++) {
12132				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12133					continue;
12134				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12135				    &pagedep->pd_diraddhd[i]))) {
12136					BUF_NOREC(bp);
12137					goto out_unlock;
12138				}
12139			}
12140			BUF_NOREC(bp);
12141			continue;
12142
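		/*
		 * These work items do not require any other buffers to be
		 * written before this one can go out; they are completed
		 * elsewhere.
		 */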
12143		case D_FREEWORK:
12144		case D_FREEDEP:
12145		case D_JSEGDEP:
12146		case D_JNEWBLK:
12147			continue;
12148
12149		default:
12150			panic("softdep_sync_buf: Unknown type %s",
12151			    TYPENAME(wk->wk_type));
12152			/* NOTREACHED */
12153		}
12154	}
12155out_unlock:
12156	FREE_LOCK(&lk);
12157out:
12158	return (error);
12159}
12160
12161/*
12162 * Flush the dependencies associated with an inodedep.
12163 * Called with splbio blocked.
12164 */
12165static int
12166flush_inodedep_deps(vp, mp, ino)
12167	struct vnode *vp;
12168	struct mount *mp;
12169	ino_t ino;
12170{
12171	struct inodedep *inodedep;
12172	struct inoref *inoref;
12173	int error, waitfor;
12174
12175	/*
12176	 * This work is done in two passes. The first pass grabs most
12177	 * of the buffers and begins asynchronously writing them. The
12178	 * only way to wait for these asynchronous writes is to sleep
12179	 * on the filesystem vnode which may stay busy for a long time
12180	 * if the filesystem is active. So, instead, we make a second
12181	 * pass over the dependencies blocking on each write. In the
12182	 * usual case we will be blocking against a write that we
12183	 * initiated, so when it is done the dependency will have been
12184	 * resolved. Thus the second pass is expected to end quickly.
12185	 * We give a brief window at the top of the loop to allow
12186	 * any pending I/O to complete.
12187	 */
12188	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12189		if (error)
12190			return (error);
12191		FREE_LOCK(&lk);
12192		ACQUIRE_LOCK(&lk);
12193restart:
12194		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12195			return (0);
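		/*
		 * Wait for any pending journal additions to complete so
		 * that we do not cause rollbacks while syncing.
		 */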
12196		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12197			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12198			    == DEPCOMPLETE) {
12199				jwait(&inoref->if_list, MNT_WAIT);
12200				goto restart;
12201			}
12202		}
12203		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12204		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12205		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12206		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12207			continue;
12208		/*
12209		 * If we completed pass 2, we are done; otherwise do pass 2.
12210		 */
12211		if (waitfor == MNT_WAIT)
12212			break;
12213		waitfor = MNT_WAIT;
12214	}
12215	/*
12216	 * Try freeing inodedep in case all dependencies have been removed.
12217	 */
12218	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12219		(void) free_inodedep(inodedep);
12220	return (0);
12221}
12222
12223/*
12224 * Flush an inode dependency list.
12225 * Called with splbio blocked.
12226 */
12227static int
12228flush_deplist(listhead, waitfor, errorp)
12229	struct allocdirectlst *listhead;
12230	int waitfor;
12231	int *errorp;
12232{
12233	struct allocdirect *adp;
12234	struct newblk *newblk;
12235	struct buf *bp;
12236
12237	mtx_assert(&lk, MA_OWNED);
12238	TAILQ_FOREACH(adp, listhead, ad_next) {
12239		newblk = (struct newblk *)adp;
12240		if (newblk->nb_jnewblk != NULL) {
12241			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12242			return (1);
12243		}
12244		if (newblk->nb_state & DEPCOMPLETE)
12245			continue;
12246		bp = newblk->nb_bmsafemap->sm_buf;
12247		bp = getdirtybuf(bp, &lk, waitfor);
12248		if (bp == NULL) {
12249			if (waitfor == MNT_NOWAIT)
12250				continue;
12251			return (1);
12252		}
12253		FREE_LOCK(&lk);
12254		if (waitfor == MNT_NOWAIT)
12255			bawrite(bp);
12256		else
12257			*errorp = bwrite(bp);
12258		ACQUIRE_LOCK(&lk);
12259		return (1);
12260	}
12261	return (0);
12262}
12263
12264/*
12265 * Flush dependencies associated with an allocdirect block.
12266 */
12267static int
12268flush_newblk_dep(vp, mp, lbn)
12269	struct vnode *vp;
12270	struct mount *mp;
12271	ufs_lbn_t lbn;
12272{
12273	struct newblk *newblk;
12274	struct bufobj *bo;
12275	struct inode *ip;
12276	struct buf *bp;
12277	ufs2_daddr_t blkno;
12278	int error;
12279
12280	error = 0;
12281	bo = &vp->v_bufobj;
12282	ip = VTOI(vp);
12283	blkno = DIP(ip, i_db[lbn]);
12284	if (blkno == 0)
12285		panic("flush_newblk_dep: Missing block");
12286	ACQUIRE_LOCK(&lk);
12287	/*
12288	 * Loop until all dependencies related to this block are satisfied.
12289	 * We must be careful to restart after each sleep in case a write
12290	 * completes some part of this process for us.
12291	 */
12292	for (;;) {
12293		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12294			FREE_LOCK(&lk);
12295			break;
12296		}
12297		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12298			panic("flush_newblk_deps: Bad newblk %p", newblk);
12299		/*
12300		 * Flush the journal.
12301		 */
12302		if (newblk->nb_jnewblk != NULL) {
12303			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12304			continue;
12305		}
12306		/*
12307		 * Write the bitmap dependency.
12308		 */
12309		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12310			bp = newblk->nb_bmsafemap->sm_buf;
12311			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12312			if (bp == NULL)
12313				continue;
12314			FREE_LOCK(&lk);
12315			error = bwrite(bp);
12316			if (error)
12317				break;
12318			ACQUIRE_LOCK(&lk);
12319			continue;
12320		}
12321		/*
12322		 * Write the buffer.
12323		 */
12324		FREE_LOCK(&lk);
12325		BO_LOCK(bo);
12326		bp = gbincore(bo, lbn);
12327		if (bp != NULL) {
12328			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12329			    LK_INTERLOCK, BO_MTX(bo));
12330			if (error == ENOLCK) {
12331				ACQUIRE_LOCK(&lk);
12332				continue; /* Slept, retry */
12333			}
12334			if (error != 0)
12335				break;	/* Failed */
12336			if (bp->b_flags & B_DELWRI) {
12337				bremfree(bp);
12338				error = bwrite(bp);
12339				if (error)
12340					break;
12341			} else
12342				BUF_UNLOCK(bp);
12343		} else
12344			BO_UNLOCK(bo);
12345		/*
12346		 * We have to wait for the direct pointers to
12347		 * point at the newdirblk before the dependency
12348		 * will go away.
12349		 */
12350		error = ffs_update(vp, 1);
12351		if (error)
12352			break;
12353		ACQUIRE_LOCK(&lk);
12354	}
12355	return (error);
12356}
12357
12358/*
12359 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12360 * Called with splbio blocked.
12361 */
12362static int
12363flush_pagedep_deps(pvp, mp, diraddhdp)
12364	struct vnode *pvp;
12365	struct mount *mp;
12366	struct diraddhd *diraddhdp;
12367{
12368	struct inodedep *inodedep;
12369	struct inoref *inoref;
12370	struct ufsmount *ump;
12371	struct diradd *dap;
12372	struct vnode *vp;
12373	int error = 0;
12374	struct buf *bp;
12375	ino_t inum;
12376
12377	ump = VFSTOUFS(mp);
12378restart:
12379	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12380		/*
12381		 * Flush ourselves if this directory entry
12382		 * has a MKDIR_PARENT dependency.
12383		 */
12384		if (dap->da_state & MKDIR_PARENT) {
12385			FREE_LOCK(&lk);
12386			if ((error = ffs_update(pvp, 1)) != 0)
12387				break;
12388			ACQUIRE_LOCK(&lk);
12389			/*
12390			 * If that cleared dependencies, go on to next.
12391			 */
12392			if (dap != LIST_FIRST(diraddhdp))
12393				continue;
12394			if (dap->da_state & MKDIR_PARENT)
12395				panic("flush_pagedep_deps: MKDIR_PARENT");
12396		}
12397		/*
12398		 * A newly allocated directory must have its "." and
12399		 * ".." entries written out before its name can be
12400		 * committed in its parent.
12401		 */
12402		inum = dap->da_newinum;
12403		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12404			panic("flush_pagedep_deps: lost inode1");
12405		/*
12406		 * Wait for any pending journal adds to complete so we don't
12407		 * cause rollbacks while syncing.
12408		 */
12409		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12410			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12411			    == DEPCOMPLETE) {
12412				jwait(&inoref->if_list, MNT_WAIT);
12413				goto restart;
12414			}
12415		}
12416		if (dap->da_state & MKDIR_BODY) {
12417			FREE_LOCK(&lk);
12418			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12419			    FFSV_FORCEINSMQ)))
12420				break;
12421			error = flush_newblk_dep(vp, mp, 0);
12422			/*
12423			 * If we still have the dependency we might need to
12424			 * update the vnode to sync the new link count to
12425			 * disk.
12426			 */
12427			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12428				error = ffs_update(vp, 1);
12429			vput(vp);
12430			if (error != 0)
12431				break;
12432			ACQUIRE_LOCK(&lk);
12433			/*
12434			 * If that cleared dependencies, go on to next.
12435			 */
12436			if (dap != LIST_FIRST(diraddhdp))
12437				continue;
12438			if (dap->da_state & MKDIR_BODY) {
12439				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12440				    &inodedep);
12441				panic("flush_pagedep_deps: MKDIR_BODY "
12442				    "inodedep %p dap %p vp %p",
12443				    inodedep, dap, vp);
12444			}
12445		}
12446		/*
12447		 * Flush the inode on which the directory entry depends.
12448		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12449		 * the only remaining dependency is that the updated inode
12450		 * count must get pushed to disk. The inode has already
12451		 * been pushed into its inode buffer (via VOP_UPDATE) at
12452		 * the time of the reference count change. So we need only
12453		 * locate that buffer, ensure that there will be no rollback
12454		 * caused by a bitmap dependency, then write the inode buffer.
12455		 */
12456retry:
12457		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12458			panic("flush_pagedep_deps: lost inode");
12459		/*
12460		 * If the inode still has bitmap dependencies,
12461		 * push them to disk.
12462		 */
12463		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
12464			bp = inodedep->id_bmsafemap->sm_buf;
12465			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12466			if (bp == NULL)
12467				goto retry;
12468			FREE_LOCK(&lk);
12469			if ((error = bwrite(bp)) != 0)
12470				break;
12471			ACQUIRE_LOCK(&lk);
12472			if (dap != LIST_FIRST(diraddhdp))
12473				continue;
12474		}
12475		/*
12476		 * If the inode is still sitting in a buffer waiting
12477		 * to be written or waiting for the link count to be
12478		 * adjusted, update it here to flush it to disk.
12479		 */
12480		if (dap == LIST_FIRST(diraddhdp)) {
12481			FREE_LOCK(&lk);
12482			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12483			    FFSV_FORCEINSMQ)))
12484				break;
12485			error = ffs_update(vp, 1);
12486			vput(vp);
12487			if (error)
12488				break;
12489			ACQUIRE_LOCK(&lk);
12490		}
12491		/*
12492		 * If we have failed to get rid of all the dependencies
12493		 * then something is seriously wrong.
12494		 */
12495		if (dap == LIST_FIRST(diraddhdp)) {
12496			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
12497			panic("flush_pagedep_deps: failed to flush "
12498			    "inodedep %p ino %ju dap %p",
12499			    inodedep, (uintmax_t)inum, dap);
12500		}
12501	}
12502	if (error)
12503		ACQUIRE_LOCK(&lk);
12504	return (error);
12505}
12506
12507/*
12508 * A large burst of file addition or deletion activity can drive the
12509 * memory load excessively high. First attempt to slow things down
12510 * using the techniques below. If that fails, this routine requests
12511 * the offending operations to fall back to running synchronously
12512 * until the memory load returns to a reasonable level.
12513 */
12514int
12515softdep_slowdown(vp)
12516	struct vnode *vp;
12517{
12518	struct ufsmount *ump;
12519	int jlow;
12520	int max_softdeps_hard;
12521
12522	ACQUIRE_LOCK(&lk);
12523	jlow = 0;
12524	/*
12525	 * Check for journal space if needed.
12526	 */
12527	if (DOINGSUJ(vp)) {
12528		ump = VFSTOUFS(vp->v_mount);
12529		if (journal_space(ump, 0) == 0)
12530			jlow = 1;
12531	}
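	/*
	 * Allow the dependency counts to grow roughly 10% beyond
	 * max_softdeps before forcing the offending operations to run
	 * synchronously.
	 */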
12532	max_softdeps_hard = max_softdeps * 11 / 10;
12533	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
12534	    dep_current[D_INODEDEP] < max_softdeps_hard &&
12535	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
12536	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
12537		FREE_LOCK(&lk);
12538		return (0);
12539	}
12540	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
12541		softdep_speedup();
12542	stat_sync_limit_hit += 1;
12543	FREE_LOCK(&lk);
12544	if (DOINGSUJ(vp))
12545		return (0);
12546	return (1);
12547}
12548
12549/*
12550 * Called by the allocation routines when they are about to fail
12551 * in the hope that we can free up the requested resource (inodes
12552 * or disk space).
12553 *
12554 * First check to see if the work list has anything on it. If it has,
12555 * clean up entries until we successfully free the requested resource.
12556 * Because this process holds inodes locked, we cannot handle any remove
12557 * requests that might block on a locked inode as that could lead to
12558 * deadlock. If the worklist yields none of the requested resource,
12559 * start syncing out vnodes to free up the needed space.
12560 */
12561int
12562softdep_request_cleanup(fs, vp, cred, resource)
12563	struct fs *fs;
12564	struct vnode *vp;
12565	struct ucred *cred;
12566	int resource;
12567{
12568	struct ufsmount *ump;
12569	struct mount *mp;
12570	struct vnode *lvp, *mvp;
12571	long starttime;
12572	ufs2_daddr_t needed;
12573	int error;
12574
12575	/*
12576	 * If we are being called because of a process doing a
12577	 * copy-on-write, then it is not safe to process any
12578	 * worklist items as we will recurse into the copyonwrite
12579	 * routine.  This will result in an incoherent snapshot.
12580	 * If the vnode that we hold is a snapshot, we must avoid
12581	 * handling other resources that could cause deadlock.
12582	 */
12583	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
12584		return (0);
12585
12586	if (resource == FLUSH_BLOCKS_WAIT)
12587		stat_cleanup_blkrequests += 1;
12588	else
12589		stat_cleanup_inorequests += 1;
12590
12591	mp = vp->v_mount;
12592	ump = VFSTOUFS(mp);
12593	mtx_assert(UFS_MTX(ump), MA_OWNED);
12594	UFS_UNLOCK(ump);
12595	error = ffs_update(vp, 1);
12596	if (error != 0) {
12597		UFS_LOCK(ump);
12598		return (0);
12599	}
12600	/*
12601	 * If we are in need of resources, consider pausing for
12602	 * tickdelay to give ourselves some breathing room.
12603	 */
12604	ACQUIRE_LOCK(&lk);
12605	process_removes(vp);
12606	process_truncates(vp);
12607	request_cleanup(UFSTOVFS(ump), resource);
12608	FREE_LOCK(&lk);
12609	/*
12610	 * Now clean up at least as many resources as we will need.
12611	 *
12612	 * When requested to clean up inodes, the number that are needed
12613	 * is set by the number of simultaneous writers (mnt_writeopcount)
12614	 * plus a bit of slop (2) in case some more writers show up while
12615	 * we are cleaning.
12616	 *
12617	 * When requested to free up space, the amount of space that
12618	 * we need is enough blocks to allocate a full-sized segment
12619	 * (fs_contigsumsize). The number of such segments that will
12620	 * be needed is set by the number of simultaneous writers
12621	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
12622	 * writers show up while we are cleaning.
12623	 *
12624	 * Additionally, if we are unprivileged and allocating space,
12625	 * we need to ensure that we clean up enough blocks to get the
12626	 * needed number of blocks over the threshold of the minimum
12627	 * number of blocks required to be kept free by the filesystem
12628	 * (fs_minfree).
12629	 */
12630	if (resource == FLUSH_INODES_WAIT) {
12631		needed = vp->v_mount->mnt_writeopcount + 2;
12632	} else if (resource == FLUSH_BLOCKS_WAIT) {
12633		needed = (vp->v_mount->mnt_writeopcount + 2) *
12634		    fs->fs_contigsumsize;
12635		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
12636			needed += fragstoblks(fs,
12637			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
12638			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
12639	} else {
12640		UFS_LOCK(ump);
12641		printf("softdep_request_cleanup: Unknown resource type %d\n",
12642		    resource);
12643		return (0);
12644	}
12645	starttime = time_second;
12646retry:
12647	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
12648	    fs->fs_cstotal.cs_nbfree <= needed) ||
12649	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12650	    fs->fs_cstotal.cs_nifree <= needed)) {
12651		ACQUIRE_LOCK(&lk);
12652		if (ump->softdep_on_worklist > 0 &&
12653		    process_worklist_item(UFSTOVFS(ump),
12654		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
12655			stat_worklist_push += 1;
12656		FREE_LOCK(&lk);
12657	}
12658	/*
12659	 * If we still need resources and there are no more worklist
12660	 * entries to process to obtain them, we have to start flushing
12661	 * the dirty vnodes to force the release of additional requests
12662	 * to the worklist that we can then process to reap additional
12663	 * resources. We walk the vnodes associated with the mount point
12664	 * until we get the needed worklist requests that we can reap.
12665	 */
12666	if ((resource == FLUSH_BLOCKS_WAIT &&
12667	     fs->fs_cstotal.cs_nbfree <= needed) ||
12668	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12669	     fs->fs_cstotal.cs_nifree <= needed)) {
12670		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
12671			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
12672				VI_UNLOCK(lvp);
12673				continue;
12674			}
12675			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
12676			    curthread))
12677				continue;
12678			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
12679				vput(lvp);
12680				continue;
12681			}
12682			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
12683			vput(lvp);
12684		}
12685		lvp = ump->um_devvp;
12686		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
12687			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
12688			VOP_UNLOCK(lvp, 0);
12689		}
12690		if (ump->softdep_on_worklist > 0) {
12691			stat_cleanup_retries += 1;
12692			goto retry;
12693		}
12694		stat_cleanup_failures += 1;
12695	}
12696	if (time_second - starttime > stat_cleanup_high_delay)
12697		stat_cleanup_high_delay = time_second - starttime;
12698	UFS_LOCK(ump);
12699	return (1);
12700}
12701
12702/*
12703 * If memory utilization has gotten too high, deliberately slow things
12704 * down and speed up the I/O processing.
12705 */
12706extern struct thread *syncertd;
12707static int
12708request_cleanup(mp, resource)
12709	struct mount *mp;
12710	int resource;
12711{
12712	struct thread *td = curthread;
12713	struct ufsmount *ump;
12714
12715	mtx_assert(&lk, MA_OWNED);
12716	/*
12717	 * We never hold up the filesystem syncer or buf daemon.
12718	 */
12719	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
12720		return (0);
12721	ump = VFSTOUFS(mp);
12722	/*
12723	 * First check to see if the work list has gotten backlogged.
12724	 * If it has, co-opt this process to help clean up two entries.
12725	 * Because this process may hold inodes locked, we cannot
12726	 * handle any remove requests that might block on a locked
12727	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
12728	 * to avoid recursively processing the worklist.
12729	 */
12730	if (ump->softdep_on_worklist > max_softdeps / 10) {
12731		td->td_pflags |= TDP_SOFTDEP;
12732		process_worklist_item(mp, 2, LK_NOWAIT);
12733		td->td_pflags &= ~TDP_SOFTDEP;
12734		stat_worklist_push += 2;
12735		return (1);
12736	}
12737	/*
12738	 * Next, we attempt to speed up the syncer process. If that
12739	 * is successful, then we allow the process to continue.
12740	 */
12741	if (softdep_speedup() &&
12742	    resource != FLUSH_BLOCKS_WAIT &&
12743	    resource != FLUSH_INODES_WAIT)
12744		return (0);
12745	/*
12746	 * If we are resource constrained on inode dependencies, try
12747	 * flushing some dirty inodes. Otherwise, we are constrained
12748	 * by file deletions, so try accelerating flushes of directories
12749	 * with removal dependencies. We would like to do the cleanup
12750	 * here, but we probably hold an inode locked at this point and
12751	 * that might deadlock against one that we try to clean. So,
12752	 * the best that we can do is request the syncer daemon to do
12753	 * the cleanup for us.
12754	 */
12755	switch (resource) {
12756
12757	case FLUSH_INODES:
12758	case FLUSH_INODES_WAIT:
12759		stat_ino_limit_push += 1;
12760		req_clear_inodedeps += 1;
12761		stat_countp = &stat_ino_limit_hit;
12762		break;
12763
12764	case FLUSH_BLOCKS:
12765	case FLUSH_BLOCKS_WAIT:
12766		stat_blk_limit_push += 1;
12767		req_clear_remove += 1;
12768		stat_countp = &stat_blk_limit_hit;
12769		break;
12770
12771	default:
12772		panic("request_cleanup: unknown type");
12773	}
12774	/*
12775	 * Hopefully the syncer daemon will catch up and awaken us.
12776	 * We wait at most tickdelay before proceeding in any case.
12777	 */
12778	proc_waiting += 1;
12779	if (callout_pending(&softdep_callout) == FALSE)
12780		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
12781		    pause_timer, 0);
12782
12783	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
12784	proc_waiting -= 1;
12785	return (1);
12786}
12787
12788/*
12789 * Awaken processes pausing in request_cleanup and rearm the timer
12790 * while processes remain waiting.
12791 */
12792static void
12793pause_timer(arg)
12794	void *arg;
12795{
12796
12797	/*
12798	 * The callout_ API has acquired mtx and will hold it around this
12799	 * function call.
12800	 */
12801	*stat_countp += 1;
12802	wakeup_one(&proc_waiting);
12803	if (proc_waiting > 0)
12804		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
12805		    pause_timer, 0);
12806}
12807
12808/*
12809 * Flush out a directory with at least one removal dependency in an effort to
12810 * reduce the number of dirrem, freefile, and freeblks dependency structures.
12811 */
12812static void
12813clear_remove(void)
12814{
12815	struct pagedep_hashhead *pagedephd;
12816	struct pagedep *pagedep;
12817	static int next = 0;
12818	struct mount *mp;
12819	struct vnode *vp;
12820	struct bufobj *bo;
12821	int error, cnt;
12822	ino_t ino;
12823
12824	mtx_assert(&lk, MA_OWNED);
12825
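	/*
	 * The static rotor "next" spreads successive calls across the
	 * pagedep hash table so that the same buckets are not always
	 * searched first.
	 */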
12826	for (cnt = 0; cnt < pagedep_hash; cnt++) {
12827		pagedephd = &pagedep_hashtbl[next++];
12828		if (next >= pagedep_hash)
12829			next = 0;
12830		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
12831			if (LIST_EMPTY(&pagedep->pd_dirremhd))
12832				continue;
12833			mp = pagedep->pd_list.wk_mp;
12834			ino = pagedep->pd_ino;
12835			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
12836				continue;
12837			FREE_LOCK(&lk);
12838
12839			/*
12840			 * Let unmount clear deps
12841			 */
12842			error = vfs_busy(mp, MBF_NOWAIT);
12843			if (error != 0)
12844				goto finish_write;
12845			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
12846			     FFSV_FORCEINSMQ);
12847			vfs_unbusy(mp);
12848			if (error != 0) {
12849				softdep_error("clear_remove: vget", error);
12850				goto finish_write;
12851			}
12852			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
12853				softdep_error("clear_remove: fsync", error);
12854			bo = &vp->v_bufobj;
12855			BO_LOCK(bo);
12856			drain_output(vp);
12857			BO_UNLOCK(bo);
12858			vput(vp);
12859		finish_write:
12860			vn_finished_write(mp);
12861			ACQUIRE_LOCK(&lk);
12862			return;
12863		}
12864	}
12865}
12866
12867/*
12868 * Clear out a block of dirty inodes in an effort to reduce
12869 * the number of inodedep dependency structures.
12870 */
12871static void
12872clear_inodedeps(void)
12873{
12874	struct inodedep_hashhead *inodedephd;
12875	struct inodedep *inodedep;
12876	static int next = 0;
12877	struct mount *mp;
12878	struct vnode *vp;
12879	struct fs *fs;
12880	int error, cnt;
12881	ino_t firstino, lastino, ino;
12882
12883	mtx_assert(&lk, MA_OWNED);
12884	/*
12885	 * Pick a random inode dependency to be cleared.
12886	 * We will then gather up all the inodes in its block
12887	 * that have dependencies and flush them out.
12888	 */
12889	for (cnt = 0; cnt < inodedep_hash; cnt++) {
12890		inodedephd = &inodedep_hashtbl[next++];
12891		if (next >= inodedep_hash)
12892			next = 0;
12893		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
12894			break;
12895	}
12896	if (inodedep == NULL)
12897		return;
12898	fs = inodedep->id_fs;
12899	mp = inodedep->id_list.wk_mp;
12900	/*
12901	 * Find the last inode in the block with dependencies.
12902	 */
12903	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
12904	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
12905		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
12906			break;
12907	/*
12908	 * Asynchronously push all but the last inode with dependencies.
12909	 * Synchronously push the last inode with dependencies to ensure
12910	 * that the inode block gets written to free up the inodedeps.
12911	 */
12912	for (ino = firstino; ino <= lastino; ino++) {
12913		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12914			continue;
12915		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
12916			continue;
12917		FREE_LOCK(&lk);
12918		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
12919		if (error != 0) {
12920			vn_finished_write(mp);
12921			ACQUIRE_LOCK(&lk);
12922			return;
12923		}
12924		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
12925		    FFSV_FORCEINSMQ)) != 0) {
12926			softdep_error("clear_inodedeps: vget", error);
12927			vfs_unbusy(mp);
12928			vn_finished_write(mp);
12929			ACQUIRE_LOCK(&lk);
12930			return;
12931		}
12932		vfs_unbusy(mp);
12933		if (ino == lastino) {
12934			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
12935				softdep_error("clear_inodedeps: fsync1", error);
12936		} else {
12937			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
12938				softdep_error("clear_inodedeps: fsync2", error);
12939			BO_LOCK(&vp->v_bufobj);
12940			drain_output(vp);
12941			BO_UNLOCK(&vp->v_bufobj);
12942		}
12943		vput(vp);
12944		vn_finished_write(mp);
12945		ACQUIRE_LOCK(&lk);
12946	}
12947}
12948
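/*
 * Move all of the work items on wkhd onto the dependency list of the
 * buffer bp so that they are completed when the buffer is written.
 */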
12949void
12950softdep_buf_append(bp, wkhd)
12951	struct buf *bp;
12952	struct workhead *wkhd;
12953{
12954	struct worklist *wk;
12955
12956	ACQUIRE_LOCK(&lk);
12957	while ((wk = LIST_FIRST(wkhd)) != NULL) {
12958		WORKLIST_REMOVE(wk);
12959		WORKLIST_INSERT(&bp->b_dep, wk);
12960	}
12961	FREE_LOCK(&lk);
12962
12963}
12964
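/*
 * Append the work items on wkhd to the buffer holding the on-disk copy
 * of ip's inode.  If the inode block cannot be read, the journal work
 * is released instead.
 */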
12965void
12966softdep_inode_append(ip, cred, wkhd)
12967	struct inode *ip;
12968	struct ucred *cred;
12969	struct workhead *wkhd;
12970{
12971	struct buf *bp;
12972	struct fs *fs;
12973	int error;
12974
12975	fs = ip->i_fs;
12976	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
12977	    (int)fs->fs_bsize, cred, &bp);
12978	if (error) {
12979		softdep_freework(wkhd);
12980		return;
12981	}
12982	softdep_buf_append(bp, wkhd);
12983	bqrelse(bp);
12984}
12985
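/*
 * Process and release a list of journal work items that will not be
 * attached to a buffer.
 */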
12986void
12987softdep_freework(wkhd)
12988	struct workhead *wkhd;
12989{
12990
12991	ACQUIRE_LOCK(&lk);
12992	handle_jwork(wkhd);
12993	FREE_LOCK(&lk);
12994}
12995
12996/*
12997 * Function to determine if the buffer has outstanding dependencies
12998 * that will cause a roll-back if the buffer is written. If wantcount
12999 * is set, return the number of dependencies, otherwise just yes or no.
13000 */
13001static int
13002softdep_count_dependencies(bp, wantcount)
13003	struct buf *bp;
13004	int wantcount;
13005{
13006	struct worklist *wk;
13007	struct bmsafemap *bmsafemap;
13008	struct freework *freework;
13009	struct inodedep *inodedep;
13010	struct indirdep *indirdep;
13011	struct freeblks *freeblks;
13012	struct allocindir *aip;
13013	struct pagedep *pagedep;
13014	struct dirrem *dirrem;
13015	struct newblk *newblk;
13016	struct mkdir *mkdir;
13017	struct diradd *dap;
13018	int i, retval;
13019
13020	retval = 0;
13021	ACQUIRE_LOCK(&lk);
13022	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13023		switch (wk->wk_type) {
13024
13025		case D_INODEDEP:
13026			inodedep = WK_INODEDEP(wk);
13027			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13028				/* bitmap allocation dependency */
13029				retval += 1;
13030				if (!wantcount)
13031					goto out;
13032			}
13033			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13034				/* direct block pointer dependency */
13035				retval += 1;
13036				if (!wantcount)
13037					goto out;
13038			}
13039			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13040				/* direct block pointer dependency */
13041				retval += 1;
13042				if (!wantcount)
13043					goto out;
13044			}
13045			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13046				/* Add reference dependency. */
13047				retval += 1;
13048				if (!wantcount)
13049					goto out;
13050			}
13051			continue;
13052
13053		case D_INDIRDEP:
13054			indirdep = WK_INDIRDEP(wk);
13055
13056			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13057				/* indirect truncation dependency */
13058				retval += 1;
13059				if (!wantcount)
13060					goto out;
13061			}
13062
13063			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13064				/* indirect block pointer dependency */
13065				retval += 1;
13066				if (!wantcount)
13067					goto out;
13068			}
13069			continue;
13070
13071		case D_PAGEDEP:
13072			pagedep = WK_PAGEDEP(wk);
13073			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13074				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13075					/* Journal remove ref dependency. */
13076					retval += 1;
13077					if (!wantcount)
13078						goto out;
13079				}
13080			}
13081			for (i = 0; i < DAHASHSZ; i++) {
13082
13083				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13084					/* directory entry dependency */
13085					retval += 1;
13086					if (!wantcount)
13087						goto out;
13088				}
13089			}
13090			continue;
13091
13092		case D_BMSAFEMAP:
13093			bmsafemap = WK_BMSAFEMAP(wk);
13094			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13095				/* Add reference dependency. */
13096				retval += 1;
13097				if (!wantcount)
13098					goto out;
13099			}
13100			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13101				/* Allocate block dependency. */
13102				retval += 1;
13103				if (!wantcount)
13104					goto out;
13105			}
13106			continue;
13107
13108		case D_FREEBLKS:
13109			freeblks = WK_FREEBLKS(wk);
13110			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13111				/* Freeblk journal dependency. */
13112				retval += 1;
13113				if (!wantcount)
13114					goto out;
13115			}
13116			continue;
13117
13118		case D_ALLOCDIRECT:
13119		case D_ALLOCINDIR:
13120			newblk = WK_NEWBLK(wk);
13121			if (newblk->nb_jnewblk) {
13122				/* Journal allocate dependency. */
13123				retval += 1;
13124				if (!wantcount)
13125					goto out;
13126			}
13127			continue;
13128
13129		case D_MKDIR:
13130			mkdir = WK_MKDIR(wk);
13131			if (mkdir->md_jaddref) {
13132				/* Journal reference dependency. */
13133				retval += 1;
13134				if (!wantcount)
13135					goto out;
13136			}
13137			continue;
13138
13139		case D_FREEWORK:
13140		case D_FREEDEP:
13141		case D_JSEGDEP:
13142		case D_JSEG:
13143		case D_SBDEP:
13144			/* never a dependency on these blocks */
13145			continue;
13146
13147		default:
13148			panic("softdep_count_dependencies: Unexpected type %s",
13149			    TYPENAME(wk->wk_type));
13150			/* NOTREACHED */
13151		}
13152	}
13153out:
13154	FREE_LOCK(&lk);
13155	return (retval);
13156}
13157
13158/*
13159 * Acquire exclusive access to a buffer.
13160 * Must be called with a locked mtx parameter.
13161 * Return acquired buffer or NULL on failure.
13162 */
13163static struct buf *
13164getdirtybuf(bp, mtx, waitfor)
13165	struct buf *bp;
13166	struct mtx *mtx;
13167	int waitfor;
13168{
13169	int error;
13170
13171	mtx_assert(mtx, MA_OWNED);
13172	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13173		if (waitfor != MNT_WAIT)
13174			return (NULL);
13175		error = BUF_LOCK(bp,
13176		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
13177		/*
13178		 * Even if we successfully acquire bp here, we have dropped
13179		 * mtx, which may violate our guarantee.
13180		 */
13181		if (error == 0)
13182			BUF_UNLOCK(bp);
13183		else if (error != ENOLCK)
13184			panic("getdirtybuf: inconsistent lock: %d", error);
13185		mtx_lock(mtx);
13186		return (NULL);
13187	}
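	/*
	 * A background write of this buffer is in progress.  If we are
	 * waiting, sleep until it completes; in either case NULL is
	 * returned so the caller will retry.
	 */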
13188	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13189		if (mtx == &lk && waitfor == MNT_WAIT) {
13190			mtx_unlock(mtx);
13191			BO_LOCK(bp->b_bufobj);
13192			BUF_UNLOCK(bp);
13193			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13194				bp->b_vflags |= BV_BKGRDWAIT;
13195				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
13196				       PRIBIO | PDROP, "getbuf", 0);
13197			} else
13198				BO_UNLOCK(bp->b_bufobj);
13199			mtx_lock(mtx);
13200			return (NULL);
13201		}
13202		BUF_UNLOCK(bp);
13203		if (waitfor != MNT_WAIT)
13204			return (NULL);
13205		/*
13206		 * The mtx argument must be bp->b_vp's mutex in
13207		 * this case.
13208		 */
13209#ifdef	DEBUG_VFS_LOCKS
13210		if (bp->b_vp->v_type != VCHR)
13211			ASSERT_BO_LOCKED(bp->b_bufobj);
13212#endif
13213		bp->b_vflags |= BV_BKGRDWAIT;
13214		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
13215		return (NULL);
13216	}
13217	if ((bp->b_flags & B_DELWRI) == 0) {
13218		BUF_UNLOCK(bp);
13219		return (NULL);
13220	}
13221	bremfree(bp);
13222	return (bp);
13223}
13224
13225
13226/*
13227 * Check if it is safe to suspend the file system now.  On entry,
13228 * the vnode interlock for devvp should be held.  Return 0 with
13229 * the mount interlock held if the file system can be suspended now,
13230 * otherwise return EAGAIN with the mount interlock held.
13231 */
13232int
13233softdep_check_suspend(struct mount *mp,
13234		      struct vnode *devvp,
13235		      int softdep_deps,
13236		      int softdep_accdeps,
13237		      int secondary_writes,
13238		      int secondary_accwrites)
13239{
13240	struct bufobj *bo;
13241	struct ufsmount *ump;
13242	int error;
13243
13244	ump = VFSTOUFS(mp);
13245	bo = &devvp->v_bufobj;
13246	ASSERT_BO_LOCKED(bo);
13247
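	/*
	 * Acquire the softdep lock without blocking while the bufobj
	 * lock is held; if it cannot be taken immediately, drop the
	 * bufobj lock and retry.  Also wait for any secondary writes
	 * in progress to drain.
	 */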
13248	for (;;) {
13249		if (!TRY_ACQUIRE_LOCK(&lk)) {
13250			BO_UNLOCK(bo);
13251			ACQUIRE_LOCK(&lk);
13252			FREE_LOCK(&lk);
13253			BO_LOCK(bo);
13254			continue;
13255		}
13256		MNT_ILOCK(mp);
13257		if (mp->mnt_secondary_writes != 0) {
13258			FREE_LOCK(&lk);
13259			BO_UNLOCK(bo);
13260			msleep(&mp->mnt_secondary_writes,
13261			       MNT_MTX(mp),
13262			       (PUSER - 1) | PDROP, "secwr", 0);
13263			BO_LOCK(bo);
13264			continue;
13265		}
13266		break;
13267	}
13268
13269	/*
13270	 * Reasons for needing more work before suspend:
13271	 * - Dirty buffers on devvp.
13272	 * - Softdep activity occurred after start of vnode sync loop
13273	 * - Secondary writes occurred after start of vnode sync loop
13274	 */
13275	error = 0;
13276	if (bo->bo_numoutput > 0 ||
13277	    bo->bo_dirty.bv_cnt > 0 ||
13278	    softdep_deps != 0 ||
13279	    ump->softdep_deps != 0 ||
13280	    softdep_accdeps != ump->softdep_accdeps ||
13281	    secondary_writes != 0 ||
13282	    mp->mnt_secondary_writes != 0 ||
13283	    secondary_accwrites != mp->mnt_secondary_accwrites)
13284		error = EAGAIN;
13285	FREE_LOCK(&lk);
13286	BO_UNLOCK(bo);
13287	return (error);
13288}
13289
13290
13291/*
13292 * Get the number of dependency structures for the file system, both
13293 * the current number and the total number allocated.  These will
13294 * later be used to detect that softdep processing has occurred.
13295 */
13296void
13297softdep_get_depcounts(struct mount *mp,
13298		      int *softdep_depsp,
13299		      int *softdep_accdepsp)
13300{
13301	struct ufsmount *ump;
13302
13303	ump = VFSTOUFS(mp);
13304	ACQUIRE_LOCK(&lk);
13305	*softdep_depsp = ump->softdep_deps;
13306	*softdep_accdepsp = ump->softdep_accdeps;
13307	FREE_LOCK(&lk);
13308}
13309
13310/*
13311 * Wait for pending output on a vnode to complete.
13312 * Must be called with vnode lock and interlock locked.
13313 *
13314 * XXX: Should just be a call to bufobj_wwait().
13315 */
13316static void
13317drain_output(vp)
13318	struct vnode *vp;
13319{
13320	struct bufobj *bo;
13321
13322	bo = &vp->v_bufobj;
13323	ASSERT_VOP_LOCKED(vp, "drain_output");
13324	ASSERT_BO_LOCKED(bo);
13325
13326	while (bo->bo_numoutput) {
13327		bo->bo_flag |= BO_WWAIT;
13328		msleep((caddr_t)&bo->bo_numoutput,
13329		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
13330	}
13331}
13332
13333/*
13334 * Called whenever a buffer that is being invalidated or reallocated
13335 * contains dependencies. This should only happen if an I/O error has
13336 * occurred. The routine is called with the buffer locked.
13337 */
13338static void
13339softdep_deallocate_dependencies(bp)
13340	struct buf *bp;
13341{
13342
13343	if ((bp->b_ioflags & BIO_ERROR) == 0)
13344		panic("softdep_deallocate_dependencies: dangling deps");
13345	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
13346		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
13347	else
13348		printf("softdep_deallocate_dependencies: "
13349		    "got error %d while accessing filesystem\n", bp->b_error);
13350	if (bp->b_error != ENXIO)
13351		panic("softdep_deallocate_dependencies: unrecovered I/O error");
13352}
13353
13354/*
13355 * Function to handle asynchronous write errors in the filesystem.
13356 */
13357static void
13358softdep_error(func, error)
13359	char *func;
13360	int error;
13361{
13362
13363	/* XXX should do something better! */
13364	printf("%s: got error %d while accessing filesystem\n", func, error);
13365}
13366
13367#ifdef DDB
13368
13369static void
13370inodedep_print(struct inodedep *inodedep, int verbose)
13371{
13372	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
13373	    " saveino %p\n",
13374	    inodedep, inodedep->id_fs, inodedep->id_state,
13375	    (intmax_t)inodedep->id_ino,
13376	    (intmax_t)fsbtodb(inodedep->id_fs,
13377	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
13378	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
13379	    inodedep->id_savedino1);
13380
13381	if (verbose == 0)
13382		return;
13383
13384	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
13385	    "mkdiradd %p\n",
13386	    LIST_FIRST(&inodedep->id_pendinghd),
13387	    LIST_FIRST(&inodedep->id_bufwait),
13388	    LIST_FIRST(&inodedep->id_inowait),
13389	    TAILQ_FIRST(&inodedep->id_inoreflst),
13390	    inodedep->id_mkdiradd);
13391	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
13392	    TAILQ_FIRST(&inodedep->id_inoupdt),
13393	    TAILQ_FIRST(&inodedep->id_newinoupdt),
13394	    TAILQ_FIRST(&inodedep->id_extupdt),
13395	    TAILQ_FIRST(&inodedep->id_newextupdt));
13396}
13397
13398DB_SHOW_COMMAND(inodedep, db_show_inodedep)
13399{
13400
13401	if (have_addr == 0) {
13402		db_printf("Address required\n");
13403		return;
13404	}
13405	inodedep_print((struct inodedep*)addr, 1);
13406}
13407
13408DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
13409{
13410	struct inodedep_hashhead *inodedephd;
13411	struct inodedep *inodedep;
13412	struct fs *fs;
13413	int cnt;
13414
13415	fs = have_addr ? (struct fs *)addr : NULL;
13416	for (cnt = 0; cnt < inodedep_hash; cnt++) {
13417		inodedephd = &inodedep_hashtbl[cnt];
13418		LIST_FOREACH(inodedep, inodedephd, id_hash) {
13419			if (fs != NULL && fs != inodedep->id_fs)
13420				continue;
13421			inodedep_print(inodedep, 0);
13422		}
13423	}
13424}
13425
13426DB_SHOW_COMMAND(worklist, db_show_worklist)
13427{
13428	struct worklist *wk;
13429
13430	if (have_addr == 0) {
13431		db_printf("Address required\n");
13432		return;
13433	}
13434	wk = (struct worklist *)addr;
13435	db_printf("worklist: %p type %s state 0x%X\n",
13436	    wk, TYPENAME(wk->wk_type), wk->wk_state);
13437}
13438
13439DB_SHOW_COMMAND(workhead, db_show_workhead)
13440{
13441	struct workhead *wkhd;
13442	struct worklist *wk;
13443	int i;
13444
13445	if (have_addr == 0) {
13446		db_printf("Address required\n");
13447		return;
13448	}
13449	wkhd = (struct workhead *)addr;
13450	wk = LIST_FIRST(wkhd);
13451	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
13452		db_printf("worklist: %p type %s state 0x%X",
13453		    wk, TYPENAME(wk->wk_type), wk->wk_state);
13454	if (i == 100)
13455		db_printf("workhead overflow");
13456	db_printf("\n");
13457}
13458
13459
13460DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
13461{
13462	struct jaddref *jaddref;
13463	struct diradd *diradd;
13464	struct mkdir *mkdir;
13465
13466	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
13467		diradd = mkdir->md_diradd;
13468		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
13469		    mkdir, mkdir->md_state, diradd, diradd->da_state);
13470		if ((jaddref = mkdir->md_jaddref) != NULL)
13471			db_printf(" jaddref %p jaddref state 0x%X",
13472			    jaddref, jaddref->ja_state);
13473		db_printf("\n");
13474	}
13475}
13476
13477#endif /* DDB */
13478
13479#endif /* SOFTUPDATES */
13480