1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 207741 2010-05-07 08:20:56Z jeff $");
44
45#include "opt_ffs.h"
46#include "opt_ddb.h"
47
48/*
49 * For now we want the safety net that the DEBUG flag provides.
50 */
51#ifndef DEBUG
52#define DEBUG
53#endif
54#define	SUJ_DEBUG
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/lock.h>
64#include <sys/malloc.h>
65#include <sys/mount.h>
66#include <sys/mutex.h>
67#include <sys/namei.h>
68#include <sys/proc.h>
69#include <sys/stat.h>
70#include <sys/sysctl.h>
71#include <sys/syslog.h>
72#include <sys/vnode.h>
73#include <sys/conf.h>
74#include <ufs/ufs/dir.h>
75#include <ufs/ufs/extattr.h>
76#include <ufs/ufs/quota.h>
77#include <ufs/ufs/inode.h>
78#include <ufs/ufs/ufsmount.h>
79#include <ufs/ffs/fs.h>
80#include <ufs/ffs/softdep.h>
81#include <ufs/ffs/ffs_extern.h>
82#include <ufs/ufs/ufs_extern.h>
83
84#include <vm/vm.h>
85
86#include <ddb/ddb.h>
87
88#ifndef SOFTUPDATES
89
90int
91softdep_flushfiles(oldmnt, flags, td)
92	struct mount *oldmnt;
93	int flags;
94	struct thread *td;
95{
96
97	panic("softdep_flushfiles called");
98}
99
100int
101softdep_mount(devvp, mp, fs, cred)
102	struct vnode *devvp;
103	struct mount *mp;
104	struct fs *fs;
105	struct ucred *cred;
106{
107
108	return (0);
109}
110
111void
112softdep_initialize()
113{
114
115	return;
116}
117
118void
119softdep_uninitialize()
120{
121
122	return;
123}
124
125void
126softdep_unmount(mp)
127	struct mount *mp;
128{
129
130}
131
132void
133softdep_setup_sbupdate(ump, fs, bp)
134	struct ufsmount *ump;
135	struct fs *fs;
136	struct buf *bp;
137{
138}
139
140void
141softdep_setup_inomapdep(bp, ip, newinum)
142	struct buf *bp;
143	struct inode *ip;
144	ino_t newinum;
145{
146
147	panic("softdep_setup_inomapdep called");
148}
149
150void
151softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
152	struct buf *bp;
153	struct mount *mp;
154	ufs2_daddr_t newblkno;
155	int frags;
156	int oldfrags;
157{
158
159	panic("softdep_setup_blkmapdep called");
160}
161
162void
163softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
164	struct inode *ip;
165	ufs_lbn_t lbn;
166	ufs2_daddr_t newblkno;
167	ufs2_daddr_t oldblkno;
168	long newsize;
169	long oldsize;
170	struct buf *bp;
171{
172
173	panic("softdep_setup_allocdirect called");
174}
175
176void
177softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
178	struct inode *ip;
179	ufs_lbn_t lbn;
180	ufs2_daddr_t newblkno;
181	ufs2_daddr_t oldblkno;
182	long newsize;
183	long oldsize;
184	struct buf *bp;
185{
186
187	panic("softdep_setup_allocext called");
188}
189
190void
191softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
192	struct inode *ip;
193	ufs_lbn_t lbn;
194	struct buf *bp;
195	int ptrno;
196	ufs2_daddr_t newblkno;
197	ufs2_daddr_t oldblkno;
198	struct buf *nbp;
199{
200
201	panic("softdep_setup_allocindir_page called");
202}
203
204void
205softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
206	struct buf *nbp;
207	struct inode *ip;
208	struct buf *bp;
209	int ptrno;
210	ufs2_daddr_t newblkno;
211{
212
213	panic("softdep_setup_allocindir_meta called");
214}
215
216void
217softdep_setup_freeblocks(ip, length, flags)
218	struct inode *ip;
219	off_t length;
220	int flags;
221{
222
223	panic("softdep_setup_freeblocks called");
224}
225
226void
227softdep_freefile(pvp, ino, mode)
228		struct vnode *pvp;
229		ino_t ino;
230		int mode;
231{
232
233	panic("softdep_freefile called");
234}
235
236int
237softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
238	struct buf *bp;
239	struct inode *dp;
240	off_t diroffset;
241	ino_t newinum;
242	struct buf *newdirbp;
243	int isnewblk;
244{
245
246	panic("softdep_setup_directory_add called");
247}
248
249void
250softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
251	struct buf *bp;
252	struct inode *dp;
253	caddr_t base;
254	caddr_t oldloc;
255	caddr_t newloc;
256	int entrysize;
257{
258
259	panic("softdep_change_directoryentry_offset called");
260}
261
262void
263softdep_setup_remove(bp, dp, ip, isrmdir)
264	struct buf *bp;
265	struct inode *dp;
266	struct inode *ip;
267	int isrmdir;
268{
269
270	panic("softdep_setup_remove called");
271}
272
273void
274softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
275	struct buf *bp;
276	struct inode *dp;
277	struct inode *ip;
278	ino_t newinum;
279	int isrmdir;
280{
281
282	panic("softdep_setup_directory_change called");
283}
284
285void *
286softdep_setup_trunc(vp, length, flags)
287	struct vnode *vp;
288	off_t length;
289	int flags;
290{
291
292	panic("%s called", __FUNCTION__);
293
294	return (NULL);
295}
296
297int
298softdep_complete_trunc(vp, cookie)
299	struct vnode *vp;
300	void *cookie;
301{
302
303	panic("%s called", __FUNCTION__);
304
305	return (0);
306}
307
308void
309softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
310	struct mount *mp;
311	struct buf *bp;
312	ufs2_daddr_t blkno;
313	int frags;
314	struct workhead *wkhd;
315{
316
317	panic("%s called", __FUNCTION__);
318}
319
320void
321softdep_setup_inofree(mp, bp, ino, wkhd)
322	struct mount *mp;
323	struct buf *bp;
324	ino_t ino;
325	struct workhead *wkhd;
326{
327
328	panic("%s called", __FUNCTION__);
329}
330
331void
332softdep_setup_unlink(dp, ip)
333	struct inode *dp;
334	struct inode *ip;
335{
336
337	panic("%s called", __FUNCTION__);
338}
339
340void
341softdep_setup_link(dp, ip)
342	struct inode *dp;
343	struct inode *ip;
344{
345
346	panic("%s called", __FUNCTION__);
347}
348
349void
350softdep_revert_link(dp, ip)
351	struct inode *dp;
352	struct inode *ip;
353{
354
355	panic("%s called", __FUNCTION__);
356}
357
358void
359softdep_setup_rmdir(dp, ip)
360	struct inode *dp;
361	struct inode *ip;
362{
363
364	panic("%s called", __FUNCTION__);
365}
366
367void
368softdep_revert_rmdir(dp, ip)
369	struct inode *dp;
370	struct inode *ip;
371{
372
373	panic("%s called", __FUNCTION__);
374}
375
376void
377softdep_setup_create(dp, ip)
378	struct inode *dp;
379	struct inode *ip;
380{
381
382	panic("%s called", __FUNCTION__);
383}
384
385void
386softdep_revert_create(dp, ip)
387	struct inode *dp;
388	struct inode *ip;
389{
390
391	panic("%s called", __FUNCTION__);
392}
393
394void
395softdep_setup_mkdir(dp, ip)
396	struct inode *dp;
397	struct inode *ip;
398{
399
400	panic("%s called", __FUNCTION__);
401}
402
403void
404softdep_revert_mkdir(dp, ip)
405	struct inode *dp;
406	struct inode *ip;
407{
408
409	panic("%s called", __FUNCTION__);
410}
411
412void
413softdep_setup_dotdot_link(dp, ip)
414	struct inode *dp;
415	struct inode *ip;
416{
417
418	panic("%s called", __FUNCTION__);
419}
420
421int
422softdep_prealloc(vp, waitok)
423	struct vnode *vp;
424	int waitok;
425{
426
427	panic("%s called", __FUNCTION__);
428
429	return (0);
430}
431
432int
433softdep_journal_lookup(mp, vpp)
434	struct mount *mp;
435	struct vnode **vpp;
436{
437
438	return (ENOENT);
439}
440
441void
442softdep_change_linkcnt(ip)
443	struct inode *ip;
444{
445
446	panic("softdep_change_linkcnt called");
447}
448
449void
450softdep_load_inodeblock(ip)
451	struct inode *ip;
452{
453
454	panic("softdep_load_inodeblock called");
455}
456
457void
458softdep_update_inodeblock(ip, bp, waitfor)
459	struct inode *ip;
460	struct buf *bp;
461	int waitfor;
462{
463
464	panic("softdep_update_inodeblock called");
465}
466
467int
468softdep_fsync(vp)
469	struct vnode *vp;	/* the "in_core" copy of the inode */
470{
471
472	return (0);
473}
474
475void
476softdep_fsync_mountdev(vp)
477	struct vnode *vp;
478{
479
480	return;
481}
482
483int
484softdep_flushworklist(oldmnt, countp, td)
485	struct mount *oldmnt;
486	int *countp;
487	struct thread *td;
488{
489
490	*countp = 0;
491	return (0);
492}
493
494int
495softdep_sync_metadata(struct vnode *vp)
496{
497
498	return (0);
499}
500
501int
502softdep_slowdown(vp)
503	struct vnode *vp;
504{
505
506	panic("softdep_slowdown called");
507}
508
509void
510softdep_releasefile(ip)
511	struct inode *ip;	/* inode with the zero effective link count */
512{
513
514	panic("softdep_releasefile called");
515}
516
517int
518softdep_request_cleanup(fs, vp)
519	struct fs *fs;
520	struct vnode *vp;
521{
522
523	return (0);
524}
525
526int
527softdep_check_suspend(struct mount *mp,
528		      struct vnode *devvp,
529		      int softdep_deps,
530		      int softdep_accdeps,
531		      int secondary_writes,
532		      int secondary_accwrites)
533{
534	struct bufobj *bo;
535	int error;
536
537	(void) softdep_deps,
538	(void) softdep_accdeps;
539
540	bo = &devvp->v_bufobj;
541	ASSERT_BO_LOCKED(bo);
542
543	MNT_ILOCK(mp);
544	while (mp->mnt_secondary_writes != 0) {
545		BO_UNLOCK(bo);
546		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
547		    (PUSER - 1) | PDROP, "secwr", 0);
548		BO_LOCK(bo);
549		MNT_ILOCK(mp);
550	}
551
552	/*
553	 * Reasons for needing more work before suspend:
554	 * - Dirty buffers on devvp.
555	 * - Secondary writes occurred after start of vnode sync loop
556	 */
557	error = 0;
558	if (bo->bo_numoutput > 0 ||
559	    bo->bo_dirty.bv_cnt > 0 ||
560	    secondary_writes != 0 ||
561	    mp->mnt_secondary_writes != 0 ||
562	    secondary_accwrites != mp->mnt_secondary_accwrites)
563		error = EAGAIN;
564	BO_UNLOCK(bo);
565	return (error);
566}
567
568void
569softdep_get_depcounts(struct mount *mp,
570		      int *softdepactivep,
571		      int *softdepactiveaccp)
572{
573	(void) mp;
574	*softdepactivep = 0;
575	*softdepactiveaccp = 0;
576}
577
578#else
579/*
580 * These definitions need to be adapted to the system to which
581 * this file is being ported.
582 */
583
584#define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)
585
586#define	D_PAGEDEP	0
587#define	D_INODEDEP	1
588#define	D_BMSAFEMAP	2
589#define	D_NEWBLK	3
590#define	D_ALLOCDIRECT	4
591#define	D_INDIRDEP	5
592#define	D_ALLOCINDIR	6
593#define	D_FREEFRAG	7
594#define	D_FREEBLKS	8
595#define	D_FREEFILE	9
596#define	D_DIRADD	10
597#define	D_MKDIR		11
598#define	D_DIRREM	12
599#define	D_NEWDIRBLK	13
600#define	D_FREEWORK	14
601#define	D_FREEDEP	15
602#define	D_JADDREF	16
603#define	D_JREMREF	17
604#define	D_JMVREF	18
605#define	D_JNEWBLK	19
606#define	D_JFREEBLK	20
607#define	D_JFREEFRAG	21
608#define	D_JSEG		22
609#define	D_JSEGDEP	23
610#define	D_SBDEP		24
611#define	D_JTRUNC	25
612#define	D_LAST		D_JTRUNC
613
614unsigned long dep_current[D_LAST + 1];
615unsigned long dep_total[D_LAST + 1];
616
617
618SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
619SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
620    "total dependencies allocated");
621SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
622    "current dependencies allocated");
623
624#define	SOFTDEP_TYPE(type, str, long)					\
625    static MALLOC_DEFINE(M_ ## type, #str, long);			\
626    SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
627	&dep_total[D_ ## type], 0, "");					\
628    SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
629	&dep_current[D_ ## type], 0, "");
630
631SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
632SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
633SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
634    "Block or frag allocated from cyl group map");
635SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
636SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
637SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
638SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
639SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
640SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
641SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
642SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
643SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
644SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
645SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
646SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
647SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
648SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
649SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
650SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
651SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
652SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
653SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
654SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
655SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
656SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
657SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
658
659static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
660static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
661
662/*
663 * translate from workitem type to memory type
664 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
665 */
666static struct malloc_type *memtype[] = {
667	M_PAGEDEP,
668	M_INODEDEP,
669	M_BMSAFEMAP,
670	M_NEWBLK,
671	M_ALLOCDIRECT,
672	M_INDIRDEP,
673	M_ALLOCINDIR,
674	M_FREEFRAG,
675	M_FREEBLKS,
676	M_FREEFILE,
677	M_DIRADD,
678	M_MKDIR,
679	M_DIRREM,
680	M_NEWDIRBLK,
681	M_FREEWORK,
682	M_FREEDEP,
683	M_JADDREF,
684	M_JREMREF,
685	M_JMVREF,
686	M_JNEWBLK,
687	M_JFREEBLK,
688	M_JFREEFRAG,
689	M_JSEG,
690	M_JSEGDEP,
691	M_SBDEP,
692	M_JTRUNC
693};
694
695#define DtoM(type) (memtype[type])
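/*
 * Editorial note (not in the original source): DtoM() depends on memtype[]
 * being indexed by the D_* constants above in exactly the same order, so
 * for example DtoM(D_PAGEDEP) yields M_PAGEDEP and DtoM(D_JTRUNC) yields
 * M_JTRUNC.  A sketch of a compile-time check, assuming the standard
 * CTASSERT() macro from <sys/systm.h>:
 *
 *	CTASSERT(sizeof(memtype) / sizeof(memtype[0]) == D_LAST + 1);
 */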
696
697/*
698 * Names of malloc types.
699 */
700#define TYPENAME(type)  \
701	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
702/*
703 * End system adaptation definitions.
704 */
705
706#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
707#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
708
709/*
710 * Forward declarations.
711 */
712struct inodedep_hashhead;
713struct newblk_hashhead;
714struct pagedep_hashhead;
715struct bmsafemap_hashhead;
716
717/*
718 * Internal function prototypes.
719 */
720static	void softdep_error(char *, int);
721static	void drain_output(struct vnode *);
722static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
723static	void clear_remove(struct thread *);
724static	void clear_inodedeps(struct thread *);
725static	void unlinked_inodedep(struct mount *, struct inodedep *);
726static	void clear_unlinked_inodedep(struct inodedep *);
727static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
728static	int flush_pagedep_deps(struct vnode *, struct mount *,
729	    struct diraddhd *);
730static	void free_pagedep(struct pagedep *);
731static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
732static	int flush_inodedep_deps(struct mount *, ino_t);
733static	int flush_deplist(struct allocdirectlst *, int, int *);
734static	int handle_written_filepage(struct pagedep *, struct buf *);
735static	int handle_written_sbdep(struct sbdep *, struct buf *);
736static	void initiate_write_sbdep(struct sbdep *);
737static  void diradd_inode_written(struct diradd *, struct inodedep *);
738static	int handle_written_indirdep(struct indirdep *, struct buf *,
739	    struct buf**);
740static	int handle_written_inodeblock(struct inodedep *, struct buf *);
741static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
742static	void handle_written_jaddref(struct jaddref *);
743static	void handle_written_jremref(struct jremref *);
744static	void handle_written_jseg(struct jseg *, struct buf *);
745static	void handle_written_jnewblk(struct jnewblk *);
746static	void handle_written_jfreeblk(struct jfreeblk *);
747static	void handle_written_jfreefrag(struct jfreefrag *);
748static	void complete_jseg(struct jseg *);
749static	void jseg_write(struct fs *, struct jblocks *, struct jseg *,
750	    uint8_t *);
751static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
752static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
753static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
754static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
755static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
756static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
757static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
758static	inline void inoref_write(struct inoref *, struct jseg *,
759	    struct jrefrec *);
760static	void handle_allocdirect_partdone(struct allocdirect *,
761	    struct workhead *);
762static	void cancel_newblk(struct newblk *, struct workhead *);
763static	void indirdep_complete(struct indirdep *);
764static	void handle_allocindir_partdone(struct allocindir *);
765static	void initiate_write_filepage(struct pagedep *, struct buf *);
766static	void initiate_write_indirdep(struct indirdep*, struct buf *);
767static	void handle_written_mkdir(struct mkdir *, int);
768static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
769static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
770static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
771static	void handle_workitem_freefile(struct freefile *);
772static	void handle_workitem_remove(struct dirrem *, struct vnode *);
773static	struct dirrem *newdirrem(struct buf *, struct inode *,
774	    struct inode *, int, struct dirrem **);
775static	void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
776	    struct freeblks *);
777static	void free_indirdep(struct indirdep *);
778static	void free_diradd(struct diradd *, struct workhead *);
779static	void merge_diradd(struct inodedep *, struct diradd *);
780static	void complete_diradd(struct diradd *);
781static	struct diradd *diradd_lookup(struct pagedep *, int);
782static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
783	    struct jremref *);
784static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
785	    struct jremref *);
786static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
787	    struct jremref *, struct jremref *);
788static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
789	    struct jremref *);
790static	void cancel_allocindir(struct allocindir *, struct inodedep *,
791	    struct freeblks *);
792static	void complete_mkdir(struct mkdir *);
793static	void free_newdirblk(struct newdirblk *);
794static	void free_jremref(struct jremref *);
795static	void free_jaddref(struct jaddref *);
796static	void free_jsegdep(struct jsegdep *);
797static	void free_jseg(struct jseg *);
798static	void free_jnewblk(struct jnewblk *);
799static	void free_jfreeblk(struct jfreeblk *);
800static	void free_jfreefrag(struct jfreefrag *);
801static	void free_freedep(struct freedep *);
802static	void journal_jremref(struct dirrem *, struct jremref *,
803	    struct inodedep *);
804static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
805static	int cancel_jaddref(struct jaddref *, struct inodedep *,
806	    struct workhead *);
807static	void cancel_jfreefrag(struct jfreefrag *);
808static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
809static	int deallocate_dependencies(struct buf *, struct inodedep *,
810	    struct freeblks *);
811static	void free_newblk(struct newblk *);
812static	void cancel_allocdirect(struct allocdirectlst *,
813	    struct allocdirect *, struct freeblks *, int);
814static	int check_inode_unwritten(struct inodedep *);
815static	int free_inodedep(struct inodedep *);
816static	void freework_freeblock(struct freework *);
817static	void handle_workitem_freeblocks(struct freeblks *, int);
818static	void handle_complete_freeblocks(struct freeblks *);
819static	void handle_workitem_indirblk(struct freework *);
820static	void handle_written_freework(struct freework *);
821static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
822static	void setup_allocindir_phase2(struct buf *, struct inode *,
823	    struct inodedep *, struct allocindir *, ufs_lbn_t);
824static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
825	    ufs2_daddr_t, ufs_lbn_t);
826static	void handle_workitem_freefrag(struct freefrag *);
827static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
828	    ufs_lbn_t);
829static	void allocdirect_merge(struct allocdirectlst *,
830	    struct allocdirect *, struct allocdirect *);
831static	struct freefrag *allocindir_merge(struct allocindir *,
832	    struct allocindir *);
833static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
834	    struct bmsafemap **);
835static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
836	    int cg);
837static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
838	    int, struct newblk **);
839static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
840static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
841	    struct inodedep **);
842static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
843static	int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
844	    struct pagedep **);
845static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
846	    struct mount *mp, int, struct pagedep **);
847static	void pause_timer(void *);
848static	int request_cleanup(struct mount *, int);
849static	int process_worklist_item(struct mount *, int);
850static	void process_removes(struct vnode *);
851static	void jwork_move(struct workhead *, struct workhead *);
852static	void add_to_worklist(struct worklist *, int);
853static	void remove_from_worklist(struct worklist *);
854static	void softdep_flush(void);
855static	int softdep_speedup(void);
856static	void worklist_speedup(void);
857static	int journal_mount(struct mount *, struct fs *, struct ucred *);
858static	void journal_unmount(struct mount *);
859static	int journal_space(struct ufsmount *, int);
860static	void journal_suspend(struct ufsmount *);
861static	void softdep_prelink(struct vnode *, struct vnode *);
862static	void add_to_journal(struct worklist *);
863static	void remove_from_journal(struct worklist *);
864static	void softdep_process_journal(struct mount *, int);
865static	struct jremref *newjremref(struct dirrem *, struct inode *,
866	    struct inode *ip, off_t, nlink_t);
867static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
868	    uint16_t);
869static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
870	    uint16_t);
871static inline struct jsegdep *inoref_jseg(struct inoref *);
872static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
873static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
874	    ufs2_daddr_t, int);
875static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
876	    ufs2_daddr_t, long, ufs_lbn_t);
877static	struct freework *newfreework(struct freeblks *, struct freework *,
878	    ufs_lbn_t, ufs2_daddr_t, int, int);
879static	void jwait(struct worklist *wk);
880static	struct inodedep *inodedep_lookup_ip(struct inode *);
881static	int bmsafemap_rollbacks(struct bmsafemap *);
882static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
883static	void handle_jwork(struct workhead *);
884static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
885	    struct mkdir **);
886static	struct jblocks *jblocks_create(void);
887static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
888static	void jblocks_free(struct jblocks *, struct mount *, int);
889static	void jblocks_destroy(struct jblocks *);
890static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
891
892/*
893 * Exported softdep operations.
894 */
895static	void softdep_disk_io_initiation(struct buf *);
896static	void softdep_disk_write_complete(struct buf *);
897static	void softdep_deallocate_dependencies(struct buf *);
898static	int softdep_count_dependencies(struct buf *bp, int);
899
900static struct mtx lk;
901MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
902
903#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
904#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
905#define FREE_LOCK(lk)			mtx_unlock(lk)
906
907#define	BUF_AREC(bp)	((bp)->b_lock.lock_object.lo_flags |= LO_RECURSABLE)
908#define	BUF_NOREC(bp)	((bp)->b_lock.lock_object.lo_flags &= ~LO_RECURSABLE)
909
910/*
911 * Worklist queue management.
912 * These routines require that the lock be held.
913 */
914#ifndef /* NOT */ DEBUG
915#define WORKLIST_INSERT(head, item) do {	\
916	(item)->wk_state |= ONWORKLIST;		\
917	LIST_INSERT_HEAD(head, item, wk_list);	\
918} while (0)
919#define WORKLIST_REMOVE(item) do {		\
920	(item)->wk_state &= ~ONWORKLIST;	\
921	LIST_REMOVE(item, wk_list);		\
922} while (0)
923#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
924#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
925
926#else /* DEBUG */
927static	void worklist_insert(struct workhead *, struct worklist *, int);
928static	void worklist_remove(struct worklist *, int);
929
930#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
931#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
932#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
933#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
934
935static void
936worklist_insert(head, item, locked)
937	struct workhead *head;
938	struct worklist *item;
939	int locked;
940{
941
942	if (locked)
943		mtx_assert(&lk, MA_OWNED);
944	if (item->wk_state & ONWORKLIST)
945		panic("worklist_insert: %p %s(0x%X) already on list",
946		    item, TYPENAME(item->wk_type), item->wk_state);
947	item->wk_state |= ONWORKLIST;
948	LIST_INSERT_HEAD(head, item, wk_list);
949}
950
951static void
952worklist_remove(item, locked)
953	struct worklist *item;
954	int locked;
955{
956
957	if (locked)
958		mtx_assert(&lk, MA_OWNED);
959	if ((item->wk_state & ONWORKLIST) == 0)
960		panic("worklist_remove: %p %s(0x%X) not on list",
961		    item, TYPENAME(item->wk_type), item->wk_state);
962	item->wk_state &= ~ONWORKLIST;
963	LIST_REMOVE(item, wk_list);
964}
965#endif /* DEBUG */
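/*
 * Usage sketch (editorial, not part of the original code): dependencies
 * are typically attached to a buffer's dependency list while holding the
 * softdep lock, for example:
 *
 *	ACQUIRE_LOCK(&lk);
 *	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 *	FREE_LOCK(&lk);
 *
 * The DEBUG variants additionally panic if the ONWORKLIST state bit does
 * not match the requested operation.
 */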
966
967/*
968 * Merge two jsegdeps keeping only the oldest one as newer references
969 * can't be discarded until after older references.
970 */
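/*
 * For example (editorial illustration): if "one" refers to the journal
 * segment with js_seq 9 and "two" to the segment with js_seq 4, the two
 * pointers are swapped, the jsegdep for sequence 9 is removed from its
 * worklist and freed, and the jsegdep for sequence 4 is returned.
 */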
971static inline struct jsegdep *
972jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
973{
974	struct jsegdep *swp;
975
976	if (two == NULL)
977		return (one);
978
979	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
980		swp = one;
981		one = two;
982		two = swp;
983	}
984	WORKLIST_REMOVE(&two->jd_list);
985	free_jsegdep(two);
986
987	return (one);
988}
989
990/*
991 * If two freedeps are compatible, free one to reduce list size.
992 */
993static inline struct freedep *
994freedep_merge(struct freedep *one, struct freedep *two)
995{
996	if (two == NULL)
997		return (one);
998
999	if (one->fd_freework == two->fd_freework) {
1000		WORKLIST_REMOVE(&two->fd_list);
1001		free_freedep(two);
1002	}
1003	return (one);
1004}
1005
1006/*
1007 * Move journal work from one list to another.  Duplicate freedeps and
1008 * jsegdeps are coalesced to keep the lists as small as possible.
1009 */
1010static void
1011jwork_move(dst, src)
1012	struct workhead *dst;
1013	struct workhead *src;
1014{
1015	struct freedep *freedep;
1016	struct jsegdep *jsegdep;
1017	struct worklist *wkn;
1018	struct worklist *wk;
1019
1020	KASSERT(dst != src,
1021	    ("jwork_move: dst == src"));
1022	freedep = NULL;
1023	jsegdep = NULL;
1024	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1025		if (wk->wk_type == D_JSEGDEP)
1026			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1027		if (wk->wk_type == D_FREEDEP)
1028			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1029	}
1030
1031	mtx_assert(&lk, MA_OWNED);
1032	while ((wk = LIST_FIRST(src)) != NULL) {
1033		WORKLIST_REMOVE(wk);
1034		WORKLIST_INSERT(dst, wk);
1035		if (wk->wk_type == D_JSEGDEP) {
1036			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1037			continue;
1038		}
1039		if (wk->wk_type == D_FREEDEP)
1040			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1041	}
1042}
1043
1044/*
1045 * Routines for tracking and managing workitems.
1046 */
1047static	void workitem_free(struct worklist *, int);
1048static	void workitem_alloc(struct worklist *, int, struct mount *);
1049
1050#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
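/*
 * Usage sketch (editorial): each dependency structure embeds a struct
 * worklist as its first member and is paired with workitem_alloc() and
 * WORKITEM_FREE(), as pagedep_lookup() does further below:
 *
 *	pagedep = malloc(sizeof(struct pagedep), M_PAGEDEP,
 *	    M_SOFTDEP_FLAGS | M_ZERO);
 *	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
 *	...
 *	WORKITEM_FREE(pagedep, D_PAGEDEP);
 */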
1051
1052static void
1053workitem_free(item, type)
1054	struct worklist *item;
1055	int type;
1056{
1057	struct ufsmount *ump;
1058	mtx_assert(&lk, MA_OWNED);
1059
1060#ifdef DEBUG
1061	if (item->wk_state & ONWORKLIST)
1062		panic("workitem_free: %s(0x%X) still on list",
1063		    TYPENAME(item->wk_type), item->wk_state);
1064	if (item->wk_type != type)
1065		panic("workitem_free: type mismatch %s != %s",
1066		    TYPENAME(item->wk_type), TYPENAME(type));
1067#endif
1068	ump = VFSTOUFS(item->wk_mp);
1069	if (--ump->softdep_deps == 0 && ump->softdep_req)
1070		wakeup(&ump->softdep_deps);
1071	dep_current[type]--;
1072	free(item, DtoM(type));
1073}
1074
1075static void
1076workitem_alloc(item, type, mp)
1077	struct worklist *item;
1078	int type;
1079	struct mount *mp;
1080{
1081	item->wk_type = type;
1082	item->wk_mp = mp;
1083	item->wk_state = 0;
1084	ACQUIRE_LOCK(&lk);
1085	dep_current[type]++;
1086	dep_total[type]++;
1087	VFSTOUFS(mp)->softdep_deps++;
1088	VFSTOUFS(mp)->softdep_accdeps++;
1089	FREE_LOCK(&lk);
1090}
1091
1092/*
1093 * Workitem queue management
1094 */
1095static int max_softdeps;	/* maximum number of structs before slowdown */
1096static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1097static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1098static int proc_waiting;	/* tracks whether we have a timeout posted */
1099static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1100static struct callout softdep_callout;
1101static int req_pending;
1102static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1103#define FLUSH_INODES		1
1104static int req_clear_remove;	/* syncer process flush some freeblks */
1105#define FLUSH_REMOVE		2
1106#define FLUSH_REMOVE_WAIT	3
1107static long num_freeblkdep;	/* number of freeblks workitems allocated */
1108
1109/*
1110 * runtime statistics
1111 */
1112static int stat_worklist_push;	/* number of worklist cleanups */
1113static int stat_blk_limit_push;	/* number of times block limit neared */
1114static int stat_ino_limit_push;	/* number of times inode limit neared */
1115static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1116static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1117static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1118static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1119static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1120static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1121static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1122static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1123static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1124static int stat_journal_min;	/* Times hit journal min threshold */
1125static int stat_journal_low;	/* Times hit journal low threshold */
1126static int stat_journal_wait;	/* Times blocked in jwait(). */
1127static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1128static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1129static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1130static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1131
1132SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1133    &max_softdeps, 0, "");
1134SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1135    &tickdelay, 0, "");
1136SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1137    &maxindirdeps, 0, "");
1138SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1139    &stat_worklist_push, 0,"");
1140SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1141    &stat_blk_limit_push, 0,"");
1142SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1143    &stat_ino_limit_push, 0,"");
1144SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1145    &stat_blk_limit_hit, 0, "");
1146SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1147    &stat_ino_limit_hit, 0, "");
1148SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1149    &stat_sync_limit_hit, 0, "");
1150SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1151    &stat_indir_blk_ptrs, 0, "");
1152SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1153    &stat_inode_bitmap, 0, "");
1154SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1155    &stat_direct_blk_ptrs, 0, "");
1156SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1157    &stat_dir_entry, 0, "");
1158SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1159    &stat_jaddref, 0, "");
1160SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1161    &stat_jnewblk, 0, "");
1162SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1163    &stat_journal_low, 0, "");
1164SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1165    &stat_journal_min, 0, "");
1166SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1167    &stat_journal_wait, 0, "");
1168SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1169    &stat_jwait_filepage, 0, "");
1170SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1171    &stat_jwait_freeblks, 0, "");
1172SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1173    &stat_jwait_inode, 0, "");
1174SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1175    &stat_jwait_newblk, 0, "");
1176
1177SYSCTL_DECL(_vfs_ffs);
1178
1179LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1180static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1181
1182static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1183SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1184	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1185
1186static struct proc *softdepproc;
1187static struct kproc_desc softdep_kp = {
1188	"softdepflush",
1189	softdep_flush,
1190	&softdepproc
1191};
1192SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1193    &softdep_kp);
1194
1195static void
1196softdep_flush(void)
1197{
1198	struct mount *nmp;
1199	struct mount *mp;
1200	struct ufsmount *ump;
1201	struct thread *td;
1202	int remaining;
1203	int vfslocked;
1204
1205	td = curthread;
1206	td->td_pflags |= TDP_NORUNNINGBUF;
1207
1208	for (;;) {
1209		kproc_suspend_check(softdepproc);
1210		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
1211		ACQUIRE_LOCK(&lk);
1212		/*
1213		 * If requested, try removing inode or removal dependencies.
1214		 */
1215		if (req_clear_inodedeps) {
1216			clear_inodedeps(td);
1217			req_clear_inodedeps -= 1;
1218			wakeup_one(&proc_waiting);
1219		}
1220		if (req_clear_remove) {
1221			clear_remove(td);
1222			req_clear_remove -= 1;
1223			wakeup_one(&proc_waiting);
1224		}
1225		FREE_LOCK(&lk);
1226		VFS_UNLOCK_GIANT(vfslocked);
1227		remaining = 0;
1228		mtx_lock(&mountlist_mtx);
1229		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1230			nmp = TAILQ_NEXT(mp, mnt_list);
1231			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
1232				continue;
1233			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1234				continue;
1235			vfslocked = VFS_LOCK_GIANT(mp);
1236			softdep_process_worklist(mp, 0);
1237			ump = VFSTOUFS(mp);
1238			remaining += ump->softdep_on_worklist -
1239				ump->softdep_on_worklist_inprogress;
1240			VFS_UNLOCK_GIANT(vfslocked);
1241			mtx_lock(&mountlist_mtx);
1242			nmp = TAILQ_NEXT(mp, mnt_list);
1243			vfs_unbusy(mp);
1244		}
1245		mtx_unlock(&mountlist_mtx);
1246		if (remaining)
1247			continue;
1248		ACQUIRE_LOCK(&lk);
1249		if (!req_pending)
1250			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1251		req_pending = 0;
1252		FREE_LOCK(&lk);
1253	}
1254}
1255
1256static void
1257worklist_speedup(void)
1258{
1259	mtx_assert(&lk, MA_OWNED);
1260	if (req_pending == 0) {
1261		req_pending = 1;
1262		wakeup(&req_pending);
1263	}
1264}
1265
1266static int
1267softdep_speedup(void)
1268{
1269
1270	worklist_speedup();
1271	bd_speedup();
1272	return speedup_syncer();
1273}
1274
1275/*
1276 * Add an item to the end of the work queue.
1277 * This routine requires that the lock be held.
1278 * This is the only routine that adds items to the list.
1279 * The following routine is the only one that removes items
1280 * and does so in order from first to last.
1281 */
1282static void
1283add_to_worklist(wk, nodelay)
1284	struct worklist *wk;
1285	int nodelay;
1286{
1287	struct ufsmount *ump;
1288
1289	mtx_assert(&lk, MA_OWNED);
1290	ump = VFSTOUFS(wk->wk_mp);
1291	if (wk->wk_state & ONWORKLIST)
1292		panic("add_to_worklist: %s(0x%X) already on list",
1293		    TYPENAME(wk->wk_type), wk->wk_state);
1294	wk->wk_state |= ONWORKLIST;
1295	if (LIST_EMPTY(&ump->softdep_workitem_pending))
1296		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1297	else
1298		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1299	ump->softdep_worklist_tail = wk;
1300	ump->softdep_on_worklist += 1;
1301	if (nodelay)
1302		worklist_speedup();
1303}
1304
1305/*
1306 * Remove the item to be processed. If we are removing the last
1307 * item on the list, we need to recalculate the tail pointer.
1308 */
1309static void
1310remove_from_worklist(wk)
1311	struct worklist *wk;
1312{
1313	struct ufsmount *ump;
1314	struct worklist *wkend;
1315
1316	ump = VFSTOUFS(wk->wk_mp);
1317	WORKLIST_REMOVE(wk);
1318	if (wk == ump->softdep_worklist_tail) {
1319		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
1320			if (LIST_NEXT(wkend, wk_list) == NULL)
1321				break;
1322		ump->softdep_worklist_tail = wkend;
1323	}
1324	ump->softdep_on_worklist -= 1;
1325}
1326
1327/*
1328 * Process that runs once per second to handle items in the background queue.
1329 *
1330 * Note that we ensure that items are processed in the order in which they
1331 * appear in the queue. The code below depends on this property to ensure
1332 * that blocks of a file are freed before the inode itself is freed. This
1333 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1334 * until all the old ones have been purged from the dependency lists.
1335 */
1336int
1337softdep_process_worklist(mp, full)
1338	struct mount *mp;
1339	int full;
1340{
1341	struct thread *td = curthread;
1342	int cnt, matchcnt, loopcount;
1343	struct ufsmount *ump;
1344	long starttime;
1345
1346	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1347	/*
1348	 * Record the process identifier of our caller so that we can give
1349	 * this process preferential treatment in request_cleanup below.
1350	 */
1351	matchcnt = 0;
1352	ump = VFSTOUFS(mp);
1353	ACQUIRE_LOCK(&lk);
1354	loopcount = 1;
1355	starttime = time_second;
1356	softdep_process_journal(mp, full?MNT_WAIT:0);
1357	while (ump->softdep_on_worklist > 0) {
1358		if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
1359			break;
1360		else
1361			matchcnt += cnt;
1362		/*
1363		 * If requested, try removing inode or removal dependencies.
1364		 */
1365		if (req_clear_inodedeps) {
1366			clear_inodedeps(td);
1367			req_clear_inodedeps -= 1;
1368			wakeup_one(&proc_waiting);
1369		}
1370		if (req_clear_remove) {
1371			clear_remove(td);
1372			req_clear_remove -= 1;
1373			wakeup_one(&proc_waiting);
1374		}
1375		/*
1376		 * We do not generally want to stop for buffer space, but if
1377		 * we are really being a buffer hog, we will stop and wait.
1378		 */
1379		if (loopcount++ % 128 == 0) {
1380			FREE_LOCK(&lk);
1381			uio_yield();
1382			bwillwrite();
1383			ACQUIRE_LOCK(&lk);
1384		}
1385		/*
1386		 * Never allow processing to run for more than one
1387		 * second. Otherwise the other mountpoints may get
1388		 * excessively backlogged.
1389		 */
1390		if (!full && starttime != time_second)
1391			break;
1392	}
1393	FREE_LOCK(&lk);
1394	return (matchcnt);
1395}
1396
1397/*
1398 * Process all removes associated with a vnode if we are running out of
1399 * journal space.  Any other process that attempts to flush these
1400 * will be unable to do so, as we have the vnode locked.
1401 */
1402static void
1403process_removes(vp)
1404	struct vnode *vp;
1405{
1406	struct inodedep *inodedep;
1407	struct dirrem *dirrem;
1408	struct mount *mp;
1409	ino_t inum;
1410
1411	mtx_assert(&lk, MA_OWNED);
1412
1413	mp = vp->v_mount;
1414	inum = VTOI(vp)->i_number;
1415	for (;;) {
1416		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1417			return;
1418		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
1419			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1420			    (COMPLETE | ONWORKLIST))
1421				break;
1422		if (dirrem == NULL)
1423			return;
1424		/*
1425		 * If another thread is trying to lock this vnode it will
1426		 * fail but we must wait for it to do so before we can
1427		 * proceed.
1428		 */
1429		if (dirrem->dm_state & INPROGRESS) {
1430			dirrem->dm_state |= IOWAITING;
1431			msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
1432			continue;
1433		}
1434		remove_from_worklist(&dirrem->dm_list);
1435		FREE_LOCK(&lk);
1436		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1437			panic("process_removes: suspended filesystem");
1438		handle_workitem_remove(dirrem, vp);
1439		vn_finished_secondary_write(mp);
1440		ACQUIRE_LOCK(&lk);
1441	}
1442}
1443
1444/*
1445 * Process one item on the worklist.
1446 */
1447static int
1448process_worklist_item(mp, flags)
1449	struct mount *mp;
1450	int flags;
1451{
1452	struct worklist *wk, *wkXXX;
1453	struct ufsmount *ump;
1454	struct vnode *vp;
1455	int matchcnt = 0;
1456
1457	mtx_assert(&lk, MA_OWNED);
1458	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1459	/*
1460	 * If we are being called because of a process doing a
1461	 * copy-on-write, then it is not safe to write as we may
1462	 * recurse into the copy-on-write routine.
1463	 */
1464	if (curthread->td_pflags & TDP_COWINPROGRESS)
1465		return (-1);
1466	/*
1467	 * Normally we just process each item on the worklist in order.
1468	 * However, if we are in a situation where we cannot lock any
1469	 * inodes, we have to skip over any dirrem requests whose
1470	 * vnodes are resident and locked.
1471	 */
1472	vp = NULL;
1473	ump = VFSTOUFS(mp);
1474	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
1475		if (wk->wk_state & INPROGRESS) {
1476			wkXXX = wk;
1477			continue;
1478		}
1479		wkXXX = wk;	/* Record the last valid wk pointer. */
1480		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
1481			break;
1482		wk->wk_state |= INPROGRESS;
1483		ump->softdep_on_worklist_inprogress++;
1484		FREE_LOCK(&lk);
1485		ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
1486		    LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
1487		ACQUIRE_LOCK(&lk);
1488		if (wk->wk_state & IOWAITING) {
1489			wk->wk_state &= ~IOWAITING;
1490			wakeup(wk);
1491		}
1492		wk->wk_state &= ~INPROGRESS;
1493		ump->softdep_on_worklist_inprogress--;
1494		if (vp != NULL)
1495			break;
1496	}
1497	if (wk == 0)
1498		return (-1);
1499	remove_from_worklist(wk);
1500	FREE_LOCK(&lk);
1501	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1502		panic("process_worklist_item: suspended filesystem");
1503	matchcnt++;
1504	switch (wk->wk_type) {
1505
1506	case D_DIRREM:
1507		/* removal of a directory entry */
1508		handle_workitem_remove(WK_DIRREM(wk), vp);
1509		if (vp)
1510			vput(vp);
1511		break;
1512
1513	case D_FREEBLKS:
1514		/* releasing blocks and/or fragments from a file */
1515		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
1516		break;
1517
1518	case D_FREEFRAG:
1519		/* releasing a fragment when replaced as a file grows */
1520		handle_workitem_freefrag(WK_FREEFRAG(wk));
1521		break;
1522
1523	case D_FREEFILE:
1524		/* releasing an inode when its link count drops to 0 */
1525		handle_workitem_freefile(WK_FREEFILE(wk));
1526		break;
1527
1528	case D_FREEWORK:
1529		/* Final block in an indirect was freed. */
1530		handle_workitem_indirblk(WK_FREEWORK(wk));
1531		break;
1532
1533	default:
1534		panic("%s_process_worklist: Unknown type %s",
1535		    "softdep", TYPENAME(wk->wk_type));
1536		/* NOTREACHED */
1537	}
1538	vn_finished_secondary_write(mp);
1539	ACQUIRE_LOCK(&lk);
1540	return (matchcnt);
1541}
1542
1543/*
1544 * Move dependencies from one buffer to another.
1545 */
1546int
1547softdep_move_dependencies(oldbp, newbp)
1548	struct buf *oldbp;
1549	struct buf *newbp;
1550{
1551	struct worklist *wk, *wktail;
1552	int dirty;
1553
1554	dirty = 0;
1555	wktail = NULL;
1556	ACQUIRE_LOCK(&lk);
1557	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1558		LIST_REMOVE(wk, wk_list);
1559		if (wk->wk_type == D_BMSAFEMAP &&
1560		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
1561			dirty = 1;
1562		if (wktail == 0)
1563			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1564		else
1565			LIST_INSERT_AFTER(wktail, wk, wk_list);
1566		wktail = wk;
1567	}
1568	FREE_LOCK(&lk);
1569
1570	return (dirty);
1571}
1572
1573/*
1574 * Purge the work list of all items associated with a particular mount point.
1575 */
1576int
1577softdep_flushworklist(oldmnt, countp, td)
1578	struct mount *oldmnt;
1579	int *countp;
1580	struct thread *td;
1581{
1582	struct vnode *devvp;
1583	int count, error = 0;
1584	struct ufsmount *ump;
1585
1586	/*
1587	 * Alternately flush the block device associated with the mount
1588	 * point and process any dependencies that the flushing
1589	 * creates. We continue until no more worklist dependencies
1590	 * are found.
1591	 */
1592	*countp = 0;
1593	ump = VFSTOUFS(oldmnt);
1594	devvp = ump->um_devvp;
1595	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1596		*countp += count;
1597		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1598		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1599		VOP_UNLOCK(devvp, 0);
1600		if (error)
1601			break;
1602	}
1603	return (error);
1604}
1605
1606int
1607softdep_waitidle(struct mount *mp)
1608{
1609	struct ufsmount *ump;
1610	int error;
1611	int i;
1612
1613	ump = VFSTOUFS(mp);
1614	ACQUIRE_LOCK(&lk);
1615	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1616		ump->softdep_req = 1;
1617		if (ump->softdep_on_worklist)
1618			panic("softdep_waitidle: work added after flush.");
1619		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1620	}
1621	ump->softdep_req = 0;
1622	FREE_LOCK(&lk);
1623	error = 0;
1624	if (i == 10) {
1625		error = EBUSY;
1626		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1627		    mp);
1628	}
1629
1630	return (error);
1631}
1632
1633/*
1634 * Flush all vnodes and worklist items associated with a specified mount point.
1635 */
1636int
1637softdep_flushfiles(oldmnt, flags, td)
1638	struct mount *oldmnt;
1639	int flags;
1640	struct thread *td;
1641{
1642	int error, depcount, loopcnt, retry_flush_count, retry;
1643
1644	loopcnt = 10;
1645	retry_flush_count = 3;
1646retry_flush:
1647	error = 0;
1648
1649	/*
1650	 * Alternately flush the vnodes associated with the mount
1651	 * point and process any dependencies that the flushing
1652	 * creates. In theory, this loop can happen at most twice,
1653	 * but we give it a few extra iterations just to be sure.
1654	 */
1655	for (; loopcnt > 0; loopcnt--) {
1656		/*
1657		 * Do another flush in case any vnodes were brought in
1658		 * as part of the cleanup operations.
1659		 */
1660		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1661			break;
1662		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1663		    depcount == 0)
1664			break;
1665	}
1666	/*
1667	 * If we are unmounting then it is an error to fail. If we
1668	 * are simply trying to downgrade to read-only, then filesystem
1669	 * activity can keep us busy forever, so we just fail with EBUSY.
1670	 */
1671	if (loopcnt == 0) {
1672		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1673			panic("softdep_flushfiles: looping");
1674		error = EBUSY;
1675	}
1676	if (!error)
1677		error = softdep_waitidle(oldmnt);
1678	if (!error) {
1679		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1680			retry = 0;
1681			MNT_ILOCK(oldmnt);
1682			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1683			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1684			if (oldmnt->mnt_nvnodelistsize > 0) {
1685				if (--retry_flush_count > 0) {
1686					retry = 1;
1687					loopcnt = 3;
1688				} else
1689					error = EBUSY;
1690			}
1691			MNT_IUNLOCK(oldmnt);
1692			if (retry)
1693				goto retry_flush;
1694		}
1695	}
1696	return (error);
1697}
1698
1699/*
1700 * Structure hashing.
1701 *
1702 * There are three types of structures that can be looked up:
1703 *	1) pagedep structures identified by mount point, inode number,
1704 *	   and logical block.
1705 *	2) inodedep structures identified by mount point and inode number.
1706 *	3) newblk structures identified by mount point and
1707 *	   physical block number.
1708 *
1709 * The "pagedep" and "inodedep" dependency structures are hashed
1710 * separately from the file blocks and inodes to which they correspond.
1711 * This separation helps when the in-memory copy of an inode or
1712 * file block must be replaced. It also obviates the need to access
1713 * an inode or file page when simply updating (or de-allocating)
1714 * dependency structures. Lookup of newblk structures is needed to
1715 * find newly allocated blocks when trying to associate them with
1716 * their allocdirect or allocindir structure.
1717 *
1718 * The lookup routines optionally create and hash a new instance when
1719 * an existing entry is not found.
1720 */
1721#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
1722#define NODELAY		0x0002	/* cannot do background work */
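/*
 * Typical lookup pattern (editorial sketch): callers hold the softdep
 * lock and pass DEPALLOC when the entry should be created on a miss,
 * for example:
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep) == 0)
 *		... a new inodedep was allocated and initialized ...
 *	FREE_LOCK(&lk);
 */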
1723
1724/*
1725 * Structures and routines associated with pagedep caching.
1726 */
1727LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1728u_long	pagedep_hash;		/* size of hash table - 1 */
1729#define	PAGEDEP_HASH(mp, inum, lbn) \
1730	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1731	    pagedep_hash])
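/*
 * Editorial note: the mount pointer is shifted right by 13 bits,
 * presumably to discard low-order bits that vary little between
 * allocations, then folded with the inode number and logical block
 * number and masked with pagedep_hash (the table size minus one).
 * The inodedep and newblk hashes below follow the same scheme.
 */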
1732
1733static int
1734pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1735	struct pagedep_hashhead *pagedephd;
1736	ino_t ino;
1737	ufs_lbn_t lbn;
1738	struct mount *mp;
1739	int flags;
1740	struct pagedep **pagedeppp;
1741{
1742	struct pagedep *pagedep;
1743
1744	LIST_FOREACH(pagedep, pagedephd, pd_hash)
1745		if (ino == pagedep->pd_ino &&
1746		    lbn == pagedep->pd_lbn &&
1747		    mp == pagedep->pd_list.wk_mp)
1748			break;
1749	if (pagedep) {
1750		*pagedeppp = pagedep;
1751		if ((flags & DEPALLOC) != 0 &&
1752		    (pagedep->pd_state & ONWORKLIST) == 0)
1753			return (0);
1754		return (1);
1755	}
1756	*pagedeppp = NULL;
1757	return (0);
1758}
1759/*
1760 * Look up a pagedep. Return 1 if found, 0 if not found, or 0 if found
1761 * while asked to allocate but not yet associated with any buffer.
1762 * If not found, allocate if DEPALLOC flag is passed.
1763 * Found or allocated entry is returned in pagedeppp.
1764 * This routine must be called with splbio interrupts blocked.
1765 */
1766static int
1767pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
1768	struct mount *mp;
1769	ino_t ino;
1770	ufs_lbn_t lbn;
1771	int flags;
1772	struct pagedep **pagedeppp;
1773{
1774	struct pagedep *pagedep;
1775	struct pagedep_hashhead *pagedephd;
1776	int ret;
1777	int i;
1778
1779	mtx_assert(&lk, MA_OWNED);
1780	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
1781
1782	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1783	if (*pagedeppp || (flags & DEPALLOC) == 0)
1784		return (ret);
1785	FREE_LOCK(&lk);
1786	pagedep = malloc(sizeof(struct pagedep),
1787	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1788	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1789	ACQUIRE_LOCK(&lk);
1790	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1791	if (*pagedeppp) {
1792		WORKITEM_FREE(pagedep, D_PAGEDEP);
1793		return (ret);
1794	}
1795	pagedep->pd_ino = ino;
1796	pagedep->pd_lbn = lbn;
1797	LIST_INIT(&pagedep->pd_dirremhd);
1798	LIST_INIT(&pagedep->pd_pendinghd);
1799	for (i = 0; i < DAHASHSZ; i++)
1800		LIST_INIT(&pagedep->pd_diraddhd[i]);
1801	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1802	*pagedeppp = pagedep;
1803	return (0);
1804}
1805
1806/*
1807 * Structures and routines associated with inodedep caching.
1808 */
1809LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1810static u_long	inodedep_hash;	/* size of hash table - 1 */
1811static long	num_inodedep;	/* number of inodedep allocated */
1812#define	INODEDEP_HASH(fs, inum) \
1813      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1814
1815static int
1816inodedep_find(inodedephd, fs, inum, inodedeppp)
1817	struct inodedep_hashhead *inodedephd;
1818	struct fs *fs;
1819	ino_t inum;
1820	struct inodedep **inodedeppp;
1821{
1822	struct inodedep *inodedep;
1823
1824	LIST_FOREACH(inodedep, inodedephd, id_hash)
1825		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1826			break;
1827	if (inodedep) {
1828		*inodedeppp = inodedep;
1829		return (1);
1830	}
1831	*inodedeppp = NULL;
1832
1833	return (0);
1834}
1835/*
1836 * Look up an inodedep. Return 1 if found, 0 if not found.
1837 * If not found, allocate if DEPALLOC flag is passed.
1838 * Found or allocated entry is returned in inodedeppp.
1839 * This routine must be called with splbio interrupts blocked.
1840 */
1841static int
1842inodedep_lookup(mp, inum, flags, inodedeppp)
1843	struct mount *mp;
1844	ino_t inum;
1845	int flags;
1846	struct inodedep **inodedeppp;
1847{
1848	struct inodedep *inodedep;
1849	struct inodedep_hashhead *inodedephd;
1850	struct fs *fs;
1851
1852	mtx_assert(&lk, MA_OWNED);
1853	fs = VFSTOUFS(mp)->um_fs;
1854	inodedephd = INODEDEP_HASH(fs, inum);
1855
1856	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1857		return (1);
1858	if ((flags & DEPALLOC) == 0)
1859		return (0);
1860	/*
1861	 * If we are over our limit, try to improve the situation.
1862	 */
1863	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
1864		request_cleanup(mp, FLUSH_INODES);
1865	FREE_LOCK(&lk);
1866	inodedep = malloc(sizeof(struct inodedep),
1867		M_INODEDEP, M_SOFTDEP_FLAGS);
1868	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1869	ACQUIRE_LOCK(&lk);
1870	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1871		WORKITEM_FREE(inodedep, D_INODEDEP);
1872		return (1);
1873	}
1874	num_inodedep += 1;
1875	inodedep->id_fs = fs;
1876	inodedep->id_ino = inum;
1877	inodedep->id_state = ALLCOMPLETE;
1878	inodedep->id_nlinkdelta = 0;
1879	inodedep->id_savedino1 = NULL;
1880	inodedep->id_savedsize = -1;
1881	inodedep->id_savedextsize = -1;
1882	inodedep->id_savednlink = -1;
1883	inodedep->id_bmsafemap = NULL;
1884	inodedep->id_mkdiradd = NULL;
1885	LIST_INIT(&inodedep->id_dirremhd);
1886	LIST_INIT(&inodedep->id_pendinghd);
1887	LIST_INIT(&inodedep->id_inowait);
1888	LIST_INIT(&inodedep->id_bufwait);
1889	TAILQ_INIT(&inodedep->id_inoreflst);
1890	TAILQ_INIT(&inodedep->id_inoupdt);
1891	TAILQ_INIT(&inodedep->id_newinoupdt);
1892	TAILQ_INIT(&inodedep->id_extupdt);
1893	TAILQ_INIT(&inodedep->id_newextupdt);
1894	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1895	*inodedeppp = inodedep;
1896	return (0);
1897}
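/*
 * Note that inodedep allocation is throttled above: once num_inodedep
 * exceeds max_softdeps (sized from desiredvnodes in softdep_initialize())
 * and the caller has not passed NODELAY, request_cleanup() is asked to
 * flush existing inode dependencies before the new one is created.
 */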
1898
1899/*
1900 * Structures and routines associated with newblk caching.
1901 */
1902LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1903u_long	newblk_hash;		/* size of hash table - 1 */
1904#define	NEWBLK_HASH(fs, inum) \
1905	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1906
1907static int
1908newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
1909	struct newblk_hashhead *newblkhd;
1910	struct mount *mp;
1911	ufs2_daddr_t newblkno;
1912	int flags;
1913	struct newblk **newblkpp;
1914{
1915	struct newblk *newblk;
1916
1917	LIST_FOREACH(newblk, newblkhd, nb_hash) {
1918		if (newblkno != newblk->nb_newblkno)
1919			continue;
1920		if (mp != newblk->nb_list.wk_mp)
1921			continue;
1922		/*
1923		 * If we're creating a new dependency don't match those that
1924		 * have already been converted to allocdirects.  This is for
1925		 * a frag extend.
1926		 */
1927		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
1928			continue;
1929		break;
1930	}
1931	if (newblk) {
1932		*newblkpp = newblk;
1933		return (1);
1934	}
1935	*newblkpp = NULL;
1936	return (0);
1937}
1938
1939/*
1940 * Look up a newblk. Return 1 if found, 0 if not found.
1941 * If not found, allocate if DEPALLOC flag is passed.
1942 * Found or allocated entry is returned in newblkpp.
1943 */
1944static int
1945newblk_lookup(mp, newblkno, flags, newblkpp)
1946	struct mount *mp;
1947	ufs2_daddr_t newblkno;
1948	int flags;
1949	struct newblk **newblkpp;
1950{
1951	struct newblk *newblk;
1952	struct newblk_hashhead *newblkhd;
1953
1954	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
1955	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
1956		return (1);
1957	if ((flags & DEPALLOC) == 0)
1958		return (0);
1959	FREE_LOCK(&lk);
1960	newblk = malloc(sizeof(union allblk), M_NEWBLK,
1961	    M_SOFTDEP_FLAGS | M_ZERO);
1962	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
1963	ACQUIRE_LOCK(&lk);
1964	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
1965		WORKITEM_FREE(newblk, D_NEWBLK);
1966		return (1);
1967	}
1968	newblk->nb_freefrag = NULL;
1969	LIST_INIT(&newblk->nb_indirdeps);
1970	LIST_INIT(&newblk->nb_newdirblk);
1971	LIST_INIT(&newblk->nb_jwork);
1972	newblk->nb_state = ATTACHED;
1973	newblk->nb_newblkno = newblkno;
1974	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1975	*newblkpp = newblk;
1976	return (0);
1977}
1978
1979/*
1980 * Executed during filesystem subsystem initialization before
1981 * mounting any filesystems.
1982 */
1983void
1984softdep_initialize()
1985{
1986
1987	LIST_INIT(&mkdirlisthd);
1988	max_softdeps = desiredvnodes * 4;
1989	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
1990	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1991	newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash);
1992	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
1993
1994	/* initialise bioops hack */
1995	bioops.io_start = softdep_disk_io_initiation;
1996	bioops.io_complete = softdep_disk_write_complete;
1997	bioops.io_deallocate = softdep_deallocate_dependencies;
1998	bioops.io_countdeps = softdep_count_dependencies;
1999
2000	/* Initialize the callout with an mtx. */
2001	callout_init_mtx(&softdep_callout, &lk, 0);
2002}
2003
2004/*
2005 * Executed after all filesystems have been unmounted during
2006 * filesystem module unload.
2007 */
2008void
2009softdep_uninitialize()
2010{
2011
2012	callout_drain(&softdep_callout);
2013	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2014	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2015	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2016	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2017}
2018
2019/*
2020 * Called at mount time to notify the dependency code that a
2021 * filesystem wishes to use it.
2022 */
2023int
2024softdep_mount(devvp, mp, fs, cred)
2025	struct vnode *devvp;
2026	struct mount *mp;
2027	struct fs *fs;
2028	struct ucred *cred;
2029{
2030	struct csum_total cstotal;
2031	struct ufsmount *ump;
2032	struct cg *cgp;
2033	struct buf *bp;
2034	int error, cyl;
2035
2036	MNT_ILOCK(mp);
2037	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2038	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2039		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2040			MNTK_SOFTDEP;
2041		mp->mnt_noasync++;
2042	}
2043	MNT_IUNLOCK(mp);
2044	ump = VFSTOUFS(mp);
2045	LIST_INIT(&ump->softdep_workitem_pending);
2046	LIST_INIT(&ump->softdep_journal_pending);
2047	TAILQ_INIT(&ump->softdep_unlinked);
2048	ump->softdep_worklist_tail = NULL;
2049	ump->softdep_on_worklist = 0;
2050	ump->softdep_deps = 0;
2051	if ((fs->fs_flags & FS_SUJ) &&
2052	    (error = journal_mount(mp, fs, cred)) != 0) {
2053		printf("Failed to start journal: %d\n", error);
2054		return (error);
2055	}
2056	/*
2057	 * When doing soft updates, the counters in the
2058	 * superblock may have gotten out of sync. Recomputation
2059	 * can take a long time and can be deferred for background
2060	 * fsck.  However, the old behavior of scanning the cylinder
2061	 * groups and recalculating them at mount time is available
2062	 * by setting vfs.ffs.compute_summary_at_mount to one.
2063	 */
2064	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2065		return (0);
2066	bzero(&cstotal, sizeof cstotal);
2067	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2068		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2069		    fs->fs_cgsize, cred, &bp)) != 0) {
2070			brelse(bp);
2071			return (error);
2072		}
2073		cgp = (struct cg *)bp->b_data;
2074		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2075		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2076		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2077		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2078		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2079		brelse(bp);
2080	}
2081#ifdef DEBUG
2082	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2083		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2084#endif
2085	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2086	return (0);
2087}
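/*
 * As the comment above notes, the cylinder group scan is skipped by
 * default.  An administrator who wants the summary recomputed at mount
 * time can set the sysctl before mounting, for example:
 *
 *	sysctl vfs.ffs.compute_summary_at_mount=1
 */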
2088
2089void
2090softdep_unmount(mp)
2091	struct mount *mp;
2092{
2093
2094	if (mp->mnt_kern_flag & MNTK_SUJ)
2095		journal_unmount(mp);
2096}
2097
2098struct jblocks {
2099	struct jseglst	jb_segs;	/* TAILQ of current segments. */
2100	struct jseg	*jb_writeseg;	/* Next write to complete. */
2101	struct jextent	*jb_extent;	/* Extent array. */
2102	uint64_t	jb_nextseq;	/* Next sequence number. */
2103	uint64_t	jb_oldestseq;	/* Oldest active sequence number. */
2104	int		jb_avail;	/* Available extents. */
2105	int		jb_used;	/* Last used extent. */
2106	int		jb_head;	/* Allocator head. */
2107	int		jb_off;		/* Allocator extent offset. */
2108	int		jb_blocks;	/* Total disk blocks covered. */
2109	int		jb_free;	/* Total disk blocks free. */
2110	int		jb_min;		/* Minimum free space. */
2111	int		jb_low;		/* Low on space. */
2112	int		jb_age;		/* Insertion time of oldest rec. */
2113	int		jb_suspended;	/* Did journal suspend writes? */
2114};
2115
2116struct jextent {
2117	ufs2_daddr_t	je_daddr;	/* Disk block address. */
2118	int		je_blocks;	/* Disk block count. */
2119};
2120
2121static struct jblocks *
2122jblocks_create(void)
2123{
2124	struct jblocks *jblocks;
2125
2126	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2127	TAILQ_INIT(&jblocks->jb_segs);
2128	jblocks->jb_avail = 10;
2129	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2130	    M_JBLOCKS, M_WAITOK | M_ZERO);
2131
2132	return (jblocks);
2133}
2134
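/*
 * Allocate journal space from the extent array in a circular fashion.
 * An allocation never spans an extent boundary, so the number of bytes
 * actually granted is returned via 'actual' and may be smaller than the
 * number of bytes requested.
 */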
2135static ufs2_daddr_t
2136jblocks_alloc(jblocks, bytes, actual)
2137	struct jblocks *jblocks;
2138	int bytes;
2139	int *actual;
2140{
2141	ufs2_daddr_t daddr;
2142	struct jextent *jext;
2143	int freecnt;
2144	int blocks;
2145
2146	blocks = bytes / DEV_BSIZE;
2147	jext = &jblocks->jb_extent[jblocks->jb_head];
2148	freecnt = jext->je_blocks - jblocks->jb_off;
2149	if (freecnt == 0) {
2150		jblocks->jb_off = 0;
2151		if (++jblocks->jb_head > jblocks->jb_used)
2152			jblocks->jb_head = 0;
2153		jext = &jblocks->jb_extent[jblocks->jb_head];
2154		freecnt = jext->je_blocks;
2155	}
2156	if (freecnt > blocks)
2157		freecnt = blocks;
2158	*actual = freecnt * DEV_BSIZE;
2159	daddr = jext->je_daddr + jblocks->jb_off;
2160	jblocks->jb_off += freecnt;
2161	jblocks->jb_free -= freecnt;
2162
2163	return (daddr);
2164}
2165
2166static void
2167jblocks_free(jblocks, mp, bytes)
2168	struct jblocks *jblocks;
2169	struct mount *mp;
2170	int bytes;
2171{
2172
2173	jblocks->jb_free += bytes / DEV_BSIZE;
2174	if (jblocks->jb_suspended)
2175		worklist_speedup();
2176	wakeup(jblocks);
2177}
2178
2179static void
2180jblocks_destroy(jblocks)
2181	struct jblocks *jblocks;
2182{
2183
2184	if (jblocks->jb_extent)
2185		free(jblocks->jb_extent, M_JBLOCKS);
2186	free(jblocks, M_JBLOCKS);
2187}
2188
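/*
 * Add 'blocks' disk blocks starting at 'daddr' to the set of blocks
 * backing the journal.  A range that is physically contiguous with the
 * last extent simply extends it; for example, adding daddr 1000/len 8
 * followed by daddr 1008/len 8 yields a single 16-block extent (the
 * numbers are purely illustrative).  A discontiguous range starts a new
 * extent, doubling the extent array when it is full.
 */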
2189static void
2190jblocks_add(jblocks, daddr, blocks)
2191	struct jblocks *jblocks;
2192	ufs2_daddr_t daddr;
2193	int blocks;
2194{
2195	struct jextent *jext;
2196
2197	jblocks->jb_blocks += blocks;
2198	jblocks->jb_free += blocks;
2199	jext = &jblocks->jb_extent[jblocks->jb_used];
2200	/* Adding the first block. */
2201	if (jext->je_daddr == 0) {
2202		jext->je_daddr = daddr;
2203		jext->je_blocks = blocks;
2204		return;
2205	}
2206	/* Extending the last extent. */
2207	if (jext->je_daddr + jext->je_blocks == daddr) {
2208		jext->je_blocks += blocks;
2209		return;
2210	}
2211	/* Adding a new extent. */
2212	if (++jblocks->jb_used == jblocks->jb_avail) {
2213		jblocks->jb_avail *= 2;
2214		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2215		    M_JBLOCKS, M_WAITOK | M_ZERO);
2216		memcpy(jext, jblocks->jb_extent,
2217		    sizeof(struct jextent) * jblocks->jb_used);
2218		free(jblocks->jb_extent, M_JBLOCKS);
2219		jblocks->jb_extent = jext;
2220	}
2221	jext = &jblocks->jb_extent[jblocks->jb_used];
2222	jext->je_daddr = daddr;
2223	jext->je_blocks = blocks;
2224	return;
2225}
2226
2227int
2228softdep_journal_lookup(mp, vpp)
2229	struct mount *mp;
2230	struct vnode **vpp;
2231{
2232	struct componentname cnp;
2233	struct vnode *dvp;
2234	ino_t sujournal;
2235	int error;
2236
2237	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2238	if (error)
2239		return (error);
2240	bzero(&cnp, sizeof(cnp));
2241	cnp.cn_nameiop = LOOKUP;
2242	cnp.cn_flags = ISLASTCN;
2243	cnp.cn_thread = curthread;
2244	cnp.cn_cred = curthread->td_ucred;
2245	cnp.cn_pnbuf = SUJ_FILE;
2246	cnp.cn_nameptr = SUJ_FILE;
2247	cnp.cn_namelen = strlen(SUJ_FILE);
2248	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2249	vput(dvp);
2250	if (error != 0)
2251		return (error);
2252	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2253	return (error);
2254}
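/*
 * The journal is an ordinary inode looked up by name (SUJ_FILE) in the
 * root directory above.  It is created administratively, e.g. with
 * tunefs as the message in journal_mount() suggests, not by this code.
 */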
2255
2256/*
2257 * Open and verify the journal file.
2258 */
2259static int
2260journal_mount(mp, fs, cred)
2261	struct mount *mp;
2262	struct fs *fs;
2263	struct ucred *cred;
2264{
2265	struct jblocks *jblocks;
2266	struct vnode *vp;
2267	struct inode *ip;
2268	ufs2_daddr_t blkno;
2269	int bcount;
2270	int error;
2271	int i;
2272
2273	mp->mnt_kern_flag |= MNTK_SUJ;
2274	error = softdep_journal_lookup(mp, &vp);
2275	if (error != 0) {
2276		printf("Failed to find journal.  Use tunefs to create one\n");
2277		return (error);
2278	}
2279	ip = VTOI(vp);
2280	if (ip->i_size < SUJ_MIN) {
2281		error = ENOSPC;
2282		goto out;
2283	}
2284	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2285	jblocks = jblocks_create();
2286	for (i = 0; i < bcount; i++) {
2287		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2288		if (error)
2289			break;
2290		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2291	}
2292	if (error) {
2293		jblocks_destroy(jblocks);
2294		goto out;
2295	}
2296	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2297	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2298	/*
2299	 * Only validate the journal contents if the filesystem is clean,
2300	 * otherwise we write the logs but they'll never be used.  If the
2301	 * filesystem was still dirty when we mounted it the journal is
2302	 * invalid and a new journal can only be valid if it starts from a
2303	 * clean mount.
2304	 */
2305	if (fs->fs_clean) {
2306		DIP_SET(ip, i_modrev, fs->fs_mtime);
2307		ip->i_flags |= IN_MODIFIED;
2308		ffs_update(vp, 1);
2309	}
2310	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2311out:
2312	vput(vp);
2313	return (error);
2314}
2315
2316static void
2317journal_unmount(mp)
2318	struct mount *mp;
2319{
2320	struct ufsmount *ump;
2321
2322	ump = VFSTOUFS(mp);
2323	if (ump->softdep_jblocks)
2324		jblocks_destroy(ump->softdep_jblocks);
2325	ump->softdep_jblocks = NULL;
2326}
2327
2328/*
2329 * Called when a journal record is ready to be written.  Space is allocated
2330 * and the journal entry is created when the journal is flushed to stable
2331 * store.
2332 */
2333static void
2334add_to_journal(wk)
2335	struct worklist *wk;
2336{
2337	struct ufsmount *ump;
2338
2339	mtx_assert(&lk, MA_OWNED);
2340	ump = VFSTOUFS(wk->wk_mp);
2341	if (wk->wk_state & ONWORKLIST)
2342		panic("add_to_journal: %s(0x%X) already on list",
2343		    TYPENAME(wk->wk_type), wk->wk_state);
2344	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2345	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2346		ump->softdep_jblocks->jb_age = ticks;
2347		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2348	} else
2349		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2350	ump->softdep_journal_tail = wk;
2351	ump->softdep_on_journal += 1;
2352}
2353
2354/*
2355 * Remove an arbitrary item from the journal worklist and maintain the tail
2356 * pointer.  This happens when a new operation obviates the need to
2357 * journal an old operation.
2358 */
2359static void
2360remove_from_journal(wk)
2361	struct worklist *wk;
2362{
2363	struct ufsmount *ump;
2364
2365	mtx_assert(&lk, MA_OWNED);
2366	ump = VFSTOUFS(wk->wk_mp);
2367#ifdef DEBUG	/* XXX Expensive, temporary. */
2368	{
2369		struct worklist *wkn;
2370
2371		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2372			if (wkn == wk)
2373				break;
2374		if (wkn == NULL)
2375			panic("remove_from_journal: %p is not in journal", wk);
2376	}
2377#endif
2378	/*
2379	 * We emulate a TAILQ to save space in most structures which do not
2380	 * require TAILQ semantics.  Here we must update the tail pointer
2381	 * when the entry being removed happens to be the current tail.
2382	 */
2383	if (ump->softdep_journal_tail == wk)
2384		ump->softdep_journal_tail =
2385		    (struct worklist *)wk->wk_list.le_prev;
2386
2387	WORKLIST_REMOVE(wk);
2388	ump->softdep_on_journal -= 1;
2389}
2390
2391/*
2392 * Check for journal space as well as dependency limits so the prelink
2393 * code can throttle both journaled and non-journaled filesystems.
2394 * The thresh argument is 0 to check against the low watermark and 1 for the minimum.
2395 */
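/*
 * As a rough illustration, assuming 512-byte device blocks and 32-byte
 * journal records: a 4MB journal gives jb_free = 8192 blocks with
 * jb_low about 2730 and jb_min about 819.  The space needed by records
 * that are queued but not yet written (softdep_on_journal * JREC_SIZE,
 * converted to disk blocks) is charged against jb_free before the
 * comparison with the selected threshold.
 */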
2396static int
2397journal_space(ump, thresh)
2398	struct ufsmount *ump;
2399	int thresh;
2400{
2401	struct jblocks *jblocks;
2402	int avail;
2403
2404	/*
2405	 * We use a tighter restriction here to prevent request_cleanup()
2406	 * running in other threads from blocking on locks we currently hold.
2407	 */
2408	if (num_inodedep > (max_softdeps / 10) * 9)
2409		return (0);
2410
2411	jblocks = ump->softdep_jblocks;
2412	if (jblocks == NULL)
2413		return (1);
2414	if (thresh)
2415		thresh = jblocks->jb_min;
2416	else
2417		thresh = jblocks->jb_low;
2418	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2419	avail = jblocks->jb_free - avail;
2420
2421	return (avail > thresh);
2422}
2423
2424static void
2425journal_suspend(ump)
2426	struct ufsmount *ump;
2427{
2428	struct jblocks *jblocks;
2429	struct mount *mp;
2430
2431	mp = UFSTOVFS(ump);
2432	jblocks = ump->softdep_jblocks;
2433	MNT_ILOCK(mp);
2434	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2435		stat_journal_min++;
2436		mp->mnt_kern_flag |= MNTK_SUSPEND;
2437		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2438	}
2439	jblocks->jb_suspended = 1;
2440	MNT_IUNLOCK(mp);
2441}
2442
2443/*
2444 * Called before any allocation function to be certain that there is
2445 * sufficient space in the journal prior to creating any new records.
2446 * Since in the case of block allocation we may have multiple locked
2447 * buffers at the time of the actual allocation we can not block
2448 * when the journal records are created.  Doing so would create a deadlock
2449 * if any of these buffers needed to be flushed to reclaim space.  Instead
2450 * we require a sufficiently large amount of available space such that
2451 * each thread in the system could have passed this allocation check and
2452 * still have sufficient free space.  With 20% of a minimum journal size
2453 * of 1MB we have 6553 records available.
2454 */
2455int
2456softdep_prealloc(vp, waitok)
2457	struct vnode *vp;
2458	int waitok;
2459{
2460	struct ufsmount *ump;
2461
2462	if (DOINGSUJ(vp) == 0)
2463		return (0);
2464	ump = VFSTOUFS(vp->v_mount);
2465	ACQUIRE_LOCK(&lk);
2466	if (journal_space(ump, 0)) {
2467		FREE_LOCK(&lk);
2468		return (0);
2469	}
2470	stat_journal_low++;
2471	FREE_LOCK(&lk);
2472	if (waitok == MNT_NOWAIT)
2473		return (ENOSPC);
2474	/*
2475	 * Attempt to sync this vnode once to flush any journal
2476	 * work attached to it.
2477	 */
2478	ffs_syncvnode(vp, waitok);
2479	ACQUIRE_LOCK(&lk);
2480	process_removes(vp);
2481	if (journal_space(ump, 0) == 0) {
2482		softdep_speedup();
2483		if (journal_space(ump, 1) == 0)
2484			journal_suspend(ump);
2485	}
2486	FREE_LOCK(&lk);
2487
2488	return (0);
2489}
2490
2491/*
2492 * Before adjusting a link count on a vnode verify that we have sufficient
2493 * journal space.  If not, process operations that depend on the currently
2494 * locked pair of vnodes to try to flush space, since the syncer, buf daemon,
2495 * and softdep flush threads cannot acquire these locks to reclaim space.
2496 */
2497static void
2498softdep_prelink(dvp, vp)
2499	struct vnode *dvp;
2500	struct vnode *vp;
2501{
2502	struct ufsmount *ump;
2503
2504	ump = VFSTOUFS(dvp->v_mount);
2505	mtx_assert(&lk, MA_OWNED);
2506	if (journal_space(ump, 0))
2507		return;
2508	stat_journal_low++;
2509	FREE_LOCK(&lk);
2510	if (vp)
2511		ffs_syncvnode(vp, MNT_NOWAIT);
2512	ffs_syncvnode(dvp, MNT_WAIT);
2513	ACQUIRE_LOCK(&lk);
2514	/* Process vp before dvp as it may create .. removes. */
2515	if (vp)
2516		process_removes(vp);
2517	process_removes(dvp);
2518	softdep_speedup();
2519	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2520	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2521	if (journal_space(ump, 0) == 0) {
2522		softdep_speedup();
2523		if (journal_space(ump, 1) == 0)
2524			journal_suspend(ump);
2525	}
2526}
2527
2528static void
2529jseg_write(fs, jblocks, jseg, data)
2530	struct fs *fs;
2531	struct jblocks *jblocks;
2532	struct jseg *jseg;
2533	uint8_t *data;
2534{
2535	struct jsegrec *rec;
2536
2537	rec = (struct jsegrec *)data;
2538	rec->jsr_seq = jseg->js_seq;
2539	rec->jsr_oldest = jblocks->jb_oldestseq;
2540	rec->jsr_cnt = jseg->js_cnt;
2541	rec->jsr_blocks = jseg->js_size / DEV_BSIZE;
2542	rec->jsr_crc = 0;
2543	rec->jsr_time = fs->fs_mtime;
2544}
2545
2546static inline void
2547inoref_write(inoref, jseg, rec)
2548	struct inoref *inoref;
2549	struct jseg *jseg;
2550	struct jrefrec *rec;
2551{
2552
2553	inoref->if_jsegdep->jd_seg = jseg;
2554	rec->jr_ino = inoref->if_ino;
2555	rec->jr_parent = inoref->if_parent;
2556	rec->jr_nlink = inoref->if_nlink;
2557	rec->jr_mode = inoref->if_mode;
2558	rec->jr_diroff = inoref->if_diroff;
2559}
2560
2561static void
2562jaddref_write(jaddref, jseg, data)
2563	struct jaddref *jaddref;
2564	struct jseg *jseg;
2565	uint8_t *data;
2566{
2567	struct jrefrec *rec;
2568
2569	rec = (struct jrefrec *)data;
2570	rec->jr_op = JOP_ADDREF;
2571	inoref_write(&jaddref->ja_ref, jseg, rec);
2572}
2573
2574static void
2575jremref_write(jremref, jseg, data)
2576	struct jremref *jremref;
2577	struct jseg *jseg;
2578	uint8_t *data;
2579{
2580	struct jrefrec *rec;
2581
2582	rec = (struct jrefrec *)data;
2583	rec->jr_op = JOP_REMREF;
2584	inoref_write(&jremref->jr_ref, jseg, rec);
2585}
2586
2587static void
2588jmvref_write(jmvref, jseg, data)
2589	struct jmvref *jmvref;
2590	struct jseg *jseg;
2591	uint8_t *data;
2592{
2593	struct jmvrec *rec;
2594
2595	rec = (struct jmvrec *)data;
2596	rec->jm_op = JOP_MVREF;
2597	rec->jm_ino = jmvref->jm_ino;
2598	rec->jm_parent = jmvref->jm_parent;
2599	rec->jm_oldoff = jmvref->jm_oldoff;
2600	rec->jm_newoff = jmvref->jm_newoff;
2601}
2602
2603static void
2604jnewblk_write(jnewblk, jseg, data)
2605	struct jnewblk *jnewblk;
2606	struct jseg *jseg;
2607	uint8_t *data;
2608{
2609	struct jblkrec *rec;
2610
2611	jnewblk->jn_jsegdep->jd_seg = jseg;
2612	rec = (struct jblkrec *)data;
2613	rec->jb_op = JOP_NEWBLK;
2614	rec->jb_ino = jnewblk->jn_ino;
2615	rec->jb_blkno = jnewblk->jn_blkno;
2616	rec->jb_lbn = jnewblk->jn_lbn;
2617	rec->jb_frags = jnewblk->jn_frags;
2618	rec->jb_oldfrags = jnewblk->jn_oldfrags;
2619}
2620
2621static void
2622jfreeblk_write(jfreeblk, jseg, data)
2623	struct jfreeblk *jfreeblk;
2624	struct jseg *jseg;
2625	uint8_t *data;
2626{
2627	struct jblkrec *rec;
2628
2629	jfreeblk->jf_jsegdep->jd_seg = jseg;
2630	rec = (struct jblkrec *)data;
2631	rec->jb_op = JOP_FREEBLK;
2632	rec->jb_ino = jfreeblk->jf_ino;
2633	rec->jb_blkno = jfreeblk->jf_blkno;
2634	rec->jb_lbn = jfreeblk->jf_lbn;
2635	rec->jb_frags = jfreeblk->jf_frags;
2636	rec->jb_oldfrags = 0;
2637}
2638
2639static void
2640jfreefrag_write(jfreefrag, jseg, data)
2641	struct jfreefrag *jfreefrag;
2642	struct jseg *jseg;
2643	uint8_t *data;
2644{
2645	struct jblkrec *rec;
2646
2647	jfreefrag->fr_jsegdep->jd_seg = jseg;
2648	rec = (struct jblkrec *)data;
2649	rec->jb_op = JOP_FREEBLK;
2650	rec->jb_ino = jfreefrag->fr_ino;
2651	rec->jb_blkno = jfreefrag->fr_blkno;
2652	rec->jb_lbn = jfreefrag->fr_lbn;
2653	rec->jb_frags = jfreefrag->fr_frags;
2654	rec->jb_oldfrags = 0;
2655}
2656
2657static void
2658jtrunc_write(jtrunc, jseg, data)
2659	struct jtrunc *jtrunc;
2660	struct jseg *jseg;
2661	uint8_t *data;
2662{
2663	struct jtrncrec *rec;
2664
2665	rec = (struct jtrncrec *)data;
2666	rec->jt_op = JOP_TRUNC;
2667	rec->jt_ino = jtrunc->jt_ino;
2668	rec->jt_size = jtrunc->jt_size;
2669	rec->jt_extsize = jtrunc->jt_extsize;
2670}
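/*
 * The jseg_write() through jtrunc_write() helpers above each translate
 * one in-memory dependency into its fixed-size on-disk record; they do
 * no I/O themselves.  softdep_process_journal() below packs the records
 * into journal buffers and issues the writes.
 */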
2671
2672/*
2673 * Flush some journal records to disk.
2674 */
2675static void
2676softdep_process_journal(mp, flags)
2677	struct mount *mp;
2678	int flags;
2679{
2680	struct jblocks *jblocks;
2681	struct ufsmount *ump;
2682	struct worklist *wk;
2683	struct jseg *jseg;
2684	struct buf *bp;
2685	uint8_t *data;
2686	struct fs *fs;
2687	int segwritten;
2688	int jrecmin;	/* Minimum records per block. */
2689	int jrecmax;	/* Maximum records per block. */
2690	int size;
2691	int cnt;
2692	int off;
2693
2694	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
2695		return;
2696	ump = VFSTOUFS(mp);
2697	fs = ump->um_fs;
2698	jblocks = ump->softdep_jblocks;
2699	/*
2700	 * We write anywhere between a disk block and an fs block.  The upper
2701	 * bound is picked to prevent buffer cache fragmentation and limit
2702	 * processing time per I/O.
2703	 */
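	/*
	 * As a rough illustration, assuming 512-byte device blocks and
	 * 32-byte journal records: jrecmin below is 15 records per disk
	 * block after reserving one slot for the segment header, and a
	 * 16K fs block then holds jrecmax = 32 * 15 = 480 records.
	 */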
2704	jrecmin = (DEV_BSIZE / JREC_SIZE) - 1; /* -1 for seg header */
2705	jrecmax = (fs->fs_bsize / DEV_BSIZE) * jrecmin;
2706	segwritten = 0;
2707	while ((cnt = ump->softdep_on_journal) != 0) {
2708		/*
2709		 * Create a new segment to hold as many as 'cnt' journal
2710		 * entries and add them to the segment.  Notice cnt is
2711		 * off by one to account for the space required by the
2712		 * jsegrec.  If we don't have a full block to log skip it
2713		 * unless we haven't written anything.
2714		 */
2715		cnt++;
2716		if (cnt < jrecmax && segwritten)
2717			break;
2718		/*
2719		 * Verify some free journal space.  softdep_prealloc() should
2720		 * guarantee that we don't run out so this is indicative of
2721		 * a problem with the flow control.  Try to recover
2722		 * gracefully in any event.
2723		 */
2724		while (jblocks->jb_free == 0) {
2725			if (flags != MNT_WAIT)
2726				break;
2727			printf("softdep: Out of journal space!\n");
2728			softdep_speedup();
2729			msleep(jblocks, &lk, PRIBIO, "jblocks", 1);
2730		}
2731		FREE_LOCK(&lk);
2732		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
2733		workitem_alloc(&jseg->js_list, D_JSEG, mp);
2734		LIST_INIT(&jseg->js_entries);
2735		jseg->js_state = ATTACHED;
2736		jseg->js_jblocks = jblocks;
2737		bp = geteblk(fs->fs_bsize, 0);
2738		ACQUIRE_LOCK(&lk);
2739		/*
2740		 * If there was a race while we were allocating the block
2741		 * and jseg, the entry we care about was likely written.
2742		 * We bail out in both the WAIT and NOWAIT case and assume
2743		 * the caller will loop if the entry it cares about is
2744		 * not written.
2745		 */
2746		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
2747			bp->b_flags |= B_INVAL | B_NOCACHE;
2748			WORKITEM_FREE(jseg, D_JSEG);
2749			FREE_LOCK(&lk);
2750			brelse(bp);
2751			ACQUIRE_LOCK(&lk);
2752			break;
2753		}
2754		/*
2755		 * Calculate the disk block size required for the available
2756		 * records rounded to the min size.
2757		 */
2758		cnt = ump->softdep_on_journal;
2759		if (cnt < jrecmax)
2760			size = howmany(cnt, jrecmin) * DEV_BSIZE;
2761		else
2762			size = fs->fs_bsize;
2763		/*
2764		 * Allocate a disk block for this journal data and account
2765		 * for truncation of the requested size if enough contiguous
2766		 * space was not available.
2767		 */
2768		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
2769		bp->b_lblkno = bp->b_blkno;
2770		bp->b_offset = bp->b_blkno * DEV_BSIZE;
2771		bp->b_bcount = size;
2772		bp->b_bufobj = &ump->um_devvp->v_bufobj;
2773		bp->b_flags &= ~B_INVAL;
2774		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
2775		/*
2776		 * Initialize our jseg with cnt records.  Assign the next
2777		 * sequence number to it and link it in-order.
2778		 */
2779		cnt = MIN(ump->softdep_on_journal,
2780		    (size / DEV_BSIZE) * jrecmin);
2781		jseg->js_buf = bp;
2782		jseg->js_cnt = cnt;
2783		jseg->js_refs = cnt + 1;	/* Self ref. */
2784		jseg->js_size = size;
2785		jseg->js_seq = jblocks->jb_nextseq++;
2786		if (TAILQ_EMPTY(&jblocks->jb_segs))
2787			jblocks->jb_oldestseq = jseg->js_seq;
2788		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
2789		if (jblocks->jb_writeseg == NULL)
2790			jblocks->jb_writeseg = jseg;
2791		/*
2792		 * Start filling in records from the pending list.
2793		 */
2794		data = bp->b_data;
2795		off = 0;
2796		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
2797		    != NULL) {
2798			/* Place a segment header on every device block. */
2799			if ((off % DEV_BSIZE) == 0) {
2800				jseg_write(fs, jblocks, jseg, data);
2801				off += JREC_SIZE;
2802				data = bp->b_data + off;
2803			}
2804			remove_from_journal(wk);
2805			wk->wk_state |= IOSTARTED;
2806			WORKLIST_INSERT(&jseg->js_entries, wk);
2807			switch (wk->wk_type) {
2808			case D_JADDREF:
2809				jaddref_write(WK_JADDREF(wk), jseg, data);
2810				break;
2811			case D_JREMREF:
2812				jremref_write(WK_JREMREF(wk), jseg, data);
2813				break;
2814			case D_JMVREF:
2815				jmvref_write(WK_JMVREF(wk), jseg, data);
2816				break;
2817			case D_JNEWBLK:
2818				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
2819				break;
2820			case D_JFREEBLK:
2821				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
2822				break;
2823			case D_JFREEFRAG:
2824				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
2825				break;
2826			case D_JTRUNC:
2827				jtrunc_write(WK_JTRUNC(wk), jseg, data);
2828				break;
2829			default:
2830				panic("process_journal: Unknown type %s",
2831				    TYPENAME(wk->wk_type));
2832				/* NOTREACHED */
2833			}
2834			if (--cnt == 0)
2835				break;
2836			off += JREC_SIZE;
2837			data = bp->b_data + off;
2838		}
2839		/*
2840		 * Write this one buffer and continue.
2841		 */
2842		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
2843		FREE_LOCK(&lk);
2844		BO_LOCK(bp->b_bufobj);
2845		bgetvp(ump->um_devvp, bp);
2846		BO_UNLOCK(bp->b_bufobj);
2847		if (flags == MNT_NOWAIT)
2848			bawrite(bp);
2849		else
2850			bwrite(bp);
2851		ACQUIRE_LOCK(&lk);
2852	}
2853	/*
2854	 * If we've suspended the filesystem because we ran out of journal
2855	 * space, either try to sync it here to make some progress or
2856	 * unsuspend it if we already have.
2857	 */
2858	if (flags == 0 && jblocks && jblocks->jb_suspended) {
2859		if (journal_space(ump, jblocks->jb_min)) {
2860			FREE_LOCK(&lk);
2861			jblocks->jb_suspended = 0;
2862			mp->mnt_susp_owner = curthread;
2863			vfs_write_resume(mp);
2864			ACQUIRE_LOCK(&lk);
2865			return;
2866		}
2867		FREE_LOCK(&lk);
2868		VFS_SYNC(mp, MNT_NOWAIT);
2869		ffs_sbupdate(ump, MNT_WAIT, 0);
2870		ACQUIRE_LOCK(&lk);
2871	}
2872}
2873
2874/*
2875 * Complete a jseg, allowing all dependencies awaiting journal writes
2876 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
2877 * structures so that the journal segment can be freed to reclaim space.
2878 */
2879static void
2880complete_jseg(jseg)
2881	struct jseg *jseg;
2882{
2883	struct worklist *wk;
2884	struct jmvref *jmvref;
2885	int waiting;
2886	int i;
2887
2888	i = 0;
2889	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
2890		WORKLIST_REMOVE(wk);
2891		waiting = wk->wk_state & IOWAITING;
2892		wk->wk_state &= ~(IOSTARTED | IOWAITING);
2893		wk->wk_state |= COMPLETE;
2894		KASSERT(i < jseg->js_cnt,
2895		    ("handle_written_jseg: overflow %d >= %d",
2896		    i, jseg->js_cnt));
2897		switch (wk->wk_type) {
2898		case D_JADDREF:
2899			handle_written_jaddref(WK_JADDREF(wk));
2900			break;
2901		case D_JREMREF:
2902			handle_written_jremref(WK_JREMREF(wk));
2903			break;
2904		case D_JMVREF:
2905			/* No jsegdep here. */
2906			free_jseg(jseg);
2907			jmvref = WK_JMVREF(wk);
2908			LIST_REMOVE(jmvref, jm_deps);
2909			free_pagedep(jmvref->jm_pagedep);
2910			WORKITEM_FREE(jmvref, D_JMVREF);
2911			break;
2912		case D_JNEWBLK:
2913			handle_written_jnewblk(WK_JNEWBLK(wk));
2914			break;
2915		case D_JFREEBLK:
2916			handle_written_jfreeblk(WK_JFREEBLK(wk));
2917			break;
2918		case D_JFREEFRAG:
2919			handle_written_jfreefrag(WK_JFREEFRAG(wk));
2920			break;
2921		case D_JTRUNC:
2922			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
2923			WORKITEM_FREE(wk, D_JTRUNC);
2924			break;
2925		default:
2926			panic("handle_written_jseg: Unknown type %s",
2927			    TYPENAME(wk->wk_type));
2928			/* NOTREACHED */
2929		}
2930		if (waiting)
2931			wakeup(wk);
2932	}
2933	/* Release the self reference so the structure may be freed. */
2934	free_jseg(jseg);
2935}
2936
2937/*
2938 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
2939 * completions in order only.
2940 */
2941static void
2942handle_written_jseg(jseg, bp)
2943	struct jseg *jseg;
2944	struct buf *bp;
2945{
2946	struct jblocks *jblocks;
2947	struct jseg *jsegn;
2948
2949	if (jseg->js_refs == 0)
2950		panic("handle_written_jseg: No self-reference on %p", jseg);
2951	jseg->js_state |= DEPCOMPLETE;
2952	/*
2953	 * We'll never need this buffer again, set flags so it will be
2954	 * discarded.
2955	 */
2956	bp->b_flags |= B_INVAL | B_NOCACHE;
2957	jblocks = jseg->js_jblocks;
2958	/*
2959	 * Don't allow out of order completions.  If this isn't the first
2960	 * block wait for it to write before we're done.
2961	 */
2962	if (jseg != jblocks->jb_writeseg)
2963		return;
2964	/* Iterate through available jsegs processing their entries. */
2965	do {
2966		jsegn = TAILQ_NEXT(jseg, js_next);
2967		complete_jseg(jseg);
2968		jseg = jsegn;
2969	} while (jseg && jseg->js_state & DEPCOMPLETE);
2970	jblocks->jb_writeseg = jseg;
2971}
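/*
 * Together the two routines above enforce in-order completion:
 * jb_writeseg always points at the oldest segment whose write has not
 * yet finished, and a completion for any later segment only marks it
 * DEPCOMPLETE until the older writes catch up.
 */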
2972
2973static inline struct jsegdep *
2974inoref_jseg(inoref)
2975	struct inoref *inoref;
2976{
2977	struct jsegdep *jsegdep;
2978
2979	jsegdep = inoref->if_jsegdep;
2980	inoref->if_jsegdep = NULL;
2981
2982	return (jsegdep);
2983}
2984
2985/*
2986 * Called once a jremref has made it to stable store.  The jremref is marked
2987 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
2988 * for the jremref to complete will be awoken by free_jremref.
2989 */
2990static void
2991handle_written_jremref(jremref)
2992	struct jremref *jremref;
2993{
2994	struct inodedep *inodedep;
2995	struct jsegdep *jsegdep;
2996	struct dirrem *dirrem;
2997
2998	/* Grab the jsegdep. */
2999	jsegdep = inoref_jseg(&jremref->jr_ref);
3000	/*
3001	 * Remove us from the inoref list.
3002	 */
3003	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3004	    0, &inodedep) == 0)
3005		panic("handle_written_jremref: Lost inodedep");
3006	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3007	/*
3008	 * Complete the dirrem.
3009	 */
3010	dirrem = jremref->jr_dirrem;
3011	jremref->jr_dirrem = NULL;
3012	LIST_REMOVE(jremref, jr_deps);
3013	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3014	WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
3015	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3016	    (dirrem->dm_state & COMPLETE) != 0)
3017		add_to_worklist(&dirrem->dm_list, 0);
3018	free_jremref(jremref);
3019}
3020
3021/*
3022 * Called once a jaddref has made it to stable store.  The dependency is
3023 * marked complete and any dependent structures are added to the inode
3024 * bufwait list to be completed as soon as it is written.  If a bitmap write
3025 * depends on this entry we move the inode into the inodedephd of the
3026 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3027 */
3028static void
3029handle_written_jaddref(jaddref)
3030	struct jaddref *jaddref;
3031{
3032	struct jsegdep *jsegdep;
3033	struct inodedep *inodedep;
3034	struct diradd *diradd;
3035	struct mkdir *mkdir;
3036
3037	/* Grab the jsegdep. */
3038	jsegdep = inoref_jseg(&jaddref->ja_ref);
3039	mkdir = NULL;
3040	diradd = NULL;
3041	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3042	    0, &inodedep) == 0)
3043		panic("handle_written_jaddref: Lost inodedep.");
3044	if (jaddref->ja_diradd == NULL)
3045		panic("handle_written_jaddref: No dependency");
3046	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3047		diradd = jaddref->ja_diradd;
3048		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3049	} else if (jaddref->ja_state & MKDIR_PARENT) {
3050		mkdir = jaddref->ja_mkdir;
3051		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3052	} else if (jaddref->ja_state & MKDIR_BODY)
3053		mkdir = jaddref->ja_mkdir;
3054	else
3055		panic("handle_written_jaddref: Unknown dependency %p",
3056		    jaddref->ja_diradd);
3057	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3058	/*
3059	 * Remove us from the inode list.
3060	 */
3061	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3062	/*
3063	 * The mkdir may be waiting on the jaddref to clear before freeing.
3064	 */
3065	if (mkdir) {
3066		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3067		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3068		    TYPENAME(mkdir->md_list.wk_type)));
3069		mkdir->md_jaddref = NULL;
3070		diradd = mkdir->md_diradd;
3071		mkdir->md_state |= DEPCOMPLETE;
3072		complete_mkdir(mkdir);
3073	}
3074	WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
3075	if (jaddref->ja_state & NEWBLOCK) {
3076		inodedep->id_state |= ONDEPLIST;
3077		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3078		    inodedep, id_deps);
3079	}
3080	free_jaddref(jaddref);
3081}
3082
3083/*
3084 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3085 * is placed in the bmsafemap to await notification of a written bitmap.
3086 */
3087static void
3088handle_written_jnewblk(jnewblk)
3089	struct jnewblk *jnewblk;
3090{
3091	struct bmsafemap *bmsafemap;
3092	struct jsegdep *jsegdep;
3093	struct newblk *newblk;
3094
3095	/* Grab the jsegdep. */
3096	jsegdep = jnewblk->jn_jsegdep;
3097	jnewblk->jn_jsegdep = NULL;
3098	/*
3099	 * Add the written block to the bmsafemap so it can be notified when
3100	 * the bitmap is on disk.
3101	 */
3102	newblk = jnewblk->jn_newblk;
3103	jnewblk->jn_newblk = NULL;
3104	if (newblk == NULL)
3105		panic("handle_written_jnewblk: No dependency for the segdep.");
3106
3107	newblk->nb_jnewblk = NULL;
3108	bmsafemap = newblk->nb_bmsafemap;
3109	WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
3110	newblk->nb_state |= ONDEPLIST;
3111	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
3112	free_jnewblk(jnewblk);
3113}
3114
3115/*
3116 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3117 * an in-flight allocation that has not yet been committed.  Divorce us
3118 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3119 * to the worklist.
3120 */
3121static void
3122cancel_jfreefrag(jfreefrag)
3123	struct jfreefrag *jfreefrag;
3124{
3125	struct freefrag *freefrag;
3126
3127	if (jfreefrag->fr_jsegdep) {
3128		free_jsegdep(jfreefrag->fr_jsegdep);
3129		jfreefrag->fr_jsegdep = NULL;
3130	}
3131	freefrag = jfreefrag->fr_freefrag;
3132	jfreefrag->fr_freefrag = NULL;
3133	freefrag->ff_jfreefrag = NULL;
3134	free_jfreefrag(jfreefrag);
3135	freefrag->ff_state |= DEPCOMPLETE;
3136}
3137
3138/*
3139 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3140 */
3141static void
3142free_jfreefrag(jfreefrag)
3143	struct jfreefrag *jfreefrag;
3144{
3145
3146	if (jfreefrag->fr_state & IOSTARTED)
3147		WORKLIST_REMOVE(&jfreefrag->fr_list);
3148	else if (jfreefrag->fr_state & ONWORKLIST)
3149		remove_from_journal(&jfreefrag->fr_list);
3150	if (jfreefrag->fr_freefrag != NULL)
3151		panic("free_jfreefrag:  Still attached to a freefrag.");
3152	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3153}
3154
3155/*
3156 * Called when the journal write for a jfreefrag completes.  The parent
3157 * freefrag is added to the worklist if this completes its dependencies.
3158 */
3159static void
3160handle_written_jfreefrag(jfreefrag)
3161	struct jfreefrag *jfreefrag;
3162{
3163	struct jsegdep *jsegdep;
3164	struct freefrag *freefrag;
3165
3166	/* Grab the jsegdep. */
3167	jsegdep = jfreefrag->fr_jsegdep;
3168	jfreefrag->fr_jsegdep = NULL;
3169	freefrag = jfreefrag->fr_freefrag;
3170	if (freefrag == NULL)
3171		panic("handle_written_jfreefrag: No freefrag.");
3172	freefrag->ff_state |= DEPCOMPLETE;
3173	freefrag->ff_jfreefrag = NULL;
3174	WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
3175	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3176		add_to_worklist(&freefrag->ff_list, 0);
3177	jfreefrag->fr_freefrag = NULL;
3178	free_jfreefrag(jfreefrag);
3179}
3180
3181/*
3182 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3183 * is removed from the freeblks list of pending journal writes and the
3184 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3185 * have been reclaimed.
3186 */
3187static void
3188handle_written_jfreeblk(jfreeblk)
3189	struct jfreeblk *jfreeblk;
3190{
3191	struct freeblks *freeblks;
3192	struct jsegdep *jsegdep;
3193
3194	/* Grab the jsegdep. */
3195	jsegdep = jfreeblk->jf_jsegdep;
3196	jfreeblk->jf_jsegdep = NULL;
3197	freeblks = jfreeblk->jf_freeblks;
3198	LIST_REMOVE(jfreeblk, jf_deps);
3199	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
3200	/*
3201	 * If the freeblks is all journaled, we can add it to the worklist.
3202	 */
3203	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
3204	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
3205		/* Remove from the b_dep that is waiting on this write. */
3206		if (freeblks->fb_state & ONWORKLIST)
3207			WORKLIST_REMOVE(&freeblks->fb_list);
3208		add_to_worklist(&freeblks->fb_list, 1);
3209	}
3210
3211	free_jfreeblk(jfreeblk);
3212}
3213
3214static struct jsegdep *
3215newjsegdep(struct worklist *wk)
3216{
3217	struct jsegdep *jsegdep;
3218
3219	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3220	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3221	jsegdep->jd_seg = NULL;
3222
3223	return (jsegdep);
3224}
3225
3226static struct jmvref *
3227newjmvref(dp, ino, oldoff, newoff)
3228	struct inode *dp;
3229	ino_t ino;
3230	off_t oldoff;
3231	off_t newoff;
3232{
3233	struct jmvref *jmvref;
3234
3235	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3236	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3237	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3238	jmvref->jm_parent = dp->i_number;
3239	jmvref->jm_ino = ino;
3240	jmvref->jm_oldoff = oldoff;
3241	jmvref->jm_newoff = newoff;
3242
3243	return (jmvref);
3244}
3245
3246/*
3247 * Allocate a new jremref that tracks the removal of ip from dp with the
3248 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3249 * DEPCOMPLETE as we have all the information required for the journal write
3250 * and the directory entry has already been removed from the buffer.  The caller
3251 * is responsible for linking the jremref into the pagedep and adding it
3252 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3253 * a DOTDOT addition so handle_workitem_remove() can properly assign
3254 * the jsegdep when we're done.
3255 */
3256static struct jremref *
3257newjremref(dirrem, dp, ip, diroff, nlink)
3258	struct dirrem *dirrem;
3259	struct inode *dp;
3260	struct inode *ip;
3261	off_t diroff;
3262	nlink_t nlink;
3263{
3264	struct jremref *jremref;
3265
3266	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3267	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3268	jremref->jr_state = ATTACHED;
3269	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3270	   nlink, ip->i_mode);
3271	jremref->jr_dirrem = dirrem;
3272
3273	return (jremref);
3274}
3275
3276static inline void
3277newinoref(inoref, ino, parent, diroff, nlink, mode)
3278	struct inoref *inoref;
3279	ino_t ino;
3280	ino_t parent;
3281	off_t diroff;
3282	nlink_t nlink;
3283	uint16_t mode;
3284{
3285
3286	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3287	inoref->if_diroff = diroff;
3288	inoref->if_ino = ino;
3289	inoref->if_parent = parent;
3290	inoref->if_nlink = nlink;
3291	inoref->if_mode = mode;
3292}
3293
3294/*
3295 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3296 * directory offset may not be known until later.  The caller is responsible
3297 * adding the entry to the journal when this information is available.  nlink
3298 * should be the link count prior to the addition and mode is only required
3299 * to have the correct FMT.
3300 */
3301static struct jaddref *
3302newjaddref(dp, ino, diroff, nlink, mode)
3303	struct inode *dp;
3304	ino_t ino;
3305	off_t diroff;
3306	int16_t nlink;
3307	uint16_t mode;
3308{
3309	struct jaddref *jaddref;
3310
3311	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3312	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3313	jaddref->ja_state = ATTACHED;
3314	jaddref->ja_mkdir = NULL;
3315	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3316
3317	return (jaddref);
3318}
3319
3320/*
3321 * Create a new free dependency for a freework.  The caller is responsible
3322 * for adjusting the reference count when it has the lock held.  The freedep
3323 * will track an outstanding bitmap write that will ultimately clear the
3324 * freework to continue.
3325 */
3326static struct freedep *
3327newfreedep(struct freework *freework)
3328{
3329	struct freedep *freedep;
3330
3331	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3332	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3333	freedep->fd_freework = freework;
3334
3335	return (freedep);
3336}
3337
3338/*
3339 * Free a freedep structure once the buffer it is linked to is written.  If
3340 * this is the last reference to the freework schedule it for completion.
3341 */
3342static void
3343free_freedep(freedep)
3344	struct freedep *freedep;
3345{
3346
3347	if (--freedep->fd_freework->fw_ref == 0)
3348		add_to_worklist(&freedep->fd_freework->fw_list, 1);
3349	WORKITEM_FREE(freedep, D_FREEDEP);
3350}
3351
3352/*
3353 * Allocate a new freework structure that may be a level in an indirect
3354 * when parent is not NULL or a top level block when it is.  The top level
3355 * freework structures are allocated without lk held and before the freeblks
3356 * is visible outside of softdep_setup_freeblocks().
3357 */
3358static struct freework *
3359newfreework(freeblks, parent, lbn, nb, frags, journal)
3360	struct freeblks *freeblks;
3361	struct freework *parent;
3362	ufs_lbn_t lbn;
3363	ufs2_daddr_t nb;
3364	int frags;
3365	int journal;
3366{
3367	struct freework *freework;
3368
3369	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3370	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3371	freework->fw_freeblks = freeblks;
3372	freework->fw_parent = parent;
3373	freework->fw_lbn = lbn;
3374	freework->fw_blkno = nb;
3375	freework->fw_frags = frags;
3376	freework->fw_ref = 0;
3377	freework->fw_off = 0;
3378	LIST_INIT(&freework->fw_jwork);
3379
3380	if (parent == NULL) {
3381		WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
3382		    &freework->fw_list);
3383		freeblks->fb_ref++;
3384	}
3385	if (journal)
3386		newjfreeblk(freeblks, lbn, nb, frags);
3387
3388	return (freework);
3389}
3390
3391/*
3392 * Allocate a new jfreeblk to journal a top level block pointer when truncating
3393 * a file.  The caller must add this to the worklist when lk is held.
3394 */
3395static struct jfreeblk *
3396newjfreeblk(freeblks, lbn, blkno, frags)
3397	struct freeblks *freeblks;
3398	ufs_lbn_t lbn;
3399	ufs2_daddr_t blkno;
3400	int frags;
3401{
3402	struct jfreeblk *jfreeblk;
3403
3404	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
3405	workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
3406	jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
3407	jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
3408	jfreeblk->jf_ino = freeblks->fb_previousinum;
3409	jfreeblk->jf_lbn = lbn;
3410	jfreeblk->jf_blkno = blkno;
3411	jfreeblk->jf_frags = frags;
3412	jfreeblk->jf_freeblks = freeblks;
3413	LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);
3414
3415	return (jfreeblk);
3416}
3417
3418static void move_newblock_dep(struct jaddref *, struct inodedep *);
3419/*
3420 * If we're canceling a new bitmap we have to search for another ref
3421 * to move into the bmsafemap dep.  This might be better expressed
3422 * with another structure.
3423 */
3424static void
3425move_newblock_dep(jaddref, inodedep)
3426	struct jaddref *jaddref;
3427	struct inodedep *inodedep;
3428{
3429	struct inoref *inoref;
3430	struct jaddref *jaddrefn;
3431
3432	jaddrefn = NULL;
3433	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3434	    inoref = TAILQ_NEXT(inoref, if_deps)) {
3435		if ((jaddref->ja_state & NEWBLOCK) &&
3436		    inoref->if_list.wk_type == D_JADDREF) {
3437			jaddrefn = (struct jaddref *)inoref;
3438			break;
3439		}
3440	}
3441	if (jaddrefn == NULL)
3442		return;
3443	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
3444	jaddrefn->ja_state |= jaddref->ja_state &
3445	    (ATTACHED | UNDONE | NEWBLOCK);
3446	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
3447	jaddref->ja_state |= ATTACHED;
3448	LIST_REMOVE(jaddref, ja_bmdeps);
3449	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
3450	    ja_bmdeps);
3451}
3452
3453/*
3454 * Cancel a jaddref either before it has been written or while it is being
3455 * written.  This happens when a link is removed before the add reaches
3456 * the disk.  The jaddref dependency is kept linked into the bmsafemap
3457 * and inode to prevent the link count or bitmap from reaching the disk
3458 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
3459 * required.
3460 *
3461 * Returns 1 if the canceled addref requires journaling of the remove and
3462 * 0 otherwise.
3463 */
3464static int
3465cancel_jaddref(jaddref, inodedep, wkhd)
3466	struct jaddref *jaddref;
3467	struct inodedep *inodedep;
3468	struct workhead *wkhd;
3469{
3470	struct inoref *inoref;
3471	struct jsegdep *jsegdep;
3472	int needsj;
3473
3474	KASSERT((jaddref->ja_state & COMPLETE) == 0,
3475	    ("cancel_jaddref: Canceling complete jaddref"));
3476	if (jaddref->ja_state & (IOSTARTED | COMPLETE))
3477		needsj = 1;
3478	else
3479		needsj = 0;
3480	if (inodedep == NULL)
3481		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3482		    0, &inodedep) == 0)
3483			panic("cancel_jaddref: Lost inodedep");
3484	/*
3485	 * We must adjust the nlink of any reference operation that follows
3486	 * us so that it is consistent with the in-memory reference.  This
3487	 * ensures that inode nlink rollbacks always have the correct link.
3488	 */
3489	if (needsj == 0)
3490		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3491		    inoref = TAILQ_NEXT(inoref, if_deps))
3492			inoref->if_nlink--;
3493	jsegdep = inoref_jseg(&jaddref->ja_ref);
3494	if (jaddref->ja_state & NEWBLOCK)
3495		move_newblock_dep(jaddref, inodedep);
3496	if (jaddref->ja_state & IOWAITING) {
3497		jaddref->ja_state &= ~IOWAITING;
3498		wakeup(&jaddref->ja_list);
3499	}
3500	jaddref->ja_mkdir = NULL;
3501	if (jaddref->ja_state & IOSTARTED) {
3502		jaddref->ja_state &= ~IOSTARTED;
3503		WORKLIST_REMOVE(&jaddref->ja_list);
3504		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
3505	} else {
3506		free_jsegdep(jsegdep);
3507		if (jaddref->ja_state & DEPCOMPLETE)
3508			remove_from_journal(&jaddref->ja_list);
3509	}
3510	/*
3511	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
3512	 * can arrange for them to be freed with the bitmap.  Otherwise we
3513	 * no longer need this addref attached to the inoreflst and it
3514	 * will incorrectly adjust nlink if we leave it.
3515	 */
3516	if ((jaddref->ja_state & NEWBLOCK) == 0) {
3517		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
3518		    if_deps);
3519		jaddref->ja_state |= COMPLETE;
3520		free_jaddref(jaddref);
3521		return (needsj);
3522	}
3523	jaddref->ja_state |= GOINGAWAY;
3524	/*
3525	 * Leave the head of the list for jsegdeps for fast merging.
3526	 */
3527	if (LIST_FIRST(wkhd) != NULL) {
3528		jaddref->ja_state |= ONWORKLIST;
3529		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
3530	} else
3531		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
3532
3533	return (needsj);
3534}
3535
3536/*
3537 * Attempt to free a jaddref structure when some work completes.  This
3538 * should only succeed once the entry is written and all dependencies have
3539 * been notified.
3540 */
3541static void
3542free_jaddref(jaddref)
3543	struct jaddref *jaddref;
3544{
3545
3546	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
3547		return;
3548	if (jaddref->ja_ref.if_jsegdep)
3549		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
3550		    jaddref, jaddref->ja_state);
3551	if (jaddref->ja_state & NEWBLOCK)
3552		LIST_REMOVE(jaddref, ja_bmdeps);
3553	if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
3554		panic("free_jaddref: Bad state %p(0x%X)",
3555		    jaddref, jaddref->ja_state);
3556	if (jaddref->ja_mkdir != NULL)
3557		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
3558	WORKITEM_FREE(jaddref, D_JADDREF);
3559}
3560
3561/*
3562 * Free a jremref structure once it has been written or discarded.
3563 */
3564static void
3565free_jremref(jremref)
3566	struct jremref *jremref;
3567{
3568
3569	if (jremref->jr_ref.if_jsegdep)
3570		free_jsegdep(jremref->jr_ref.if_jsegdep);
3571	if (jremref->jr_state & IOSTARTED)
3572		panic("free_jremref: IO still pending");
3573	WORKITEM_FREE(jremref, D_JREMREF);
3574}
3575
3576/*
3577 * Free a jnewblk structure.
3578 */
3579static void
3580free_jnewblk(jnewblk)
3581	struct jnewblk *jnewblk;
3582{
3583
3584	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
3585		return;
3586	LIST_REMOVE(jnewblk, jn_deps);
3587	if (jnewblk->jn_newblk != NULL)
3588		panic("free_jnewblk: Dependency still attached.");
3589	WORKITEM_FREE(jnewblk, D_JNEWBLK);
3590}
3591
3592/*
3593 * Cancel a jnewblk which has been superseded by a freeblk.  The jnewblk
3594 * is kept linked into the bmsafemap until the free completes, thus
3595 * preventing the modified state from ever reaching disk.  The free
3596 * routine must pass this structure via ffs_blkfree() to
3597 * softdep_setup_freeblks() so there is no race in releasing the space.
3598 */
3599static void
3600cancel_jnewblk(jnewblk, wkhd)
3601	struct jnewblk *jnewblk;
3602	struct workhead *wkhd;
3603{
3604	struct jsegdep *jsegdep;
3605
3606	jsegdep = jnewblk->jn_jsegdep;
3607	jnewblk->jn_jsegdep = NULL;
3608	free_jsegdep(jsegdep);
3609	jnewblk->jn_newblk = NULL;
3610	jnewblk->jn_state |= GOINGAWAY;
3611	if (jnewblk->jn_state & IOSTARTED) {
3612		jnewblk->jn_state &= ~IOSTARTED;
3613		WORKLIST_REMOVE(&jnewblk->jn_list);
3614	} else
3615		remove_from_journal(&jnewblk->jn_list);
3616	/*
3617	 * Leave the head of the list for jsegdeps for fast merging.
3618	 */
3619	if (LIST_FIRST(wkhd) != NULL) {
3620		jnewblk->jn_state |= ONWORKLIST;
3621		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
3622	} else
3623		WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
3624	if (jnewblk->jn_state & IOWAITING) {
3625		jnewblk->jn_state &= ~IOWAITING;
3626		wakeup(&jnewblk->jn_list);
3627	}
3628}
3629
3630static void
3631free_jfreeblk(jfreeblk)
3632	struct jfreeblk *jfreeblk;
3633{
3634
3635	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
3636}
3637
3638/*
3639 * Release one reference to a jseg and free it if the count reaches 0.  This
3640 * should eventually reclaim journal space as well.
3641 */
3642static void
3643free_jseg(jseg)
3644	struct jseg *jseg;
3645{
3646	struct jblocks *jblocks;
3647
3648	KASSERT(jseg->js_refs > 0,
3649	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
3650	if (--jseg->js_refs != 0)
3651		return;
3652	/*
3653	 * Free only those jsegs with no allocated jsegs before them, to
3654	 * preserve the journal space ordering.
3655	 */
3656	jblocks = jseg->js_jblocks;
3657	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
3658		jblocks->jb_oldestseq = jseg->js_seq;
3659		if (jseg->js_refs != 0)
3660			break;
3661		TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
3662		jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
3663		KASSERT(LIST_EMPTY(&jseg->js_entries),
3664		    ("free_jseg: Freed jseg has valid entries."));
3665		WORKITEM_FREE(jseg, D_JSEG);
3666	}
3667}
3668
3669/*
3670 * Release a jsegdep and decrement the jseg count.
3671 */
3672static void
3673free_jsegdep(jsegdep)
3674	struct jsegdep *jsegdep;
3675{
3676
3677	if (jsegdep->jd_seg)
3678		free_jseg(jsegdep->jd_seg);
3679	WORKITEM_FREE(jsegdep, D_JSEGDEP);
3680}
3681
3682/*
3683 * Wait for a journal item to make it to disk.  Initiate journal processing
3684 * if required.
3685 */
3686static void
3687jwait(wk)
3688	struct worklist *wk;
3689{
3690
3691	stat_journal_wait++;
3692	/*
3693	 * If IO has not started we process the journal.  We can't mark the
3694	 * worklist item as IOWAITING because we drop the lock while
3695	 * processing the journal and the worklist entry may be freed after
3696	 * this point.  The caller may call back in and re-issue the request.
3697	 */
3698	if ((wk->wk_state & IOSTARTED) == 0) {
3699		softdep_process_journal(wk->wk_mp, MNT_WAIT);
3700		return;
3701	}
3702	wk->wk_state |= IOWAITING;
3703	msleep(wk, &lk, PRIBIO, "jwait", 0);
3704}
3705
3706/*
3707 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
3708 * appropriate.  This is a convenience function to reduce duplicate code
3709 * for the setup and revert functions below.
3710 */
3711static struct inodedep *
3712inodedep_lookup_ip(ip)
3713	struct inode *ip;
3714{
3715	struct inodedep *inodedep;
3716
3717	KASSERT(ip->i_nlink >= ip->i_effnlink,
3718	    ("inodedep_lookup_ip: bad delta"));
3719	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
3720	    DEPALLOC, &inodedep);
3721	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3722
3723	return (inodedep);
3724}
3725
3726/*
3727 * Create a journal entry that describes a truncate that we're about to
3728 * perform.  The inode allocations and frees between here and the completion
3729 * of the operation are done asynchronously and without journaling.  At
3730 * the end of the operation the vnode is sync'd and the journal space
3731 * is released.  Recovery will discover the partially completed truncate
3732 * and complete it.
3733 */
3734void *
3735softdep_setup_trunc(vp, length, flags)
3736	struct vnode *vp;
3737	off_t length;
3738	int flags;
3739{
3740	struct jsegdep *jsegdep;
3741	struct jtrunc *jtrunc;
3742	struct ufsmount *ump;
3743	struct inode *ip;
3744
3745	softdep_prealloc(vp, MNT_WAIT);
3746	ip = VTOI(vp);
3747	ump = VFSTOUFS(vp->v_mount);
3748	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
3749	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
3750	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
3751	jtrunc->jt_ino = ip->i_number;
3752	jtrunc->jt_extsize = 0;
3753	jtrunc->jt_size = length;
3754	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
3755		jtrunc->jt_extsize = ip->i_din2->di_extsize;
3756	if ((flags & IO_NORMAL) == 0)
3757		jtrunc->jt_size = DIP(ip, i_size);
3758	ACQUIRE_LOCK(&lk);
3759	add_to_journal(&jtrunc->jt_list);
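	/*
	 * Wait until the record has been committed to a journal segment.
	 * jwait() may drop the softdep lock without marking the request,
	 * so loop until a segment has actually been assigned.
	 */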
3760	while (jsegdep->jd_seg == NULL) {
3761		stat_jwait_freeblks++;
3762		jwait(&jtrunc->jt_list);
3763	}
3764	FREE_LOCK(&lk);
3765
3766	return (jsegdep);
3767}
3768
3769/*
3770 * After synchronous truncation is complete we fsync the vnode and
3771 * release the jsegdep so the journal space can be freed.
3772 */
3773int
3774softdep_complete_trunc(vp, cookie)
3775	struct vnode *vp;
3776	void *cookie;
3777{
3778	int error;
3779
3780	error = ffs_syncvnode(vp, MNT_WAIT);
3781	ACQUIRE_LOCK(&lk);
3782	free_jsegdep((struct jsegdep *)cookie);
3783	FREE_LOCK(&lk);
3784
3785	return (error);
3786}
3787
3788/*
3789 * Called prior to creating a new inode and linking it to a directory.  The
3790 * jaddref structure must already be allocated by softdep_setup_inomapdep
3791 * and it is discovered here so we can initialize the mode and update
3792 * nlinkdelta.
3793 */
3794void
3795softdep_setup_create(dp, ip)
3796	struct inode *dp;
3797	struct inode *ip;
3798{
3799	struct inodedep *inodedep;
3800	struct jaddref *jaddref;
3801	struct vnode *dvp;
3802
3803	KASSERT(ip->i_nlink == 1,
3804	    ("softdep_setup_create: Invalid link count."));
3805	dvp = ITOV(dp);
3806	ACQUIRE_LOCK(&lk);
3807	inodedep = inodedep_lookup_ip(ip);
3808	if (DOINGSUJ(dvp)) {
3809		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3810		    inoreflst);
3811		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
3812		    ("softdep_setup_create: No addref structure present."));
3813		jaddref->ja_mode = ip->i_mode;
3814	}
3815	softdep_prelink(dvp, NULL);
3816	FREE_LOCK(&lk);
3817}
3818
3819/*
3820 * Create a jaddref structure to track the addition of a DOTDOT link when
3821 * we are reparenting an inode as part of a rename.  This jaddref will be
3822 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
3823 * non-journaling softdep.
3824 */
3825void
3826softdep_setup_dotdot_link(dp, ip)
3827	struct inode *dp;
3828	struct inode *ip;
3829{
3830	struct inodedep *inodedep;
3831	struct jaddref *jaddref;
3832	struct vnode *dvp;
3833	struct vnode *vp;
3834
3835	dvp = ITOV(dp);
3836	vp = ITOV(ip);
3837	jaddref = NULL;
3838	/*
3839	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
3840	 * is used as a normal link would be.
3841	 */
3842	if (DOINGSUJ(dvp))
3843		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
3844		    dp->i_effnlink - 1, dp->i_mode);
3845	ACQUIRE_LOCK(&lk);
3846	inodedep = inodedep_lookup_ip(dp);
3847	if (jaddref)
3848		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
3849		    if_deps);
3850	softdep_prelink(dvp, ITOV(ip));
3851	FREE_LOCK(&lk);
3852}
3853
3854/*
3855 * Create a jaddref structure to track a new link to an inode.  The directory
3856 * offset is not known until softdep_setup_directory_add or
3857 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
3858 * softdep.
3859 */
3860void
3861softdep_setup_link(dp, ip)
3862	struct inode *dp;
3863	struct inode *ip;
3864{
3865	struct inodedep *inodedep;
3866	struct jaddref *jaddref;
3867	struct vnode *dvp;
3868
3869	dvp = ITOV(dp);
3870	jaddref = NULL;
3871	if (DOINGSUJ(dvp))
3872		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
3873		    ip->i_mode);
3874	ACQUIRE_LOCK(&lk);
3875	inodedep = inodedep_lookup_ip(ip);
3876	if (jaddref)
3877		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
3878		    if_deps);
3879	softdep_prelink(dvp, ITOV(ip));
3880	FREE_LOCK(&lk);
3881}
3882
3883/*
3884 * Called to create the jaddref structures to track . and .. references as
3885 * well as lookup and further initialize the incomplete jaddref created
3886 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
3887 * nlinkdelta for non-journaling softdep.
3888 */
3889void
3890softdep_setup_mkdir(dp, ip)
3891	struct inode *dp;
3892	struct inode *ip;
3893{
3894	struct inodedep *inodedep;
3895	struct jaddref *dotdotaddref;
3896	struct jaddref *dotaddref;
3897	struct jaddref *jaddref;
3898	struct vnode *dvp;
3899
3900	dvp = ITOV(dp);
3901	dotaddref = dotdotaddref = NULL;
3902	if (DOINGSUJ(dvp)) {
3903		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
3904		    ip->i_mode);
3905		dotaddref->ja_state |= MKDIR_BODY;
3906		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
3907		    dp->i_effnlink - 1, dp->i_mode);
3908		dotdotaddref->ja_state |= MKDIR_PARENT;
3909	}
3910	ACQUIRE_LOCK(&lk);
3911	inodedep = inodedep_lookup_ip(ip);
3912	if (DOINGSUJ(dvp)) {
3913		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3914		    inoreflst);
3915		KASSERT(jaddref != NULL,
3916		    ("softdep_setup_mkdir: No addref structure present."));
3917		KASSERT(jaddref->ja_parent == dp->i_number,
3918		    ("softdep_setup_mkdir: bad parent %d",
3919		    jaddref->ja_parent));
3920		jaddref->ja_mode = ip->i_mode;
3921		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
3922		    if_deps);
3923	}
3924	inodedep = inodedep_lookup_ip(dp);
3925	if (DOINGSUJ(dvp))
3926		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
3927		    &dotdotaddref->ja_ref, if_deps);
3928	softdep_prelink(ITOV(dp), NULL);
3929	FREE_LOCK(&lk);
3930}
3931
3932/*
3933 * Called to track nlinkdelta of the inode and parent directories prior to
3934 * unlinking a directory.
3935 */
3936void
3937softdep_setup_rmdir(dp, ip)
3938	struct inode *dp;
3939	struct inode *ip;
3940{
3941	struct vnode *dvp;
3942
3943	dvp = ITOV(dp);
3944	ACQUIRE_LOCK(&lk);
3945	(void) inodedep_lookup_ip(ip);
3946	(void) inodedep_lookup_ip(dp);
3947	softdep_prelink(dvp, ITOV(ip));
3948	FREE_LOCK(&lk);
3949}
3950
3951/*
3952 * Called to track nlinkdelta of the inode and parent directories prior to
3953 * unlink.
3954 */
3955void
3956softdep_setup_unlink(dp, ip)
3957	struct inode *dp;
3958	struct inode *ip;
3959{
3960	struct vnode *dvp;
3961
3962	dvp = ITOV(dp);
3963	ACQUIRE_LOCK(&lk);
3964	(void) inodedep_lookup_ip(ip);
3965	(void) inodedep_lookup_ip(dp);
3966	softdep_prelink(dvp, ITOV(ip));
3967	FREE_LOCK(&lk);
3968}
3969
3970/*
3971 * Called to release the journal structures created by a failed non-directory
3972 * creation.  Adjusts nlinkdelta for non-journaling softdep.
3973 */
3974void
3975softdep_revert_create(dp, ip)
3976	struct inode *dp;
3977	struct inode *ip;
3978{
3979	struct inodedep *inodedep;
3980	struct jaddref *jaddref;
3981	struct vnode *dvp;
3982
3983	dvp = ITOV(dp);
3984	ACQUIRE_LOCK(&lk);
3985	inodedep = inodedep_lookup_ip(ip);
3986	if (DOINGSUJ(dvp)) {
3987		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3988		    inoreflst);
3989		KASSERT(jaddref->ja_parent == dp->i_number,
3990		    ("softdep_revert_create: addref parent mismatch"));
3991		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
3992	}
3993	FREE_LOCK(&lk);
3994}
3995
3996/*
3997 * Called to release the journal structures created by a failed dotdot link
3998 * creation.  Adjusts nlinkdelta for non-journaling softdep.
3999 */
4000void
4001softdep_revert_dotdot_link(dp, ip)
4002	struct inode *dp;
4003	struct inode *ip;
4004{
4005	struct inodedep *inodedep;
4006	struct jaddref *jaddref;
4007	struct vnode *dvp;
4008
4009	dvp = ITOV(dp);
4010	ACQUIRE_LOCK(&lk);
4011	inodedep = inodedep_lookup_ip(dp);
4012	if (DOINGSUJ(dvp)) {
4013		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4014		    inoreflst);
4015		KASSERT(jaddref->ja_parent == ip->i_number,
4016		    ("softdep_revert_dotdot_link: addref parent mismatch"));
4017		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4018	}
4019	FREE_LOCK(&lk);
4020}
4021
4022/*
4023 * Called to release the journal structures created by a failed link
4024 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4025 */
4026void
4027softdep_revert_link(dp, ip)
4028	struct inode *dp;
4029	struct inode *ip;
4030{
4031	struct inodedep *inodedep;
4032	struct jaddref *jaddref;
4033	struct vnode *dvp;
4034
4035	dvp = ITOV(dp);
4036	ACQUIRE_LOCK(&lk);
4037	inodedep = inodedep_lookup_ip(ip);
4038	if (DOINGSUJ(dvp)) {
4039		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4040		    inoreflst);
4041		KASSERT(jaddref->ja_parent == dp->i_number,
4042		    ("softdep_revert_link: addref parent mismatch"));
4043		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4044	}
4045	FREE_LOCK(&lk);
4046}
4047
4048/*
4049 * Called to release the journal structures created by a failed mkdir
4050 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4051 */
4052void
4053softdep_revert_mkdir(dp, ip)
4054	struct inode *dp;
4055	struct inode *ip;
4056{
4057	struct inodedep *inodedep;
4058	struct jaddref *jaddref;
4059	struct vnode *dvp;
4060
4061	dvp = ITOV(dp);
4062
4063	ACQUIRE_LOCK(&lk);
4064	inodedep = inodedep_lookup_ip(dp);
4065	if (DOINGSUJ(dvp)) {
4066		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4067		    inoreflst);
4068		KASSERT(jaddref->ja_parent == ip->i_number,
4069		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4070		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4071	}
4072	inodedep = inodedep_lookup_ip(ip);
4073	if (DOINGSUJ(dvp)) {
4074		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4075		    inoreflst);
4076		KASSERT(jaddref->ja_parent == dp->i_number,
4077		    ("softdep_revert_mkdir: addref parent mismatch"));
4078		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4079		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4080		    inoreflst);
4081		KASSERT(jaddref->ja_parent == ip->i_number,
4082		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4083		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4084	}
4085	FREE_LOCK(&lk);
4086}
4087
4088/*
4089 * Called to correct nlinkdelta after a failed rmdir.
4090 */
4091void
4092softdep_revert_rmdir(dp, ip)
4093	struct inode *dp;
4094	struct inode *ip;
4095{
4096
4097	ACQUIRE_LOCK(&lk);
4098	(void) inodedep_lookup_ip(ip);
4099	(void) inodedep_lookup_ip(dp);
4100	FREE_LOCK(&lk);
4101}
4102
4103/*
4104 * Protecting the freemaps (or bitmaps).
4105 *
4106 * To eliminate the need to execute fsck before mounting a filesystem
4107 * after a power failure, one must (conservatively) guarantee that the
4108 * on-disk copy of the bitmaps never indicate that a live inode or block is
4109 * free.  So, when a block or inode is allocated, the bitmap should be
4110 * updated (on disk) before any new pointers.  When a block or inode is
4111 * freed, the bitmap should not be updated until all pointers have been
4112 * reset.  The latter dependency is handled by the delayed de-allocation
4113 * approach described below for block and inode de-allocation.  The former
4114 * dependency is handled by calling the following procedure when a block or
4115 * inode is allocated. When an inode is allocated an "inodedep" is created
4116 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4117 * Each "inodedep" is also inserted into the hash indexing structure so
4118 * that any additional link additions can be made dependent on the inode
4119 * allocation.
4120 *
4121 * The ufs filesystem maintains a number of free block counts (e.g., per
4122 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4123 * in addition to the bitmaps.  These counts are used to improve efficiency
4124 * during allocation and therefore must be consistent with the bitmaps.
4125 * There is no convenient way to guarantee post-crash consistency of these
4126 * counts with simple update ordering, for two main reasons: (1) The counts
4127 * and bitmaps for a single cylinder group block are not in the same disk
4128 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4129 * be written and the other not.  (2) Some of the counts are located in the
4130 * superblock rather than the cylinder group block. So, we focus our soft
4131 * updates implementation on protecting the bitmaps. When mounting a
4132 * filesystem, we recompute the auxiliary counts from the bitmaps.
4133 */
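
/*
 * Example: if a crash occurs after the updated bitmap reaches the disk but
 * before any structure referencing the new inode or block does, recovery
 * sees only an allocated-but-unreferenced resource, which wastes space but
 * preserves integrity.  The opposite ordering could leave a referenced
 * inode or block marked free, allowing it to be reallocated while still
 * in use.
 */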
4134
4135/*
4136 * Called just after updating the cylinder group block to allocate an inode.
4137 */
4138void
4139softdep_setup_inomapdep(bp, ip, newinum)
4140	struct buf *bp;		/* buffer for cylgroup block with inode map */
4141	struct inode *ip;	/* inode related to allocation */
4142	ino_t newinum;		/* new inode number being allocated */
4143{
4144	struct inodedep *inodedep;
4145	struct bmsafemap *bmsafemap;
4146	struct jaddref *jaddref;
4147	struct mount *mp;
4148	struct fs *fs;
4149
4150	mp = UFSTOVFS(ip->i_ump);
4151	fs = ip->i_ump->um_fs;
4152	jaddref = NULL;
4153
4154	/*
4155	 * Allocate the journal reference add structure so that the bitmap
4156	 * can be dependent on it.
4157	 */
4158	if (mp->mnt_kern_flag & MNTK_SUJ) {
4159		jaddref = newjaddref(ip, newinum, 0, 0, 0);
4160		jaddref->ja_state |= NEWBLOCK;
4161	}
4162
4163	/*
4164	 * Create a dependency for the newly allocated inode.
4165	 * Panic if it already exists as something is seriously wrong.
4166	 * Otherwise add it to the dependency list for the buffer holding
4167	 * the cylinder group map from which it was allocated.
4168	 */
4169	ACQUIRE_LOCK(&lk);
4170	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
4171		panic("softdep_setup_inomapdep: dependency %p for new "
4172		    "inode already exists", inodedep);
4173	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
4174	if (jaddref) {
4175		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4176		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4177		    if_deps);
4178	} else {
4179		inodedep->id_state |= ONDEPLIST;
4180		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4181	}
4182	inodedep->id_bmsafemap = bmsafemap;
4183	inodedep->id_state &= ~DEPCOMPLETE;
4184	FREE_LOCK(&lk);
4185}
4186
4187/*
4188 * Called just after updating the cylinder group block to
4189 * allocate block or fragment.
4190 */
4191void
4192softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4193	struct buf *bp;		/* buffer for cylgroup block with block map */
4194	struct mount *mp;	/* filesystem doing allocation */
4195	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4196	int frags;		/* Number of fragments. */
4197	int oldfrags;		/* Previous number of fragments for extend. */
4198{
4199	struct newblk *newblk;
4200	struct bmsafemap *bmsafemap;
4201	struct jnewblk *jnewblk;
4202	struct fs *fs;
4203
4204	fs = VFSTOUFS(mp)->um_fs;
4205	jnewblk = NULL;
4206	/*
4207	 * Create a dependency for the newly allocated block.
4208	 * Add it to the dependency list for the buffer holding
4209	 * the cylinder group map from which it was allocated.
4210	 */
4211	if (mp->mnt_kern_flag & MNTK_SUJ) {
4212		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4213		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4214		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4215		jnewblk->jn_state = ATTACHED;
4216		jnewblk->jn_blkno = newblkno;
4217		jnewblk->jn_frags = frags;
4218		jnewblk->jn_oldfrags = oldfrags;
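		/*
		 * Under SUJ_DEBUG, verify that each newly allocated fragment
		 * is already marked in use in the cylinder group bitmap; a
		 * fragment still marked free here would indicate an
		 * inconsistency in the allocator's bookkeeping.
		 */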
4219#ifdef SUJ_DEBUG
4220		{
4221			struct cg *cgp;
4222			uint8_t *blksfree;
4223			long bno;
4224			int i;
4225
4226			cgp = (struct cg *)bp->b_data;
4227			blksfree = cg_blksfree(cgp);
4228			bno = dtogd(fs, jnewblk->jn_blkno);
4229			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4230			    i++) {
4231				if (isset(blksfree, bno + i))
4232					panic("softdep_setup_blkmapdep: "
4233					    "free fragment %d from %d-%d "
4234					    "state 0x%X dep %p", i,
4235					    jnewblk->jn_oldfrags,
4236					    jnewblk->jn_frags,
4237					    jnewblk->jn_state,
4238					    jnewblk->jn_newblk);
4239			}
4240		}
4241#endif
4242	}
4243	ACQUIRE_LOCK(&lk);
4244	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4245		panic("softdep_setup_blkmapdep: found block");
4246	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4247	    dtog(fs, newblkno));
4248	if (jnewblk) {
4249		jnewblk->jn_newblk = newblk;
4250		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4251	} else {
4252		newblk->nb_state |= ONDEPLIST;
4253		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4254	}
4255	newblk->nb_bmsafemap = bmsafemap;
4256	newblk->nb_jnewblk = jnewblk;
4257	FREE_LOCK(&lk);
4258}
4259
4260#define	BMSAFEMAP_HASH(fs, cg) \
4261      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4262
4263static int
4264bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4265	struct bmsafemap_hashhead *bmsafemaphd;
4266	struct mount *mp;
4267	int cg;
4268	struct bmsafemap **bmsafemapp;
4269{
4270	struct bmsafemap *bmsafemap;
4271
4272	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4273		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4274			break;
4275	if (bmsafemap) {
4276		*bmsafemapp = bmsafemap;
4277		return (1);
4278	}
4279	*bmsafemapp = NULL;
4280
4281	return (0);
4282}
4283
4284/*
4285 * Find the bmsafemap associated with a cylinder group buffer.
4286 * If none exists, create one. The buffer must be locked when
4287 * this routine is called and this routine must be called with
4288 * splbio interrupts blocked.
4289 */
4290static struct bmsafemap *
4291bmsafemap_lookup(mp, bp, cg)
4292	struct mount *mp;
4293	struct buf *bp;
4294	int cg;
4295{
4296	struct bmsafemap_hashhead *bmsafemaphd;
4297	struct bmsafemap *bmsafemap, *collision;
4298	struct worklist *wk;
4299	struct fs *fs;
4300
4301	mtx_assert(&lk, MA_OWNED);
4302	if (bp)
4303		LIST_FOREACH(wk, &bp->b_dep, wk_list)
4304			if (wk->wk_type == D_BMSAFEMAP)
4305				return (WK_BMSAFEMAP(wk));
4306	fs = VFSTOUFS(mp)->um_fs;
4307	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
4308	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1)
4309		return (bmsafemap);
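	/*
	 * No bmsafemap found.  Drop the softdep lock so the allocation
	 * below may sleep, then re-check for an entry inserted by another
	 * thread while the lock was released.
	 */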
4310	FREE_LOCK(&lk);
4311	bmsafemap = malloc(sizeof(struct bmsafemap),
4312		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4313	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4314	bmsafemap->sm_buf = bp;
4315	LIST_INIT(&bmsafemap->sm_inodedephd);
4316	LIST_INIT(&bmsafemap->sm_inodedepwr);
4317	LIST_INIT(&bmsafemap->sm_newblkhd);
4318	LIST_INIT(&bmsafemap->sm_newblkwr);
4319	LIST_INIT(&bmsafemap->sm_jaddrefhd);
4320	LIST_INIT(&bmsafemap->sm_jnewblkhd);
4321	ACQUIRE_LOCK(&lk);
4322	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
4323		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4324		return (collision);
4325	}
4326	bmsafemap->sm_cg = cg;
4327	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
4328	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
4329	return (bmsafemap);
4330}
4331
4332/*
4333 * Direct block allocation dependencies.
4334 *
4335 * When a new block is allocated, the corresponding disk locations must be
4336 * initialized (with zeros or new data) before the on-disk inode points to
4337 * them.  Also, the freemap from which the block was allocated must be
4338 * updated (on disk) before the inode's pointer. These two dependencies are
4339 * independent of each other and are needed for all file blocks and indirect
4340 * blocks that are pointed to directly by the inode.  Just before the
4341 * "in-core" version of the inode is updated with a newly allocated block
4342 * number, a procedure (below) is called to setup allocation dependency
4343 * structures.  These structures are removed when the corresponding
4344 * dependencies are satisfied or when the block allocation becomes obsolete
4345 * (i.e., the file is deleted, the block is de-allocated, or the block is a
4346 * fragment that gets upgraded).  All of these cases are handled in
4347 * procedures described later.
4348 *
4349 * When a file extension causes a fragment to be upgraded, either to a larger
4350 * fragment or to a full block, the on-disk location may change (if the
4351 * previous fragment could not simply be extended). In this case, the old
4352 * fragment must be de-allocated, but not until after the inode's pointer has
4353 * been updated. In most cases, this is handled by later procedures, which
4354 * will construct a "freefrag" structure to be added to the workitem queue
4355 * when the inode update is complete (or obsolete).  The main exception to
4356 * this is when an allocation occurs while a pending allocation dependency
4357 * (for the same block pointer) remains.  This case is handled in the main
4358 * allocation dependency setup procedure by immediately freeing the
4359 * unreferenced fragments.
4360 */
4361void
4362softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4363	struct inode *ip;	/* inode to which block is being added */
4364	ufs_lbn_t off;		/* block pointer within inode */
4365	ufs2_daddr_t newblkno;	/* disk block number being added */
4366	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
4367	long newsize;		/* size of new block */
4368	long oldsize;		/* size of old block */
4369	struct buf *bp;		/* bp for allocated block */
4370{
4371	struct allocdirect *adp, *oldadp;
4372	struct allocdirectlst *adphead;
4373	struct freefrag *freefrag;
4374	struct inodedep *inodedep;
4375	struct pagedep *pagedep;
4376	struct jnewblk *jnewblk;
4377	struct newblk *newblk;
4378	struct mount *mp;
4379	ufs_lbn_t lbn;
4380
4381	lbn = bp->b_lblkno;
4382	mp = UFSTOVFS(ip->i_ump);
4383	if (oldblkno && oldblkno != newblkno)
4384		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4385	else
4386		freefrag = NULL;
4387
4388	ACQUIRE_LOCK(&lk);
4389	if (off >= NDADDR) {
4390		if (lbn > 0)
4391			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4392			    lbn, off);
4393		/* allocating an indirect block */
4394		if (oldblkno != 0)
4395			panic("softdep_setup_allocdirect: non-zero indir");
4396	} else {
4397		if (off != lbn)
4398			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4399			    lbn, off);
4400		/*
4401		 * Allocating a direct block.
4402		 *
4403		 * If we are allocating a directory block, then we must
4404		 * allocate an associated pagedep to track additions and
4405		 * deletions.
4406		 */
4407		if ((ip->i_mode & IFMT) == IFDIR &&
4408		    pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
4409		    &pagedep) == 0)
4410			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
4411	}
4412	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4413		panic("softdep_setup_allocdirect: lost block");
4414	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4415	    ("softdep_setup_allocdirect: newblk already initialized"));
4416	/*
4417	 * Convert the newblk to an allocdirect.
4418	 */
4419	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4420	adp = (struct allocdirect *)newblk;
4421	newblk->nb_freefrag = freefrag;
4422	adp->ad_offset = off;
4423	adp->ad_oldblkno = oldblkno;
4424	adp->ad_newsize = newsize;
4425	adp->ad_oldsize = oldsize;
4426
4427	/*
4428	 * Finish initializing the journal.
4429	 */
4430	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4431		jnewblk->jn_ino = ip->i_number;
4432		jnewblk->jn_lbn = lbn;
4433		add_to_journal(&jnewblk->jn_list);
4434	}
4435	if (freefrag && freefrag->ff_jfreefrag != NULL)
4436		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4437	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4438	adp->ad_inodedep = inodedep;
4439
4440	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4441	/*
4442	 * The list of allocdirects must be kept in sorted and ascending
4443	 * order so that the rollback routines can quickly determine the
4444	 * first uncommitted block (the size of the file stored on disk
4445	 * ends at the end of the lowest committed fragment, or if there
4446	 * are no fragments, at the end of the highest committed block).
4447	 * Since files generally grow, the typical case is that the new
4448	 * block is to be added at the end of the list. We speed this
4449	 * special case by checking against the last allocdirect in the
4450	 * list before laboriously traversing the list looking for the
4451	 * insertion point.
4452	 */
4453	adphead = &inodedep->id_newinoupdt;
4454	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4455	if (oldadp == NULL || oldadp->ad_offset <= off) {
4456		/* insert at end of list */
4457		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4458		if (oldadp != NULL && oldadp->ad_offset == off)
4459			allocdirect_merge(adphead, adp, oldadp);
4460		FREE_LOCK(&lk);
4461		return;
4462	}
4463	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4464		if (oldadp->ad_offset >= off)
4465			break;
4466	}
4467	if (oldadp == NULL)
4468		panic("softdep_setup_allocdirect: lost entry");
4469	/* insert in middle of list */
4470	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4471	if (oldadp->ad_offset == off)
4472		allocdirect_merge(adphead, adp, oldadp);
4473
4474	FREE_LOCK(&lk);
4475}
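
#if 0
/*
 * Illustrative sketch only (not compiled): the order in which the block
 * allocator is expected to invoke the hooks above when growing a file.
 * The variable names (cgbp, nbp, lbn, ...) and the exact call site in
 * ffs_balloc()/ffs_alloc() are assumptions for illustration.
 */
	/* 1. The cylinder group buffer "cgbp" was just updated. */
	softdep_setup_blkmapdep(cgbp, mp, newblkno, numfrags(fs, newsize), 0);
	/* 2. Just before the in-core inode is given the new pointer. */
	softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno,
	    newsize, oldsize, nbp);
	/* 3. Only now is the in-core block pointer updated. */
	DIP_SET(ip, i_db[lbn], newblkno);
#endif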
4476
4477/*
4478 * Replace an old allocdirect dependency with a newer one.
4479 * This routine must be called with splbio interrupts blocked.
4480 */
4481static void
4482allocdirect_merge(adphead, newadp, oldadp)
4483	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
4484	struct allocdirect *newadp;	/* allocdirect being added */
4485	struct allocdirect *oldadp;	/* existing allocdirect being checked */
4486{
4487	struct worklist *wk;
4488	struct freefrag *freefrag;
4489	struct newdirblk *newdirblk;
4490
4491	freefrag = NULL;
4492	mtx_assert(&lk, MA_OWNED);
4493	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
4494	    newadp->ad_oldsize != oldadp->ad_newsize ||
4495	    newadp->ad_offset >= NDADDR)
4496		panic("%s %jd != new %jd || old size %ld != new %ld",
4497		    "allocdirect_merge: old blkno",
4498		    (intmax_t)newadp->ad_oldblkno,
4499		    (intmax_t)oldadp->ad_newblkno,
4500		    newadp->ad_oldsize, oldadp->ad_newsize);
4501	newadp->ad_oldblkno = oldadp->ad_oldblkno;
4502	newadp->ad_oldsize = oldadp->ad_oldsize;
4503	/*
4504	 * If the old dependency had a fragment to free or had never
4505	 * previously had a block allocated, then the new dependency
4506	 * can immediately post its freefrag and adopt the old freefrag.
4507	 * This action is done by swapping the freefrag dependencies.
4508	 * The new dependency gains the old one's freefrag, and the
4509	 * old one gets the new one and then immediately puts it on
4510	 * the worklist when it is freed by free_newblk. It is
4511	 * not possible to do this swap when the old dependency had a
4512	 * non-zero size but no previous fragment to free. This condition
4513	 * arises when the new block is an extension of the old block.
4514	 * Here, the first part of the fragment allocated to the new
4515	 * dependency is part of the block currently claimed on disk by
4516	 * the old dependency, so cannot legitimately be freed until the
4517	 * conditions for the new dependency are fulfilled.
4518	 */
4519	freefrag = newadp->ad_freefrag;
4520	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
4521		newadp->ad_freefrag = oldadp->ad_freefrag;
4522		oldadp->ad_freefrag = freefrag;
4523	}
4524	/*
4525	 * If we are tracking a new directory-block allocation,
4526	 * move it from the old allocdirect to the new allocdirect.
4527	 */
4528	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
4529		newdirblk = WK_NEWDIRBLK(wk);
4530		WORKLIST_REMOVE(&newdirblk->db_list);
4531		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
4532			panic("allocdirect_merge: extra newdirblk");
4533		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
4534	}
4535	TAILQ_REMOVE(adphead, oldadp, ad_next);
4536	/*
4537	 * We need to move any journal dependencies over to the freefrag
4538	 * that releases this block if it exists.  Otherwise we are
4539	 * extending an existing block and we'll wait until that is
4540	 * complete to release the journal space and extend the
4541	 * new journal to cover this old space as well.
4542	 */
4543	if (freefrag == NULL) {
4544		struct jnewblk *jnewblk;
4545		struct jnewblk *njnewblk;
4546
4547		if (oldadp->ad_newblkno != newadp->ad_newblkno)
4548			panic("allocdirect_merge: %jd != %jd",
4549			    oldadp->ad_newblkno, newadp->ad_newblkno);
4550		jnewblk = oldadp->ad_block.nb_jnewblk;
4551		cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
4552		/*
4553		 * If there is an unwritten jnewblk, we need to merge its
4554		 * frag bits with our own.  The newer adp's journal cannot
4555		 * be written prior to the old one, so there is no need to
4556		 * check for it here.
4557		 */
4558		if (jnewblk) {
4559			njnewblk = newadp->ad_block.nb_jnewblk;
4560			if (njnewblk == NULL)
4561				panic("allocdirect_merge: No jnewblk");
4562			if (jnewblk->jn_state & UNDONE) {
4563				njnewblk->jn_state |= UNDONE | NEWBLOCK;
4564				njnewblk->jn_state &= ~ATTACHED;
4565				jnewblk->jn_state &= ~UNDONE;
4566			}
4567			njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
4568			WORKLIST_REMOVE(&jnewblk->jn_list);
4569			jnewblk->jn_state |= ATTACHED | COMPLETE;
4570			free_jnewblk(jnewblk);
4571		}
4572	} else {
4573		/*
4574		 * We can skip journaling for this freefrag and just complete
4575		 * any pending journal work for the allocdirect that is being
4576		 * removed after the freefrag completes.
4577		 */
4578		if (freefrag->ff_jfreefrag)
4579			cancel_jfreefrag(freefrag->ff_jfreefrag);
4580		cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
4581	}
4582	free_newblk(&oldadp->ad_block);
4583}
4584
4585/*
4586 * Allocate a jfreefrag structure to journal a single block free.
4587 */
4588static struct jfreefrag *
4589newjfreefrag(freefrag, ip, blkno, size, lbn)
4590	struct freefrag *freefrag;
4591	struct inode *ip;
4592	ufs2_daddr_t blkno;
4593	long size;
4594	ufs_lbn_t lbn;
4595{
4596	struct jfreefrag *jfreefrag;
4597	struct fs *fs;
4598
4599	fs = ip->i_fs;
4600	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
4601	    M_SOFTDEP_FLAGS);
4602	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
4603	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
4604	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
4605	jfreefrag->fr_ino = ip->i_number;
4606	jfreefrag->fr_lbn = lbn;
4607	jfreefrag->fr_blkno = blkno;
4608	jfreefrag->fr_frags = numfrags(fs, size);
4609	jfreefrag->fr_freefrag = freefrag;
4610
4611	return (jfreefrag);
4612}
4613
4614/*
4615 * Allocate a new freefrag structure.
4616 */
4617static struct freefrag *
4618newfreefrag(ip, blkno, size, lbn)
4619	struct inode *ip;
4620	ufs2_daddr_t blkno;
4621	long size;
4622	ufs_lbn_t lbn;
4623{
4624	struct freefrag *freefrag;
4625	struct fs *fs;
4626
4627	fs = ip->i_fs;
4628	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
4629		panic("newfreefrag: frag size");
4630	freefrag = malloc(sizeof(struct freefrag),
4631	    M_FREEFRAG, M_SOFTDEP_FLAGS);
4632	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
4633	freefrag->ff_state = ATTACHED;
4634	LIST_INIT(&freefrag->ff_jwork);
4635	freefrag->ff_inum = ip->i_number;
4636	freefrag->ff_blkno = blkno;
4637	freefrag->ff_fragsize = size;
4638
4639	if (fs->fs_flags & FS_SUJ) {
4640		freefrag->ff_jfreefrag =
4641		    newjfreefrag(freefrag, ip, blkno, size, lbn);
4642	} else {
4643		freefrag->ff_state |= DEPCOMPLETE;
4644		freefrag->ff_jfreefrag = NULL;
4645	}
4646
4647	return (freefrag);
4648}
4649
4650/*
4651 * This workitem de-allocates fragments that were replaced during
4652 * file block allocation.
4653 */
4654static void
4655handle_workitem_freefrag(freefrag)
4656	struct freefrag *freefrag;
4657{
4658	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
4659	struct workhead wkhd;
4660
4661	/*
4662	 * It would be illegal to add new completion items to the
4663	 * freefrag after it was scheduled to be done, so it must be
4664	 * safe to modify the list head here.
4665	 */
4666	LIST_INIT(&wkhd);
4667	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
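	/*
	 * Hand any canceled journal dependencies to ffs_blkfree() so that
	 * they are retired along with the bitmap update for the freed
	 * fragment.
	 */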
4668	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
4669	    freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
4670	ACQUIRE_LOCK(&lk);
4671	WORKITEM_FREE(freefrag, D_FREEFRAG);
4672	FREE_LOCK(&lk);
4673}
4674
4675/*
4676 * Set up a dependency structure for an external attributes data block.
4677 * This routine follows much of the structure of softdep_setup_allocdirect.
4678 * See the description of softdep_setup_allocdirect above for details.
4679 */
4680void
4681softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4682	struct inode *ip;
4683	ufs_lbn_t off;
4684	ufs2_daddr_t newblkno;
4685	ufs2_daddr_t oldblkno;
4686	long newsize;
4687	long oldsize;
4688	struct buf *bp;
4689{
4690	struct allocdirect *adp, *oldadp;
4691	struct allocdirectlst *adphead;
4692	struct freefrag *freefrag;
4693	struct inodedep *inodedep;
4694	struct jnewblk *jnewblk;
4695	struct newblk *newblk;
4696	struct mount *mp;
4697	ufs_lbn_t lbn;
4698
4699	if (off >= NXADDR)
4700		panic("softdep_setup_allocext: lbn %lld > NXADDR",
4701		    (long long)off);
4702
4703	lbn = bp->b_lblkno;
4704	mp = UFSTOVFS(ip->i_ump);
4705	if (oldblkno && oldblkno != newblkno)
4706		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4707	else
4708		freefrag = NULL;
4709
4710	ACQUIRE_LOCK(&lk);
4711	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4712		panic("softdep_setup_allocext: lost block");
4713	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4714	    ("softdep_setup_allocext: newblk already initialized"));
4715	/*
4716	 * Convert the newblk to an allocdirect.
4717	 */
4718	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4719	adp = (struct allocdirect *)newblk;
4720	newblk->nb_freefrag = freefrag;
4721	adp->ad_offset = off;
4722	adp->ad_oldblkno = oldblkno;
4723	adp->ad_newsize = newsize;
4724	adp->ad_oldsize = oldsize;
4725	adp->ad_state |= EXTDATA;
4726
4727	/*
4728	 * Finish initializing the journal.
4729	 */
4730	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4731		jnewblk->jn_ino = ip->i_number;
4732		jnewblk->jn_lbn = lbn;
4733		add_to_journal(&jnewblk->jn_list);
4734	}
4735	if (freefrag && freefrag->ff_jfreefrag != NULL)
4736		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4737	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4738	adp->ad_inodedep = inodedep;
4739
4740	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4741	/*
4742	 * The list of allocdirects must be kept in sorted and ascending
4743	 * order so that the rollback routines can quickly determine the
4744	 * first uncommitted block (the size of the file stored on disk
4745	 * ends at the end of the lowest committed fragment, or if there
4746	 * are no fragments, at the end of the highest committed block).
4747	 * Since files generally grow, the typical case is that the new
4748	 * block is to be added at the end of the list. We speed this
4749	 * special case by checking against the last allocdirect in the
4750	 * list before laboriously traversing the list looking for the
4751	 * insertion point.
4752	 */
4753	adphead = &inodedep->id_newextupdt;
4754	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4755	if (oldadp == NULL || oldadp->ad_offset <= off) {
4756		/* insert at end of list */
4757		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4758		if (oldadp != NULL && oldadp->ad_offset == off)
4759			allocdirect_merge(adphead, adp, oldadp);
4760		FREE_LOCK(&lk);
4761		return;
4762	}
4763	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4764		if (oldadp->ad_offset >= off)
4765			break;
4766	}
4767	if (oldadp == NULL)
4768		panic("softdep_setup_allocext: lost entry");
4769	/* insert in middle of list */
4770	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4771	if (oldadp->ad_offset == off)
4772		allocdirect_merge(adphead, adp, oldadp);
4773	FREE_LOCK(&lk);
4774}
4775
4776/*
4777 * Indirect block allocation dependencies.
4778 *
4779 * The same dependencies that exist for a direct block also exist when
4780 * a new block is allocated and pointed to by an entry in a block of
4781 * indirect pointers. The undo/redo states described above are also
4782 * used here. Because an indirect block contains many pointers that
4783 * may have dependencies, a second copy of the entire in-memory indirect
4784 * block is kept. The buffer cache copy is always completely up-to-date.
4785 * The second copy, which is used only as a source for disk writes,
4786 * contains only the safe pointers (i.e., those that have no remaining
4787 * update dependencies). The second copy is freed when all pointers
4788 * are safe. The cache is not allowed to replace indirect blocks with
4789 * pending update dependencies. If a buffer containing an indirect
4790 * block with dependencies is written, these routines will mark it
4791 * dirty again. It can only be successfully written once all the
4792 * dependencies are removed. The ffs_fsync routine in conjunction with
4793 * softdep_sync_metadata work together to get all the dependencies
4794 * removed so that a file can be successfully written to disk. Three
4795 * procedures are used when setting up indirect block pointer
4796 * dependencies. The division is necessary because of the organization
4797 * of the "balloc" routine and because of the distinction between file
4798 * pages and file metadata blocks.
4799 */
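
/*
 * For example, while a pointer newly stored in an indirect block still has
 * an unsatisfied dependency, any write of that indirect block is done from
 * the second (safe) copy, which holds the pointer's previous value; the
 * up-to-date buffer cache copy is only allowed to reach the disk once the
 * dependency has completed.
 */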
4800
4801/*
4802 * Allocate a new allocindir structure.
4803 */
4804static struct allocindir *
4805newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
4806	struct inode *ip;	/* inode for file being extended */
4807	int ptrno;		/* offset of pointer in indirect block */
4808	ufs2_daddr_t newblkno;	/* disk block number being added */
4809	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
4810	ufs_lbn_t lbn;
4811{
4812	struct newblk *newblk;
4813	struct allocindir *aip;
4814	struct freefrag *freefrag;
4815	struct jnewblk *jnewblk;
4816
4817	if (oldblkno)
4818		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
4819	else
4820		freefrag = NULL;
4821	ACQUIRE_LOCK(&lk);
4822	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
4823		panic("new_allocindir: lost block");
4824	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4825	    ("newallocindir: newblk already initialized"));
4826	newblk->nb_list.wk_type = D_ALLOCINDIR;
4827	newblk->nb_freefrag = freefrag;
4828	aip = (struct allocindir *)newblk;
4829	aip->ai_offset = ptrno;
4830	aip->ai_oldblkno = oldblkno;
4831	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4832		jnewblk->jn_ino = ip->i_number;
4833		jnewblk->jn_lbn = lbn;
4834		add_to_journal(&jnewblk->jn_list);
4835	}
4836	if (freefrag && freefrag->ff_jfreefrag != NULL)
4837		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4838	return (aip);
4839}
4840
4841/*
4842 * Called just before setting an indirect block pointer
4843 * to a newly allocated file page.
4844 */
4845void
4846softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
4847	struct inode *ip;	/* inode for file being extended */
4848	ufs_lbn_t lbn;		/* allocated block number within file */
4849	struct buf *bp;		/* buffer with indirect blk referencing page */
4850	int ptrno;		/* offset of pointer in indirect block */
4851	ufs2_daddr_t newblkno;	/* disk block number being added */
4852	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
4853	struct buf *nbp;	/* buffer holding allocated page */
4854{
4855	struct inodedep *inodedep;
4856	struct allocindir *aip;
4857	struct pagedep *pagedep;
4858	struct mount *mp;
4859
4860	if (lbn != nbp->b_lblkno)
4861		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
4862		    lbn, nbp->b_lblkno);
4863	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
4864	mp = UFSTOVFS(ip->i_ump);
4865	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
4866	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
4867	/*
4868	 * If we are allocating a directory page, then we must
4869	 * allocate an associated pagedep to track additions and
4870	 * deletions.
4871	 */
4872	if ((ip->i_mode & IFMT) == IFDIR &&
4873	    pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
4874		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
4875	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4876	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4877	FREE_LOCK(&lk);
4878}
4879
4880/*
4881 * Called just before setting an indirect block pointer to a
4882 * newly allocated indirect block.
4883 */
4884void
4885softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
4886	struct buf *nbp;	/* newly allocated indirect block */
4887	struct inode *ip;	/* inode for file being extended */
4888	struct buf *bp;		/* indirect block referencing allocated block */
4889	int ptrno;		/* offset of pointer in indirect block */
4890	ufs2_daddr_t newblkno;	/* disk block number being added */
4891{
4892	struct inodedep *inodedep;
4893	struct allocindir *aip;
4894	ufs_lbn_t lbn;
4895
4896	lbn = nbp->b_lblkno;
4897	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
4898	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
4899	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
4900	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4901	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4902	FREE_LOCK(&lk);
4903}
4904
4905static void
4906indirdep_complete(indirdep)
4907	struct indirdep *indirdep;
4908{
4909	struct allocindir *aip;
4910
4911	LIST_REMOVE(indirdep, ir_next);
4912	indirdep->ir_state &= ~ONDEPLIST;
4913
4914	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
4915		LIST_REMOVE(aip, ai_next);
4916		free_newblk(&aip->ai_block);
4917	}
4918	/*
4919	 * If this indirdep is not attached to a buf it was simply waiting
4920	 * on completion to clear completehd.  free_indirdep() asserts
4921	 * that nothing is dangling.
4922	 */
4923	if ((indirdep->ir_state & ONWORKLIST) == 0)
4924		free_indirdep(indirdep);
4925}
4926
4927/*
4928 * Called to finish the allocation of the "aip" allocated
4929 * by one of the two routines above.
4930 */
4931static void
4932setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
4933	struct buf *bp;		/* in-memory copy of the indirect block */
4934	struct inode *ip;	/* inode for file being extended */
4935	struct inodedep *inodedep; /* Inodedep for ip */
4936	struct allocindir *aip;	/* allocindir allocated by the above routines */
4937	ufs_lbn_t lbn;		/* Logical block number for this block. */
4938{
4939	struct worklist *wk;
4940	struct fs *fs;
4941	struct newblk *newblk;
4942	struct indirdep *indirdep, *newindirdep;
4943	struct allocindir *oldaip;
4944	struct freefrag *freefrag;
4945	struct mount *mp;
4946	ufs2_daddr_t blkno;
4947
4948	mp = UFSTOVFS(ip->i_ump);
4949	fs = ip->i_fs;
4950	mtx_assert(&lk, MA_OWNED);
4951	if (bp->b_lblkno >= 0)
4952		panic("setup_allocindir_phase2: not indir blk");
4953	for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
4954		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4955			if (wk->wk_type != D_INDIRDEP)
4956				continue;
4957			indirdep = WK_INDIRDEP(wk);
4958			break;
4959		}
4960		if (indirdep == NULL && newindirdep) {
4961			indirdep = newindirdep;
4962			newindirdep = NULL;
4963			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
4964			if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
4965			    &newblk)) {
4966				indirdep->ir_state |= ONDEPLIST;
4967				LIST_INSERT_HEAD(&newblk->nb_indirdeps,
4968				    indirdep, ir_next);
4969			} else
4970				indirdep->ir_state |= DEPCOMPLETE;
4971		}
4972		if (indirdep) {
4973			aip->ai_indirdep = indirdep;
4974			/*
4975			 * Check to see if there is an existing dependency
4976			 * for this block. If there is, merge the old
4977			 * dependency into the new one.  This happens
4978			 * as a result of reallocblk only.
4979			 */
4980			if (aip->ai_oldblkno == 0)
4981				oldaip = NULL;
4982			else
4984				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
4985				    ai_next)
4986					if (oldaip->ai_offset == aip->ai_offset)
4987						break;
4988			if (oldaip != NULL)
4989				freefrag = allocindir_merge(aip, oldaip);
4990			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
4991			KASSERT(aip->ai_offset >= 0 &&
4992			    aip->ai_offset < NINDIR(ip->i_ump->um_fs),
4993			    ("setup_allocindir_phase2: Bad offset %d",
4994			    aip->ai_offset));
4995			KASSERT(indirdep->ir_savebp != NULL,
4996			    ("setup_allocindir_phase2 NULL ir_savebp"));
4997			if (ip->i_ump->um_fstype == UFS1)
4998				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
4999				    [aip->ai_offset] = aip->ai_oldblkno;
5000			else
5001				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
5002				    [aip->ai_offset] = aip->ai_oldblkno;
5003			FREE_LOCK(&lk);
5004			if (freefrag != NULL)
5005				handle_workitem_freefrag(freefrag);
5006		} else
5007			FREE_LOCK(&lk);
5008		if (newindirdep) {
5009			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
5010			brelse(newindirdep->ir_savebp);
5011			ACQUIRE_LOCK(&lk);
5012			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
5013			if (indirdep)
5014				break;
5015			FREE_LOCK(&lk);
5016		}
5017		if (indirdep) {
5018			ACQUIRE_LOCK(&lk);
5019			break;
5020		}
5021		newindirdep = malloc(sizeof(struct indirdep),
5022			M_INDIRDEP, M_SOFTDEP_FLAGS);
5023		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5024		newindirdep->ir_state = ATTACHED;
5025		if (ip->i_ump->um_fstype == UFS1)
5026			newindirdep->ir_state |= UFS1FMT;
5027		newindirdep->ir_saveddata = NULL;
5028		LIST_INIT(&newindirdep->ir_deplisthd);
5029		LIST_INIT(&newindirdep->ir_donehd);
5030		LIST_INIT(&newindirdep->ir_writehd);
5031		LIST_INIT(&newindirdep->ir_completehd);
5032		LIST_INIT(&newindirdep->ir_jwork);
5033		if (bp->b_blkno == bp->b_lblkno) {
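		/*
		 * Resolve the physical block number if this buffer has not
		 * been mapped yet, then create ir_savebp: the write-safe
		 * copy of the indirect block described above.  Pointers
		 * with outstanding dependencies are rolled back to their
		 * previous values in this copy (see the ir_savebp update
		 * earlier in this loop).
		 */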
5034			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5035			    NULL, NULL);
5036			bp->b_blkno = blkno;
5037		}
5038		newindirdep->ir_savebp =
5039		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5040		BUF_KERNPROC(newindirdep->ir_savebp);
5041		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5042		ACQUIRE_LOCK(&lk);
5043	}
5044}
5045
5046/*
5047 * Merge two allocindirs which refer to the same block.  Move newblock
5048 * dependencies and setup the freefrags appropriately.
5049 */
5050static struct freefrag *
5051allocindir_merge(aip, oldaip)
5052	struct allocindir *aip;
5053	struct allocindir *oldaip;
5054{
5055	struct newdirblk *newdirblk;
5056	struct freefrag *freefrag;
5057	struct worklist *wk;
5058
5059	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5060		panic("allocindir_merge: blkno");
5061	aip->ai_oldblkno = oldaip->ai_oldblkno;
5062	freefrag = aip->ai_freefrag;
5063	aip->ai_freefrag = oldaip->ai_freefrag;
5064	oldaip->ai_freefrag = NULL;
5065	KASSERT(freefrag != NULL, ("allocindir_merge: No freefrag"));
5066	/*
5067	 * If we are tracking a new directory-block allocation,
5068	 * move it from the old allocindir to the new allocindir.
5069	 */
5070	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5071		newdirblk = WK_NEWDIRBLK(wk);
5072		WORKLIST_REMOVE(&newdirblk->db_list);
5073		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5074			panic("allocindir_merge: extra newdirblk");
5075		WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
5076	}
5077	/*
5078	 * We can skip journaling for this freefrag and just complete
5079	 * any pending journal work for the allocindir that is being
5080	 * removed after the freefrag completes.
5081	 */
5082	if (freefrag->ff_jfreefrag)
5083		cancel_jfreefrag(freefrag->ff_jfreefrag);
5084	LIST_REMOVE(oldaip, ai_next);
5085	cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork);
5086	free_newblk(&oldaip->ai_block);
5087
5088	return (freefrag);
5089}
5090
5091/*
5092 * Block de-allocation dependencies.
5093 *
5094 * When blocks are de-allocated, the on-disk pointers must be nullified before
5095 * the blocks are made available for use by other files.  (The true
5096 * requirement is that old pointers must be nullified before new on-disk
5097 * pointers are set.  We chose this slightly more stringent requirement to
5098 * reduce complexity.) Our implementation handles this dependency by updating
5099 * the inode (or indirect block) appropriately but delaying the actual block
5100 * de-allocation (i.e., freemap and free space count manipulation) until
5101 * after the updated versions reach stable storage.  After the disk is
5102 * updated, the blocks can be safely de-allocated whenever it is convenient.
5103 * This implementation handles only the common case of reducing a file's
5104 * length to zero. Other cases are handled by the conventional synchronous
5105 * write approach.
5106 *
5107 * The ffs implementation with which we worked double-checks
5108 * the state of the block pointers and file size as it reduces
5109 * a file's length.  Some of this code is replicated here in our
5110 * soft updates implementation.  The freeblks->fb_chkcnt field is
5111 * used to transfer a part of this information to the procedure
5112 * that eventually de-allocates the blocks.
5113 *
5114 * This routine should be called from the routine that shortens
5115 * a file's length, before the inode's size or block pointers
5116 * are modified. It will save the block pointer information for
5117 * later release and zero the inode so that the calling routine
5118 * can release it.
5119 */
5120void
5121softdep_setup_freeblocks(ip, length, flags)
5122	struct inode *ip;	/* The inode whose length is to be reduced */
5123	off_t length;		/* The new length for the file */
5124	int flags;		/* IO_EXT and/or IO_NORMAL */
5125{
5126	struct ufs1_dinode *dp1;
5127	struct ufs2_dinode *dp2;
5128	struct freeblks *freeblks;
5129	struct inodedep *inodedep;
5130	struct allocdirect *adp;
5131	struct jfreeblk *jfreeblk;
5132	struct bufobj *bo;
5133	struct vnode *vp;
5134	struct buf *bp;
5135	struct fs *fs;
5136	ufs2_daddr_t extblocks, datablocks;
5137	struct mount *mp;
5138	int i, delay, error;
5139	ufs2_daddr_t blkno;
5140	ufs_lbn_t tmpval;
5141	ufs_lbn_t lbn;
5142	long oldextsize;
5143	long oldsize;
5144	int frags;
5145	int needj;
5146
5147	fs = ip->i_fs;
5148	mp = UFSTOVFS(ip->i_ump);
5149	if (length != 0)
5150		panic("softdep_setup_freeblocks: non-zero length");
5151	freeblks = malloc(sizeof(struct freeblks),
5152		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5153	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5154	LIST_INIT(&freeblks->fb_jfreeblkhd);
5155	LIST_INIT(&freeblks->fb_jwork);
5156	freeblks->fb_state = ATTACHED;
5157	freeblks->fb_uid = ip->i_uid;
5158	freeblks->fb_previousinum = ip->i_number;
5159	freeblks->fb_devvp = ip->i_devvp;
5160	freeblks->fb_chkcnt = 0;
5161	ACQUIRE_LOCK(&lk);
5162	/*
5163	 * If we're truncating a removed file that will never be written
5164	 * we don't need to journal the block frees.  The canceled journals
5165	 * for the allocations will suffice.
5166	 */
5167	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5168	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
5169	    (fs->fs_flags & FS_SUJ) == 0)
5170		needj = 0;
5171	else
5172		needj = 1;
5173	num_freeblkdep++;
5174	FREE_LOCK(&lk);
5175	extblocks = 0;
5176	if (fs->fs_magic == FS_UFS2_MAGIC)
5177		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
5178	datablocks = DIP(ip, i_blocks) - extblocks;
5179	if ((flags & IO_NORMAL) != 0) {
5180		oldsize = ip->i_size;
5181		ip->i_size = 0;
5182		DIP_SET(ip, i_size, 0);
5183		freeblks->fb_chkcnt = datablocks;
5184		for (i = 0; i < NDADDR; i++) {
5185			blkno = DIP(ip, i_db[i]);
5186			DIP_SET(ip, i_db[i], 0);
5187			if (blkno == 0)
5188				continue;
5189			frags = sblksize(fs, oldsize, i);
5190			frags = numfrags(fs, frags);
5191			newfreework(freeblks, NULL, i, blkno, frags, needj);
5192		}
5193		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
5194		    i++, tmpval *= NINDIR(fs)) {
5195			blkno = DIP(ip, i_ib[i]);
5196			DIP_SET(ip, i_ib[i], 0);
5197			if (blkno)
5198				newfreework(freeblks, NULL, -lbn - i, blkno,
5199				    fs->fs_frag, needj);
5200			lbn += tmpval;
5201		}
5202		/*
5203		 * If the file was removed, then the space being freed was
5204		 * accounted for then (see softdep_releasefile()). If the
5205		 * file is merely being truncated, then we account for it now.
5206		 */
5207		if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
5208			UFS_LOCK(ip->i_ump);
5209			fs->fs_pendingblocks += datablocks;
5210			UFS_UNLOCK(ip->i_ump);
5211		}
5212	}
5213	if ((flags & IO_EXT) != 0) {
5214		oldextsize = ip->i_din2->di_extsize;
5215		ip->i_din2->di_extsize = 0;
5216		freeblks->fb_chkcnt += extblocks;
5217		for (i = 0; i < NXADDR; i++) {
5218			blkno = ip->i_din2->di_extb[i];
5219			ip->i_din2->di_extb[i] = 0;
5220			if (blkno == 0)
5221				continue;
5222			frags = sblksize(fs, oldextsize, i);
5223			frags = numfrags(fs, frags);
5224			newfreework(freeblks, NULL, -1 - i, blkno, frags,
5225			    needj);
5226		}
5227	}
5228	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
5229		needj = 0;
5230	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
5231	/*
5232	 * Push the zero'ed inode to its disk buffer so that we are free
5233	 * to delete its dependencies below. Once the dependencies are gone
5234	 * the buffer can be safely released.
5235	 */
5236	if ((error = bread(ip->i_devvp,
5237	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
5238	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
5239		brelse(bp);
5240		softdep_error("softdep_setup_freeblocks", error);
5241	}
5242	if (ip->i_ump->um_fstype == UFS1) {
5243		dp1 = ((struct ufs1_dinode *)bp->b_data +
5244		    ino_to_fsbo(fs, ip->i_number));
5245		ip->i_din1->di_freelink = dp1->di_freelink;
5246		*dp1 = *ip->i_din1;
5247	} else {
5248		dp2 = ((struct ufs2_dinode *)bp->b_data +
5249		    ino_to_fsbo(fs, ip->i_number));
5250		ip->i_din2->di_freelink = dp2->di_freelink;
5251		*dp2 = *ip->i_din2;
5252	}
5253	/*
5254	 * Find and eliminate any inode dependencies.
5255	 */
5256	ACQUIRE_LOCK(&lk);
5257	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5258	if ((inodedep->id_state & IOSTARTED) != 0)
5259		panic("softdep_setup_freeblocks: inode busy");
5260	/*
5261	 * Add the freeblks structure to the list of operations that
5262	 * must await the zero'ed inode being written to disk. If we
5263	 * still have a bitmap dependency (delay == 0), then the inode
5264	 * has never been written to disk, so we can process the
5265	 * freeblks below once we have deleted the dependencies.
5266	 */
5267	delay = (inodedep->id_state & DEPCOMPLETE);
5268	if (delay)
5269		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
5270	else if (needj)
5271		freeblks->fb_state |= DEPCOMPLETE | COMPLETE;
5272	/*
5273	 * Because the file length has been truncated to zero, any
5274	 * pending block allocation dependency structures associated
5275	 * with this inode are obsolete and can simply be de-allocated.
5276	 * We must first merge the two dependency lists to get rid of
5277	 * any duplicate freefrag structures, then purge the merged list.
5278	 * If we still have a bitmap dependency, then the inode has never
5279	 * been written to disk, so we can free any fragments without delay.
5280	 */
5281	if (flags & IO_NORMAL) {
5282		merge_inode_lists(&inodedep->id_newinoupdt,
5283		    &inodedep->id_inoupdt);
5284		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
5285			cancel_allocdirect(&inodedep->id_inoupdt, adp,
5286			    freeblks, delay);
5287	}
5288	if (flags & IO_EXT) {
5289		merge_inode_lists(&inodedep->id_newextupdt,
5290		    &inodedep->id_extupdt);
5291		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
5292			cancel_allocdirect(&inodedep->id_extupdt, adp,
5293			    freeblks, delay);
5294	}
5295	LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
5296		add_to_journal(&jfreeblk->jf_list);
5297
5298	FREE_LOCK(&lk);
5299	bdwrite(bp);
5300	/*
5301	 * We must wait for any I/O in progress to finish so that
5302	 * all potential buffers on the dirty list will be visible.
5303	 * Once they are all there, walk the list and get rid of
5304	 * any dependencies.
5305	 */
5306	vp = ITOV(ip);
5307	bo = &vp->v_bufobj;
5308	BO_LOCK(bo);
5309	drain_output(vp);
5310restart:
5311	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
5312		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
5313		    ((flags & IO_NORMAL) == 0 &&
5314		      (bp->b_xflags & BX_ALTDATA) == 0))
5315			continue;
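		/* getdirtybuf() may sleep; rescan if the buffer went away. */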
5316		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
5317			goto restart;
5318		BO_UNLOCK(bo);
5319		ACQUIRE_LOCK(&lk);
5320		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
5321		if (deallocate_dependencies(bp, inodedep, freeblks))
5322			bp->b_flags |= B_INVAL | B_NOCACHE;
5323		FREE_LOCK(&lk);
5324		brelse(bp);
5325		BO_LOCK(bo);
5326		goto restart;
5327	}
5328	BO_UNLOCK(bo);
5329	ACQUIRE_LOCK(&lk);
5330	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
5331		(void) free_inodedep(inodedep);
5332
5333	if (delay) {
5334		freeblks->fb_state |= DEPCOMPLETE;
5335		/*
5336		 * If the inode with zeroed block pointers is now on disk
5337		 * we can start freeing blocks. Add freeblks to the worklist
5338		 * instead of calling handle_workitem_freeblocks directly as
5339		 * it is more likely that additional IO is needed to complete
5340		 * the request here than in the !delay case.
5341		 */
5342		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
5343			add_to_worklist(&freeblks->fb_list, 1);
5344	}
5345
5346	FREE_LOCK(&lk);
5347	/*
5348	 * If the inode has never been written to disk (delay == 0) and
5349	 * we're not waiting on any journal writes, then we can process the
5350	 * freeblks now that we have deleted the dependencies.
5351	 */
5352	if (!delay && !needj)
5353		handle_workitem_freeblocks(freeblks, 0);
5354}
5355
5356/*
5357 * Reclaim any dependency structures from a buffer that is about to
5358 * be reallocated to a new vnode. The buffer must be locked, thus
5359 * no I/O completion operations can occur while we are manipulating
5360 * its associated dependencies. The mutex is held so that other I/O's
5361 * associated with related dependencies do not occur.  Returns 1 if
5362 * all dependencies were cleared, 0 otherwise.
5363 */
5364static int
5365deallocate_dependencies(bp, inodedep, freeblks)
5366	struct buf *bp;
5367	struct inodedep *inodedep;
5368	struct freeblks *freeblks;
5369{
5370	struct worklist *wk;
5371	struct indirdep *indirdep;
5372	struct newdirblk *newdirblk;
5373	struct allocindir *aip;
5374	struct pagedep *pagedep;
5375	struct jremref *jremref;
5376	struct jmvref *jmvref;
5377	struct dirrem *dirrem;
5378	int i;
5379
5380	mtx_assert(&lk, MA_OWNED);
5381	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
5382		switch (wk->wk_type) {
5383
5384		case D_INDIRDEP:
5385			indirdep = WK_INDIRDEP(wk);
5386			if (bp->b_lblkno >= 0 ||
5387			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
5388				panic("deallocate_dependencies: not indir");
5389			cancel_indirdep(indirdep, bp, inodedep, freeblks);
5390			continue;
5391
5392		case D_PAGEDEP:
5393			pagedep = WK_PAGEDEP(wk);
5394			/*
5395			 * There should be no directory add dependencies present
5396			 * as the directory could not be truncated until all
5397			 * children were removed.
5398			 */
5399			KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
5400			    ("deallocate_dependencies: pendinghd != NULL"));
5401			for (i = 0; i < DAHASHSZ; i++)
5402				KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
5403				    ("deallocate_dependencies: diraddhd != NULL"));
5404			/*
5405			 * Copy any directory remove dependencies to the list
5406			 * to be processed after the zero'ed inode is written.
5407			 * If the inode has already been written, then they
5408			 * can be dumped directly onto the work list.
5409			 */
5410			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
5411				/*
5412				 * If there are any dirrems we wait for
5413				 * the journal write to complete and
5414				 * then restart the buf scan as the lock
5415				 * has been dropped.
5416				 */
5417				while ((jremref =
5418				    LIST_FIRST(&dirrem->dm_jremrefhd))
5419				    != NULL) {
5420					stat_jwait_filepage++;
5421					jwait(&jremref->jr_list);
5422					return (0);
5423				}
5424				LIST_REMOVE(dirrem, dm_next);
5425				dirrem->dm_dirinum = pagedep->pd_ino;
5426				if (inodedep == NULL ||
5427				    (inodedep->id_state & ALLCOMPLETE) ==
5428				     ALLCOMPLETE) {
5429					dirrem->dm_state |= COMPLETE;
5430					add_to_worklist(&dirrem->dm_list, 0);
5431				} else
5432					WORKLIST_INSERT(&inodedep->id_bufwait,
5433					    &dirrem->dm_list);
5434			}
5435			if ((pagedep->pd_state & NEWBLOCK) != 0) {
5436				newdirblk = pagedep->pd_newdirblk;
5437				WORKLIST_REMOVE(&newdirblk->db_list);
5438				free_newdirblk(newdirblk);
5439			}
5440			while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
5441			    != NULL) {
5442				stat_jwait_filepage++;
5443				jwait(&jmvref->jm_list);
5444				return (0);
5445			}
5446			WORKLIST_REMOVE(&pagedep->pd_list);
5447			LIST_REMOVE(pagedep, pd_hash);
5448			WORKITEM_FREE(pagedep, D_PAGEDEP);
5449			continue;
5450
5451		case D_ALLOCINDIR:
5452			aip = WK_ALLOCINDIR(wk);
5453			cancel_allocindir(aip, inodedep, freeblks);
5454			continue;
5455
5456		case D_ALLOCDIRECT:
5457		case D_INODEDEP:
5458			panic("deallocate_dependencies: Unexpected type %s",
5459			    TYPENAME(wk->wk_type));
5460			/* NOTREACHED */
5461
5462		default:
5463			panic("deallocate_dependencies: Unknown type %s",
5464			    TYPENAME(wk->wk_type));
5465			/* NOTREACHED */
5466		}
5467	}
5468
5469	return (1);
5470}
5471
5472/*
5473 * An allocdirect is being canceled due to a truncate.  We must make sure
5474 * the journal entry is released in concert with the blkfree that releases
5475 * the storage.  Completed journal entries must not be released until the
5476 * space is no longer pointed to by the inode or in the bitmap.
5477 */
5478static void
5479cancel_allocdirect(adphead, adp, freeblks, delay)
5480	struct allocdirectlst *adphead;
5481	struct allocdirect *adp;
5482	struct freeblks *freeblks;
5483	int delay;
5484{
5485	struct freework *freework;
5486	struct newblk *newblk;
5487	struct worklist *wk;
5488	ufs_lbn_t lbn;
5489
5490	TAILQ_REMOVE(adphead, adp, ad_next);
5491	newblk = (struct newblk *)adp;
5492	/*
5493	 * If the journal hasn't been written, the jnewblk must be passed
5494	 * to the call to ffs_blkfree() that reclaims the space.  We accomplish
5495	 * this by linking the journal dependency into the freework to be
5496	 * freed when freework_freeblock() is called.  If the journal has
5497	 * been written we can simply reclaim the journal space when the
5498	 * freeblks work is complete.
5499	 */
5500	if (newblk->nb_jnewblk == NULL) {
5501		cancel_newblk(newblk, &freeblks->fb_jwork);
5502		goto found;
5503	}
5504	lbn = newblk->nb_jnewblk->jn_lbn;
5505	/*
5506	 * Find the correct freework structure so it releases the canceled
5507	 * journal when the bitmap is cleared.  This preserves rollback
5508	 * until the allocation is reverted.
5509	 */
5510	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
5511		freework = WK_FREEWORK(wk);
5512		if (freework->fw_lbn != lbn)
5513			continue;
5514		cancel_newblk(newblk, &freework->fw_jwork);
5515		goto found;
5516	}
5517	panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
5518found:
5519	if (delay)
5520		WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
5521		    &newblk->nb_list);
5522	else
5523		free_newblk(newblk);
5524	return;
5525}
5526
5527
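/*
 * Cancel a newblk which is being released by a truncate.  Indirect
 * dependencies that never made it to disk are discarded immediately and
 * any pending journal work is passed to wkhd to be completed with the
 * block free.
 */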
5528static void
5529cancel_newblk(newblk, wkhd)
5530	struct newblk *newblk;
5531	struct workhead *wkhd;
5532{
5533	struct indirdep *indirdep;
5534	struct allocindir *aip;
5535
5536	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5537		indirdep->ir_state &= ~ONDEPLIST;
5538		LIST_REMOVE(indirdep, ir_next);
5539		/*
5540		 * If an indirdep is not on the buf worklist we need to
5541		 * free it here as deallocate_dependencies() will never
5542		 * find it.  These pointers were never visible on disk and
5543		 * can be discarded immediately.
5544		 */
5545		while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5546			LIST_REMOVE(aip, ai_next);
5547			cancel_newblk(&aip->ai_block, wkhd);
5548			free_newblk(&aip->ai_block);
5549		}
5550		/*
5551		 * If this indirdep is not attached to a buf it was simply
5552		 * waiting on completion to clear completehd.  free_indirdep()
5553		 * asserts that nothing is dangling.
5554		 */
5555		if ((indirdep->ir_state & ONWORKLIST) == 0)
5556			free_indirdep(indirdep);
5557	}
5558	if (newblk->nb_state & ONDEPLIST) {
5559		newblk->nb_state &= ~ONDEPLIST;
5560		LIST_REMOVE(newblk, nb_deps);
5561	}
5562	if (newblk->nb_state & ONWORKLIST)
5563		WORKLIST_REMOVE(&newblk->nb_list);
5564	/*
5565	 * If the journal entry hasn't been written we hold onto the dep
5566	 * until it is safe to free along with the other journal work.
5567	 */
5568	if (newblk->nb_jnewblk != NULL) {
5569		cancel_jnewblk(newblk->nb_jnewblk, wkhd);
5570		newblk->nb_jnewblk = NULL;
5571	}
5572	if (!LIST_EMPTY(&newblk->nb_jwork))
5573		jwork_move(wkhd, &newblk->nb_jwork);
5574}
5575
5576/*
5577 * Free a newblk. Generate a new freefrag work request if appropriate.
5578 * This must be called after the inode pointer and any direct block pointers
5579 * are valid or fully removed via truncate or frag extension.
5580 */
5581static void
5582free_newblk(newblk)
5583	struct newblk *newblk;
5584{
5585	struct indirdep *indirdep;
5586	struct newdirblk *newdirblk;
5587	struct freefrag *freefrag;
5588	struct worklist *wk;
5589
5590	mtx_assert(&lk, MA_OWNED);
5591	if (newblk->nb_state & ONDEPLIST)
5592		LIST_REMOVE(newblk, nb_deps);
5593	if (newblk->nb_state & ONWORKLIST)
5594		WORKLIST_REMOVE(&newblk->nb_list);
5595	LIST_REMOVE(newblk, nb_hash);
5596	if ((freefrag = newblk->nb_freefrag) != NULL) {
5597		freefrag->ff_state |= COMPLETE;
5598		if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
5599			add_to_worklist(&freefrag->ff_list, 0);
5600	}
5601	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
5602		newdirblk = WK_NEWDIRBLK(wk);
5603		WORKLIST_REMOVE(&newdirblk->db_list);
5604		if (!LIST_EMPTY(&newblk->nb_newdirblk))
5605			panic("free_newblk: extra newdirblk");
5606		free_newdirblk(newdirblk);
5607	}
5608	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5609		indirdep->ir_state |= DEPCOMPLETE;
5610		indirdep_complete(indirdep);
5611	}
5612	KASSERT(newblk->nb_jnewblk == NULL,
5613	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
5614	handle_jwork(&newblk->nb_jwork);
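	/*
	 * The newblk may have been converted to an allocdirect or
	 * allocindir; restore the base type before freeing it.
	 */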
5615	newblk->nb_list.wk_type = D_NEWBLK;
5616	WORKITEM_FREE(newblk, D_NEWBLK);
5617}
5618
5619/*
5620 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
5621 * This routine must be called with splbio interrupts blocked.
5622 */
5623static void
5624free_newdirblk(newdirblk)
5625	struct newdirblk *newdirblk;
5626{
5627	struct pagedep *pagedep;
5628	struct diradd *dap;
5629	struct worklist *wk;
5630	int i;
5631
5632	mtx_assert(&lk, MA_OWNED);
5633	/*
5634	 * If the pagedep is still linked onto the directory buffer
5635	 * dependency chain, then some of the entries on the
5636	 * pd_pendinghd list may not be committed to disk yet. In
5637	 * this case, we will simply clear the NEWBLOCK flag and
5638	 * let the pd_pendinghd list be processed when the pagedep
5639	 * is next written. If the pagedep is no longer on the buffer
5640	 * dependency chain, then all the entries on the pd_pendinghd
5641	 * list are committed to disk and we can free them here.
5642	 */
5643	pagedep = newdirblk->db_pagedep;
5644	pagedep->pd_state &= ~NEWBLOCK;
5645	if ((pagedep->pd_state & ONWORKLIST) == 0)
5646		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
5647			free_diradd(dap, NULL);
5648	/*
5649	 * If no dependencies remain, the pagedep will be freed.
5650	 */
5651	for (i = 0; i < DAHASHSZ; i++)
5652		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
5653			break;
5654	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
5655	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
5656		KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
5657		    ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
5658		LIST_REMOVE(pagedep, pd_hash);
5659		WORKITEM_FREE(pagedep, D_PAGEDEP);
5660	}
5661	/* Should only ever be one item in the list. */
5662	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
5663		WORKLIST_REMOVE(wk);
5664		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
5665	}
5666	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
5667}
5668
5669/*
5670 * Prepare an inode to be freed. The actual free operation is not
5671 * done until the zero'ed inode has been written to disk.
5672 */
5673void
5674softdep_freefile(pvp, ino, mode)
5675	struct vnode *pvp;
5676	ino_t ino;
5677	int mode;
5678{
5679	struct inode *ip = VTOI(pvp);
5680	struct inodedep *inodedep;
5681	struct freefile *freefile;
5682
5683	/*
5684	 * This sets up the inode de-allocation dependency.
5685	 */
5686	freefile = malloc(sizeof(struct freefile),
5687		M_FREEFILE, M_SOFTDEP_FLAGS);
5688	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
5689	freefile->fx_mode = mode;
5690	freefile->fx_oldinum = ino;
5691	freefile->fx_devvp = ip->i_devvp;
5692	LIST_INIT(&freefile->fx_jwork);
5693	if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
5694		UFS_LOCK(ip->i_ump);
5695		ip->i_fs->fs_pendinginodes += 1;
5696		UFS_UNLOCK(ip->i_ump);
5697	}
5698
5699	/*
5700	 * If the inodedep does not exist, then the zero'ed inode has
5701	 * been written to disk. If the allocated inode has never been
5702	 * written to disk, then the on-disk inode is zero'ed. In either
5703	 * case we can free the file immediately.  If the journal was
5704	 * canceled before being written the inode will never make it to
5705	 * disk and we must send the canceled journal entrys to
5706	 * ffs_freefile() to be cleared in conjunction with the bitmap.
5707	 * Any blocks waiting on the inode to write can be safely freed
5708	 * here as it will never be written.
5709	 */
5710	ACQUIRE_LOCK(&lk);
5711	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5712	/*
5713	 * Remove this inode from the unlinked list and set
5714	 * GOINGAWAY as appropriate to indicate that this inode
5715	 * will never be written.
5716	 */
5717	if (inodedep && inodedep->id_state & UNLINKED) {
5718		/*
5719		 * Save the journal work to be freed with the bitmap
5720		 * before we clear UNLINKED.  Otherwise it can be lost
5721		 * if the inode block is written.
5722		 */
5723		handle_bufwait(inodedep, &freefile->fx_jwork);
5724		clear_unlinked_inodedep(inodedep);
5725		/* Re-acquire inodedep as we've dropped lk. */
5726		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5727		if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
5728			inodedep->id_state |= GOINGAWAY;
5729	}
5730	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
5731		FREE_LOCK(&lk);
5732		handle_workitem_freefile(freefile);
5733		return;
5734	}
5735	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
5736	FREE_LOCK(&lk);
5737	if (ip->i_number == ino)
5738		ip->i_flag |= IN_MODIFIED;
5739}
5740
5741/*
5742 * Check to see if an inode has never been written to disk. If
5743 * so free the inodedep and return success, otherwise return failure.
5744 * This routine must be called with splbio interrupts blocked.
5745 *
5746 * If we still have a bitmap dependency, then the inode has never
5747 * been written to disk. Drop the dependency as it is no longer
5748 * necessary since the inode is being deallocated. We set the
5749 * ALLCOMPLETE flags since the bitmap now properly shows that the
5750 * inode is not allocated. Even if the inode is actively being
5751 * written, it has been rolled back to its zero'ed state, so we
5752 * are ensured that a zero inode is what is on the disk. For short
5753 * lived files, this change will usually result in removing all the
5754 * dependencies from the inode so that it can be freed immediately.
5755 */
5756static int
5757check_inode_unwritten(inodedep)
5758	struct inodedep *inodedep;
5759{
5760
5761	mtx_assert(&lk, MA_OWNED);
5762
5763	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
5764	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
5765	    !LIST_EMPTY(&inodedep->id_bufwait) ||
5766	    !LIST_EMPTY(&inodedep->id_inowait) ||
5767	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5768	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
5769	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5770	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5771	    inodedep->id_mkdiradd != NULL ||
5772	    inodedep->id_nlinkdelta != 0)
5773		return (0);
5774	/*
5775	 * Another process might be in initiate_write_inodeblock_ufs[12]
5776	 * trying to allocate memory without holding "Softdep Lock".
5777	 */
5778	if ((inodedep->id_state & IOSTARTED) != 0 &&
5779	    inodedep->id_savedino1 == NULL)
5780		return (0);
5781
5782	if (inodedep->id_state & ONDEPLIST)
5783		LIST_REMOVE(inodedep, id_deps);
5784	inodedep->id_state &= ~ONDEPLIST;
5785	inodedep->id_state |= ALLCOMPLETE;
5786	inodedep->id_bmsafemap = NULL;
5787	if (inodedep->id_state & ONWORKLIST)
5788		WORKLIST_REMOVE(&inodedep->id_list);
5789	if (inodedep->id_savedino1 != NULL) {
5790		free(inodedep->id_savedino1, M_SAVEDINO);
5791		inodedep->id_savedino1 = NULL;
5792	}
5793	if (free_inodedep(inodedep) == 0)
5794		panic("check_inode_unwritten: busy inode");
5795	return (1);
5796}
5797
5798/*
5799 * Try to free an inodedep structure. Return 1 if it could be freed.
5800 */
5801static int
5802free_inodedep(inodedep)
5803	struct inodedep *inodedep;
5804{
5805
5806	mtx_assert(&lk, MA_OWNED);
5807	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
5808	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
5809	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
5810	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
5811	    !LIST_EMPTY(&inodedep->id_bufwait) ||
5812	    !LIST_EMPTY(&inodedep->id_inowait) ||
5813	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
5814	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5815	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
5816	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5817	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5818	    inodedep->id_mkdiradd != NULL ||
5819	    inodedep->id_nlinkdelta != 0 ||
5820	    inodedep->id_savedino1 != NULL)
5821		return (0);
5822	if (inodedep->id_state & ONDEPLIST)
5823		LIST_REMOVE(inodedep, id_deps);
5824	LIST_REMOVE(inodedep, id_hash);
5825	WORKITEM_FREE(inodedep, D_INODEDEP);
5826	num_inodedep -= 1;
5827	return (1);
5828}
5829
5830/*
5831 * Free the block referenced by a freework structure.  The parent freeblks
5832 * structure is released and completed when the final cg bitmap reaches
5833 * the disk.  This routine may be freeing a jnewblk which never made it to
5834 * disk in which case we do not have to wait as the operation is undone
5835 * in memory immediately.
5836 */
5837static void
5838freework_freeblock(freework)
5839	struct freework *freework;
5840{
5841	struct freeblks *freeblks;
5842	struct ufsmount *ump;
5843	struct workhead wkhd;
5844	struct fs *fs;
5845	int complete;
5846	int pending;
5847	int bsize;
5848	int needj;
5849
5850	freeblks = freework->fw_freeblks;
5851	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5852	fs = ump->um_fs;
5853	needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
5854	complete = 0;
5855	LIST_INIT(&wkhd);
5856	/*
5857	 * If we are canceling an existing jnewblk pass it to the free
5858	 * routine, otherwise pass the freeblk which will ultimately
5859	 * release the freeblks.  If we're not journaling, we can just
5860	 * free the freeblks immediately.
5861	 */
5862	if (!LIST_EMPTY(&freework->fw_jwork)) {
5863		LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
5864		complete = 1;
5865	} else if (needj)
5866		WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
5867	bsize = lfragtosize(fs, freework->fw_frags);
5868	pending = btodb(bsize);
5869	ACQUIRE_LOCK(&lk);
5870	freeblks->fb_chkcnt -= pending;
5871	FREE_LOCK(&lk);
5872	/*
5873	 * extattr blocks don't show up in pending blocks.  XXX why?
5874	 */
5875	if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
5876		UFS_LOCK(ump);
5877		fs->fs_pendingblocks -= pending;
5878		UFS_UNLOCK(ump);
5879	}
5880	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
5881	    bsize, freeblks->fb_previousinum, &wkhd);
5882	if (complete == 0 && needj)
5883		return;
5884	/*
5885	 * The jnewblk will be discarded and the bits in the map never
5886	 * made it to disk.  We can immediately free the freeblk.
5887	 */
5888	ACQUIRE_LOCK(&lk);
5889	handle_written_freework(freework);
5890	FREE_LOCK(&lk);
5891}
5892
5893/*
5894 * Start, continue, or finish the process of freeing an indirect block tree.
5895 * The free operation may be paused at any point with fw_off containing the
5896 * offset to restart from.  This enables us to implement some flow control
5897 * for large truncates which may fan out and generate a huge number of
5898 * dependencies.
5899 */
5900static void
5901handle_workitem_indirblk(freework)
5902	struct freework *freework;
5903{
5904	struct freeblks *freeblks;
5905	struct ufsmount *ump;
5906	struct fs *fs;
5907
5908
5909	freeblks = freework->fw_freeblks;
5910	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5911	fs = ump->um_fs;
5912	if (freework->fw_off == NINDIR(fs))
5913		freework_freeblock(freework);
5914	else
5915		indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
5916		    freework->fw_lbn);
5917}
5918
5919/*
5920 * Called when a freework structure attached to a cg buf is written.  The
5921 * ref on either the parent or the freeblks structure is released and
5922 * either may be added to the worklist if it is the final ref.
5923 */
5924static void
5925handle_written_freework(freework)
5926	struct freework *freework;
5927{
5928	struct freeblks *freeblks;
5929	struct freework *parent;
5930
5931	freeblks = freework->fw_freeblks;
5932	parent = freework->fw_parent;
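	/*
	 * Release our reference on the parent indirect if there is one,
	 * otherwise on the freeblks.  Whichever drops to zero is queued
	 * below.
	 */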
5933	if (parent) {
5934		if (--parent->fw_ref != 0)
5935			parent = NULL;
5936		freeblks = NULL;
5937	} else if (--freeblks->fb_ref != 0)
5938		freeblks = NULL;
5939	WORKITEM_FREE(freework, D_FREEWORK);
5940	/*
5941	 * Don't delay these block frees or it takes an intolerable amount
5942	 * of time to process truncates and free their journal entries.
5943	 */
5944	if (freeblks)
5945		add_to_worklist(&freeblks->fb_list, 1);
5946	if (parent)
5947		add_to_worklist(&parent->fw_list, 1);
5948}
5949
5950/*
5951 * This workitem routine performs the block de-allocation.
5952 * The workitem is added to the pending list after the updated
5953 * inode block has been written to disk.  Checks regarding the
5954 * number of blocks de-allocated (compared to the number of blocks
5955 * allocated for the file) are performed once the freework items
5956 * complete, in handle_complete_freeblocks().
5957 */
5958static void
5959handle_workitem_freeblocks(freeblks, flags)
5960	struct freeblks *freeblks;
5961	int flags;
5962{
5963	struct freework *freework;
5964	struct worklist *wk;
5965
5966	KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
5967	    ("handle_workitem_freeblocks: Journal entries not written."));
5968	if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
5969		handle_complete_freeblocks(freeblks);
5970		return;
5971	}
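	/* Hold a reference so the freeblks is not completed while we work. */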
5972	freeblks->fb_ref++;
5973	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
5974		KASSERT(wk->wk_type == D_FREEWORK,
5975		    ("handle_workitem_freeblocks: Unknown type %s",
5976		    TYPENAME(wk->wk_type)));
5977		WORKLIST_REMOVE_UNLOCKED(wk);
5978		freework = WK_FREEWORK(wk);
5979		if (freework->fw_lbn <= -NDADDR)
5980			handle_workitem_indirblk(freework);
5981		else
5982			freework_freeblock(freework);
5983	}
5984	ACQUIRE_LOCK(&lk);
5985	if (--freeblks->fb_ref != 0)
5986		freeblks = NULL;
5987	FREE_LOCK(&lk);
5988	if (freeblks)
5989		handle_complete_freeblocks(freeblks);
5990}
5991
5992/*
5993 * Once all of the freework workitems are complete we can retire the
5994 * freeblocks dependency and any journal work awaiting completion.  This
5995 * cannot be called until all other dependencies are stable on disk.
5996 */
5997static void
5998handle_complete_freeblocks(freeblks)
5999	struct freeblks *freeblks;
6000{
6001	struct inode *ip;
6002	struct vnode *vp;
6003	struct fs *fs;
6004	struct ufsmount *ump;
6005	int flags;
6006
6007	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6008	fs = ump->um_fs;
6009	flags = LK_NOWAIT;
6010
6011	/*
6012	 * If we still have not finished background cleanup, then check
6013	 * to see if the block count needs to be adjusted.
6014	 */
6015	if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
6016	    ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
6017	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
6018		ip = VTOI(vp);
6019		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
6020		ip->i_flag |= IN_CHANGE;
6021		vput(vp);
6022	}
6023
6024#ifdef INVARIANTS
6025	if (freeblks->fb_chkcnt != 0 &&
6026	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
6027		printf("handle_workitem_freeblocks: block count\n");
6028#endif /* INVARIANTS */
6029
6030	ACQUIRE_LOCK(&lk);
6031	/*
6032	 * All of the freeblock deps must be complete prior to this call
6033	 * so it's now safe to complete earlier outstanding journal entries.
6034	 */
6035	handle_jwork(&freeblks->fb_jwork);
6036	WORKITEM_FREE(freeblks, D_FREEBLKS);
6037	num_freeblkdep--;
6038	FREE_LOCK(&lk);
6039}
6040
6041/*
6042 * Release blocks associated with the inode ip and stored in the indirect
6043 * block dbn. If level is greater than SINGLE, the block is an indirect block
6044 * and recursive calls to indirtrunc must be used to cleanse other indirect
6045 * blocks.
6046 */
6047static void
6048indir_trunc(freework, dbn, lbn)
6049	struct freework *freework;
6050	ufs2_daddr_t dbn;
6051	ufs_lbn_t lbn;
6052{
6053	struct freework *nfreework;
6054	struct workhead wkhd;
6055	struct jnewblk *jnewblk;
6056	struct freeblks *freeblks;
6057	struct buf *bp;
6058	struct fs *fs;
6059	struct worklist *wkn;
6060	struct worklist *wk;
6061	struct indirdep *indirdep;
6062	struct ufsmount *ump;
6063	ufs1_daddr_t *bap1 = 0;
6064	ufs2_daddr_t nb, nnb, *bap2 = 0;
6065	ufs_lbn_t lbnadd;
6066	int i, nblocks, ufs1fmt;
6067	int fs_pendingblocks;
6068	int freedeps;
6069	int needj;
6070	int level;
6071	int cnt;
6072
6073	LIST_INIT(&wkhd);
6074	level = lbn_level(lbn);
6075	if (level == -1)
6076		panic("indir_trunc: Invalid lbn %jd\n", lbn);
6077	freeblks = freework->fw_freeblks;
6078	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6079	fs = ump->um_fs;
6080	fs_pendingblocks = 0;
6081	freedeps = 0;
6082	needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
6083	lbnadd = 1;
6084	for (i = level; i > 0; i--)
6085		lbnadd *= NINDIR(fs);
6086	/*
6087	 * Get buffer of block pointers to be freed. This routine is not
6088	 * called until the zero'ed inode has been written, so it is safe
6089	 * to free blocks as they are encountered. Because the inode has
6090	 * been zero'ed, calls to bmap on these blocks will fail. So, we
6091	 * have to use the on-disk address and the block device for the
6092	 * filesystem to look them up. If the file was deleted before its
6093	 * indirect blocks were all written to disk, the routine that set
6094	 * us up (deallocate_dependencies) will have arranged to leave
6095	 * a complete copy of the indirect block in memory for our use.
6096	 * Otherwise we have to read the blocks in from the disk.
6097	 */
6098#ifdef notyet
6099	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
6100	    GB_NOCREAT);
6101#else
6102	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
6103#endif
6104	ACQUIRE_LOCK(&lk);
6105	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
6106		if (wk->wk_type != D_INDIRDEP ||
6107		    (wk->wk_state & GOINGAWAY) == 0)
6108			panic("indir_trunc: lost indirdep %p", wk);
6109		indirdep = WK_INDIRDEP(wk);
6110		LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
6111		free_indirdep(indirdep);
6112		if (!LIST_EMPTY(&bp->b_dep))
6113			panic("indir_trunc: dangling dep %p",
6114			    LIST_FIRST(&bp->b_dep));
6115		ump->um_numindirdeps -= 1;
6116		FREE_LOCK(&lk);
6117	} else {
6118#ifdef notyet
6119		if (bp)
6120			brelse(bp);
6121#endif
6122		FREE_LOCK(&lk);
6123		if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
6124		    NOCRED, &bp) != 0) {
6125			brelse(bp);
6126			return;
6127		}
6128	}
6129	/*
6130	 * Recursively free indirect blocks.
6131	 */
6132	if (ump->um_fstype == UFS1) {
6133		ufs1fmt = 1;
6134		bap1 = (ufs1_daddr_t *)bp->b_data;
6135	} else {
6136		ufs1fmt = 0;
6137		bap2 = (ufs2_daddr_t *)bp->b_data;
6138	}
6139	/*
6140	 * Reclaim blocks which never made it to disk.
6141	 */
6142	cnt = 0;
6143	LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
6144		struct workhead freewk;
6145		if (wk->wk_type != D_JNEWBLK)
6146			continue;
6147		WORKLIST_REMOVE_UNLOCKED(wk);
6148		LIST_INIT(&freewk);
6149		WORKLIST_INSERT_UNLOCKED(&freewk, wk);
6150		jnewblk = WK_JNEWBLK(wk);
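		/*
		 * Compute the index of this canceled block within the
		 * parent indirect so its pointer can be cleared below.
		 */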
6151		if (jnewblk->jn_lbn > 0)
6152			i = (jnewblk->jn_lbn - -lbn) / lbnadd;
6153		else
6154			i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd;
6155		KASSERT(i >= 0 && i < NINDIR(fs),
6156		    ("indir_trunc: Index out of range %d parent %jd lbn %jd",
6157		    i, lbn, jnewblk->jn_lbn));
6158		/* Clear the pointer so it isn't found below. */
6159		if (ufs1fmt) {
6160			nb = bap1[i];
6161			bap1[i] = 0;
6162		} else {
6163			nb = bap2[i];
6164			bap2[i] = 0;
6165		}
6166		KASSERT(nb == jnewblk->jn_blkno,
6167		    ("indir_trunc: Block mismatch %jd != %jd",
6168		    nb, jnewblk->jn_blkno));
6169		ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno,
6170		    fs->fs_bsize, freeblks->fb_previousinum, &freewk);
6171		cnt++;
6172	}
6173	ACQUIRE_LOCK(&lk);
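	/*
	 * Hold extra references on the freework while the children are
	 * processed; the count is corrected below once the number of
	 * dependencies created is known.
	 */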
6174	if (needj)
6175		freework->fw_ref += NINDIR(fs) + 1;
6176	/* Any remaining journal work can be completed with freeblks. */
6177	jwork_move(&freeblks->fb_jwork, &wkhd);
6178	FREE_LOCK(&lk);
6179	nblocks = btodb(fs->fs_bsize);
6180	if (ufs1fmt)
6181		nb = bap1[0];
6182	else
6183		nb = bap2[0];
6184	nfreework = freework;
6185	/*
6186	 * Reclaim on-disk blocks.
6187	 */
6188	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
6189		if (i != NINDIR(fs) - 1) {
6190			if (ufs1fmt)
6191				nnb = bap1[i+1];
6192			else
6193				nnb = bap2[i+1];
6194		} else
6195			nnb = 0;
6196		if (nb == 0)
6197			continue;
6198		cnt++;
6199		if (level != 0) {
6200			ufs_lbn_t nlbn;
6201
6202			nlbn = (lbn + 1) - (i * lbnadd);
6203			if (needj != 0) {
6204				nfreework = newfreework(freeblks, freework,
6205				    nlbn, nb, fs->fs_frag, 0);
6206				freedeps++;
6207			}
6208			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
6209		} else {
6210			struct freedep *freedep;
6211
6212			/*
6213			 * Attempt to aggregate freedep dependencies for
6214			 * all blocks being released to the same CG.
6215			 */
6216			LIST_INIT(&wkhd);
6217			if (needj != 0 &&
6218			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
6219				freedep = newfreedep(freework);
6220				WORKLIST_INSERT_UNLOCKED(&wkhd,
6221				    &freedep->fd_list);
6222				freedeps++;
6223			}
6224			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
6225			    fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
6226		}
6227	}
6228	if (level == 0)
6229		fs_pendingblocks = (nblocks * cnt);
6230	/*
6231	 * If we're not journaling we can free the indirect now.  Otherwise
6232	 * setup the ref counts and offset so this indirect can be completed
6233	 * when its children are free.
6234	 */
6235	if (needj == 0) {
6236		fs_pendingblocks += nblocks;
6237		dbn = dbtofsb(fs, dbn);
6238		ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
6239		    freeblks->fb_previousinum, NULL);
6240		ACQUIRE_LOCK(&lk);
6241		freeblks->fb_chkcnt -= fs_pendingblocks;
6242		if (freework->fw_blkno == dbn)
6243			handle_written_freework(freework);
6244		FREE_LOCK(&lk);
6245		freework = NULL;
6246	} else {
6247		ACQUIRE_LOCK(&lk);
6248		freework->fw_off = i;
6249		freework->fw_ref += freedeps;
6250		freework->fw_ref -= NINDIR(fs) + 1;
6251		if (freework->fw_ref != 0)
6252			freework = NULL;
6253		freeblks->fb_chkcnt -= fs_pendingblocks;
6254		FREE_LOCK(&lk);
6255	}
6256	if (fs_pendingblocks) {
6257		UFS_LOCK(ump);
6258		fs->fs_pendingblocks -= fs_pendingblocks;
6259		UFS_UNLOCK(ump);
6260	}
6261	bp->b_flags |= B_INVAL | B_NOCACHE;
6262	brelse(bp);
6263	if (freework)
6264		handle_workitem_indirblk(freework);
6265	return;
6266}
6267
6268/*
6269 * Cancel an allocindir when it is removed via truncation.
6270 */
6271static void
6272cancel_allocindir(aip, inodedep, freeblks)
6273	struct allocindir *aip;
6274	struct inodedep *inodedep;
6275	struct freeblks *freeblks;
6276{
6277	struct newblk *newblk;
6278
6279	/*
6280	 * If the journal hasn't been written, the jnewblk must be passed
6281	 * to the call to ffs_blkfree() that reclaims the space.  We accomplish
6282	 * this by linking the journal dependency into the indirdep to be
6283	 * freed when indir_trunc() is called.  If the journal has already
6284	 * been written we can simply reclaim the journal space when the
6285	 * freeblks work is complete.
6286	 */
6287	LIST_REMOVE(aip, ai_next);
6288	newblk = (struct newblk *)aip;
6289	if (newblk->nb_jnewblk == NULL)
6290		cancel_newblk(newblk, &freeblks->fb_jwork);
6291	else
6292		cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
6293	if (inodedep && inodedep->id_state & DEPCOMPLETE)
6294		WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
6295	else
6296		free_newblk(newblk);
6297}
6298
6299/*
6300 * Create the mkdir dependencies for . and .. in a new directory.  Link them
6301 * in to a newdirblk so any subsequent additions are tracked properly.  The
6302 * caller is responsible for adding the mkdir1 dependency to the journal
6303 * and updating id_mkdiradd.  This function returns with lk held.
6304 */
6305static struct mkdir *
6306setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
6307	struct diradd *dap;
6308	ino_t newinum;
6309	ino_t dinum;
6310	struct buf *newdirbp;
6311	struct mkdir **mkdirp;
6312{
6313	struct newblk *newblk;
6314	struct pagedep *pagedep;
6315	struct inodedep *inodedep;
6316	struct newdirblk *newdirblk = 0;
6317	struct mkdir *mkdir1, *mkdir2;
6318	struct worklist *wk;
6319	struct jaddref *jaddref;
6320	struct mount *mp;
6321
6322	mp = dap->da_list.wk_mp;
6323	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
6324	    M_SOFTDEP_FLAGS);
6325	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6326	LIST_INIT(&newdirblk->db_mkdir);
6327	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6328	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
6329	mkdir1->md_state = ATTACHED | MKDIR_BODY;
6330	mkdir1->md_diradd = dap;
6331	mkdir1->md_jaddref = NULL;
6332	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6333	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
6334	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
6335	mkdir2->md_diradd = dap;
6336	mkdir2->md_jaddref = NULL;
6337	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
6338		mkdir1->md_state |= DEPCOMPLETE;
6339		mkdir2->md_state |= DEPCOMPLETE;
6340	}
6341	/*
6342	 * Dependency on "." and ".." being written to disk.
6343	 */
6344	mkdir1->md_buf = newdirbp;
6345	ACQUIRE_LOCK(&lk);
6346	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
6347	/*
6348	 * We must link the pagedep, allocdirect, and newdirblk for
6349	 * the initial file page so the pointer to the new directory
6350	 * is not written until the directory contents are live and
6351	 * any subsequent additions are not marked live until the
6352	 * block is reachable via the inode.
6353	 */
6354	if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
6355		panic("setup_newdir: lost pagedep");
6356	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
6357		if (wk->wk_type == D_ALLOCDIRECT)
6358			break;
6359	if (wk == NULL)
6360		panic("setup_newdir: lost allocdirect");
6361	newblk = WK_NEWBLK(wk);
6362	pagedep->pd_state |= NEWBLOCK;
6363	pagedep->pd_newdirblk = newdirblk;
6364	newdirblk->db_pagedep = pagedep;
6365	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6366	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
6367	/*
6368	 * Look up the inodedep for the parent directory so that we
6369	 * can link mkdir2 into the pending dotdot jaddref or
6370	 * the inode write if there is none.  If the inode is
6371	 * ALLCOMPLETE and no jaddref is present all dependencies have
6372	 * been satisfied and mkdir2 can be freed.
6373	 */
6374	inodedep_lookup(mp, dinum, 0, &inodedep);
6375	if (mp->mnt_kern_flag & MNTK_SUJ) {
6376		if (inodedep == NULL)
6377			panic("setup_newdir: Lost parent.");
6378		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6379		    inoreflst);
6380		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
6381		    (jaddref->ja_state & MKDIR_PARENT),
6382		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
6383		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6384		mkdir2->md_jaddref = jaddref;
6385		jaddref->ja_mkdir = mkdir2;
6386	} else if (inodedep == NULL ||
6387	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
6388		dap->da_state &= ~MKDIR_PARENT;
6389		WORKITEM_FREE(mkdir2, D_MKDIR);
6390	} else {
6391		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6392		WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
6393	}
6394	*mkdirp = mkdir2;
6395
6396	return (mkdir1);
6397}
6398
6399/*
6400 * Directory entry addition dependencies.
6401 *
6402 * When adding a new directory entry, the inode (with its incremented link
6403 * count) must be written to disk before the directory entry's pointer to it.
6404 * Also, if the inode is newly allocated, the corresponding freemap must be
6405 * updated (on disk) before the directory entry's pointer. These requirements
6406 * are met via undo/redo on the directory entry's pointer, which consists
6407 * simply of the inode number.
6408 *
6409 * As directory entries are added and deleted, the free space within a
6410 * directory block can become fragmented.  The ufs filesystem will compact
6411 * a fragmented directory block to make space for a new entry. When this
6412 * occurs, the offsets of previously added entries change. Any "diradd"
6413 * dependency structures corresponding to these entries must be updated with
6414 * the new offsets.
6415 */
6416
6417/*
6418 * This routine is called after the in-memory inode's link
6419 * count has been incremented, but before the directory entry's
6420 * pointer to the inode has been set.
6421 */
6422int
6423softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
6424	struct buf *bp;		/* buffer containing directory block */
6425	struct inode *dp;	/* inode for directory */
6426	off_t diroffset;	/* offset of new entry in directory */
6427	ino_t newinum;		/* inode referenced by new directory entry */
6428	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
6429	int isnewblk;		/* entry is in a newly allocated block */
6430{
6431	int offset;		/* offset of new entry within directory block */
6432	ufs_lbn_t lbn;		/* block in directory containing new entry */
6433	struct fs *fs;
6434	struct diradd *dap;
6435	struct newblk *newblk;
6436	struct pagedep *pagedep;
6437	struct inodedep *inodedep;
6438	struct newdirblk *newdirblk = 0;
6439	struct mkdir *mkdir1, *mkdir2;
6440	struct jaddref *jaddref;
6441	struct mount *mp;
6442	int isindir;
6443
6444	/*
6445	 * Whiteouts have no dependencies.
6446	 */
6447	if (newinum == WINO) {
6448		if (newdirbp != NULL)
6449			bdwrite(newdirbp);
6450		return (0);
6451	}
6452	jaddref = NULL;
6453	mkdir1 = mkdir2 = NULL;
6454	mp = UFSTOVFS(dp->i_ump);
6455	fs = dp->i_fs;
6456	lbn = lblkno(fs, diroffset);
6457	offset = blkoff(fs, diroffset);
6458	dap = malloc(sizeof(struct diradd), M_DIRADD,
6459		M_SOFTDEP_FLAGS|M_ZERO);
6460	workitem_alloc(&dap->da_list, D_DIRADD, mp);
6461	dap->da_offset = offset;
6462	dap->da_newinum = newinum;
6463	dap->da_state = ATTACHED;
6464	LIST_INIT(&dap->da_jwork);
6465	isindir = bp->b_lblkno >= NDADDR;
6466	if (isnewblk &&
6467	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
6468		newdirblk = malloc(sizeof(struct newdirblk),
6469		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
6470		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6471		LIST_INIT(&newdirblk->db_mkdir);
6472	}
6473	/*
6474	 * If we're creating a new directory setup the dependencies and set
6475	 * the dap state to wait for them.  Otherwise it's COMPLETE and
6476	 * we can move on.
6477	 */
6478	if (newdirbp == NULL) {
6479		dap->da_state |= DEPCOMPLETE;
6480		ACQUIRE_LOCK(&lk);
6481	} else {
6482		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
6483		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
6484		    &mkdir2);
6485	}
6486	/*
6487	 * Link into parent directory pagedep to await its being written.
6488	 */
6489	if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
6490		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6491#ifdef DEBUG
6492	if (diradd_lookup(pagedep, offset) != NULL)
6493		panic("softdep_setup_directory_add: %p already at off %d\n",
6494		    diradd_lookup(pagedep, offset), offset);
6495#endif
6496	dap->da_pagedep = pagedep;
6497	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
6498	    da_pdlist);
6499	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
6500	/*
6501	 * If we're journaling, link the diradd into the jaddref so it
6502	 * may be completed after the journal entry is written.  Otherwise,
6503	 * link the diradd into its inodedep.  If the inode is not yet
6504	 * written place it on the bufwait list, otherwise do the post-inode
6505	 * write processing to put it on the id_pendinghd list.
6506	 */
6507	if (mp->mnt_kern_flag & MNTK_SUJ) {
6508		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6509		    inoreflst);
6510		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
6511		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
6512		jaddref->ja_diroff = diroffset;
6513		jaddref->ja_diradd = dap;
6514		add_to_journal(&jaddref->ja_list);
6515	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
6516		diradd_inode_written(dap, inodedep);
6517	else
6518		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
6519	/*
6520	 * Add the journal entries for . and .. links now that the primary
6521	 * link is written.
6522	 */
6523	if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
6524		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
6525		    inoreflst, if_deps);
6526		KASSERT(jaddref != NULL &&
6527		    jaddref->ja_ino == jaddref->ja_parent &&
6528		    (jaddref->ja_state & MKDIR_BODY),
6529		    ("softdep_setup_directory_add: bad dot jaddref %p",
6530		    jaddref));
6531		mkdir1->md_jaddref = jaddref;
6532		jaddref->ja_mkdir = mkdir1;
6533		/*
6534		 * It is important that the dotdot journal entry
6535		 * is added prior to the dot entry since dot writes
6536		 * both the dot and dotdot links.  These both must
6537		 * be added after the primary link for the journal
6538		 * to remain consistent.
6539		 */
6540		add_to_journal(&mkdir2->md_jaddref->ja_list);
6541		add_to_journal(&jaddref->ja_list);
6542	}
6543	/*
6544	 * If we are adding a new directory remember this diradd so that if
6545	 * we rename it we can keep the dot and dotdot dependencies.  If
6546	 * we are adding a new name for an inode that has a mkdiradd we
6547	 * must be in rename and we have to move the dot and dotdot
6548	 * dependencies to this new name.  The old name is being orphaned
6549	 * soon.
6550	 */
6551	if (mkdir1 != NULL) {
6552		if (inodedep->id_mkdiradd != NULL)
6553			panic("softdep_setup_directory_add: Existing mkdir");
6554		inodedep->id_mkdiradd = dap;
6555	} else if (inodedep->id_mkdiradd)
6556		merge_diradd(inodedep, dap);
6557	if (newdirblk) {
6558		/*
6559		 * There is nothing to do if we are already tracking
6560		 * this block.
6561		 */
6562		if ((pagedep->pd_state & NEWBLOCK) != 0) {
6563			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
6564			FREE_LOCK(&lk);
6565			return (0);
6566		}
6567		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
6568		    == 0)
6569			panic("softdep_setup_directory_add: lost entry");
6570		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6571		pagedep->pd_state |= NEWBLOCK;
6572		pagedep->pd_newdirblk = newdirblk;
6573		newdirblk->db_pagedep = pagedep;
6574		FREE_LOCK(&lk);
6575		/*
6576		 * If we extended into an indirect block, signal direnter to sync.
6577		 */
6578		if (isindir)
6579			return (1);
6580		return (0);
6581	}
6582	FREE_LOCK(&lk);
6583	return (0);
6584}
6585
6586/*
6587 * This procedure is called to change the offset of a directory
6588 * entry when compacting a directory block which must be owned
6589 * exclusively by the caller. Note that the actual entry movement
6590 * must be done in this procedure to ensure that no I/O completions
6591 * occur while the move is in progress.
6592 */
6593void
6594softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
6595	struct buf *bp;		/* Buffer holding directory block. */
6596	struct inode *dp;	/* inode for directory */
6597	caddr_t base;		/* address of dp->i_offset */
6598	caddr_t oldloc;		/* address of old directory location */
6599	caddr_t newloc;		/* address of new directory location */
6600	int entrysize;		/* size of directory entry */
6601{
6602	int offset, oldoffset, newoffset;
6603	struct pagedep *pagedep;
6604	struct jmvref *jmvref;
6605	struct diradd *dap;
6606	struct direct *de;
6607	struct mount *mp;
6608	ufs_lbn_t lbn;
6609	int flags;
6610
6611	mp = UFSTOVFS(dp->i_ump);
6612	de = (struct direct *)oldloc;
6613	jmvref = NULL;
6614	flags = 0;
6615	/*
6616	 * Moves are always journaled as it would be too complex to
6617	 * determine if any affected adds or removes are present in the
6618	 * journal.
6619	 */
6620	if (mp->mnt_kern_flag & MNTK_SUJ)  {
6621		flags = DEPALLOC;
6622		jmvref = newjmvref(dp, de->d_ino,
6623		    dp->i_offset + (oldloc - base),
6624		    dp->i_offset + (newloc - base));
6625	}
6626	lbn = lblkno(dp->i_fs, dp->i_offset);
6627	offset = blkoff(dp->i_fs, dp->i_offset);
6628	oldoffset = offset + (oldloc - base);
6629	newoffset = offset + (newloc - base);
6630	ACQUIRE_LOCK(&lk);
6631	if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
6632		if (pagedep)
6633			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6634		goto done;
6635	}
6636	dap = diradd_lookup(pagedep, oldoffset);
6637	if (dap) {
6638		dap->da_offset = newoffset;
6639		newoffset = DIRADDHASH(newoffset);
6640		oldoffset = DIRADDHASH(oldoffset);
6641		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
6642		    newoffset != oldoffset) {
6643			LIST_REMOVE(dap, da_pdlist);
6644			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
6645			    dap, da_pdlist);
6646		}
6647	}
6648done:
6649	if (jmvref) {
6650		jmvref->jm_pagedep = pagedep;
6651		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
6652		add_to_journal(&jmvref->jm_list);
6653	}
6654	bcopy(oldloc, newloc, entrysize);
6655	FREE_LOCK(&lk);
6656}
6657
6658/*
6659 * Move the mkdir dependencies and journal work from one diradd to another
6660 * when renaming a directory.  The new name must depend on the mkdir deps
6661 * completing as the old name did.  Directories can only have one valid link
6662 * at a time so one must be canonical.
6663 */
6664static void
6665merge_diradd(inodedep, newdap)
6666	struct inodedep *inodedep;
6667	struct diradd *newdap;
6668{
6669	struct diradd *olddap;
6670	struct mkdir *mkdir, *nextmd;
6671	short state;
6672
6673	olddap = inodedep->id_mkdiradd;
6674	inodedep->id_mkdiradd = newdap;
6675	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6676		newdap->da_state &= ~DEPCOMPLETE;
6677		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
6678			nextmd = LIST_NEXT(mkdir, md_mkdirs);
6679			if (mkdir->md_diradd != olddap)
6680				continue;
6681			mkdir->md_diradd = newdap;
6682			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
6683			newdap->da_state |= state;
6684			olddap->da_state &= ~state;
6685			if ((olddap->da_state &
6686			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
6687				break;
6688		}
6689		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
6690			panic("merge_diradd: unfound ref");
6691	}
6692	/*
6693	 * Any mkdir related journal items are not safe to be freed until
6694	 * the new name is stable.
6695	 */
6696	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
6697	olddap->da_state |= DEPCOMPLETE;
6698	complete_diradd(olddap);
6699}
6700
6701/*
6702 * Move the diradd to the pending list when all diradd dependencies are
6703 * complete.
6704 */
6705static void
6706complete_diradd(dap)
6707	struct diradd *dap;
6708{
6709	struct pagedep *pagedep;
6710
6711	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
6712		if (dap->da_state & DIRCHG)
6713			pagedep = dap->da_previous->dm_pagedep;
6714		else
6715			pagedep = dap->da_pagedep;
6716		LIST_REMOVE(dap, da_pdlist);
6717		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
6718	}
6719}
6720
6721/*
6722 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
6723 * add entries and conditionally journal the remove.
6724 */
6725static void
6726cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
6727	struct diradd *dap;
6728	struct dirrem *dirrem;
6729	struct jremref *jremref;
6730	struct jremref *dotremref;
6731	struct jremref *dotdotremref;
6732{
6733	struct inodedep *inodedep;
6734	struct jaddref *jaddref;
6735	struct inoref *inoref;
6736	struct mkdir *mkdir;
6737
6738	/*
6739	 * If no remove references were allocated we're on a non-journaled
6740	 * filesystem and can skip the cancel step.
6741	 */
6742	if (jremref == NULL) {
6743		free_diradd(dap, NULL);
6744		return;
6745	}
6746	/*
6747	 * Cancel the primary name and free it if it does not require
6748	 * journaling.
6749	 */
6750	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
6751	    0, &inodedep) != 0) {
6752		/* Abort the addref that references this diradd.  */
6753		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
6754			if (inoref->if_list.wk_type != D_JADDREF)
6755				continue;
6756			jaddref = (struct jaddref *)inoref;
6757			if (jaddref->ja_diradd != dap)
6758				continue;
6759			if (cancel_jaddref(jaddref, inodedep,
6760			    &dirrem->dm_jwork) == 0) {
6761				free_jremref(jremref);
6762				jremref = NULL;
6763			}
6764			break;
6765		}
6766	}
6767	/*
6768	 * Cancel subordinate names and free them if they do not require
6769	 * journaling.
6770	 */
6771	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6772		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
6773			if (mkdir->md_diradd != dap)
6774				continue;
6775			if ((jaddref = mkdir->md_jaddref) == NULL)
6776				continue;
6777			mkdir->md_jaddref = NULL;
6778			if (mkdir->md_state & MKDIR_PARENT) {
6779				if (cancel_jaddref(jaddref, NULL,
6780				    &dirrem->dm_jwork) == 0) {
6781					free_jremref(dotdotremref);
6782					dotdotremref = NULL;
6783				}
6784			} else {
6785				if (cancel_jaddref(jaddref, inodedep,
6786				    &dirrem->dm_jwork) == 0) {
6787					free_jremref(dotremref);
6788					dotremref = NULL;
6789				}
6790			}
6791		}
6792	}
6793
6794	if (jremref)
6795		journal_jremref(dirrem, jremref, inodedep);
6796	if (dotremref)
6797		journal_jremref(dirrem, dotremref, inodedep);
6798	if (dotdotremref)
6799		journal_jremref(dirrem, dotdotremref, NULL);
6800	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
6801	free_diradd(dap, &dirrem->dm_jwork);
6802}
6803
6804/*
6805 * Free a diradd dependency structure. This routine must be called
6806 * with splbio interrupts blocked.
6807 */
6808static void
6809free_diradd(dap, wkhd)
6810	struct diradd *dap;
6811	struct workhead *wkhd;
6812{
6813	struct dirrem *dirrem;
6814	struct pagedep *pagedep;
6815	struct inodedep *inodedep;
6816	struct mkdir *mkdir, *nextmd;
6817
6818	mtx_assert(&lk, MA_OWNED);
6819	LIST_REMOVE(dap, da_pdlist);
6820	if (dap->da_state & ONWORKLIST)
6821		WORKLIST_REMOVE(&dap->da_list);
6822	if ((dap->da_state & DIRCHG) == 0) {
6823		pagedep = dap->da_pagedep;
6824	} else {
6825		dirrem = dap->da_previous;
6826		pagedep = dirrem->dm_pagedep;
6827		dirrem->dm_dirinum = pagedep->pd_ino;
6828		dirrem->dm_state |= COMPLETE;
6829		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
6830			add_to_worklist(&dirrem->dm_list, 0);
6831	}
6832	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
6833	    0, &inodedep) != 0)
6834		if (inodedep->id_mkdiradd == dap)
6835			inodedep->id_mkdiradd = NULL;
6836	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6837		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
6838			nextmd = LIST_NEXT(mkdir, md_mkdirs);
6839			if (mkdir->md_diradd != dap)
6840				continue;
6841			dap->da_state &=
6842			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
6843			LIST_REMOVE(mkdir, md_mkdirs);
6844			if (mkdir->md_state & ONWORKLIST)
6845				WORKLIST_REMOVE(&mkdir->md_list);
6846			if (mkdir->md_jaddref != NULL)
6847				panic("free_diradd: Unexpected jaddref");
6848			WORKITEM_FREE(mkdir, D_MKDIR);
6849			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
6850				break;
6851		}
6852		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
6853			panic("free_diradd: unfound ref");
6854	}
6855	if (inodedep)
6856		free_inodedep(inodedep);
6857	/*
6858	 * Free any journal segments waiting for the directory write.
6859	 */
6860	handle_jwork(&dap->da_jwork);
6861	WORKITEM_FREE(dap, D_DIRADD);
6862}
6863
6864/*
6865 * Directory entry removal dependencies.
6866 *
6867 * When removing a directory entry, the entry's inode pointer must be
6868 * zero'ed on disk before the corresponding inode's link count is decremented
6869 * (possibly freeing the inode for re-use). This dependency is handled by
6870 * updating the directory entry but delaying the inode count reduction until
6871 * after the directory block has been written to disk. After this point, the
6872 * inode count can be decremented whenever it is convenient.
6873 */
6874
6875/*
6876 * This routine should be called immediately after removing
6877 * a directory entry.  The inode's link count should not be
6878 * decremented by the calling procedure -- the soft updates
6879 * code will do this task when it is safe.
6880 */
6881void
6882softdep_setup_remove(bp, dp, ip, isrmdir)
6883	struct buf *bp;		/* buffer containing directory block */
6884	struct inode *dp;	/* inode for the directory being modified */
6885	struct inode *ip;	/* inode for directory entry being removed */
6886	int isrmdir;		/* indicates if doing RMDIR */
6887{
6888	struct dirrem *dirrem, *prevdirrem;
6889	struct inodedep *inodedep;
6890	int direct;
6891
6892	/*
6893	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
6894	 * newdirrem() to set up the full directory remove, which requires
6895	 * isrmdir > 1.
6896	 */
6897	dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem);
6898	/*
6899	 * Add the dirrem to the inodedep's pending remove list for quick
6900	 * discovery later.
6901	 */
6902	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
6903	    &inodedep) == 0)
6904		panic("softdep_setup_remove: Lost inodedep.");
6905	dirrem->dm_state |= ONDEPLIST;
6906	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
6907
6908	/*
6909	 * If the COMPLETE flag is clear, then there were no active
6910	 * entries and we want to roll back to a zeroed entry until
6911	 * the new inode is committed to disk. If the COMPLETE flag is
6912	 * set then we have deleted an entry that never made it to
6913	 * disk. If the entry we deleted resulted from a name change,
6914	 * then the old name still resides on disk. We cannot delete
6915	 * its inode (returned to us in prevdirrem) until the zeroed
6916	 * directory entry gets to disk. The new inode has never been
6917	 * referenced on the disk, so it can be deleted immediately.
6918	 */
6919	if ((dirrem->dm_state & COMPLETE) == 0) {
6920		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
6921		    dm_next);
6922		FREE_LOCK(&lk);
6923	} else {
6924		if (prevdirrem != NULL)
6925			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
6926			    prevdirrem, dm_next);
6927		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
6928		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
6929		FREE_LOCK(&lk);
6930		if (direct)
6931			handle_workitem_remove(dirrem, NULL);
6932	}
6933}
6934
6935/*
6936 * Check for an entry matching 'offset' on both the pd_diraddhd list and the
6937 * pd_pendinghd list of a pagedep.
6938 */
6939static struct diradd *
6940diradd_lookup(pagedep, offset)
6941	struct pagedep *pagedep;
6942	int offset;
6943{
6944	struct diradd *dap;
6945
6946	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
6947		if (dap->da_offset == offset)
6948			return (dap);
6949	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
6950		if (dap->da_offset == offset)
6951			return (dap);
6952	return (NULL);
6953}
6954
6955/*
6956 * Search for a .. diradd dependency in a directory that is being removed.
6957 * If the directory was renamed to a new parent we have a diradd rather
6958 * than a mkdir for the .. entry.  We need to cancel it now before
6959 * it is found in truncate().
6960 */
6961static struct jremref *
6962cancel_diradd_dotdot(ip, dirrem, jremref)
6963	struct inode *ip;
6964	struct dirrem *dirrem;
6965	struct jremref *jremref;
6966{
6967	struct pagedep *pagedep;
6968	struct diradd *dap;
6969	struct worklist *wk;
6970
6971	if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
6972	    &pagedep) == 0)
6973		return (jremref);
6974	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
6975	if (dap == NULL)
6976		return (jremref);
6977	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
6978	/*
6979	 * Mark any journal work as belonging to the parent so it is freed
6980	 * with the .. reference.
6981	 */
6982	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
6983		wk->wk_state |= MKDIR_PARENT;
6984	return (NULL);
6985}
6986
6987/*
6988 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
6989 * replace it with a dirrem/diradd pair as a result of re-parenting a
6990 * directory.  This ensures that we don't simultaneously have a mkdir and
6991 * a diradd for the same .. entry.
6992 */
6993static struct jremref *
6994cancel_mkdir_dotdot(ip, dirrem, jremref)
6995	struct inode *ip;
6996	struct dirrem *dirrem;
6997	struct jremref *jremref;
6998{
6999	struct inodedep *inodedep;
7000	struct jaddref *jaddref;
7001	struct mkdir *mkdir;
7002	struct diradd *dap;
7003
7004	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
7005	    &inodedep) == 0)
7006		panic("cancel_mkdir_dotdot: Lost inodedep");
7007	dap = inodedep->id_mkdiradd;
7008	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
7009		return (jremref);
7010	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
7011	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
7012		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
7013			break;
7014	if (mkdir == NULL)
7015		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
7016	if ((jaddref = mkdir->md_jaddref) != NULL) {
7017		mkdir->md_jaddref = NULL;
7018		jaddref->ja_state &= ~MKDIR_PARENT;
7019		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
7020		    &inodedep) == 0)
7021			panic("cancel_mkdir_dotdot: Lost parent inodedep");
7022		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
7023			journal_jremref(dirrem, jremref, inodedep);
7024			jremref = NULL;
7025		}
7026	}
7027	if (mkdir->md_state & ONWORKLIST)
7028		WORKLIST_REMOVE(&mkdir->md_list);
7029	mkdir->md_state |= ALLCOMPLETE;
7030	complete_mkdir(mkdir);
7031	return (jremref);
7032}
7033
7034static void
7035journal_jremref(dirrem, jremref, inodedep)
7036	struct dirrem *dirrem;
7037	struct jremref *jremref;
7038	struct inodedep *inodedep;
7039{
7040
7041	if (inodedep == NULL)
7042		if (inodedep_lookup(jremref->jr_list.wk_mp,
7043		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
7044			panic("journal_jremref: Lost inodedep");
7045	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
7046	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
7047	add_to_journal(&jremref->jr_list);
7048}
7049
7050static void
7051dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
7052	struct dirrem *dirrem;
7053	struct jremref *jremref;
7054	struct jremref *dotremref;
7055	struct jremref *dotdotremref;
7056{
7057	struct inodedep *inodedep;
7058
7059
7060	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
7061	    &inodedep) == 0)
7062		panic("dirrem_journal: Lost inodedep");
7063	journal_jremref(dirrem, jremref, inodedep);
7064	if (dotremref)
7065		journal_jremref(dirrem, dotremref, inodedep);
7066	if (dotdotremref)
7067		journal_jremref(dirrem, dotdotremref, NULL);
7068}
7069
7070/*
7071 * Allocate a new dirrem if appropriate and return it along with
7072 * its associated pagedep. Called without a lock, returns with lock.
7073 */
7074static long num_dirrem;		/* number of dirrem allocated */
7075static struct dirrem *
7076newdirrem(bp, dp, ip, isrmdir, prevdirremp)
7077	struct buf *bp;		/* buffer containing directory block */
7078	struct inode *dp;	/* inode for the directory being modified */
7079	struct inode *ip;	/* inode for directory entry being removed */
7080	int isrmdir;		/* indicates if doing RMDIR */
7081	struct dirrem **prevdirremp; /* previously referenced inode, if any */
7082{
7083	int offset;
7084	ufs_lbn_t lbn;
7085	struct diradd *dap;
7086	struct dirrem *dirrem;
7087	struct pagedep *pagedep;
7088	struct jremref *jremref;
7089	struct jremref *dotremref;
7090	struct jremref *dotdotremref;
7091	struct vnode *dvp;
7092
7093	/*
7094	 * Whiteouts have no deletion dependencies.
7095	 */
7096	if (ip == NULL)
7097		panic("newdirrem: whiteout");
7098	dvp = ITOV(dp);
7099	/*
7100	 * If we are over our limit, try to improve the situation.
7101	 * Limiting the number of dirrem structures will also limit
7102	 * the number of freefile and freeblks structures.
7103	 */
7104	ACQUIRE_LOCK(&lk);
7105	if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2)
7106		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
7107	num_dirrem += 1;
7108	FREE_LOCK(&lk);
7109	dirrem = malloc(sizeof(struct dirrem),
7110		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
7111	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
7112	LIST_INIT(&dirrem->dm_jremrefhd);
7113	LIST_INIT(&dirrem->dm_jwork);
7114	dirrem->dm_state = isrmdir ? RMDIR : 0;
7115	dirrem->dm_oldinum = ip->i_number;
7116	*prevdirremp = NULL;
7117	/*
7118	 * Allocate remove reference structures to track journal write
7119	 * dependencies.  We will always have one for the link and,
7120	 * when doing directories, one more for dot.  When renaming a
7121	 * directory we skip the dotdot link change, so this is not
7122	 * needed.
7123	 */
7124	jremref = dotremref = dotdotremref = NULL;
7125	if (DOINGSUJ(dvp)) {
7126		if (isrmdir) {
7127			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
7128			    ip->i_effnlink + 2);
7129			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
7130			    ip->i_effnlink + 1);
7131		} else
7132			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
7133			    ip->i_effnlink + 1);
7134		if (isrmdir > 1) {
7135			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
7136			    dp->i_effnlink + 1);
7137			dotdotremref->jr_state |= MKDIR_PARENT;
7138		}
7139	}
7140	ACQUIRE_LOCK(&lk);
7141	lbn = lblkno(dp->i_fs, dp->i_offset);
7142	offset = blkoff(dp->i_fs, dp->i_offset);
7143	if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
7144	    &pagedep) == 0)
7145		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
7146	dirrem->dm_pagedep = pagedep;
7147	/*
7148	 * If we're renaming a .. link to a new directory, cancel any
7149	 * existing MKDIR_PARENT mkdir.  If it has already been canceled,
7150	 * the jremref is preserved for any potential diradd in this
7151	 * location.  This cannot coincide with a rmdir.
7152	 */
7153	if (dp->i_offset == DOTDOT_OFFSET) {
7154		if (isrmdir)
7155			panic("newdirrem: .. directory change during remove?");
7156		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
7157	}
7158	/*
7159	 * If we're removing a directory search for the .. dependency now and
7160	 * cancel it.  Any pending journal work will be added to the dirrem
7161	 * to be completed when the workitem remove completes.
7162	 */
7163	if (isrmdir > 1)
7164		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
7165	/*
7166	 * Check for a diradd dependency for the same directory entry.
7167	 * If present, then both dependencies become obsolete and can
7168	 * be de-allocated.
7169	 */
7170	dap = diradd_lookup(pagedep, offset);
7171	if (dap == NULL) {
7172		/*
7173		 * Link the jremref structures into the dirrem so they are
7174		 * written prior to the pagedep.
7175		 */
7176		if (jremref)
7177			dirrem_journal(dirrem, jremref, dotremref,
7178			    dotdotremref);
7179		return (dirrem);
7180	}
7181	/*
7182	 * Must be ATTACHED at this point.
7183	 */
7184	if ((dap->da_state & ATTACHED) == 0)
7185		panic("newdirrem: not ATTACHED");
7186	if (dap->da_newinum != ip->i_number)
7187		panic("newdirrem: inum %d should be %d",
7188		    ip->i_number, dap->da_newinum);
7189	/*
7190	 * If we are deleting a changed name that never made it to disk,
7191	 * then return the dirrem describing the previous inode (which
7192	 * represents the inode currently referenced from this entry on disk).
7193	 */
7194	if ((dap->da_state & DIRCHG) != 0) {
7195		*prevdirremp = dap->da_previous;
7196		dap->da_state &= ~DIRCHG;
7197		dap->da_pagedep = pagedep;
7198	}
7199	/*
7200	 * We are deleting an entry that never made it to disk.
7201	 * Mark it COMPLETE so we can delete its inode immediately.
7202	 */
7203	dirrem->dm_state |= COMPLETE;
7204	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
7205#ifdef SUJ_DEBUG
7206	if (isrmdir == 0) {
7207		struct worklist *wk;
7208
7209		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
7210			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
7211				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
7212	}
7213#endif
7214
7215	return (dirrem);
7216}
7217
7218/*
7219 * Directory entry change dependencies.
7220 *
7221 * Changing an existing directory entry requires that an add operation
7222 * be completed first followed by a deletion. The semantics for the addition
7223 * are identical to the description of adding a new entry above except
7224 * that the rollback is to the old inode number rather than zero. Once
7225 * the addition dependency is completed, the removal is done as described
7226 * in the removal routine above.
7227 */
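/*
 * A small sketch of the rollback difference described above; the logic
 * mirrors what initiate_write_filepage() does when the directory block
 * is written while the new inode is not yet safe on disk:
 */
#if 0	/* illustrative only, not compiled */
	if (dap->da_state & DIRCHG)
		ep->d_ino = dap->da_previous->dm_oldinum;  /* changed entry */
	else
		ep->d_ino = 0;				   /* newly added entry */
#endif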
7228
7229/*
7230 * This routine should be called immediately after changing
7231 * a directory entry.  The inode's link count should not be
7232 * decremented by the calling procedure -- the soft updates
7233 * code will perform this task when it is safe.
7234 */
7235void
7236softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
7237	struct buf *bp;		/* buffer containing directory block */
7238	struct inode *dp;	/* inode for the directory being modified */
7239	struct inode *ip;	/* inode for directory entry being removed */
7240	ino_t newinum;		/* new inode number for changed entry */
7241	int isrmdir;		/* indicates if doing RMDIR */
7242{
7243	int offset;
7244	struct diradd *dap = NULL;
7245	struct dirrem *dirrem, *prevdirrem;
7246	struct pagedep *pagedep;
7247	struct inodedep *inodedep;
7248	struct jaddref *jaddref;
7249	struct mount *mp;
7250
7251	offset = blkoff(dp->i_fs, dp->i_offset);
7252	mp = UFSTOVFS(dp->i_ump);
7253
7254	/*
7255	 * Whiteouts do not need diradd dependencies.
7256	 */
7257	if (newinum != WINO) {
7258		dap = malloc(sizeof(struct diradd),
7259		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
7260		workitem_alloc(&dap->da_list, D_DIRADD, mp);
7261		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
7262		dap->da_offset = offset;
7263		dap->da_newinum = newinum;
7264		LIST_INIT(&dap->da_jwork);
7265	}
7266
7267	/*
7268	 * Allocate a new dirrem and ACQUIRE_LOCK.
7269	 */
7270	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
7271	pagedep = dirrem->dm_pagedep;
7272	/*
7273	 * The possible values for isrmdir:
7274	 *	0 - non-directory file rename
7275	 *	1 - directory rename within same directory
7276	 *   inum - directory rename to new directory of given inode number
7277	 * When renaming to a new directory, we are both deleting and
7278	 * creating a new directory entry, so the link count on the new
7279	 * directory should not change. Thus we do not need the followup
7280	 * dirrem which is usually done in handle_workitem_remove. We set
7281	 * the DIRCHG flag to tell handle_workitem_remove to skip the
7282	 * followup dirrem.
7283	 */
7284	if (isrmdir > 1)
7285		dirrem->dm_state |= DIRCHG;
7286
7287	/*
7288	 * Whiteouts have no additional dependencies,
7289	 * so just put the dirrem on the correct list.
7290	 */
7291	if (newinum == WINO) {
7292		if ((dirrem->dm_state & COMPLETE) == 0) {
7293			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
7294			    dm_next);
7295		} else {
7296			dirrem->dm_dirinum = pagedep->pd_ino;
7297			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7298				add_to_worklist(&dirrem->dm_list, 0);
7299		}
7300		FREE_LOCK(&lk);
7301		return;
7302	}
7303	/*
7304	 * Add the dirrem to the inodedep's pending remove list for quick
7305	 * discovery later.  A valid nlinkdelta ensures that this lookup
7306	 * will not fail.
7307	 */
7308	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
7309		panic("softdep_setup_directory_change: Lost inodedep.");
7310	dirrem->dm_state |= ONDEPLIST;
7311	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7312
7313	/*
7314	 * If the COMPLETE flag is clear, then there were no active
7315	 * entries and we want to roll back to the previous inode until
7316	 * the new inode is committed to disk. If the COMPLETE flag is
7317	 * set, then we have deleted an entry that never made it to disk.
7318	 * If the entry we deleted resulted from a name change, then the old
7319	 * inode reference still resides on disk. Any rollback that we do
7320	 * needs to be to that old inode (returned to us in prevdirrem). If
7321	 * the entry we deleted resulted from a create, then there is
7322	 * no entry on the disk, so we want to roll back to zero rather
7323	 * than the uncommitted inode. In either of the COMPLETE cases we
7324	 * want to immediately free the unwritten and unreferenced inode.
7325	 */
7326	if ((dirrem->dm_state & COMPLETE) == 0) {
7327		dap->da_previous = dirrem;
7328	} else {
7329		if (prevdirrem != NULL) {
7330			dap->da_previous = prevdirrem;
7331		} else {
7332			dap->da_state &= ~DIRCHG;
7333			dap->da_pagedep = pagedep;
7334		}
7335		dirrem->dm_dirinum = pagedep->pd_ino;
7336		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7337			add_to_worklist(&dirrem->dm_list, 0);
7338	}
7339	/*
7340	 * Lookup the jaddref for this journal entry.  We must finish
7341	 * Look up the jaddref for this journal entry.  We must finish
7342	 * initializing it and make the diradd write dependent on it.
7343	 * If we're not journaling, put it on the id_bufwait list if the inode
7344	 * processing to put it on the id_pendinghd list.
7345	 */
7346	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
7347	if (mp->mnt_kern_flag & MNTK_SUJ) {
7348		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
7349		    inoreflst);
7350		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
7351		    ("softdep_setup_directory_change: bad jaddref %p",
7352		    jaddref));
7353		jaddref->ja_diroff = dp->i_offset;
7354		jaddref->ja_diradd = dap;
7355		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7356		    dap, da_pdlist);
7357		add_to_journal(&jaddref->ja_list);
7358	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
7359		dap->da_state |= COMPLETE;
7360		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
7361		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
7362	} else {
7363		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7364		    dap, da_pdlist);
7365		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
7366	}
7367	/*
7368	 * If we're making a new name for a directory that has not been
7369	 * committed, we need to move the dot and dotdot references to
7370	 * this new name.
7371	 */
7372	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
7373		merge_diradd(inodedep, dap);
7374	FREE_LOCK(&lk);
7375}
7376
7377/*
7378 * Called whenever the link count on an inode is changed.
7379 * It creates an inode dependency so that the new reference(s)
7380 * to the inode cannot be committed to disk until the updated
7381 * inode has been written.
7382 */
7383void
7384softdep_change_linkcnt(ip)
7385	struct inode *ip;	/* the inode with the increased link count */
7386{
7387	struct inodedep *inodedep;
7388
7389	ACQUIRE_LOCK(&lk);
7390	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
7391	if (ip->i_nlink < ip->i_effnlink)
7392		panic("softdep_change_linkcnt: bad delta");
7393	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7394	FREE_LOCK(&lk);
7395}
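/*
 * Worked example (a sketch of a common unlink case): a file with two names
 * has i_nlink == i_effnlink == 2.  Once one name is removed, the caller
 * drops i_effnlink to 1 immediately while i_nlink remains 2 until the
 * zeroed directory entry reaches the disk, so the routine above records
 * id_nlinkdelta = i_nlink - i_effnlink = 1.
 */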
7396
7397/*
7398 * Called when the effective link count and the reference count
7399 * on an inode drops to zero. At this point there are no names
7400 * referencing the file in the filesystem and no active file
7401 * references. The space associated with the file will be freed
7402 * as soon as the necessary soft dependencies are cleared.
7403 */
7404void
7405softdep_releasefile(ip)
7406	struct inode *ip;	/* inode with the zero effective link count */
7407{
7408	struct inodedep *inodedep;
7409	struct fs *fs;
7410	int extblocks;
7411
7412	if (ip->i_effnlink > 0)
7413		panic("softdep_releasefile: file still referenced");
7414	/*
7415	 * We may be called several times as the on-disk link count
7416	 * drops to zero. We only want to account for the space once.
7417	 */
7418	if (ip->i_flag & IN_SPACECOUNTED)
7419		return;
7420	/*
7421	 * We have to deactivate a snapshot, otherwise copy-on-write may
7422	 * add blocks and the cleanup may remove blocks after we have
7423	 * tried to account for them.
7424	 */
7425	if ((ip->i_flags & SF_SNAPSHOT) != 0)
7426		ffs_snapremove(ITOV(ip));
7427	/*
7428	 * If we are tracking an nlinkdelta, we have to also remember
7429	 * whether we accounted for the freed space yet.
7430	 */
7431	ACQUIRE_LOCK(&lk);
7432	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
7433		inodedep->id_state |= SPACECOUNTED;
7434	FREE_LOCK(&lk);
7435	fs = ip->i_fs;
7436	extblocks = 0;
7437	if (fs->fs_magic == FS_UFS2_MAGIC)
7438		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
7439	UFS_LOCK(ip->i_ump);
7440	ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
7441	ip->i_fs->fs_pendinginodes += 1;
7442	UFS_UNLOCK(ip->i_ump);
7443	ip->i_flag |= IN_SPACECOUNTED;
7444}
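/*
 * Worked example for the extblocks adjustment above (assuming the usual
 * 512-byte DEV_BSIZE and a 2048-byte fs_fsize): a UFS2 inode with
 * di_extsize == 3000 has its ext area rounded up by fragroundup() to
 * 4096 bytes, so btodb() yields 8 disk blocks that are excluded from
 * the pending block count.
 */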
7445
7446/*
7447 * Attach a sbdep dependency to the superblock buf so that we can keep
7448 * track of the head of the linked list of referenced but unlinked inodes.
7449 */
7450void
7451softdep_setup_sbupdate(ump, fs, bp)
7452	struct ufsmount *ump;
7453	struct fs *fs;
7454	struct buf *bp;
7455{
7456	struct sbdep *sbdep;
7457	struct worklist *wk;
7458
7459	if ((fs->fs_flags & FS_SUJ) == 0)
7460		return;
7461	LIST_FOREACH(wk, &bp->b_dep, wk_list)
7462		if (wk->wk_type == D_SBDEP)
7463			break;
7464	if (wk != NULL)
7465		return;
7466	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
7467	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
7468	sbdep->sb_fs = fs;
7469	sbdep->sb_ump = ump;
7470	ACQUIRE_LOCK(&lk);
7471	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
7472	FREE_LOCK(&lk);
7473}
7474
7475/*
7476 * Return the first unlinked inodedep which is ready to be the head of the
7477 * list.  The inodedep and all those after it must have valid next pointers.
7478 */
7479static struct inodedep *
7480first_unlinked_inodedep(ump)
7481	struct ufsmount *ump;
7482{
7483	struct inodedep *inodedep;
7484	struct inodedep *idp;
7485
7486	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
7487	    inodedep; inodedep = idp) {
7488		if ((inodedep->id_state & UNLINKNEXT) == 0)
7489			return (NULL);
7490		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7491		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
7492			break;
7493		if ((inodedep->id_state & UNLINKPREV) == 0)
7494			panic("first_unlinked_inodedep: prev != next");
7495	}
7496	if (inodedep == NULL)
7497		return (NULL);
7498
7499	return (inodedep);
7500}
7501
7502/*
7503 * Set the sujfree unlinked head pointer prior to writing a superblock.
7504 */
7505static void
7506initiate_write_sbdep(sbdep)
7507	struct sbdep *sbdep;
7508{
7509	struct inodedep *inodedep;
7510	struct fs *bpfs;
7511	struct fs *fs;
7512
7513	bpfs = sbdep->sb_fs;
7514	fs = sbdep->sb_ump->um_fs;
7515	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7516	if (inodedep) {
7517		fs->fs_sujfree = inodedep->id_ino;
7518		inodedep->id_state |= UNLINKPREV;
7519	} else
7520		fs->fs_sujfree = 0;
7521	bpfs->fs_sujfree = fs->fs_sujfree;
7522}
7523
7524/*
7525 * After a superblock is written determine whether it must be written again
7526 * due to a changing unlinked list head.
7527 */
7528static int
7529handle_written_sbdep(sbdep, bp)
7530	struct sbdep *sbdep;
7531	struct buf *bp;
7532{
7533	struct inodedep *inodedep;
7534	struct mount *mp;
7535	struct fs *fs;
7536
7537	fs = sbdep->sb_fs;
7538	mp = UFSTOVFS(sbdep->sb_ump);
7539	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7540	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
7541	    (inodedep == NULL && fs->fs_sujfree != 0)) {
7542		bdirty(bp);
7543		return (1);
7544	}
7545	WORKITEM_FREE(sbdep, D_SBDEP);
7546	if (fs->fs_sujfree == 0)
7547		return (0);
7548	if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
7549		panic("handle_written_sbdep: lost inodedep");
7550	/*
7551	 * Now that we have a record of this inode in stable store, allow it
7552	 * to be written to free up pending work.  Inodes may see a lot of
7553	 * write activity after they are unlinked, which we must not hold up.
7554	 */
7555	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
7556		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
7557			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
7558			    inodedep, inodedep->id_state);
7559		if (inodedep->id_state & UNLINKONLIST)
7560			break;
7561		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
7562	}
7563
7564	return (0);
7565}
7566
7567/*
7568 * Mark an inodedep as unlinked and insert it into the in-memory unlinked
7569 * list.
7570 */
7571static void
7572unlinked_inodedep(mp, inodedep)
7573	struct mount *mp;
7574	struct inodedep *inodedep;
7575{
7576	struct ufsmount *ump;
7577
7578	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
7579		return;
7580	ump = VFSTOUFS(mp);
7581	ump->um_fs->fs_fmod = 1;
7582	inodedep->id_state |= UNLINKED;
7583	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
7584}
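/*
 * The on-disk form of this list is a singly linked chain: fs_sujfree in
 * the superblock names the first unlinked inode and each inode's
 * di_freelink names the next.  A minimal sketch of a reader walking the
 * chain (read_dinode() is a hypothetical helper, not part of this file):
 */
#if 0	/* illustrative only, not compiled */
	struct ufs2_dinode *dp;
	ino_t ino;

	for (ino = fs->fs_sujfree; ino != 0; ino = dp->di_freelink)
		dp = read_dinode(fs, ino);
#endif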
7585
7586/*
7587 * Remove an inodedep from the unlinked inodedep list.  This may require
7588 * disk writes if the inode has made it that far.
7589 */
7590static void
7591clear_unlinked_inodedep(inodedep)
7592	struct inodedep *inodedep;
7593{
7594	struct ufsmount *ump;
7595	struct inodedep *idp;
7596	struct inodedep *idn;
7597	struct fs *fs;
7598	struct buf *bp;
7599	ino_t ino;
7600	ino_t nino;
7601	ino_t pino;
7602	int error;
7603
7604	ump = VFSTOUFS(inodedep->id_list.wk_mp);
7605	fs = ump->um_fs;
7606	ino = inodedep->id_ino;
7607	error = 0;
7608	for (;;) {
7609		/*
7610		 * If nothing has yet been written simply remove us from
7611		 * If nothing has yet been written, simply remove us from
7612		 * the in-memory list and return.  This is the most common
7613		 * reference.
7614		 */
7615		if ((inodedep->id_state & UNLINKLINKS) == 0)
7616			break;
7617		/*
7618		 * If we have a NEXT pointer and no PREV pointer we can simply
7619		 * clear NEXT's PREV and remove ourselves from the list.  Be
7620		 * careful not to clear PREV if the superblock points at
7621		 * next as well.
7622		 */
7623		idn = TAILQ_NEXT(inodedep, id_unlinked);
7624		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
7625			if (idn && fs->fs_sujfree != idn->id_ino)
7626				idn->id_state &= ~UNLINKPREV;
7627			break;
7628		}
7629		/*
7630		 * Here we have an inodedep which is actually linked into
7631		 * the list.  We must remove it by forcing a write to the
7632		 * link before us, whether it be the superblock or an inode.
7633		 * Unfortunately the list may change while we're waiting
7634		 * on the buf lock for either resource, so we must loop until
7635		 * we lock the right one.  If both the superblock and an
7636		 * inode point to this inode, we must clear the inode first,
7637		 * followed by the superblock.
7638		 */
7639		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7640		pino = 0;
7641		if (idp && (idp->id_state & UNLINKNEXT))
7642			pino = idp->id_ino;
7643		FREE_LOCK(&lk);
7644		if (pino == 0)
7645			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7646			    (int)fs->fs_sbsize, 0, 0, 0);
7647		else
7648			error = bread(ump->um_devvp,
7649			    fsbtodb(fs, ino_to_fsba(fs, pino)),
7650			    (int)fs->fs_bsize, NOCRED, &bp);
7651		ACQUIRE_LOCK(&lk);
7652		if (error)
7653			break;
7654		/* If the list has changed restart the loop. */
7655		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7656		nino = 0;
7657		if (idp && (idp->id_state & UNLINKNEXT))
7658			nino = idp->id_ino;
7659		if (nino != pino ||
7660		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
7661			FREE_LOCK(&lk);
7662			brelse(bp);
7663			ACQUIRE_LOCK(&lk);
7664			continue;
7665		}
7666		/*
7667		 * Remove us from the in memory list.  After this we cannot
7668		 * access the inodedep.
7669		 */
7670		idn = TAILQ_NEXT(inodedep, id_unlinked);
7671		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
7672		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
7673		/*
7674		 * Determine the next inode number.
7675		 */
7676		nino = 0;
7677		if (idn) {
7678			/*
7679			 * If next isn't on the list we can just clear prev's
7680			 * state and schedule it to be fixed later.  No need
7681			 * to synchronously write if we're not in the real
7682			 * list.
7683			 */
7684			if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
7685				idp->id_state &= ~UNLINKNEXT;
7686				if ((idp->id_state & ONWORKLIST) == 0)
7687					WORKLIST_INSERT(&bp->b_dep,
7688					    &idp->id_list);
7689				FREE_LOCK(&lk);
7690				bawrite(bp);
7691				ACQUIRE_LOCK(&lk);
7692				return;
7693			}
7694			nino = idn->id_ino;
7695		}
7696		FREE_LOCK(&lk);
7697		/*
7698		 * The predecessor's next pointer is manually updated here
7699		 * so that the NEXT flag is never cleared for an element
7700		 * that is in the list.
7701		 */
7702		if (pino == 0) {
7703			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
7704			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
7705			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
7706			    bp);
7707		} else if (fs->fs_magic == FS_UFS1_MAGIC)
7708			((struct ufs1_dinode *)bp->b_data +
7709			    ino_to_fsbo(fs, pino))->di_freelink = nino;
7710		else
7711			((struct ufs2_dinode *)bp->b_data +
7712			    ino_to_fsbo(fs, pino))->di_freelink = nino;
7713		/*
7714		 * If the bwrite fails we have no recourse to recover.  The
7715		 * filesystem is corrupted already.
7716		 */
7717		bwrite(bp);
7718		ACQUIRE_LOCK(&lk);
7719		/*
7720		 * If the superblock pointer still needs to be cleared force
7721		 * a write here.
7722		 */
7723		if (fs->fs_sujfree == ino) {
7724			FREE_LOCK(&lk);
7725			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7726			    (int)fs->fs_sbsize, 0, 0, 0);
7727			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
7728			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
7729			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
7730			    bp);
7731			bwrite(bp);
7732			ACQUIRE_LOCK(&lk);
7733		}
7734		if (fs->fs_sujfree != ino)
7735			return;
7736		panic("clear_unlinked_inodedep: Failed to clear free head");
7737	}
7738	if (inodedep->id_ino == fs->fs_sujfree)
7739		panic("clear_unlinked_inodedep: Freeing head of free list");
7740	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
7741	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
7742	return;
7743}
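/*
 * In short: removing an inodedep from the middle of the on-disk chain
 * requires first rewriting whichever link points at it -- the predecessor
 * inode's di_freelink, or fs_sujfree in the superblock -- which is why the
 * routine above may issue synchronous writes and retry when the list
 * changes underneath it.
 */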
7744
7745/*
7746 * This workitem decrements the inode's link count.
7747 * If the link count reaches zero, the file is removed.
7748 */
7749static void
7750handle_workitem_remove(dirrem, xp)
7751	struct dirrem *dirrem;
7752	struct vnode *xp;
7753{
7754	struct thread *td = curthread;
7755	struct inodedep *inodedep;
7756	struct workhead dotdotwk;
7757	struct worklist *wk;
7758	struct ufsmount *ump;
7759	struct mount *mp;
7760	struct vnode *vp;
7761	struct inode *ip;
7762	ino_t oldinum;
7763	int error;
7764
7765	if (dirrem->dm_state & ONWORKLIST)
7766		panic("handle_workitem_remove: dirrem %p still on worklist",
7767		    dirrem);
7768	oldinum = dirrem->dm_oldinum;
7769	mp = dirrem->dm_list.wk_mp;
7770	ump = VFSTOUFS(mp);
7771	if ((vp = xp) == NULL &&
7772	    (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
7773	    FFSV_FORCEINSMQ)) != 0) {
7774		softdep_error("handle_workitem_remove: vget", error);
7775		return;
7776	}
7777	ip = VTOI(vp);
7778	ACQUIRE_LOCK(&lk);
7779	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
7780		panic("handle_workitem_remove: lost inodedep");
7781	if (dirrem->dm_state & ONDEPLIST)
7782		LIST_REMOVE(dirrem, dm_inonext);
7783	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
7784	    ("handle_workitem_remove:  Journal entries not written."));
7785
7786	/*
7787	 * Move all dependencies waiting on the remove to complete
7788	 * from the dirrem to the inode inowait list to be completed
7789	 * after the inode has been updated and written to disk.  Any
7790	 * marked MKDIR_PARENT are saved to be completed when the .. ref
7791	 * is removed.
7792	 */
7793	LIST_INIT(&dotdotwk);
7794	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
7795		WORKLIST_REMOVE(wk);
7796		if (wk->wk_state & MKDIR_PARENT) {
7797			wk->wk_state &= ~MKDIR_PARENT;
7798			WORKLIST_INSERT(&dotdotwk, wk);
7799			continue;
7800		}
7801		WORKLIST_INSERT(&inodedep->id_inowait, wk);
7802	}
7803	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
7804	/*
7805	 * Normal file deletion.
7806	 */
7807	if ((dirrem->dm_state & RMDIR) == 0) {
7808		ip->i_nlink--;
7809		DIP_SET(ip, i_nlink, ip->i_nlink);
7810		ip->i_flag |= IN_CHANGE;
7811		if (ip->i_nlink < ip->i_effnlink)
7812			panic("handle_workitem_remove: bad file delta");
7813		if (ip->i_nlink == 0)
7814			unlinked_inodedep(mp, inodedep);
7815		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7816		num_dirrem -= 1;
7817		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
7818		    ("handle_workitem_remove: worklist not empty. %s",
7819		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
7820		WORKITEM_FREE(dirrem, D_DIRREM);
7821		FREE_LOCK(&lk);
7822		goto out;
7823	}
7824	/*
7825	 * Directory deletion. Decrement reference count for both the
7826	 * just deleted parent directory entry and the reference for ".".
7827	 * Next truncate the directory to length zero. When the
7828	 * truncation completes, arrange to have the reference count on
7829	 * the parent decremented to account for the loss of "..".
7830	 */
7831	ip->i_nlink -= 2;
7832	DIP_SET(ip, i_nlink, ip->i_nlink);
7833	ip->i_flag |= IN_CHANGE;
7834	if (ip->i_nlink < ip->i_effnlink)
7835		panic("handle_workitem_remove: bad dir delta");
7836	if (ip->i_nlink == 0)
7837		unlinked_inodedep(mp, inodedep);
7838	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7839	FREE_LOCK(&lk);
7840	if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
7841		softdep_error("handle_workitem_remove: truncate", error);
7842	ACQUIRE_LOCK(&lk);
7843	/*
7844	 * Rename a directory to a new parent. Since we are both deleting
7845	 * and creating a new directory entry, the link count on the new
7846	 * directory should not change. Thus we skip the followup dirrem.
7847	 */
7848	if (dirrem->dm_state & DIRCHG) {
7849		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
7850		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
7851		num_dirrem -= 1;
7852		WORKITEM_FREE(dirrem, D_DIRREM);
7853		FREE_LOCK(&lk);
7854		goto out;
7855	}
7856	dirrem->dm_state = ONDEPLIST;
7857	dirrem->dm_oldinum = dirrem->dm_dirinum;
7858	/*
7859	 * Place the dirrem on the parent's diremhd list.
7860	 */
7861	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
7862		panic("handle_workitem_remove: lost dir inodedep");
7863	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7864	/*
7865	 * If the allocated inode has never been written to disk, then
7866	 * the on-disk inode is zero'ed and we can remove the file
7867	 * immediately.  When journaling, if the inode has been marked
7868	 * unlinked and is not DEPCOMPLETE, we know it can never be written.
7869	 */
7870	inodedep_lookup(mp, oldinum, 0, &inodedep);
7871	if (inodedep == NULL ||
7872	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
7873	    check_inode_unwritten(inodedep)) {
7874		if (xp != NULL)
7875			add_to_worklist(&dirrem->dm_list, 0);
7876		FREE_LOCK(&lk);
7877		if (xp == NULL) {
7878			vput(vp);
7879			handle_workitem_remove(dirrem, NULL);
7880		}
7881		return;
7882	}
7883	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
7884	FREE_LOCK(&lk);
7885	ip->i_flag |= IN_CHANGE;
7886out:
7887	ffs_update(vp, 0);
7888	if (xp == NULL)
7889		vput(vp);
7890}
7891
7892/*
7893 * Inode de-allocation dependencies.
7894 *
7895 * When an inode's link count is reduced to zero, it can be de-allocated. We
7896 * found it convenient to postpone de-allocation until after the inode is
7897 * written to disk with its new link count (zero).  At this point, all of the
7898 * on-disk inode's block pointers are nullified and, with careful dependency
7899 * list ordering, all dependencies related to the inode will be satisfied and
7900 * the corresponding dependency structures de-allocated.  So, if/when the
7901 * inode is reused, there will be no mixing of old dependencies with new
7902 * ones.  This artificial dependency is set up by the block de-allocation
7903 * procedure above (softdep_setup_freeblocks) and completed by the
7904 * following procedure.
7905 */
7906static void
7907handle_workitem_freefile(freefile)
7908	struct freefile *freefile;
7909{
7910	struct workhead wkhd;
7911	struct fs *fs;
7912	struct inodedep *idp;
7913	struct ufsmount *ump;
7914	int error;
7915
7916	ump = VFSTOUFS(freefile->fx_list.wk_mp);
7917	fs = ump->um_fs;
7918#ifdef DEBUG
7919	ACQUIRE_LOCK(&lk);
7920	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
7921	FREE_LOCK(&lk);
7922	if (error)
7923		panic("handle_workitem_freefile: inodedep %p survived", idp);
7924#endif
7925	UFS_LOCK(ump);
7926	fs->fs_pendinginodes -= 1;
7927	UFS_UNLOCK(ump);
7928	LIST_INIT(&wkhd);
7929	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
7930	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
7931	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
7932		softdep_error("handle_workitem_freefile", error);
7933	ACQUIRE_LOCK(&lk);
7934	WORKITEM_FREE(freefile, D_FREEFILE);
7935	FREE_LOCK(&lk);
7936}
7937
7938
7939/*
7940 * Helper function which unlinks marker element from work list and returns
7941 * Helper function which unlinks the marker element from the work list and
7942 * returns the next element on the list.
7943static __inline struct worklist *
7944markernext(struct worklist *marker)
7945{
7946	struct worklist *next;
7947
7948	next = LIST_NEXT(marker, wk_list);
7949	LIST_REMOVE(marker, wk_list);
7950	return next;
7951}
7952
7953/*
7954 * Disk writes.
7955 *
7956 * The dependency structures constructed above are most actively used when file
7957 * system blocks are written to disk.  No constraints are placed on when a
7958 * block can be written, but unsatisfied update dependencies are made safe by
7959 * modifying (or replacing) the source memory for the duration of the disk
7960 * write.  When the disk write completes, the memory block is again brought
7961 * up-to-date.
7962 *
7963 * In-core inode structure reclamation.
7964 *
7965 * Because there are a finite number of "in-core" inode structures, they are
7966 * reused regularly.  By transferring all inode-related dependencies to the
7967 * in-memory inode block and indexing them separately (via "inodedep"s), we
7968 * can allow "in-core" inode structures to be reused at any time and avoid
7969 * any increase in contention.
7970 *
7971 * Called just before entering the device driver to initiate a new disk I/O.
7972 * The buffer must be locked, thus, no I/O completion operations can occur
7973 * while we are manipulating its associated dependencies.
7974 */
7975static void
7976softdep_disk_io_initiation(bp)
7977	struct buf *bp;		/* structure describing disk write to occur */
7978{
7979	struct worklist *wk;
7980	struct worklist marker;
7981	struct inodedep *inodedep;
7982	struct freeblks *freeblks;
7983	struct jfreeblk *jfreeblk;
7984	struct newblk *newblk;
7985
7986	/*
7987	 * We only care about write operations. There should never
7988	 * be dependencies for reads.
7989	 */
7990	if (bp->b_iocmd != BIO_WRITE)
7991		panic("softdep_disk_io_initiation: not write");
7992
7993	if (bp->b_vflags & BV_BKGRDINPROG)
7994		panic("softdep_disk_io_initiation: Writing buffer with "
7995		    "background write in progress: %p", bp);
7996
7997	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
7998	PHOLD(curproc);			/* Don't swap out kernel stack */
7999
8000	ACQUIRE_LOCK(&lk);
8001	/*
8002	 * Do any necessary pre-I/O processing.
8003	 */
8004	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
8005	     wk = markernext(&marker)) {
8006		LIST_INSERT_AFTER(wk, &marker, wk_list);
8007		switch (wk->wk_type) {
8008
8009		case D_PAGEDEP:
8010			initiate_write_filepage(WK_PAGEDEP(wk), bp);
8011			continue;
8012
8013		case D_INODEDEP:
8014			inodedep = WK_INODEDEP(wk);
8015			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
8016				initiate_write_inodeblock_ufs1(inodedep, bp);
8017			else
8018				initiate_write_inodeblock_ufs2(inodedep, bp);
8019			continue;
8020
8021		case D_INDIRDEP:
8022			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
8023			continue;
8024
8025		case D_BMSAFEMAP:
8026			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
8027			continue;
8028
8029		case D_JSEG:
8030			WK_JSEG(wk)->js_buf = NULL;
8031			continue;
8032
8033		case D_FREEBLKS:
8034			freeblks = WK_FREEBLKS(wk);
8035			jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
8036			/*
8037			 * We have to wait for the jfreeblks to be journaled
8038			 * before we can write an inodeblock with updated
8039			 * pointers.  Be careful to arrange the marker so
8040			 * we revisit the jfreeblk if it's not removed by
8041			 * the first jwait().
8042			 */
8043			if (jfreeblk != NULL) {
8044				LIST_REMOVE(&marker, wk_list);
8045				LIST_INSERT_BEFORE(wk, &marker, wk_list);
8046				jwait(&jfreeblk->jf_list);
8047			}
8048			continue;
8049		case D_ALLOCDIRECT:
8050		case D_ALLOCINDIR:
8051			/*
8052			 * We have to wait for the jnewblk to be journaled
8053			 * before we can write to a block otherwise the
8054			 * contents may be confused with an earlier file
8055			 * at recovery time.  Handle the marker as described
8056			 * above.
8057			 */
8058			newblk = WK_NEWBLK(wk);
8059			if (newblk->nb_jnewblk != NULL) {
8060				LIST_REMOVE(&marker, wk_list);
8061				LIST_INSERT_BEFORE(wk, &marker, wk_list);
8062				jwait(&newblk->nb_jnewblk->jn_list);
8063			}
8064			continue;
8065
8066		case D_SBDEP:
8067			initiate_write_sbdep(WK_SBDEP(wk));
8068			continue;
8069
8070		case D_MKDIR:
8071		case D_FREEWORK:
8072		case D_FREEDEP:
8073		case D_JSEGDEP:
8074			continue;
8075
8076		default:
8077			panic("handle_disk_io_initiation: Unexpected type %s",
8078			    TYPENAME(wk->wk_type));
8079			/* NOTREACHED */
8080		}
8081	}
8082	FREE_LOCK(&lk);
8083	PRELE(curproc);			/* Allow swapout of kernel stack */
8084}
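/*
 * A condensed sketch of the roll-back performed by the initiate_write_*()
 * routines above and undone by the corresponding completion handlers
 * (the structure and field names here are illustrative only):
 */
#if 0	/* illustrative only, not compiled */
	/* Before the write: substitute a value that is safe to put on disk. */
	dep->de_saved = *diskfieldp;	/* remember the up-to-date value */
	*diskfieldp = dep->de_safe;	/* e.g. the old pointer, or zero */
	dep->de_state &= ~ATTACHED;
	dep->de_state |= UNDONE;

	/* After the write completes: roll the in-memory copy forward again. */
	*diskfieldp = dep->de_saved;
	dep->de_state &= ~UNDONE;
	dep->de_state |= ATTACHED;
#endif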
8085
8086/*
8087 * Called from within the procedure above to deal with unsatisfied
8088 * allocation dependencies in a directory. The buffer must be locked,
8089 * thus, no I/O completion operations can occur while we are
8090 * manipulating its associated dependencies.
8091 */
8092static void
8093initiate_write_filepage(pagedep, bp)
8094	struct pagedep *pagedep;
8095	struct buf *bp;
8096{
8097	struct jremref *jremref;
8098	struct jmvref *jmvref;
8099	struct dirrem *dirrem;
8100	struct diradd *dap;
8101	struct direct *ep;
8102	int i;
8103
8104	if (pagedep->pd_state & IOSTARTED) {
8105		/*
8106		 * This can only happen if there is a driver that does not
8107		 * understand chaining. Here biodone will reissue the call
8108		 * to strategy for the incomplete buffers.
8109		 */
8110		printf("initiate_write_filepage: already started\n");
8111		return;
8112	}
8113	pagedep->pd_state |= IOSTARTED;
8114	/*
8115	 * Wait for all journal remove dependencies to hit the disk.
8116	 * We can not allow any potentially conflicting directory adds
8117	 * We cannot allow any potentially conflicting directory adds
8118	 * to be visible before removes, and rollback is too difficult.
8119	 * lk may be dropped and re-acquired; however, we hold the buf
8120	 */
8121	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
8122		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
8123			stat_jwait_filepage++;
8124			jwait(&jremref->jr_list);
8125		}
8126	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
8127		stat_jwait_filepage++;
8128		jwait(&jmvref->jm_list);
8129	}
8130	for (i = 0; i < DAHASHSZ; i++) {
8131		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
8132			ep = (struct direct *)
8133			    ((char *)bp->b_data + dap->da_offset);
8134			if (ep->d_ino != dap->da_newinum)
8135				panic("%s: dir inum %d != new %d",
8136				    "initiate_write_filepage",
8137				    ep->d_ino, dap->da_newinum);
8138			if (dap->da_state & DIRCHG)
8139				ep->d_ino = dap->da_previous->dm_oldinum;
8140			else
8141				ep->d_ino = 0;
8142			dap->da_state &= ~ATTACHED;
8143			dap->da_state |= UNDONE;
8144		}
8145	}
8146}
8147
8148/*
8149 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
8150 * Note that any bug fixes made to this routine must be done in the
8151 * version found below.
8152 *
8153 * Called from within the procedure above to deal with unsatisfied
8154 * allocation dependencies in an inodeblock. The buffer must be
8155 * locked, thus, no I/O completion operations can occur while we
8156 * are manipulating its associated dependencies.
8157 */
8158static void
8159initiate_write_inodeblock_ufs1(inodedep, bp)
8160	struct inodedep *inodedep;
8161	struct buf *bp;			/* The inode block */
8162{
8163	struct allocdirect *adp, *lastadp;
8164	struct ufs1_dinode *dp;
8165	struct ufs1_dinode *sip;
8166	struct inoref *inoref;
8167	struct fs *fs;
8168	ufs_lbn_t i;
8169#ifdef INVARIANTS
8170	ufs_lbn_t prevlbn = 0;
8171#endif
8172	int deplist;
8173
8174	if (inodedep->id_state & IOSTARTED)
8175		panic("initiate_write_inodeblock_ufs1: already started");
8176	inodedep->id_state |= IOSTARTED;
8177	fs = inodedep->id_fs;
8178	dp = (struct ufs1_dinode *)bp->b_data +
8179	    ino_to_fsbo(fs, inodedep->id_ino);
8180
8181	/*
8182	 * If we're on the unlinked list but have not yet written our
8183	 * next pointer, initialize it here.
8184	 */
8185	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8186		struct inodedep *inon;
8187
8188		inon = TAILQ_NEXT(inodedep, id_unlinked);
8189		dp->di_freelink = inon ? inon->id_ino : 0;
8190	}
8191	/*
8192	 * If the bitmap is not yet written, then the allocated
8193	 * inode cannot be written to disk.
8194	 */
8195	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8196		if (inodedep->id_savedino1 != NULL)
8197			panic("initiate_write_inodeblock_ufs1: I/O underway");
8198		FREE_LOCK(&lk);
8199		sip = malloc(sizeof(struct ufs1_dinode),
8200		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8201		ACQUIRE_LOCK(&lk);
8202		inodedep->id_savedino1 = sip;
8203		*inodedep->id_savedino1 = *dp;
8204		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
8205		dp->di_gen = inodedep->id_savedino1->di_gen;
8206		dp->di_freelink = inodedep->id_savedino1->di_freelink;
8207		return;
8208	}
8209	/*
8210	 * If no dependencies, then there is nothing to roll back.
8211	 */
8212	inodedep->id_savedsize = dp->di_size;
8213	inodedep->id_savedextsize = 0;
8214	inodedep->id_savednlink = dp->di_nlink;
8215	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8216	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8217		return;
8218	/*
8219	 * Revert the link count to that of the first unwritten journal entry.
8220	 */
8221	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8222	if (inoref)
8223		dp->di_nlink = inoref->if_nlink;
8224	/*
8225	 * Set the dependencies to busy.
8226	 */
8227	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8228	     adp = TAILQ_NEXT(adp, ad_next)) {
8229#ifdef INVARIANTS
8230		if (deplist != 0 && prevlbn >= adp->ad_offset)
8231			panic("softdep_write_inodeblock: lbn order");
8232		prevlbn = adp->ad_offset;
8233		if (adp->ad_offset < NDADDR &&
8234		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8235			panic("%s: direct pointer #%jd mismatch %d != %jd",
8236			    "softdep_write_inodeblock",
8237			    (intmax_t)adp->ad_offset,
8238			    dp->di_db[adp->ad_offset],
8239			    (intmax_t)adp->ad_newblkno);
8240		if (adp->ad_offset >= NDADDR &&
8241		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8242			panic("%s: indirect pointer #%jd mismatch %d != %jd",
8243			    "softdep_write_inodeblock",
8244			    (intmax_t)adp->ad_offset - NDADDR,
8245			    dp->di_ib[adp->ad_offset - NDADDR],
8246			    (intmax_t)adp->ad_newblkno);
8247		deplist |= 1 << adp->ad_offset;
8248		if ((adp->ad_state & ATTACHED) == 0)
8249			panic("softdep_write_inodeblock: Unknown state 0x%x",
8250			    adp->ad_state);
8251#endif /* INVARIANTS */
8252		adp->ad_state &= ~ATTACHED;
8253		adp->ad_state |= UNDONE;
8254	}
8255	/*
8256	 * The on-disk inode cannot claim to be any larger than the last
8257	 * fragment that has been written. Otherwise, the on-disk inode
8258	 * might have fragments that were not the last block in the file
8259	 * which would corrupt the filesystem.
8260	 */
8261	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8262	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8263		if (adp->ad_offset >= NDADDR)
8264			break;
8265		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8266		/* keep going until hitting a rollback to a frag */
8267		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8268			continue;
8269		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8270		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8271#ifdef INVARIANTS
8272			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8273				panic("softdep_write_inodeblock: lost dep1");
8274#endif /* INVARIANTS */
8275			dp->di_db[i] = 0;
8276		}
8277		for (i = 0; i < NIADDR; i++) {
8278#ifdef INVARIANTS
8279			if (dp->di_ib[i] != 0 &&
8280			    (deplist & ((1 << NDADDR) << i)) == 0)
8281				panic("softdep_write_inodeblock: lost dep2");
8282#endif /* INVARIANTS */
8283			dp->di_ib[i] = 0;
8284		}
8285		return;
8286	}
8287	/*
8288	 * If we have zero'ed out the last allocated block of the file,
8289	 * roll back the size to the last currently allocated block.
8290	 * We know that this last allocated block is full-sized, as
8291	 * we already checked for fragments in the loop above.
8292	 */
8293	if (lastadp != NULL &&
8294	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8295		for (i = lastadp->ad_offset; i >= 0; i--)
8296			if (dp->di_db[i] != 0)
8297				break;
8298		dp->di_size = (i + 1) * fs->fs_bsize;
8299	}
8300	/*
8301	 * The only dependencies are for indirect blocks.
8302	 *
8303	 * The file size for indirect block additions is not guaranteed.
8304	 * Such a guarantee would be non-trivial to achieve. The conventional
8305	 * synchronous write implementation also does not make this guarantee.
8306	 * Fsck should catch and fix discrepancies. Arguably, the file size
8307	 * can be over-estimated without destroying integrity when the file
8308	 * moves into the indirect blocks (i.e., is large). If we want to
8309	 * postpone fsck, we are stuck with this argument.
8310	 */
8311	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8312		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8313}
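/*
 * Worked example of the size rollback above (assuming fs_bsize == 16384):
 * if the last safely written allocation is a 4096-byte fragment at direct
 * block offset 3, di_size is rolled back to 3 * 16384 + 4096 == 53248
 * bytes and the remaining direct and indirect pointers are cleared, so
 * the on-disk inode never claims blocks beyond what has been written.
 */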
8314
8315/*
8316 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
8317 * Note that any bug fixes made to this routine must be done in the
8318 * version found above.
8319 *
8320 * Called from within the procedure above to deal with unsatisfied
8321 * allocation dependencies in an inodeblock. The buffer must be
8322 * locked, thus, no I/O completion operations can occur while we
8323 * are manipulating its associated dependencies.
8324 */
8325static void
8326initiate_write_inodeblock_ufs2(inodedep, bp)
8327	struct inodedep *inodedep;
8328	struct buf *bp;			/* The inode block */
8329{
8330	struct allocdirect *adp, *lastadp;
8331	struct ufs2_dinode *dp;
8332	struct ufs2_dinode *sip;
8333	struct inoref *inoref;
8334	struct fs *fs;
8335	ufs_lbn_t i;
8336#ifdef INVARIANTS
8337	ufs_lbn_t prevlbn = 0;
8338#endif
8339	int deplist;
8340
8341	if (inodedep->id_state & IOSTARTED)
8342		panic("initiate_write_inodeblock_ufs2: already started");
8343	inodedep->id_state |= IOSTARTED;
8344	fs = inodedep->id_fs;
8345	dp = (struct ufs2_dinode *)bp->b_data +
8346	    ino_to_fsbo(fs, inodedep->id_ino);
8347
8348	/*
8349	 * If we're on the unlinked list but have not yet written our
8350	 * next pointer, initialize it here.
8351	 */
8352	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8353		struct inodedep *inon;
8354
8355		inon = TAILQ_NEXT(inodedep, id_unlinked);
8356		dp->di_freelink = inon ? inon->id_ino : 0;
8357	}
8358	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
8359	    (UNLINKED | UNLINKNEXT)) {
8360		struct inodedep *inon;
8361		ino_t freelink;
8362
8363		inon = TAILQ_NEXT(inodedep, id_unlinked);
8364		freelink = inon ? inon->id_ino : 0;
8365		if (freelink != dp->di_freelink)
8366			panic("ino %p(0x%X) %d, %d != %d",
8367			    inodedep, inodedep->id_state, inodedep->id_ino,
8368			    freelink, dp->di_freelink);
8369	}
8370	/*
8371	 * If the bitmap is not yet written, then the allocated
8372	 * inode cannot be written to disk.
8373	 */
8374	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8375		if (inodedep->id_savedino2 != NULL)
8376			panic("initiate_write_inodeblock_ufs2: I/O underway");
8377		FREE_LOCK(&lk);
8378		sip = malloc(sizeof(struct ufs2_dinode),
8379		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8380		ACQUIRE_LOCK(&lk);
8381		inodedep->id_savedino2 = sip;
8382		*inodedep->id_savedino2 = *dp;
8383		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
8384		dp->di_gen = inodedep->id_savedino2->di_gen;
8385		dp->di_freelink = inodedep->id_savedino2->di_freelink;
8386		return;
8387	}
8388	/*
8389	 * If no dependencies, then there is nothing to roll back.
8390	 */
8391	inodedep->id_savedsize = dp->di_size;
8392	inodedep->id_savedextsize = dp->di_extsize;
8393	inodedep->id_savednlink = dp->di_nlink;
8394	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8395	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
8396	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8397		return;
8398	/*
8399	 * Revert the link count to that of the first unwritten journal entry.
8400	 */
8401	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8402	if (inoref)
8403		dp->di_nlink = inoref->if_nlink;
8404
8405	/*
8406	 * Set the ext data dependencies to busy.
8407	 */
8408	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8409	     adp = TAILQ_NEXT(adp, ad_next)) {
8410#ifdef INVARIANTS
8411		if (deplist != 0 && prevlbn >= adp->ad_offset)
8412			panic("softdep_write_inodeblock: lbn order");
8413		prevlbn = adp->ad_offset;
8414		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
8415			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8416			    "softdep_write_inodeblock",
8417			    (intmax_t)adp->ad_offset,
8418			    (intmax_t)dp->di_extb[adp->ad_offset],
8419			    (intmax_t)adp->ad_newblkno);
8420		deplist |= 1 << adp->ad_offset;
8421		if ((adp->ad_state & ATTACHED) == 0)
8422			panic("softdep_write_inodeblock: Unknown state 0x%x",
8423			    adp->ad_state);
8424#endif /* INVARIANTS */
8425		adp->ad_state &= ~ATTACHED;
8426		adp->ad_state |= UNDONE;
8427	}
8428	/*
8429	 * The on-disk inode cannot claim to be any larger than the last
8430	 * fragment that has been written. Otherwise, the on-disk inode
8431	 * might have fragments that were not the last block in the ext
8432	 * data which would corrupt the filesystem.
8433	 */
8434	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8435	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8436		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
8437		/* keep going until hitting a rollback to a frag */
8438		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8439			continue;
8440		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8441		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
8442#ifdef INVARIANTS
8443			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
8444				panic("softdep_write_inodeblock: lost dep1");
8445#endif /* INVARIANTS */
8446			dp->di_extb[i] = 0;
8447		}
8448		lastadp = NULL;
8449		break;
8450	}
8451	/*
8452	 * If we have zero'ed out the last allocated block of the ext
8453	 * data, roll back the size to the last currently allocated block.
8454	 * We know that this last allocated block is full-sized, as
8455	 * we already checked for fragments in the loop above.
8456	 */
8457	if (lastadp != NULL &&
8458	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8459		for (i = lastadp->ad_offset; i >= 0; i--)
8460			if (dp->di_extb[i] != 0)
8461				break;
8462		dp->di_extsize = (i + 1) * fs->fs_bsize;
8463	}
8464	/*
8465	 * Set the file data dependencies to busy.
8466	 */
8467	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8468	     adp = TAILQ_NEXT(adp, ad_next)) {
8469#ifdef INVARIANTS
8470		if (deplist != 0 && prevlbn >= adp->ad_offset)
8471			panic("softdep_write_inodeblock: lbn order");
8472		prevlbn = adp->ad_offset;
8473		if (adp->ad_offset < NDADDR &&
8474		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8475			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8476			    "softdep_write_inodeblock",
8477			    (intmax_t)adp->ad_offset,
8478			    (intmax_t)dp->di_db[adp->ad_offset],
8479			    (intmax_t)adp->ad_newblkno);
8480		if (adp->ad_offset >= NDADDR &&
8481		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8482			panic("%s: indirect pointer #%jd mismatch %jd != %jd",
8483			    "softdep_write_inodeblock",
8484			    (intmax_t)adp->ad_offset - NDADDR,
8485			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
8486			    (intmax_t)adp->ad_newblkno);
8487		deplist |= 1 << adp->ad_offset;
8488		if ((adp->ad_state & ATTACHED) == 0)
8489			panic("softdep_write_inodeblock: Unknown state 0x%x",
8490			    adp->ad_state);
8491#endif /* INVARIANTS */
8492		adp->ad_state &= ~ATTACHED;
8493		adp->ad_state |= UNDONE;
8494	}
8495	/*
8496	 * The on-disk inode cannot claim to be any larger than the last
8497	 * fragment that has been written. Otherwise, the on-disk inode
8498	 * might have fragments that were not the last block in the file
8499	 * which would corrupt the filesystem.
8500	 */
8501	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8502	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8503		if (adp->ad_offset >= NDADDR)
8504			break;
8505		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8506		/* keep going until hitting a rollback to a frag */
8507		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8508			continue;
8509		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8510		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8511#ifdef INVARIANTS
8512			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8513				panic("softdep_write_inodeblock: lost dep2");
8514#endif /* INVARIANTS */
8515			dp->di_db[i] = 0;
8516		}
8517		for (i = 0; i < NIADDR; i++) {
8518#ifdef INVARIANTS
8519			if (dp->di_ib[i] != 0 &&
8520			    (deplist & ((1 << NDADDR) << i)) == 0)
8521				panic("softdep_write_inodeblock: lost dep3");
8522#endif /* INVARIANTS */
8523			dp->di_ib[i] = 0;
8524		}
8525		return;
8526	}
8527	/*
8528	 * If we have zero'ed out the last allocated block of the file,
8529	 * roll back the size to the last currently allocated block.
8530	 * We know that this last allocated block is full-sized, as
8531	 * we already checked for fragments in the loop above.
8532	 */
8533	if (lastadp != NULL &&
8534	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8535		for (i = lastadp->ad_offset; i >= 0; i--)
8536			if (dp->di_db[i] != 0)
8537				break;
8538		dp->di_size = (i + 1) * fs->fs_bsize;
8539	}
8540	/*
8541	 * The only dependencies are for indirect blocks.
8542	 *
8543	 * The file size for indirect block additions is not guaranteed.
8544	 * Such a guarantee would be non-trivial to achieve. The conventional
8545	 * synchronous write implementation also does not make this guarantee.
8546	 * Fsck should catch and fix discrepancies. Arguably, the file size
8547	 * can be over-estimated without destroying integrity when the file
8548	 * moves into the indirect blocks (i.e., is large). If we want to
8549	 * postpone fsck, we are stuck with this argument.
8550	 */
8551	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8552		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8553}
8554
8555/*
8556 * Cancel an indirdep as a result of truncation.  Release all of the
8557 * children allocindirs and place their journal work on the appropriate
8558 * list.
8559 */
8560static void
8561cancel_indirdep(indirdep, bp, inodedep, freeblks)
8562	struct indirdep *indirdep;
8563	struct buf *bp;
8564	struct inodedep *inodedep;
8565	struct freeblks *freeblks;
8566{
8567	struct allocindir *aip;
8568
8569	/*
8570	 * None of the indirect pointers will ever be visible,
8571	 * so they can simply be tossed. GOINGAWAY ensures
8572	 * that allocated pointers will be saved in the buffer
8573	 * cache until they are freed. Note that they will
8574	 * only be able to be found by their physical address
8575	 * since the inode mapping the logical address will
8576	 * be gone. The save buffer used for the safe copy
8577	 * was allocated in setup_allocindir_phase2 using
8578	 * the physical address so it could be used for this
8579	 * purpose. Hence we swap the safe copy with the real
8580	 * copy, allowing the safe copy to be freed and holding
8581	 * on to the real copy for later use in indir_trunc.
8582	 */
8583	if (indirdep->ir_state & GOINGAWAY)
8584		panic("cancel_indirdep: already gone");
8585	if (indirdep->ir_state & ONDEPLIST) {
8586		indirdep->ir_state &= ~ONDEPLIST;
8587		LIST_REMOVE(indirdep, ir_next);
8588	}
8589	indirdep->ir_state |= GOINGAWAY;
8590	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
8591	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
8592		cancel_allocindir(aip, inodedep, freeblks);
8593	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
8594		cancel_allocindir(aip, inodedep, freeblks);
8595	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
8596		cancel_allocindir(aip, inodedep, freeblks);
8597	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
8598		cancel_allocindir(aip, inodedep, freeblks);
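	/*
	 * Copy the live pointers into the save buffer and move this
	 * dependency onto it so the pointers remain reachable after
	 * the original buffer is released (see the comment above).
	 */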
8599	bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
8600	WORKLIST_REMOVE(&indirdep->ir_list);
8601	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
8602	indirdep->ir_savebp = NULL;
8603}
8604
8605/*
8606 * Free an indirdep once it no longer has new pointers to track.
8607 */
8608static void
8609free_indirdep(indirdep)
8610	struct indirdep *indirdep;
8611{
8612
8613	KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
8614	    ("free_indirdep: Journal work not empty."));
8615	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
8616	    ("free_indirdep: Complete head not empty."));
8617	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
8618	    ("free_indirdep: write head not empty."));
8619	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
8620	    ("free_indirdep: done head not empty."));
8621	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
8622	    ("free_indirdep: deplist head not empty."));
8623	KASSERT(indirdep->ir_savebp == NULL,
8624	    ("free_indirdep: %p ir_savebp != NULL", indirdep));
8625	KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
8626	    ("free_indirdep: %p still on deplist.", indirdep));
8627	if (indirdep->ir_state & ONWORKLIST)
8628		WORKLIST_REMOVE(&indirdep->ir_list);
8629	WORKITEM_FREE(indirdep, D_INDIRDEP);
8630}
8631
8632/*
8633 * Called before a write to an indirdep.  This routine is responsible for
8634 * rolling back pointers to a safe state which includes only those
8635 * allocindirs which have been completed.
8636 */
8637static void
8638initiate_write_indirdep(indirdep, bp)
8639	struct indirdep *indirdep;
8640	struct buf *bp;
8641{
8642
8643	if (indirdep->ir_state & GOINGAWAY)
8644		panic("disk_io_initiation: indirdep gone");
8645
8646	/*
8647	 * If there are no remaining dependencies, this will be writing
8648	 * the real pointers.
8649	 */
8650	if (LIST_EMPTY(&indirdep->ir_deplisthd))
8651		return;
8652	/*
8653	 * Replace up-to-date version with safe version.
8654	 */
8655	FREE_LOCK(&lk);
8656	indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
8657	    M_SOFTDEP_FLAGS);
8658	ACQUIRE_LOCK(&lk);
8659	indirdep->ir_state &= ~ATTACHED;
8660	indirdep->ir_state |= UNDONE;
8661	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
8662	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
8663	    bp->b_bcount);
8664}
8665
8666/*
8667 * Called when an inode has been cleared in a cg bitmap.  This finally
8668 * eliminates any canceled jaddrefs.
8669 */
8670void
8671softdep_setup_inofree(mp, bp, ino, wkhd)
8672	struct mount *mp;
8673	struct buf *bp;
8674	ino_t ino;
8675	struct workhead *wkhd;
8676{
8677	struct worklist *wk, *wkn;
8678	struct inodedep *inodedep;
8679	uint8_t *inosused;
8680	struct cg *cgp;
8681	struct fs *fs;
8682
8683	ACQUIRE_LOCK(&lk);
8684	fs = VFSTOUFS(mp)->um_fs;
8685	cgp = (struct cg *)bp->b_data;
8686	inosused = cg_inosused(cgp);
8687	if (isset(inosused, ino % fs->fs_ipg))
8688		panic("softdep_setup_inofree: inode %d not freed.", ino);
8689	if (inodedep_lookup(mp, ino, 0, &inodedep))
8690		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
8691		    ino, inodedep);
8692	if (wkhd) {
8693		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
8694			if (wk->wk_type != D_JADDREF)
8695				continue;
8696			WORKLIST_REMOVE(wk);
8697			/*
8698			 * We can free immediately even if the jaddref
8699			 * isn't attached in a background write, as the
8700			 * bitmaps are now reconciled.
8701			 */
8702			wk->wk_state |= COMPLETE | ATTACHED;
8703			free_jaddref(WK_JADDREF(wk));
8704		}
8705		jwork_move(&bp->b_dep, wkhd);
8706	}
8707	FREE_LOCK(&lk);
8708}
8709
8710
8711/*
8712 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
8713 * map.  Any dependencies waiting for the write to clear are added to the
8714 * buf's list and any jnewblks that are being canceled are discarded
8715 * immediately.
8716 */
8717void
8718softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
8719	struct mount *mp;
8720	struct buf *bp;
8721	ufs2_daddr_t blkno;
8722	int frags;
8723	struct workhead *wkhd;
8724{
8725	struct jnewblk *jnewblk;
8726	struct worklist *wk, *wkn;
8727#ifdef SUJ_DEBUG
8728	struct bmsafemap *bmsafemap;
8729	struct fs *fs;
8730	uint8_t *blksfree;
8731	struct cg *cgp;
8732	ufs2_daddr_t jstart;
8733	ufs2_daddr_t jend;
8734	ufs2_daddr_t end;
8735	long bno;
8736	int i;
8737#endif
8738
8739	ACQUIRE_LOCK(&lk);
8740	/*
8741	 * Detach any jnewblks which have been canceled.  They must linger
8742	 * until the bitmap is cleared again by ffs_blkfree() to prevent
8743	 * an unjournaled allocation from hitting the disk.
8744	 */
8745	if (wkhd) {
8746		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
8747			if (wk->wk_type != D_JNEWBLK)
8748				continue;
8749			jnewblk = WK_JNEWBLK(wk);
8750			KASSERT(jnewblk->jn_state & GOINGAWAY,
8751			    ("softdep_setup_blkfree: jnewblk not canceled."));
8752			WORKLIST_REMOVE(wk);
8753#ifdef SUJ_DEBUG
8754			/*
8755			 * Assert that this block is free in the bitmap
8756			 * before we discard the jnewblk.
8757			 */
8758			fs = VFSTOUFS(mp)->um_fs;
8759			cgp = (struct cg *)bp->b_data;
8760			blksfree = cg_blksfree(cgp);
8761			bno = dtogd(fs, jnewblk->jn_blkno);
8762			for (i = jnewblk->jn_oldfrags;
8763			    i < jnewblk->jn_frags; i++) {
8764				if (isset(blksfree, bno + i))
8765					continue;
8766				panic("softdep_setup_blkfree: not free");
8767			}
8768#endif
8769			/*
8770			 * Even if it's not attached we can free immediately
8771			 * as the new bitmap is correct.
8772			 */
8773			wk->wk_state |= COMPLETE | ATTACHED;
8774			free_jnewblk(jnewblk);
8775		}
8776		/*
8777		 * The buf must be locked by the caller otherwise these could
8778		 * be added while it's being written and the write would
8779		 * complete them before they made it to disk.
8780		 */
8781		jwork_move(&bp->b_dep, wkhd);
8782	}
8783
8784#ifdef SUJ_DEBUG
8785	/*
8786	 * Assert that we are not freeing a block which has an outstanding
8787	 * allocation dependency.
8788	 */
8789	fs = VFSTOUFS(mp)->um_fs;
8790	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
8791	end = blkno + frags;
8792	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
8793		/*
8794		 * Don't match against blocks that will be freed when the
8795		 * background write is done.
8796		 */
8797		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
8798		    (COMPLETE | DEPCOMPLETE))
8799			continue;
8800		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
8801		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
8802		if ((blkno >= jstart && blkno < jend) ||
8803		    (end > jstart && end <= jend)) {
8804			printf("state 0x%X %jd - %d %d dep %p\n",
8805			    jnewblk->jn_state, jnewblk->jn_blkno,
8806			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
8807			    jnewblk->jn_newblk);
8808			panic("softdep_setup_blkfree: "
8809			    "%jd-%jd(%d) overlaps with %jd-%jd",
8810			    blkno, end, frags, jstart, jend);
8811		}
8812	}
8813#endif
8814	FREE_LOCK(&lk);
8815}
8816
8817static void
8818initiate_write_bmsafemap(bmsafemap, bp)
8819	struct bmsafemap *bmsafemap;
8820	struct buf *bp;			/* The cg block. */
8821{
8822	struct jaddref *jaddref;
8823	struct jnewblk *jnewblk;
8824	uint8_t *inosused;
8825	uint8_t *blksfree;
8826	struct cg *cgp;
8827	struct fs *fs;
8828	int cleared;
8829	ino_t ino;
8830	long bno;
8831	int i;
8832
8833	if (bmsafemap->sm_state & IOSTARTED)
8834		panic("initiate_write_bmsafemap: Already started\n");
8835	bmsafemap->sm_state |= IOSTARTED;
8836	/*
8837	 * Clear any inode allocations which are pending journal writes.
8838	 */
8839	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
8840		cgp = (struct cg *)bp->b_data;
8841		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
8842		inosused = cg_inosused(cgp);
8843		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
8844			ino = jaddref->ja_ino % fs->fs_ipg;
8845			/*
8846			 * If this is a background copy the inode may not
8847			 * be marked used yet.
8848			 */
8849			if (isset(inosused, ino)) {
8850				if ((jaddref->ja_mode & IFMT) == IFDIR)
8851					cgp->cg_cs.cs_ndir--;
8852				cgp->cg_cs.cs_nifree++;
8853				clrbit(inosused, ino);
8854				jaddref->ja_state &= ~ATTACHED;
8855				jaddref->ja_state |= UNDONE;
8856				stat_jaddref++;
8857			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
8858				panic("initiate_write_bmsafemap: inode %d "
8859				    "marked free", jaddref->ja_ino);
8860		}
8861	}
8862	/*
8863	 * Clear any block allocations which are pending journal writes.
8864	 */
8865	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
8866		cgp = (struct cg *)bp->b_data;
8867		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
8868		blksfree = cg_blksfree(cgp);
8869		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
8870			bno = dtogd(fs, jnewblk->jn_blkno);
8871			cleared = 0;
8872			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
8873			    i++) {
8874				if (isclr(blksfree, bno + i)) {
8875					cleared = 1;
8876					setbit(blksfree, bno + i);
8877				}
8878			}
8879			/*
8880			 * We may not clear the block if it's a background
8881			 * copy.  In that case there is no reason to detach
8882			 * it.
8883			 */
8884			if (cleared) {
8885				stat_jnewblk++;
8886				jnewblk->jn_state &= ~ATTACHED;
8887				jnewblk->jn_state |= UNDONE;
8888			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
8889				panic("initiate_write_bmsafemap: block %jd "
8890				    "marked free", jnewblk->jn_blkno);
8891		}
8892	}
8893	/*
8894	 * Move allocation lists to the written lists so they can be
8895	 * cleared once the block write is complete.
8896	 */
8897	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
8898	    inodedep, id_deps);
8899	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
8900	    newblk, nb_deps);
8901}
8902
8903/*
8904 * This routine is called during the completion interrupt
8905 * service routine for a disk write (from the procedure called
8906 * by the device driver to inform the filesystem caches of
8907 * a request completion).  It should be called early in this
8908 * procedure, before the block is made available to other
8909 * processes or other routines are called.
8910 *
8911 */
8912static void
8913softdep_disk_write_complete(bp)
8914	struct buf *bp;		/* describes the completed disk write */
8915{
8916	struct worklist *wk;
8917	struct worklist *owk;
8918	struct workhead reattach;
8919	struct buf *sbp;
8920
8921	/*
8922	 * If an error occurred while doing the write, then the data
8923	 * has not hit the disk and the dependencies cannot be unrolled.
8924	 */
8925	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
8926		return;
8927	LIST_INIT(&reattach);
8928	/*
8929	 * This lock must not be released anywhere in this code segment.
8930	 */
8931	sbp = NULL;
8932	owk = NULL;
8933	ACQUIRE_LOCK(&lk);
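	/*
	 * owk tracks the previously handled item; finding it at the
	 * head again means a handler failed to make progress and the
	 * loop would never terminate.
	 */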
8934	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
8935		WORKLIST_REMOVE(wk);
8936		if (wk == owk)
8937			panic("duplicate worklist: %p\n", wk);
8938		owk = wk;
8939		switch (wk->wk_type) {
8940
8941		case D_PAGEDEP:
8942			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
8943				WORKLIST_INSERT(&reattach, wk);
8944			continue;
8945
8946		case D_INODEDEP:
8947			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
8948				WORKLIST_INSERT(&reattach, wk);
8949			continue;
8950
8951		case D_BMSAFEMAP:
8952			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
8953				WORKLIST_INSERT(&reattach, wk);
8954			continue;
8955
8956		case D_MKDIR:
8957			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
8958			continue;
8959
8960		case D_ALLOCDIRECT:
8961			wk->wk_state |= COMPLETE;
8962			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
8963			continue;
8964
8965		case D_ALLOCINDIR:
8966			wk->wk_state |= COMPLETE;
8967			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
8968			continue;
8969
8970		case D_INDIRDEP:
8971			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
8972				WORKLIST_INSERT(&reattach, wk);
8973			continue;
8974
8975		case D_FREEBLKS:
8976			wk->wk_state |= COMPLETE;
8977			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
8978				add_to_worklist(wk, 1);
8979			continue;
8980
8981		case D_FREEWORK:
8982			handle_written_freework(WK_FREEWORK(wk));
8983			break;
8984
8985		case D_FREEDEP:
8986			free_freedep(WK_FREEDEP(wk));
8987			continue;
8988
8989		case D_JSEGDEP:
8990			free_jsegdep(WK_JSEGDEP(wk));
8991			continue;
8992
8993		case D_JSEG:
8994			handle_written_jseg(WK_JSEG(wk), bp);
8995			continue;
8996
8997		case D_SBDEP:
8998			if (handle_written_sbdep(WK_SBDEP(wk), bp))
8999				WORKLIST_INSERT(&reattach, wk);
9000			continue;
9001
9002		default:
9003			panic("softdep_disk_write_complete: Unknown type %s",
9004			    TYPENAME(wk->wk_type));
9005			/* NOTREACHED */
9006		}
9007	}
9008	/*
9009	 * Reattach any requests that must be redone.
9010	 */
9011	while ((wk = LIST_FIRST(&reattach)) != NULL) {
9012		WORKLIST_REMOVE(wk);
9013		WORKLIST_INSERT(&bp->b_dep, wk);
9014	}
9015	FREE_LOCK(&lk);
9016	if (sbp)
9017		brelse(sbp);
9018}
9019
9020/*
9021 * Called from within softdep_disk_write_complete above. Note that
9022 * this routine is always called from interrupt level with further
9023 * splbio interrupts blocked.
9024 */
9025static void
9026handle_allocdirect_partdone(adp, wkhd)
9027	struct allocdirect *adp;	/* the completed allocdirect */
9028	struct workhead *wkhd;		/* Work to do when inode is written. */
9029{
9030	struct allocdirectlst *listhead;
9031	struct allocdirect *listadp;
9032	struct inodedep *inodedep;
9033	long bsize;
9034
9035	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9036		return;
9037	/*
9038	 * The on-disk inode cannot claim to be any larger than the last
9039	 * fragment that has been written. Otherwise, the on-disk inode
9040	 * might have fragments that were not the last block in the file
9041	 * which would corrupt the filesystem. Thus, we cannot free any
9042	 * allocdirects after one whose ad_oldblkno claims a fragment as
9043	 * these blocks must be rolled back to zero before writing the inode.
9044	 * We check the currently active set of allocdirects in id_inoupdt
9045	 * or id_extupdt as appropriate.
9046	 */
9047	inodedep = adp->ad_inodedep;
9048	bsize = inodedep->id_fs->fs_bsize;
9049	if (adp->ad_state & EXTDATA)
9050		listhead = &inodedep->id_extupdt;
9051	else
9052		listhead = &inodedep->id_inoupdt;
9053	TAILQ_FOREACH(listadp, listhead, ad_next) {
9054		/* found our block */
9055		if (listadp == adp)
9056			break;
9057		/* continue if ad_oldsize is not a fragment */
9058		if (listadp->ad_oldsize == 0 ||
9059		    listadp->ad_oldsize == bsize)
9060			continue;
9061		/* hit a fragment */
9062		return;
9063	}
9064	/*
9065	 * If we have reached the end of the current list without
9066	 * finding the just finished dependency, then it must be
9067	 * on the future dependency list. Future dependencies cannot
9068	 * be freed until they are moved to the current list.
9069	 */
9070	if (listadp == NULL) {
9071#ifdef DEBUG
9072		if (adp->ad_state & EXTDATA)
9073			listhead = &inodedep->id_newextupdt;
9074		else
9075			listhead = &inodedep->id_newinoupdt;
9076		TAILQ_FOREACH(listadp, listhead, ad_next)
9077			/* found our block */
9078			if (listadp == adp)
9079				break;
9080		if (listadp == NULL)
9081			panic("handle_allocdirect_partdone: lost dep");
9082#endif /* DEBUG */
9083		return;
9084	}
9085	/*
9086	 * If we have found the just finished dependency, then queue
9087	 * it along with anything that follows it that is complete.
9088	 * Since the pointer has not yet been written in the inode
9089	 * as the dependency prevents it, place the allocdirect on the
9090	 * bufwait list where it will be freed once the pointer is
9091	 * valid.
9092	 */
9093	if (wkhd == NULL)
9094		wkhd = &inodedep->id_bufwait;
9095	for (; adp; adp = listadp) {
9096		listadp = TAILQ_NEXT(adp, ad_next);
9097		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9098			return;
9099		TAILQ_REMOVE(listhead, adp, ad_next);
9100		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
9101	}
9102}
9103
9104/*
9105 * Called from within softdep_disk_write_complete above.  This routine
9106 * completes successfully written allocindirs.
9107 */
9108static void
9109handle_allocindir_partdone(aip)
9110	struct allocindir *aip;		/* the completed allocindir */
9111{
9112	struct indirdep *indirdep;
9113
9114	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
9115		return;
9116	indirdep = aip->ai_indirdep;
9117	LIST_REMOVE(aip, ai_next);
9118	if (indirdep->ir_state & UNDONE) {
9119		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
9120		return;
9121	}
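	/*
	 * The indirect block is not rolled back, so record the
	 * completed pointer in the save copy used for future
	 * rollbacks.
	 */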
9122	if (indirdep->ir_state & UFS1FMT)
9123		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9124		    aip->ai_newblkno;
9125	else
9126		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9127		    aip->ai_newblkno;
9128	/*
9129	 * Await the pointer write before freeing the allocindir.
9130	 */
9131	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
9132}
9133
9134/*
9135 * Release segments held on a jwork list.
9136 */
9137static void
9138handle_jwork(wkhd)
9139	struct workhead *wkhd;
9140{
9141	struct worklist *wk;
9142
9143	while ((wk = LIST_FIRST(wkhd)) != NULL) {
9144		WORKLIST_REMOVE(wk);
9145		switch (wk->wk_type) {
9146		case D_JSEGDEP:
9147			free_jsegdep(WK_JSEGDEP(wk));
9148			continue;
9149		default:
9150			panic("handle_jwork: Unknown type %s\n",
9151			    TYPENAME(wk->wk_type));
9152		}
9153	}
9154}
9155
9156/*
9157 * Handle the bufwait list on an inode when it is safe to release items
9158 * held there.  This normally happens after an inode block is written but
9159 * may be delayed and handle later if there are pending journal items that
9160 * may be delayed and handled later if there are pending journal items that
9161 */
9162static struct freefile *
9163handle_bufwait(inodedep, refhd)
9164	struct inodedep *inodedep;
9165	struct workhead *refhd;
9166{
9167	struct jaddref *jaddref;
9168	struct freefile *freefile;
9169	struct worklist *wk;
9170
9171	freefile = NULL;
9172	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
9173		WORKLIST_REMOVE(wk);
9174		switch (wk->wk_type) {
9175		case D_FREEFILE:
9176			/*
9177			 * We defer adding freefile to the worklist
9178			 * until all other additions have been made to
9179			 * ensure that it will be done after all the
9180			 * old blocks have been freed.
9181			 */
9182			if (freefile != NULL)
9183				panic("handle_bufwait: freefile");
9184			freefile = WK_FREEFILE(wk);
9185			continue;
9186
9187		case D_MKDIR:
9188			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
9189			continue;
9190
9191		case D_DIRADD:
9192			diradd_inode_written(WK_DIRADD(wk), inodedep);
9193			continue;
9194
9195		case D_FREEFRAG:
9196			wk->wk_state |= COMPLETE;
9197			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
9198				add_to_worklist(wk, 0);
9199			continue;
9200
9201		case D_DIRREM:
9202			wk->wk_state |= COMPLETE;
9203			add_to_worklist(wk, 0);
9204			continue;
9205
9206		case D_ALLOCDIRECT:
9207		case D_ALLOCINDIR:
9208			free_newblk(WK_NEWBLK(wk));
9209			continue;
9210
9211		case D_JNEWBLK:
9212			wk->wk_state |= COMPLETE;
9213			free_jnewblk(WK_JNEWBLK(wk));
9214			continue;
9215
9216		/*
9217		 * Save freed journal segments and add references on
9218		 * the supplied list which will delay their release
9219		 * until the cg bitmap is cleared on disk.
9220		 */
9221		case D_JSEGDEP:
9222			if (refhd == NULL)
9223				free_jsegdep(WK_JSEGDEP(wk));
9224			else
9225				WORKLIST_INSERT(refhd, wk);
9226			continue;
9227
9228		case D_JADDREF:
9229			jaddref = WK_JADDREF(wk);
9230			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
9231			    if_deps);
9232			/*
9233			 * Transfer any jaddrefs to the list to be freed with
9234			 * the bitmap if we're handling a removed file.
9235			 */
9236			if (refhd == NULL) {
9237				wk->wk_state |= COMPLETE;
9238				free_jaddref(jaddref);
9239			} else
9240				WORKLIST_INSERT(refhd, wk);
9241			continue;
9242
9243		default:
9244			panic("handle_bufwait: Unknown type %p(%s)",
9245			    wk, TYPENAME(wk->wk_type));
9246			/* NOTREACHED */
9247		}
9248	}
9249	return (freefile);
9250}
9251/*
9252 * Called from within softdep_disk_write_complete above to restore
9253 * in-memory inode block contents to their most up-to-date state. Note
9254 * that this routine is always called from interrupt level with further
9255 * splbio interrupts blocked.
9256 */
9257static int
9258handle_written_inodeblock(inodedep, bp)
9259	struct inodedep *inodedep;
9260	struct buf *bp;		/* buffer containing the inode block */
9261{
9262	struct freefile *freefile;
9263	struct allocdirect *adp, *nextadp;
9264	struct ufs1_dinode *dp1 = NULL;
9265	struct ufs2_dinode *dp2 = NULL;
9266	struct workhead wkhd;
9267	int hadchanges, fstype;
9268	ino_t freelink;
9269
9270	LIST_INIT(&wkhd);
9271	hadchanges = 0;
9272	freefile = NULL;
9273	if ((inodedep->id_state & IOSTARTED) == 0)
9274		panic("handle_written_inodeblock: not started");
9275	inodedep->id_state &= ~IOSTARTED;
9276	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
9277		fstype = UFS1;
9278		dp1 = (struct ufs1_dinode *)bp->b_data +
9279		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9280		freelink = dp1->di_freelink;
9281	} else {
9282		fstype = UFS2;
9283		dp2 = (struct ufs2_dinode *)bp->b_data +
9284		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9285		freelink = dp2->di_freelink;
9286	}
9287	/*
9288	 * If we wrote a valid freelink pointer during the last write,
9289	 * record it here.
9290	 */
9291	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9292		struct inodedep *inon;
9293
9294		inon = TAILQ_NEXT(inodedep, id_unlinked);
9295		if ((inon == NULL && freelink == 0) ||
9296		    (inon && inon->id_ino == freelink)) {
9297			if (inon)
9298				inon->id_state |= UNLINKPREV;
9299			inodedep->id_state |= UNLINKNEXT;
9300		} else
9301			hadchanges = 1;
9302	}
9303	/* Leave this inodeblock dirty until it's in the list. */
9304	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED)
9305		hadchanges = 1;
9306	/*
9307	 * If we had to rollback the inode allocation because of
9308	 * bitmaps being incomplete, then simply restore it.
9309	 * Keep the block dirty so that it will not be reclaimed until
9310	 * all associated dependencies have been cleared and the
9311	 * corresponding updates written to disk.
9312	 */
9313	if (inodedep->id_savedino1 != NULL) {
9314		hadchanges = 1;
9315		if (fstype == UFS1)
9316			*dp1 = *inodedep->id_savedino1;
9317		else
9318			*dp2 = *inodedep->id_savedino2;
9319		free(inodedep->id_savedino1, M_SAVEDINO);
9320		inodedep->id_savedino1 = NULL;
9321		if ((bp->b_flags & B_DELWRI) == 0)
9322			stat_inode_bitmap++;
9323		bdirty(bp);
9324		/*
9325		 * If the inode is clear here and GOINGAWAY it will never
9326		 * be written.  Process the bufwait and clear any pending
9327		 * work which may include the freefile.
9328		 */
9329		if (inodedep->id_state & GOINGAWAY)
9330			goto bufwait;
9331		return (1);
9332	}
9333	inodedep->id_state |= COMPLETE;
9334	/*
9335	 * Roll forward anything that had to be rolled back before
9336	 * the inode could be updated.
9337	 */
9338	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
9339		nextadp = TAILQ_NEXT(adp, ad_next);
9340		if (adp->ad_state & ATTACHED)
9341			panic("handle_written_inodeblock: new entry");
9342		if (fstype == UFS1) {
9343			if (adp->ad_offset < NDADDR) {
9344				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9345					panic("%s %s #%jd mismatch %d != %jd",
9346					    "handle_written_inodeblock:",
9347					    "direct pointer",
9348					    (intmax_t)adp->ad_offset,
9349					    dp1->di_db[adp->ad_offset],
9350					    (intmax_t)adp->ad_oldblkno);
9351				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
9352			} else {
9353				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
9354					panic("%s: %s #%jd allocated as %d",
9355					    "handle_written_inodeblock",
9356					    "indirect pointer",
9357					    (intmax_t)adp->ad_offset - NDADDR,
9358					    dp1->di_ib[adp->ad_offset - NDADDR]);
9359				dp1->di_ib[adp->ad_offset - NDADDR] =
9360				    adp->ad_newblkno;
9361			}
9362		} else {
9363			if (adp->ad_offset < NDADDR) {
9364				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9365					panic("%s: %s #%jd %s %jd != %jd",
9366					    "handle_written_inodeblock",
9367					    "direct pointer",
9368					    (intmax_t)adp->ad_offset, "mismatch",
9369					    (intmax_t)dp2->di_db[adp->ad_offset],
9370					    (intmax_t)adp->ad_oldblkno);
9371				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
9372			} else {
9373				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
9374					panic("%s: %s #%jd allocated as %jd",
9375					    "handle_written_inodeblock",
9376					    "indirect pointer",
9377					    (intmax_t)adp->ad_offset - NDADDR,
9378					    (intmax_t)
9379					    dp2->di_ib[adp->ad_offset - NDADDR]);
9380				dp2->di_ib[adp->ad_offset - NDADDR] =
9381				    adp->ad_newblkno;
9382			}
9383		}
9384		adp->ad_state &= ~UNDONE;
9385		adp->ad_state |= ATTACHED;
9386		hadchanges = 1;
9387	}
9388	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
9389		nextadp = TAILQ_NEXT(adp, ad_next);
9390		if (adp->ad_state & ATTACHED)
9391			panic("handle_written_inodeblock: new entry");
9392		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
9393			panic("%s: direct pointers #%jd %s %jd != %jd",
9394			    "handle_written_inodeblock",
9395			    (intmax_t)adp->ad_offset, "mismatch",
9396			    (intmax_t)dp2->di_extb[adp->ad_offset],
9397			    (intmax_t)adp->ad_oldblkno);
9398		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
9399		adp->ad_state &= ~UNDONE;
9400		adp->ad_state |= ATTACHED;
9401		hadchanges = 1;
9402	}
9403	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
9404		stat_direct_blk_ptrs++;
9405	/*
9406	 * Reset the file size to its most up-to-date value.
9407	 */
9408	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
9409		panic("handle_written_inodeblock: bad size");
9410	if (inodedep->id_savednlink > LINK_MAX)
9411		panic("handle_written_inodeblock: Invalid link count "
9412		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
9413	if (fstype == UFS1) {
9414		if (dp1->di_nlink != inodedep->id_savednlink) {
9415			dp1->di_nlink = inodedep->id_savednlink;
9416			hadchanges = 1;
9417		}
9418		if (dp1->di_size != inodedep->id_savedsize) {
9419			dp1->di_size = inodedep->id_savedsize;
9420			hadchanges = 1;
9421		}
9422	} else {
9423		if (dp2->di_nlink != inodedep->id_savednlink) {
9424			dp2->di_nlink = inodedep->id_savednlink;
9425			hadchanges = 1;
9426		}
9427		if (dp2->di_size != inodedep->id_savedsize) {
9428			dp2->di_size = inodedep->id_savedsize;
9429			hadchanges = 1;
9430		}
9431		if (dp2->di_extsize != inodedep->id_savedextsize) {
9432			dp2->di_extsize = inodedep->id_savedextsize;
9433			hadchanges = 1;
9434		}
9435	}
9436	inodedep->id_savedsize = -1;
9437	inodedep->id_savedextsize = -1;
9438	inodedep->id_savednlink = -1;
9439	/*
9440	 * If there were any rollbacks in the inode block, then it must be
9441	 * marked dirty so that it will eventually get written back in
9442	 * its correct form.
9443	 */
9444	if (hadchanges)
9445		bdirty(bp);
9446bufwait:
9447	/*
9448	 * Process any allocdirects that completed during the update.
9449	 */
9450	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
9451		handle_allocdirect_partdone(adp, &wkhd);
9452	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
9453		handle_allocdirect_partdone(adp, &wkhd);
9454	/*
9455	 * Process deallocations that were held pending until the
9456	 * inode had been written to disk. Freeing of the inode
9457	 * is delayed until after all blocks have been freed to
9458	 * avoid creation of new <vfsid, inum, lbn> triples
9459	 * before the old ones have been deleted.  Completely
9460	 * unlinked inodes are not processed until the unlinked
9461	 * inode list is written or the last reference is removed.
9462	 */
9463	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
9464		freefile = handle_bufwait(inodedep, NULL);
9465		if (freefile && !LIST_EMPTY(&wkhd)) {
9466			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
9467			freefile = NULL;
9468		}
9469	}
9470	/*
9471	 * Move rolled forward dependency completions to the bufwait list
9472	 * now that those that were already written have been processed.
9473	 */
9474	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
9475		panic("handle_written_inodeblock: bufwait but no changes");
9476	jwork_move(&inodedep->id_bufwait, &wkhd);
9477
9478	if (freefile != NULL) {
9479		/*
9480		 * If the inode is goingaway it was never written.  Fake up
9481		 * the state here so free_inodedep() can succeed.
9482		 */
9483		if (inodedep->id_state & GOINGAWAY)
9484			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
9485		if (free_inodedep(inodedep) == 0)
9486			panic("handle_written_inodeblock: live inodedep %p",
9487			    inodedep);
9488		add_to_worklist(&freefile->fx_list, 0);
9489		return (0);
9490	}
9491
9492	/*
9493	 * If no outstanding dependencies, free it.
9494	 */
9495	if (free_inodedep(inodedep) ||
9496	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
9497	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
9498	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
9499	     LIST_FIRST(&inodedep->id_bufwait) == 0))
9500		return (0);
9501	return (hadchanges);
9502}
9503
9504static int
9505handle_written_indirdep(indirdep, bp, bpp)
9506	struct indirdep *indirdep;
9507	struct buf *bp;
9508	struct buf **bpp;
9509{
9510	struct allocindir *aip;
9511	int chgs;
9512
9513	if (indirdep->ir_state & GOINGAWAY)
9514		panic("disk_write_complete: indirdep gone");
9515	chgs = 0;
9516	/*
9517	 * If there were rollbacks revert them here.
9518	 */
9519	if (indirdep->ir_saveddata) {
9520		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
9521		free(indirdep->ir_saveddata, M_INDIRDEP);
9522		indirdep->ir_saveddata = 0;
9523		chgs = 1;
9524	}
9525	indirdep->ir_state &= ~UNDONE;
9526	indirdep->ir_state |= ATTACHED;
9527	/*
9528	 * Move allocindirs with written pointers to the completehd if
9529	 * the indirdep's pointer is not yet written.  Otherwise
9530	 * free them here.
9531	 */
9532	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
9533		LIST_REMOVE(aip, ai_next);
9534		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
9535			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
9536			    ai_next);
9537			continue;
9538		}
9539		free_newblk(&aip->ai_block);
9540	}
9541	/*
9542	 * Move allocindirs that have finished dependency processing from
9543	 * the done list to the write list after updating the pointers.
9544	 */
9545	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
9546		handle_allocindir_partdone(aip);
9547		if (aip == LIST_FIRST(&indirdep->ir_donehd))
9548			panic("disk_write_complete: not gone");
9549		chgs = 1;
9550	}
9551	/*
9552	 * If this indirdep has been detached from its newblk during
9553	 * I/O we need to keep this dep attached to the buffer so
9554	 * deallocate_dependencies can find it and properly resolve
9555	 * any outstanding dependencies.
9556	 */
9557	if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
9558		chgs = 1;
9559	if ((bp->b_flags & B_DELWRI) == 0)
9560		stat_indir_blk_ptrs++;
9561	/*
9562	 * If there were no changes we can discard the savebp and detach
9563	 * ourselves from the buf.  We are only carrying completed pointers
9564	 * in this case.
9565	 */
9566	if (chgs == 0) {
9567		struct buf *sbp;
9568
9569		sbp = indirdep->ir_savebp;
9570		sbp->b_flags |= B_INVAL | B_NOCACHE;
9571		indirdep->ir_savebp = NULL;
9572		if (*bpp != NULL)
9573			panic("handle_written_indirdep: bp already exists.");
9574		*bpp = sbp;
9575	} else
9576		bdirty(bp);
9577	/*
9578	 * If there are no fresh dependencies and none waiting on writes
9579	 * we can free the indirdep.
9580	 */
9581	if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
9582		if (indirdep->ir_state & ONDEPLIST)
9583			LIST_REMOVE(indirdep, ir_next);
9584		free_indirdep(indirdep);
9585		return (0);
9586	}
9587
9588	return (chgs);
9589}
9590
9591/*
9592 * Process a diradd entry after its dependent inode has been written.
9593 * This routine must be called with splbio interrupts blocked.
9594 */
9595static void
9596diradd_inode_written(dap, inodedep)
9597	struct diradd *dap;
9598	struct inodedep *inodedep;
9599{
9600
9601	dap->da_state |= COMPLETE;
9602	complete_diradd(dap);
9603	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9604}
9605
9606/*
9607 * Returns true if the bmsafemap will have rollbacks when written.  Must
9608 * only be called with lk and the buf lock on the cg held.
9609 */
9610static int
9611bmsafemap_rollbacks(bmsafemap)
9612	struct bmsafemap *bmsafemap;
9613{
9614
9615	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
9616	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
9617}
9618
9619/*
9620 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
9621 * changes if it's not a background write.  Set all written dependencies
9622 * to DEPCOMPLETE and free the structure if possible.
9623 */
9624static int
9625handle_written_bmsafemap(bmsafemap, bp)
9626	struct bmsafemap *bmsafemap;
9627	struct buf *bp;
9628{
9629	struct newblk *newblk;
9630	struct inodedep *inodedep;
9631	struct jaddref *jaddref, *jatmp;
9632	struct jnewblk *jnewblk, *jntmp;
9633	uint8_t *inosused;
9634	uint8_t *blksfree;
9635	struct cg *cgp;
9636	struct fs *fs;
9637	ino_t ino;
9638	long bno;
9639	int chgs;
9640	int i;
9641
9642	if ((bmsafemap->sm_state & IOSTARTED) == 0)
9643		panic("handle_written_bmsafemap: Not started\n");
9644	chgs = 0;
9645	bmsafemap->sm_state &= ~IOSTARTED;
9646	/*
9647	 * Restore unwritten inode allocation pending jaddref writes.
9648	 */
9649	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
9650		cgp = (struct cg *)bp->b_data;
9651		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9652		inosused = cg_inosused(cgp);
9653		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
9654		    ja_bmdeps, jatmp) {
9655			if ((jaddref->ja_state & UNDONE) == 0)
9656				continue;
9657			ino = jaddref->ja_ino % fs->fs_ipg;
9658			if (isset(inosused, ino))
9659				panic("handle_written_bmsafemap: "
9660				    "re-allocated inode");
9661			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
9662				if ((jaddref->ja_mode & IFMT) == IFDIR)
9663					cgp->cg_cs.cs_ndir++;
9664				cgp->cg_cs.cs_nifree--;
9665				setbit(inosused, ino);
9666				chgs = 1;
9667			}
9668			jaddref->ja_state &= ~UNDONE;
9669			jaddref->ja_state |= ATTACHED;
9670			free_jaddref(jaddref);
9671		}
9672	}
9673	/*
9674	 * Restore any block allocations which are pending journal writes.
9675	 */
9676	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
9677		cgp = (struct cg *)bp->b_data;
9678		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9679		blksfree = cg_blksfree(cgp);
9680		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
9681		    jntmp) {
9682			if ((jnewblk->jn_state & UNDONE) == 0)
9683				continue;
9684			bno = dtogd(fs, jnewblk->jn_blkno);
9685			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
9686			    i++) {
9687				if (bp->b_xflags & BX_BKGRDMARKER)
9688					break;
9689				if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
9690				    isclr(blksfree, bno + i))
9691					panic("handle_written_bmsafemap: "
9692					    "re-allocated fragment");
9693				clrbit(blksfree, bno + i);
9694				chgs = 1;
9695			}
9696			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
9697			jnewblk->jn_state |= ATTACHED;
9698			free_jnewblk(jnewblk);
9699		}
9700	}
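	/*
	 * The bitmap block is now on disk.  Complete the dependencies
	 * that were moved to the written lists by
	 * initiate_write_bmsafemap().
	 */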
9701	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
9702		newblk->nb_state |= DEPCOMPLETE;
9703		newblk->nb_state &= ~ONDEPLIST;
9704		newblk->nb_bmsafemap = NULL;
9705		LIST_REMOVE(newblk, nb_deps);
9706		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
9707			handle_allocdirect_partdone(
9708			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
9709		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
9710			handle_allocindir_partdone(
9711			    WK_ALLOCINDIR(&newblk->nb_list));
9712		else if (newblk->nb_list.wk_type != D_NEWBLK)
9713			panic("handle_written_bmsafemap: Unexpected type: %s",
9714			    TYPENAME(newblk->nb_list.wk_type));
9715	}
9716	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
9717		inodedep->id_state |= DEPCOMPLETE;
9718		inodedep->id_state &= ~ONDEPLIST;
9719		LIST_REMOVE(inodedep, id_deps);
9720		inodedep->id_bmsafemap = NULL;
9721	}
9722	if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
9723	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
9724	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
9725	    LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
9726		if (chgs)
9727			bdirty(bp);
9728		LIST_REMOVE(bmsafemap, sm_hash);
9729		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
9730		return (0);
9731	}
9732	bdirty(bp);
9733	return (1);
9734}
9735
9736/*
9737 * Try to free a mkdir dependency.
9738 */
9739static void
9740complete_mkdir(mkdir)
9741	struct mkdir *mkdir;
9742{
9743	struct diradd *dap;
9744
9745	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
9746		return;
9747	LIST_REMOVE(mkdir, md_mkdirs);
9748	dap = mkdir->md_diradd;
9749	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
9750	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
9751		dap->da_state |= DEPCOMPLETE;
9752		complete_diradd(dap);
9753	}
9754	WORKITEM_FREE(mkdir, D_MKDIR);
9755}
9756
9757/*
9758 * Handle the completion of a mkdir dependency.
9759 */
9760static void
9761handle_written_mkdir(mkdir, type)
9762	struct mkdir *mkdir;
9763	int type;
9764{
9765
9766	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
9767		panic("handle_written_mkdir: bad type");
9768	mkdir->md_state |= COMPLETE;
9769	complete_mkdir(mkdir);
9770}
9771
9772static void
9773free_pagedep(pagedep)
9774	struct pagedep *pagedep;
9775{
9776	int i;
9777
9778	if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
9779		return;
9780	for (i = 0; i < DAHASHSZ; i++)
9781		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
9782			return;
9783	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
9784		return;
9785	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
9786		return;
9787	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
9788		return;
9789	LIST_REMOVE(pagedep, pd_hash);
9790	WORKITEM_FREE(pagedep, D_PAGEDEP);
9791}
9792
9793/*
9794 * Called from within softdep_disk_write_complete above.
9795 * A write operation was just completed. Removed inodes can
9796 * now be freed and associated block pointers may be committed.
9797 * Note that this routine is always called from interrupt level
9798 * with further splbio interrupts blocked.
9799 */
9800static int
9801handle_written_filepage(pagedep, bp)
9802	struct pagedep *pagedep;
9803	struct buf *bp;		/* buffer containing the written page */
9804{
9805	struct dirrem *dirrem;
9806	struct diradd *dap, *nextdap;
9807	struct direct *ep;
9808	int i, chgs;
9809
9810	if ((pagedep->pd_state & IOSTARTED) == 0)
9811		panic("handle_written_filepage: not started");
9812	pagedep->pd_state &= ~IOSTARTED;
9813	/*
9814	 * Process any directory removals that have been committed.
9815	 */
9816	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
9817		LIST_REMOVE(dirrem, dm_next);
9818		dirrem->dm_state |= COMPLETE;
9819		dirrem->dm_dirinum = pagedep->pd_ino;
9820		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9821		    ("handle_written_filepage: Journal entries not written."));
9822		add_to_worklist(&dirrem->dm_list, 0);
9823	}
9824	/*
9825	 * Free any directory additions that have been committed.
9826	 * If it is a newly allocated block, we have to wait until
9827	 * the on-disk directory inode claims the new block.
9828	 */
9829	if ((pagedep->pd_state & NEWBLOCK) == 0)
9830		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
9831			free_diradd(dap, NULL);
9832	/*
9833	 * Uncommitted directory entries must be restored.
9834	 */
9835	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
9836		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
9837		     dap = nextdap) {
9838			nextdap = LIST_NEXT(dap, da_pdlist);
9839			if (dap->da_state & ATTACHED)
9840				panic("handle_written_filepage: attached");
9841			ep = (struct direct *)
9842			    ((char *)bp->b_data + dap->da_offset);
9843			ep->d_ino = dap->da_newinum;
9844			dap->da_state &= ~UNDONE;
9845			dap->da_state |= ATTACHED;
9846			chgs = 1;
9847			/*
9848			 * If the inode referenced by the directory has
9849			 * been written out, then the dependency can be
9850			 * moved to the pending list.
9851			 */
9852			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
9853				LIST_REMOVE(dap, da_pdlist);
9854				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
9855				    da_pdlist);
9856			}
9857		}
9858	}
9859	/*
9860	 * If there were any rollbacks in the directory, then it must be
9861	 * marked dirty so that it will eventually get written back in
9862	 * its correct form.
9863	 */
9864	if (chgs) {
9865		if ((bp->b_flags & B_DELWRI) == 0)
9866			stat_dir_entry++;
9867		bdirty(bp);
9868		return (1);
9869	}
9870	/*
9871	 * If we are not waiting for a new directory block to be
9872	 * claimed by its inode, then the pagedep will be freed.
9873	 * Otherwise it will remain to track any new entries on
9874	 * the page in case they are fsync'ed.
9875	 */
9876	if ((pagedep->pd_state & NEWBLOCK) == 0 &&
9877	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
9878		LIST_REMOVE(pagedep, pd_hash);
9879		WORKITEM_FREE(pagedep, D_PAGEDEP);
9880	}
9881	return (0);
9882}
9883
9884/*
9885 * Writing back in-core inode structures.
9886 *
9887 * The filesystem only accesses an inode's contents when it occupies an
9888 * "in-core" inode structure.  These "in-core" structures are separate from
9889 * the page frames used to cache inode blocks.  Only the latter are
9890 * transferred to/from the disk.  So, when the updated contents of the
9891 * "in-core" inode structure are copied to the corresponding in-memory inode
9892 * block, the dependencies are also transferred.  The following procedure is
9893 * called when copying a dirty "in-core" inode to a cached inode block.
9894 */
9895
9896/*
9897 * Called when an inode is loaded from disk. If the effective link count
9898 * differed from the actual link count when it was last flushed, then we
9899 * need to ensure that the correct effective link count is put back.
9900 */
9901void
9902softdep_load_inodeblock(ip)
9903	struct inode *ip;	/* the "in_core" copy of the inode */
9904{
9905	struct inodedep *inodedep;
9906
9907	/*
9908	 * Check for alternate nlink count.
9909	 */
9910	ip->i_effnlink = ip->i_nlink;
9911	ACQUIRE_LOCK(&lk);
9912	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9913	    &inodedep) == 0) {
9914		FREE_LOCK(&lk);
9915		return;
9916	}
9917	ip->i_effnlink -= inodedep->id_nlinkdelta;
9918	if (inodedep->id_state & SPACECOUNTED)
9919		ip->i_flag |= IN_SPACECOUNTED;
9920	FREE_LOCK(&lk);
9921}
9922
9923/*
9924 * This routine is called just before the "in-core" inode
9925 * information is to be copied to the in-memory inode block.
9926 * Recall that an inode block contains several inodes. If
9927 * the force flag is set, then the dependencies will be
9928 * cleared so that the update can always be made. Note that
9929 * the buffer is locked when this routine is called, so we
9930 * will never be in the middle of writing the inode block
9931 * to disk.
9932 */
9933void
9934softdep_update_inodeblock(ip, bp, waitfor)
9935	struct inode *ip;	/* the "in_core" copy of the inode */
9936	struct buf *bp;		/* the buffer containing the inode block */
9937	int waitfor;		/* nonzero => update must be allowed */
9938{
9939	struct inodedep *inodedep;
9940	struct inoref *inoref;
9941	struct worklist *wk;
9942	struct mount *mp;
9943	struct buf *ibp;
9944	struct fs *fs;
9945	int error;
9946
9947	mp = UFSTOVFS(ip->i_ump);
9948	fs = ip->i_fs;
9949	/*
9950	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
9951	 * does not have access to the in-core ip so must write directly into
9952	 * the inode block buffer when setting freelink.
9953	 */
9954	if (fs->fs_magic == FS_UFS1_MAGIC)
9955		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
9956		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
9957	else
9958		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
9959		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
9960	/*
9961	 * If the effective link count is not equal to the actual link
9962	 * count, then we must track the difference in an inodedep while
9963	 * the inode is (potentially) tossed out of the cache. Otherwise,
9964	 * if there is no existing inodedep, then there are no dependencies
9965	 * to track.
9966	 */
9967	ACQUIRE_LOCK(&lk);
9968again:
9969	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
9970		FREE_LOCK(&lk);
9971		if (ip->i_effnlink != ip->i_nlink)
9972			panic("softdep_update_inodeblock: bad link count");
9973		return;
9974	}
9975	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
9976		panic("softdep_update_inodeblock: bad delta");
9977	/*
9978	 * If we're flushing all dependencies we must also move any waiting
9979	 * for journal writes onto the bufwait list prior to I/O.
9980	 */
9981	if (waitfor) {
9982		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
9983			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
9984			    == DEPCOMPLETE) {
9985				stat_jwait_inode++;
9986				jwait(&inoref->if_list);
9987				goto again;
9988			}
9989		}
9990	}
9991	/*
9992	 * Changes have been initiated. Anything depending on these
9993	 * changes cannot occur until this inode has been written.
9994	 */
9995	inodedep->id_state &= ~COMPLETE;
9996	if ((inodedep->id_state & ONWORKLIST) == 0)
9997		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
9998	/*
9999	 * Any new dependencies associated with the incore inode must
10000	 * now be moved to the list associated with the buffer holding
10001	 * the in-memory copy of the inode. Once merged, process any
10002	 * allocdirects that are completed by the merger.
10003	 */
10004	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
10005	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
10006		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
10007		    NULL);
10008	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
10009	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
10010		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
10011		    NULL);
10012	/*
10013	 * Now that the inode has been pushed into the buffer, the
10014	 * operations dependent on the inode being written to disk
10015	 * can be moved to the id_bufwait so that they will be
10016	 * processed when the buffer I/O completes.
10017	 */
10018	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
10019		WORKLIST_REMOVE(wk);
10020		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
10021	}
10022	/*
10023	 * Newly allocated inodes cannot be written until the bitmap
10024	 * that allocates them has been written (indicated by
10025	 * DEPCOMPLETE being set in id_state). If we are doing a
10026	 * forced sync (e.g., an fsync on a file), we force the bitmap
10027	 * to be written so that the update can be done.
10028	 */
10029	if (waitfor == 0) {
10030		FREE_LOCK(&lk);
10031		return;
10032	}
10033retry:
10034	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
10035		FREE_LOCK(&lk);
10036		return;
10037	}
10038	ibp = inodedep->id_bmsafemap->sm_buf;
10039	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
10040	if (ibp == NULL) {
10041		/*
10042		 * If ibp came back as NULL, the dependency could have been
10043		 * freed while we slept.  Look it up again, and check to see
10044		 * that it has completed.
10045		 */
10046		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
10047			goto retry;
10048		FREE_LOCK(&lk);
10049		return;
10050	}
10051	FREE_LOCK(&lk);
10052	if ((error = bwrite(ibp)) != 0)
10053		softdep_error("softdep_update_inodeblock: bwrite", error);
10054}
10055
10056/*
10057 * Merge a new inode dependency list (such as id_newinoupdt) into an
10058 * old inode dependency list (such as id_inoupdt). This routine must be
10059 * called with splbio interrupts blocked.
10060 */
10061static void
10062merge_inode_lists(newlisthead, oldlisthead)
10063	struct allocdirectlst *newlisthead;
10064	struct allocdirectlst *oldlisthead;
10065{
10066	struct allocdirect *listadp, *newadp;
10067
10068	newadp = TAILQ_FIRST(newlisthead);
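	/*
	 * Both lists are sorted by ad_offset.  Insert each new entry
	 * before the first old entry at an equal or greater offset and
	 * merge entries that describe the same offset.
	 */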
10069	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
10070		if (listadp->ad_offset < newadp->ad_offset) {
10071			listadp = TAILQ_NEXT(listadp, ad_next);
10072			continue;
10073		}
10074		TAILQ_REMOVE(newlisthead, newadp, ad_next);
10075		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
10076		if (listadp->ad_offset == newadp->ad_offset) {
10077			allocdirect_merge(oldlisthead, newadp,
10078			    listadp);
10079			listadp = newadp;
10080		}
10081		newadp = TAILQ_FIRST(newlisthead);
10082	}
10083	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
10084		TAILQ_REMOVE(newlisthead, newadp, ad_next);
10085		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
10086	}
10087}
10088
10089/*
10090 * If we are doing an fsync, then we must ensure that any directory
10091 * entries for the inode have been written after the inode gets to disk.
10092 */
10093int
10094softdep_fsync(vp)
10095	struct vnode *vp;	/* the "in_core" copy of the inode */
10096{
10097	struct inodedep *inodedep;
10098	struct pagedep *pagedep;
10099	struct inoref *inoref;
10100	struct worklist *wk;
10101	struct diradd *dap;
10102	struct mount *mp;
10103	struct vnode *pvp;
10104	struct inode *ip;
10105	struct buf *bp;
10106	struct fs *fs;
10107	struct thread *td = curthread;
10108	int error, flushparent, pagedep_new_block;
10109	ino_t parentino;
10110	ufs_lbn_t lbn;
10111
10112	ip = VTOI(vp);
10113	fs = ip->i_fs;
10114	mp = vp->v_mount;
10115	ACQUIRE_LOCK(&lk);
10116restart:
10117	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
10118		FREE_LOCK(&lk);
10119		return (0);
10120	}
10121	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10122		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10123		    == DEPCOMPLETE) {
10124			stat_jwait_inode++;
10125			jwait(&inoref->if_list);
10126			goto restart;
10127		}
10128	}
10129	if (!LIST_EMPTY(&inodedep->id_inowait) ||
10130	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
10131	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
10132	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
10133	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
10134		panic("softdep_fsync: pending ops %p", inodedep);
10135	for (error = 0, flushparent = 0; ; ) {
10136		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
10137			break;
10138		if (wk->wk_type != D_DIRADD)
10139			panic("softdep_fsync: Unexpected type %s",
10140			    TYPENAME(wk->wk_type));
10141		dap = WK_DIRADD(wk);
10142		/*
10143		 * Flush our parent if this directory entry has a MKDIR_PARENT
10144		 * dependency or is contained in a newly allocated block.
10145		 */
10146		if (dap->da_state & DIRCHG)
10147			pagedep = dap->da_previous->dm_pagedep;
10148		else
10149			pagedep = dap->da_pagedep;
10150		parentino = pagedep->pd_ino;
10151		lbn = pagedep->pd_lbn;
10152		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
10153			panic("softdep_fsync: dirty");
10154		if ((dap->da_state & MKDIR_PARENT) ||
10155		    (pagedep->pd_state & NEWBLOCK))
10156			flushparent = 1;
10157		else
10158			flushparent = 0;
10159		/*
10160		 * If we are being fsync'ed as part of vgone'ing this vnode,
10161		 * then we will not be able to release and recover the
10162		 * vnode below, so we just have to give up on writing its
10163		 * directory entry out. It will eventually be written, just
10164		 * not now, but then the user was not asking to have it
10165		 * written, so we are not breaking any promises.
10166		 */
10167		if (vp->v_iflag & VI_DOOMED)
10168			break;
10169		/*
10170		 * We prevent deadlock by always fetching inodes from the
10171		 * root, moving down the directory tree. Thus, when fetching
10172		 * our parent directory, we first try to get the lock. If
10173		 * that fails, we must unlock ourselves before requesting
10174		 * the lock on our parent. See the comment in ufs_lookup
10175		 * for details on possible races.
10176		 */
10177		FREE_LOCK(&lk);
10178		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
10179		    FFSV_FORCEINSMQ)) {
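			/*
			 * The non-blocking vget of the parent failed.
			 * Busy the mount so it cannot be unmounted,
			 * drop our own lock, and retry the parent with
			 * a blocking lock.
			 */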
10180			error = vfs_busy(mp, MBF_NOWAIT);
10181			if (error != 0) {
10182				vfs_ref(mp);
10183				VOP_UNLOCK(vp, 0);
10184				error = vfs_busy(mp, 0);
10185				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10186				vfs_rel(mp);
10187				if (error != 0)
10188					return (ENOENT);
10189				if (vp->v_iflag & VI_DOOMED) {
10190					vfs_unbusy(mp);
10191					return (ENOENT);
10192				}
10193			}
10194			VOP_UNLOCK(vp, 0);
10195			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
10196			    &pvp, FFSV_FORCEINSMQ);
10197			vfs_unbusy(mp);
10198			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10199			if (vp->v_iflag & VI_DOOMED) {
10200				if (error == 0)
10201					vput(pvp);
10202				error = ENOENT;
10203			}
10204			if (error != 0)
10205				return (error);
10206		}
10207		/*
10208		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
10209		 * that are contained in direct blocks will be resolved by
10210		 * doing a ffs_update. Pagedeps contained in indirect blocks
10211		 * may require a complete sync'ing of the directory. So, we
10212		 * try the cheap and fast ffs_update first, and if that fails,
10213		 * then we do the slower ffs_syncvnode of the directory.
10214		 */
10215		if (flushparent) {
10216			int locked;
10217
10218			if ((error = ffs_update(pvp, 1)) != 0) {
10219				vput(pvp);
10220				return (error);
10221			}
10222			ACQUIRE_LOCK(&lk);
10223			locked = 1;
10224			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
10225				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
10226					if (wk->wk_type != D_DIRADD)
10227						panic("softdep_fsync: Unexpected type %s",
10228						      TYPENAME(wk->wk_type));
10229					dap = WK_DIRADD(wk);
10230					if (dap->da_state & DIRCHG)
10231						pagedep = dap->da_previous->dm_pagedep;
10232					else
10233						pagedep = dap->da_pagedep;
10234					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
10235					FREE_LOCK(&lk);
10236					locked = 0;
10237					if (pagedep_new_block &&
10238					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
10239						vput(pvp);
10240						return (error);
10241					}
10242				}
10243			}
10244			if (locked)
10245				FREE_LOCK(&lk);
10246		}
10247		/*
10248		 * Flush directory page containing the inode's name.
10249		 */
10250		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
10251		    &bp);
10252		if (error == 0)
10253			error = bwrite(bp);
10254		else
10255			brelse(bp);
10256		vput(pvp);
10257		if (error != 0)
10258			return (error);
10259		ACQUIRE_LOCK(&lk);
10260		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
10261			break;
10262	}
10263	FREE_LOCK(&lk);
10264	return (0);
10265}
10266
10267/*
10268 * Flush all the dirty bitmaps associated with the block device
10269 * before flushing the rest of the dirty blocks so as to reduce
10270 * the number of dependencies that will have to be rolled back.
10271 */
10272void
10273softdep_fsync_mountdev(vp)
10274	struct vnode *vp;
10275{
10276	struct buf *bp, *nbp;
10277	struct worklist *wk;
10278	struct bufobj *bo;
10279
10280	if (!vn_isdisk(vp, NULL))
10281		panic("softdep_fsync_mountdev: vnode not a disk");
10282	bo = &vp->v_bufobj;
10283restart:
10284	BO_LOCK(bo);
10285	ACQUIRE_LOCK(&lk);
10286	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
10287		/*
10288		 * If it is already scheduled, skip to the next buffer.
10289		 */
10290		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
10291			continue;
10292
10293		if ((bp->b_flags & B_DELWRI) == 0)
10294			panic("softdep_fsync_mountdev: not dirty");
10295		/*
10296		 * We are only interested in bitmaps with outstanding
10297		 * dependencies.
10298		 */
10299		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
10300		    wk->wk_type != D_BMSAFEMAP ||
10301		    (bp->b_vflags & BV_BKGRDINPROG)) {
10302			BUF_UNLOCK(bp);
10303			continue;
10304		}
10305		FREE_LOCK(&lk);
10306		BO_UNLOCK(bo);
10307		bremfree(bp);
10308		(void) bawrite(bp);
10309		goto restart;
10310	}
10311	FREE_LOCK(&lk);
10312	drain_output(vp);
10313	BO_UNLOCK(bo);
10314}
10315
10316/*
10317 * This routine is called when we are trying to synchronously flush a
10318 * file. This routine must eliminate any filesystem metadata dependencies
10319 * so that the syncing routine can succeed by pushing the dirty blocks
10320 * associated with the file. If any I/O errors occur, they are returned.
10321 */
10322int
10323softdep_sync_metadata(struct vnode *vp)
10324{
10325	struct pagedep *pagedep;
10326	struct allocindir *aip;
10327	struct newblk *newblk;
10328	struct buf *bp, *nbp;
10329	struct worklist *wk;
10330	struct bufobj *bo;
10331	int i, error, waitfor;
10332
10333	if (!DOINGSOFTDEP(vp))
10334		return (0);
10335	/*
10336	 * Ensure that any direct block dependencies have been cleared.
10337	 */
10338	ACQUIRE_LOCK(&lk);
10339	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
10340		FREE_LOCK(&lk);
10341		return (error);
10342	}
10343	FREE_LOCK(&lk);
10344	/*
10345	 * For most files, the only metadata dependencies are the
10346	 * cylinder group maps that allocate their inode or blocks.
10347	 * The block allocation dependencies can be found by traversing
10348	 * the dependency lists for any buffers that remain on their
10349	 * dirty buffer list. The inode allocation dependency will
10350	 * be resolved when the inode is updated with MNT_WAIT.
10351	 * This work is done in two passes. The first pass grabs most
10352	 * of the buffers and begins asynchronously writing them. The
10353	 * only way to wait for these asynchronous writes is to sleep
10354	 * on the filesystem vnode which may stay busy for a long time
10355	 * if the filesystem is active. So, instead, we make a second
10356	 * pass over the dependencies blocking on each write. In the
10357	 * usual case we will be blocking against a write that we
10358	 * initiated, so when it is done the dependency will have been
10359	 * resolved. Thus the second pass is expected to end quickly.
10360	 */
10361	waitfor = MNT_NOWAIT;
10362	bo = &vp->v_bufobj;
10363
10364top:
10365	/*
10366	 * We must wait for any I/O in progress to finish so that
10367	 * all potential buffers on the dirty list will be visible.
10368	 */
10369	BO_LOCK(bo);
10370	drain_output(vp);
10371	while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) {
10372		bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT);
10373		if (bp)
10374			break;
10375	}
10376	BO_UNLOCK(bo);
10377	if (bp == NULL)
10378		return (0);
10379loop:
10380	/* While syncing snapshots, we must allow recursive lookups */
10381	BUF_AREC(bp);
10382	ACQUIRE_LOCK(&lk);
10383	/*
10384	 * As we hold the buffer locked, none of its dependencies
10385	 * will disappear.
10386	 */
10387	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
10388		switch (wk->wk_type) {
10389
10390		case D_ALLOCDIRECT:
10391		case D_ALLOCINDIR:
10392			newblk = WK_NEWBLK(wk);
10393			if (newblk->nb_jnewblk != NULL) {
10394				stat_jwait_newblk++;
10395				jwait(&newblk->nb_jnewblk->jn_list);
10396				goto restart;
10397			}
10398			if (newblk->nb_state & DEPCOMPLETE)
10399				continue;
10400			nbp = newblk->nb_bmsafemap->sm_buf;
10401			nbp = getdirtybuf(nbp, &lk, waitfor);
10402			if (nbp == NULL)
10403				continue;
10404			FREE_LOCK(&lk);
10405			if (waitfor == MNT_NOWAIT) {
10406				bawrite(nbp);
10407			} else if ((error = bwrite(nbp)) != 0) {
10408				break;
10409			}
10410			ACQUIRE_LOCK(&lk);
10411			continue;
10412
10413		case D_INDIRDEP:
10414		restart:
10415
10416			LIST_FOREACH(aip,
10417			    &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
10418				newblk = (struct newblk *)aip;
10419				if (newblk->nb_jnewblk != NULL) {
10420					stat_jwait_newblk++;
10421					jwait(&newblk->nb_jnewblk->jn_list);
10422					goto restart;
10423				}
10424				if (newblk->nb_state & DEPCOMPLETE)
10425					continue;
10426				nbp = newblk->nb_bmsafemap->sm_buf;
10427				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
10428				if (nbp == NULL)
10429					goto restart;
10430				FREE_LOCK(&lk);
10431				if ((error = bwrite(nbp)) != 0) {
10432					goto loop_end;
10433				}
10434				ACQUIRE_LOCK(&lk);
10435				goto restart;
10436			}
10437			continue;
10438
10439		case D_PAGEDEP:
10440			/*
10441			 * We are trying to sync a directory that may
10442			 * have dependencies on both its own metadata
10443			 * and/or dependencies on the inodes of any
10444			 * recently allocated files. We walk its diradd
10445			 * lists pushing out the associated inode.
10446			 */
10447			pagedep = WK_PAGEDEP(wk);
10448			for (i = 0; i < DAHASHSZ; i++) {
10449				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
10450				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL)
10451				if ((error =
10452				    flush_pagedep_deps(vp, wk->wk_mp,
10453						&pagedep->pd_diraddhd[i]))) {
10454					FREE_LOCK(&lk);
10455					goto loop_end;
10456				}
10457			}
10458			continue;
10459
10460		default:
10461			panic("softdep_sync_metadata: Unknown type %s",
10462			    TYPENAME(wk->wk_type));
10463			/* NOTREACHED */
10464		}
10465	loop_end:
10466		/* We reach here only on error, with lk already released. */
10467		if (error == 0)
10468			panic("softdep_sync_metadata: zero error");
10469		BUF_NOREC(bp);
10470		bawrite(bp);
10471		return (error);
10472	}
10473	FREE_LOCK(&lk);
10474	BO_LOCK(bo);
10475	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
10476		nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT);
10477		if (nbp)
10478			break;
10479	}
10480	BO_UNLOCK(bo);
10481	BUF_NOREC(bp);
10482	bawrite(bp);
10483	if (nbp != NULL) {
10484		bp = nbp;
10485		goto loop;
10486	}
10487	/*
10488	 * The brief unlock is to allow any pent up dependency
10489	 * processing to be done. Then proceed with the second pass.
10490	 */
10491	if (waitfor == MNT_NOWAIT) {
10492		waitfor = MNT_WAIT;
10493		goto top;
10494	}
10495
10496	/*
10497	 * If we have managed to get rid of all the dirty buffers,
10498	 * then we are done. For certain directories and block
10499	 * devices, we may need to do further work.
10500	 *
10501	 * We must wait for any I/O in progress to finish so that
10502	 * all potential buffers on the dirty list will be visible.
10503	 */
10504	BO_LOCK(bo);
10505	drain_output(vp);
10506	BO_UNLOCK(bo);
10507	return (ffs_update(vp, 1));
10508	/* return (0); */
10509}
10510
10511/*
10512 * Flush the dependencies associated with an inodedep.
10513 * Called with splbio blocked.
10514 */
10515static int
10516flush_inodedep_deps(mp, ino)
10517	struct mount *mp;
10518	ino_t ino;
10519{
10520	struct inodedep *inodedep;
10521	struct inoref *inoref;
10522	int error, waitfor;
10523
10524	/*
10525	 * This work is done in two passes. The first pass grabs most
10526	 * of the buffers and begins asynchronously writing them. The
10527	 * only way to wait for these asynchronous writes is to sleep
10528	 * on the filesystem vnode which may stay busy for a long time
10529	 * if the filesystem is active. So, instead, we make a second
10530	 * pass over the dependencies blocking on each write. In the
10531	 * usual case we will be blocking against a write that we
10532	 * initiated, so when it is done the dependency will have been
10533	 * resolved. Thus the second pass is expected to end quickly.
10534	 * We give a brief window at the top of the loop to allow
10535	 * any pending I/O to complete.
10536	 */
10537	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
10538		if (error)
10539			return (error);
10540		FREE_LOCK(&lk);
10541		ACQUIRE_LOCK(&lk);
10542restart:
10543		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
10544			return (0);
10545		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10546			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10547			    == DEPCOMPLETE) {
10548				stat_jwait_inode++;
10549				jwait(&inoref->if_list);
10550				goto restart;
10551			}
10552		}
10553		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
10554		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
10555		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
10556		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
10557			continue;
10558		/*
10559		 * If this was the second pass, we are done; otherwise, do pass 2.
10560		 */
10561		if (waitfor == MNT_WAIT)
10562			break;
10563		waitfor = MNT_WAIT;
10564	}
10565	/*
10566	 * Try freeing inodedep in case all dependencies have been removed.
10567	 */
10568	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
10569		(void) free_inodedep(inodedep);
10570	return (0);
10571}
10572
10573/*
10574 * Flush an inode dependency list.
10575 * Called with splbio blocked.
10576 */
10577static int
10578flush_deplist(listhead, waitfor, errorp)
10579	struct allocdirectlst *listhead;
10580	int waitfor;
10581	int *errorp;
10582{
10583	struct allocdirect *adp;
10584	struct newblk *newblk;
10585	struct buf *bp;
10586
10587	mtx_assert(&lk, MA_OWNED);
10588	TAILQ_FOREACH(adp, listhead, ad_next) {
10589		newblk = (struct newblk *)adp;
10590		if (newblk->nb_jnewblk != NULL) {
10591			stat_jwait_newblk++;
10592			jwait(&newblk->nb_jnewblk->jn_list);
10593			return (1);
10594		}
10595		if (newblk->nb_state & DEPCOMPLETE)
10596			continue;
10597		bp = newblk->nb_bmsafemap->sm_buf;
10598		bp = getdirtybuf(bp, &lk, waitfor);
10599		if (bp == NULL) {
10600			if (waitfor == MNT_NOWAIT)
10601				continue;
10602			return (1);
10603		}
10604		FREE_LOCK(&lk);
10605		if (waitfor == MNT_NOWAIT) {
10606			bawrite(bp);
10607		} else if ((*errorp = bwrite(bp)) != 0) {
10608			ACQUIRE_LOCK(&lk);
10609			return (1);
10610		}
10611		ACQUIRE_LOCK(&lk);
10612		return (1);
10613	}
10614	return (0);
10615}
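/*
 * Note that a non-zero return from flush_deplist() means that ``lk'' may
 * have been dropped and reacquired while waiting or writing; callers such
 * as flush_inodedep_deps() above therefore re-lookup their inodedep and
 * rescan the lists rather than continue from possibly stale pointers.
 */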
10616
10617/*
10618 * Flush dependencies associated with an allocdirect block.
10619 */
10620static int
10621flush_newblk_dep(vp, mp, lbn)
10622	struct vnode *vp;
10623	struct mount *mp;
10624	ufs_lbn_t lbn;
10625{
10626	struct newblk *newblk;
10627	struct bufobj *bo;
10628	struct inode *ip;
10629	struct buf *bp;
10630	ufs2_daddr_t blkno;
10631	int error;
10632
10633	error = 0;
10634	bo = &vp->v_bufobj;
10635	ip = VTOI(vp);
10636	blkno = DIP(ip, i_db[lbn]);
10637	if (blkno == 0)
10638		panic("flush_newblk_dep: Missing block");
10639	ACQUIRE_LOCK(&lk);
10640	/*
10641	 * Loop until all dependencies related to this block are satisfied.
10642	 * We must be careful to restart after each sleep in case a write
10643	 * completes some part of this process for us.
10644	 */
10645	for (;;) {
10646		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
10647			FREE_LOCK(&lk);
10648			break;
10649		}
10650		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
10651			panic("flush_newblk_dep: Bad newblk %p", newblk);
10652		/*
10653		 * Flush the journal.
10654		 */
10655		if (newblk->nb_jnewblk != NULL) {
10656			stat_jwait_newblk++;
10657			jwait(&newblk->nb_jnewblk->jn_list);
10658			continue;
10659		}
10660		/*
10661		 * Write the bitmap dependency.
10662		 */
10663		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
10664			bp = newblk->nb_bmsafemap->sm_buf;
10665			bp = getdirtybuf(bp, &lk, MNT_WAIT);
10666			if (bp == NULL)
10667				continue;
10668			FREE_LOCK(&lk);
10669			error = bwrite(bp);
10670			if (error)
10671				break;
10672			ACQUIRE_LOCK(&lk);
10673			continue;
10674		}
10675		/*
10676		 * Write the buffer.
10677		 */
10678		FREE_LOCK(&lk);
10679		BO_LOCK(bo);
10680		bp = gbincore(bo, lbn);
10681		if (bp != NULL) {
10682			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
10683			    LK_INTERLOCK, BO_MTX(bo));
10684			if (error == ENOLCK) {
10685				ACQUIRE_LOCK(&lk);
10686				continue; /* Slept, retry */
10687			}
10688			if (error != 0)
10689				break;	/* Failed */
10690			if (bp->b_flags & B_DELWRI) {
10691				bremfree(bp);
10692				error = bwrite(bp);
10693				if (error)
10694					break;
10695			} else
10696				BUF_UNLOCK(bp);
10697		} else
10698			BO_UNLOCK(bo);
10699		/*
10700		 * We have to wait for the direct pointers to
10701		 * point at the newdirblk before the dependency
10702		 * will go away.
10703		 */
10704		error = ffs_update(vp, MNT_WAIT);
10705		if (error)
10706			break;
10707		ACQUIRE_LOCK(&lk);
10708	}
10709	return (error);
10710}
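/*
 * An illustrative call site appears in flush_pagedep_deps() below: a newly
 * created directory must have the block holding its "." and ".." entries
 * (logical block 0) fully on disk before its name can be committed in the
 * parent, which is requested as:
 *
 *	error = flush_newblk_dep(vp, mp, 0);
 */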
10711
10712/*
10713 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
10714 * Called with splbio blocked.
10715 */
10716static int
10717flush_pagedep_deps(pvp, mp, diraddhdp)
10718	struct vnode *pvp;
10719	struct mount *mp;
10720	struct diraddhd *diraddhdp;
10721{
10722	struct inodedep *inodedep;
10723	struct inoref *inoref;
10724	struct ufsmount *ump;
10725	struct diradd *dap;
10726	struct vnode *vp;
10727	int error = 0;
10728	struct buf *bp;
10729	ino_t inum;
10730
10731	ump = VFSTOUFS(mp);
10732restart:
10733	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
10734		/*
10735		 * Flush ourselves if this directory entry
10736		 * has a MKDIR_PARENT dependency.
10737		 */
10738		if (dap->da_state & MKDIR_PARENT) {
10739			FREE_LOCK(&lk);
10740			if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
10741				break;
10742			ACQUIRE_LOCK(&lk);
10743			/*
10744			 * If that cleared dependencies, go on to next.
10745			 */
10746			if (dap != LIST_FIRST(diraddhdp))
10747				continue;
10748			if (dap->da_state & MKDIR_PARENT)
10749				panic("flush_pagedep_deps: MKDIR_PARENT");
10750		}
10751		/*
10752		 * A newly allocated directory must have its "." and
10753		 * ".." entries written out before its name can be
10754		 * committed in its parent.
10755		 */
10756		inum = dap->da_newinum;
10757		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
10758			panic("flush_pagedep_deps: lost inode1");
10759		/*
10760		 * Wait for any pending journal adds to complete so we don't
10761		 * cause rollbacks while syncing.
10762		 */
10763		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10764			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10765			    == DEPCOMPLETE) {
10766				stat_jwait_inode++;
10767				jwait(&inoref->if_list);
10768				goto restart;
10769			}
10770		}
10771		if (dap->da_state & MKDIR_BODY) {
10772			FREE_LOCK(&lk);
10773			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
10774			    FFSV_FORCEINSMQ)))
10775				break;
10776			error = flush_newblk_dep(vp, mp, 0);
10777			/*
10778			 * If we still have the dependency we might need to
10779			 * update the vnode to sync the new link count to
10780			 * disk.
10781			 */
10782			if (error == 0 && dap == LIST_FIRST(diraddhdp))
10783				error = ffs_update(vp, MNT_WAIT);
10784			vput(vp);
10785			if (error != 0)
10786				break;
10787			ACQUIRE_LOCK(&lk);
10788			/*
10789			 * If that cleared dependencies, go on to next.
10790			 */
10791			if (dap != LIST_FIRST(diraddhdp))
10792				continue;
10793			if (dap->da_state & MKDIR_BODY) {
10794				inodedep_lookup(UFSTOVFS(ump), inum, 0,
10795				    &inodedep);
10796				panic("flush_pagedep_deps: MKDIR_BODY "
10797				    "inodedep %p dap %p vp %p",
10798				    inodedep, dap, vp);
10799			}
10800		}
10801		/*
10802		 * Flush the inode on which the directory entry depends.
10803		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
10804		 * the only remaining dependency is that the updated inode
10805		 * count must get pushed to disk. The inode has already
10806		 * been pushed into its inode buffer (via VOP_UPDATE) at
10807		 * the time of the reference count change. So we need only
10808		 * locate that buffer, ensure that there will be no rollback
10809		 * caused by a bitmap dependency, then write the inode buffer.
10810		 */
10811retry:
10812		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
10813			panic("flush_pagedep_deps: lost inode");
10814		/*
10815		 * If the inode still has bitmap dependencies,
10816		 * push them to disk.
10817		 */
10818		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
10819			bp = inodedep->id_bmsafemap->sm_buf;
10820			bp = getdirtybuf(bp, &lk, MNT_WAIT);
10821			if (bp == NULL)
10822				goto retry;
10823			FREE_LOCK(&lk);
10824			if ((error = bwrite(bp)) != 0)
10825				break;
10826			ACQUIRE_LOCK(&lk);
10827			if (dap != LIST_FIRST(diraddhdp))
10828				continue;
10829		}
10830		/*
10831		 * If the inode is still sitting in a buffer waiting
10832		 * to be written or waiting for the link count to be
10833		 * adjusted, update it here to flush it to disk.
10834		 */
10835		if (dap == LIST_FIRST(diraddhdp)) {
10836			FREE_LOCK(&lk);
10837			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
10838			    FFSV_FORCEINSMQ)))
10839				break;
10840			error = ffs_update(vp, MNT_WAIT);
10841			vput(vp);
10842			if (error)
10843				break;
10844			ACQUIRE_LOCK(&lk);
10845		}
10846		/*
10847		 * If we have failed to get rid of all the dependencies
10848		 * then something is seriously wrong.
10849		 */
10850		if (dap == LIST_FIRST(diraddhdp)) {
10851			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
10852			panic("flush_pagedep_deps: failed to flush "
10853			    "inodedep %p ino %d dap %p", inodedep, inum, dap);
10854		}
10855	}
10856	if (error)
10857		ACQUIRE_LOCK(&lk);
10858	return (error);
10859}
10860
10861/*
10862 * A large burst of file addition or deletion activity can drive the
10863 * memory load excessively high. First attempt to slow things down
10864 * using the techniques below. If that fails, this routine requests
10865 * the offending operations to fall back to running synchronously
10866 * until the memory load returns to a reasonable level.
10867 */
10868int
10869softdep_slowdown(vp)
10870	struct vnode *vp;
10871{
10872	int max_softdeps_hard;
10873
10874	ACQUIRE_LOCK(&lk);
10875	max_softdeps_hard = max_softdeps * 11 / 10;
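	/*
	 * For example, with max_softdeps == 100000, the hard limit above is
	 * 110000 and the dirrem threshold below is 55000.
	 */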
10876	if (num_dirrem < max_softdeps_hard / 2 &&
10877	    num_inodedep < max_softdeps_hard &&
10878	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
10879	    num_freeblkdep < max_softdeps_hard) {
10880		FREE_LOCK(&lk);
10881		return (0);
10882	}
10883	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
10884		softdep_speedup();
10885	stat_sync_limit_hit += 1;
10886	FREE_LOCK(&lk);
10887	return (1);
10888}
10889
10890/*
10891 * Called by the allocation routines when they are about to fail
10892 * in the hope that we can free up some disk space.
10893 *
10894 * First check to see if the work list has anything on it. If it has,
10895 * clean up entries until we successfully free some space. Because this
10896 * process holds inodes locked, we cannot handle any remove requests
10897 * that might block on a locked inode as that could lead to deadlock.
10898 * If the worklist yields no free space, encourage the syncer daemon
10899 * to help us. In no event will we try for longer than tickdelay seconds.
10900 */
10901int
10902softdep_request_cleanup(fs, vp)
10903	struct fs *fs;
10904	struct vnode *vp;
10905{
10906	struct ufsmount *ump;
10907	long starttime;
10908	ufs2_daddr_t needed;
10909	int error;
10910
10911	ump = VTOI(vp)->i_ump;
10912	mtx_assert(UFS_MTX(ump), MA_OWNED);
10913	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
10914	starttime = time_second + tickdelay;
10915	/*
10916	 * If we are being called because of a process doing a
10917	 * copy-on-write, then it is not safe to update the vnode
10918	 * as we may recurse into the copy-on-write routine.
10919	 */
10920	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
10921		UFS_UNLOCK(ump);
10922		error = ffs_update(vp, 1);
10923		UFS_LOCK(ump);
10924		if (error != 0)
10925			return (0);
10926	}
10927	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
10928		if (time_second > starttime)
10929			return (0);
10930		UFS_UNLOCK(ump);
10931		ACQUIRE_LOCK(&lk);
10932		process_removes(vp);
10933		if (ump->softdep_on_worklist > 0 &&
10934		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
10935			stat_worklist_push += 1;
10936			FREE_LOCK(&lk);
10937			UFS_LOCK(ump);
10938			continue;
10939		}
10940		request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
10941		FREE_LOCK(&lk);
10942		UFS_LOCK(ump);
10943	}
10944	return (1);
10945}
10946
10947/*
10948 * If memory utilization has gotten too high, deliberately slow things
10949 * down and speed up the I/O processing.
10950 */
10951extern struct thread *syncertd;
10952static int
10953request_cleanup(mp, resource)
10954	struct mount *mp;
10955	int resource;
10956{
10957	struct thread *td = curthread;
10958	struct ufsmount *ump;
10959
10960	mtx_assert(&lk, MA_OWNED);
10961	/*
10962	 * We never hold up the filesystem syncer or buf daemon.
10963	 */
10964	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
10965		return (0);
10966	ump = VFSTOUFS(mp);
10967	/*
10968	 * First check to see if the work list has gotten backlogged.
10969	 * If it has, co-opt this process to help clean up two entries.
10970	 * Because this process may hold inodes locked, we cannot
10971	 * handle any remove requests that might block on a locked
10972	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
10973	 * to avoid recursively processing the worklist.
10974	 */
10975	if (ump->softdep_on_worklist > max_softdeps / 10) {
10976		td->td_pflags |= TDP_SOFTDEP;
10977		process_worklist_item(mp, LK_NOWAIT);
10978		process_worklist_item(mp, LK_NOWAIT);
10979		td->td_pflags &= ~TDP_SOFTDEP;
10980		stat_worklist_push += 2;
10981		return(1);
10982	}
10983	/*
10984	 * Next, we attempt to speed up the syncer process. If that
10985	 * is successful, then we allow the process to continue.
10986	 */
10987	if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
10988		return(0);
10989	/*
10990	 * If we are resource constrained on inode dependencies, try
10991	 * flushing some dirty inodes. Otherwise, we are constrained
10992	 * by file deletions, so try accelerating flushes of directories
10993	 * with removal dependencies. We would like to do the cleanup
10994	 * here, but we probably hold an inode locked at this point and
10995	 * that might deadlock against one that we try to clean. So,
10996	 * the best that we can do is request the syncer daemon to do
10997	 * the cleanup for us.
10998	 */
10999	switch (resource) {
11000
11001	case FLUSH_INODES:
11002		stat_ino_limit_push += 1;
11003		req_clear_inodedeps += 1;
11004		stat_countp = &stat_ino_limit_hit;
11005		break;
11006
11007	case FLUSH_REMOVE:
11008	case FLUSH_REMOVE_WAIT:
11009		stat_blk_limit_push += 1;
11010		req_clear_remove += 1;
11011		stat_countp = &stat_blk_limit_hit;
11012		break;
11013
11014	default:
11015		panic("request_cleanup: unknown type");
11016	}
11017	/*
11018	 * Hopefully the syncer daemon will catch up and awaken us.
11019	 * We wait at most tickdelay before proceeding in any case.
11020	 */
11021	proc_waiting += 1;
11022	if (callout_pending(&softdep_callout) == FALSE)
11023		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
11024		    pause_timer, 0);
11025
11026	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
11027	proc_waiting -= 1;
11028	return (1);
11029}
11030
11031/*
11032 * Awaken a process pausing in request_cleanup and, while processes
11033 * remain waiting, re-arm the timer.
11034 */
11035static void
11036pause_timer(arg)
11037	void *arg;
11038{
11039
11040	/*
11041	 * The callout_*() API has acquired the mutex and will hold it
11042	 * around this function call.
11043	 */
11044	*stat_countp += 1;
11045	wakeup_one(&proc_waiting);
11046	if (proc_waiting > 0)
11047		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
11048		    pause_timer, 0);
11049}
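/*
 * To summarize the handshake above (descriptive only): request_cleanup()
 * increments proc_waiting, arms softdep_callout if it is not already
 * pending, and msleep()s on &proc_waiting; pause_timer() then bumps the
 * appropriate limit counter, wakes one waiter, and re-arms the callout
 * for as long as waiters remain.
 */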
11050
11051/*
11052 * Flush out a directory with at least one removal dependency in an effort to
11053 * reduce the number of dirrem, freefile, and freeblks dependency structures.
11054 */
11055static void
11056clear_remove(td)
11057	struct thread *td;
11058{
11059	struct pagedep_hashhead *pagedephd;
11060	struct pagedep *pagedep;
11061	static int next = 0;
11062	struct mount *mp;
11063	struct vnode *vp;
11064	struct bufobj *bo;
11065	int error, cnt;
11066	ino_t ino;
11067
11068	mtx_assert(&lk, MA_OWNED);
11069
11070	for (cnt = 0; cnt < pagedep_hash; cnt++) {
11071		pagedephd = &pagedep_hashtbl[next++];
11072		if (next >= pagedep_hash)
11073			next = 0;
11074		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
11075			if (LIST_EMPTY(&pagedep->pd_dirremhd))
11076				continue;
11077			mp = pagedep->pd_list.wk_mp;
11078			ino = pagedep->pd_ino;
11079			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
11080				continue;
11081			FREE_LOCK(&lk);
11082
11083			/*
11084			 * Let unmount clear deps
11085			 */
11086			error = vfs_busy(mp, MBF_NOWAIT);
11087			if (error != 0)
11088				goto finish_write;
11089			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
11090			     FFSV_FORCEINSMQ);
11091			vfs_unbusy(mp);
11092			if (error != 0) {
11093				softdep_error("clear_remove: vget", error);
11094				goto finish_write;
11095			}
11096			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11097				softdep_error("clear_remove: fsync", error);
11098			bo = &vp->v_bufobj;
11099			BO_LOCK(bo);
11100			drain_output(vp);
11101			BO_UNLOCK(bo);
11102			vput(vp);
11103		finish_write:
11104			vn_finished_write(mp);
11105			ACQUIRE_LOCK(&lk);
11106			return;
11107		}
11108	}
11109}
11110
11111/*
11112 * Clear out a block of dirty inodes in an effort to reduce
11113 * the number of inodedep dependency structures.
11114 */
11115static void
11116clear_inodedeps(td)
11117	struct thread *td;
11118{
11119	struct inodedep_hashhead *inodedephd;
11120	struct inodedep *inodedep;
11121	static int next = 0;
11122	struct mount *mp;
11123	struct vnode *vp;
11124	struct fs *fs;
11125	int error, cnt;
11126	ino_t firstino, lastino, ino;
11127
11128	mtx_assert(&lk, MA_OWNED);
11129	/*
11130	 * Pick a random inode dependency to be cleared.
11131	 * We will then gather up all the inodes in its block
11132	 * that have dependencies and flush them out.
11133	 */
11134	for (cnt = 0; cnt < inodedep_hash; cnt++) {
11135		inodedephd = &inodedep_hashtbl[next++];
11136		if (next >= inodedep_hash)
11137			next = 0;
11138		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
11139			break;
11140	}
11141	if (inodedep == NULL)
11142		return;
11143	fs = inodedep->id_fs;
11144	mp = inodedep->id_list.wk_mp;
11145	/*
11146	 * Find the last inode in the block with dependencies.
11147	 */
11148	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
11149	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
11150		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
11151			break;
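	/*
	 * For example (an illustrative value): with INOPB(fs) == 128, an
	 * inodedep for inode 1000 gives firstino 896, and lastino is
	 * searched downward from 1023.
	 */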
11152	/*
11153	 * Asynchronously push all but the last inode with dependencies.
11154	 * Synchronously push the last inode with dependencies to ensure
11155	 * that the inode block gets written to free up the inodedeps.
11156	 */
11157	for (ino = firstino; ino <= lastino; ino++) {
11158		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
11159			continue;
11160		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
11161			continue;
11162		FREE_LOCK(&lk);
11163		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
11164		if (error != 0) {
11165			vn_finished_write(mp);
11166			ACQUIRE_LOCK(&lk);
11167			return;
11168		}
11169		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
11170		    FFSV_FORCEINSMQ)) != 0) {
11171			softdep_error("clear_inodedeps: vget", error);
11172			vfs_unbusy(mp);
11173			vn_finished_write(mp);
11174			ACQUIRE_LOCK(&lk);
11175			return;
11176		}
11177		vfs_unbusy(mp);
11178		if (ino == lastino) {
11179			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
11180				softdep_error("clear_inodedeps: fsync1", error);
11181		} else {
11182			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11183				softdep_error("clear_inodedeps: fsync2", error);
11184			BO_LOCK(&vp->v_bufobj);
11185			drain_output(vp);
11186			BO_UNLOCK(&vp->v_bufobj);
11187		}
11188		vput(vp);
11189		vn_finished_write(mp);
11190		ACQUIRE_LOCK(&lk);
11191	}
11192}
11193
11194/*
11195 * Function to determine if the buffer has outstanding dependencies
11196 * that will cause a roll-back if the buffer is written. If wantcount
11197 * is set, return number of dependencies, otherwise just yes or no.
11198 */
11199static int
11200softdep_count_dependencies(bp, wantcount)
11201	struct buf *bp;
11202	int wantcount;
11203{
11204	struct worklist *wk;
11205	struct bmsafemap *bmsafemap;
11206	struct inodedep *inodedep;
11207	struct indirdep *indirdep;
11208	struct freeblks *freeblks;
11209	struct allocindir *aip;
11210	struct pagedep *pagedep;
11211	struct dirrem *dirrem;
11212	struct newblk *newblk;
11213	struct mkdir *mkdir;
11214	struct diradd *dap;
11215	int i, retval;
11216
11217	retval = 0;
11218	ACQUIRE_LOCK(&lk);
11219	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11220		switch (wk->wk_type) {
11221
11222		case D_INODEDEP:
11223			inodedep = WK_INODEDEP(wk);
11224			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
11225				/* bitmap allocation dependency */
11226				retval += 1;
11227				if (!wantcount)
11228					goto out;
11229			}
11230			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
11231				/* direct block pointer dependency */
11232				retval += 1;
11233				if (!wantcount)
11234					goto out;
11235			}
11236			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
11237				/* direct block pointer dependency */
11238				retval += 1;
11239				if (!wantcount)
11240					goto out;
11241			}
11242			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
11243				/* Add reference dependency. */
11244				retval += 1;
11245				if (!wantcount)
11246					goto out;
11247			}
11248			continue;
11249
11250		case D_INDIRDEP:
11251			indirdep = WK_INDIRDEP(wk);
11252
11253			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
11254				/* indirect block pointer dependency */
11255				retval += 1;
11256				if (!wantcount)
11257					goto out;
11258			}
11259			continue;
11260
11261		case D_PAGEDEP:
11262			pagedep = WK_PAGEDEP(wk);
11263			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
11264				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
11265					/* Journal remove ref dependency. */
11266					retval += 1;
11267					if (!wantcount)
11268						goto out;
11269				}
11270			}
11271			for (i = 0; i < DAHASHSZ; i++) {
11272
11273				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
11274					/* directory entry dependency */
11275					retval += 1;
11276					if (!wantcount)
11277						goto out;
11278				}
11279			}
11280			continue;
11281
11282		case D_BMSAFEMAP:
11283			bmsafemap = WK_BMSAFEMAP(wk);
11284			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
11285				/* Add reference dependency. */
11286				retval += 1;
11287				if (!wantcount)
11288					goto out;
11289			}
11290			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
11291				/* Allocate block dependency. */
11292				retval += 1;
11293				if (!wantcount)
11294					goto out;
11295			}
11296			continue;
11297
11298		case D_FREEBLKS:
11299			freeblks = WK_FREEBLKS(wk);
11300			if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
11301				/* Freeblk journal dependency. */
11302				retval += 1;
11303				if (!wantcount)
11304					goto out;
11305			}
11306			continue;
11307
11308		case D_ALLOCDIRECT:
11309		case D_ALLOCINDIR:
11310			newblk = WK_NEWBLK(wk);
11311			if (newblk->nb_jnewblk) {
11312				/* Journal allocate dependency. */
11313				retval += 1;
11314				if (!wantcount)
11315					goto out;
11316			}
11317			continue;
11318
11319		case D_MKDIR:
11320			mkdir = WK_MKDIR(wk);
11321			if (mkdir->md_jaddref) {
11322				/* Journal reference dependency. */
11323				retval += 1;
11324				if (!wantcount)
11325					goto out;
11326			}
11327			continue;
11328
11329		case D_FREEWORK:
11330		case D_FREEDEP:
11331		case D_JSEGDEP:
11332		case D_JSEG:
11333		case D_SBDEP:
11334			/* never a dependency on these blocks */
11335			continue;
11336
11337		default:
11338			panic("softdep_count_dependencies: Unexpected type %s",
11339			    TYPENAME(wk->wk_type));
11340			/* NOTREACHED */
11341		}
11342	}
11343out:
11344	FREE_LOCK(&lk);
11345	return retval;
11346}
11347
11348/*
11349 * Acquire exclusive access to a buffer.
11350 * Must be called with a locked mtx parameter.
11351 * Return acquired buffer or NULL on failure.
11352 */
11353static struct buf *
11354getdirtybuf(bp, mtx, waitfor)
11355	struct buf *bp;
11356	struct mtx *mtx;
11357	int waitfor;
11358{
11359	int error;
11360
11361	mtx_assert(mtx, MA_OWNED);
11362	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
11363		if (waitfor != MNT_WAIT)
11364			return (NULL);
11365		error = BUF_LOCK(bp,
11366		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
11367		/*
11368		 * Even if we successfully acquire bp here, we have dropped
11369		 * mtx, which may violate our guarantee.
11370		 */
11371		if (error == 0)
11372			BUF_UNLOCK(bp);
11373		else if (error != ENOLCK)
11374			panic("getdirtybuf: inconsistent lock: %d", error);
11375		mtx_lock(mtx);
11376		return (NULL);
11377	}
11378	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11379		if (mtx == &lk && waitfor == MNT_WAIT) {
11380			mtx_unlock(mtx);
11381			BO_LOCK(bp->b_bufobj);
11382			BUF_UNLOCK(bp);
11383			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11384				bp->b_vflags |= BV_BKGRDWAIT;
11385				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
11386				       PRIBIO | PDROP, "getbuf", 0);
11387			} else
11388				BO_UNLOCK(bp->b_bufobj);
11389			mtx_lock(mtx);
11390			return (NULL);
11391		}
11392		BUF_UNLOCK(bp);
11393		if (waitfor != MNT_WAIT)
11394			return (NULL);
11395		/*
11396		 * The mtx argument must be bp->b_vp's mutex in
11397		 * this case.
11398		 */
11399#ifdef	DEBUG_VFS_LOCKS
11400		if (bp->b_vp->v_type != VCHR)
11401			ASSERT_BO_LOCKED(bp->b_bufobj);
11402#endif
11403		bp->b_vflags |= BV_BKGRDWAIT;
11404		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
11405		return (NULL);
11406	}
11407	if ((bp->b_flags & B_DELWRI) == 0) {
11408		BUF_UNLOCK(bp);
11409		return (NULL);
11410	}
11411	bremfree(bp);
11412	return (bp);
11413}
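/*
 * A typical calling pattern (illustrative, as used by the flushing routines
 * above): look the buffer up while holding ``lk'' and retry whenever
 * getdirtybuf() returns NULL, since the lock may have been dropped while
 * sleeping:
 *
 *	bp = inodedep->id_bmsafemap->sm_buf;
 *	bp = getdirtybuf(bp, &lk, MNT_WAIT);
 *	if (bp == NULL)
 *		goto retry;
 */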
11414
11415
11416/*
11417 * Check if it is safe to suspend the file system now.  On entry,
11418 * the vnode interlock for devvp should be held.  Return 0 with
11419 * the mount interlock held if the file system can be suspended now,
11420 * otherwise return EAGAIN with the mount interlock held.
11421 */
11422int
11423softdep_check_suspend(struct mount *mp,
11424		      struct vnode *devvp,
11425		      int softdep_deps,
11426		      int softdep_accdeps,
11427		      int secondary_writes,
11428		      int secondary_accwrites)
11429{
11430	struct bufobj *bo;
11431	struct ufsmount *ump;
11432	int error;
11433
11434	ump = VFSTOUFS(mp);
11435	bo = &devvp->v_bufobj;
11436	ASSERT_BO_LOCKED(bo);
11437
11438	for (;;) {
11439		if (!TRY_ACQUIRE_LOCK(&lk)) {
11440			BO_UNLOCK(bo);
11441			ACQUIRE_LOCK(&lk);
11442			FREE_LOCK(&lk);
11443			BO_LOCK(bo);
11444			continue;
11445		}
11446		MNT_ILOCK(mp);
11447		if (mp->mnt_secondary_writes != 0) {
11448			FREE_LOCK(&lk);
11449			BO_UNLOCK(bo);
11450			msleep(&mp->mnt_secondary_writes,
11451			       MNT_MTX(mp),
11452			       (PUSER - 1) | PDROP, "secwr", 0);
11453			BO_LOCK(bo);
11454			continue;
11455		}
11456		break;
11457	}
11458
11459	/*
11460	 * Reasons for needing more work before suspend:
11461	 * - Dirty buffers on devvp.
11462	 * - Softdep activity occurred after start of vnode sync loop
11463	 * - Secondary writes occurred after start of vnode sync loop
11464	 */
11465	error = 0;
11466	if (bo->bo_numoutput > 0 ||
11467	    bo->bo_dirty.bv_cnt > 0 ||
11468	    softdep_deps != 0 ||
11469	    ump->softdep_deps != 0 ||
11470	    softdep_accdeps != ump->softdep_accdeps ||
11471	    secondary_writes != 0 ||
11472	    mp->mnt_secondary_writes != 0 ||
11473	    secondary_accwrites != mp->mnt_secondary_accwrites)
11474		error = EAGAIN;
11475	FREE_LOCK(&lk);
11476	BO_UNLOCK(bo);
11477	return (error);
11478}
11479
11480
11481/*
11482 * Get the number of dependency structures for the file system, both
11483 * the current number and the total number allocated.  These will
11484 * later be used to detect that softdep processing has occurred.
11485 */
11486void
11487softdep_get_depcounts(struct mount *mp,
11488		      int *softdep_depsp,
11489		      int *softdep_accdepsp)
11490{
11491	struct ufsmount *ump;
11492
11493	ump = VFSTOUFS(mp);
11494	ACQUIRE_LOCK(&lk);
11495	*softdep_depsp = ump->softdep_deps;
11496	*softdep_accdepsp = ump->softdep_accdeps;
11497	FREE_LOCK(&lk);
11498}
11499
11500/*
11501 * Wait for pending output on a vnode to complete.
11502 * Must be called with vnode lock and interlock locked.
11503 *
11504 * XXX: Should just be a call to bufobj_wwait().
11505 */
11506static void
11507drain_output(vp)
11508	struct vnode *vp;
11509{
11510	struct bufobj *bo;
11511
11512	bo = &vp->v_bufobj;
11513	ASSERT_VOP_LOCKED(vp, "drain_output");
11514	ASSERT_BO_LOCKED(bo);
11515
11516	while (bo->bo_numoutput) {
11517		bo->bo_flag |= BO_WWAIT;
11518		msleep((caddr_t)&bo->bo_numoutput,
11519		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
11520	}
11521}
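/*
 * As the XXX above notes, this is roughly equivalent to the generic
 * primitive:
 *
 *	(void) bufobj_wwait(bo, 0, 0);
 */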
11522
11523/*
11524 * Called whenever a buffer that is being invalidated or reallocated
11525 * contains dependencies. This should only happen if an I/O error has
11526 * occurred. The routine is called with the buffer locked.
11527 */
11528static void
11529softdep_deallocate_dependencies(bp)
11530	struct buf *bp;
11531{
11532
11533	if ((bp->b_ioflags & BIO_ERROR) == 0)
11534		panic("softdep_deallocate_dependencies: dangling deps");
11535	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
11536	panic("softdep_deallocate_dependencies: unrecovered I/O error");
11537}
11538
11539/*
11540 * Function to handle asynchronous write errors in the filesystem.
11541 */
11542static void
11543softdep_error(func, error)
11544	char *func;
11545	int error;
11546{
11547
11548	/* XXX should do something better! */
11549	printf("%s: got error %d while accessing filesystem\n", func, error);
11550}
11551
11552#ifdef DDB
11553
11554static void
11555inodedep_print(struct inodedep *inodedep, int verbose)
11556{
11557	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
11558	    " saveino %p\n",
11559	    inodedep, inodedep->id_fs, inodedep->id_state,
11560	    (intmax_t)inodedep->id_ino,
11561	    (intmax_t)fsbtodb(inodedep->id_fs,
11562	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
11563	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
11564	    inodedep->id_savedino1);
11565
11566	if (verbose == 0)
11567		return;
11568
11569	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
11570	    "mkdiradd %p\n",
11571	    LIST_FIRST(&inodedep->id_pendinghd),
11572	    LIST_FIRST(&inodedep->id_bufwait),
11573	    LIST_FIRST(&inodedep->id_inowait),
11574	    TAILQ_FIRST(&inodedep->id_inoreflst),
11575	    inodedep->id_mkdiradd);
11576	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
11577	    TAILQ_FIRST(&inodedep->id_inoupdt),
11578	    TAILQ_FIRST(&inodedep->id_newinoupdt),
11579	    TAILQ_FIRST(&inodedep->id_extupdt),
11580	    TAILQ_FIRST(&inodedep->id_newextupdt));
11581}
11582
11583DB_SHOW_COMMAND(inodedep, db_show_inodedep)
11584{
11585
11586	if (have_addr == 0) {
11587		db_printf("Address required\n");
11588		return;
11589	}
11590	inodedep_print((struct inodedep*)addr, 1);
11591}
11592
11593DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
11594{
11595	struct inodedep_hashhead *inodedephd;
11596	struct inodedep *inodedep;
11597	struct fs *fs;
11598	int cnt;
11599
11600	fs = have_addr ? (struct fs *)addr : NULL;
11601	for (cnt = 0; cnt < inodedep_hash; cnt++) {
11602		inodedephd = &inodedep_hashtbl[cnt];
11603		LIST_FOREACH(inodedep, inodedephd, id_hash) {
11604			if (fs != NULL && fs != inodedep->id_fs)
11605				continue;
11606			inodedep_print(inodedep, 0);
11607		}
11608	}
11609}
11610
11611DB_SHOW_COMMAND(worklist, db_show_worklist)
11612{
11613	struct worklist *wk;
11614
11615	if (have_addr == 0) {
11616		db_printf("Address required\n");
11617		return;
11618	}
11619	wk = (struct worklist *)addr;
11620	printf("worklist: %p type %s state 0x%X\n",
11621	    wk, TYPENAME(wk->wk_type), wk->wk_state);
11622}
11623
11624DB_SHOW_COMMAND(workhead, db_show_workhead)
11625{
11626	struct workhead *wkhd;
11627	struct worklist *wk;
11628	int i;
11629
11630	if (have_addr == 0) {
11631		db_printf("Address required\n");
11632		return;
11633	}
11634	wkhd = (struct workhead *)addr;
11635	wk = LIST_FIRST(wkhd);
11636	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
11637		db_printf("worklist: %p type %s state 0x%X",
11638		    wk, TYPENAME(wk->wk_type), wk->wk_state);
11639	if (i == 100)
11640		db_printf("workhead overflow");
11641	printf("\n");
11642}
11643
11644
11645DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
11646{
11647	struct jaddref *jaddref;
11648	struct diradd *diradd;
11649	struct mkdir *mkdir;
11650
11651	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
11652		diradd = mkdir->md_diradd;
11653		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
11654		    mkdir, mkdir->md_state, diradd, diradd->da_state);
11655		if ((jaddref = mkdir->md_jaddref) != NULL)
11656			db_printf(" jaddref %p jaddref state 0x%X",
11657			    jaddref, jaddref->ja_state);
11658		db_printf("\n");
11659	}
11660}
11661
11662#endif /* DDB */
11663
11664#endif /* SOFTUPDATES */
11665