ffs_softdep.c revision 283600
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 283600 2015-05-27 09:20:42Z kib $");
44
45#include "opt_ffs.h"
46#include "opt_quota.h"
47#include "opt_ddb.h"
48
49/*
50 * For now we want the safety net that the DEBUG flag provides.
51 */
52#ifndef DEBUG
53#define DEBUG
54#endif
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/ktr.h>
64#include <sys/limits.h>
65#include <sys/lock.h>
66#include <sys/malloc.h>
67#include <sys/mount.h>
68#include <sys/mutex.h>
69#include <sys/namei.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/rwlock.h>
73#include <sys/stat.h>
74#include <sys/sysctl.h>
75#include <sys/syslog.h>
76#include <sys/vnode.h>
77#include <sys/conf.h>
78
79#include <ufs/ufs/dir.h>
80#include <ufs/ufs/extattr.h>
81#include <ufs/ufs/quota.h>
82#include <ufs/ufs/inode.h>
83#include <ufs/ufs/ufsmount.h>
84#include <ufs/ffs/fs.h>
85#include <ufs/ffs/softdep.h>
86#include <ufs/ffs/ffs_extern.h>
87#include <ufs/ufs/ufs_extern.h>
88
89#include <vm/vm.h>
90#include <vm/vm_extern.h>
91#include <vm/vm_object.h>
92
93#include <geom/geom.h>
94
95#include <ddb/ddb.h>
96
97#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
98
99#ifndef SOFTUPDATES
100
101int
102softdep_flushfiles(oldmnt, flags, td)
103	struct mount *oldmnt;
104	int flags;
105	struct thread *td;
106{
107
108	panic("softdep_flushfiles called");
109}
110
111int
112softdep_mount(devvp, mp, fs, cred)
113	struct vnode *devvp;
114	struct mount *mp;
115	struct fs *fs;
116	struct ucred *cred;
117{
118
119	return (0);
120}
121
122void
123softdep_initialize()
124{
125
126	return;
127}
128
129void
130softdep_uninitialize()
131{
132
133	return;
134}
135
136void
137softdep_unmount(mp)
138	struct mount *mp;
139{
140
141	panic("softdep_unmount called");
142}
143
144void
145softdep_setup_sbupdate(ump, fs, bp)
146	struct ufsmount *ump;
147	struct fs *fs;
148	struct buf *bp;
149{
150
151	panic("softdep_setup_sbupdate called");
152}
153
154void
155softdep_setup_inomapdep(bp, ip, newinum, mode)
156	struct buf *bp;
157	struct inode *ip;
158	ino_t newinum;
159	int mode;
160{
161
162	panic("softdep_setup_inomapdep called");
163}
164
165void
166softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
167	struct buf *bp;
168	struct mount *mp;
169	ufs2_daddr_t newblkno;
170	int frags;
171	int oldfrags;
172{
173
174	panic("softdep_setup_blkmapdep called");
175}
176
177void
178softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
179	struct inode *ip;
180	ufs_lbn_t lbn;
181	ufs2_daddr_t newblkno;
182	ufs2_daddr_t oldblkno;
183	long newsize;
184	long oldsize;
185	struct buf *bp;
186{
187
188	panic("softdep_setup_allocdirect called");
189}
190
191void
192softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
193	struct inode *ip;
194	ufs_lbn_t lbn;
195	ufs2_daddr_t newblkno;
196	ufs2_daddr_t oldblkno;
197	long newsize;
198	long oldsize;
199	struct buf *bp;
200{
201
202	panic("softdep_setup_allocext called");
203}
204
205void
206softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
207	struct inode *ip;
208	ufs_lbn_t lbn;
209	struct buf *bp;
210	int ptrno;
211	ufs2_daddr_t newblkno;
212	ufs2_daddr_t oldblkno;
213	struct buf *nbp;
214{
215
216	panic("softdep_setup_allocindir_page called");
217}
218
219void
220softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
221	struct buf *nbp;
222	struct inode *ip;
223	struct buf *bp;
224	int ptrno;
225	ufs2_daddr_t newblkno;
226{
227
228	panic("softdep_setup_allocindir_meta called");
229}
230
231void
232softdep_journal_freeblocks(ip, cred, length, flags)
233	struct inode *ip;
234	struct ucred *cred;
235	off_t length;
236	int flags;
237{
238
239	panic("softdep_journal_freeblocks called");
240}
241
242void
243softdep_journal_fsync(ip)
244	struct inode *ip;
245{
246
247	panic("softdep_journal_fsync called");
248}
249
250void
251softdep_setup_freeblocks(ip, length, flags)
252	struct inode *ip;
253	off_t length;
254	int flags;
255{
256
257	panic("softdep_setup_freeblocks called");
258}
259
260void
261softdep_freefile(pvp, ino, mode)
262		struct vnode *pvp;
263		ino_t ino;
264		int mode;
265{
266
267	panic("softdep_freefile called");
268}
269
270int
271softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
272	struct buf *bp;
273	struct inode *dp;
274	off_t diroffset;
275	ino_t newinum;
276	struct buf *newdirbp;
277	int isnewblk;
278{
279
280	panic("softdep_setup_directory_add called");
281}
282
283void
284softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
285	struct buf *bp;
286	struct inode *dp;
287	caddr_t base;
288	caddr_t oldloc;
289	caddr_t newloc;
290	int entrysize;
291{
292
293	panic("softdep_change_directoryentry_offset called");
294}
295
296void
297softdep_setup_remove(bp, dp, ip, isrmdir)
298	struct buf *bp;
299	struct inode *dp;
300	struct inode *ip;
301	int isrmdir;
302{
303
304	panic("softdep_setup_remove called");
305}
306
307void
308softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
309	struct buf *bp;
310	struct inode *dp;
311	struct inode *ip;
312	ino_t newinum;
313	int isrmdir;
314{
315
316	panic("softdep_setup_directory_change called");
317}
318
319void
320softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
321	struct mount *mp;
322	struct buf *bp;
323	ufs2_daddr_t blkno;
324	int frags;
325	struct workhead *wkhd;
326{
327
328	panic("%s called", __FUNCTION__);
329}
330
331void
332softdep_setup_inofree(mp, bp, ino, wkhd)
333	struct mount *mp;
334	struct buf *bp;
335	ino_t ino;
336	struct workhead *wkhd;
337{
338
339	panic("%s called", __FUNCTION__);
340}
341
342void
343softdep_setup_unlink(dp, ip)
344	struct inode *dp;
345	struct inode *ip;
346{
347
348	panic("%s called", __FUNCTION__);
349}
350
351void
352softdep_setup_link(dp, ip)
353	struct inode *dp;
354	struct inode *ip;
355{
356
357	panic("%s called", __FUNCTION__);
358}
359
360void
361softdep_revert_link(dp, ip)
362	struct inode *dp;
363	struct inode *ip;
364{
365
366	panic("%s called", __FUNCTION__);
367}
368
369void
370softdep_setup_rmdir(dp, ip)
371	struct inode *dp;
372	struct inode *ip;
373{
374
375	panic("%s called", __FUNCTION__);
376}
377
378void
379softdep_revert_rmdir(dp, ip)
380	struct inode *dp;
381	struct inode *ip;
382{
383
384	panic("%s called", __FUNCTION__);
385}
386
387void
388softdep_setup_create(dp, ip)
389	struct inode *dp;
390	struct inode *ip;
391{
392
393	panic("%s called", __FUNCTION__);
394}
395
396void
397softdep_revert_create(dp, ip)
398	struct inode *dp;
399	struct inode *ip;
400{
401
402	panic("%s called", __FUNCTION__);
403}
404
405void
406softdep_setup_mkdir(dp, ip)
407	struct inode *dp;
408	struct inode *ip;
409{
410
411	panic("%s called", __FUNCTION__);
412}
413
414void
415softdep_revert_mkdir(dp, ip)
416	struct inode *dp;
417	struct inode *ip;
418{
419
420	panic("%s called", __FUNCTION__);
421}
422
423void
424softdep_setup_dotdot_link(dp, ip)
425	struct inode *dp;
426	struct inode *ip;
427{
428
429	panic("%s called", __FUNCTION__);
430}
431
432int
433softdep_prealloc(vp, waitok)
434	struct vnode *vp;
435	int waitok;
436{
437
438	panic("%s called", __FUNCTION__);
439}
440
441int
442softdep_journal_lookup(mp, vpp)
443	struct mount *mp;
444	struct vnode **vpp;
445{
446
447	return (ENOENT);
448}
449
450void
451softdep_change_linkcnt(ip)
452	struct inode *ip;
453{
454
455	panic("softdep_change_linkcnt called");
456}
457
458void
459softdep_load_inodeblock(ip)
460	struct inode *ip;
461{
462
463	panic("softdep_load_inodeblock called");
464}
465
466void
467softdep_update_inodeblock(ip, bp, waitfor)
468	struct inode *ip;
469	struct buf *bp;
470	int waitfor;
471{
472
473	panic("softdep_update_inodeblock called");
474}
475
476int
477softdep_fsync(vp)
478	struct vnode *vp;	/* the "in_core" copy of the inode */
479{
480
481	return (0);
482}
483
484void
485softdep_fsync_mountdev(vp)
486	struct vnode *vp;
487{
488
489	return;
490}
491
492int
493softdep_flushworklist(oldmnt, countp, td)
494	struct mount *oldmnt;
495	int *countp;
496	struct thread *td;
497{
498
499	*countp = 0;
500	return (0);
501}
502
503int
504softdep_sync_metadata(struct vnode *vp)
505{
506
507	panic("softdep_sync_metadata called");
508}
509
510int
511softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
512{
513
514	panic("softdep_sync_buf called");
515}
516
517int
518softdep_slowdown(vp)
519	struct vnode *vp;
520{
521
522	panic("softdep_slowdown called");
523}
524
525int
526softdep_request_cleanup(fs, vp, cred, resource)
527	struct fs *fs;
528	struct vnode *vp;
529	struct ucred *cred;
530	int resource;
531{
532
533	return (0);
534}
535
536int
537softdep_check_suspend(struct mount *mp,
538		      struct vnode *devvp,
539		      int softdep_depcnt,
540		      int softdep_accdepcnt,
541		      int secondary_writes,
542		      int secondary_accwrites)
543{
544	struct bufobj *bo;
545	int error;
546
547	(void) softdep_depcnt,
548	(void) softdep_accdepcnt;
549
550	bo = &devvp->v_bufobj;
551	ASSERT_BO_WLOCKED(bo);
552
553	MNT_ILOCK(mp);
554	while (mp->mnt_secondary_writes != 0) {
555		BO_UNLOCK(bo);
556		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
557		    (PUSER - 1) | PDROP, "secwr", 0);
558		BO_LOCK(bo);
559		MNT_ILOCK(mp);
560	}
561
562	/*
563	 * Reasons for needing more work before suspend:
564	 * - Dirty buffers on devvp.
565	 * - Secondary writes occurred after start of vnode sync loop
566	 */
567	error = 0;
568	if (bo->bo_numoutput > 0 ||
569	    bo->bo_dirty.bv_cnt > 0 ||
570	    secondary_writes != 0 ||
571	    mp->mnt_secondary_writes != 0 ||
572	    secondary_accwrites != mp->mnt_secondary_accwrites)
573		error = EAGAIN;
574	BO_UNLOCK(bo);
575	return (error);
576}
577
578void
579softdep_get_depcounts(struct mount *mp,
580		      int *softdepactivep,
581		      int *softdepactiveaccp)
582{
583	(void) mp;
584	*softdepactivep = 0;
585	*softdepactiveaccp = 0;
586}
587
588void
589softdep_buf_append(bp, wkhd)
590	struct buf *bp;
591	struct workhead *wkhd;
592{
593
594	panic("softdep_buf_appendwork called");
595}
596
597void
598softdep_inode_append(ip, cred, wkhd)
599	struct inode *ip;
600	struct ucred *cred;
601	struct workhead *wkhd;
602{
603
604	panic("softdep_inode_appendwork called");
605}
606
607void
608softdep_freework(wkhd)
609	struct workhead *wkhd;
610{
611
612	panic("softdep_freework called");
613}
614
615#else
616
617FEATURE(softupdates, "FFS soft-updates support");
618
619static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
620    "soft updates stats");
621static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
622    "total dependencies allocated");
623static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
624    "high use dependencies allocated");
625static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
626    "current dependencies allocated");
627static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
628    "current dependencies written");
629
630unsigned long dep_current[D_LAST + 1];
631unsigned long dep_highuse[D_LAST + 1];
632unsigned long dep_total[D_LAST + 1];
633unsigned long dep_write[D_LAST + 1];
634
635#define	SOFTDEP_TYPE(type, str, long)					\
636    static MALLOC_DEFINE(M_ ## type, #str, long);			\
637    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
638	&dep_total[D_ ## type], 0, "");					\
639    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
640	&dep_current[D_ ## type], 0, "");				\
641    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
642	&dep_highuse[D_ ## type], 0, "");				\
643    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
644	&dep_write[D_ ## type], 0, "");
645
646SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
647SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
648SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
649    "Block or frag allocated from cyl group map");
650SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
651SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
652SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
653SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
654SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
655SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
656SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
657SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
658SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
659SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
660SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
661SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
662SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
663SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
664SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
665SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
666SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
667SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
668SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
669SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
670SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
671SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
672SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
673SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
674
675static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
676
677static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
678static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
679static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
680
681#define M_SOFTDEP_FLAGS	(M_WAITOK)
682
683/*
684 * translate from workitem type to memory type
685 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
686 */
687static struct malloc_type *memtype[] = {
688	M_PAGEDEP,
689	M_INODEDEP,
690	M_BMSAFEMAP,
691	M_NEWBLK,
692	M_ALLOCDIRECT,
693	M_INDIRDEP,
694	M_ALLOCINDIR,
695	M_FREEFRAG,
696	M_FREEBLKS,
697	M_FREEFILE,
698	M_DIRADD,
699	M_MKDIR,
700	M_DIRREM,
701	M_NEWDIRBLK,
702	M_FREEWORK,
703	M_FREEDEP,
704	M_JADDREF,
705	M_JREMREF,
706	M_JMVREF,
707	M_JNEWBLK,
708	M_JFREEBLK,
709	M_JFREEFRAG,
710	M_JSEG,
711	M_JSEGDEP,
712	M_SBDEP,
713	M_JTRUNC,
714	M_JFSYNC,
715	M_SENTINEL
716};
717
718#define DtoM(type) (memtype[type])
719
720/*
721 * Names of malloc types.
722 */
723#define TYPENAME(type)  \
724	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
725/*
726 * End system adaptation definitions.
727 */
728
729#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
730#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
731
732/*
733 * Internal function prototypes.
734 */
735static	void check_clear_deps(struct mount *);
736static	void softdep_error(char *, int);
737static	int softdep_process_worklist(struct mount *, int);
738static	int softdep_waitidle(struct mount *, int);
739static	void drain_output(struct vnode *);
740static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
741static	int check_inodedep_free(struct inodedep *);
742static	void clear_remove(struct mount *);
743static	void clear_inodedeps(struct mount *);
744static	void unlinked_inodedep(struct mount *, struct inodedep *);
745static	void clear_unlinked_inodedep(struct inodedep *);
746static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
747static	int flush_pagedep_deps(struct vnode *, struct mount *,
748	    struct diraddhd *);
749static	int free_pagedep(struct pagedep *);
750static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
751static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
752static	int flush_deplist(struct allocdirectlst *, int, int *);
753static	int sync_cgs(struct mount *, int);
754static	int handle_written_filepage(struct pagedep *, struct buf *);
755static	int handle_written_sbdep(struct sbdep *, struct buf *);
756static	void initiate_write_sbdep(struct sbdep *);
757static	void diradd_inode_written(struct diradd *, struct inodedep *);
758static	int handle_written_indirdep(struct indirdep *, struct buf *,
759	    struct buf**);
760static	int handle_written_inodeblock(struct inodedep *, struct buf *);
761static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
762	    uint8_t *);
763static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
764static	void handle_written_jaddref(struct jaddref *);
765static	void handle_written_jremref(struct jremref *);
766static	void handle_written_jseg(struct jseg *, struct buf *);
767static	void handle_written_jnewblk(struct jnewblk *);
768static	void handle_written_jblkdep(struct jblkdep *);
769static	void handle_written_jfreefrag(struct jfreefrag *);
770static	void complete_jseg(struct jseg *);
771static	void complete_jsegs(struct jseg *);
772static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
773static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
774static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
775static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
776static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
777static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
778static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
779static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
780static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
781static	inline void inoref_write(struct inoref *, struct jseg *,
782	    struct jrefrec *);
783static	void handle_allocdirect_partdone(struct allocdirect *,
784	    struct workhead *);
785static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
786	    struct workhead *);
787static	void indirdep_complete(struct indirdep *);
788static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
789static	void indirblk_insert(struct freework *);
790static	void indirblk_remove(struct freework *);
791static	void handle_allocindir_partdone(struct allocindir *);
792static	void initiate_write_filepage(struct pagedep *, struct buf *);
793static	void initiate_write_indirdep(struct indirdep*, struct buf *);
794static	void handle_written_mkdir(struct mkdir *, int);
795static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
796	    uint8_t *);
797static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
798static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
799static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
800static	void handle_workitem_freefile(struct freefile *);
801static	int handle_workitem_remove(struct dirrem *, int);
802static	struct dirrem *newdirrem(struct buf *, struct inode *,
803	    struct inode *, int, struct dirrem **);
804static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
805	    struct buf *);
806static	void cancel_indirdep(struct indirdep *, struct buf *,
807	    struct freeblks *);
808static	void free_indirdep(struct indirdep *);
809static	void free_diradd(struct diradd *, struct workhead *);
810static	void merge_diradd(struct inodedep *, struct diradd *);
811static	void complete_diradd(struct diradd *);
812static	struct diradd *diradd_lookup(struct pagedep *, int);
813static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
814	    struct jremref *);
815static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
816	    struct jremref *);
817static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
818	    struct jremref *, struct jremref *);
819static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
820	    struct jremref *);
821static	void cancel_allocindir(struct allocindir *, struct buf *bp,
822	    struct freeblks *, int);
823static	int setup_trunc_indir(struct freeblks *, struct inode *,
824	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
825static	void complete_trunc_indir(struct freework *);
826static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
827	    int);
828static	void complete_mkdir(struct mkdir *);
829static	void free_newdirblk(struct newdirblk *);
830static	void free_jremref(struct jremref *);
831static	void free_jaddref(struct jaddref *);
832static	void free_jsegdep(struct jsegdep *);
833static	void free_jsegs(struct jblocks *);
834static	void rele_jseg(struct jseg *);
835static	void free_jseg(struct jseg *, struct jblocks *);
836static	void free_jnewblk(struct jnewblk *);
837static	void free_jblkdep(struct jblkdep *);
838static	void free_jfreefrag(struct jfreefrag *);
839static	void free_freedep(struct freedep *);
840static	void journal_jremref(struct dirrem *, struct jremref *,
841	    struct inodedep *);
842static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
843static	int cancel_jaddref(struct jaddref *, struct inodedep *,
844	    struct workhead *);
845static	void cancel_jfreefrag(struct jfreefrag *);
846static	inline void setup_freedirect(struct freeblks *, struct inode *,
847	    int, int);
848static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
849static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
850	    ufs_lbn_t, int);
851static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
852static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
853static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
854static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
855static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
856static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
857	    int, int);
858static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
859static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
860static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
861static	void newblk_freefrag(struct newblk*);
862static	void free_newblk(struct newblk *);
863static	void cancel_allocdirect(struct allocdirectlst *,
864	    struct allocdirect *, struct freeblks *);
865static	int check_inode_unwritten(struct inodedep *);
866static	int free_inodedep(struct inodedep *);
867static	void freework_freeblock(struct freework *);
868static	void freework_enqueue(struct freework *);
869static	int handle_workitem_freeblocks(struct freeblks *, int);
870static	int handle_complete_freeblocks(struct freeblks *, int);
871static	void handle_workitem_indirblk(struct freework *);
872static	void handle_written_freework(struct freework *);
873static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
874static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
875	    struct workhead *);
876static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
877	    struct inodedep *, struct allocindir *, ufs_lbn_t);
878static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
879	    ufs2_daddr_t, ufs_lbn_t);
880static	void handle_workitem_freefrag(struct freefrag *);
881static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
882	    ufs_lbn_t);
883static	void allocdirect_merge(struct allocdirectlst *,
884	    struct allocdirect *, struct allocdirect *);
885static	struct freefrag *allocindir_merge(struct allocindir *,
886	    struct allocindir *);
887static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
888	    struct bmsafemap **);
889static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
890	    int cg, struct bmsafemap *);
891static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
892	    struct newblk **);
893static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
894static	int inodedep_find(struct inodedep_hashhead *, ino_t,
895	    struct inodedep **);
896static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
897static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
898	    int, struct pagedep **);
899static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
900	    struct pagedep **);
901static	void pause_timer(void *);
902static	int request_cleanup(struct mount *, int);
903static	void schedule_cleanup(struct mount *);
904static void softdep_ast_cleanup_proc(void);
905static	int process_worklist_item(struct mount *, int, int);
906static	void process_removes(struct vnode *);
907static	void process_truncates(struct vnode *);
908static	void jwork_move(struct workhead *, struct workhead *);
909static	void jwork_insert(struct workhead *, struct jsegdep *);
910static	void add_to_worklist(struct worklist *, int);
911static	void wake_worklist(struct worklist *);
912static	void wait_worklist(struct worklist *, char *);
913static	void remove_from_worklist(struct worklist *);
914static	void softdep_flush(void *);
915static	void softdep_flushjournal(struct mount *);
916static	int softdep_speedup(struct ufsmount *);
917static	void worklist_speedup(struct mount *);
918static	int journal_mount(struct mount *, struct fs *, struct ucred *);
919static	void journal_unmount(struct ufsmount *);
920static	int journal_space(struct ufsmount *, int);
921static	void journal_suspend(struct ufsmount *);
922static	int journal_unsuspend(struct ufsmount *ump);
923static	void softdep_prelink(struct vnode *, struct vnode *);
924static	void add_to_journal(struct worklist *);
925static	void remove_from_journal(struct worklist *);
926static	bool softdep_excess_inodes(struct ufsmount *);
927static	bool softdep_excess_dirrem(struct ufsmount *);
928static	void softdep_process_journal(struct mount *, struct worklist *, int);
929static	struct jremref *newjremref(struct dirrem *, struct inode *,
930	    struct inode *ip, off_t, nlink_t);
931static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
932	    uint16_t);
933static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
934	    uint16_t);
935static	inline struct jsegdep *inoref_jseg(struct inoref *);
936static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
937static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
938	    ufs2_daddr_t, int);
939static	void adjust_newfreework(struct freeblks *, int);
940static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
941static	void move_newblock_dep(struct jaddref *, struct inodedep *);
942static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
943static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
944	    ufs2_daddr_t, long, ufs_lbn_t);
945static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
946	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
947static	int jwait(struct worklist *, int);
948static	struct inodedep *inodedep_lookup_ip(struct inode *);
949static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
950static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
951static	void handle_jwork(struct workhead *);
952static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
953	    struct mkdir **);
954static	struct jblocks *jblocks_create(void);
955static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
956static	void jblocks_free(struct jblocks *, struct mount *, int);
957static	void jblocks_destroy(struct jblocks *);
958static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
959
960/*
961 * Exported softdep operations.
962 */
963static	void softdep_disk_io_initiation(struct buf *);
964static	void softdep_disk_write_complete(struct buf *);
965static	void softdep_deallocate_dependencies(struct buf *);
966static	int softdep_count_dependencies(struct buf *bp, int);
967
968/*
969 * Global lock over all of soft updates.
970 */
971static struct mtx lk;
972MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
973
974#define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
975#define FREE_GBLLOCK(lk)	mtx_unlock(lk)
976#define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
977
978/*
979 * Per-filesystem soft-updates locking.
980 */
981#define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
982#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
983#define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
984#define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
985#define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
986				    RA_WLOCKED)
987
988#define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
989#define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
990
991/*
992 * Worklist queue management.
993 * These routines require that the lock be held.
994 */
995#ifndef /* NOT */ DEBUG
996#define WORKLIST_INSERT(head, item) do {	\
997	(item)->wk_state |= ONWORKLIST;		\
998	LIST_INSERT_HEAD(head, item, wk_list);	\
999} while (0)
1000#define WORKLIST_REMOVE(item) do {		\
1001	(item)->wk_state &= ~ONWORKLIST;	\
1002	LIST_REMOVE(item, wk_list);		\
1003} while (0)
1004#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1005#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1006
1007#else /* DEBUG */
1008static	void worklist_insert(struct workhead *, struct worklist *, int);
1009static	void worklist_remove(struct worklist *, int);
1010
1011#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1012#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1013#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1014#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1015
1016static void
1017worklist_insert(head, item, locked)
1018	struct workhead *head;
1019	struct worklist *item;
1020	int locked;
1021{
1022
1023	if (locked)
1024		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1025	if (item->wk_state & ONWORKLIST)
1026		panic("worklist_insert: %p %s(0x%X) already on list",
1027		    item, TYPENAME(item->wk_type), item->wk_state);
1028	item->wk_state |= ONWORKLIST;
1029	LIST_INSERT_HEAD(head, item, wk_list);
1030}
1031
1032static void
1033worklist_remove(item, locked)
1034	struct worklist *item;
1035	int locked;
1036{
1037
1038	if (locked)
1039		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1040	if ((item->wk_state & ONWORKLIST) == 0)
1041		panic("worklist_remove: %p %s(0x%X) not on list",
1042		    item, TYPENAME(item->wk_type), item->wk_state);
1043	item->wk_state &= ~ONWORKLIST;
1044	LIST_REMOVE(item, wk_list);
1045}
1046#endif /* DEBUG */
1047
1048/*
1049 * Merge two jsegdeps keeping only the oldest one as newer references
1050 * can't be discarded until after older references.
1051 */
1052static inline struct jsegdep *
1053jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1054{
1055	struct jsegdep *swp;
1056
1057	if (two == NULL)
1058		return (one);
1059
1060	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1061		swp = one;
1062		one = two;
1063		two = swp;
1064	}
1065	WORKLIST_REMOVE(&two->jd_list);
1066	free_jsegdep(two);
1067
1068	return (one);
1069}
1070
1071/*
1072 * If two freedeps are compatible free one to reduce list size.
1073 */
1074static inline struct freedep *
1075freedep_merge(struct freedep *one, struct freedep *two)
1076{
1077	if (two == NULL)
1078		return (one);
1079
1080	if (one->fd_freework == two->fd_freework) {
1081		WORKLIST_REMOVE(&two->fd_list);
1082		free_freedep(two);
1083	}
1084	return (one);
1085}
1086
1087/*
1088 * Move journal work from one list to another.  Duplicate freedeps and
1089 * jsegdeps are coalesced to keep the lists as small as possible.
1090 */
1091static void
1092jwork_move(dst, src)
1093	struct workhead *dst;
1094	struct workhead *src;
1095{
1096	struct freedep *freedep;
1097	struct jsegdep *jsegdep;
1098	struct worklist *wkn;
1099	struct worklist *wk;
1100
1101	KASSERT(dst != src,
1102	    ("jwork_move: dst == src"));
1103	freedep = NULL;
1104	jsegdep = NULL;
1105	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1106		if (wk->wk_type == D_JSEGDEP)
1107			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1108		else if (wk->wk_type == D_FREEDEP)
1109			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1110	}
1111
1112	while ((wk = LIST_FIRST(src)) != NULL) {
1113		WORKLIST_REMOVE(wk);
1114		WORKLIST_INSERT(dst, wk);
1115		if (wk->wk_type == D_JSEGDEP) {
1116			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1117			continue;
1118		}
1119		if (wk->wk_type == D_FREEDEP)
1120			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1121	}
1122}
1123
1124static void
1125jwork_insert(dst, jsegdep)
1126	struct workhead *dst;
1127	struct jsegdep *jsegdep;
1128{
1129	struct jsegdep *jsegdepn;
1130	struct worklist *wk;
1131
1132	LIST_FOREACH(wk, dst, wk_list)
1133		if (wk->wk_type == D_JSEGDEP)
1134			break;
1135	if (wk == NULL) {
1136		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1137		return;
1138	}
1139	jsegdepn = WK_JSEGDEP(wk);
1140	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1141		WORKLIST_REMOVE(wk);
1142		free_jsegdep(jsegdepn);
1143		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1144	} else
1145		free_jsegdep(jsegdep);
1146}
1147
1148/*
1149 * Routines for tracking and managing workitems.
1150 */
1151static	void workitem_free(struct worklist *, int);
1152static	void workitem_alloc(struct worklist *, int, struct mount *);
1153static	void workitem_reassign(struct worklist *, int);
1154
1155#define	WORKITEM_FREE(item, type) \
1156	workitem_free((struct worklist *)(item), (type))
1157#define	WORKITEM_REASSIGN(item, type) \
1158	workitem_reassign((struct worklist *)(item), (type))
1159
1160static void
1161workitem_free(item, type)
1162	struct worklist *item;
1163	int type;
1164{
1165	struct ufsmount *ump;
1166
1167#ifdef DEBUG
1168	if (item->wk_state & ONWORKLIST)
1169		panic("workitem_free: %s(0x%X) still on list",
1170		    TYPENAME(item->wk_type), item->wk_state);
1171	if (item->wk_type != type && type != D_NEWBLK)
1172		panic("workitem_free: type mismatch %s != %s",
1173		    TYPENAME(item->wk_type), TYPENAME(type));
1174#endif
1175	if (item->wk_state & IOWAITING)
1176		wakeup(item);
1177	ump = VFSTOUFS(item->wk_mp);
1178	LOCK_OWNED(ump);
1179	KASSERT(ump->softdep_deps > 0,
1180	    ("workitem_free: %s: softdep_deps going negative",
1181	    ump->um_fs->fs_fsmnt));
1182	if (--ump->softdep_deps == 0 && ump->softdep_req)
1183		wakeup(&ump->softdep_deps);
1184	KASSERT(dep_current[item->wk_type] > 0,
1185	    ("workitem_free: %s: dep_current[%s] going negative",
1186	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1187	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1188	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
1189	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1190	atomic_subtract_long(&dep_current[item->wk_type], 1);
1191	ump->softdep_curdeps[item->wk_type] -= 1;
1192	free(item, DtoM(type));
1193}
1194
1195static void
1196workitem_alloc(item, type, mp)
1197	struct worklist *item;
1198	int type;
1199	struct mount *mp;
1200{
1201	struct ufsmount *ump;
1202
1203	item->wk_type = type;
1204	item->wk_mp = mp;
1205	item->wk_state = 0;
1206
1207	ump = VFSTOUFS(mp);
1208	ACQUIRE_GBLLOCK(&lk);
1209	dep_current[type]++;
1210	if (dep_current[type] > dep_highuse[type])
1211		dep_highuse[type] = dep_current[type];
1212	dep_total[type]++;
1213	FREE_GBLLOCK(&lk);
1214	ACQUIRE_LOCK(ump);
1215	ump->softdep_curdeps[type] += 1;
1216	ump->softdep_deps++;
1217	ump->softdep_accdeps++;
1218	FREE_LOCK(ump);
1219}
1220
1221static void
1222workitem_reassign(item, newtype)
1223	struct worklist *item;
1224	int newtype;
1225{
1226	struct ufsmount *ump;
1227
1228	ump = VFSTOUFS(item->wk_mp);
1229	LOCK_OWNED(ump);
1230	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1231	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1232	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1233	ump->softdep_curdeps[item->wk_type] -= 1;
1234	ump->softdep_curdeps[newtype] += 1;
1235	KASSERT(dep_current[item->wk_type] > 0,
1236	    ("workitem_reassign: %s: dep_current[%s] going negative",
1237	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1238	ACQUIRE_GBLLOCK(&lk);
1239	dep_current[newtype]++;
1240	dep_current[item->wk_type]--;
1241	if (dep_current[newtype] > dep_highuse[newtype])
1242		dep_highuse[newtype] = dep_current[newtype];
1243	dep_total[newtype]++;
1244	FREE_GBLLOCK(&lk);
1245	item->wk_type = newtype;
1246}
1247
1248/*
1249 * Workitem queue management
1250 */
1251static int max_softdeps;	/* maximum number of structs before slowdown */
1252static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1253static int proc_waiting;	/* tracks whether we have a timeout posted */
1254static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1255static struct callout softdep_callout;
1256static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1257static int req_clear_remove;	/* syncer process flush some freeblks */
1258static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1259
1260/*
1261 * runtime statistics
1262 */
1263static int stat_flush_threads;	/* number of softdep flushing threads */
1264static int stat_worklist_push;	/* number of worklist cleanups */
1265static int stat_blk_limit_push;	/* number of times block limit neared */
1266static int stat_ino_limit_push;	/* number of times inode limit neared */
1267static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1268static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1269static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1270static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1271static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1272static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1273static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1274static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1275static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1276static int stat_journal_min;	/* Times hit journal min threshold */
1277static int stat_journal_low;	/* Times hit journal low threshold */
1278static int stat_journal_wait;	/* Times blocked in jwait(). */
1279static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1280static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1281static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1282static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1283static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1284static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1285static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1286static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1287static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1288static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1289
1290SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1291    &max_softdeps, 0, "");
1292SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1293    &tickdelay, 0, "");
1294SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1295    &stat_flush_threads, 0, "");
1296SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1297    &stat_worklist_push, 0,"");
1298SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1299    &stat_blk_limit_push, 0,"");
1300SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1301    &stat_ino_limit_push, 0,"");
1302SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1303    &stat_blk_limit_hit, 0, "");
1304SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1305    &stat_ino_limit_hit, 0, "");
1306SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1307    &stat_sync_limit_hit, 0, "");
1308SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1309    &stat_indir_blk_ptrs, 0, "");
1310SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1311    &stat_inode_bitmap, 0, "");
1312SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1313    &stat_direct_blk_ptrs, 0, "");
1314SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1315    &stat_dir_entry, 0, "");
1316SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1317    &stat_jaddref, 0, "");
1318SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1319    &stat_jnewblk, 0, "");
1320SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1321    &stat_journal_low, 0, "");
1322SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1323    &stat_journal_min, 0, "");
1324SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1325    &stat_journal_wait, 0, "");
1326SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1327    &stat_jwait_filepage, 0, "");
1328SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1329    &stat_jwait_freeblks, 0, "");
1330SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1331    &stat_jwait_inode, 0, "");
1332SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1333    &stat_jwait_newblk, 0, "");
1334SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1335    &stat_cleanup_blkrequests, 0, "");
1336SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1337    &stat_cleanup_inorequests, 0, "");
1338SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1339    &stat_cleanup_high_delay, 0, "");
1340SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1341    &stat_cleanup_retries, 0, "");
1342SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1343    &stat_cleanup_failures, 0, "");
1344SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1345    &softdep_flushcache, 0, "");
1346SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1347    &stat_emptyjblocks, 0, "");
1348
1349SYSCTL_DECL(_vfs_ffs);
1350
1351/* Whether to recompute the summary at mount time */
1352static int compute_summary_at_mount = 0;
1353SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1354	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1355static int print_threads = 0;
1356SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1357    &print_threads, 0, "Notify flusher thread start/stop");
1358
1359/* List of all filesystems mounted with soft updates */
1360static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1361
1362/*
1363 * This function cleans the worklist for a filesystem.
1364 * Each filesystem running with soft dependencies gets its own
1365 * thread to run in this function. The thread is started up in
1366 * softdep_mount and shutdown in softdep_unmount. They show up
1367 * as part of the kernel "bufdaemon" process whose process
1368 * entry is available in bufdaemonproc.
1369 */
1370static int searchfailed;
1371extern struct proc *bufdaemonproc;
1372static void
1373softdep_flush(addr)
1374	void *addr;
1375{
1376	struct mount *mp;
1377	struct thread *td;
1378	struct ufsmount *ump;
1379
1380	td = curthread;
1381	td->td_pflags |= TDP_NORUNNINGBUF;
1382	mp = (struct mount *)addr;
1383	ump = VFSTOUFS(mp);
1384	atomic_add_int(&stat_flush_threads, 1);
1385	ACQUIRE_LOCK(ump);
1386	ump->softdep_flags &= ~FLUSH_STARTING;
1387	wakeup(&ump->softdep_flushtd);
1388	FREE_LOCK(ump);
1389	if (print_threads) {
1390		if (stat_flush_threads == 1)
1391			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1392			    bufdaemonproc->p_pid);
1393		printf("Start thread %s\n", td->td_name);
1394	}
1395	for (;;) {
1396		while (softdep_process_worklist(mp, 0) > 0 ||
1397		    (MOUNTEDSUJ(mp) &&
1398		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1399			kthread_suspend_check();
1400		ACQUIRE_LOCK(ump);
1401		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1402			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1403			    "sdflush", hz / 2);
1404		ump->softdep_flags &= ~FLUSH_CLEANUP;
1405		/*
1406		 * Check to see if we are done and need to exit.
1407		 */
1408		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1409			FREE_LOCK(ump);
1410			continue;
1411		}
1412		ump->softdep_flags &= ~FLUSH_EXIT;
1413		FREE_LOCK(ump);
1414		wakeup(&ump->softdep_flags);
1415		if (print_threads)
1416			printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1417		atomic_subtract_int(&stat_flush_threads, 1);
1418		kthread_exit();
1419		panic("kthread_exit failed\n");
1420	}
1421}
1422
1423static void
1424worklist_speedup(mp)
1425	struct mount *mp;
1426{
1427	struct ufsmount *ump;
1428
1429	ump = VFSTOUFS(mp);
1430	LOCK_OWNED(ump);
1431	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1432		ump->softdep_flags |= FLUSH_CLEANUP;
1433	wakeup(&ump->softdep_flushtd);
1434}
1435
1436static int
1437softdep_speedup(ump)
1438	struct ufsmount *ump;
1439{
1440	struct ufsmount *altump;
1441	struct mount_softdeps *sdp;
1442
1443	LOCK_OWNED(ump);
1444	worklist_speedup(ump->um_mountp);
1445	bd_speedup();
1446	/*
1447	 * If we have global shortages, then we need other
1448	 * filesystems to help with the cleanup. Here we wakeup a
1449	 * flusher thread for a filesystem that is over its fair
1450	 * share of resources.
1451	 */
1452	if (req_clear_inodedeps || req_clear_remove) {
1453		ACQUIRE_GBLLOCK(&lk);
1454		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1455			if ((altump = sdp->sd_ump) == ump)
1456				continue;
1457			if (((req_clear_inodedeps &&
1458			    altump->softdep_curdeps[D_INODEDEP] >
1459			    max_softdeps / stat_flush_threads) ||
1460			    (req_clear_remove &&
1461			    altump->softdep_curdeps[D_DIRREM] >
1462			    (max_softdeps / 2) / stat_flush_threads)) &&
1463			    TRY_ACQUIRE_LOCK(altump))
1464				break;
1465		}
1466		if (sdp == NULL) {
1467			searchfailed++;
1468			FREE_GBLLOCK(&lk);
1469		} else {
1470			/*
1471			 * Move to the end of the list so we pick a
1472			 * different one on out next try.
1473			 */
1474			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1475			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1476			FREE_GBLLOCK(&lk);
1477			if ((altump->softdep_flags &
1478			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1479				altump->softdep_flags |= FLUSH_CLEANUP;
1480			altump->um_softdep->sd_cleanups++;
1481			wakeup(&altump->softdep_flushtd);
1482			FREE_LOCK(altump);
1483		}
1484	}
1485	return (speedup_syncer());
1486}
1487
1488/*
1489 * Add an item to the end of the work queue.
1490 * This routine requires that the lock be held.
1491 * This is the only routine that adds items to the list.
1492 * The following routine is the only one that removes items
1493 * and does so in order from first to last.
1494 */
1495
1496#define	WK_HEAD		0x0001	/* Add to HEAD. */
1497#define	WK_NODELAY	0x0002	/* Process immediately. */
1498
1499static void
1500add_to_worklist(wk, flags)
1501	struct worklist *wk;
1502	int flags;
1503{
1504	struct ufsmount *ump;
1505
1506	ump = VFSTOUFS(wk->wk_mp);
1507	LOCK_OWNED(ump);
1508	if (wk->wk_state & ONWORKLIST)
1509		panic("add_to_worklist: %s(0x%X) already on list",
1510		    TYPENAME(wk->wk_type), wk->wk_state);
1511	wk->wk_state |= ONWORKLIST;
1512	if (ump->softdep_on_worklist == 0) {
1513		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1514		ump->softdep_worklist_tail = wk;
1515	} else if (flags & WK_HEAD) {
1516		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1517	} else {
1518		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1519		ump->softdep_worklist_tail = wk;
1520	}
1521	ump->softdep_on_worklist += 1;
1522	if (flags & WK_NODELAY)
1523		worklist_speedup(wk->wk_mp);
1524}
1525
1526/*
1527 * Remove the item to be processed. If we are removing the last
1528 * item on the list, we need to recalculate the tail pointer.
1529 */
1530static void
1531remove_from_worklist(wk)
1532	struct worklist *wk;
1533{
1534	struct ufsmount *ump;
1535
1536	ump = VFSTOUFS(wk->wk_mp);
1537	WORKLIST_REMOVE(wk);
1538	if (ump->softdep_worklist_tail == wk)
1539		ump->softdep_worklist_tail =
1540		    (struct worklist *)wk->wk_list.le_prev;
1541	ump->softdep_on_worklist -= 1;
1542}
1543
1544static void
1545wake_worklist(wk)
1546	struct worklist *wk;
1547{
1548	if (wk->wk_state & IOWAITING) {
1549		wk->wk_state &= ~IOWAITING;
1550		wakeup(wk);
1551	}
1552}
1553
1554static void
1555wait_worklist(wk, wmesg)
1556	struct worklist *wk;
1557	char *wmesg;
1558{
1559	struct ufsmount *ump;
1560
1561	ump = VFSTOUFS(wk->wk_mp);
1562	wk->wk_state |= IOWAITING;
1563	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1564}
1565
1566/*
1567 * Process that runs once per second to handle items in the background queue.
1568 *
1569 * Note that we ensure that everything is done in the order in which they
1570 * appear in the queue. The code below depends on this property to ensure
1571 * that blocks of a file are freed before the inode itself is freed. This
1572 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1573 * until all the old ones have been purged from the dependency lists.
1574 */
1575static int
1576softdep_process_worklist(mp, full)
1577	struct mount *mp;
1578	int full;
1579{
1580	int cnt, matchcnt;
1581	struct ufsmount *ump;
1582	long starttime;
1583
1584	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1585	if (MOUNTEDSOFTDEP(mp) == 0)
1586		return (0);
1587	matchcnt = 0;
1588	ump = VFSTOUFS(mp);
1589	ACQUIRE_LOCK(ump);
1590	starttime = time_second;
1591	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1592	check_clear_deps(mp);
1593	while (ump->softdep_on_worklist > 0) {
1594		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1595			break;
1596		else
1597			matchcnt += cnt;
1598		check_clear_deps(mp);
1599		/*
1600		 * We do not generally want to stop for buffer space, but if
1601		 * we are really being a buffer hog, we will stop and wait.
1602		 */
1603		if (should_yield()) {
1604			FREE_LOCK(ump);
1605			kern_yield(PRI_USER);
1606			bwillwrite();
1607			ACQUIRE_LOCK(ump);
1608		}
1609		/*
1610		 * Never allow processing to run for more than one
1611		 * second. This gives the syncer thread the opportunity
1612		 * to pause if appropriate.
1613		 */
1614		if (!full && starttime != time_second)
1615			break;
1616	}
1617	if (full == 0)
1618		journal_unsuspend(ump);
1619	FREE_LOCK(ump);
1620	return (matchcnt);
1621}
1622
1623/*
1624 * Process all removes associated with a vnode if we are running out of
1625 * journal space.  Any other process which attempts to flush these will
1626 * be unable as we have the vnodes locked.
1627 */
1628static void
1629process_removes(vp)
1630	struct vnode *vp;
1631{
1632	struct inodedep *inodedep;
1633	struct dirrem *dirrem;
1634	struct ufsmount *ump;
1635	struct mount *mp;
1636	ino_t inum;
1637
1638	mp = vp->v_mount;
1639	ump = VFSTOUFS(mp);
1640	LOCK_OWNED(ump);
1641	inum = VTOI(vp)->i_number;
1642	for (;;) {
1643top:
1644		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1645			return;
1646		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1647			/*
1648			 * If another thread is trying to lock this vnode
1649			 * it will fail but we must wait for it to do so
1650			 * before we can proceed.
1651			 */
1652			if (dirrem->dm_state & INPROGRESS) {
1653				wait_worklist(&dirrem->dm_list, "pwrwait");
1654				goto top;
1655			}
1656			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1657			    (COMPLETE | ONWORKLIST))
1658				break;
1659		}
1660		if (dirrem == NULL)
1661			return;
1662		remove_from_worklist(&dirrem->dm_list);
1663		FREE_LOCK(ump);
1664		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1665			panic("process_removes: suspended filesystem");
1666		handle_workitem_remove(dirrem, 0);
1667		vn_finished_secondary_write(mp);
1668		ACQUIRE_LOCK(ump);
1669	}
1670}
1671
1672/*
1673 * Process all truncations associated with a vnode if we are running out
1674 * of journal space.  This is called when the vnode lock is already held
1675 * and no other process can clear the truncation.  This function returns
1676 * a value greater than zero if it did any work.
1677 */
1678static void
1679process_truncates(vp)
1680	struct vnode *vp;
1681{
1682	struct inodedep *inodedep;
1683	struct freeblks *freeblks;
1684	struct ufsmount *ump;
1685	struct mount *mp;
1686	ino_t inum;
1687	int cgwait;
1688
1689	mp = vp->v_mount;
1690	ump = VFSTOUFS(mp);
1691	LOCK_OWNED(ump);
1692	inum = VTOI(vp)->i_number;
1693	for (;;) {
1694		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1695			return;
1696		cgwait = 0;
1697		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1698			/* Journal entries not yet written.  */
1699			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1700				jwait(&LIST_FIRST(
1701				    &freeblks->fb_jblkdephd)->jb_list,
1702				    MNT_WAIT);
1703				break;
1704			}
1705			/* Another thread is executing this item. */
1706			if (freeblks->fb_state & INPROGRESS) {
1707				wait_worklist(&freeblks->fb_list, "ptrwait");
1708				break;
1709			}
1710			/* Freeblks is waiting on a inode write. */
1711			if ((freeblks->fb_state & COMPLETE) == 0) {
1712				FREE_LOCK(ump);
1713				ffs_update(vp, 1);
1714				ACQUIRE_LOCK(ump);
1715				break;
1716			}
1717			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1718			    (ALLCOMPLETE | ONWORKLIST)) {
1719				remove_from_worklist(&freeblks->fb_list);
1720				freeblks->fb_state |= INPROGRESS;
1721				FREE_LOCK(ump);
1722				if (vn_start_secondary_write(NULL, &mp,
1723				    V_NOWAIT))
1724					panic("process_truncates: "
1725					    "suspended filesystem");
1726				handle_workitem_freeblocks(freeblks, 0);
1727				vn_finished_secondary_write(mp);
1728				ACQUIRE_LOCK(ump);
1729				break;
1730			}
1731			if (freeblks->fb_cgwait)
1732				cgwait++;
1733		}
1734		if (cgwait) {
1735			FREE_LOCK(ump);
1736			sync_cgs(mp, MNT_WAIT);
1737			ffs_sync_snap(mp, MNT_WAIT);
1738			ACQUIRE_LOCK(ump);
1739			continue;
1740		}
1741		if (freeblks == NULL)
1742			break;
1743	}
1744	return;
1745}
1746
1747/*
1748 * Process one item on the worklist.
1749 */
1750static int
1751process_worklist_item(mp, target, flags)
1752	struct mount *mp;
1753	int target;
1754	int flags;
1755{
1756	struct worklist sentinel;
1757	struct worklist *wk;
1758	struct ufsmount *ump;
1759	int matchcnt;
1760	int error;
1761
1762	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1763	/*
1764	 * If we are being called because of a process doing a
1765	 * copy-on-write, then it is not safe to write as we may
1766	 * recurse into the copy-on-write routine.
1767	 */
1768	if (curthread->td_pflags & TDP_COWINPROGRESS)
1769		return (-1);
1770	PHOLD(curproc);	/* Don't let the stack go away. */
1771	ump = VFSTOUFS(mp);
1772	LOCK_OWNED(ump);
1773	matchcnt = 0;
1774	sentinel.wk_mp = NULL;
1775	sentinel.wk_type = D_SENTINEL;
1776	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1777	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1778	    wk = LIST_NEXT(&sentinel, wk_list)) {
1779		if (wk->wk_type == D_SENTINEL) {
1780			LIST_REMOVE(&sentinel, wk_list);
1781			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1782			continue;
1783		}
1784		if (wk->wk_state & INPROGRESS)
1785			panic("process_worklist_item: %p already in progress.",
1786			    wk);
1787		wk->wk_state |= INPROGRESS;
1788		remove_from_worklist(wk);
1789		FREE_LOCK(ump);
1790		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1791			panic("process_worklist_item: suspended filesystem");
1792		switch (wk->wk_type) {
1793		case D_DIRREM:
1794			/* removal of a directory entry */
1795			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1796			break;
1797
1798		case D_FREEBLKS:
1799			/* releasing blocks and/or fragments from a file */
1800			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1801			    flags);
1802			break;
1803
1804		case D_FREEFRAG:
1805			/* releasing a fragment when replaced as a file grows */
1806			handle_workitem_freefrag(WK_FREEFRAG(wk));
1807			error = 0;
1808			break;
1809
1810		case D_FREEFILE:
1811			/* releasing an inode when its link count drops to 0 */
1812			handle_workitem_freefile(WK_FREEFILE(wk));
1813			error = 0;
1814			break;
1815
1816		default:
1817			panic("%s_process_worklist: Unknown type %s",
1818			    "softdep", TYPENAME(wk->wk_type));
1819			/* NOTREACHED */
1820		}
1821		vn_finished_secondary_write(mp);
1822		ACQUIRE_LOCK(ump);
1823		if (error == 0) {
1824			if (++matchcnt == target)
1825				break;
1826			continue;
1827		}
1828		/*
1829		 * We have to retry the worklist item later.  Wake up any
1830		 * waiters who may be able to complete it immediately and
1831		 * add the item back to the head so we don't try to execute
1832		 * it again.
1833		 */
1834		wk->wk_state &= ~INPROGRESS;
1835		wake_worklist(wk);
1836		add_to_worklist(wk, WK_HEAD);
1837	}
1838	LIST_REMOVE(&sentinel, wk_list);
1839	/* Sentinal could've become the tail from remove_from_worklist. */
1840	if (ump->softdep_worklist_tail == &sentinel)
1841		ump->softdep_worklist_tail =
1842		    (struct worklist *)sentinel.wk_list.le_prev;
1843	PRELE(curproc);
1844	return (matchcnt);
1845}
1846
1847/*
1848 * Move dependencies from one buffer to another.
1849 */
1850int
1851softdep_move_dependencies(oldbp, newbp)
1852	struct buf *oldbp;
1853	struct buf *newbp;
1854{
1855	struct worklist *wk, *wktail;
1856	struct ufsmount *ump;
1857	int dirty;
1858
1859	if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1860		return (0);
1861	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1862	    ("softdep_move_dependencies called on non-softdep filesystem"));
1863	dirty = 0;
1864	wktail = NULL;
1865	ump = VFSTOUFS(wk->wk_mp);
1866	ACQUIRE_LOCK(ump);
1867	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1868		LIST_REMOVE(wk, wk_list);
1869		if (wk->wk_type == D_BMSAFEMAP &&
1870		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1871			dirty = 1;
1872		if (wktail == 0)
1873			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1874		else
1875			LIST_INSERT_AFTER(wktail, wk, wk_list);
1876		wktail = wk;
1877	}
1878	FREE_LOCK(ump);
1879
1880	return (dirty);
1881}
1882
1883/*
1884 * Purge the work list of all items associated with a particular mount point.
1885 */
1886int
1887softdep_flushworklist(oldmnt, countp, td)
1888	struct mount *oldmnt;
1889	int *countp;
1890	struct thread *td;
1891{
1892	struct vnode *devvp;
1893	struct ufsmount *ump;
1894	int count, error;
1895
1896	/*
1897	 * Alternately flush the block device associated with the mount
1898	 * point and process any dependencies that the flushing
1899	 * creates. We continue until no more worklist dependencies
1900	 * are found.
1901	 */
1902	*countp = 0;
1903	error = 0;
1904	ump = VFSTOUFS(oldmnt);
1905	devvp = ump->um_devvp;
1906	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1907		*countp += count;
1908		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1909		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1910		VOP_UNLOCK(devvp, 0);
1911		if (error != 0)
1912			break;
1913	}
1914	return (error);
1915}
1916
1917#define	SU_WAITIDLE_RETRIES	20
1918static int
1919softdep_waitidle(struct mount *mp, int flags __unused)
1920{
1921	struct ufsmount *ump;
1922	struct vnode *devvp;
1923	struct thread *td;
1924	int error, i;
1925
1926	ump = VFSTOUFS(mp);
1927	devvp = ump->um_devvp;
1928	td = curthread;
1929	error = 0;
1930	ACQUIRE_LOCK(ump);
1931	for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
1932		ump->softdep_req = 1;
1933		KASSERT((flags & FORCECLOSE) == 0 ||
1934		    ump->softdep_on_worklist == 0,
1935		    ("softdep_waitidle: work added after flush"));
1936		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
1937		    "softdeps", 10 * hz);
1938		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1939		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1940		VOP_UNLOCK(devvp, 0);
1941		if (error != 0)
1942			break;
1943		ACQUIRE_LOCK(ump);
1944	}
1945	ump->softdep_req = 0;
1946	if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
1947		error = EBUSY;
1948		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1949		    mp);
1950	}
1951	FREE_LOCK(ump);
1952	return (error);
1953}
1954
1955/*
1956 * Flush all vnodes and worklist items associated with a specified mount point.
1957 */
1958int
1959softdep_flushfiles(oldmnt, flags, td)
1960	struct mount *oldmnt;
1961	int flags;
1962	struct thread *td;
1963{
1964#ifdef QUOTA
1965	struct ufsmount *ump;
1966	int i;
1967#endif
1968	int error, early, depcount, loopcnt, retry_flush_count, retry;
1969	int morework;
1970
1971	KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1972	    ("softdep_flushfiles called on non-softdep filesystem"));
1973	loopcnt = 10;
1974	retry_flush_count = 3;
1975retry_flush:
1976	error = 0;
1977
1978	/*
1979	 * Alternately flush the vnodes associated with the mount
1980	 * point and process any dependencies that the flushing
1981	 * creates. In theory, this loop can happen at most twice,
1982	 * but we give it a few extra just to be sure.
1983	 */
1984	for (; loopcnt > 0; loopcnt--) {
1985		/*
1986		 * Do another flush in case any vnodes were brought in
1987		 * as part of the cleanup operations.
1988		 */
1989		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1990		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1991		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1992			break;
1993		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1994		    depcount == 0)
1995			break;
1996	}
1997	/*
1998	 * If we are unmounting then it is an error to fail. If we
1999	 * are simply trying to downgrade to read-only, then filesystem
2000	 * activity can keep us busy forever, so we just fail with EBUSY.
2001	 */
2002	if (loopcnt == 0) {
2003		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2004			panic("softdep_flushfiles: looping");
2005		error = EBUSY;
2006	}
2007	if (!error)
2008		error = softdep_waitidle(oldmnt, flags);
2009	if (!error) {
2010		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2011			retry = 0;
2012			MNT_ILOCK(oldmnt);
2013			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
2014			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
2015			morework = oldmnt->mnt_nvnodelistsize > 0;
2016#ifdef QUOTA
2017			ump = VFSTOUFS(oldmnt);
2018			UFS_LOCK(ump);
2019			for (i = 0; i < MAXQUOTAS; i++) {
2020				if (ump->um_quotas[i] != NULLVP)
2021					morework = 1;
2022			}
2023			UFS_UNLOCK(ump);
2024#endif
2025			if (morework) {
2026				if (--retry_flush_count > 0) {
2027					retry = 1;
2028					loopcnt = 3;
2029				} else
2030					error = EBUSY;
2031			}
2032			MNT_IUNLOCK(oldmnt);
2033			if (retry)
2034				goto retry_flush;
2035		}
2036	}
2037	return (error);
2038}
2039
2040/*
2041 * Structure hashing.
2042 *
2043 * There are four types of structures that can be looked up:
2044 *	1) pagedep structures identified by mount point, inode number,
2045 *	   and logical block.
2046 *	2) inodedep structures identified by mount point and inode number.
2047 *	3) newblk structures identified by mount point and
2048 *	   physical block number.
2049 *	4) bmsafemap structures identified by mount point and
2050 *	   cylinder group number.
2051 *
2052 * The "pagedep" and "inodedep" dependency structures are hashed
2053 * separately from the file blocks and inodes to which they correspond.
2054 * This separation helps when the in-memory copy of an inode or
2055 * file block must be replaced. It also obviates the need to access
2056 * an inode or file page when simply updating (or de-allocating)
2057 * dependency structures. Lookup of newblk structures is needed to
2058 * find newly allocated blocks when trying to associate them with
2059 * their allocdirect or allocindir structure.
2060 *
2061 * The lookup routines optionally create and hash a new instance when
2062 * an existing entry is not found. The bmsafemap lookup routine always
2063 * allocates a new structure if an existing one is not found.
2064 */
2065#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2066#define NODELAY		0x0002	/* cannot do background work */
2067
2068/*
2069 * Structures and routines associated with pagedep caching.
2070 */
2071#define	PAGEDEP_HASH(ump, inum, lbn) \
2072	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2073
2074static int
2075pagedep_find(pagedephd, ino, lbn, pagedeppp)
2076	struct pagedep_hashhead *pagedephd;
2077	ino_t ino;
2078	ufs_lbn_t lbn;
2079	struct pagedep **pagedeppp;
2080{
2081	struct pagedep *pagedep;
2082
2083	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2084		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2085			*pagedeppp = pagedep;
2086			return (1);
2087		}
2088	}
2089	*pagedeppp = NULL;
2090	return (0);
2091}
2092/*
2093 * Look up a pagedep. Return 1 if found, 0 otherwise.
2094 * If not found, allocate if DEPALLOC flag is passed.
2095 * Found or allocated entry is returned in pagedeppp.
2096 * This routine must be called with splbio interrupts blocked.
2097 */
2098static int
2099pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2100	struct mount *mp;
2101	struct buf *bp;
2102	ino_t ino;
2103	ufs_lbn_t lbn;
2104	int flags;
2105	struct pagedep **pagedeppp;
2106{
2107	struct pagedep *pagedep;
2108	struct pagedep_hashhead *pagedephd;
2109	struct worklist *wk;
2110	struct ufsmount *ump;
2111	int ret;
2112	int i;
2113
2114	ump = VFSTOUFS(mp);
2115	LOCK_OWNED(ump);
2116	if (bp) {
2117		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2118			if (wk->wk_type == D_PAGEDEP) {
2119				*pagedeppp = WK_PAGEDEP(wk);
2120				return (1);
2121			}
2122		}
2123	}
2124	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2125	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2126	if (ret) {
2127		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2128			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2129		return (1);
2130	}
2131	if ((flags & DEPALLOC) == 0)
2132		return (0);
2133	FREE_LOCK(ump);
2134	pagedep = malloc(sizeof(struct pagedep),
2135	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2136	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2137	ACQUIRE_LOCK(ump);
2138	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2139	if (*pagedeppp) {
2140		/*
2141		 * This should never happen since we only create pagedeps
2142		 * with the vnode lock held.  Could be an assert.
2143		 */
2144		WORKITEM_FREE(pagedep, D_PAGEDEP);
2145		return (ret);
2146	}
2147	pagedep->pd_ino = ino;
2148	pagedep->pd_lbn = lbn;
2149	LIST_INIT(&pagedep->pd_dirremhd);
2150	LIST_INIT(&pagedep->pd_pendinghd);
2151	for (i = 0; i < DAHASHSZ; i++)
2152		LIST_INIT(&pagedep->pd_diraddhd[i]);
2153	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2154	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2155	*pagedeppp = pagedep;
2156	return (0);
2157}
2158
2159/*
2160 * Structures and routines associated with inodedep caching.
2161 */
2162#define	INODEDEP_HASH(ump, inum) \
2163      (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2164
2165static int
2166inodedep_find(inodedephd, inum, inodedeppp)
2167	struct inodedep_hashhead *inodedephd;
2168	ino_t inum;
2169	struct inodedep **inodedeppp;
2170{
2171	struct inodedep *inodedep;
2172
2173	LIST_FOREACH(inodedep, inodedephd, id_hash)
2174		if (inum == inodedep->id_ino)
2175			break;
2176	if (inodedep) {
2177		*inodedeppp = inodedep;
2178		return (1);
2179	}
2180	*inodedeppp = NULL;
2181
2182	return (0);
2183}
2184/*
2185 * Look up an inodedep. Return 1 if found, 0 if not found.
2186 * If not found, allocate if DEPALLOC flag is passed.
2187 * Found or allocated entry is returned in inodedeppp.
2188 * This routine must be called with splbio interrupts blocked.
2189 */
2190static int
2191inodedep_lookup(mp, inum, flags, inodedeppp)
2192	struct mount *mp;
2193	ino_t inum;
2194	int flags;
2195	struct inodedep **inodedeppp;
2196{
2197	struct inodedep *inodedep;
2198	struct inodedep_hashhead *inodedephd;
2199	struct ufsmount *ump;
2200	struct fs *fs;
2201
2202	ump = VFSTOUFS(mp);
2203	LOCK_OWNED(ump);
2204	fs = ump->um_fs;
2205	inodedephd = INODEDEP_HASH(ump, inum);
2206
2207	if (inodedep_find(inodedephd, inum, inodedeppp))
2208		return (1);
2209	if ((flags & DEPALLOC) == 0)
2210		return (0);
2211	/*
2212	 * If the system is over its limit and our filesystem is
2213	 * responsible for more than our share of that usage and
2214	 * we are not in a rush, request some inodedep cleanup.
2215	 */
2216	if (softdep_excess_inodes(ump))
2217		schedule_cleanup(mp);
2218	else
2219		FREE_LOCK(ump);
2220	inodedep = malloc(sizeof(struct inodedep),
2221		M_INODEDEP, M_SOFTDEP_FLAGS);
2222	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2223	ACQUIRE_LOCK(ump);
2224	if (inodedep_find(inodedephd, inum, inodedeppp)) {
2225		WORKITEM_FREE(inodedep, D_INODEDEP);
2226		return (1);
2227	}
2228	inodedep->id_fs = fs;
2229	inodedep->id_ino = inum;
2230	inodedep->id_state = ALLCOMPLETE;
2231	inodedep->id_nlinkdelta = 0;
2232	inodedep->id_savedino1 = NULL;
2233	inodedep->id_savedsize = -1;
2234	inodedep->id_savedextsize = -1;
2235	inodedep->id_savednlink = -1;
2236	inodedep->id_bmsafemap = NULL;
2237	inodedep->id_mkdiradd = NULL;
2238	LIST_INIT(&inodedep->id_dirremhd);
2239	LIST_INIT(&inodedep->id_pendinghd);
2240	LIST_INIT(&inodedep->id_inowait);
2241	LIST_INIT(&inodedep->id_bufwait);
2242	TAILQ_INIT(&inodedep->id_inoreflst);
2243	TAILQ_INIT(&inodedep->id_inoupdt);
2244	TAILQ_INIT(&inodedep->id_newinoupdt);
2245	TAILQ_INIT(&inodedep->id_extupdt);
2246	TAILQ_INIT(&inodedep->id_newextupdt);
2247	TAILQ_INIT(&inodedep->id_freeblklst);
2248	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2249	*inodedeppp = inodedep;
2250	return (0);
2251}
2252
2253/*
2254 * Structures and routines associated with newblk caching.
2255 */
2256#define	NEWBLK_HASH(ump, inum) \
2257	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2258
2259static int
2260newblk_find(newblkhd, newblkno, flags, newblkpp)
2261	struct newblk_hashhead *newblkhd;
2262	ufs2_daddr_t newblkno;
2263	int flags;
2264	struct newblk **newblkpp;
2265{
2266	struct newblk *newblk;
2267
2268	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2269		if (newblkno != newblk->nb_newblkno)
2270			continue;
2271		/*
2272		 * If we're creating a new dependency don't match those that
2273		 * have already been converted to allocdirects.  This is for
2274		 * a frag extend.
2275		 */
2276		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2277			continue;
2278		break;
2279	}
2280	if (newblk) {
2281		*newblkpp = newblk;
2282		return (1);
2283	}
2284	*newblkpp = NULL;
2285	return (0);
2286}
2287
2288/*
2289 * Look up a newblk. Return 1 if found, 0 if not found.
2290 * If not found, allocate if DEPALLOC flag is passed.
2291 * Found or allocated entry is returned in newblkpp.
2292 */
2293static int
2294newblk_lookup(mp, newblkno, flags, newblkpp)
2295	struct mount *mp;
2296	ufs2_daddr_t newblkno;
2297	int flags;
2298	struct newblk **newblkpp;
2299{
2300	struct newblk *newblk;
2301	struct newblk_hashhead *newblkhd;
2302	struct ufsmount *ump;
2303
2304	ump = VFSTOUFS(mp);
2305	LOCK_OWNED(ump);
2306	newblkhd = NEWBLK_HASH(ump, newblkno);
2307	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2308		return (1);
2309	if ((flags & DEPALLOC) == 0)
2310		return (0);
2311	FREE_LOCK(ump);
2312	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2313	    M_SOFTDEP_FLAGS | M_ZERO);
2314	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2315	ACQUIRE_LOCK(ump);
2316	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2317		WORKITEM_FREE(newblk, D_NEWBLK);
2318		return (1);
2319	}
2320	newblk->nb_freefrag = NULL;
2321	LIST_INIT(&newblk->nb_indirdeps);
2322	LIST_INIT(&newblk->nb_newdirblk);
2323	LIST_INIT(&newblk->nb_jwork);
2324	newblk->nb_state = ATTACHED;
2325	newblk->nb_newblkno = newblkno;
2326	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2327	*newblkpp = newblk;
2328	return (0);
2329}
2330
2331/*
2332 * Structures and routines associated with freed indirect block caching.
2333 */
2334#define	INDIR_HASH(ump, blkno) \
2335	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2336
2337/*
2338 * Lookup an indirect block in the indir hash table.  The freework is
2339 * removed and potentially freed.  The caller must do a blocking journal
2340 * write before writing to the blkno.
2341 */
2342static int
2343indirblk_lookup(mp, blkno)
2344	struct mount *mp;
2345	ufs2_daddr_t blkno;
2346{
2347	struct freework *freework;
2348	struct indir_hashhead *wkhd;
2349	struct ufsmount *ump;
2350
2351	ump = VFSTOUFS(mp);
2352	wkhd = INDIR_HASH(ump, blkno);
2353	TAILQ_FOREACH(freework, wkhd, fw_next) {
2354		if (freework->fw_blkno != blkno)
2355			continue;
2356		indirblk_remove(freework);
2357		return (1);
2358	}
2359	return (0);
2360}
2361
2362/*
2363 * Insert an indirect block represented by freework into the indirblk
2364 * hash table so that it may prevent the block from being re-used prior
2365 * to the journal being written.
2366 */
2367static void
2368indirblk_insert(freework)
2369	struct freework *freework;
2370{
2371	struct jblocks *jblocks;
2372	struct jseg *jseg;
2373	struct ufsmount *ump;
2374
2375	ump = VFSTOUFS(freework->fw_list.wk_mp);
2376	jblocks = ump->softdep_jblocks;
2377	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2378	if (jseg == NULL)
2379		return;
2380
2381	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2382	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2383	    fw_next);
2384	freework->fw_state &= ~DEPCOMPLETE;
2385}
2386
2387static void
2388indirblk_remove(freework)
2389	struct freework *freework;
2390{
2391	struct ufsmount *ump;
2392
2393	ump = VFSTOUFS(freework->fw_list.wk_mp);
2394	LIST_REMOVE(freework, fw_segs);
2395	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2396	freework->fw_state |= DEPCOMPLETE;
2397	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2398		WORKITEM_FREE(freework, D_FREEWORK);
2399}
2400
2401/*
2402 * Executed during filesystem system initialization before
2403 * mounting any filesystems.
2404 */
2405void
2406softdep_initialize()
2407{
2408
2409	TAILQ_INIT(&softdepmounts);
2410	max_softdeps = desiredvnodes * 4;
2411
2412	/* initialise bioops hack */
2413	bioops.io_start = softdep_disk_io_initiation;
2414	bioops.io_complete = softdep_disk_write_complete;
2415	bioops.io_deallocate = softdep_deallocate_dependencies;
2416	bioops.io_countdeps = softdep_count_dependencies;
2417	softdep_ast_cleanup = softdep_ast_cleanup_proc;
2418
2419	/* Initialize the callout with an mtx. */
2420	callout_init_mtx(&softdep_callout, &lk, 0);
2421}
2422
2423/*
2424 * Executed after all filesystems have been unmounted during
2425 * filesystem module unload.
2426 */
2427void
2428softdep_uninitialize()
2429{
2430
2431	/* clear bioops hack */
2432	bioops.io_start = NULL;
2433	bioops.io_complete = NULL;
2434	bioops.io_deallocate = NULL;
2435	bioops.io_countdeps = NULL;
2436	softdep_ast_cleanup = NULL;
2437
2438	callout_drain(&softdep_callout);
2439}
2440
2441/*
2442 * Called at mount time to notify the dependency code that a
2443 * filesystem wishes to use it.
2444 */
2445int
2446softdep_mount(devvp, mp, fs, cred)
2447	struct vnode *devvp;
2448	struct mount *mp;
2449	struct fs *fs;
2450	struct ucred *cred;
2451{
2452	struct csum_total cstotal;
2453	struct mount_softdeps *sdp;
2454	struct ufsmount *ump;
2455	struct cg *cgp;
2456	struct buf *bp;
2457	int i, error, cyl;
2458
2459	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2460	    M_WAITOK | M_ZERO);
2461	MNT_ILOCK(mp);
2462	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2463	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2464		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2465			MNTK_SOFTDEP | MNTK_NOASYNC;
2466	}
2467	ump = VFSTOUFS(mp);
2468	ump->um_softdep = sdp;
2469	MNT_IUNLOCK(mp);
2470	rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
2471	sdp->sd_ump = ump;
2472	LIST_INIT(&ump->softdep_workitem_pending);
2473	LIST_INIT(&ump->softdep_journal_pending);
2474	TAILQ_INIT(&ump->softdep_unlinked);
2475	LIST_INIT(&ump->softdep_dirtycg);
2476	ump->softdep_worklist_tail = NULL;
2477	ump->softdep_on_worklist = 0;
2478	ump->softdep_deps = 0;
2479	LIST_INIT(&ump->softdep_mkdirlisthd);
2480	ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2481	    &ump->pagedep_hash_size);
2482	ump->pagedep_nextclean = 0;
2483	ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2484	    &ump->inodedep_hash_size);
2485	ump->inodedep_nextclean = 0;
2486	ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2487	    &ump->newblk_hash_size);
2488	ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2489	    &ump->bmsafemap_hash_size);
2490	i = 1 << (ffs(desiredvnodes / 10) - 1);
2491	ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2492	    M_FREEWORK, M_WAITOK);
2493	ump->indir_hash_size = i - 1;
2494	for (i = 0; i <= ump->indir_hash_size; i++)
2495		TAILQ_INIT(&ump->indir_hashtbl[i]);
2496	ACQUIRE_GBLLOCK(&lk);
2497	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2498	FREE_GBLLOCK(&lk);
2499	if ((fs->fs_flags & FS_SUJ) &&
2500	    (error = journal_mount(mp, fs, cred)) != 0) {
2501		printf("Failed to start journal: %d\n", error);
2502		softdep_unmount(mp);
2503		return (error);
2504	}
2505	/*
2506	 * Start our flushing thread in the bufdaemon process.
2507	 */
2508	ACQUIRE_LOCK(ump);
2509	ump->softdep_flags |= FLUSH_STARTING;
2510	FREE_LOCK(ump);
2511	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2512	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2513	    mp->mnt_stat.f_mntonname);
2514	ACQUIRE_LOCK(ump);
2515	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2516		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2517		    hz / 2);
2518	}
2519	FREE_LOCK(ump);
2520	/*
2521	 * When doing soft updates, the counters in the
2522	 * superblock may have gotten out of sync. Recomputation
2523	 * can take a long time and can be deferred for background
2524	 * fsck.  However, the old behavior of scanning the cylinder
2525	 * groups and recalculating them at mount time is available
2526	 * by setting vfs.ffs.compute_summary_at_mount to one.
2527	 */
2528	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2529		return (0);
2530	bzero(&cstotal, sizeof cstotal);
2531	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2532		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2533		    fs->fs_cgsize, cred, &bp)) != 0) {
2534			brelse(bp);
2535			softdep_unmount(mp);
2536			return (error);
2537		}
2538		cgp = (struct cg *)bp->b_data;
2539		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2540		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2541		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2542		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2543		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2544		brelse(bp);
2545	}
2546#ifdef DEBUG
2547	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2548		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2549#endif
2550	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2551	return (0);
2552}
2553
2554void
2555softdep_unmount(mp)
2556	struct mount *mp;
2557{
2558	struct ufsmount *ump;
2559#ifdef INVARIANTS
2560	int i;
2561#endif
2562
2563	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2564	    ("softdep_unmount called on non-softdep filesystem"));
2565	ump = VFSTOUFS(mp);
2566	MNT_ILOCK(mp);
2567	mp->mnt_flag &= ~MNT_SOFTDEP;
2568	if (MOUNTEDSUJ(mp) == 0) {
2569		MNT_IUNLOCK(mp);
2570	} else {
2571		mp->mnt_flag &= ~MNT_SUJ;
2572		MNT_IUNLOCK(mp);
2573		journal_unmount(ump);
2574	}
2575	/*
2576	 * Shut down our flushing thread. Check for NULL is if
2577	 * softdep_mount errors out before the thread has been created.
2578	 */
2579	if (ump->softdep_flushtd != NULL) {
2580		ACQUIRE_LOCK(ump);
2581		ump->softdep_flags |= FLUSH_EXIT;
2582		wakeup(&ump->softdep_flushtd);
2583		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2584		    "sdwait", 0);
2585		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2586		    ("Thread shutdown failed"));
2587	}
2588	/*
2589	 * Free up our resources.
2590	 */
2591	ACQUIRE_GBLLOCK(&lk);
2592	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2593	FREE_GBLLOCK(&lk);
2594	rw_destroy(LOCK_PTR(ump));
2595	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2596	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2597	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2598	hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2599	    ump->bmsafemap_hash_size);
2600	free(ump->indir_hashtbl, M_FREEWORK);
2601#ifdef INVARIANTS
2602	for (i = 0; i <= D_LAST; i++)
2603		KASSERT(ump->softdep_curdeps[i] == 0,
2604		    ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2605		    TYPENAME(i), ump->softdep_curdeps[i]));
2606#endif
2607	free(ump->um_softdep, M_MOUNTDATA);
2608}
2609
2610static struct jblocks *
2611jblocks_create(void)
2612{
2613	struct jblocks *jblocks;
2614
2615	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2616	TAILQ_INIT(&jblocks->jb_segs);
2617	jblocks->jb_avail = 10;
2618	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2619	    M_JBLOCKS, M_WAITOK | M_ZERO);
2620
2621	return (jblocks);
2622}
2623
2624static ufs2_daddr_t
2625jblocks_alloc(jblocks, bytes, actual)
2626	struct jblocks *jblocks;
2627	int bytes;
2628	int *actual;
2629{
2630	ufs2_daddr_t daddr;
2631	struct jextent *jext;
2632	int freecnt;
2633	int blocks;
2634
2635	blocks = bytes / DEV_BSIZE;
2636	jext = &jblocks->jb_extent[jblocks->jb_head];
2637	freecnt = jext->je_blocks - jblocks->jb_off;
2638	if (freecnt == 0) {
2639		jblocks->jb_off = 0;
2640		if (++jblocks->jb_head > jblocks->jb_used)
2641			jblocks->jb_head = 0;
2642		jext = &jblocks->jb_extent[jblocks->jb_head];
2643		freecnt = jext->je_blocks;
2644	}
2645	if (freecnt > blocks)
2646		freecnt = blocks;
2647	*actual = freecnt * DEV_BSIZE;
2648	daddr = jext->je_daddr + jblocks->jb_off;
2649	jblocks->jb_off += freecnt;
2650	jblocks->jb_free -= freecnt;
2651
2652	return (daddr);
2653}
2654
2655static void
2656jblocks_free(jblocks, mp, bytes)
2657	struct jblocks *jblocks;
2658	struct mount *mp;
2659	int bytes;
2660{
2661
2662	LOCK_OWNED(VFSTOUFS(mp));
2663	jblocks->jb_free += bytes / DEV_BSIZE;
2664	if (jblocks->jb_suspended)
2665		worklist_speedup(mp);
2666	wakeup(jblocks);
2667}
2668
2669static void
2670jblocks_destroy(jblocks)
2671	struct jblocks *jblocks;
2672{
2673
2674	if (jblocks->jb_extent)
2675		free(jblocks->jb_extent, M_JBLOCKS);
2676	free(jblocks, M_JBLOCKS);
2677}
2678
2679static void
2680jblocks_add(jblocks, daddr, blocks)
2681	struct jblocks *jblocks;
2682	ufs2_daddr_t daddr;
2683	int blocks;
2684{
2685	struct jextent *jext;
2686
2687	jblocks->jb_blocks += blocks;
2688	jblocks->jb_free += blocks;
2689	jext = &jblocks->jb_extent[jblocks->jb_used];
2690	/* Adding the first block. */
2691	if (jext->je_daddr == 0) {
2692		jext->je_daddr = daddr;
2693		jext->je_blocks = blocks;
2694		return;
2695	}
2696	/* Extending the last extent. */
2697	if (jext->je_daddr + jext->je_blocks == daddr) {
2698		jext->je_blocks += blocks;
2699		return;
2700	}
2701	/* Adding a new extent. */
2702	if (++jblocks->jb_used == jblocks->jb_avail) {
2703		jblocks->jb_avail *= 2;
2704		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2705		    M_JBLOCKS, M_WAITOK | M_ZERO);
2706		memcpy(jext, jblocks->jb_extent,
2707		    sizeof(struct jextent) * jblocks->jb_used);
2708		free(jblocks->jb_extent, M_JBLOCKS);
2709		jblocks->jb_extent = jext;
2710	}
2711	jext = &jblocks->jb_extent[jblocks->jb_used];
2712	jext->je_daddr = daddr;
2713	jext->je_blocks = blocks;
2714	return;
2715}
2716
2717int
2718softdep_journal_lookup(mp, vpp)
2719	struct mount *mp;
2720	struct vnode **vpp;
2721{
2722	struct componentname cnp;
2723	struct vnode *dvp;
2724	ino_t sujournal;
2725	int error;
2726
2727	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2728	if (error)
2729		return (error);
2730	bzero(&cnp, sizeof(cnp));
2731	cnp.cn_nameiop = LOOKUP;
2732	cnp.cn_flags = ISLASTCN;
2733	cnp.cn_thread = curthread;
2734	cnp.cn_cred = curthread->td_ucred;
2735	cnp.cn_pnbuf = SUJ_FILE;
2736	cnp.cn_nameptr = SUJ_FILE;
2737	cnp.cn_namelen = strlen(SUJ_FILE);
2738	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2739	vput(dvp);
2740	if (error != 0)
2741		return (error);
2742	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2743	return (error);
2744}
2745
2746/*
2747 * Open and verify the journal file.
2748 */
2749static int
2750journal_mount(mp, fs, cred)
2751	struct mount *mp;
2752	struct fs *fs;
2753	struct ucred *cred;
2754{
2755	struct jblocks *jblocks;
2756	struct ufsmount *ump;
2757	struct vnode *vp;
2758	struct inode *ip;
2759	ufs2_daddr_t blkno;
2760	int bcount;
2761	int error;
2762	int i;
2763
2764	ump = VFSTOUFS(mp);
2765	ump->softdep_journal_tail = NULL;
2766	ump->softdep_on_journal = 0;
2767	ump->softdep_accdeps = 0;
2768	ump->softdep_req = 0;
2769	ump->softdep_jblocks = NULL;
2770	error = softdep_journal_lookup(mp, &vp);
2771	if (error != 0) {
2772		printf("Failed to find journal.  Use tunefs to create one\n");
2773		return (error);
2774	}
2775	ip = VTOI(vp);
2776	if (ip->i_size < SUJ_MIN) {
2777		error = ENOSPC;
2778		goto out;
2779	}
2780	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2781	jblocks = jblocks_create();
2782	for (i = 0; i < bcount; i++) {
2783		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2784		if (error)
2785			break;
2786		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2787	}
2788	if (error) {
2789		jblocks_destroy(jblocks);
2790		goto out;
2791	}
2792	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2793	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2794	ump->softdep_jblocks = jblocks;
2795out:
2796	if (error == 0) {
2797		MNT_ILOCK(mp);
2798		mp->mnt_flag |= MNT_SUJ;
2799		mp->mnt_flag &= ~MNT_SOFTDEP;
2800		MNT_IUNLOCK(mp);
2801		/*
2802		 * Only validate the journal contents if the
2803		 * filesystem is clean, otherwise we write the logs
2804		 * but they'll never be used.  If the filesystem was
2805		 * still dirty when we mounted it the journal is
2806		 * invalid and a new journal can only be valid if it
2807		 * starts from a clean mount.
2808		 */
2809		if (fs->fs_clean) {
2810			DIP_SET(ip, i_modrev, fs->fs_mtime);
2811			ip->i_flags |= IN_MODIFIED;
2812			ffs_update(vp, 1);
2813		}
2814	}
2815	vput(vp);
2816	return (error);
2817}
2818
2819static void
2820journal_unmount(ump)
2821	struct ufsmount *ump;
2822{
2823
2824	if (ump->softdep_jblocks)
2825		jblocks_destroy(ump->softdep_jblocks);
2826	ump->softdep_jblocks = NULL;
2827}
2828
2829/*
2830 * Called when a journal record is ready to be written.  Space is allocated
2831 * and the journal entry is created when the journal is flushed to stable
2832 * store.
2833 */
2834static void
2835add_to_journal(wk)
2836	struct worklist *wk;
2837{
2838	struct ufsmount *ump;
2839
2840	ump = VFSTOUFS(wk->wk_mp);
2841	LOCK_OWNED(ump);
2842	if (wk->wk_state & ONWORKLIST)
2843		panic("add_to_journal: %s(0x%X) already on list",
2844		    TYPENAME(wk->wk_type), wk->wk_state);
2845	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2846	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2847		ump->softdep_jblocks->jb_age = ticks;
2848		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2849	} else
2850		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2851	ump->softdep_journal_tail = wk;
2852	ump->softdep_on_journal += 1;
2853}
2854
2855/*
2856 * Remove an arbitrary item for the journal worklist maintain the tail
2857 * pointer.  This happens when a new operation obviates the need to
2858 * journal an old operation.
2859 */
2860static void
2861remove_from_journal(wk)
2862	struct worklist *wk;
2863{
2864	struct ufsmount *ump;
2865
2866	ump = VFSTOUFS(wk->wk_mp);
2867	LOCK_OWNED(ump);
2868#ifdef SUJ_DEBUG
2869	{
2870		struct worklist *wkn;
2871
2872		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2873			if (wkn == wk)
2874				break;
2875		if (wkn == NULL)
2876			panic("remove_from_journal: %p is not in journal", wk);
2877	}
2878#endif
2879	/*
2880	 * We emulate a TAILQ to save space in most structures which do not
2881	 * require TAILQ semantics.  Here we must update the tail position
2882	 * when removing the tail which is not the final entry. This works
2883	 * only if the worklist linkage are at the beginning of the structure.
2884	 */
2885	if (ump->softdep_journal_tail == wk)
2886		ump->softdep_journal_tail =
2887		    (struct worklist *)wk->wk_list.le_prev;
2888
2889	WORKLIST_REMOVE(wk);
2890	ump->softdep_on_journal -= 1;
2891}
2892
2893/*
2894 * Check for journal space as well as dependency limits so the prelink
2895 * code can throttle both journaled and non-journaled filesystems.
2896 * Threshold is 0 for low and 1 for min.
2897 */
2898static int
2899journal_space(ump, thresh)
2900	struct ufsmount *ump;
2901	int thresh;
2902{
2903	struct jblocks *jblocks;
2904	int limit, avail;
2905
2906	jblocks = ump->softdep_jblocks;
2907	if (jblocks == NULL)
2908		return (1);
2909	/*
2910	 * We use a tighter restriction here to prevent request_cleanup()
2911	 * running in threads from running into locks we currently hold.
2912	 * We have to be over the limit and our filesystem has to be
2913	 * responsible for more than our share of that usage.
2914	 */
2915	limit = (max_softdeps / 10) * 9;
2916	if (dep_current[D_INODEDEP] > limit &&
2917	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2918		return (0);
2919	if (thresh)
2920		thresh = jblocks->jb_min;
2921	else
2922		thresh = jblocks->jb_low;
2923	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2924	avail = jblocks->jb_free - avail;
2925
2926	return (avail > thresh);
2927}
2928
2929static void
2930journal_suspend(ump)
2931	struct ufsmount *ump;
2932{
2933	struct jblocks *jblocks;
2934	struct mount *mp;
2935
2936	mp = UFSTOVFS(ump);
2937	jblocks = ump->softdep_jblocks;
2938	MNT_ILOCK(mp);
2939	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2940		stat_journal_min++;
2941		mp->mnt_kern_flag |= MNTK_SUSPEND;
2942		mp->mnt_susp_owner = ump->softdep_flushtd;
2943	}
2944	jblocks->jb_suspended = 1;
2945	MNT_IUNLOCK(mp);
2946}
2947
2948static int
2949journal_unsuspend(struct ufsmount *ump)
2950{
2951	struct jblocks *jblocks;
2952	struct mount *mp;
2953
2954	mp = UFSTOVFS(ump);
2955	jblocks = ump->softdep_jblocks;
2956
2957	if (jblocks != NULL && jblocks->jb_suspended &&
2958	    journal_space(ump, jblocks->jb_min)) {
2959		jblocks->jb_suspended = 0;
2960		FREE_LOCK(ump);
2961		mp->mnt_susp_owner = curthread;
2962		vfs_write_resume(mp, 0);
2963		ACQUIRE_LOCK(ump);
2964		return (1);
2965	}
2966	return (0);
2967}
2968
2969/*
2970 * Called before any allocation function to be certain that there is
2971 * sufficient space in the journal prior to creating any new records.
2972 * Since in the case of block allocation we may have multiple locked
2973 * buffers at the time of the actual allocation we can not block
2974 * when the journal records are created.  Doing so would create a deadlock
2975 * if any of these buffers needed to be flushed to reclaim space.  Instead
2976 * we require a sufficiently large amount of available space such that
2977 * each thread in the system could have passed this allocation check and
2978 * still have sufficient free space.  With 20% of a minimum journal size
2979 * of 1MB we have 6553 records available.
2980 */
2981int
2982softdep_prealloc(vp, waitok)
2983	struct vnode *vp;
2984	int waitok;
2985{
2986	struct ufsmount *ump;
2987
2988	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
2989	    ("softdep_prealloc called on non-softdep filesystem"));
2990	/*
2991	 * Nothing to do if we are not running journaled soft updates.
2992	 * If we currently hold the snapshot lock, we must avoid handling
2993	 * other resources that could cause deadlock.
2994	 */
2995	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2996		return (0);
2997	ump = VFSTOUFS(vp->v_mount);
2998	ACQUIRE_LOCK(ump);
2999	if (journal_space(ump, 0)) {
3000		FREE_LOCK(ump);
3001		return (0);
3002	}
3003	stat_journal_low++;
3004	FREE_LOCK(ump);
3005	if (waitok == MNT_NOWAIT)
3006		return (ENOSPC);
3007	/*
3008	 * Attempt to sync this vnode once to flush any journal
3009	 * work attached to it.
3010	 */
3011	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3012		ffs_syncvnode(vp, waitok, 0);
3013	ACQUIRE_LOCK(ump);
3014	process_removes(vp);
3015	process_truncates(vp);
3016	if (journal_space(ump, 0) == 0) {
3017		softdep_speedup(ump);
3018		if (journal_space(ump, 1) == 0)
3019			journal_suspend(ump);
3020	}
3021	FREE_LOCK(ump);
3022
3023	return (0);
3024}
3025
3026/*
3027 * Before adjusting a link count on a vnode verify that we have sufficient
3028 * journal space.  If not, process operations that depend on the currently
3029 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3030 * and softdep flush threads can not acquire these locks to reclaim space.
3031 */
3032static void
3033softdep_prelink(dvp, vp)
3034	struct vnode *dvp;
3035	struct vnode *vp;
3036{
3037	struct ufsmount *ump;
3038
3039	ump = VFSTOUFS(dvp->v_mount);
3040	LOCK_OWNED(ump);
3041	/*
3042	 * Nothing to do if we have sufficient journal space.
3043	 * If we currently hold the snapshot lock, we must avoid
3044	 * handling other resources that could cause deadlock.
3045	 */
3046	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3047		return;
3048	stat_journal_low++;
3049	FREE_LOCK(ump);
3050	if (vp)
3051		ffs_syncvnode(vp, MNT_NOWAIT, 0);
3052	ffs_syncvnode(dvp, MNT_WAIT, 0);
3053	ACQUIRE_LOCK(ump);
3054	/* Process vp before dvp as it may create .. removes. */
3055	if (vp) {
3056		process_removes(vp);
3057		process_truncates(vp);
3058	}
3059	process_removes(dvp);
3060	process_truncates(dvp);
3061	softdep_speedup(ump);
3062	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3063	if (journal_space(ump, 0) == 0) {
3064		softdep_speedup(ump);
3065		if (journal_space(ump, 1) == 0)
3066			journal_suspend(ump);
3067	}
3068}
3069
3070static void
3071jseg_write(ump, jseg, data)
3072	struct ufsmount *ump;
3073	struct jseg *jseg;
3074	uint8_t *data;
3075{
3076	struct jsegrec *rec;
3077
3078	rec = (struct jsegrec *)data;
3079	rec->jsr_seq = jseg->js_seq;
3080	rec->jsr_oldest = jseg->js_oldseq;
3081	rec->jsr_cnt = jseg->js_cnt;
3082	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3083	rec->jsr_crc = 0;
3084	rec->jsr_time = ump->um_fs->fs_mtime;
3085}
3086
3087static inline void
3088inoref_write(inoref, jseg, rec)
3089	struct inoref *inoref;
3090	struct jseg *jseg;
3091	struct jrefrec *rec;
3092{
3093
3094	inoref->if_jsegdep->jd_seg = jseg;
3095	rec->jr_ino = inoref->if_ino;
3096	rec->jr_parent = inoref->if_parent;
3097	rec->jr_nlink = inoref->if_nlink;
3098	rec->jr_mode = inoref->if_mode;
3099	rec->jr_diroff = inoref->if_diroff;
3100}
3101
3102static void
3103jaddref_write(jaddref, jseg, data)
3104	struct jaddref *jaddref;
3105	struct jseg *jseg;
3106	uint8_t *data;
3107{
3108	struct jrefrec *rec;
3109
3110	rec = (struct jrefrec *)data;
3111	rec->jr_op = JOP_ADDREF;
3112	inoref_write(&jaddref->ja_ref, jseg, rec);
3113}
3114
3115static void
3116jremref_write(jremref, jseg, data)
3117	struct jremref *jremref;
3118	struct jseg *jseg;
3119	uint8_t *data;
3120{
3121	struct jrefrec *rec;
3122
3123	rec = (struct jrefrec *)data;
3124	rec->jr_op = JOP_REMREF;
3125	inoref_write(&jremref->jr_ref, jseg, rec);
3126}
3127
3128static void
3129jmvref_write(jmvref, jseg, data)
3130	struct jmvref *jmvref;
3131	struct jseg *jseg;
3132	uint8_t *data;
3133{
3134	struct jmvrec *rec;
3135
3136	rec = (struct jmvrec *)data;
3137	rec->jm_op = JOP_MVREF;
3138	rec->jm_ino = jmvref->jm_ino;
3139	rec->jm_parent = jmvref->jm_parent;
3140	rec->jm_oldoff = jmvref->jm_oldoff;
3141	rec->jm_newoff = jmvref->jm_newoff;
3142}
3143
3144static void
3145jnewblk_write(jnewblk, jseg, data)
3146	struct jnewblk *jnewblk;
3147	struct jseg *jseg;
3148	uint8_t *data;
3149{
3150	struct jblkrec *rec;
3151
3152	jnewblk->jn_jsegdep->jd_seg = jseg;
3153	rec = (struct jblkrec *)data;
3154	rec->jb_op = JOP_NEWBLK;
3155	rec->jb_ino = jnewblk->jn_ino;
3156	rec->jb_blkno = jnewblk->jn_blkno;
3157	rec->jb_lbn = jnewblk->jn_lbn;
3158	rec->jb_frags = jnewblk->jn_frags;
3159	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3160}
3161
3162static void
3163jfreeblk_write(jfreeblk, jseg, data)
3164	struct jfreeblk *jfreeblk;
3165	struct jseg *jseg;
3166	uint8_t *data;
3167{
3168	struct jblkrec *rec;
3169
3170	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3171	rec = (struct jblkrec *)data;
3172	rec->jb_op = JOP_FREEBLK;
3173	rec->jb_ino = jfreeblk->jf_ino;
3174	rec->jb_blkno = jfreeblk->jf_blkno;
3175	rec->jb_lbn = jfreeblk->jf_lbn;
3176	rec->jb_frags = jfreeblk->jf_frags;
3177	rec->jb_oldfrags = 0;
3178}
3179
3180static void
3181jfreefrag_write(jfreefrag, jseg, data)
3182	struct jfreefrag *jfreefrag;
3183	struct jseg *jseg;
3184	uint8_t *data;
3185{
3186	struct jblkrec *rec;
3187
3188	jfreefrag->fr_jsegdep->jd_seg = jseg;
3189	rec = (struct jblkrec *)data;
3190	rec->jb_op = JOP_FREEBLK;
3191	rec->jb_ino = jfreefrag->fr_ino;
3192	rec->jb_blkno = jfreefrag->fr_blkno;
3193	rec->jb_lbn = jfreefrag->fr_lbn;
3194	rec->jb_frags = jfreefrag->fr_frags;
3195	rec->jb_oldfrags = 0;
3196}
3197
3198static void
3199jtrunc_write(jtrunc, jseg, data)
3200	struct jtrunc *jtrunc;
3201	struct jseg *jseg;
3202	uint8_t *data;
3203{
3204	struct jtrncrec *rec;
3205
3206	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3207	rec = (struct jtrncrec *)data;
3208	rec->jt_op = JOP_TRUNC;
3209	rec->jt_ino = jtrunc->jt_ino;
3210	rec->jt_size = jtrunc->jt_size;
3211	rec->jt_extsize = jtrunc->jt_extsize;
3212}
3213
3214static void
3215jfsync_write(jfsync, jseg, data)
3216	struct jfsync *jfsync;
3217	struct jseg *jseg;
3218	uint8_t *data;
3219{
3220	struct jtrncrec *rec;
3221
3222	rec = (struct jtrncrec *)data;
3223	rec->jt_op = JOP_SYNC;
3224	rec->jt_ino = jfsync->jfs_ino;
3225	rec->jt_size = jfsync->jfs_size;
3226	rec->jt_extsize = jfsync->jfs_extsize;
3227}
3228
3229static void
3230softdep_flushjournal(mp)
3231	struct mount *mp;
3232{
3233	struct jblocks *jblocks;
3234	struct ufsmount *ump;
3235
3236	if (MOUNTEDSUJ(mp) == 0)
3237		return;
3238	ump = VFSTOUFS(mp);
3239	jblocks = ump->softdep_jblocks;
3240	ACQUIRE_LOCK(ump);
3241	while (ump->softdep_on_journal) {
3242		jblocks->jb_needseg = 1;
3243		softdep_process_journal(mp, NULL, MNT_WAIT);
3244	}
3245	FREE_LOCK(ump);
3246}
3247
3248static void softdep_synchronize_completed(struct bio *);
3249static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3250
3251static void
3252softdep_synchronize_completed(bp)
3253        struct bio *bp;
3254{
3255	struct jseg *oldest;
3256	struct jseg *jseg;
3257	struct ufsmount *ump;
3258
3259	/*
3260	 * caller1 marks the last segment written before we issued the
3261	 * synchronize cache.
3262	 */
3263	jseg = bp->bio_caller1;
3264	if (jseg == NULL) {
3265		g_destroy_bio(bp);
3266		return;
3267	}
3268	ump = VFSTOUFS(jseg->js_list.wk_mp);
3269	ACQUIRE_LOCK(ump);
3270	oldest = NULL;
3271	/*
3272	 * Mark all the journal entries waiting on the synchronize cache
3273	 * as completed so they may continue on.
3274	 */
3275	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3276		jseg->js_state |= COMPLETE;
3277		oldest = jseg;
3278		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3279	}
3280	/*
3281	 * Restart deferred journal entry processing from the oldest
3282	 * completed jseg.
3283	 */
3284	if (oldest)
3285		complete_jsegs(oldest);
3286
3287	FREE_LOCK(ump);
3288	g_destroy_bio(bp);
3289}
3290
3291/*
3292 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3293 * barriers.  The journal must be written prior to any blocks that depend
3294 * on it and the journal can not be released until the blocks have be
3295 * written.  This code handles both barriers simultaneously.
3296 */
3297static void
3298softdep_synchronize(bp, ump, caller1)
3299	struct bio *bp;
3300	struct ufsmount *ump;
3301	void *caller1;
3302{
3303
3304	bp->bio_cmd = BIO_FLUSH;
3305	bp->bio_flags |= BIO_ORDERED;
3306	bp->bio_data = NULL;
3307	bp->bio_offset = ump->um_cp->provider->mediasize;
3308	bp->bio_length = 0;
3309	bp->bio_done = softdep_synchronize_completed;
3310	bp->bio_caller1 = caller1;
3311	g_io_request(bp,
3312	    (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3313}
3314
3315/*
3316 * Flush some journal records to disk.
3317 */
3318static void
3319softdep_process_journal(mp, needwk, flags)
3320	struct mount *mp;
3321	struct worklist *needwk;
3322	int flags;
3323{
3324	struct jblocks *jblocks;
3325	struct ufsmount *ump;
3326	struct worklist *wk;
3327	struct jseg *jseg;
3328	struct buf *bp;
3329	struct bio *bio;
3330	uint8_t *data;
3331	struct fs *fs;
3332	int shouldflush;
3333	int segwritten;
3334	int jrecmin;	/* Minimum records per block. */
3335	int jrecmax;	/* Maximum records per block. */
3336	int size;
3337	int cnt;
3338	int off;
3339	int devbsize;
3340
3341	if (MOUNTEDSUJ(mp) == 0)
3342		return;
3343	shouldflush = softdep_flushcache;
3344	bio = NULL;
3345	jseg = NULL;
3346	ump = VFSTOUFS(mp);
3347	LOCK_OWNED(ump);
3348	fs = ump->um_fs;
3349	jblocks = ump->softdep_jblocks;
3350	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3351	/*
3352	 * We write anywhere between a disk block and fs block.  The upper
3353	 * bound is picked to prevent buffer cache fragmentation and limit
3354	 * processing time per I/O.
3355	 */
3356	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3357	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3358	segwritten = 0;
3359	for (;;) {
3360		cnt = ump->softdep_on_journal;
3361		/*
3362		 * Criteria for writing a segment:
3363		 * 1) We have a full block.
3364		 * 2) We're called from jwait() and haven't found the
3365		 *    journal item yet.
3366		 * 3) Always write if needseg is set.
3367		 * 4) If we are called from process_worklist and have
3368		 *    not yet written anything we write a partial block
3369		 *    to enforce a 1 second maximum latency on journal
3370		 *    entries.
3371		 */
3372		if (cnt < (jrecmax - 1) && needwk == NULL &&
3373		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3374			break;
3375		cnt++;
3376		/*
3377		 * Verify some free journal space.  softdep_prealloc() should
3378		 * guarantee that we don't run out so this is indicative of
3379		 * a problem with the flow control.  Try to recover
3380		 * gracefully in any event.
3381		 */
3382		while (jblocks->jb_free == 0) {
3383			if (flags != MNT_WAIT)
3384				break;
3385			printf("softdep: Out of journal space!\n");
3386			softdep_speedup(ump);
3387			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3388		}
3389		FREE_LOCK(ump);
3390		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3391		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3392		LIST_INIT(&jseg->js_entries);
3393		LIST_INIT(&jseg->js_indirs);
3394		jseg->js_state = ATTACHED;
3395		if (shouldflush == 0)
3396			jseg->js_state |= COMPLETE;
3397		else if (bio == NULL)
3398			bio = g_alloc_bio();
3399		jseg->js_jblocks = jblocks;
3400		bp = geteblk(fs->fs_bsize, 0);
3401		ACQUIRE_LOCK(ump);
3402		/*
3403		 * If there was a race while we were allocating the block
3404		 * and jseg the entry we care about was likely written.
3405		 * We bail out in both the WAIT and NOWAIT case and assume
3406		 * the caller will loop if the entry it cares about is
3407		 * not written.
3408		 */
3409		cnt = ump->softdep_on_journal;
3410		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3411			bp->b_flags |= B_INVAL | B_NOCACHE;
3412			WORKITEM_FREE(jseg, D_JSEG);
3413			FREE_LOCK(ump);
3414			brelse(bp);
3415			ACQUIRE_LOCK(ump);
3416			break;
3417		}
3418		/*
3419		 * Calculate the disk block size required for the available
3420		 * records rounded to the min size.
3421		 */
3422		if (cnt == 0)
3423			size = devbsize;
3424		else if (cnt < jrecmax)
3425			size = howmany(cnt, jrecmin) * devbsize;
3426		else
3427			size = fs->fs_bsize;
3428		/*
3429		 * Allocate a disk block for this journal data and account
3430		 * for truncation of the requested size if enough contiguous
3431		 * space was not available.
3432		 */
3433		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3434		bp->b_lblkno = bp->b_blkno;
3435		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3436		bp->b_bcount = size;
3437		bp->b_flags &= ~B_INVAL;
3438		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3439		/*
3440		 * Initialize our jseg with cnt records.  Assign the next
3441		 * sequence number to it and link it in-order.
3442		 */
3443		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3444		jseg->js_buf = bp;
3445		jseg->js_cnt = cnt;
3446		jseg->js_refs = cnt + 1;	/* Self ref. */
3447		jseg->js_size = size;
3448		jseg->js_seq = jblocks->jb_nextseq++;
3449		if (jblocks->jb_oldestseg == NULL)
3450			jblocks->jb_oldestseg = jseg;
3451		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3452		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3453		if (jblocks->jb_writeseg == NULL)
3454			jblocks->jb_writeseg = jseg;
3455		/*
3456		 * Start filling in records from the pending list.
3457		 */
3458		data = bp->b_data;
3459		off = 0;
3460
3461		/*
3462		 * Always put a header on the first block.
3463		 * XXX As with below, there might not be a chance to get
3464		 * into the loop.  Ensure that something valid is written.
3465		 */
3466		jseg_write(ump, jseg, data);
3467		off += JREC_SIZE;
3468		data = bp->b_data + off;
3469
3470		/*
3471		 * XXX Something is wrong here.  There's no work to do,
3472		 * but we need to perform and I/O and allow it to complete
3473		 * anyways.
3474		 */
3475		if (LIST_EMPTY(&ump->softdep_journal_pending))
3476			stat_emptyjblocks++;
3477
3478		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3479		    != NULL) {
3480			if (cnt == 0)
3481				break;
3482			/* Place a segment header on every device block. */
3483			if ((off % devbsize) == 0) {
3484				jseg_write(ump, jseg, data);
3485				off += JREC_SIZE;
3486				data = bp->b_data + off;
3487			}
3488			if (wk == needwk)
3489				needwk = NULL;
3490			remove_from_journal(wk);
3491			wk->wk_state |= INPROGRESS;
3492			WORKLIST_INSERT(&jseg->js_entries, wk);
3493			switch (wk->wk_type) {
3494			case D_JADDREF:
3495				jaddref_write(WK_JADDREF(wk), jseg, data);
3496				break;
3497			case D_JREMREF:
3498				jremref_write(WK_JREMREF(wk), jseg, data);
3499				break;
3500			case D_JMVREF:
3501				jmvref_write(WK_JMVREF(wk), jseg, data);
3502				break;
3503			case D_JNEWBLK:
3504				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3505				break;
3506			case D_JFREEBLK:
3507				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3508				break;
3509			case D_JFREEFRAG:
3510				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3511				break;
3512			case D_JTRUNC:
3513				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3514				break;
3515			case D_JFSYNC:
3516				jfsync_write(WK_JFSYNC(wk), jseg, data);
3517				break;
3518			default:
3519				panic("process_journal: Unknown type %s",
3520				    TYPENAME(wk->wk_type));
3521				/* NOTREACHED */
3522			}
3523			off += JREC_SIZE;
3524			data = bp->b_data + off;
3525			cnt--;
3526		}
3527
3528		/* Clear any remaining space so we don't leak kernel data */
3529		if (size > off)
3530			bzero(data, size - off);
3531
3532		/*
3533		 * Write this one buffer and continue.
3534		 */
3535		segwritten = 1;
3536		jblocks->jb_needseg = 0;
3537		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3538		FREE_LOCK(ump);
3539		pbgetvp(ump->um_devvp, bp);
3540		/*
3541		 * We only do the blocking wait once we find the journal
3542		 * entry we're looking for.
3543		 */
3544		if (needwk == NULL && flags == MNT_WAIT)
3545			bwrite(bp);
3546		else
3547			bawrite(bp);
3548		ACQUIRE_LOCK(ump);
3549	}
3550	/*
3551	 * If we wrote a segment issue a synchronize cache so the journal
3552	 * is reflected on disk before the data is written.  Since reclaiming
3553	 * journal space also requires writing a journal record this
3554	 * process also enforces a barrier before reclamation.
3555	 */
3556	if (segwritten && shouldflush) {
3557		softdep_synchronize(bio, ump,
3558		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3559	} else if (bio)
3560		g_destroy_bio(bio);
3561	/*
3562	 * If we've suspended the filesystem because we ran out of journal
3563	 * space either try to sync it here to make some progress or
3564	 * unsuspend it if we already have.
3565	 */
3566	if (flags == 0 && jblocks->jb_suspended) {
3567		if (journal_unsuspend(ump))
3568			return;
3569		FREE_LOCK(ump);
3570		VFS_SYNC(mp, MNT_NOWAIT);
3571		ffs_sbupdate(ump, MNT_WAIT, 0);
3572		ACQUIRE_LOCK(ump);
3573	}
3574}
3575
3576/*
3577 * Complete a jseg, allowing all dependencies awaiting journal writes
3578 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3579 * structures so that the journal segment can be freed to reclaim space.
3580 */
3581static void
3582complete_jseg(jseg)
3583	struct jseg *jseg;
3584{
3585	struct worklist *wk;
3586	struct jmvref *jmvref;
3587	int waiting;
3588#ifdef INVARIANTS
3589	int i = 0;
3590#endif
3591
3592	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3593		WORKLIST_REMOVE(wk);
3594		waiting = wk->wk_state & IOWAITING;
3595		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3596		wk->wk_state |= COMPLETE;
3597		KASSERT(i++ < jseg->js_cnt,
3598		    ("handle_written_jseg: overflow %d >= %d",
3599		    i - 1, jseg->js_cnt));
3600		switch (wk->wk_type) {
3601		case D_JADDREF:
3602			handle_written_jaddref(WK_JADDREF(wk));
3603			break;
3604		case D_JREMREF:
3605			handle_written_jremref(WK_JREMREF(wk));
3606			break;
3607		case D_JMVREF:
3608			rele_jseg(jseg);	/* No jsegdep. */
3609			jmvref = WK_JMVREF(wk);
3610			LIST_REMOVE(jmvref, jm_deps);
3611			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3612				free_pagedep(jmvref->jm_pagedep);
3613			WORKITEM_FREE(jmvref, D_JMVREF);
3614			break;
3615		case D_JNEWBLK:
3616			handle_written_jnewblk(WK_JNEWBLK(wk));
3617			break;
3618		case D_JFREEBLK:
3619			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3620			break;
3621		case D_JTRUNC:
3622			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3623			break;
3624		case D_JFSYNC:
3625			rele_jseg(jseg);	/* No jsegdep. */
3626			WORKITEM_FREE(wk, D_JFSYNC);
3627			break;
3628		case D_JFREEFRAG:
3629			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3630			break;
3631		default:
3632			panic("handle_written_jseg: Unknown type %s",
3633			    TYPENAME(wk->wk_type));
3634			/* NOTREACHED */
3635		}
3636		if (waiting)
3637			wakeup(wk);
3638	}
3639	/* Release the self reference so the structure may be freed. */
3640	rele_jseg(jseg);
3641}
3642
3643/*
3644 * Determine which jsegs are ready for completion processing.  Waits for
3645 * synchronize cache to complete as well as forcing in-order completion
3646 * of journal entries.
3647 */
3648static void
3649complete_jsegs(jseg)
3650	struct jseg *jseg;
3651{
3652	struct jblocks *jblocks;
3653	struct jseg *jsegn;
3654
3655	jblocks = jseg->js_jblocks;
3656	/*
3657	 * Don't allow out of order completions.  If this isn't the first
3658	 * block wait for it to write before we're done.
3659	 */
3660	if (jseg != jblocks->jb_writeseg)
3661		return;
3662	/* Iterate through available jsegs processing their entries. */
3663	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3664		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3665		jsegn = TAILQ_NEXT(jseg, js_next);
3666		complete_jseg(jseg);
3667		jseg = jsegn;
3668	}
3669	jblocks->jb_writeseg = jseg;
3670	/*
3671	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3672	 */
3673	free_jsegs(jblocks);
3674}
3675
3676/*
3677 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3678 * the final completions.
3679 */
3680static void
3681handle_written_jseg(jseg, bp)
3682	struct jseg *jseg;
3683	struct buf *bp;
3684{
3685
3686	if (jseg->js_refs == 0)
3687		panic("handle_written_jseg: No self-reference on %p", jseg);
3688	jseg->js_state |= DEPCOMPLETE;
3689	/*
3690	 * We'll never need this buffer again, set flags so it will be
3691	 * discarded.
3692	 */
3693	bp->b_flags |= B_INVAL | B_NOCACHE;
3694	pbrelvp(bp);
3695	complete_jsegs(jseg);
3696}
3697
3698static inline struct jsegdep *
3699inoref_jseg(inoref)
3700	struct inoref *inoref;
3701{
3702	struct jsegdep *jsegdep;
3703
3704	jsegdep = inoref->if_jsegdep;
3705	inoref->if_jsegdep = NULL;
3706
3707	return (jsegdep);
3708}
3709
3710/*
3711 * Called once a jremref has made it to stable store.  The jremref is marked
3712 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3713 * for the jremref to complete will be awoken by free_jremref.
3714 */
3715static void
3716handle_written_jremref(jremref)
3717	struct jremref *jremref;
3718{
3719	struct inodedep *inodedep;
3720	struct jsegdep *jsegdep;
3721	struct dirrem *dirrem;
3722
3723	/* Grab the jsegdep. */
3724	jsegdep = inoref_jseg(&jremref->jr_ref);
3725	/*
3726	 * Remove us from the inoref list.
3727	 */
3728	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3729	    0, &inodedep) == 0)
3730		panic("handle_written_jremref: Lost inodedep");
3731	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3732	/*
3733	 * Complete the dirrem.
3734	 */
3735	dirrem = jremref->jr_dirrem;
3736	jremref->jr_dirrem = NULL;
3737	LIST_REMOVE(jremref, jr_deps);
3738	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3739	jwork_insert(&dirrem->dm_jwork, jsegdep);
3740	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3741	    (dirrem->dm_state & COMPLETE) != 0)
3742		add_to_worklist(&dirrem->dm_list, 0);
3743	free_jremref(jremref);
3744}
3745
3746/*
3747 * Called once a jaddref has made it to stable store.  The dependency is
3748 * marked complete and any dependent structures are added to the inode
3749 * bufwait list to be completed as soon as it is written.  If a bitmap write
3750 * depends on this entry we move the inode into the inodedephd of the
3751 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3752 */
3753static void
3754handle_written_jaddref(jaddref)
3755	struct jaddref *jaddref;
3756{
3757	struct jsegdep *jsegdep;
3758	struct inodedep *inodedep;
3759	struct diradd *diradd;
3760	struct mkdir *mkdir;
3761
3762	/* Grab the jsegdep. */
3763	jsegdep = inoref_jseg(&jaddref->ja_ref);
3764	mkdir = NULL;
3765	diradd = NULL;
3766	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3767	    0, &inodedep) == 0)
3768		panic("handle_written_jaddref: Lost inodedep.");
3769	if (jaddref->ja_diradd == NULL)
3770		panic("handle_written_jaddref: No dependency");
3771	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3772		diradd = jaddref->ja_diradd;
3773		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3774	} else if (jaddref->ja_state & MKDIR_PARENT) {
3775		mkdir = jaddref->ja_mkdir;
3776		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3777	} else if (jaddref->ja_state & MKDIR_BODY)
3778		mkdir = jaddref->ja_mkdir;
3779	else
3780		panic("handle_written_jaddref: Unknown dependency %p",
3781		    jaddref->ja_diradd);
3782	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3783	/*
3784	 * Remove us from the inode list.
3785	 */
3786	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3787	/*
3788	 * The mkdir may be waiting on the jaddref to clear before freeing.
3789	 */
3790	if (mkdir) {
3791		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3792		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3793		    TYPENAME(mkdir->md_list.wk_type)));
3794		mkdir->md_jaddref = NULL;
3795		diradd = mkdir->md_diradd;
3796		mkdir->md_state |= DEPCOMPLETE;
3797		complete_mkdir(mkdir);
3798	}
3799	jwork_insert(&diradd->da_jwork, jsegdep);
3800	if (jaddref->ja_state & NEWBLOCK) {
3801		inodedep->id_state |= ONDEPLIST;
3802		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3803		    inodedep, id_deps);
3804	}
3805	free_jaddref(jaddref);
3806}
3807
3808/*
3809 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3810 * is placed in the bmsafemap to await notification of a written bitmap.  If
3811 * the operation was canceled we add the segdep to the appropriate
3812 * dependency to free the journal space once the canceling operation
3813 * completes.
3814 */
3815static void
3816handle_written_jnewblk(jnewblk)
3817	struct jnewblk *jnewblk;
3818{
3819	struct bmsafemap *bmsafemap;
3820	struct freefrag *freefrag;
3821	struct freework *freework;
3822	struct jsegdep *jsegdep;
3823	struct newblk *newblk;
3824
3825	/* Grab the jsegdep. */
3826	jsegdep = jnewblk->jn_jsegdep;
3827	jnewblk->jn_jsegdep = NULL;
3828	if (jnewblk->jn_dep == NULL)
3829		panic("handle_written_jnewblk: No dependency for the segdep.");
3830	switch (jnewblk->jn_dep->wk_type) {
3831	case D_NEWBLK:
3832	case D_ALLOCDIRECT:
3833	case D_ALLOCINDIR:
3834		/*
3835		 * Add the written block to the bmsafemap so it can
3836		 * be notified when the bitmap is on disk.
3837		 */
3838		newblk = WK_NEWBLK(jnewblk->jn_dep);
3839		newblk->nb_jnewblk = NULL;
3840		if ((newblk->nb_state & GOINGAWAY) == 0) {
3841			bmsafemap = newblk->nb_bmsafemap;
3842			newblk->nb_state |= ONDEPLIST;
3843			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3844			    nb_deps);
3845		}
3846		jwork_insert(&newblk->nb_jwork, jsegdep);
3847		break;
3848	case D_FREEFRAG:
3849		/*
3850		 * A newblock being removed by a freefrag when replaced by
3851		 * frag extension.
3852		 */
3853		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3854		freefrag->ff_jdep = NULL;
3855		jwork_insert(&freefrag->ff_jwork, jsegdep);
3856		break;
3857	case D_FREEWORK:
3858		/*
3859		 * A direct block was removed by truncate.
3860		 */
3861		freework = WK_FREEWORK(jnewblk->jn_dep);
3862		freework->fw_jnewblk = NULL;
3863		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3864		break;
3865	default:
3866		panic("handle_written_jnewblk: Unknown type %d.",
3867		    jnewblk->jn_dep->wk_type);
3868	}
3869	jnewblk->jn_dep = NULL;
3870	free_jnewblk(jnewblk);
3871}
3872
3873/*
3874 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3875 * an in-flight allocation that has not yet been committed.  Divorce us
3876 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3877 * to the worklist.
3878 */
3879static void
3880cancel_jfreefrag(jfreefrag)
3881	struct jfreefrag *jfreefrag;
3882{
3883	struct freefrag *freefrag;
3884
3885	if (jfreefrag->fr_jsegdep) {
3886		free_jsegdep(jfreefrag->fr_jsegdep);
3887		jfreefrag->fr_jsegdep = NULL;
3888	}
3889	freefrag = jfreefrag->fr_freefrag;
3890	jfreefrag->fr_freefrag = NULL;
3891	free_jfreefrag(jfreefrag);
3892	freefrag->ff_state |= DEPCOMPLETE;
3893	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3894}
3895
3896/*
3897 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3898 */
3899static void
3900free_jfreefrag(jfreefrag)
3901	struct jfreefrag *jfreefrag;
3902{
3903
3904	if (jfreefrag->fr_state & INPROGRESS)
3905		WORKLIST_REMOVE(&jfreefrag->fr_list);
3906	else if (jfreefrag->fr_state & ONWORKLIST)
3907		remove_from_journal(&jfreefrag->fr_list);
3908	if (jfreefrag->fr_freefrag != NULL)
3909		panic("free_jfreefrag:  Still attached to a freefrag.");
3910	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3911}
3912
3913/*
3914 * Called when the journal write for a jfreefrag completes.  The parent
3915 * freefrag is added to the worklist if this completes its dependencies.
3916 */
3917static void
3918handle_written_jfreefrag(jfreefrag)
3919	struct jfreefrag *jfreefrag;
3920{
3921	struct jsegdep *jsegdep;
3922	struct freefrag *freefrag;
3923
3924	/* Grab the jsegdep. */
3925	jsegdep = jfreefrag->fr_jsegdep;
3926	jfreefrag->fr_jsegdep = NULL;
3927	freefrag = jfreefrag->fr_freefrag;
3928	if (freefrag == NULL)
3929		panic("handle_written_jfreefrag: No freefrag.");
3930	freefrag->ff_state |= DEPCOMPLETE;
3931	freefrag->ff_jdep = NULL;
3932	jwork_insert(&freefrag->ff_jwork, jsegdep);
3933	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3934		add_to_worklist(&freefrag->ff_list, 0);
3935	jfreefrag->fr_freefrag = NULL;
3936	free_jfreefrag(jfreefrag);
3937}
3938
3939/*
3940 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3941 * is removed from the freeblks list of pending journal writes and the
3942 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3943 * have been reclaimed.
3944 */
3945static void
3946handle_written_jblkdep(jblkdep)
3947	struct jblkdep *jblkdep;
3948{
3949	struct freeblks *freeblks;
3950	struct jsegdep *jsegdep;
3951
3952	/* Grab the jsegdep. */
3953	jsegdep = jblkdep->jb_jsegdep;
3954	jblkdep->jb_jsegdep = NULL;
3955	freeblks = jblkdep->jb_freeblks;
3956	LIST_REMOVE(jblkdep, jb_deps);
3957	jwork_insert(&freeblks->fb_jwork, jsegdep);
3958	/*
3959	 * If the freeblks is all journaled, we can add it to the worklist.
3960	 */
3961	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3962	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3963		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3964
3965	free_jblkdep(jblkdep);
3966}
3967
3968static struct jsegdep *
3969newjsegdep(struct worklist *wk)
3970{
3971	struct jsegdep *jsegdep;
3972
3973	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3974	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3975	jsegdep->jd_seg = NULL;
3976
3977	return (jsegdep);
3978}
3979
3980static struct jmvref *
3981newjmvref(dp, ino, oldoff, newoff)
3982	struct inode *dp;
3983	ino_t ino;
3984	off_t oldoff;
3985	off_t newoff;
3986{
3987	struct jmvref *jmvref;
3988
3989	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3990	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3991	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3992	jmvref->jm_parent = dp->i_number;
3993	jmvref->jm_ino = ino;
3994	jmvref->jm_oldoff = oldoff;
3995	jmvref->jm_newoff = newoff;
3996
3997	return (jmvref);
3998}
3999
4000/*
4001 * Allocate a new jremref that tracks the removal of ip from dp with the
4002 * directory entry offset of diroff.  Mark the entry as ATTACHED and
4003 * DEPCOMPLETE as we have all the information required for the journal write
4004 * and the directory has already been removed from the buffer.  The caller
4005 * is responsible for linking the jremref into the pagedep and adding it
4006 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4007 * a DOTDOT addition so handle_workitem_remove() can properly assign
4008 * the jsegdep when we're done.
4009 */
4010static struct jremref *
4011newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4012    off_t diroff, nlink_t nlink)
4013{
4014	struct jremref *jremref;
4015
4016	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4017	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
4018	jremref->jr_state = ATTACHED;
4019	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4020	   nlink, ip->i_mode);
4021	jremref->jr_dirrem = dirrem;
4022
4023	return (jremref);
4024}
4025
4026static inline void
4027newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4028    nlink_t nlink, uint16_t mode)
4029{
4030
4031	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4032	inoref->if_diroff = diroff;
4033	inoref->if_ino = ino;
4034	inoref->if_parent = parent;
4035	inoref->if_nlink = nlink;
4036	inoref->if_mode = mode;
4037}
4038
4039/*
4040 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4041 * directory offset may not be known until later.  The caller is responsible
4042 * adding the entry to the journal when this information is available.  nlink
4043 * should be the link count prior to the addition and mode is only required
4044 * to have the correct FMT.
4045 */
4046static struct jaddref *
4047newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4048    uint16_t mode)
4049{
4050	struct jaddref *jaddref;
4051
4052	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4053	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
4054	jaddref->ja_state = ATTACHED;
4055	jaddref->ja_mkdir = NULL;
4056	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4057
4058	return (jaddref);
4059}
4060
4061/*
4062 * Create a new free dependency for a freework.  The caller is responsible
4063 * for adjusting the reference count when it has the lock held.  The freedep
4064 * will track an outstanding bitmap write that will ultimately clear the
4065 * freework to continue.
4066 */
4067static struct freedep *
4068newfreedep(struct freework *freework)
4069{
4070	struct freedep *freedep;
4071
4072	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4073	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4074	freedep->fd_freework = freework;
4075
4076	return (freedep);
4077}
4078
4079/*
4080 * Free a freedep structure once the buffer it is linked to is written.  If
4081 * this is the last reference to the freework schedule it for completion.
4082 */
4083static void
4084free_freedep(freedep)
4085	struct freedep *freedep;
4086{
4087	struct freework *freework;
4088
4089	freework = freedep->fd_freework;
4090	freework->fw_freeblks->fb_cgwait--;
4091	if (--freework->fw_ref == 0)
4092		freework_enqueue(freework);
4093	WORKITEM_FREE(freedep, D_FREEDEP);
4094}
4095
4096/*
4097 * Allocate a new freework structure that may be a level in an indirect
4098 * when parent is not NULL or a top level block when it is.  The top level
4099 * freework structures are allocated without the per-filesystem lock held
4100 * and before the freeblks is visible outside of softdep_setup_freeblocks().
4101 */
4102static struct freework *
4103newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4104	struct ufsmount *ump;
4105	struct freeblks *freeblks;
4106	struct freework *parent;
4107	ufs_lbn_t lbn;
4108	ufs2_daddr_t nb;
4109	int frags;
4110	int off;
4111	int journal;
4112{
4113	struct freework *freework;
4114
4115	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4116	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4117	freework->fw_state = ATTACHED;
4118	freework->fw_jnewblk = NULL;
4119	freework->fw_freeblks = freeblks;
4120	freework->fw_parent = parent;
4121	freework->fw_lbn = lbn;
4122	freework->fw_blkno = nb;
4123	freework->fw_frags = frags;
4124	freework->fw_indir = NULL;
4125	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
4126		? 0 : NINDIR(ump->um_fs) + 1;
4127	freework->fw_start = freework->fw_off = off;
4128	if (journal)
4129		newjfreeblk(freeblks, lbn, nb, frags);
4130	if (parent == NULL) {
4131		ACQUIRE_LOCK(ump);
4132		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4133		freeblks->fb_ref++;
4134		FREE_LOCK(ump);
4135	}
4136
4137	return (freework);
4138}
4139
4140/*
4141 * Eliminate a jfreeblk for a block that does not need journaling.
4142 */
4143static void
4144cancel_jfreeblk(freeblks, blkno)
4145	struct freeblks *freeblks;
4146	ufs2_daddr_t blkno;
4147{
4148	struct jfreeblk *jfreeblk;
4149	struct jblkdep *jblkdep;
4150
4151	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4152		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4153			continue;
4154		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4155		if (jfreeblk->jf_blkno == blkno)
4156			break;
4157	}
4158	if (jblkdep == NULL)
4159		return;
4160	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4161	free_jsegdep(jblkdep->jb_jsegdep);
4162	LIST_REMOVE(jblkdep, jb_deps);
4163	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4164}
4165
4166/*
4167 * Allocate a new jfreeblk to journal top level block pointer when truncating
4168 * a file.  The caller must add this to the worklist when the per-filesystem
4169 * lock is held.
4170 */
4171static struct jfreeblk *
4172newjfreeblk(freeblks, lbn, blkno, frags)
4173	struct freeblks *freeblks;
4174	ufs_lbn_t lbn;
4175	ufs2_daddr_t blkno;
4176	int frags;
4177{
4178	struct jfreeblk *jfreeblk;
4179
4180	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4181	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4182	    freeblks->fb_list.wk_mp);
4183	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4184	jfreeblk->jf_dep.jb_freeblks = freeblks;
4185	jfreeblk->jf_ino = freeblks->fb_inum;
4186	jfreeblk->jf_lbn = lbn;
4187	jfreeblk->jf_blkno = blkno;
4188	jfreeblk->jf_frags = frags;
4189	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4190
4191	return (jfreeblk);
4192}
4193
4194/*
4195 * The journal is only prepared to handle full-size block numbers, so we
4196 * have to adjust the record to reflect the change to a full-size block.
4197 * For example, suppose we have a block made up of fragments 8-15 and
4198 * want to free its last two fragments. We are given a request that says:
4199 *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4200 * where frags are the number of fragments to free and oldfrags are the
4201 * number of fragments to keep. To block align it, we have to change it to
4202 * have a valid full-size blkno, so it becomes:
4203 *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4204 */
4205static void
4206adjust_newfreework(freeblks, frag_offset)
4207	struct freeblks *freeblks;
4208	int frag_offset;
4209{
4210	struct jfreeblk *jfreeblk;
4211
4212	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4213	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4214	    ("adjust_newfreework: Missing freeblks dependency"));
4215
4216	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4217	jfreeblk->jf_blkno -= frag_offset;
4218	jfreeblk->jf_frags += frag_offset;
4219}
4220
4221/*
4222 * Allocate a new jtrunc to track a partial truncation.
4223 */
4224static struct jtrunc *
4225newjtrunc(freeblks, size, extsize)
4226	struct freeblks *freeblks;
4227	off_t size;
4228	int extsize;
4229{
4230	struct jtrunc *jtrunc;
4231
4232	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4233	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4234	    freeblks->fb_list.wk_mp);
4235	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4236	jtrunc->jt_dep.jb_freeblks = freeblks;
4237	jtrunc->jt_ino = freeblks->fb_inum;
4238	jtrunc->jt_size = size;
4239	jtrunc->jt_extsize = extsize;
4240	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4241
4242	return (jtrunc);
4243}
4244
4245/*
4246 * If we're canceling a new bitmap we have to search for another ref
4247 * to move into the bmsafemap dep.  This might be better expressed
4248 * with another structure.
4249 */
4250static void
4251move_newblock_dep(jaddref, inodedep)
4252	struct jaddref *jaddref;
4253	struct inodedep *inodedep;
4254{
4255	struct inoref *inoref;
4256	struct jaddref *jaddrefn;
4257
4258	jaddrefn = NULL;
4259	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4260	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4261		if ((jaddref->ja_state & NEWBLOCK) &&
4262		    inoref->if_list.wk_type == D_JADDREF) {
4263			jaddrefn = (struct jaddref *)inoref;
4264			break;
4265		}
4266	}
4267	if (jaddrefn == NULL)
4268		return;
4269	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4270	jaddrefn->ja_state |= jaddref->ja_state &
4271	    (ATTACHED | UNDONE | NEWBLOCK);
4272	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4273	jaddref->ja_state |= ATTACHED;
4274	LIST_REMOVE(jaddref, ja_bmdeps);
4275	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4276	    ja_bmdeps);
4277}
4278
4279/*
4280 * Cancel a jaddref either before it has been written or while it is being
4281 * written.  This happens when a link is removed before the add reaches
4282 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4283 * and inode to prevent the link count or bitmap from reaching the disk
4284 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4285 * required.
4286 *
4287 * Returns 1 if the canceled addref requires journaling of the remove and
4288 * 0 otherwise.
4289 */
4290static int
4291cancel_jaddref(jaddref, inodedep, wkhd)
4292	struct jaddref *jaddref;
4293	struct inodedep *inodedep;
4294	struct workhead *wkhd;
4295{
4296	struct inoref *inoref;
4297	struct jsegdep *jsegdep;
4298	int needsj;
4299
4300	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4301	    ("cancel_jaddref: Canceling complete jaddref"));
4302	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4303		needsj = 1;
4304	else
4305		needsj = 0;
4306	if (inodedep == NULL)
4307		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4308		    0, &inodedep) == 0)
4309			panic("cancel_jaddref: Lost inodedep");
4310	/*
4311	 * We must adjust the nlink of any reference operation that follows
4312	 * us so that it is consistent with the in-memory reference.  This
4313	 * ensures that inode nlink rollbacks always have the correct link.
4314	 */
4315	if (needsj == 0) {
4316		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4317		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4318			if (inoref->if_state & GOINGAWAY)
4319				break;
4320			inoref->if_nlink--;
4321		}
4322	}
4323	jsegdep = inoref_jseg(&jaddref->ja_ref);
4324	if (jaddref->ja_state & NEWBLOCK)
4325		move_newblock_dep(jaddref, inodedep);
4326	wake_worklist(&jaddref->ja_list);
4327	jaddref->ja_mkdir = NULL;
4328	if (jaddref->ja_state & INPROGRESS) {
4329		jaddref->ja_state &= ~INPROGRESS;
4330		WORKLIST_REMOVE(&jaddref->ja_list);
4331		jwork_insert(wkhd, jsegdep);
4332	} else {
4333		free_jsegdep(jsegdep);
4334		if (jaddref->ja_state & DEPCOMPLETE)
4335			remove_from_journal(&jaddref->ja_list);
4336	}
4337	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4338	/*
4339	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4340	 * can arrange for them to be freed with the bitmap.  Otherwise we
4341	 * no longer need this addref attached to the inoreflst and it
4342	 * will incorrectly adjust nlink if we leave it.
4343	 */
4344	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4345		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4346		    if_deps);
4347		jaddref->ja_state |= COMPLETE;
4348		free_jaddref(jaddref);
4349		return (needsj);
4350	}
4351	/*
4352	 * Leave the head of the list for jsegdeps for fast merging.
4353	 */
4354	if (LIST_FIRST(wkhd) != NULL) {
4355		jaddref->ja_state |= ONWORKLIST;
4356		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4357	} else
4358		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4359
4360	return (needsj);
4361}
4362
4363/*
4364 * Attempt to free a jaddref structure when some work completes.  This
4365 * should only succeed once the entry is written and all dependencies have
4366 * been notified.
4367 */
4368static void
4369free_jaddref(jaddref)
4370	struct jaddref *jaddref;
4371{
4372
4373	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4374		return;
4375	if (jaddref->ja_ref.if_jsegdep)
4376		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4377		    jaddref, jaddref->ja_state);
4378	if (jaddref->ja_state & NEWBLOCK)
4379		LIST_REMOVE(jaddref, ja_bmdeps);
4380	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4381		panic("free_jaddref: Bad state %p(0x%X)",
4382		    jaddref, jaddref->ja_state);
4383	if (jaddref->ja_mkdir != NULL)
4384		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4385	WORKITEM_FREE(jaddref, D_JADDREF);
4386}
4387
4388/*
4389 * Free a jremref structure once it has been written or discarded.
4390 */
4391static void
4392free_jremref(jremref)
4393	struct jremref *jremref;
4394{
4395
4396	if (jremref->jr_ref.if_jsegdep)
4397		free_jsegdep(jremref->jr_ref.if_jsegdep);
4398	if (jremref->jr_state & INPROGRESS)
4399		panic("free_jremref: IO still pending");
4400	WORKITEM_FREE(jremref, D_JREMREF);
4401}
4402
4403/*
4404 * Free a jnewblk structure.
4405 */
4406static void
4407free_jnewblk(jnewblk)
4408	struct jnewblk *jnewblk;
4409{
4410
4411	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4412		return;
4413	LIST_REMOVE(jnewblk, jn_deps);
4414	if (jnewblk->jn_dep != NULL)
4415		panic("free_jnewblk: Dependency still attached.");
4416	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4417}
4418
4419/*
4420 * Cancel a jnewblk which has been been made redundant by frag extension.
4421 */
4422static void
4423cancel_jnewblk(jnewblk, wkhd)
4424	struct jnewblk *jnewblk;
4425	struct workhead *wkhd;
4426{
4427	struct jsegdep *jsegdep;
4428
4429	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4430	jsegdep = jnewblk->jn_jsegdep;
4431	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4432		panic("cancel_jnewblk: Invalid state");
4433	jnewblk->jn_jsegdep  = NULL;
4434	jnewblk->jn_dep = NULL;
4435	jnewblk->jn_state |= GOINGAWAY;
4436	if (jnewblk->jn_state & INPROGRESS) {
4437		jnewblk->jn_state &= ~INPROGRESS;
4438		WORKLIST_REMOVE(&jnewblk->jn_list);
4439		jwork_insert(wkhd, jsegdep);
4440	} else {
4441		free_jsegdep(jsegdep);
4442		remove_from_journal(&jnewblk->jn_list);
4443	}
4444	wake_worklist(&jnewblk->jn_list);
4445	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4446}
4447
4448static void
4449free_jblkdep(jblkdep)
4450	struct jblkdep *jblkdep;
4451{
4452
4453	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4454		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4455	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4456		WORKITEM_FREE(jblkdep, D_JTRUNC);
4457	else
4458		panic("free_jblkdep: Unexpected type %s",
4459		    TYPENAME(jblkdep->jb_list.wk_type));
4460}
4461
4462/*
4463 * Free a single jseg once it is no longer referenced in memory or on
4464 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4465 * to disappear.
4466 */
4467static void
4468free_jseg(jseg, jblocks)
4469	struct jseg *jseg;
4470	struct jblocks *jblocks;
4471{
4472	struct freework *freework;
4473
4474	/*
4475	 * Free freework structures that were lingering to indicate freed
4476	 * indirect blocks that forced journal write ordering on reallocate.
4477	 */
4478	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4479		indirblk_remove(freework);
4480	if (jblocks->jb_oldestseg == jseg)
4481		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4482	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4483	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4484	KASSERT(LIST_EMPTY(&jseg->js_entries),
4485	    ("free_jseg: Freed jseg has valid entries."));
4486	WORKITEM_FREE(jseg, D_JSEG);
4487}
4488
4489/*
4490 * Free all jsegs that meet the criteria for being reclaimed and update
4491 * oldestseg.
4492 */
4493static void
4494free_jsegs(jblocks)
4495	struct jblocks *jblocks;
4496{
4497	struct jseg *jseg;
4498
4499	/*
4500	 * Free only those jsegs which have none allocated before them to
4501	 * preserve the journal space ordering.
4502	 */
4503	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4504		/*
4505		 * Only reclaim space when nothing depends on this journal
4506		 * set and another set has written that it is no longer
4507		 * valid.
4508		 */
4509		if (jseg->js_refs != 0) {
4510			jblocks->jb_oldestseg = jseg;
4511			return;
4512		}
4513		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4514			break;
4515		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4516			break;
4517		/*
4518		 * We can free jsegs that didn't write entries when
4519		 * oldestwrseq == js_seq.
4520		 */
4521		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4522		    jseg->js_cnt != 0)
4523			break;
4524		free_jseg(jseg, jblocks);
4525	}
4526	/*
4527	 * If we exited the loop above we still must discover the
4528	 * oldest valid segment.
4529	 */
4530	if (jseg)
4531		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4532		     jseg = TAILQ_NEXT(jseg, js_next))
4533			if (jseg->js_refs != 0)
4534				break;
4535	jblocks->jb_oldestseg = jseg;
4536	/*
4537	 * The journal has no valid records but some jsegs may still be
4538	 * waiting on oldestwrseq to advance.  We force a small record
4539	 * out to permit these lingering records to be reclaimed.
4540	 */
4541	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4542		jblocks->jb_needseg = 1;
4543}
4544
4545/*
4546 * Release one reference to a jseg and free it if the count reaches 0.  This
4547 * should eventually reclaim journal space as well.
4548 */
4549static void
4550rele_jseg(jseg)
4551	struct jseg *jseg;
4552{
4553
4554	KASSERT(jseg->js_refs > 0,
4555	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4556	if (--jseg->js_refs != 0)
4557		return;
4558	free_jsegs(jseg->js_jblocks);
4559}
4560
4561/*
4562 * Release a jsegdep and decrement the jseg count.
4563 */
4564static void
4565free_jsegdep(jsegdep)
4566	struct jsegdep *jsegdep;
4567{
4568
4569	if (jsegdep->jd_seg)
4570		rele_jseg(jsegdep->jd_seg);
4571	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4572}
4573
4574/*
4575 * Wait for a journal item to make it to disk.  Initiate journal processing
4576 * if required.
4577 */
4578static int
4579jwait(wk, waitfor)
4580	struct worklist *wk;
4581	int waitfor;
4582{
4583
4584	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4585	/*
4586	 * Blocking journal waits cause slow synchronous behavior.  Record
4587	 * stats on the frequency of these blocking operations.
4588	 */
4589	if (waitfor == MNT_WAIT) {
4590		stat_journal_wait++;
4591		switch (wk->wk_type) {
4592		case D_JREMREF:
4593		case D_JMVREF:
4594			stat_jwait_filepage++;
4595			break;
4596		case D_JTRUNC:
4597		case D_JFREEBLK:
4598			stat_jwait_freeblks++;
4599			break;
4600		case D_JNEWBLK:
4601			stat_jwait_newblk++;
4602			break;
4603		case D_JADDREF:
4604			stat_jwait_inode++;
4605			break;
4606		default:
4607			break;
4608		}
4609	}
4610	/*
4611	 * If IO has not started we process the journal.  We can't mark the
4612	 * worklist item as IOWAITING because we drop the lock while
4613	 * processing the journal and the worklist entry may be freed after
4614	 * this point.  The caller may call back in and re-issue the request.
4615	 */
4616	if ((wk->wk_state & INPROGRESS) == 0) {
4617		softdep_process_journal(wk->wk_mp, wk, waitfor);
4618		if (waitfor != MNT_WAIT)
4619			return (EBUSY);
4620		return (0);
4621	}
4622	if (waitfor != MNT_WAIT)
4623		return (EBUSY);
4624	wait_worklist(wk, "jwait");
4625	return (0);
4626}
4627
4628/*
4629 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4630 * appropriate.  This is a convenience function to reduce duplicate code
4631 * for the setup and revert functions below.
4632 */
4633static struct inodedep *
4634inodedep_lookup_ip(ip)
4635	struct inode *ip;
4636{
4637	struct inodedep *inodedep;
4638	int dflags;
4639
4640	KASSERT(ip->i_nlink >= ip->i_effnlink,
4641	    ("inodedep_lookup_ip: bad delta"));
4642	dflags = DEPALLOC;
4643	if (IS_SNAPSHOT(ip))
4644		dflags |= NODELAY;
4645	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4646	    &inodedep);
4647	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4648	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4649
4650	return (inodedep);
4651}
4652
4653/*
4654 * Called prior to creating a new inode and linking it to a directory.  The
4655 * jaddref structure must already be allocated by softdep_setup_inomapdep
4656 * and it is discovered here so we can initialize the mode and update
4657 * nlinkdelta.
4658 */
4659void
4660softdep_setup_create(dp, ip)
4661	struct inode *dp;
4662	struct inode *ip;
4663{
4664	struct inodedep *inodedep;
4665	struct jaddref *jaddref;
4666	struct vnode *dvp;
4667
4668	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4669	    ("softdep_setup_create called on non-softdep filesystem"));
4670	KASSERT(ip->i_nlink == 1,
4671	    ("softdep_setup_create: Invalid link count."));
4672	dvp = ITOV(dp);
4673	ACQUIRE_LOCK(dp->i_ump);
4674	inodedep = inodedep_lookup_ip(ip);
4675	if (DOINGSUJ(dvp)) {
4676		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4677		    inoreflst);
4678		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4679		    ("softdep_setup_create: No addref structure present."));
4680	}
4681	softdep_prelink(dvp, NULL);
4682	FREE_LOCK(dp->i_ump);
4683}
4684
4685/*
4686 * Create a jaddref structure to track the addition of a DOTDOT link when
4687 * we are reparenting an inode as part of a rename.  This jaddref will be
4688 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4689 * non-journaling softdep.
4690 */
4691void
4692softdep_setup_dotdot_link(dp, ip)
4693	struct inode *dp;
4694	struct inode *ip;
4695{
4696	struct inodedep *inodedep;
4697	struct jaddref *jaddref;
4698	struct vnode *dvp;
4699	struct vnode *vp;
4700
4701	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4702	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4703	dvp = ITOV(dp);
4704	vp = ITOV(ip);
4705	jaddref = NULL;
4706	/*
4707	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4708	 * is used as a normal link would be.
4709	 */
4710	if (DOINGSUJ(dvp))
4711		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4712		    dp->i_effnlink - 1, dp->i_mode);
4713	ACQUIRE_LOCK(dp->i_ump);
4714	inodedep = inodedep_lookup_ip(dp);
4715	if (jaddref)
4716		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4717		    if_deps);
4718	softdep_prelink(dvp, ITOV(ip));
4719	FREE_LOCK(dp->i_ump);
4720}
4721
4722/*
4723 * Create a jaddref structure to track a new link to an inode.  The directory
4724 * offset is not known until softdep_setup_directory_add or
4725 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4726 * softdep.
4727 */
4728void
4729softdep_setup_link(dp, ip)
4730	struct inode *dp;
4731	struct inode *ip;
4732{
4733	struct inodedep *inodedep;
4734	struct jaddref *jaddref;
4735	struct vnode *dvp;
4736
4737	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4738	    ("softdep_setup_link called on non-softdep filesystem"));
4739	dvp = ITOV(dp);
4740	jaddref = NULL;
4741	if (DOINGSUJ(dvp))
4742		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4743		    ip->i_mode);
4744	ACQUIRE_LOCK(dp->i_ump);
4745	inodedep = inodedep_lookup_ip(ip);
4746	if (jaddref)
4747		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4748		    if_deps);
4749	softdep_prelink(dvp, ITOV(ip));
4750	FREE_LOCK(dp->i_ump);
4751}
4752
4753/*
4754 * Called to create the jaddref structures to track . and .. references as
4755 * well as lookup and further initialize the incomplete jaddref created
4756 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4757 * nlinkdelta for non-journaling softdep.
4758 */
4759void
4760softdep_setup_mkdir(dp, ip)
4761	struct inode *dp;
4762	struct inode *ip;
4763{
4764	struct inodedep *inodedep;
4765	struct jaddref *dotdotaddref;
4766	struct jaddref *dotaddref;
4767	struct jaddref *jaddref;
4768	struct vnode *dvp;
4769
4770	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4771	    ("softdep_setup_mkdir called on non-softdep filesystem"));
4772	dvp = ITOV(dp);
4773	dotaddref = dotdotaddref = NULL;
4774	if (DOINGSUJ(dvp)) {
4775		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4776		    ip->i_mode);
4777		dotaddref->ja_state |= MKDIR_BODY;
4778		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4779		    dp->i_effnlink - 1, dp->i_mode);
4780		dotdotaddref->ja_state |= MKDIR_PARENT;
4781	}
4782	ACQUIRE_LOCK(dp->i_ump);
4783	inodedep = inodedep_lookup_ip(ip);
4784	if (DOINGSUJ(dvp)) {
4785		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4786		    inoreflst);
4787		KASSERT(jaddref != NULL,
4788		    ("softdep_setup_mkdir: No addref structure present."));
4789		KASSERT(jaddref->ja_parent == dp->i_number,
4790		    ("softdep_setup_mkdir: bad parent %ju",
4791		    (uintmax_t)jaddref->ja_parent));
4792		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4793		    if_deps);
4794	}
4795	inodedep = inodedep_lookup_ip(dp);
4796	if (DOINGSUJ(dvp))
4797		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4798		    &dotdotaddref->ja_ref, if_deps);
4799	softdep_prelink(ITOV(dp), NULL);
4800	FREE_LOCK(dp->i_ump);
4801}
4802
4803/*
4804 * Called to track nlinkdelta of the inode and parent directories prior to
4805 * unlinking a directory.
4806 */
4807void
4808softdep_setup_rmdir(dp, ip)
4809	struct inode *dp;
4810	struct inode *ip;
4811{
4812	struct vnode *dvp;
4813
4814	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4815	    ("softdep_setup_rmdir called on non-softdep filesystem"));
4816	dvp = ITOV(dp);
4817	ACQUIRE_LOCK(dp->i_ump);
4818	(void) inodedep_lookup_ip(ip);
4819	(void) inodedep_lookup_ip(dp);
4820	softdep_prelink(dvp, ITOV(ip));
4821	FREE_LOCK(dp->i_ump);
4822}
4823
4824/*
4825 * Called to track nlinkdelta of the inode and parent directories prior to
4826 * unlink.
4827 */
4828void
4829softdep_setup_unlink(dp, ip)
4830	struct inode *dp;
4831	struct inode *ip;
4832{
4833	struct vnode *dvp;
4834
4835	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4836	    ("softdep_setup_unlink called on non-softdep filesystem"));
4837	dvp = ITOV(dp);
4838	ACQUIRE_LOCK(dp->i_ump);
4839	(void) inodedep_lookup_ip(ip);
4840	(void) inodedep_lookup_ip(dp);
4841	softdep_prelink(dvp, ITOV(ip));
4842	FREE_LOCK(dp->i_ump);
4843}
4844
4845/*
4846 * Called to release the journal structures created by a failed non-directory
4847 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4848 */
4849void
4850softdep_revert_create(dp, ip)
4851	struct inode *dp;
4852	struct inode *ip;
4853{
4854	struct inodedep *inodedep;
4855	struct jaddref *jaddref;
4856	struct vnode *dvp;
4857
4858	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4859	    ("softdep_revert_create called on non-softdep filesystem"));
4860	dvp = ITOV(dp);
4861	ACQUIRE_LOCK(dp->i_ump);
4862	inodedep = inodedep_lookup_ip(ip);
4863	if (DOINGSUJ(dvp)) {
4864		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4865		    inoreflst);
4866		KASSERT(jaddref->ja_parent == dp->i_number,
4867		    ("softdep_revert_create: addref parent mismatch"));
4868		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4869	}
4870	FREE_LOCK(dp->i_ump);
4871}
4872
4873/*
4874 * Called to release the journal structures created by a failed link
4875 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4876 */
4877void
4878softdep_revert_link(dp, ip)
4879	struct inode *dp;
4880	struct inode *ip;
4881{
4882	struct inodedep *inodedep;
4883	struct jaddref *jaddref;
4884	struct vnode *dvp;
4885
4886	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4887	    ("softdep_revert_link called on non-softdep filesystem"));
4888	dvp = ITOV(dp);
4889	ACQUIRE_LOCK(dp->i_ump);
4890	inodedep = inodedep_lookup_ip(ip);
4891	if (DOINGSUJ(dvp)) {
4892		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4893		    inoreflst);
4894		KASSERT(jaddref->ja_parent == dp->i_number,
4895		    ("softdep_revert_link: addref parent mismatch"));
4896		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4897	}
4898	FREE_LOCK(dp->i_ump);
4899}
4900
4901/*
4902 * Called to release the journal structures created by a failed mkdir
4903 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4904 */
4905void
4906softdep_revert_mkdir(dp, ip)
4907	struct inode *dp;
4908	struct inode *ip;
4909{
4910	struct inodedep *inodedep;
4911	struct jaddref *jaddref;
4912	struct jaddref *dotaddref;
4913	struct vnode *dvp;
4914
4915	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4916	    ("softdep_revert_mkdir called on non-softdep filesystem"));
4917	dvp = ITOV(dp);
4918
4919	ACQUIRE_LOCK(dp->i_ump);
4920	inodedep = inodedep_lookup_ip(dp);
4921	if (DOINGSUJ(dvp)) {
4922		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4923		    inoreflst);
4924		KASSERT(jaddref->ja_parent == ip->i_number,
4925		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4926		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4927	}
4928	inodedep = inodedep_lookup_ip(ip);
4929	if (DOINGSUJ(dvp)) {
4930		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4931		    inoreflst);
4932		KASSERT(jaddref->ja_parent == dp->i_number,
4933		    ("softdep_revert_mkdir: addref parent mismatch"));
4934		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4935		    inoreflst, if_deps);
4936		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4937		KASSERT(dotaddref->ja_parent == ip->i_number,
4938		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4939		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4940	}
4941	FREE_LOCK(dp->i_ump);
4942}
4943
4944/*
4945 * Called to correct nlinkdelta after a failed rmdir.
4946 */
4947void
4948softdep_revert_rmdir(dp, ip)
4949	struct inode *dp;
4950	struct inode *ip;
4951{
4952
4953	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4954	    ("softdep_revert_rmdir called on non-softdep filesystem"));
4955	ACQUIRE_LOCK(dp->i_ump);
4956	(void) inodedep_lookup_ip(ip);
4957	(void) inodedep_lookup_ip(dp);
4958	FREE_LOCK(dp->i_ump);
4959}
4960
4961/*
4962 * Protecting the freemaps (or bitmaps).
4963 *
4964 * To eliminate the need to execute fsck before mounting a filesystem
4965 * after a power failure, one must (conservatively) guarantee that the
4966 * on-disk copy of the bitmaps never indicate that a live inode or block is
4967 * free.  So, when a block or inode is allocated, the bitmap should be
4968 * updated (on disk) before any new pointers.  When a block or inode is
4969 * freed, the bitmap should not be updated until all pointers have been
4970 * reset.  The latter dependency is handled by the delayed de-allocation
4971 * approach described below for block and inode de-allocation.  The former
4972 * dependency is handled by calling the following procedure when a block or
4973 * inode is allocated. When an inode is allocated an "inodedep" is created
4974 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4975 * Each "inodedep" is also inserted into the hash indexing structure so
4976 * that any additional link additions can be made dependent on the inode
4977 * allocation.
4978 *
4979 * The ufs filesystem maintains a number of free block counts (e.g., per
4980 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4981 * in addition to the bitmaps.  These counts are used to improve efficiency
4982 * during allocation and therefore must be consistent with the bitmaps.
4983 * There is no convenient way to guarantee post-crash consistency of these
4984 * counts with simple update ordering, for two main reasons: (1) The counts
4985 * and bitmaps for a single cylinder group block are not in the same disk
4986 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4987 * be written and the other not.  (2) Some of the counts are located in the
4988 * superblock rather than the cylinder group block. So, we focus our soft
4989 * updates implementation on protecting the bitmaps. When mounting a
4990 * filesystem, we recompute the auxiliary counts from the bitmaps.
4991 */
4992
4993/*
4994 * Called just after updating the cylinder group block to allocate an inode.
4995 */
4996void
4997softdep_setup_inomapdep(bp, ip, newinum, mode)
4998	struct buf *bp;		/* buffer for cylgroup block with inode map */
4999	struct inode *ip;	/* inode related to allocation */
5000	ino_t newinum;		/* new inode number being allocated */
5001	int mode;
5002{
5003	struct inodedep *inodedep;
5004	struct bmsafemap *bmsafemap;
5005	struct jaddref *jaddref;
5006	struct mount *mp;
5007	struct fs *fs;
5008
5009	mp = UFSTOVFS(ip->i_ump);
5010	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5011	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
5012	fs = ip->i_ump->um_fs;
5013	jaddref = NULL;
5014
5015	/*
5016	 * Allocate the journal reference add structure so that the bitmap
5017	 * can be dependent on it.
5018	 */
5019	if (MOUNTEDSUJ(mp)) {
5020		jaddref = newjaddref(ip, newinum, 0, 0, mode);
5021		jaddref->ja_state |= NEWBLOCK;
5022	}
5023
5024	/*
5025	 * Create a dependency for the newly allocated inode.
5026	 * Panic if it already exists as something is seriously wrong.
5027	 * Otherwise add it to the dependency list for the buffer holding
5028	 * the cylinder group map from which it was allocated.
5029	 *
5030	 * We have to preallocate a bmsafemap entry in case it is needed
5031	 * in bmsafemap_lookup since once we allocate the inodedep, we
5032	 * have to finish initializing it before we can FREE_LOCK().
5033	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5034	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5035	 * creating the inodedep as it can be freed during the time
5036	 * that we FREE_LOCK() while allocating the inodedep. We must
5037	 * call workitem_alloc() before entering the locked section as
5038	 * it also acquires the lock and we must avoid trying doing so
5039	 * recursively.
5040	 */
5041	bmsafemap = malloc(sizeof(struct bmsafemap),
5042	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5043	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5044	ACQUIRE_LOCK(ip->i_ump);
5045	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
5046		panic("softdep_setup_inomapdep: dependency %p for new"
5047		    "inode already exists", inodedep);
5048	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5049	if (jaddref) {
5050		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5051		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5052		    if_deps);
5053	} else {
5054		inodedep->id_state |= ONDEPLIST;
5055		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5056	}
5057	inodedep->id_bmsafemap = bmsafemap;
5058	inodedep->id_state &= ~DEPCOMPLETE;
5059	FREE_LOCK(ip->i_ump);
5060}
5061
5062/*
5063 * Called just after updating the cylinder group block to
5064 * allocate block or fragment.
5065 */
5066void
5067softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5068	struct buf *bp;		/* buffer for cylgroup block with block map */
5069	struct mount *mp;	/* filesystem doing allocation */
5070	ufs2_daddr_t newblkno;	/* number of newly allocated block */
5071	int frags;		/* Number of fragments. */
5072	int oldfrags;		/* Previous number of fragments for extend. */
5073{
5074	struct newblk *newblk;
5075	struct bmsafemap *bmsafemap;
5076	struct jnewblk *jnewblk;
5077	struct ufsmount *ump;
5078	struct fs *fs;
5079
5080	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5081	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5082	ump = VFSTOUFS(mp);
5083	fs = ump->um_fs;
5084	jnewblk = NULL;
5085	/*
5086	 * Create a dependency for the newly allocated block.
5087	 * Add it to the dependency list for the buffer holding
5088	 * the cylinder group map from which it was allocated.
5089	 */
5090	if (MOUNTEDSUJ(mp)) {
5091		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5092		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5093		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5094		jnewblk->jn_state = ATTACHED;
5095		jnewblk->jn_blkno = newblkno;
5096		jnewblk->jn_frags = frags;
5097		jnewblk->jn_oldfrags = oldfrags;
5098#ifdef SUJ_DEBUG
5099		{
5100			struct cg *cgp;
5101			uint8_t *blksfree;
5102			long bno;
5103			int i;
5104
5105			cgp = (struct cg *)bp->b_data;
5106			blksfree = cg_blksfree(cgp);
5107			bno = dtogd(fs, jnewblk->jn_blkno);
5108			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5109			    i++) {
5110				if (isset(blksfree, bno + i))
5111					panic("softdep_setup_blkmapdep: "
5112					    "free fragment %d from %d-%d "
5113					    "state 0x%X dep %p", i,
5114					    jnewblk->jn_oldfrags,
5115					    jnewblk->jn_frags,
5116					    jnewblk->jn_state,
5117					    jnewblk->jn_dep);
5118			}
5119		}
5120#endif
5121	}
5122
5123	CTR3(KTR_SUJ,
5124	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5125	    newblkno, frags, oldfrags);
5126	ACQUIRE_LOCK(ump);
5127	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5128		panic("softdep_setup_blkmapdep: found block");
5129	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5130	    dtog(fs, newblkno), NULL);
5131	if (jnewblk) {
5132		jnewblk->jn_dep = (struct worklist *)newblk;
5133		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5134	} else {
5135		newblk->nb_state |= ONDEPLIST;
5136		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5137	}
5138	newblk->nb_bmsafemap = bmsafemap;
5139	newblk->nb_jnewblk = jnewblk;
5140	FREE_LOCK(ump);
5141}
5142
5143#define	BMSAFEMAP_HASH(ump, cg) \
5144      (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5145
5146static int
5147bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5148	struct bmsafemap_hashhead *bmsafemaphd;
5149	int cg;
5150	struct bmsafemap **bmsafemapp;
5151{
5152	struct bmsafemap *bmsafemap;
5153
5154	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5155		if (bmsafemap->sm_cg == cg)
5156			break;
5157	if (bmsafemap) {
5158		*bmsafemapp = bmsafemap;
5159		return (1);
5160	}
5161	*bmsafemapp = NULL;
5162
5163	return (0);
5164}
5165
5166/*
5167 * Find the bmsafemap associated with a cylinder group buffer.
5168 * If none exists, create one. The buffer must be locked when
5169 * this routine is called and this routine must be called with
5170 * the softdep lock held. To avoid giving up the lock while
5171 * allocating a new bmsafemap, a preallocated bmsafemap may be
5172 * provided. If it is provided but not needed, it is freed.
5173 */
5174static struct bmsafemap *
5175bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5176	struct mount *mp;
5177	struct buf *bp;
5178	int cg;
5179	struct bmsafemap *newbmsafemap;
5180{
5181	struct bmsafemap_hashhead *bmsafemaphd;
5182	struct bmsafemap *bmsafemap, *collision;
5183	struct worklist *wk;
5184	struct ufsmount *ump;
5185
5186	ump = VFSTOUFS(mp);
5187	LOCK_OWNED(ump);
5188	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5189	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5190		if (wk->wk_type == D_BMSAFEMAP) {
5191			if (newbmsafemap)
5192				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5193			return (WK_BMSAFEMAP(wk));
5194		}
5195	}
5196	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5197	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5198		if (newbmsafemap)
5199			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5200		return (bmsafemap);
5201	}
5202	if (newbmsafemap) {
5203		bmsafemap = newbmsafemap;
5204	} else {
5205		FREE_LOCK(ump);
5206		bmsafemap = malloc(sizeof(struct bmsafemap),
5207			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5208		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5209		ACQUIRE_LOCK(ump);
5210	}
5211	bmsafemap->sm_buf = bp;
5212	LIST_INIT(&bmsafemap->sm_inodedephd);
5213	LIST_INIT(&bmsafemap->sm_inodedepwr);
5214	LIST_INIT(&bmsafemap->sm_newblkhd);
5215	LIST_INIT(&bmsafemap->sm_newblkwr);
5216	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5217	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5218	LIST_INIT(&bmsafemap->sm_freehd);
5219	LIST_INIT(&bmsafemap->sm_freewr);
5220	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5221		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5222		return (collision);
5223	}
5224	bmsafemap->sm_cg = cg;
5225	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5226	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5227	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5228	return (bmsafemap);
5229}
5230
5231/*
5232 * Direct block allocation dependencies.
5233 *
5234 * When a new block is allocated, the corresponding disk locations must be
5235 * initialized (with zeros or new data) before the on-disk inode points to
5236 * them.  Also, the freemap from which the block was allocated must be
5237 * updated (on disk) before the inode's pointer. These two dependencies are
5238 * independent of each other and are needed for all file blocks and indirect
5239 * blocks that are pointed to directly by the inode.  Just before the
5240 * "in-core" version of the inode is updated with a newly allocated block
5241 * number, a procedure (below) is called to setup allocation dependency
5242 * structures.  These structures are removed when the corresponding
5243 * dependencies are satisfied or when the block allocation becomes obsolete
5244 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5245 * fragment that gets upgraded).  All of these cases are handled in
5246 * procedures described later.
5247 *
5248 * When a file extension causes a fragment to be upgraded, either to a larger
5249 * fragment or to a full block, the on-disk location may change (if the
5250 * previous fragment could not simply be extended). In this case, the old
5251 * fragment must be de-allocated, but not until after the inode's pointer has
5252 * been updated. In most cases, this is handled by later procedures, which
5253 * will construct a "freefrag" structure to be added to the workitem queue
5254 * when the inode update is complete (or obsolete).  The main exception to
5255 * this is when an allocation occurs while a pending allocation dependency
5256 * (for the same block pointer) remains.  This case is handled in the main
5257 * allocation dependency setup procedure by immediately freeing the
5258 * unreferenced fragments.
5259 */
5260void
5261softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5262	struct inode *ip;	/* inode to which block is being added */
5263	ufs_lbn_t off;		/* block pointer within inode */
5264	ufs2_daddr_t newblkno;	/* disk block number being added */
5265	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5266	long newsize;		/* size of new block */
5267	long oldsize;		/* size of new block */
5268	struct buf *bp;		/* bp for allocated block */
5269{
5270	struct allocdirect *adp, *oldadp;
5271	struct allocdirectlst *adphead;
5272	struct freefrag *freefrag;
5273	struct inodedep *inodedep;
5274	struct pagedep *pagedep;
5275	struct jnewblk *jnewblk;
5276	struct newblk *newblk;
5277	struct mount *mp;
5278	ufs_lbn_t lbn;
5279
5280	lbn = bp->b_lblkno;
5281	mp = UFSTOVFS(ip->i_ump);
5282	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5283	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
5284	if (oldblkno && oldblkno != newblkno)
5285		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5286	else
5287		freefrag = NULL;
5288
5289	CTR6(KTR_SUJ,
5290	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5291	    "off %jd newsize %ld oldsize %d",
5292	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5293	ACQUIRE_LOCK(ip->i_ump);
5294	if (off >= NDADDR) {
5295		if (lbn > 0)
5296			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5297			    lbn, off);
5298		/* allocating an indirect block */
5299		if (oldblkno != 0)
5300			panic("softdep_setup_allocdirect: non-zero indir");
5301	} else {
5302		if (off != lbn)
5303			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5304			    lbn, off);
5305		/*
5306		 * Allocating a direct block.
5307		 *
5308		 * If we are allocating a directory block, then we must
5309		 * allocate an associated pagedep to track additions and
5310		 * deletions.
5311		 */
5312		if ((ip->i_mode & IFMT) == IFDIR)
5313			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5314			    &pagedep);
5315	}
5316	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5317		panic("softdep_setup_allocdirect: lost block");
5318	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5319	    ("softdep_setup_allocdirect: newblk already initialized"));
5320	/*
5321	 * Convert the newblk to an allocdirect.
5322	 */
5323	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5324	adp = (struct allocdirect *)newblk;
5325	newblk->nb_freefrag = freefrag;
5326	adp->ad_offset = off;
5327	adp->ad_oldblkno = oldblkno;
5328	adp->ad_newsize = newsize;
5329	adp->ad_oldsize = oldsize;
5330
5331	/*
5332	 * Finish initializing the journal.
5333	 */
5334	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5335		jnewblk->jn_ino = ip->i_number;
5336		jnewblk->jn_lbn = lbn;
5337		add_to_journal(&jnewblk->jn_list);
5338	}
5339	if (freefrag && freefrag->ff_jdep != NULL &&
5340	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5341		add_to_journal(freefrag->ff_jdep);
5342	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5343	adp->ad_inodedep = inodedep;
5344
5345	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5346	/*
5347	 * The list of allocdirects must be kept in sorted and ascending
5348	 * order so that the rollback routines can quickly determine the
5349	 * first uncommitted block (the size of the file stored on disk
5350	 * ends at the end of the lowest committed fragment, or if there
5351	 * are no fragments, at the end of the highest committed block).
5352	 * Since files generally grow, the typical case is that the new
5353	 * block is to be added at the end of the list. We speed this
5354	 * special case by checking against the last allocdirect in the
5355	 * list before laboriously traversing the list looking for the
5356	 * insertion point.
5357	 */
5358	adphead = &inodedep->id_newinoupdt;
5359	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5360	if (oldadp == NULL || oldadp->ad_offset <= off) {
5361		/* insert at end of list */
5362		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5363		if (oldadp != NULL && oldadp->ad_offset == off)
5364			allocdirect_merge(adphead, adp, oldadp);
5365		FREE_LOCK(ip->i_ump);
5366		return;
5367	}
5368	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5369		if (oldadp->ad_offset >= off)
5370			break;
5371	}
5372	if (oldadp == NULL)
5373		panic("softdep_setup_allocdirect: lost entry");
5374	/* insert in middle of list */
5375	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5376	if (oldadp->ad_offset == off)
5377		allocdirect_merge(adphead, adp, oldadp);
5378
5379	FREE_LOCK(ip->i_ump);
5380}
5381
5382/*
5383 * Merge a newer and older journal record to be stored either in a
5384 * newblock or freefrag.  This handles aggregating journal records for
5385 * fragment allocation into a second record as well as replacing a
5386 * journal free with an aborted journal allocation.  A segment for the
5387 * oldest record will be placed on wkhd if it has been written.  If not
5388 * the segment for the newer record will suffice.
5389 */
5390static struct worklist *
5391jnewblk_merge(new, old, wkhd)
5392	struct worklist *new;
5393	struct worklist *old;
5394	struct workhead *wkhd;
5395{
5396	struct jnewblk *njnewblk;
5397	struct jnewblk *jnewblk;
5398
5399	/* Handle NULLs to simplify callers. */
5400	if (new == NULL)
5401		return (old);
5402	if (old == NULL)
5403		return (new);
5404	/* Replace a jfreefrag with a jnewblk. */
5405	if (new->wk_type == D_JFREEFRAG) {
5406		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5407			panic("jnewblk_merge: blkno mismatch: %p, %p",
5408			    old, new);
5409		cancel_jfreefrag(WK_JFREEFRAG(new));
5410		return (old);
5411	}
5412	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5413		panic("jnewblk_merge: Bad type: old %d new %d\n",
5414		    old->wk_type, new->wk_type);
5415	/*
5416	 * Handle merging of two jnewblk records that describe
5417	 * different sets of fragments in the same block.
5418	 */
5419	jnewblk = WK_JNEWBLK(old);
5420	njnewblk = WK_JNEWBLK(new);
5421	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5422		panic("jnewblk_merge: Merging disparate blocks.");
5423	/*
5424	 * The record may be rolled back in the cg.
5425	 */
5426	if (jnewblk->jn_state & UNDONE) {
5427		jnewblk->jn_state &= ~UNDONE;
5428		njnewblk->jn_state |= UNDONE;
5429		njnewblk->jn_state &= ~ATTACHED;
5430	}
5431	/*
5432	 * We modify the newer addref and free the older so that if neither
5433	 * has been written the most up-to-date copy will be on disk.  If
5434	 * both have been written but rolled back we only temporarily need
5435	 * one of them to fix the bits when the cg write completes.
5436	 */
5437	jnewblk->jn_state |= ATTACHED | COMPLETE;
5438	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5439	cancel_jnewblk(jnewblk, wkhd);
5440	WORKLIST_REMOVE(&jnewblk->jn_list);
5441	free_jnewblk(jnewblk);
5442	return (new);
5443}
5444
5445/*
5446 * Replace an old allocdirect dependency with a newer one.
5447 * This routine must be called with splbio interrupts blocked.
5448 */
5449static void
5450allocdirect_merge(adphead, newadp, oldadp)
5451	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5452	struct allocdirect *newadp;	/* allocdirect being added */
5453	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5454{
5455	struct worklist *wk;
5456	struct freefrag *freefrag;
5457
5458	freefrag = NULL;
5459	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5460	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5461	    newadp->ad_oldsize != oldadp->ad_newsize ||
5462	    newadp->ad_offset >= NDADDR)
5463		panic("%s %jd != new %jd || old size %ld != new %ld",
5464		    "allocdirect_merge: old blkno",
5465		    (intmax_t)newadp->ad_oldblkno,
5466		    (intmax_t)oldadp->ad_newblkno,
5467		    newadp->ad_oldsize, oldadp->ad_newsize);
5468	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5469	newadp->ad_oldsize = oldadp->ad_oldsize;
5470	/*
5471	 * If the old dependency had a fragment to free or had never
5472	 * previously had a block allocated, then the new dependency
5473	 * can immediately post its freefrag and adopt the old freefrag.
5474	 * This action is done by swapping the freefrag dependencies.
5475	 * The new dependency gains the old one's freefrag, and the
5476	 * old one gets the new one and then immediately puts it on
5477	 * the worklist when it is freed by free_newblk. It is
5478	 * not possible to do this swap when the old dependency had a
5479	 * non-zero size but no previous fragment to free. This condition
5480	 * arises when the new block is an extension of the old block.
5481	 * Here, the first part of the fragment allocated to the new
5482	 * dependency is part of the block currently claimed on disk by
5483	 * the old dependency, so cannot legitimately be freed until the
5484	 * conditions for the new dependency are fulfilled.
5485	 */
5486	freefrag = newadp->ad_freefrag;
5487	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5488		newadp->ad_freefrag = oldadp->ad_freefrag;
5489		oldadp->ad_freefrag = freefrag;
5490	}
5491	/*
5492	 * If we are tracking a new directory-block allocation,
5493	 * move it from the old allocdirect to the new allocdirect.
5494	 */
5495	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5496		WORKLIST_REMOVE(wk);
5497		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5498			panic("allocdirect_merge: extra newdirblk");
5499		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5500	}
5501	TAILQ_REMOVE(adphead, oldadp, ad_next);
5502	/*
5503	 * We need to move any journal dependencies over to the freefrag
5504	 * that releases this block if it exists.  Otherwise we are
5505	 * extending an existing block and we'll wait until that is
5506	 * complete to release the journal space and extend the
5507	 * new journal to cover this old space as well.
5508	 */
5509	if (freefrag == NULL) {
5510		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5511			panic("allocdirect_merge: %jd != %jd",
5512			    oldadp->ad_newblkno, newadp->ad_newblkno);
5513		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5514		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5515		    &oldadp->ad_block.nb_jnewblk->jn_list,
5516		    &newadp->ad_block.nb_jwork);
5517		oldadp->ad_block.nb_jnewblk = NULL;
5518		cancel_newblk(&oldadp->ad_block, NULL,
5519		    &newadp->ad_block.nb_jwork);
5520	} else {
5521		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5522		    &freefrag->ff_list, &freefrag->ff_jwork);
5523		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5524		    &freefrag->ff_jwork);
5525	}
5526	free_newblk(&oldadp->ad_block);
5527}
5528
5529/*
5530 * Allocate a jfreefrag structure to journal a single block free.
5531 */
5532static struct jfreefrag *
5533newjfreefrag(freefrag, ip, blkno, size, lbn)
5534	struct freefrag *freefrag;
5535	struct inode *ip;
5536	ufs2_daddr_t blkno;
5537	long size;
5538	ufs_lbn_t lbn;
5539{
5540	struct jfreefrag *jfreefrag;
5541	struct fs *fs;
5542
5543	fs = ip->i_fs;
5544	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5545	    M_SOFTDEP_FLAGS);
5546	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5547	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5548	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5549	jfreefrag->fr_ino = ip->i_number;
5550	jfreefrag->fr_lbn = lbn;
5551	jfreefrag->fr_blkno = blkno;
5552	jfreefrag->fr_frags = numfrags(fs, size);
5553	jfreefrag->fr_freefrag = freefrag;
5554
5555	return (jfreefrag);
5556}
5557
5558/*
5559 * Allocate a new freefrag structure.
5560 */
5561static struct freefrag *
5562newfreefrag(ip, blkno, size, lbn)
5563	struct inode *ip;
5564	ufs2_daddr_t blkno;
5565	long size;
5566	ufs_lbn_t lbn;
5567{
5568	struct freefrag *freefrag;
5569	struct fs *fs;
5570
5571	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5572	    ip->i_number, blkno, size, lbn);
5573	fs = ip->i_fs;
5574	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5575		panic("newfreefrag: frag size");
5576	freefrag = malloc(sizeof(struct freefrag),
5577	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5578	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5579	freefrag->ff_state = ATTACHED;
5580	LIST_INIT(&freefrag->ff_jwork);
5581	freefrag->ff_inum = ip->i_number;
5582	freefrag->ff_vtype = ITOV(ip)->v_type;
5583	freefrag->ff_blkno = blkno;
5584	freefrag->ff_fragsize = size;
5585
5586	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5587		freefrag->ff_jdep = (struct worklist *)
5588		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5589	} else {
5590		freefrag->ff_state |= DEPCOMPLETE;
5591		freefrag->ff_jdep = NULL;
5592	}
5593
5594	return (freefrag);
5595}
5596
5597/*
5598 * This workitem de-allocates fragments that were replaced during
5599 * file block allocation.
5600 */
5601static void
5602handle_workitem_freefrag(freefrag)
5603	struct freefrag *freefrag;
5604{
5605	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5606	struct workhead wkhd;
5607
5608	CTR3(KTR_SUJ,
5609	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5610	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5611	/*
5612	 * It would be illegal to add new completion items to the
5613	 * freefrag after it was schedule to be done so it must be
5614	 * safe to modify the list head here.
5615	 */
5616	LIST_INIT(&wkhd);
5617	ACQUIRE_LOCK(ump);
5618	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5619	/*
5620	 * If the journal has not been written we must cancel it here.
5621	 */
5622	if (freefrag->ff_jdep) {
5623		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5624			panic("handle_workitem_freefrag: Unexpected type %d\n",
5625			    freefrag->ff_jdep->wk_type);
5626		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5627	}
5628	FREE_LOCK(ump);
5629	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5630	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5631	ACQUIRE_LOCK(ump);
5632	WORKITEM_FREE(freefrag, D_FREEFRAG);
5633	FREE_LOCK(ump);
5634}
5635
5636/*
5637 * Set up a dependency structure for an external attributes data block.
5638 * This routine follows much of the structure of softdep_setup_allocdirect.
5639 * See the description of softdep_setup_allocdirect above for details.
5640 */
5641void
5642softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5643	struct inode *ip;
5644	ufs_lbn_t off;
5645	ufs2_daddr_t newblkno;
5646	ufs2_daddr_t oldblkno;
5647	long newsize;
5648	long oldsize;
5649	struct buf *bp;
5650{
5651	struct allocdirect *adp, *oldadp;
5652	struct allocdirectlst *adphead;
5653	struct freefrag *freefrag;
5654	struct inodedep *inodedep;
5655	struct jnewblk *jnewblk;
5656	struct newblk *newblk;
5657	struct mount *mp;
5658	ufs_lbn_t lbn;
5659
5660	mp = UFSTOVFS(ip->i_ump);
5661	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5662	    ("softdep_setup_allocext called on non-softdep filesystem"));
5663	KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR",
5664		    (long long)off));
5665
5666	lbn = bp->b_lblkno;
5667	if (oldblkno && oldblkno != newblkno)
5668		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5669	else
5670		freefrag = NULL;
5671
5672	ACQUIRE_LOCK(ip->i_ump);
5673	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5674		panic("softdep_setup_allocext: lost block");
5675	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5676	    ("softdep_setup_allocext: newblk already initialized"));
5677	/*
5678	 * Convert the newblk to an allocdirect.
5679	 */
5680	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5681	adp = (struct allocdirect *)newblk;
5682	newblk->nb_freefrag = freefrag;
5683	adp->ad_offset = off;
5684	adp->ad_oldblkno = oldblkno;
5685	adp->ad_newsize = newsize;
5686	adp->ad_oldsize = oldsize;
5687	adp->ad_state |=  EXTDATA;
5688
5689	/*
5690	 * Finish initializing the journal.
5691	 */
5692	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5693		jnewblk->jn_ino = ip->i_number;
5694		jnewblk->jn_lbn = lbn;
5695		add_to_journal(&jnewblk->jn_list);
5696	}
5697	if (freefrag && freefrag->ff_jdep != NULL &&
5698	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5699		add_to_journal(freefrag->ff_jdep);
5700	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5701	adp->ad_inodedep = inodedep;
5702
5703	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5704	/*
5705	 * The list of allocdirects must be kept in sorted and ascending
5706	 * order so that the rollback routines can quickly determine the
5707	 * first uncommitted block (the size of the file stored on disk
5708	 * ends at the end of the lowest committed fragment, or if there
5709	 * are no fragments, at the end of the highest committed block).
5710	 * Since files generally grow, the typical case is that the new
5711	 * block is to be added at the end of the list. We speed this
5712	 * special case by checking against the last allocdirect in the
5713	 * list before laboriously traversing the list looking for the
5714	 * insertion point.
5715	 */
5716	adphead = &inodedep->id_newextupdt;
5717	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5718	if (oldadp == NULL || oldadp->ad_offset <= off) {
5719		/* insert at end of list */
5720		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5721		if (oldadp != NULL && oldadp->ad_offset == off)
5722			allocdirect_merge(adphead, adp, oldadp);
5723		FREE_LOCK(ip->i_ump);
5724		return;
5725	}
5726	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5727		if (oldadp->ad_offset >= off)
5728			break;
5729	}
5730	if (oldadp == NULL)
5731		panic("softdep_setup_allocext: lost entry");
5732	/* insert in middle of list */
5733	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5734	if (oldadp->ad_offset == off)
5735		allocdirect_merge(adphead, adp, oldadp);
5736	FREE_LOCK(ip->i_ump);
5737}
5738
5739/*
5740 * Indirect block allocation dependencies.
5741 *
5742 * The same dependencies that exist for a direct block also exist when
5743 * a new block is allocated and pointed to by an entry in a block of
5744 * indirect pointers. The undo/redo states described above are also
5745 * used here. Because an indirect block contains many pointers that
5746 * may have dependencies, a second copy of the entire in-memory indirect
5747 * block is kept. The buffer cache copy is always completely up-to-date.
5748 * The second copy, which is used only as a source for disk writes,
5749 * contains only the safe pointers (i.e., those that have no remaining
5750 * update dependencies). The second copy is freed when all pointers
5751 * are safe. The cache is not allowed to replace indirect blocks with
5752 * pending update dependencies. If a buffer containing an indirect
5753 * block with dependencies is written, these routines will mark it
5754 * dirty again. It can only be successfully written once all the
5755 * dependencies are removed. The ffs_fsync routine in conjunction with
5756 * softdep_sync_metadata work together to get all the dependencies
5757 * removed so that a file can be successfully written to disk. Three
5758 * procedures are used when setting up indirect block pointer
5759 * dependencies. The division is necessary because of the organization
5760 * of the "balloc" routine and because of the distinction between file
5761 * pages and file metadata blocks.
5762 */
5763
5764/*
5765 * Allocate a new allocindir structure.
5766 */
5767static struct allocindir *
5768newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5769	struct inode *ip;	/* inode for file being extended */
5770	int ptrno;		/* offset of pointer in indirect block */
5771	ufs2_daddr_t newblkno;	/* disk block number being added */
5772	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5773	ufs_lbn_t lbn;
5774{
5775	struct newblk *newblk;
5776	struct allocindir *aip;
5777	struct freefrag *freefrag;
5778	struct jnewblk *jnewblk;
5779
5780	if (oldblkno)
5781		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5782	else
5783		freefrag = NULL;
5784	ACQUIRE_LOCK(ip->i_ump);
5785	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5786		panic("new_allocindir: lost block");
5787	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5788	    ("newallocindir: newblk already initialized"));
5789	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5790	newblk->nb_freefrag = freefrag;
5791	aip = (struct allocindir *)newblk;
5792	aip->ai_offset = ptrno;
5793	aip->ai_oldblkno = oldblkno;
5794	aip->ai_lbn = lbn;
5795	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5796		jnewblk->jn_ino = ip->i_number;
5797		jnewblk->jn_lbn = lbn;
5798		add_to_journal(&jnewblk->jn_list);
5799	}
5800	if (freefrag && freefrag->ff_jdep != NULL &&
5801	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5802		add_to_journal(freefrag->ff_jdep);
5803	return (aip);
5804}
5805
5806/*
5807 * Called just before setting an indirect block pointer
5808 * to a newly allocated file page.
5809 */
5810void
5811softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5812	struct inode *ip;	/* inode for file being extended */
5813	ufs_lbn_t lbn;		/* allocated block number within file */
5814	struct buf *bp;		/* buffer with indirect blk referencing page */
5815	int ptrno;		/* offset of pointer in indirect block */
5816	ufs2_daddr_t newblkno;	/* disk block number being added */
5817	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5818	struct buf *nbp;	/* buffer holding allocated page */
5819{
5820	struct inodedep *inodedep;
5821	struct freefrag *freefrag;
5822	struct allocindir *aip;
5823	struct pagedep *pagedep;
5824	struct mount *mp;
5825	int dflags;
5826
5827	mp = UFSTOVFS(ip->i_ump);
5828	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5829	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
5830	KASSERT(lbn == nbp->b_lblkno,
5831	    ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5832	    lbn, bp->b_lblkno));
5833	CTR4(KTR_SUJ,
5834	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5835	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5836	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5837	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5838	dflags = DEPALLOC;
5839	if (IS_SNAPSHOT(ip))
5840		dflags |= NODELAY;
5841	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5842	/*
5843	 * If we are allocating a directory page, then we must
5844	 * allocate an associated pagedep to track additions and
5845	 * deletions.
5846	 */
5847	if ((ip->i_mode & IFMT) == IFDIR)
5848		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5849	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5850	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5851	FREE_LOCK(ip->i_ump);
5852	if (freefrag)
5853		handle_workitem_freefrag(freefrag);
5854}
5855
5856/*
5857 * Called just before setting an indirect block pointer to a
5858 * newly allocated indirect block.
5859 */
5860void
5861softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5862	struct buf *nbp;	/* newly allocated indirect block */
5863	struct inode *ip;	/* inode for file being extended */
5864	struct buf *bp;		/* indirect block referencing allocated block */
5865	int ptrno;		/* offset of pointer in indirect block */
5866	ufs2_daddr_t newblkno;	/* disk block number being added */
5867{
5868	struct inodedep *inodedep;
5869	struct allocindir *aip;
5870	ufs_lbn_t lbn;
5871	int dflags;
5872
5873	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
5874	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
5875	CTR3(KTR_SUJ,
5876	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5877	    ip->i_number, newblkno, ptrno);
5878	lbn = nbp->b_lblkno;
5879	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5880	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5881	dflags = DEPALLOC;
5882	if (IS_SNAPSHOT(ip))
5883		dflags |= NODELAY;
5884	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5885	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5886	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5887		panic("softdep_setup_allocindir_meta: Block already existed");
5888	FREE_LOCK(ip->i_ump);
5889}
5890
5891static void
5892indirdep_complete(indirdep)
5893	struct indirdep *indirdep;
5894{
5895	struct allocindir *aip;
5896
5897	LIST_REMOVE(indirdep, ir_next);
5898	indirdep->ir_state |= DEPCOMPLETE;
5899
5900	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5901		LIST_REMOVE(aip, ai_next);
5902		free_newblk(&aip->ai_block);
5903	}
5904	/*
5905	 * If this indirdep is not attached to a buf it was simply waiting
5906	 * on completion to clear completehd.  free_indirdep() asserts
5907	 * that nothing is dangling.
5908	 */
5909	if ((indirdep->ir_state & ONWORKLIST) == 0)
5910		free_indirdep(indirdep);
5911}
5912
5913static struct indirdep *
5914indirdep_lookup(mp, ip, bp)
5915	struct mount *mp;
5916	struct inode *ip;
5917	struct buf *bp;
5918{
5919	struct indirdep *indirdep, *newindirdep;
5920	struct newblk *newblk;
5921	struct ufsmount *ump;
5922	struct worklist *wk;
5923	struct fs *fs;
5924	ufs2_daddr_t blkno;
5925
5926	ump = VFSTOUFS(mp);
5927	LOCK_OWNED(ump);
5928	indirdep = NULL;
5929	newindirdep = NULL;
5930	fs = ip->i_fs;
5931	for (;;) {
5932		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5933			if (wk->wk_type != D_INDIRDEP)
5934				continue;
5935			indirdep = WK_INDIRDEP(wk);
5936			break;
5937		}
5938		/* Found on the buffer worklist, no new structure to free. */
5939		if (indirdep != NULL && newindirdep == NULL)
5940			return (indirdep);
5941		if (indirdep != NULL && newindirdep != NULL)
5942			panic("indirdep_lookup: simultaneous create");
5943		/* None found on the buffer and a new structure is ready. */
5944		if (indirdep == NULL && newindirdep != NULL)
5945			break;
5946		/* None found and no new structure available. */
5947		FREE_LOCK(ump);
5948		newindirdep = malloc(sizeof(struct indirdep),
5949		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5950		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5951		newindirdep->ir_state = ATTACHED;
5952		if (ip->i_ump->um_fstype == UFS1)
5953			newindirdep->ir_state |= UFS1FMT;
5954		TAILQ_INIT(&newindirdep->ir_trunc);
5955		newindirdep->ir_saveddata = NULL;
5956		LIST_INIT(&newindirdep->ir_deplisthd);
5957		LIST_INIT(&newindirdep->ir_donehd);
5958		LIST_INIT(&newindirdep->ir_writehd);
5959		LIST_INIT(&newindirdep->ir_completehd);
5960		if (bp->b_blkno == bp->b_lblkno) {
5961			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5962			    NULL, NULL);
5963			bp->b_blkno = blkno;
5964		}
5965		newindirdep->ir_freeblks = NULL;
5966		newindirdep->ir_savebp =
5967		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5968		newindirdep->ir_bp = bp;
5969		BUF_KERNPROC(newindirdep->ir_savebp);
5970		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5971		ACQUIRE_LOCK(ump);
5972	}
5973	indirdep = newindirdep;
5974	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5975	/*
5976	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5977	 * that we don't free dependencies until the pointers are valid.
5978	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5979	 * than using the hash.
5980	 */
5981	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5982		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5983	else
5984		indirdep->ir_state |= DEPCOMPLETE;
5985	return (indirdep);
5986}
5987
5988/*
5989 * Called to finish the allocation of the "aip" allocated
5990 * by one of the two routines above.
5991 */
5992static struct freefrag *
5993setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5994	struct buf *bp;		/* in-memory copy of the indirect block */
5995	struct inode *ip;	/* inode for file being extended */
5996	struct inodedep *inodedep; /* Inodedep for ip */
5997	struct allocindir *aip;	/* allocindir allocated by the above routines */
5998	ufs_lbn_t lbn;		/* Logical block number for this block. */
5999{
6000	struct fs *fs;
6001	struct indirdep *indirdep;
6002	struct allocindir *oldaip;
6003	struct freefrag *freefrag;
6004	struct mount *mp;
6005
6006	LOCK_OWNED(ip->i_ump);
6007	mp = UFSTOVFS(ip->i_ump);
6008	fs = ip->i_fs;
6009	if (bp->b_lblkno >= 0)
6010		panic("setup_allocindir_phase2: not indir blk");
6011	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6012	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6013	indirdep = indirdep_lookup(mp, ip, bp);
6014	KASSERT(indirdep->ir_savebp != NULL,
6015	    ("setup_allocindir_phase2 NULL ir_savebp"));
6016	aip->ai_indirdep = indirdep;
6017	/*
6018	 * Check for an unwritten dependency for this indirect offset.  If
6019	 * there is, merge the old dependency into the new one.  This happens
6020	 * as a result of reallocblk only.
6021	 */
6022	freefrag = NULL;
6023	if (aip->ai_oldblkno != 0) {
6024		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6025			if (oldaip->ai_offset == aip->ai_offset) {
6026				freefrag = allocindir_merge(aip, oldaip);
6027				goto done;
6028			}
6029		}
6030		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6031			if (oldaip->ai_offset == aip->ai_offset) {
6032				freefrag = allocindir_merge(aip, oldaip);
6033				goto done;
6034			}
6035		}
6036	}
6037done:
6038	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6039	return (freefrag);
6040}
6041
6042/*
6043 * Merge two allocindirs which refer to the same block.  Move newblock
6044 * dependencies and setup the freefrags appropriately.
6045 */
6046static struct freefrag *
6047allocindir_merge(aip, oldaip)
6048	struct allocindir *aip;
6049	struct allocindir *oldaip;
6050{
6051	struct freefrag *freefrag;
6052	struct worklist *wk;
6053
6054	if (oldaip->ai_newblkno != aip->ai_oldblkno)
6055		panic("allocindir_merge: blkno");
6056	aip->ai_oldblkno = oldaip->ai_oldblkno;
6057	freefrag = aip->ai_freefrag;
6058	aip->ai_freefrag = oldaip->ai_freefrag;
6059	oldaip->ai_freefrag = NULL;
6060	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6061	/*
6062	 * If we are tracking a new directory-block allocation,
6063	 * move it from the old allocindir to the new allocindir.
6064	 */
6065	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6066		WORKLIST_REMOVE(wk);
6067		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6068			panic("allocindir_merge: extra newdirblk");
6069		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6070	}
6071	/*
6072	 * We can skip journaling for this freefrag and just complete
6073	 * any pending journal work for the allocindir that is being
6074	 * removed after the freefrag completes.
6075	 */
6076	if (freefrag->ff_jdep)
6077		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6078	LIST_REMOVE(oldaip, ai_next);
6079	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6080	    &freefrag->ff_list, &freefrag->ff_jwork);
6081	free_newblk(&oldaip->ai_block);
6082
6083	return (freefrag);
6084}
6085
6086static inline void
6087setup_freedirect(freeblks, ip, i, needj)
6088	struct freeblks *freeblks;
6089	struct inode *ip;
6090	int i;
6091	int needj;
6092{
6093	ufs2_daddr_t blkno;
6094	int frags;
6095
6096	blkno = DIP(ip, i_db[i]);
6097	if (blkno == 0)
6098		return;
6099	DIP_SET(ip, i_db[i], 0);
6100	frags = sblksize(ip->i_fs, ip->i_size, i);
6101	frags = numfrags(ip->i_fs, frags);
6102	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
6103}
6104
6105static inline void
6106setup_freeext(freeblks, ip, i, needj)
6107	struct freeblks *freeblks;
6108	struct inode *ip;
6109	int i;
6110	int needj;
6111{
6112	ufs2_daddr_t blkno;
6113	int frags;
6114
6115	blkno = ip->i_din2->di_extb[i];
6116	if (blkno == 0)
6117		return;
6118	ip->i_din2->di_extb[i] = 0;
6119	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
6120	frags = numfrags(ip->i_fs, frags);
6121	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6122}
6123
6124static inline void
6125setup_freeindir(freeblks, ip, i, lbn, needj)
6126	struct freeblks *freeblks;
6127	struct inode *ip;
6128	int i;
6129	ufs_lbn_t lbn;
6130	int needj;
6131{
6132	ufs2_daddr_t blkno;
6133
6134	blkno = DIP(ip, i_ib[i]);
6135	if (blkno == 0)
6136		return;
6137	DIP_SET(ip, i_ib[i], 0);
6138	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
6139	    0, needj);
6140}
6141
6142static inline struct freeblks *
6143newfreeblks(mp, ip)
6144	struct mount *mp;
6145	struct inode *ip;
6146{
6147	struct freeblks *freeblks;
6148
6149	freeblks = malloc(sizeof(struct freeblks),
6150		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6151	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6152	LIST_INIT(&freeblks->fb_jblkdephd);
6153	LIST_INIT(&freeblks->fb_jwork);
6154	freeblks->fb_ref = 0;
6155	freeblks->fb_cgwait = 0;
6156	freeblks->fb_state = ATTACHED;
6157	freeblks->fb_uid = ip->i_uid;
6158	freeblks->fb_inum = ip->i_number;
6159	freeblks->fb_vtype = ITOV(ip)->v_type;
6160	freeblks->fb_modrev = DIP(ip, i_modrev);
6161	freeblks->fb_devvp = ip->i_devvp;
6162	freeblks->fb_chkcnt = 0;
6163	freeblks->fb_len = 0;
6164
6165	return (freeblks);
6166}
6167
6168static void
6169trunc_indirdep(indirdep, freeblks, bp, off)
6170	struct indirdep *indirdep;
6171	struct freeblks *freeblks;
6172	struct buf *bp;
6173	int off;
6174{
6175	struct allocindir *aip, *aipn;
6176
6177	/*
6178	 * The first set of allocindirs won't be in savedbp.
6179	 */
6180	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6181		if (aip->ai_offset > off)
6182			cancel_allocindir(aip, bp, freeblks, 1);
6183	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6184		if (aip->ai_offset > off)
6185			cancel_allocindir(aip, bp, freeblks, 1);
6186	/*
6187	 * These will exist in savedbp.
6188	 */
6189	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6190		if (aip->ai_offset > off)
6191			cancel_allocindir(aip, NULL, freeblks, 0);
6192	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6193		if (aip->ai_offset > off)
6194			cancel_allocindir(aip, NULL, freeblks, 0);
6195}
6196
6197/*
6198 * Follow the chain of indirects down to lastlbn creating a freework
6199 * structure for each.  This will be used to start indir_trunc() at
6200 * the right offset and create the journal records for the parrtial
6201 * truncation.  A second step will handle the truncated dependencies.
6202 */
6203static int
6204setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6205	struct freeblks *freeblks;
6206	struct inode *ip;
6207	ufs_lbn_t lbn;
6208	ufs_lbn_t lastlbn;
6209	ufs2_daddr_t blkno;
6210{
6211	struct indirdep *indirdep;
6212	struct indirdep *indirn;
6213	struct freework *freework;
6214	struct newblk *newblk;
6215	struct mount *mp;
6216	struct buf *bp;
6217	uint8_t *start;
6218	uint8_t *end;
6219	ufs_lbn_t lbnadd;
6220	int level;
6221	int error;
6222	int off;
6223
6224
6225	freework = NULL;
6226	if (blkno == 0)
6227		return (0);
6228	mp = freeblks->fb_list.wk_mp;
6229	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6230	if ((bp->b_flags & B_CACHE) == 0) {
6231		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6232		bp->b_iocmd = BIO_READ;
6233		bp->b_flags &= ~B_INVAL;
6234		bp->b_ioflags &= ~BIO_ERROR;
6235		vfs_busy_pages(bp, 0);
6236		bp->b_iooffset = dbtob(bp->b_blkno);
6237		bstrategy(bp);
6238		curthread->td_ru.ru_inblock++;
6239		error = bufwait(bp);
6240		if (error) {
6241			brelse(bp);
6242			return (error);
6243		}
6244	}
6245	level = lbn_level(lbn);
6246	lbnadd = lbn_offset(ip->i_fs, level);
6247	/*
6248	 * Compute the offset of the last block we want to keep.  Store
6249	 * in the freework the first block we want to completely free.
6250	 */
6251	off = (lastlbn - -(lbn + level)) / lbnadd;
6252	if (off + 1 == NINDIR(ip->i_fs))
6253		goto nowork;
6254	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
6255	    0);
6256	/*
6257	 * Link the freework into the indirdep.  This will prevent any new
6258	 * allocations from proceeding until we are finished with the
6259	 * truncate and the block is written.
6260	 */
6261	ACQUIRE_LOCK(ip->i_ump);
6262	indirdep = indirdep_lookup(mp, ip, bp);
6263	if (indirdep->ir_freeblks)
6264		panic("setup_trunc_indir: indirdep already truncated.");
6265	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6266	freework->fw_indir = indirdep;
6267	/*
6268	 * Cancel any allocindirs that will not make it to disk.
6269	 * We have to do this for all copies of the indirdep that
6270	 * live on this newblk.
6271	 */
6272	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6273		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
6274		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6275			trunc_indirdep(indirn, freeblks, bp, off);
6276	} else
6277		trunc_indirdep(indirdep, freeblks, bp, off);
6278	FREE_LOCK(ip->i_ump);
6279	/*
6280	 * Creation is protected by the buf lock. The saveddata is only
6281	 * needed if a full truncation follows a partial truncation but it
6282	 * is difficult to allocate in that case so we fetch it anyway.
6283	 */
6284	if (indirdep->ir_saveddata == NULL)
6285		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6286		    M_SOFTDEP_FLAGS);
6287nowork:
6288	/* Fetch the blkno of the child and the zero start offset. */
6289	if (ip->i_ump->um_fstype == UFS1) {
6290		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6291		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6292	} else {
6293		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6294		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6295	}
6296	if (freework) {
6297		/* Zero the truncated pointers. */
6298		end = bp->b_data + bp->b_bcount;
6299		bzero(start, end - start);
6300		bdwrite(bp);
6301	} else
6302		bqrelse(bp);
6303	if (level == 0)
6304		return (0);
6305	lbn++; /* adjust level */
6306	lbn -= (off * lbnadd);
6307	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6308}
6309
6310/*
6311 * Complete the partial truncation of an indirect block setup by
6312 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6313 * copy and writes them to disk before the freeblks is allowed to complete.
6314 */
6315static void
6316complete_trunc_indir(freework)
6317	struct freework *freework;
6318{
6319	struct freework *fwn;
6320	struct indirdep *indirdep;
6321	struct ufsmount *ump;
6322	struct buf *bp;
6323	uintptr_t start;
6324	int count;
6325
6326	ump = VFSTOUFS(freework->fw_list.wk_mp);
6327	LOCK_OWNED(ump);
6328	indirdep = freework->fw_indir;
6329	for (;;) {
6330		bp = indirdep->ir_bp;
6331		/* See if the block was discarded. */
6332		if (bp == NULL)
6333			break;
6334		/* Inline part of getdirtybuf().  We dont want bremfree. */
6335		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6336			break;
6337		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6338		    LOCK_PTR(ump)) == 0)
6339			BUF_UNLOCK(bp);
6340		ACQUIRE_LOCK(ump);
6341	}
6342	freework->fw_state |= DEPCOMPLETE;
6343	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6344	/*
6345	 * Zero the pointers in the saved copy.
6346	 */
6347	if (indirdep->ir_state & UFS1FMT)
6348		start = sizeof(ufs1_daddr_t);
6349	else
6350		start = sizeof(ufs2_daddr_t);
6351	start *= freework->fw_start;
6352	count = indirdep->ir_savebp->b_bcount - start;
6353	start += (uintptr_t)indirdep->ir_savebp->b_data;
6354	bzero((char *)start, count);
6355	/*
6356	 * We need to start the next truncation in the list if it has not
6357	 * been started yet.
6358	 */
6359	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6360	if (fwn != NULL) {
6361		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6362			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6363		if ((fwn->fw_state & ONWORKLIST) == 0)
6364			freework_enqueue(fwn);
6365	}
6366	/*
6367	 * If bp is NULL the block was fully truncated, restore
6368	 * the saved block list otherwise free it if it is no
6369	 * longer needed.
6370	 */
6371	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6372		if (bp == NULL)
6373			bcopy(indirdep->ir_saveddata,
6374			    indirdep->ir_savebp->b_data,
6375			    indirdep->ir_savebp->b_bcount);
6376		free(indirdep->ir_saveddata, M_INDIRDEP);
6377		indirdep->ir_saveddata = NULL;
6378	}
6379	/*
6380	 * When bp is NULL there is a full truncation pending.  We
6381	 * must wait for this full truncation to be journaled before
6382	 * we can release this freework because the disk pointers will
6383	 * never be written as zero.
6384	 */
6385	if (bp == NULL)  {
6386		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6387			handle_written_freework(freework);
6388		else
6389			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6390			   &freework->fw_list);
6391	} else {
6392		/* Complete when the real copy is written. */
6393		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6394		BUF_UNLOCK(bp);
6395	}
6396}
6397
6398/*
6399 * Calculate the number of blocks we are going to release where datablocks
6400 * is the current total and length is the new file size.
6401 */
6402static ufs2_daddr_t
6403blkcount(fs, datablocks, length)
6404	struct fs *fs;
6405	ufs2_daddr_t datablocks;
6406	off_t length;
6407{
6408	off_t totblks, numblks;
6409
6410	totblks = 0;
6411	numblks = howmany(length, fs->fs_bsize);
6412	if (numblks <= NDADDR) {
6413		totblks = howmany(length, fs->fs_fsize);
6414		goto out;
6415	}
6416        totblks = blkstofrags(fs, numblks);
6417	numblks -= NDADDR;
6418	/*
6419	 * Count all single, then double, then triple indirects required.
6420	 * Subtracting one indirects worth of blocks for each pass
6421	 * acknowledges one of each pointed to by the inode.
6422	 */
6423	for (;;) {
6424		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6425		numblks -= NINDIR(fs);
6426		if (numblks <= 0)
6427			break;
6428		numblks = howmany(numblks, NINDIR(fs));
6429	}
6430out:
6431	totblks = fsbtodb(fs, totblks);
6432	/*
6433	 * Handle sparse files.  We can't reclaim more blocks than the inode
6434	 * references.  We will correct it later in handle_complete_freeblks()
6435	 * when we know the real count.
6436	 */
6437	if (totblks > datablocks)
6438		return (0);
6439	return (datablocks - totblks);
6440}
6441
6442/*
6443 * Handle freeblocks for journaled softupdate filesystems.
6444 *
6445 * Contrary to normal softupdates, we must preserve the block pointers in
6446 * indirects until their subordinates are free.  This is to avoid journaling
6447 * every block that is freed which may consume more space than the journal
6448 * itself.  The recovery program will see the free block journals at the
6449 * base of the truncated area and traverse them to reclaim space.  The
6450 * pointers in the inode may be cleared immediately after the journal
6451 * records are written because each direct and indirect pointer in the
6452 * inode is recorded in a journal.  This permits full truncation to proceed
6453 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6454 *
6455 * The algorithm is as follows:
6456 * 1) Traverse the in-memory state and create journal entries to release
6457 *    the relevant blocks and full indirect trees.
6458 * 2) Traverse the indirect block chain adding partial truncation freework
6459 *    records to indirects in the path to lastlbn.  The freework will
6460 *    prevent new allocation dependencies from being satisfied in this
6461 *    indirect until the truncation completes.
6462 * 3) Read and lock the inode block, performing an update with the new size
6463 *    and pointers.  This prevents truncated data from becoming valid on
6464 *    disk through step 4.
6465 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6466 *    eliminate journal work for those records that do not require it.
6467 * 5) Schedule the journal records to be written followed by the inode block.
6468 * 6) Allocate any necessary frags for the end of file.
6469 * 7) Zero any partially truncated blocks.
6470 *
6471 * From this truncation proceeds asynchronously using the freework and
6472 * indir_trunc machinery.  The file will not be extended again into a
6473 * partially truncated indirect block until all work is completed but
6474 * the normal dependency mechanism ensures that it is rolled back/forward
6475 * as appropriate.  Further truncation may occur without delay and is
6476 * serialized in indir_trunc().
6477 */
6478void
6479softdep_journal_freeblocks(ip, cred, length, flags)
6480	struct inode *ip;	/* The inode whose length is to be reduced */
6481	struct ucred *cred;
6482	off_t length;		/* The new length for the file */
6483	int flags;		/* IO_EXT and/or IO_NORMAL */
6484{
6485	struct freeblks *freeblks, *fbn;
6486	struct worklist *wk, *wkn;
6487	struct inodedep *inodedep;
6488	struct jblkdep *jblkdep;
6489	struct allocdirect *adp, *adpn;
6490	struct ufsmount *ump;
6491	struct fs *fs;
6492	struct buf *bp;
6493	struct vnode *vp;
6494	struct mount *mp;
6495	ufs2_daddr_t extblocks, datablocks;
6496	ufs_lbn_t tmpval, lbn, lastlbn;
6497	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6498
6499	fs = ip->i_fs;
6500	ump = ip->i_ump;
6501	mp = UFSTOVFS(ump);
6502	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6503	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
6504	vp = ITOV(ip);
6505	needj = 1;
6506	iboff = -1;
6507	allocblock = 0;
6508	extblocks = 0;
6509	datablocks = 0;
6510	frags = 0;
6511	freeblks = newfreeblks(mp, ip);
6512	ACQUIRE_LOCK(ump);
6513	/*
6514	 * If we're truncating a removed file that will never be written
6515	 * we don't need to journal the block frees.  The canceled journals
6516	 * for the allocations will suffice.
6517	 */
6518	dflags = DEPALLOC;
6519	if (IS_SNAPSHOT(ip))
6520		dflags |= NODELAY;
6521	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6522	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6523	    length == 0)
6524		needj = 0;
6525	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6526	    ip->i_number, length, needj);
6527	FREE_LOCK(ump);
6528	/*
6529	 * Calculate the lbn that we are truncating to.  This results in -1
6530	 * if we're truncating the 0 bytes.  So it is the last lbn we want
6531	 * to keep, not the first lbn we want to truncate.
6532	 */
6533	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6534	lastoff = blkoff(fs, length);
6535	/*
6536	 * Compute frags we are keeping in lastlbn.  0 means all.
6537	 */
6538	if (lastlbn >= 0 && lastlbn < NDADDR) {
6539		frags = fragroundup(fs, lastoff);
6540		/* adp offset of last valid allocdirect. */
6541		iboff = lastlbn;
6542	} else if (lastlbn > 0)
6543		iboff = NDADDR;
6544	if (fs->fs_magic == FS_UFS2_MAGIC)
6545		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6546	/*
6547	 * Handle normal data blocks and indirects.  This section saves
6548	 * values used after the inode update to complete frag and indirect
6549	 * truncation.
6550	 */
6551	if ((flags & IO_NORMAL) != 0) {
6552		/*
6553		 * Handle truncation of whole direct and indirect blocks.
6554		 */
6555		for (i = iboff + 1; i < NDADDR; i++)
6556			setup_freedirect(freeblks, ip, i, needj);
6557		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6558		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6559			/* Release a whole indirect tree. */
6560			if (lbn > lastlbn) {
6561				setup_freeindir(freeblks, ip, i, -lbn -i,
6562				    needj);
6563				continue;
6564			}
6565			iboff = i + NDADDR;
6566			/*
6567			 * Traverse partially truncated indirect tree.
6568			 */
6569			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6570				setup_trunc_indir(freeblks, ip, -lbn - i,
6571				    lastlbn, DIP(ip, i_ib[i]));
6572		}
6573		/*
6574		 * Handle partial truncation to a frag boundary.
6575		 */
6576		if (frags) {
6577			ufs2_daddr_t blkno;
6578			long oldfrags;
6579
6580			oldfrags = blksize(fs, ip, lastlbn);
6581			blkno = DIP(ip, i_db[lastlbn]);
6582			if (blkno && oldfrags != frags) {
6583				oldfrags -= frags;
6584				oldfrags = numfrags(ip->i_fs, oldfrags);
6585				blkno += numfrags(ip->i_fs, frags);
6586				newfreework(ump, freeblks, NULL, lastlbn,
6587				    blkno, oldfrags, 0, needj);
6588				if (needj)
6589					adjust_newfreework(freeblks,
6590					    numfrags(ip->i_fs, frags));
6591			} else if (blkno == 0)
6592				allocblock = 1;
6593		}
6594		/*
6595		 * Add a journal record for partial truncate if we are
6596		 * handling indirect blocks.  Non-indirects need no extra
6597		 * journaling.
6598		 */
6599		if (length != 0 && lastlbn >= NDADDR) {
6600			ip->i_flag |= IN_TRUNCATED;
6601			newjtrunc(freeblks, length, 0);
6602		}
6603		ip->i_size = length;
6604		DIP_SET(ip, i_size, ip->i_size);
6605		datablocks = DIP(ip, i_blocks) - extblocks;
6606		if (length != 0)
6607			datablocks = blkcount(ip->i_fs, datablocks, length);
6608		freeblks->fb_len = length;
6609	}
6610	if ((flags & IO_EXT) != 0) {
6611		for (i = 0; i < NXADDR; i++)
6612			setup_freeext(freeblks, ip, i, needj);
6613		ip->i_din2->di_extsize = 0;
6614		datablocks += extblocks;
6615	}
6616#ifdef QUOTA
6617	/* Reference the quotas in case the block count is wrong in the end. */
6618	quotaref(vp, freeblks->fb_quota);
6619	(void) chkdq(ip, -datablocks, NOCRED, 0);
6620#endif
6621	freeblks->fb_chkcnt = -datablocks;
6622	UFS_LOCK(ump);
6623	fs->fs_pendingblocks += datablocks;
6624	UFS_UNLOCK(ump);
6625	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6626	/*
6627	 * Handle truncation of incomplete alloc direct dependencies.  We
6628	 * hold the inode block locked to prevent incomplete dependencies
6629	 * from reaching the disk while we are eliminating those that
6630	 * have been truncated.  This is a partially inlined ffs_update().
6631	 */
6632	ufs_itimes(vp);
6633	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6634	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6635	    (int)fs->fs_bsize, cred, &bp);
6636	if (error) {
6637		brelse(bp);
6638		softdep_error("softdep_journal_freeblocks", error);
6639		return;
6640	}
6641	if (bp->b_bufsize == fs->fs_bsize)
6642		bp->b_flags |= B_CLUSTEROK;
6643	softdep_update_inodeblock(ip, bp, 0);
6644	if (ump->um_fstype == UFS1)
6645		*((struct ufs1_dinode *)bp->b_data +
6646		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6647	else
6648		*((struct ufs2_dinode *)bp->b_data +
6649		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6650	ACQUIRE_LOCK(ump);
6651	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6652	if ((inodedep->id_state & IOSTARTED) != 0)
6653		panic("softdep_setup_freeblocks: inode busy");
6654	/*
6655	 * Add the freeblks structure to the list of operations that
6656	 * must await the zero'ed inode being written to disk. If we
6657	 * still have a bitmap dependency (needj), then the inode
6658	 * has never been written to disk, so we can process the
6659	 * freeblks below once we have deleted the dependencies.
6660	 */
6661	if (needj)
6662		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6663	else
6664		freeblks->fb_state |= COMPLETE;
6665	if ((flags & IO_NORMAL) != 0) {
6666		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6667			if (adp->ad_offset > iboff)
6668				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6669				    freeblks);
6670			/*
6671			 * Truncate the allocdirect.  We could eliminate
6672			 * or modify journal records as well.
6673			 */
6674			else if (adp->ad_offset == iboff && frags)
6675				adp->ad_newsize = frags;
6676		}
6677	}
6678	if ((flags & IO_EXT) != 0)
6679		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6680			cancel_allocdirect(&inodedep->id_extupdt, adp,
6681			    freeblks);
6682	/*
6683	 * Scan the bufwait list for newblock dependencies that will never
6684	 * make it to disk.
6685	 */
6686	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6687		if (wk->wk_type != D_ALLOCDIRECT)
6688			continue;
6689		adp = WK_ALLOCDIRECT(wk);
6690		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6691		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6692			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6693			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6694			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6695		}
6696	}
6697	/*
6698	 * Add journal work.
6699	 */
6700	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6701		add_to_journal(&jblkdep->jb_list);
6702	FREE_LOCK(ump);
6703	bdwrite(bp);
6704	/*
6705	 * Truncate dependency structures beyond length.
6706	 */
6707	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6708	/*
6709	 * This is only set when we need to allocate a fragment because
6710	 * none existed at the end of a frag-sized file.  It handles only
6711	 * allocating a new, zero filled block.
6712	 */
6713	if (allocblock) {
6714		ip->i_size = length - lastoff;
6715		DIP_SET(ip, i_size, ip->i_size);
6716		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6717		if (error != 0) {
6718			softdep_error("softdep_journal_freeblks", error);
6719			return;
6720		}
6721		ip->i_size = length;
6722		DIP_SET(ip, i_size, length);
6723		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6724		allocbuf(bp, frags);
6725		ffs_update(vp, 0);
6726		bawrite(bp);
6727	} else if (lastoff != 0 && vp->v_type != VDIR) {
6728		int size;
6729
6730		/*
6731		 * Zero the end of a truncated frag or block.
6732		 */
6733		size = sblksize(fs, length, lastlbn);
6734		error = bread(vp, lastlbn, size, cred, &bp);
6735		if (error) {
6736			softdep_error("softdep_journal_freeblks", error);
6737			return;
6738		}
6739		bzero((char *)bp->b_data + lastoff, size - lastoff);
6740		bawrite(bp);
6741
6742	}
6743	ACQUIRE_LOCK(ump);
6744	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6745	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6746	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6747	/*
6748	 * We zero earlier truncations so they don't erroneously
6749	 * update i_blocks.
6750	 */
6751	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6752		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6753			fbn->fb_len = 0;
6754	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6755	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6756		freeblks->fb_state |= INPROGRESS;
6757	else
6758		freeblks = NULL;
6759	FREE_LOCK(ump);
6760	if (freeblks)
6761		handle_workitem_freeblocks(freeblks, 0);
6762	trunc_pages(ip, length, extblocks, flags);
6763
6764}
6765
6766/*
6767 * Flush a JOP_SYNC to the journal.
6768 */
6769void
6770softdep_journal_fsync(ip)
6771	struct inode *ip;
6772{
6773	struct jfsync *jfsync;
6774
6775	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
6776	    ("softdep_journal_fsync called on non-softdep filesystem"));
6777	if ((ip->i_flag & IN_TRUNCATED) == 0)
6778		return;
6779	ip->i_flag &= ~IN_TRUNCATED;
6780	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6781	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6782	jfsync->jfs_size = ip->i_size;
6783	jfsync->jfs_ino = ip->i_number;
6784	ACQUIRE_LOCK(ip->i_ump);
6785	add_to_journal(&jfsync->jfs_list);
6786	jwait(&jfsync->jfs_list, MNT_WAIT);
6787	FREE_LOCK(ip->i_ump);
6788}
6789
6790/*
6791 * Block de-allocation dependencies.
6792 *
6793 * When blocks are de-allocated, the on-disk pointers must be nullified before
6794 * the blocks are made available for use by other files.  (The true
6795 * requirement is that old pointers must be nullified before new on-disk
6796 * pointers are set.  We chose this slightly more stringent requirement to
6797 * reduce complexity.) Our implementation handles this dependency by updating
6798 * the inode (or indirect block) appropriately but delaying the actual block
6799 * de-allocation (i.e., freemap and free space count manipulation) until
6800 * after the updated versions reach stable storage.  After the disk is
6801 * updated, the blocks can be safely de-allocated whenever it is convenient.
6802 * This implementation handles only the common case of reducing a file's
6803 * length to zero. Other cases are handled by the conventional synchronous
6804 * write approach.
6805 *
6806 * The ffs implementation with which we worked double-checks
6807 * the state of the block pointers and file size as it reduces
6808 * a file's length.  Some of this code is replicated here in our
6809 * soft updates implementation.  The freeblks->fb_chkcnt field is
6810 * used to transfer a part of this information to the procedure
6811 * that eventually de-allocates the blocks.
6812 *
6813 * This routine should be called from the routine that shortens
6814 * a file's length, before the inode's size or block pointers
6815 * are modified. It will save the block pointer information for
6816 * later release and zero the inode so that the calling routine
6817 * can release it.
6818 */
6819void
6820softdep_setup_freeblocks(ip, length, flags)
6821	struct inode *ip;	/* The inode whose length is to be reduced */
6822	off_t length;		/* The new length for the file */
6823	int flags;		/* IO_EXT and/or IO_NORMAL */
6824{
6825	struct ufs1_dinode *dp1;
6826	struct ufs2_dinode *dp2;
6827	struct freeblks *freeblks;
6828	struct inodedep *inodedep;
6829	struct allocdirect *adp;
6830	struct ufsmount *ump;
6831	struct buf *bp;
6832	struct fs *fs;
6833	ufs2_daddr_t extblocks, datablocks;
6834	struct mount *mp;
6835	int i, delay, error, dflags;
6836	ufs_lbn_t tmpval;
6837	ufs_lbn_t lbn;
6838
6839	ump = ip->i_ump;
6840	mp = UFSTOVFS(ump);
6841	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6842	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
6843	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6844	    ip->i_number, length);
6845	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
6846	fs = ip->i_fs;
6847	freeblks = newfreeblks(mp, ip);
6848	extblocks = 0;
6849	datablocks = 0;
6850	if (fs->fs_magic == FS_UFS2_MAGIC)
6851		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6852	if ((flags & IO_NORMAL) != 0) {
6853		for (i = 0; i < NDADDR; i++)
6854			setup_freedirect(freeblks, ip, i, 0);
6855		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6856		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6857			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6858		ip->i_size = 0;
6859		DIP_SET(ip, i_size, 0);
6860		datablocks = DIP(ip, i_blocks) - extblocks;
6861	}
6862	if ((flags & IO_EXT) != 0) {
6863		for (i = 0; i < NXADDR; i++)
6864			setup_freeext(freeblks, ip, i, 0);
6865		ip->i_din2->di_extsize = 0;
6866		datablocks += extblocks;
6867	}
6868#ifdef QUOTA
6869	/* Reference the quotas in case the block count is wrong in the end. */
6870	quotaref(ITOV(ip), freeblks->fb_quota);
6871	(void) chkdq(ip, -datablocks, NOCRED, 0);
6872#endif
6873	freeblks->fb_chkcnt = -datablocks;
6874	UFS_LOCK(ump);
6875	fs->fs_pendingblocks += datablocks;
6876	UFS_UNLOCK(ump);
6877	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6878	/*
6879	 * Push the zero'ed inode to to its disk buffer so that we are free
6880	 * to delete its dependencies below. Once the dependencies are gone
6881	 * the buffer can be safely released.
6882	 */
6883	if ((error = bread(ip->i_devvp,
6884	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6885	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6886		brelse(bp);
6887		softdep_error("softdep_setup_freeblocks", error);
6888	}
6889	if (ump->um_fstype == UFS1) {
6890		dp1 = ((struct ufs1_dinode *)bp->b_data +
6891		    ino_to_fsbo(fs, ip->i_number));
6892		ip->i_din1->di_freelink = dp1->di_freelink;
6893		*dp1 = *ip->i_din1;
6894	} else {
6895		dp2 = ((struct ufs2_dinode *)bp->b_data +
6896		    ino_to_fsbo(fs, ip->i_number));
6897		ip->i_din2->di_freelink = dp2->di_freelink;
6898		*dp2 = *ip->i_din2;
6899	}
6900	/*
6901	 * Find and eliminate any inode dependencies.
6902	 */
6903	ACQUIRE_LOCK(ump);
6904	dflags = DEPALLOC;
6905	if (IS_SNAPSHOT(ip))
6906		dflags |= NODELAY;
6907	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6908	if ((inodedep->id_state & IOSTARTED) != 0)
6909		panic("softdep_setup_freeblocks: inode busy");
6910	/*
6911	 * Add the freeblks structure to the list of operations that
6912	 * must await the zero'ed inode being written to disk. If we
6913	 * still have a bitmap dependency (delay == 0), then the inode
6914	 * has never been written to disk, so we can process the
6915	 * freeblks below once we have deleted the dependencies.
6916	 */
6917	delay = (inodedep->id_state & DEPCOMPLETE);
6918	if (delay)
6919		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6920	else
6921		freeblks->fb_state |= COMPLETE;
6922	/*
6923	 * Because the file length has been truncated to zero, any
6924	 * pending block allocation dependency structures associated
6925	 * with this inode are obsolete and can simply be de-allocated.
6926	 * We must first merge the two dependency lists to get rid of
6927	 * any duplicate freefrag structures, then purge the merged list.
6928	 * If we still have a bitmap dependency, then the inode has never
6929	 * been written to disk, so we can free any fragments without delay.
6930	 */
6931	if (flags & IO_NORMAL) {
6932		merge_inode_lists(&inodedep->id_newinoupdt,
6933		    &inodedep->id_inoupdt);
6934		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6935			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6936			    freeblks);
6937	}
6938	if (flags & IO_EXT) {
6939		merge_inode_lists(&inodedep->id_newextupdt,
6940		    &inodedep->id_extupdt);
6941		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6942			cancel_allocdirect(&inodedep->id_extupdt, adp,
6943			    freeblks);
6944	}
6945	FREE_LOCK(ump);
6946	bdwrite(bp);
6947	trunc_dependencies(ip, freeblks, -1, 0, flags);
6948	ACQUIRE_LOCK(ump);
6949	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6950		(void) free_inodedep(inodedep);
6951	freeblks->fb_state |= DEPCOMPLETE;
6952	/*
6953	 * If the inode with zeroed block pointers is now on disk
6954	 * we can start freeing blocks.
6955	 */
6956	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6957		freeblks->fb_state |= INPROGRESS;
6958	else
6959		freeblks = NULL;
6960	FREE_LOCK(ump);
6961	if (freeblks)
6962		handle_workitem_freeblocks(freeblks, 0);
6963	trunc_pages(ip, length, extblocks, flags);
6964}
6965
6966/*
6967 * Eliminate pages from the page cache that back parts of this inode and
6968 * adjust the vnode pager's idea of our size.  This prevents stale data
6969 * from hanging around in the page cache.
6970 */
6971static void
6972trunc_pages(ip, length, extblocks, flags)
6973	struct inode *ip;
6974	off_t length;
6975	ufs2_daddr_t extblocks;
6976	int flags;
6977{
6978	struct vnode *vp;
6979	struct fs *fs;
6980	ufs_lbn_t lbn;
6981	off_t end, extend;
6982
6983	vp = ITOV(ip);
6984	fs = ip->i_fs;
6985	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6986	if ((flags & IO_EXT) != 0)
6987		vn_pages_remove(vp, extend, 0);
6988	if ((flags & IO_NORMAL) == 0)
6989		return;
6990	BO_LOCK(&vp->v_bufobj);
6991	drain_output(vp);
6992	BO_UNLOCK(&vp->v_bufobj);
6993	/*
6994	 * The vnode pager eliminates file pages we eliminate indirects
6995	 * below.
6996	 */
6997	vnode_pager_setsize(vp, length);
6998	/*
6999	 * Calculate the end based on the last indirect we want to keep.  If
7000	 * the block extends into indirects we can just use the negative of
7001	 * its lbn.  Doubles and triples exist at lower numbers so we must
7002	 * be careful not to remove those, if they exist.  double and triple
7003	 * indirect lbns do not overlap with others so it is not important
7004	 * to verify how many levels are required.
7005	 */
7006	lbn = lblkno(fs, length);
7007	if (lbn >= NDADDR) {
7008		/* Calculate the virtual lbn of the triple indirect. */
7009		lbn = -lbn - (NIADDR - 1);
7010		end = OFF_TO_IDX(lblktosize(fs, lbn));
7011	} else
7012		end = extend;
7013	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7014}
7015
7016/*
7017 * See if the buf bp is in the range eliminated by truncation.
7018 */
7019static int
7020trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
7021	struct buf *bp;
7022	int *blkoffp;
7023	ufs_lbn_t lastlbn;
7024	int lastoff;
7025	int flags;
7026{
7027	ufs_lbn_t lbn;
7028
7029	*blkoffp = 0;
7030	/* Only match ext/normal blocks as appropriate. */
7031	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7032	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7033		return (0);
7034	/* ALTDATA is always a full truncation. */
7035	if ((bp->b_xflags & BX_ALTDATA) != 0)
7036		return (1);
7037	/* -1 is full truncation. */
7038	if (lastlbn == -1)
7039		return (1);
7040	/*
7041	 * If this is a partial truncate we only want those
7042	 * blocks and indirect blocks that cover the range
7043	 * we're after.
7044	 */
7045	lbn = bp->b_lblkno;
7046	if (lbn < 0)
7047		lbn = -(lbn + lbn_level(lbn));
7048	if (lbn < lastlbn)
7049		return (0);
7050	/* Here we only truncate lblkno if it's partial. */
7051	if (lbn == lastlbn) {
7052		if (lastoff == 0)
7053			return (0);
7054		*blkoffp = lastoff;
7055	}
7056	return (1);
7057}
7058
7059/*
7060 * Eliminate any dependencies that exist in memory beyond lblkno:off
7061 */
7062static void
7063trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
7064	struct inode *ip;
7065	struct freeblks *freeblks;
7066	ufs_lbn_t lastlbn;
7067	int lastoff;
7068	int flags;
7069{
7070	struct bufobj *bo;
7071	struct vnode *vp;
7072	struct buf *bp;
7073	struct fs *fs;
7074	int blkoff;
7075
7076	/*
7077	 * We must wait for any I/O in progress to finish so that
7078	 * all potential buffers on the dirty list will be visible.
7079	 * Once they are all there, walk the list and get rid of
7080	 * any dependencies.
7081	 */
7082	fs = ip->i_fs;
7083	vp = ITOV(ip);
7084	bo = &vp->v_bufobj;
7085	BO_LOCK(bo);
7086	drain_output(vp);
7087	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
7088		bp->b_vflags &= ~BV_SCANNED;
7089restart:
7090	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
7091		if (bp->b_vflags & BV_SCANNED)
7092			continue;
7093		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7094			bp->b_vflags |= BV_SCANNED;
7095			continue;
7096		}
7097		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
7098		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
7099			goto restart;
7100		BO_UNLOCK(bo);
7101		if (deallocate_dependencies(bp, freeblks, blkoff))
7102			bqrelse(bp);
7103		else
7104			brelse(bp);
7105		BO_LOCK(bo);
7106		goto restart;
7107	}
7108	/*
7109	 * Now do the work of vtruncbuf while also matching indirect blocks.
7110	 */
7111	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
7112		bp->b_vflags &= ~BV_SCANNED;
7113cleanrestart:
7114	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
7115		if (bp->b_vflags & BV_SCANNED)
7116			continue;
7117		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7118			bp->b_vflags |= BV_SCANNED;
7119			continue;
7120		}
7121		if (BUF_LOCK(bp,
7122		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
7123		    BO_LOCKPTR(bo)) == ENOLCK) {
7124			BO_LOCK(bo);
7125			goto cleanrestart;
7126		}
7127		bp->b_vflags |= BV_SCANNED;
7128		bremfree(bp);
7129		if (blkoff != 0) {
7130			allocbuf(bp, blkoff);
7131			bqrelse(bp);
7132		} else {
7133			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
7134			brelse(bp);
7135		}
7136		BO_LOCK(bo);
7137		goto cleanrestart;
7138	}
7139	drain_output(vp);
7140	BO_UNLOCK(bo);
7141}
7142
7143static int
7144cancel_pagedep(pagedep, freeblks, blkoff)
7145	struct pagedep *pagedep;
7146	struct freeblks *freeblks;
7147	int blkoff;
7148{
7149	struct jremref *jremref;
7150	struct jmvref *jmvref;
7151	struct dirrem *dirrem, *tmp;
7152	int i;
7153
7154	/*
7155	 * Copy any directory remove dependencies to the list
7156	 * to be processed after the freeblks proceeds.  If
7157	 * directory entry never made it to disk they
7158	 * can be dumped directly onto the work list.
7159	 */
7160	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7161		/* Skip this directory removal if it is intended to remain. */
7162		if (dirrem->dm_offset < blkoff)
7163			continue;
7164		/*
7165		 * If there are any dirrems we wait for the journal write
7166		 * to complete and then restart the buf scan as the lock
7167		 * has been dropped.
7168		 */
7169		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7170			jwait(&jremref->jr_list, MNT_WAIT);
7171			return (ERESTART);
7172		}
7173		LIST_REMOVE(dirrem, dm_next);
7174		dirrem->dm_dirinum = pagedep->pd_ino;
7175		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7176	}
7177	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7178		jwait(&jmvref->jm_list, MNT_WAIT);
7179		return (ERESTART);
7180	}
7181	/*
7182	 * When we're partially truncating a pagedep we just want to flush
7183	 * journal entries and return.  There can not be any adds in the
7184	 * truncated portion of the directory and newblk must remain if
7185	 * part of the block remains.
7186	 */
7187	if (blkoff != 0) {
7188		struct diradd *dap;
7189
7190		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7191			if (dap->da_offset > blkoff)
7192				panic("cancel_pagedep: diradd %p off %d > %d",
7193				    dap, dap->da_offset, blkoff);
7194		for (i = 0; i < DAHASHSZ; i++)
7195			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7196				if (dap->da_offset > blkoff)
7197					panic("cancel_pagedep: diradd %p off %d > %d",
7198					    dap, dap->da_offset, blkoff);
7199		return (0);
7200	}
7201	/*
7202	 * There should be no directory add dependencies present
7203	 * as the directory could not be truncated until all
7204	 * children were removed.
7205	 */
7206	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7207	    ("deallocate_dependencies: pendinghd != NULL"));
7208	for (i = 0; i < DAHASHSZ; i++)
7209		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7210		    ("deallocate_dependencies: diraddhd != NULL"));
7211	if ((pagedep->pd_state & NEWBLOCK) != 0)
7212		free_newdirblk(pagedep->pd_newdirblk);
7213	if (free_pagedep(pagedep) == 0)
7214		panic("Failed to free pagedep %p", pagedep);
7215	return (0);
7216}
7217
7218/*
7219 * Reclaim any dependency structures from a buffer that is about to
7220 * be reallocated to a new vnode. The buffer must be locked, thus,
7221 * no I/O completion operations can occur while we are manipulating
7222 * its associated dependencies. The mutex is held so that other I/O's
7223 * associated with related dependencies do not occur.
7224 */
7225static int
7226deallocate_dependencies(bp, freeblks, off)
7227	struct buf *bp;
7228	struct freeblks *freeblks;
7229	int off;
7230{
7231	struct indirdep *indirdep;
7232	struct pagedep *pagedep;
7233	struct allocdirect *adp;
7234	struct worklist *wk, *wkn;
7235	struct ufsmount *ump;
7236
7237	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
7238		goto done;
7239	ump = VFSTOUFS(wk->wk_mp);
7240	ACQUIRE_LOCK(ump);
7241	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7242		switch (wk->wk_type) {
7243		case D_INDIRDEP:
7244			indirdep = WK_INDIRDEP(wk);
7245			if (bp->b_lblkno >= 0 ||
7246			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7247				panic("deallocate_dependencies: not indir");
7248			cancel_indirdep(indirdep, bp, freeblks);
7249			continue;
7250
7251		case D_PAGEDEP:
7252			pagedep = WK_PAGEDEP(wk);
7253			if (cancel_pagedep(pagedep, freeblks, off)) {
7254				FREE_LOCK(ump);
7255				return (ERESTART);
7256			}
7257			continue;
7258
7259		case D_ALLOCINDIR:
7260			/*
7261			 * Simply remove the allocindir, we'll find it via
7262			 * the indirdep where we can clear pointers if
7263			 * needed.
7264			 */
7265			WORKLIST_REMOVE(wk);
7266			continue;
7267
7268		case D_FREEWORK:
7269			/*
7270			 * A truncation is waiting for the zero'd pointers
7271			 * to be written.  It can be freed when the freeblks
7272			 * is journaled.
7273			 */
7274			WORKLIST_REMOVE(wk);
7275			wk->wk_state |= ONDEPLIST;
7276			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7277			break;
7278
7279		case D_ALLOCDIRECT:
7280			adp = WK_ALLOCDIRECT(wk);
7281			if (off != 0)
7282				continue;
7283			/* FALLTHROUGH */
7284		default:
7285			panic("deallocate_dependencies: Unexpected type %s",
7286			    TYPENAME(wk->wk_type));
7287			/* NOTREACHED */
7288		}
7289	}
7290	FREE_LOCK(ump);
7291done:
7292	/*
7293	 * Don't throw away this buf, we were partially truncating and
7294	 * some deps may always remain.
7295	 */
7296	if (off) {
7297		allocbuf(bp, off);
7298		bp->b_vflags |= BV_SCANNED;
7299		return (EBUSY);
7300	}
7301	bp->b_flags |= B_INVAL | B_NOCACHE;
7302
7303	return (0);
7304}
7305
7306/*
7307 * An allocdirect is being canceled due to a truncate.  We must make sure
7308 * the journal entry is released in concert with the blkfree that releases
7309 * the storage.  Completed journal entries must not be released until the
7310 * space is no longer pointed to by the inode or in the bitmap.
7311 */
7312static void
7313cancel_allocdirect(adphead, adp, freeblks)
7314	struct allocdirectlst *adphead;
7315	struct allocdirect *adp;
7316	struct freeblks *freeblks;
7317{
7318	struct freework *freework;
7319	struct newblk *newblk;
7320	struct worklist *wk;
7321
7322	TAILQ_REMOVE(adphead, adp, ad_next);
7323	newblk = (struct newblk *)adp;
7324	freework = NULL;
7325	/*
7326	 * Find the correct freework structure.
7327	 */
7328	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7329		if (wk->wk_type != D_FREEWORK)
7330			continue;
7331		freework = WK_FREEWORK(wk);
7332		if (freework->fw_blkno == newblk->nb_newblkno)
7333			break;
7334	}
7335	if (freework == NULL)
7336		panic("cancel_allocdirect: Freework not found");
7337	/*
7338	 * If a newblk exists at all we still have the journal entry that
7339	 * initiated the allocation so we do not need to journal the free.
7340	 */
7341	cancel_jfreeblk(freeblks, freework->fw_blkno);
7342	/*
7343	 * If the journal hasn't been written the jnewblk must be passed
7344	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7345	 * this by linking the journal dependency into the freework to be
7346	 * freed when freework_freeblock() is called.  If the journal has
7347	 * been written we can simply reclaim the journal space when the
7348	 * freeblks work is complete.
7349	 */
7350	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7351	    &freeblks->fb_jwork);
7352	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7353}
7354
7355
7356/*
7357 * Cancel a new block allocation.  May be an indirect or direct block.  We
7358 * remove it from various lists and return any journal record that needs to
7359 * be resolved by the caller.
7360 *
7361 * A special consideration is made for indirects which were never pointed
7362 * at on disk and will never be found once this block is released.
7363 */
7364static struct jnewblk *
7365cancel_newblk(newblk, wk, wkhd)
7366	struct newblk *newblk;
7367	struct worklist *wk;
7368	struct workhead *wkhd;
7369{
7370	struct jnewblk *jnewblk;
7371
7372	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7373
7374	newblk->nb_state |= GOINGAWAY;
7375	/*
7376	 * Previously we traversed the completedhd on each indirdep
7377	 * attached to this newblk to cancel them and gather journal
7378	 * work.  Since we need only the oldest journal segment and
7379	 * the lowest point on the tree will always have the oldest
7380	 * journal segment we are free to release the segments
7381	 * of any subordinates and may leave the indirdep list to
7382	 * indirdep_complete() when this newblk is freed.
7383	 */
7384	if (newblk->nb_state & ONDEPLIST) {
7385		newblk->nb_state &= ~ONDEPLIST;
7386		LIST_REMOVE(newblk, nb_deps);
7387	}
7388	if (newblk->nb_state & ONWORKLIST)
7389		WORKLIST_REMOVE(&newblk->nb_list);
7390	/*
7391	 * If the journal entry hasn't been written we save a pointer to
7392	 * the dependency that frees it until it is written or the
7393	 * superseding operation completes.
7394	 */
7395	jnewblk = newblk->nb_jnewblk;
7396	if (jnewblk != NULL && wk != NULL) {
7397		newblk->nb_jnewblk = NULL;
7398		jnewblk->jn_dep = wk;
7399	}
7400	if (!LIST_EMPTY(&newblk->nb_jwork))
7401		jwork_move(wkhd, &newblk->nb_jwork);
7402	/*
7403	 * When truncating we must free the newdirblk early to remove
7404	 * the pagedep from the hash before returning.
7405	 */
7406	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7407		free_newdirblk(WK_NEWDIRBLK(wk));
7408	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7409		panic("cancel_newblk: extra newdirblk");
7410
7411	return (jnewblk);
7412}
7413
7414/*
7415 * Schedule the freefrag associated with a newblk to be released once
7416 * the pointers are written and the previous block is no longer needed.
7417 */
7418static void
7419newblk_freefrag(newblk)
7420	struct newblk *newblk;
7421{
7422	struct freefrag *freefrag;
7423
7424	if (newblk->nb_freefrag == NULL)
7425		return;
7426	freefrag = newblk->nb_freefrag;
7427	newblk->nb_freefrag = NULL;
7428	freefrag->ff_state |= COMPLETE;
7429	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7430		add_to_worklist(&freefrag->ff_list, 0);
7431}
7432
7433/*
7434 * Free a newblk. Generate a new freefrag work request if appropriate.
7435 * This must be called after the inode pointer and any direct block pointers
7436 * are valid or fully removed via truncate or frag extension.
7437 */
7438static void
7439free_newblk(newblk)
7440	struct newblk *newblk;
7441{
7442	struct indirdep *indirdep;
7443	struct worklist *wk;
7444
7445	KASSERT(newblk->nb_jnewblk == NULL,
7446	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7447	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7448	    ("free_newblk: unclaimed newblk"));
7449	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7450	newblk_freefrag(newblk);
7451	if (newblk->nb_state & ONDEPLIST)
7452		LIST_REMOVE(newblk, nb_deps);
7453	if (newblk->nb_state & ONWORKLIST)
7454		WORKLIST_REMOVE(&newblk->nb_list);
7455	LIST_REMOVE(newblk, nb_hash);
7456	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7457		free_newdirblk(WK_NEWDIRBLK(wk));
7458	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7459		panic("free_newblk: extra newdirblk");
7460	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7461		indirdep_complete(indirdep);
7462	handle_jwork(&newblk->nb_jwork);
7463	WORKITEM_FREE(newblk, D_NEWBLK);
7464}
7465
7466/*
7467 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7468 * This routine must be called with splbio interrupts blocked.
7469 */
7470static void
7471free_newdirblk(newdirblk)
7472	struct newdirblk *newdirblk;
7473{
7474	struct pagedep *pagedep;
7475	struct diradd *dap;
7476	struct worklist *wk;
7477
7478	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7479	WORKLIST_REMOVE(&newdirblk->db_list);
7480	/*
7481	 * If the pagedep is still linked onto the directory buffer
7482	 * dependency chain, then some of the entries on the
7483	 * pd_pendinghd list may not be committed to disk yet. In
7484	 * this case, we will simply clear the NEWBLOCK flag and
7485	 * let the pd_pendinghd list be processed when the pagedep
7486	 * is next written. If the pagedep is no longer on the buffer
7487	 * dependency chain, then all the entries on the pd_pending
7488	 * list are committed to disk and we can free them here.
7489	 */
7490	pagedep = newdirblk->db_pagedep;
7491	pagedep->pd_state &= ~NEWBLOCK;
7492	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7493		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7494			free_diradd(dap, NULL);
7495		/*
7496		 * If no dependencies remain, the pagedep will be freed.
7497		 */
7498		free_pagedep(pagedep);
7499	}
7500	/* Should only ever be one item in the list. */
7501	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7502		WORKLIST_REMOVE(wk);
7503		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7504	}
7505	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7506}
7507
7508/*
7509 * Prepare an inode to be freed. The actual free operation is not
7510 * done until the zero'ed inode has been written to disk.
7511 */
7512void
7513softdep_freefile(pvp, ino, mode)
7514	struct vnode *pvp;
7515	ino_t ino;
7516	int mode;
7517{
7518	struct inode *ip = VTOI(pvp);
7519	struct inodedep *inodedep;
7520	struct freefile *freefile;
7521	struct freeblks *freeblks;
7522	struct ufsmount *ump;
7523
7524	ump = ip->i_ump;
7525	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7526	    ("softdep_freefile called on non-softdep filesystem"));
7527	/*
7528	 * This sets up the inode de-allocation dependency.
7529	 */
7530	freefile = malloc(sizeof(struct freefile),
7531		M_FREEFILE, M_SOFTDEP_FLAGS);
7532	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7533	freefile->fx_mode = mode;
7534	freefile->fx_oldinum = ino;
7535	freefile->fx_devvp = ip->i_devvp;
7536	LIST_INIT(&freefile->fx_jwork);
7537	UFS_LOCK(ump);
7538	ip->i_fs->fs_pendinginodes += 1;
7539	UFS_UNLOCK(ump);
7540
7541	/*
7542	 * If the inodedep does not exist, then the zero'ed inode has
7543	 * been written to disk. If the allocated inode has never been
7544	 * written to disk, then the on-disk inode is zero'ed. In either
7545	 * case we can free the file immediately.  If the journal was
7546	 * canceled before being written the inode will never make it to
7547	 * disk and we must send the canceled journal entrys to
7548	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7549	 * Any blocks waiting on the inode to write can be safely freed
7550	 * here as it will never been written.
7551	 */
7552	ACQUIRE_LOCK(ump);
7553	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7554	if (inodedep) {
7555		/*
7556		 * Clear out freeblks that no longer need to reference
7557		 * this inode.
7558		 */
7559		while ((freeblks =
7560		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7561			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7562			    fb_next);
7563			freeblks->fb_state &= ~ONDEPLIST;
7564		}
7565		/*
7566		 * Remove this inode from the unlinked list.
7567		 */
7568		if (inodedep->id_state & UNLINKED) {
7569			/*
7570			 * Save the journal work to be freed with the bitmap
7571			 * before we clear UNLINKED.  Otherwise it can be lost
7572			 * if the inode block is written.
7573			 */
7574			handle_bufwait(inodedep, &freefile->fx_jwork);
7575			clear_unlinked_inodedep(inodedep);
7576			/*
7577			 * Re-acquire inodedep as we've dropped the
7578			 * per-filesystem lock in clear_unlinked_inodedep().
7579			 */
7580			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7581		}
7582	}
7583	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7584		FREE_LOCK(ump);
7585		handle_workitem_freefile(freefile);
7586		return;
7587	}
7588	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7589		inodedep->id_state |= GOINGAWAY;
7590	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7591	FREE_LOCK(ump);
7592	if (ip->i_number == ino)
7593		ip->i_flag |= IN_MODIFIED;
7594}
7595
7596/*
7597 * Check to see if an inode has never been written to disk. If
7598 * so free the inodedep and return success, otherwise return failure.
7599 * This routine must be called with splbio interrupts blocked.
7600 *
7601 * If we still have a bitmap dependency, then the inode has never
7602 * been written to disk. Drop the dependency as it is no longer
7603 * necessary since the inode is being deallocated. We set the
7604 * ALLCOMPLETE flags since the bitmap now properly shows that the
7605 * inode is not allocated. Even if the inode is actively being
7606 * written, it has been rolled back to its zero'ed state, so we
7607 * are ensured that a zero inode is what is on the disk. For short
7608 * lived files, this change will usually result in removing all the
7609 * dependencies from the inode so that it can be freed immediately.
7610 */
7611static int
7612check_inode_unwritten(inodedep)
7613	struct inodedep *inodedep;
7614{
7615
7616	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7617
7618	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7619	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7620	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7621	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7622	    !LIST_EMPTY(&inodedep->id_inowait) ||
7623	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7624	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7625	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7626	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7627	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7628	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7629	    inodedep->id_mkdiradd != NULL ||
7630	    inodedep->id_nlinkdelta != 0)
7631		return (0);
7632	/*
7633	 * Another process might be in initiate_write_inodeblock_ufs[12]
7634	 * trying to allocate memory without holding "Softdep Lock".
7635	 */
7636	if ((inodedep->id_state & IOSTARTED) != 0 &&
7637	    inodedep->id_savedino1 == NULL)
7638		return (0);
7639
7640	if (inodedep->id_state & ONDEPLIST)
7641		LIST_REMOVE(inodedep, id_deps);
7642	inodedep->id_state &= ~ONDEPLIST;
7643	inodedep->id_state |= ALLCOMPLETE;
7644	inodedep->id_bmsafemap = NULL;
7645	if (inodedep->id_state & ONWORKLIST)
7646		WORKLIST_REMOVE(&inodedep->id_list);
7647	if (inodedep->id_savedino1 != NULL) {
7648		free(inodedep->id_savedino1, M_SAVEDINO);
7649		inodedep->id_savedino1 = NULL;
7650	}
7651	if (free_inodedep(inodedep) == 0)
7652		panic("check_inode_unwritten: busy inode");
7653	return (1);
7654}
7655
7656static int
7657check_inodedep_free(inodedep)
7658	struct inodedep *inodedep;
7659{
7660
7661	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7662	if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7663	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7664	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7665	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7666	    !LIST_EMPTY(&inodedep->id_inowait) ||
7667	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7668	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7669	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7670	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7671	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7672	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7673	    inodedep->id_mkdiradd != NULL ||
7674	    inodedep->id_nlinkdelta != 0 ||
7675	    inodedep->id_savedino1 != NULL)
7676		return (0);
7677	return (1);
7678}
7679
7680/*
7681 * Try to free an inodedep structure. Return 1 if it could be freed.
7682 */
7683static int
7684free_inodedep(inodedep)
7685	struct inodedep *inodedep;
7686{
7687
7688	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7689	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7690	    !check_inodedep_free(inodedep))
7691		return (0);
7692	if (inodedep->id_state & ONDEPLIST)
7693		LIST_REMOVE(inodedep, id_deps);
7694	LIST_REMOVE(inodedep, id_hash);
7695	WORKITEM_FREE(inodedep, D_INODEDEP);
7696	return (1);
7697}
7698
7699/*
7700 * Free the block referenced by a freework structure.  The parent freeblks
7701 * structure is released and completed when the final cg bitmap reaches
7702 * the disk.  This routine may be freeing a jnewblk which never made it to
7703 * disk in which case we do not have to wait as the operation is undone
7704 * in memory immediately.
7705 */
7706static void
7707freework_freeblock(freework)
7708	struct freework *freework;
7709{
7710	struct freeblks *freeblks;
7711	struct jnewblk *jnewblk;
7712	struct ufsmount *ump;
7713	struct workhead wkhd;
7714	struct fs *fs;
7715	int bsize;
7716	int needj;
7717
7718	ump = VFSTOUFS(freework->fw_list.wk_mp);
7719	LOCK_OWNED(ump);
7720	/*
7721	 * Handle partial truncate separately.
7722	 */
7723	if (freework->fw_indir) {
7724		complete_trunc_indir(freework);
7725		return;
7726	}
7727	freeblks = freework->fw_freeblks;
7728	fs = ump->um_fs;
7729	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7730	bsize = lfragtosize(fs, freework->fw_frags);
7731	LIST_INIT(&wkhd);
7732	/*
7733	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7734	 * on the indirblk hashtable and prevents premature freeing.
7735	 */
7736	freework->fw_state |= DEPCOMPLETE;
7737	/*
7738	 * SUJ needs to wait for the segment referencing freed indirect
7739	 * blocks to expire so that we know the checker will not confuse
7740	 * a re-allocated indirect block with its old contents.
7741	 */
7742	if (needj && freework->fw_lbn <= -NDADDR)
7743		indirblk_insert(freework);
7744	/*
7745	 * If we are canceling an existing jnewblk pass it to the free
7746	 * routine, otherwise pass the freeblk which will ultimately
7747	 * release the freeblks.  If we're not journaling, we can just
7748	 * free the freeblks immediately.
7749	 */
7750	jnewblk = freework->fw_jnewblk;
7751	if (jnewblk != NULL) {
7752		cancel_jnewblk(jnewblk, &wkhd);
7753		needj = 0;
7754	} else if (needj) {
7755		freework->fw_state |= DELAYEDFREE;
7756		freeblks->fb_cgwait++;
7757		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7758	}
7759	FREE_LOCK(ump);
7760	freeblks_free(ump, freeblks, btodb(bsize));
7761	CTR4(KTR_SUJ,
7762	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
7763	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7764	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7765	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7766	ACQUIRE_LOCK(ump);
7767	/*
7768	 * The jnewblk will be discarded and the bits in the map never
7769	 * made it to disk.  We can immediately free the freeblk.
7770	 */
7771	if (needj == 0)
7772		handle_written_freework(freework);
7773}
7774
7775/*
7776 * We enqueue freework items that need processing back on the freeblks and
7777 * add the freeblks to the worklist.  This makes it easier to find all work
7778 * required to flush a truncation in process_truncates().
7779 */
7780static void
7781freework_enqueue(freework)
7782	struct freework *freework;
7783{
7784	struct freeblks *freeblks;
7785
7786	freeblks = freework->fw_freeblks;
7787	if ((freework->fw_state & INPROGRESS) == 0)
7788		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7789	if ((freeblks->fb_state &
7790	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7791	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7792		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7793}
7794
7795/*
7796 * Start, continue, or finish the process of freeing an indirect block tree.
7797 * The free operation may be paused at any point with fw_off containing the
7798 * offset to restart from.  This enables us to implement some flow control
7799 * for large truncates which may fan out and generate a huge number of
7800 * dependencies.
7801 */
7802static void
7803handle_workitem_indirblk(freework)
7804	struct freework *freework;
7805{
7806	struct freeblks *freeblks;
7807	struct ufsmount *ump;
7808	struct fs *fs;
7809
7810	freeblks = freework->fw_freeblks;
7811	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7812	fs = ump->um_fs;
7813	if (freework->fw_state & DEPCOMPLETE) {
7814		handle_written_freework(freework);
7815		return;
7816	}
7817	if (freework->fw_off == NINDIR(fs)) {
7818		freework_freeblock(freework);
7819		return;
7820	}
7821	freework->fw_state |= INPROGRESS;
7822	FREE_LOCK(ump);
7823	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7824	    freework->fw_lbn);
7825	ACQUIRE_LOCK(ump);
7826}
7827
7828/*
7829 * Called when a freework structure attached to a cg buf is written.  The
7830 * ref on either the parent or the freeblks structure is released and
7831 * the freeblks is added back to the worklist if there is more work to do.
7832 */
7833static void
7834handle_written_freework(freework)
7835	struct freework *freework;
7836{
7837	struct freeblks *freeblks;
7838	struct freework *parent;
7839
7840	freeblks = freework->fw_freeblks;
7841	parent = freework->fw_parent;
7842	if (freework->fw_state & DELAYEDFREE)
7843		freeblks->fb_cgwait--;
7844	freework->fw_state |= COMPLETE;
7845	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7846		WORKITEM_FREE(freework, D_FREEWORK);
7847	if (parent) {
7848		if (--parent->fw_ref == 0)
7849			freework_enqueue(parent);
7850		return;
7851	}
7852	if (--freeblks->fb_ref != 0)
7853		return;
7854	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7855	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7856		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7857}
7858
7859/*
7860 * This workitem routine performs the block de-allocation.
7861 * The workitem is added to the pending list after the updated
7862 * inode block has been written to disk.  As mentioned above,
7863 * checks regarding the number of blocks de-allocated (compared
7864 * to the number of blocks allocated for the file) are also
7865 * performed in this function.
7866 */
7867static int
7868handle_workitem_freeblocks(freeblks, flags)
7869	struct freeblks *freeblks;
7870	int flags;
7871{
7872	struct freework *freework;
7873	struct newblk *newblk;
7874	struct allocindir *aip;
7875	struct ufsmount *ump;
7876	struct worklist *wk;
7877
7878	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7879	    ("handle_workitem_freeblocks: Journal entries not written."));
7880	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7881	ACQUIRE_LOCK(ump);
7882	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7883		WORKLIST_REMOVE(wk);
7884		switch (wk->wk_type) {
7885		case D_DIRREM:
7886			wk->wk_state |= COMPLETE;
7887			add_to_worklist(wk, 0);
7888			continue;
7889
7890		case D_ALLOCDIRECT:
7891			free_newblk(WK_NEWBLK(wk));
7892			continue;
7893
7894		case D_ALLOCINDIR:
7895			aip = WK_ALLOCINDIR(wk);
7896			freework = NULL;
7897			if (aip->ai_state & DELAYEDFREE) {
7898				FREE_LOCK(ump);
7899				freework = newfreework(ump, freeblks, NULL,
7900				    aip->ai_lbn, aip->ai_newblkno,
7901				    ump->um_fs->fs_frag, 0, 0);
7902				ACQUIRE_LOCK(ump);
7903			}
7904			newblk = WK_NEWBLK(wk);
7905			if (newblk->nb_jnewblk) {
7906				freework->fw_jnewblk = newblk->nb_jnewblk;
7907				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7908				newblk->nb_jnewblk = NULL;
7909			}
7910			free_newblk(newblk);
7911			continue;
7912
7913		case D_FREEWORK:
7914			freework = WK_FREEWORK(wk);
7915			if (freework->fw_lbn <= -NDADDR)
7916				handle_workitem_indirblk(freework);
7917			else
7918				freework_freeblock(freework);
7919			continue;
7920		default:
7921			panic("handle_workitem_freeblocks: Unknown type %s",
7922			    TYPENAME(wk->wk_type));
7923		}
7924	}
7925	if (freeblks->fb_ref != 0) {
7926		freeblks->fb_state &= ~INPROGRESS;
7927		wake_worklist(&freeblks->fb_list);
7928		freeblks = NULL;
7929	}
7930	FREE_LOCK(ump);
7931	if (freeblks)
7932		return handle_complete_freeblocks(freeblks, flags);
7933	return (0);
7934}
7935
7936/*
7937 * Handle completion of block free via truncate.  This allows fs_pending
7938 * to track the actual free block count more closely than if we only updated
7939 * it at the end.  We must be careful to handle cases where the block count
7940 * on free was incorrect.
7941 */
7942static void
7943freeblks_free(ump, freeblks, blocks)
7944	struct ufsmount *ump;
7945	struct freeblks *freeblks;
7946	int blocks;
7947{
7948	struct fs *fs;
7949	ufs2_daddr_t remain;
7950
7951	UFS_LOCK(ump);
7952	remain = -freeblks->fb_chkcnt;
7953	freeblks->fb_chkcnt += blocks;
7954	if (remain > 0) {
7955		if (remain < blocks)
7956			blocks = remain;
7957		fs = ump->um_fs;
7958		fs->fs_pendingblocks -= blocks;
7959	}
7960	UFS_UNLOCK(ump);
7961}
7962
7963/*
7964 * Once all of the freework workitems are complete we can retire the
7965 * freeblocks dependency and any journal work awaiting completion.  This
7966 * can not be called until all other dependencies are stable on disk.
7967 */
7968static int
7969handle_complete_freeblocks(freeblks, flags)
7970	struct freeblks *freeblks;
7971	int flags;
7972{
7973	struct inodedep *inodedep;
7974	struct inode *ip;
7975	struct vnode *vp;
7976	struct fs *fs;
7977	struct ufsmount *ump;
7978	ufs2_daddr_t spare;
7979
7980	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7981	fs = ump->um_fs;
7982	flags = LK_EXCLUSIVE | flags;
7983	spare = freeblks->fb_chkcnt;
7984
7985	/*
7986	 * If we did not release the expected number of blocks we may have
7987	 * to adjust the inode block count here.  Only do so if it wasn't
7988	 * a truncation to zero and the modrev still matches.
7989	 */
7990	if (spare && freeblks->fb_len != 0) {
7991		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7992		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7993			return (EBUSY);
7994		ip = VTOI(vp);
7995		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7996			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7997			ip->i_flag |= IN_CHANGE;
7998			/*
7999			 * We must wait so this happens before the
8000			 * journal is reclaimed.
8001			 */
8002			ffs_update(vp, 1);
8003		}
8004		vput(vp);
8005	}
8006	if (spare < 0) {
8007		UFS_LOCK(ump);
8008		fs->fs_pendingblocks += spare;
8009		UFS_UNLOCK(ump);
8010	}
8011#ifdef QUOTA
8012	/* Handle spare. */
8013	if (spare)
8014		quotaadj(freeblks->fb_quota, ump, -spare);
8015	quotarele(freeblks->fb_quota);
8016#endif
8017	ACQUIRE_LOCK(ump);
8018	if (freeblks->fb_state & ONDEPLIST) {
8019		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8020		    0, &inodedep);
8021		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8022		freeblks->fb_state &= ~ONDEPLIST;
8023		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8024			free_inodedep(inodedep);
8025	}
8026	/*
8027	 * All of the freeblock deps must be complete prior to this call
8028	 * so it's now safe to complete earlier outstanding journal entries.
8029	 */
8030	handle_jwork(&freeblks->fb_jwork);
8031	WORKITEM_FREE(freeblks, D_FREEBLKS);
8032	FREE_LOCK(ump);
8033	return (0);
8034}
8035
8036/*
8037 * Release blocks associated with the freeblks and stored in the indirect
8038 * block dbn. If level is greater than SINGLE, the block is an indirect block
8039 * and recursive calls to indirtrunc must be used to cleanse other indirect
8040 * blocks.
8041 *
8042 * This handles partial and complete truncation of blocks.  Partial is noted
8043 * with goingaway == 0.  In this case the freework is completed after the
8044 * zero'd indirects are written to disk.  For full truncation the freework
8045 * is completed after the block is freed.
8046 */
8047static void
8048indir_trunc(freework, dbn, lbn)
8049	struct freework *freework;
8050	ufs2_daddr_t dbn;
8051	ufs_lbn_t lbn;
8052{
8053	struct freework *nfreework;
8054	struct workhead wkhd;
8055	struct freeblks *freeblks;
8056	struct buf *bp;
8057	struct fs *fs;
8058	struct indirdep *indirdep;
8059	struct ufsmount *ump;
8060	ufs1_daddr_t *bap1 = 0;
8061	ufs2_daddr_t nb, nnb, *bap2 = 0;
8062	ufs_lbn_t lbnadd, nlbn;
8063	int i, nblocks, ufs1fmt;
8064	int freedblocks;
8065	int goingaway;
8066	int freedeps;
8067	int needj;
8068	int level;
8069	int cnt;
8070
8071	freeblks = freework->fw_freeblks;
8072	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8073	fs = ump->um_fs;
8074	/*
8075	 * Get buffer of block pointers to be freed.  There are three cases:
8076	 *
8077	 * 1) Partial truncate caches the indirdep pointer in the freework
8078	 *    which provides us a back copy to the save bp which holds the
8079	 *    pointers we want to clear.  When this completes the zero
8080	 *    pointers are written to the real copy.
8081	 * 2) The indirect is being completely truncated, cancel_indirdep()
8082	 *    eliminated the real copy and placed the indirdep on the saved
8083	 *    copy.  The indirdep and buf are discarded when this completes.
8084	 * 3) The indirect was not in memory, we read a copy off of the disk
8085	 *    using the devvp and drop and invalidate the buffer when we're
8086	 *    done.
8087	 */
8088	goingaway = 1;
8089	indirdep = NULL;
8090	if (freework->fw_indir != NULL) {
8091		goingaway = 0;
8092		indirdep = freework->fw_indir;
8093		bp = indirdep->ir_savebp;
8094		if (bp == NULL || bp->b_blkno != dbn)
8095			panic("indir_trunc: Bad saved buf %p blkno %jd",
8096			    bp, (intmax_t)dbn);
8097	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8098		/*
8099		 * The lock prevents the buf dep list from changing and
8100	 	 * indirects on devvp should only ever have one dependency.
8101		 */
8102		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8103		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8104			panic("indir_trunc: Bad indirdep %p from buf %p",
8105			    indirdep, bp);
8106	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
8107	    NOCRED, &bp) != 0) {
8108		brelse(bp);
8109		return;
8110	}
8111	ACQUIRE_LOCK(ump);
8112	/* Protects against a race with complete_trunc_indir(). */
8113	freework->fw_state &= ~INPROGRESS;
8114	/*
8115	 * If we have an indirdep we need to enforce the truncation order
8116	 * and discard it when it is complete.
8117	 */
8118	if (indirdep) {
8119		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8120		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8121			/*
8122			 * Add the complete truncate to the list on the
8123			 * indirdep to enforce in-order processing.
8124			 */
8125			if (freework->fw_indir == NULL)
8126				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8127				    freework, fw_next);
8128			FREE_LOCK(ump);
8129			return;
8130		}
8131		/*
8132		 * If we're goingaway, free the indirdep.  Otherwise it will
8133		 * linger until the write completes.
8134		 */
8135		if (goingaway)
8136			free_indirdep(indirdep);
8137	}
8138	FREE_LOCK(ump);
8139	/* Initialize pointers depending on block size. */
8140	if (ump->um_fstype == UFS1) {
8141		bap1 = (ufs1_daddr_t *)bp->b_data;
8142		nb = bap1[freework->fw_off];
8143		ufs1fmt = 1;
8144	} else {
8145		bap2 = (ufs2_daddr_t *)bp->b_data;
8146		nb = bap2[freework->fw_off];
8147		ufs1fmt = 0;
8148	}
8149	level = lbn_level(lbn);
8150	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8151	lbnadd = lbn_offset(fs, level);
8152	nblocks = btodb(fs->fs_bsize);
8153	nfreework = freework;
8154	freedeps = 0;
8155	cnt = 0;
8156	/*
8157	 * Reclaim blocks.  Traverses into nested indirect levels and
8158	 * arranges for the current level to be freed when subordinates
8159	 * are free when journaling.
8160	 */
8161	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8162		if (i != NINDIR(fs) - 1) {
8163			if (ufs1fmt)
8164				nnb = bap1[i+1];
8165			else
8166				nnb = bap2[i+1];
8167		} else
8168			nnb = 0;
8169		if (nb == 0)
8170			continue;
8171		cnt++;
8172		if (level != 0) {
8173			nlbn = (lbn + 1) - (i * lbnadd);
8174			if (needj != 0) {
8175				nfreework = newfreework(ump, freeblks, freework,
8176				    nlbn, nb, fs->fs_frag, 0, 0);
8177				freedeps++;
8178			}
8179			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8180		} else {
8181			struct freedep *freedep;
8182
8183			/*
8184			 * Attempt to aggregate freedep dependencies for
8185			 * all blocks being released to the same CG.
8186			 */
8187			LIST_INIT(&wkhd);
8188			if (needj != 0 &&
8189			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8190				freedep = newfreedep(freework);
8191				WORKLIST_INSERT_UNLOCKED(&wkhd,
8192				    &freedep->fd_list);
8193				freedeps++;
8194			}
8195			CTR3(KTR_SUJ,
8196			    "indir_trunc: ino %d blkno %jd size %ld",
8197			    freeblks->fb_inum, nb, fs->fs_bsize);
8198			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8199			    fs->fs_bsize, freeblks->fb_inum,
8200			    freeblks->fb_vtype, &wkhd);
8201		}
8202	}
8203	if (goingaway) {
8204		bp->b_flags |= B_INVAL | B_NOCACHE;
8205		brelse(bp);
8206	}
8207	freedblocks = 0;
8208	if (level == 0)
8209		freedblocks = (nblocks * cnt);
8210	if (needj == 0)
8211		freedblocks += nblocks;
8212	freeblks_free(ump, freeblks, freedblocks);
8213	/*
8214	 * If we are journaling set up the ref counts and offset so this
8215	 * indirect can be completed when its children are free.
8216	 */
8217	if (needj) {
8218		ACQUIRE_LOCK(ump);
8219		freework->fw_off = i;
8220		freework->fw_ref += freedeps;
8221		freework->fw_ref -= NINDIR(fs) + 1;
8222		if (level == 0)
8223			freeblks->fb_cgwait += freedeps;
8224		if (freework->fw_ref == 0)
8225			freework_freeblock(freework);
8226		FREE_LOCK(ump);
8227		return;
8228	}
8229	/*
8230	 * If we're not journaling we can free the indirect now.
8231	 */
8232	dbn = dbtofsb(fs, dbn);
8233	CTR3(KTR_SUJ,
8234	    "indir_trunc 2: ino %d blkno %jd size %ld",
8235	    freeblks->fb_inum, dbn, fs->fs_bsize);
8236	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8237	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
8238	/* Non SUJ softdep does single-threaded truncations. */
8239	if (freework->fw_blkno == dbn) {
8240		freework->fw_state |= ALLCOMPLETE;
8241		ACQUIRE_LOCK(ump);
8242		handle_written_freework(freework);
8243		FREE_LOCK(ump);
8244	}
8245	return;
8246}
8247
8248/*
8249 * Cancel an allocindir when it is removed via truncation.  When bp is not
8250 * NULL the indirect never appeared on disk and is scheduled to be freed
8251 * independently of the indir so we can more easily track journal work.
8252 */
8253static void
8254cancel_allocindir(aip, bp, freeblks, trunc)
8255	struct allocindir *aip;
8256	struct buf *bp;
8257	struct freeblks *freeblks;
8258	int trunc;
8259{
8260	struct indirdep *indirdep;
8261	struct freefrag *freefrag;
8262	struct newblk *newblk;
8263
8264	newblk = (struct newblk *)aip;
8265	LIST_REMOVE(aip, ai_next);
8266	/*
8267	 * We must eliminate the pointer in bp if it must be freed on its
8268	 * own due to partial truncate or pending journal work.
8269	 */
8270	if (bp && (trunc || newblk->nb_jnewblk)) {
8271		/*
8272		 * Clear the pointer and mark the aip to be freed
8273		 * directly if it never existed on disk.
8274		 */
8275		aip->ai_state |= DELAYEDFREE;
8276		indirdep = aip->ai_indirdep;
8277		if (indirdep->ir_state & UFS1FMT)
8278			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8279		else
8280			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8281	}
8282	/*
8283	 * When truncating the previous pointer will be freed via
8284	 * savedbp.  Eliminate the freefrag which would dup free.
8285	 */
8286	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8287		newblk->nb_freefrag = NULL;
8288		if (freefrag->ff_jdep)
8289			cancel_jfreefrag(
8290			    WK_JFREEFRAG(freefrag->ff_jdep));
8291		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8292		WORKITEM_FREE(freefrag, D_FREEFRAG);
8293	}
8294	/*
8295	 * If the journal hasn't been written the jnewblk must be passed
8296	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8297	 * this by leaving the journal dependency on the newblk to be freed
8298	 * when a freework is created in handle_workitem_freeblocks().
8299	 */
8300	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8301	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8302}
8303
8304/*
8305 * Create the mkdir dependencies for . and .. in a new directory.  Link them
8306 * in to a newdirblk so any subsequent additions are tracked properly.  The
8307 * caller is responsible for adding the mkdir1 dependency to the journal
8308 * and updating id_mkdiradd.  This function returns with the per-filesystem
8309 * lock held.
8310 */
8311static struct mkdir *
8312setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8313	struct diradd *dap;
8314	ino_t newinum;
8315	ino_t dinum;
8316	struct buf *newdirbp;
8317	struct mkdir **mkdirp;
8318{
8319	struct newblk *newblk;
8320	struct pagedep *pagedep;
8321	struct inodedep *inodedep;
8322	struct newdirblk *newdirblk = 0;
8323	struct mkdir *mkdir1, *mkdir2;
8324	struct worklist *wk;
8325	struct jaddref *jaddref;
8326	struct ufsmount *ump;
8327	struct mount *mp;
8328
8329	mp = dap->da_list.wk_mp;
8330	ump = VFSTOUFS(mp);
8331	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8332	    M_SOFTDEP_FLAGS);
8333	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8334	LIST_INIT(&newdirblk->db_mkdir);
8335	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8336	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8337	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8338	mkdir1->md_diradd = dap;
8339	mkdir1->md_jaddref = NULL;
8340	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8341	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8342	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8343	mkdir2->md_diradd = dap;
8344	mkdir2->md_jaddref = NULL;
8345	if (MOUNTEDSUJ(mp) == 0) {
8346		mkdir1->md_state |= DEPCOMPLETE;
8347		mkdir2->md_state |= DEPCOMPLETE;
8348	}
8349	/*
8350	 * Dependency on "." and ".." being written to disk.
8351	 */
8352	mkdir1->md_buf = newdirbp;
8353	ACQUIRE_LOCK(VFSTOUFS(mp));
8354	LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8355	/*
8356	 * We must link the pagedep, allocdirect, and newdirblk for
8357	 * the initial file page so the pointer to the new directory
8358	 * is not written until the directory contents are live and
8359	 * any subsequent additions are not marked live until the
8360	 * block is reachable via the inode.
8361	 */
8362	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8363		panic("setup_newdir: lost pagedep");
8364	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8365		if (wk->wk_type == D_ALLOCDIRECT)
8366			break;
8367	if (wk == NULL)
8368		panic("setup_newdir: lost allocdirect");
8369	if (pagedep->pd_state & NEWBLOCK)
8370		panic("setup_newdir: NEWBLOCK already set");
8371	newblk = WK_NEWBLK(wk);
8372	pagedep->pd_state |= NEWBLOCK;
8373	pagedep->pd_newdirblk = newdirblk;
8374	newdirblk->db_pagedep = pagedep;
8375	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8376	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8377	/*
8378	 * Look up the inodedep for the parent directory so that we
8379	 * can link mkdir2 into the pending dotdot jaddref or
8380	 * the inode write if there is none.  If the inode is
8381	 * ALLCOMPLETE and no jaddref is present all dependencies have
8382	 * been satisfied and mkdir2 can be freed.
8383	 */
8384	inodedep_lookup(mp, dinum, 0, &inodedep);
8385	if (MOUNTEDSUJ(mp)) {
8386		if (inodedep == NULL)
8387			panic("setup_newdir: Lost parent.");
8388		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8389		    inoreflst);
8390		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8391		    (jaddref->ja_state & MKDIR_PARENT),
8392		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8393		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8394		mkdir2->md_jaddref = jaddref;
8395		jaddref->ja_mkdir = mkdir2;
8396	} else if (inodedep == NULL ||
8397	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8398		dap->da_state &= ~MKDIR_PARENT;
8399		WORKITEM_FREE(mkdir2, D_MKDIR);
8400		mkdir2 = NULL;
8401	} else {
8402		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8403		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8404	}
8405	*mkdirp = mkdir2;
8406
8407	return (mkdir1);
8408}
8409
8410/*
8411 * Directory entry addition dependencies.
8412 *
8413 * When adding a new directory entry, the inode (with its incremented link
8414 * count) must be written to disk before the directory entry's pointer to it.
8415 * Also, if the inode is newly allocated, the corresponding freemap must be
8416 * updated (on disk) before the directory entry's pointer. These requirements
8417 * are met via undo/redo on the directory entry's pointer, which consists
8418 * simply of the inode number.
8419 *
8420 * As directory entries are added and deleted, the free space within a
8421 * directory block can become fragmented.  The ufs filesystem will compact
8422 * a fragmented directory block to make space for a new entry. When this
8423 * occurs, the offsets of previously added entries change. Any "diradd"
8424 * dependency structures corresponding to these entries must be updated with
8425 * the new offsets.
8426 */
8427
8428/*
8429 * This routine is called after the in-memory inode's link
8430 * count has been incremented, but before the directory entry's
8431 * pointer to the inode has been set.
8432 */
8433int
8434softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8435	struct buf *bp;		/* buffer containing directory block */
8436	struct inode *dp;	/* inode for directory */
8437	off_t diroffset;	/* offset of new entry in directory */
8438	ino_t newinum;		/* inode referenced by new directory entry */
8439	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8440	int isnewblk;		/* entry is in a newly allocated block */
8441{
8442	int offset;		/* offset of new entry within directory block */
8443	ufs_lbn_t lbn;		/* block in directory containing new entry */
8444	struct fs *fs;
8445	struct diradd *dap;
8446	struct newblk *newblk;
8447	struct pagedep *pagedep;
8448	struct inodedep *inodedep;
8449	struct newdirblk *newdirblk = 0;
8450	struct mkdir *mkdir1, *mkdir2;
8451	struct jaddref *jaddref;
8452	struct ufsmount *ump;
8453	struct mount *mp;
8454	int isindir;
8455
8456	ump = dp->i_ump;
8457	mp = UFSTOVFS(ump);
8458	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8459	    ("softdep_setup_directory_add called on non-softdep filesystem"));
8460	/*
8461	 * Whiteouts have no dependencies.
8462	 */
8463	if (newinum == WINO) {
8464		if (newdirbp != NULL)
8465			bdwrite(newdirbp);
8466		return (0);
8467	}
8468	jaddref = NULL;
8469	mkdir1 = mkdir2 = NULL;
8470	fs = dp->i_fs;
8471	lbn = lblkno(fs, diroffset);
8472	offset = blkoff(fs, diroffset);
8473	dap = malloc(sizeof(struct diradd), M_DIRADD,
8474		M_SOFTDEP_FLAGS|M_ZERO);
8475	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8476	dap->da_offset = offset;
8477	dap->da_newinum = newinum;
8478	dap->da_state = ATTACHED;
8479	LIST_INIT(&dap->da_jwork);
8480	isindir = bp->b_lblkno >= NDADDR;
8481	if (isnewblk &&
8482	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8483		newdirblk = malloc(sizeof(struct newdirblk),
8484		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8485		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8486		LIST_INIT(&newdirblk->db_mkdir);
8487	}
8488	/*
8489	 * If we're creating a new directory setup the dependencies and set
8490	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8491	 * we can move on.
8492	 */
8493	if (newdirbp == NULL) {
8494		dap->da_state |= DEPCOMPLETE;
8495		ACQUIRE_LOCK(ump);
8496	} else {
8497		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8498		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8499		    &mkdir2);
8500	}
8501	/*
8502	 * Link into parent directory pagedep to await its being written.
8503	 */
8504	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8505#ifdef DEBUG
8506	if (diradd_lookup(pagedep, offset) != NULL)
8507		panic("softdep_setup_directory_add: %p already at off %d\n",
8508		    diradd_lookup(pagedep, offset), offset);
8509#endif
8510	dap->da_pagedep = pagedep;
8511	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8512	    da_pdlist);
8513	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8514	/*
8515	 * If we're journaling, link the diradd into the jaddref so it
8516	 * may be completed after the journal entry is written.  Otherwise,
8517	 * link the diradd into its inodedep.  If the inode is not yet
8518	 * written place it on the bufwait list, otherwise do the post-inode
8519	 * write processing to put it on the id_pendinghd list.
8520	 */
8521	if (MOUNTEDSUJ(mp)) {
8522		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8523		    inoreflst);
8524		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8525		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8526		jaddref->ja_diroff = diroffset;
8527		jaddref->ja_diradd = dap;
8528		add_to_journal(&jaddref->ja_list);
8529	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8530		diradd_inode_written(dap, inodedep);
8531	else
8532		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8533	/*
8534	 * Add the journal entries for . and .. links now that the primary
8535	 * link is written.
8536	 */
8537	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8538		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8539		    inoreflst, if_deps);
8540		KASSERT(jaddref != NULL &&
8541		    jaddref->ja_ino == jaddref->ja_parent &&
8542		    (jaddref->ja_state & MKDIR_BODY),
8543		    ("softdep_setup_directory_add: bad dot jaddref %p",
8544		    jaddref));
8545		mkdir1->md_jaddref = jaddref;
8546		jaddref->ja_mkdir = mkdir1;
8547		/*
8548		 * It is important that the dotdot journal entry
8549		 * is added prior to the dot entry since dot writes
8550		 * both the dot and dotdot links.  These both must
8551		 * be added after the primary link for the journal
8552		 * to remain consistent.
8553		 */
8554		add_to_journal(&mkdir2->md_jaddref->ja_list);
8555		add_to_journal(&jaddref->ja_list);
8556	}
8557	/*
8558	 * If we are adding a new directory remember this diradd so that if
8559	 * we rename it we can keep the dot and dotdot dependencies.  If
8560	 * we are adding a new name for an inode that has a mkdiradd we
8561	 * must be in rename and we have to move the dot and dotdot
8562	 * dependencies to this new name.  The old name is being orphaned
8563	 * soon.
8564	 */
8565	if (mkdir1 != NULL) {
8566		if (inodedep->id_mkdiradd != NULL)
8567			panic("softdep_setup_directory_add: Existing mkdir");
8568		inodedep->id_mkdiradd = dap;
8569	} else if (inodedep->id_mkdiradd)
8570		merge_diradd(inodedep, dap);
8571	if (newdirblk) {
8572		/*
8573		 * There is nothing to do if we are already tracking
8574		 * this block.
8575		 */
8576		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8577			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8578			FREE_LOCK(ump);
8579			return (0);
8580		}
8581		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8582		    == 0)
8583			panic("softdep_setup_directory_add: lost entry");
8584		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8585		pagedep->pd_state |= NEWBLOCK;
8586		pagedep->pd_newdirblk = newdirblk;
8587		newdirblk->db_pagedep = pagedep;
8588		FREE_LOCK(ump);
8589		/*
8590		 * If we extended into an indirect signal direnter to sync.
8591		 */
8592		if (isindir)
8593			return (1);
8594		return (0);
8595	}
8596	FREE_LOCK(ump);
8597	return (0);
8598}
8599
8600/*
8601 * This procedure is called to change the offset of a directory
8602 * entry when compacting a directory block which must be owned
8603 * exclusively by the caller. Note that the actual entry movement
8604 * must be done in this procedure to ensure that no I/O completions
8605 * occur while the move is in progress.
8606 */
8607void
8608softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8609	struct buf *bp;		/* Buffer holding directory block. */
8610	struct inode *dp;	/* inode for directory */
8611	caddr_t base;		/* address of dp->i_offset */
8612	caddr_t oldloc;		/* address of old directory location */
8613	caddr_t newloc;		/* address of new directory location */
8614	int entrysize;		/* size of directory entry */
8615{
8616	int offset, oldoffset, newoffset;
8617	struct pagedep *pagedep;
8618	struct jmvref *jmvref;
8619	struct diradd *dap;
8620	struct direct *de;
8621	struct mount *mp;
8622	ufs_lbn_t lbn;
8623	int flags;
8624
8625	mp = UFSTOVFS(dp->i_ump);
8626	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8627	    ("softdep_change_directoryentry_offset called on "
8628	     "non-softdep filesystem"));
8629	de = (struct direct *)oldloc;
8630	jmvref = NULL;
8631	flags = 0;
8632	/*
8633	 * Moves are always journaled as it would be too complex to
8634	 * determine if any affected adds or removes are present in the
8635	 * journal.
8636	 */
8637	if (MOUNTEDSUJ(mp)) {
8638		flags = DEPALLOC;
8639		jmvref = newjmvref(dp, de->d_ino,
8640		    dp->i_offset + (oldloc - base),
8641		    dp->i_offset + (newloc - base));
8642	}
8643	lbn = lblkno(dp->i_fs, dp->i_offset);
8644	offset = blkoff(dp->i_fs, dp->i_offset);
8645	oldoffset = offset + (oldloc - base);
8646	newoffset = offset + (newloc - base);
8647	ACQUIRE_LOCK(dp->i_ump);
8648	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8649		goto done;
8650	dap = diradd_lookup(pagedep, oldoffset);
8651	if (dap) {
8652		dap->da_offset = newoffset;
8653		newoffset = DIRADDHASH(newoffset);
8654		oldoffset = DIRADDHASH(oldoffset);
8655		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8656		    newoffset != oldoffset) {
8657			LIST_REMOVE(dap, da_pdlist);
8658			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8659			    dap, da_pdlist);
8660		}
8661	}
8662done:
8663	if (jmvref) {
8664		jmvref->jm_pagedep = pagedep;
8665		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8666		add_to_journal(&jmvref->jm_list);
8667	}
8668	bcopy(oldloc, newloc, entrysize);
8669	FREE_LOCK(dp->i_ump);
8670}
8671
8672/*
8673 * Move the mkdir dependencies and journal work from one diradd to another
8674 * when renaming a directory.  The new name must depend on the mkdir deps
8675 * completing as the old name did.  Directories can only have one valid link
8676 * at a time so one must be canonical.
8677 */
8678static void
8679merge_diradd(inodedep, newdap)
8680	struct inodedep *inodedep;
8681	struct diradd *newdap;
8682{
8683	struct diradd *olddap;
8684	struct mkdir *mkdir, *nextmd;
8685	struct ufsmount *ump;
8686	short state;
8687
8688	olddap = inodedep->id_mkdiradd;
8689	inodedep->id_mkdiradd = newdap;
8690	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8691		newdap->da_state &= ~DEPCOMPLETE;
8692		ump = VFSTOUFS(inodedep->id_list.wk_mp);
8693		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8694		     mkdir = nextmd) {
8695			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8696			if (mkdir->md_diradd != olddap)
8697				continue;
8698			mkdir->md_diradd = newdap;
8699			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8700			newdap->da_state |= state;
8701			olddap->da_state &= ~state;
8702			if ((olddap->da_state &
8703			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8704				break;
8705		}
8706		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8707			panic("merge_diradd: unfound ref");
8708	}
8709	/*
8710	 * Any mkdir related journal items are not safe to be freed until
8711	 * the new name is stable.
8712	 */
8713	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8714	olddap->da_state |= DEPCOMPLETE;
8715	complete_diradd(olddap);
8716}
8717
8718/*
8719 * Move the diradd to the pending list when all diradd dependencies are
8720 * complete.
8721 */
8722static void
8723complete_diradd(dap)
8724	struct diradd *dap;
8725{
8726	struct pagedep *pagedep;
8727
8728	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8729		if (dap->da_state & DIRCHG)
8730			pagedep = dap->da_previous->dm_pagedep;
8731		else
8732			pagedep = dap->da_pagedep;
8733		LIST_REMOVE(dap, da_pdlist);
8734		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8735	}
8736}
8737
8738/*
8739 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8740 * add entries and conditonally journal the remove.
8741 */
8742static void
8743cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8744	struct diradd *dap;
8745	struct dirrem *dirrem;
8746	struct jremref *jremref;
8747	struct jremref *dotremref;
8748	struct jremref *dotdotremref;
8749{
8750	struct inodedep *inodedep;
8751	struct jaddref *jaddref;
8752	struct inoref *inoref;
8753	struct ufsmount *ump;
8754	struct mkdir *mkdir;
8755
8756	/*
8757	 * If no remove references were allocated we're on a non-journaled
8758	 * filesystem and can skip the cancel step.
8759	 */
8760	if (jremref == NULL) {
8761		free_diradd(dap, NULL);
8762		return;
8763	}
8764	/*
8765	 * Cancel the primary name an free it if it does not require
8766	 * journaling.
8767	 */
8768	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8769	    0, &inodedep) != 0) {
8770		/* Abort the addref that reference this diradd.  */
8771		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8772			if (inoref->if_list.wk_type != D_JADDREF)
8773				continue;
8774			jaddref = (struct jaddref *)inoref;
8775			if (jaddref->ja_diradd != dap)
8776				continue;
8777			if (cancel_jaddref(jaddref, inodedep,
8778			    &dirrem->dm_jwork) == 0) {
8779				free_jremref(jremref);
8780				jremref = NULL;
8781			}
8782			break;
8783		}
8784	}
8785	/*
8786	 * Cancel subordinate names and free them if they do not require
8787	 * journaling.
8788	 */
8789	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8790		ump = VFSTOUFS(dap->da_list.wk_mp);
8791		LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
8792			if (mkdir->md_diradd != dap)
8793				continue;
8794			if ((jaddref = mkdir->md_jaddref) == NULL)
8795				continue;
8796			mkdir->md_jaddref = NULL;
8797			if (mkdir->md_state & MKDIR_PARENT) {
8798				if (cancel_jaddref(jaddref, NULL,
8799				    &dirrem->dm_jwork) == 0) {
8800					free_jremref(dotdotremref);
8801					dotdotremref = NULL;
8802				}
8803			} else {
8804				if (cancel_jaddref(jaddref, inodedep,
8805				    &dirrem->dm_jwork) == 0) {
8806					free_jremref(dotremref);
8807					dotremref = NULL;
8808				}
8809			}
8810		}
8811	}
8812
8813	if (jremref)
8814		journal_jremref(dirrem, jremref, inodedep);
8815	if (dotremref)
8816		journal_jremref(dirrem, dotremref, inodedep);
8817	if (dotdotremref)
8818		journal_jremref(dirrem, dotdotremref, NULL);
8819	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8820	free_diradd(dap, &dirrem->dm_jwork);
8821}
8822
8823/*
8824 * Free a diradd dependency structure. This routine must be called
8825 * with splbio interrupts blocked.
8826 */
8827static void
8828free_diradd(dap, wkhd)
8829	struct diradd *dap;
8830	struct workhead *wkhd;
8831{
8832	struct dirrem *dirrem;
8833	struct pagedep *pagedep;
8834	struct inodedep *inodedep;
8835	struct mkdir *mkdir, *nextmd;
8836	struct ufsmount *ump;
8837
8838	ump = VFSTOUFS(dap->da_list.wk_mp);
8839	LOCK_OWNED(ump);
8840	LIST_REMOVE(dap, da_pdlist);
8841	if (dap->da_state & ONWORKLIST)
8842		WORKLIST_REMOVE(&dap->da_list);
8843	if ((dap->da_state & DIRCHG) == 0) {
8844		pagedep = dap->da_pagedep;
8845	} else {
8846		dirrem = dap->da_previous;
8847		pagedep = dirrem->dm_pagedep;
8848		dirrem->dm_dirinum = pagedep->pd_ino;
8849		dirrem->dm_state |= COMPLETE;
8850		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8851			add_to_worklist(&dirrem->dm_list, 0);
8852	}
8853	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8854	    0, &inodedep) != 0)
8855		if (inodedep->id_mkdiradd == dap)
8856			inodedep->id_mkdiradd = NULL;
8857	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8858		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8859		     mkdir = nextmd) {
8860			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8861			if (mkdir->md_diradd != dap)
8862				continue;
8863			dap->da_state &=
8864			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8865			LIST_REMOVE(mkdir, md_mkdirs);
8866			if (mkdir->md_state & ONWORKLIST)
8867				WORKLIST_REMOVE(&mkdir->md_list);
8868			if (mkdir->md_jaddref != NULL)
8869				panic("free_diradd: Unexpected jaddref");
8870			WORKITEM_FREE(mkdir, D_MKDIR);
8871			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8872				break;
8873		}
8874		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8875			panic("free_diradd: unfound ref");
8876	}
8877	if (inodedep)
8878		free_inodedep(inodedep);
8879	/*
8880	 * Free any journal segments waiting for the directory write.
8881	 */
8882	handle_jwork(&dap->da_jwork);
8883	WORKITEM_FREE(dap, D_DIRADD);
8884}
8885
8886/*
8887 * Directory entry removal dependencies.
8888 *
8889 * When removing a directory entry, the entry's inode pointer must be
8890 * zero'ed on disk before the corresponding inode's link count is decremented
8891 * (possibly freeing the inode for re-use). This dependency is handled by
8892 * updating the directory entry but delaying the inode count reduction until
8893 * after the directory block has been written to disk. After this point, the
8894 * inode count can be decremented whenever it is convenient.
8895 */
8896
8897/*
8898 * This routine should be called immediately after removing
8899 * a directory entry.  The inode's link count should not be
8900 * decremented by the calling procedure -- the soft updates
8901 * code will do this task when it is safe.
8902 */
8903void
8904softdep_setup_remove(bp, dp, ip, isrmdir)
8905	struct buf *bp;		/* buffer containing directory block */
8906	struct inode *dp;	/* inode for the directory being modified */
8907	struct inode *ip;	/* inode for directory entry being removed */
8908	int isrmdir;		/* indicates if doing RMDIR */
8909{
8910	struct dirrem *dirrem, *prevdirrem;
8911	struct inodedep *inodedep;
8912	int direct;
8913
8914	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
8915	    ("softdep_setup_remove called on non-softdep filesystem"));
8916	/*
8917	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8918	 * newdirrem() to setup the full directory remove which requires
8919	 * isrmdir > 1.
8920	 */
8921	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8922	/*
8923	 * Add the dirrem to the inodedep's pending remove list for quick
8924	 * discovery later.
8925	 */
8926	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8927	    &inodedep) == 0)
8928		panic("softdep_setup_remove: Lost inodedep.");
8929	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8930	dirrem->dm_state |= ONDEPLIST;
8931	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8932
8933	/*
8934	 * If the COMPLETE flag is clear, then there were no active
8935	 * entries and we want to roll back to a zeroed entry until
8936	 * the new inode is committed to disk. If the COMPLETE flag is
8937	 * set then we have deleted an entry that never made it to
8938	 * disk. If the entry we deleted resulted from a name change,
8939	 * then the old name still resides on disk. We cannot delete
8940	 * its inode (returned to us in prevdirrem) until the zeroed
8941	 * directory entry gets to disk. The new inode has never been
8942	 * referenced on the disk, so can be deleted immediately.
8943	 */
8944	if ((dirrem->dm_state & COMPLETE) == 0) {
8945		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8946		    dm_next);
8947		FREE_LOCK(ip->i_ump);
8948	} else {
8949		if (prevdirrem != NULL)
8950			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8951			    prevdirrem, dm_next);
8952		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8953		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8954		FREE_LOCK(ip->i_ump);
8955		if (direct)
8956			handle_workitem_remove(dirrem, 0);
8957	}
8958}
8959
8960/*
8961 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
8962 * pd_pendinghd list of a pagedep.
8963 */
8964static struct diradd *
8965diradd_lookup(pagedep, offset)
8966	struct pagedep *pagedep;
8967	int offset;
8968{
8969	struct diradd *dap;
8970
8971	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8972		if (dap->da_offset == offset)
8973			return (dap);
8974	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8975		if (dap->da_offset == offset)
8976			return (dap);
8977	return (NULL);
8978}
8979
8980/*
8981 * Search for a .. diradd dependency in a directory that is being removed.
8982 * If the directory was renamed to a new parent we have a diradd rather
8983 * than a mkdir for the .. entry.  We need to cancel it now before
8984 * it is found in truncate().
8985 */
8986static struct jremref *
8987cancel_diradd_dotdot(ip, dirrem, jremref)
8988	struct inode *ip;
8989	struct dirrem *dirrem;
8990	struct jremref *jremref;
8991{
8992	struct pagedep *pagedep;
8993	struct diradd *dap;
8994	struct worklist *wk;
8995
8996	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8997	    &pagedep) == 0)
8998		return (jremref);
8999	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
9000	if (dap == NULL)
9001		return (jremref);
9002	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
9003	/*
9004	 * Mark any journal work as belonging to the parent so it is freed
9005	 * with the .. reference.
9006	 */
9007	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9008		wk->wk_state |= MKDIR_PARENT;
9009	return (NULL);
9010}
9011
9012/*
9013 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9014 * replace it with a dirrem/diradd pair as a result of re-parenting a
9015 * directory.  This ensures that we don't simultaneously have a mkdir and
9016 * a diradd for the same .. entry.
9017 */
9018static struct jremref *
9019cancel_mkdir_dotdot(ip, dirrem, jremref)
9020	struct inode *ip;
9021	struct dirrem *dirrem;
9022	struct jremref *jremref;
9023{
9024	struct inodedep *inodedep;
9025	struct jaddref *jaddref;
9026	struct ufsmount *ump;
9027	struct mkdir *mkdir;
9028	struct diradd *dap;
9029
9030	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9031	    &inodedep) == 0)
9032		return (jremref);
9033	dap = inodedep->id_mkdiradd;
9034	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9035		return (jremref);
9036	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9037	for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9038	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
9039		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9040			break;
9041	if (mkdir == NULL)
9042		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9043	if ((jaddref = mkdir->md_jaddref) != NULL) {
9044		mkdir->md_jaddref = NULL;
9045		jaddref->ja_state &= ~MKDIR_PARENT;
9046		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
9047		    &inodedep) == 0)
9048			panic("cancel_mkdir_dotdot: Lost parent inodedep");
9049		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9050			journal_jremref(dirrem, jremref, inodedep);
9051			jremref = NULL;
9052		}
9053	}
9054	if (mkdir->md_state & ONWORKLIST)
9055		WORKLIST_REMOVE(&mkdir->md_list);
9056	mkdir->md_state |= ALLCOMPLETE;
9057	complete_mkdir(mkdir);
9058	return (jremref);
9059}
9060
9061static void
9062journal_jremref(dirrem, jremref, inodedep)
9063	struct dirrem *dirrem;
9064	struct jremref *jremref;
9065	struct inodedep *inodedep;
9066{
9067
9068	if (inodedep == NULL)
9069		if (inodedep_lookup(jremref->jr_list.wk_mp,
9070		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9071			panic("journal_jremref: Lost inodedep");
9072	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9073	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9074	add_to_journal(&jremref->jr_list);
9075}
9076
9077static void
9078dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
9079	struct dirrem *dirrem;
9080	struct jremref *jremref;
9081	struct jremref *dotremref;
9082	struct jremref *dotdotremref;
9083{
9084	struct inodedep *inodedep;
9085
9086
9087	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9088	    &inodedep) == 0)
9089		panic("dirrem_journal: Lost inodedep");
9090	journal_jremref(dirrem, jremref, inodedep);
9091	if (dotremref)
9092		journal_jremref(dirrem, dotremref, inodedep);
9093	if (dotdotremref)
9094		journal_jremref(dirrem, dotdotremref, NULL);
9095}
9096
9097/*
9098 * Allocate a new dirrem if appropriate and return it along with
9099 * its associated pagedep. Called without a lock, returns with lock.
9100 */
9101static struct dirrem *
9102newdirrem(bp, dp, ip, isrmdir, prevdirremp)
9103	struct buf *bp;		/* buffer containing directory block */
9104	struct inode *dp;	/* inode for the directory being modified */
9105	struct inode *ip;	/* inode for directory entry being removed */
9106	int isrmdir;		/* indicates if doing RMDIR */
9107	struct dirrem **prevdirremp; /* previously referenced inode, if any */
9108{
9109	int offset;
9110	ufs_lbn_t lbn;
9111	struct diradd *dap;
9112	struct dirrem *dirrem;
9113	struct pagedep *pagedep;
9114	struct jremref *jremref;
9115	struct jremref *dotremref;
9116	struct jremref *dotdotremref;
9117	struct vnode *dvp;
9118
9119	/*
9120	 * Whiteouts have no deletion dependencies.
9121	 */
9122	if (ip == NULL)
9123		panic("newdirrem: whiteout");
9124	dvp = ITOV(dp);
9125	/*
9126	 * If the system is over its limit and our filesystem is
9127	 * responsible for more than our share of that usage and
9128	 * we are not a snapshot, request some inodedep cleanup.
9129	 * Limiting the number of dirrem structures will also limit
9130	 * the number of freefile and freeblks structures.
9131	 */
9132	ACQUIRE_LOCK(ip->i_ump);
9133	if (!IS_SNAPSHOT(ip) && softdep_excess_dirrem(ip->i_ump))
9134		schedule_cleanup(ITOV(dp)->v_mount);
9135	else
9136		FREE_LOCK(ip->i_ump);
9137	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9138	    M_ZERO);
9139	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9140	LIST_INIT(&dirrem->dm_jremrefhd);
9141	LIST_INIT(&dirrem->dm_jwork);
9142	dirrem->dm_state = isrmdir ? RMDIR : 0;
9143	dirrem->dm_oldinum = ip->i_number;
9144	*prevdirremp = NULL;
9145	/*
9146	 * Allocate remove reference structures to track journal write
9147	 * dependencies.  We will always have one for the link and
9148	 * when doing directories we will always have one more for dot.
9149	 * When renaming a directory we skip the dotdot link change so
9150	 * this is not needed.
9151	 */
9152	jremref = dotremref = dotdotremref = NULL;
9153	if (DOINGSUJ(dvp)) {
9154		if (isrmdir) {
9155			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9156			    ip->i_effnlink + 2);
9157			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9158			    ip->i_effnlink + 1);
9159			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9160			    dp->i_effnlink + 1);
9161			dotdotremref->jr_state |= MKDIR_PARENT;
9162		} else
9163			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9164			    ip->i_effnlink + 1);
9165	}
9166	ACQUIRE_LOCK(ip->i_ump);
9167	lbn = lblkno(dp->i_fs, dp->i_offset);
9168	offset = blkoff(dp->i_fs, dp->i_offset);
9169	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
9170	    &pagedep);
9171	dirrem->dm_pagedep = pagedep;
9172	dirrem->dm_offset = offset;
9173	/*
9174	 * If we're renaming a .. link to a new directory, cancel any
9175	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
9176	 * the jremref is preserved for any potential diradd in this
9177	 * location.  This can not coincide with a rmdir.
9178	 */
9179	if (dp->i_offset == DOTDOT_OFFSET) {
9180		if (isrmdir)
9181			panic("newdirrem: .. directory change during remove?");
9182		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9183	}
9184	/*
9185	 * If we're removing a directory search for the .. dependency now and
9186	 * cancel it.  Any pending journal work will be added to the dirrem
9187	 * to be completed when the workitem remove completes.
9188	 */
9189	if (isrmdir)
9190		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9191	/*
9192	 * Check for a diradd dependency for the same directory entry.
9193	 * If present, then both dependencies become obsolete and can
9194	 * be de-allocated.
9195	 */
9196	dap = diradd_lookup(pagedep, offset);
9197	if (dap == NULL) {
9198		/*
9199		 * Link the jremref structures into the dirrem so they are
9200		 * written prior to the pagedep.
9201		 */
9202		if (jremref)
9203			dirrem_journal(dirrem, jremref, dotremref,
9204			    dotdotremref);
9205		return (dirrem);
9206	}
9207	/*
9208	 * Must be ATTACHED at this point.
9209	 */
9210	if ((dap->da_state & ATTACHED) == 0)
9211		panic("newdirrem: not ATTACHED");
9212	if (dap->da_newinum != ip->i_number)
9213		panic("newdirrem: inum %ju should be %ju",
9214		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9215	/*
9216	 * If we are deleting a changed name that never made it to disk,
9217	 * then return the dirrem describing the previous inode (which
9218	 * represents the inode currently referenced from this entry on disk).
9219	 */
9220	if ((dap->da_state & DIRCHG) != 0) {
9221		*prevdirremp = dap->da_previous;
9222		dap->da_state &= ~DIRCHG;
9223		dap->da_pagedep = pagedep;
9224	}
9225	/*
9226	 * We are deleting an entry that never made it to disk.
9227	 * Mark it COMPLETE so we can delete its inode immediately.
9228	 */
9229	dirrem->dm_state |= COMPLETE;
9230	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9231#ifdef SUJ_DEBUG
9232	if (isrmdir == 0) {
9233		struct worklist *wk;
9234
9235		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9236			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9237				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9238	}
9239#endif
9240
9241	return (dirrem);
9242}
9243
9244/*
9245 * Directory entry change dependencies.
9246 *
9247 * Changing an existing directory entry requires that an add operation
9248 * be completed first followed by a deletion. The semantics for the addition
9249 * are identical to the description of adding a new entry above except
9250 * that the rollback is to the old inode number rather than zero. Once
9251 * the addition dependency is completed, the removal is done as described
9252 * in the removal routine above.
9253 */
9254
9255/*
9256 * This routine should be called immediately after changing
9257 * a directory entry.  The inode's link count should not be
9258 * decremented by the calling procedure -- the soft updates
9259 * code will perform this task when it is safe.
9260 */
9261void
9262softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9263	struct buf *bp;		/* buffer containing directory block */
9264	struct inode *dp;	/* inode for the directory being modified */
9265	struct inode *ip;	/* inode for directory entry being removed */
9266	ino_t newinum;		/* new inode number for changed entry */
9267	int isrmdir;		/* indicates if doing RMDIR */
9268{
9269	int offset;
9270	struct diradd *dap = NULL;
9271	struct dirrem *dirrem, *prevdirrem;
9272	struct pagedep *pagedep;
9273	struct inodedep *inodedep;
9274	struct jaddref *jaddref;
9275	struct mount *mp;
9276
9277	offset = blkoff(dp->i_fs, dp->i_offset);
9278	mp = UFSTOVFS(dp->i_ump);
9279	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9280	   ("softdep_setup_directory_change called on non-softdep filesystem"));
9281
9282	/*
9283	 * Whiteouts do not need diradd dependencies.
9284	 */
9285	if (newinum != WINO) {
9286		dap = malloc(sizeof(struct diradd),
9287		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9288		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9289		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9290		dap->da_offset = offset;
9291		dap->da_newinum = newinum;
9292		LIST_INIT(&dap->da_jwork);
9293	}
9294
9295	/*
9296	 * Allocate a new dirrem and ACQUIRE_LOCK.
9297	 */
9298	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9299	pagedep = dirrem->dm_pagedep;
9300	/*
9301	 * The possible values for isrmdir:
9302	 *	0 - non-directory file rename
9303	 *	1 - directory rename within same directory
9304	 *   inum - directory rename to new directory of given inode number
9305	 * When renaming to a new directory, we are both deleting and
9306	 * creating a new directory entry, so the link count on the new
9307	 * directory should not change. Thus we do not need the followup
9308	 * dirrem which is usually done in handle_workitem_remove. We set
9309	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9310	 * followup dirrem.
9311	 */
9312	if (isrmdir > 1)
9313		dirrem->dm_state |= DIRCHG;
9314
9315	/*
9316	 * Whiteouts have no additional dependencies,
9317	 * so just put the dirrem on the correct list.
9318	 */
9319	if (newinum == WINO) {
9320		if ((dirrem->dm_state & COMPLETE) == 0) {
9321			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9322			    dm_next);
9323		} else {
9324			dirrem->dm_dirinum = pagedep->pd_ino;
9325			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9326				add_to_worklist(&dirrem->dm_list, 0);
9327		}
9328		FREE_LOCK(dp->i_ump);
9329		return;
9330	}
9331	/*
9332	 * Add the dirrem to the inodedep's pending remove list for quick
9333	 * discovery later.  A valid nlinkdelta ensures that this lookup
9334	 * will not fail.
9335	 */
9336	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9337		panic("softdep_setup_directory_change: Lost inodedep.");
9338	dirrem->dm_state |= ONDEPLIST;
9339	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9340
9341	/*
9342	 * If the COMPLETE flag is clear, then there were no active
9343	 * entries and we want to roll back to the previous inode until
9344	 * the new inode is committed to disk. If the COMPLETE flag is
9345	 * set, then we have deleted an entry that never made it to disk.
9346	 * If the entry we deleted resulted from a name change, then the old
9347	 * inode reference still resides on disk. Any rollback that we do
9348	 * needs to be to that old inode (returned to us in prevdirrem). If
9349	 * the entry we deleted resulted from a create, then there is
9350	 * no entry on the disk, so we want to roll back to zero rather
9351	 * than the uncommitted inode. In either of the COMPLETE cases we
9352	 * want to immediately free the unwritten and unreferenced inode.
9353	 */
9354	if ((dirrem->dm_state & COMPLETE) == 0) {
9355		dap->da_previous = dirrem;
9356	} else {
9357		if (prevdirrem != NULL) {
9358			dap->da_previous = prevdirrem;
9359		} else {
9360			dap->da_state &= ~DIRCHG;
9361			dap->da_pagedep = pagedep;
9362		}
9363		dirrem->dm_dirinum = pagedep->pd_ino;
9364		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9365			add_to_worklist(&dirrem->dm_list, 0);
9366	}
9367	/*
9368	 * Lookup the jaddref for this journal entry.  We must finish
9369	 * initializing it and make the diradd write dependent on it.
9370	 * If we're not journaling, put it on the id_bufwait list if the
9371	 * inode is not yet written. If it is written, do the post-inode
9372	 * write processing to put it on the id_pendinghd list.
9373	 */
9374	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
9375	if (MOUNTEDSUJ(mp)) {
9376		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9377		    inoreflst);
9378		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9379		    ("softdep_setup_directory_change: bad jaddref %p",
9380		    jaddref));
9381		jaddref->ja_diroff = dp->i_offset;
9382		jaddref->ja_diradd = dap;
9383		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9384		    dap, da_pdlist);
9385		add_to_journal(&jaddref->ja_list);
9386	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9387		dap->da_state |= COMPLETE;
9388		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9389		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9390	} else {
9391		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9392		    dap, da_pdlist);
9393		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9394	}
9395	/*
9396	 * If we're making a new name for a directory that has not been
9397	 * committed when need to move the dot and dotdot references to
9398	 * this new name.
9399	 */
9400	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9401		merge_diradd(inodedep, dap);
9402	FREE_LOCK(dp->i_ump);
9403}
9404
9405/*
9406 * Called whenever the link count on an inode is changed.
9407 * It creates an inode dependency so that the new reference(s)
9408 * to the inode cannot be committed to disk until the updated
9409 * inode has been written.
9410 */
9411void
9412softdep_change_linkcnt(ip)
9413	struct inode *ip;	/* the inode with the increased link count */
9414{
9415	struct inodedep *inodedep;
9416	int dflags;
9417
9418	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
9419	    ("softdep_change_linkcnt called on non-softdep filesystem"));
9420	ACQUIRE_LOCK(ip->i_ump);
9421	dflags = DEPALLOC;
9422	if (IS_SNAPSHOT(ip))
9423		dflags |= NODELAY;
9424	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
9425	if (ip->i_nlink < ip->i_effnlink)
9426		panic("softdep_change_linkcnt: bad delta");
9427	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9428	FREE_LOCK(ip->i_ump);
9429}
9430
9431/*
9432 * Attach a sbdep dependency to the superblock buf so that we can keep
9433 * track of the head of the linked list of referenced but unlinked inodes.
9434 */
9435void
9436softdep_setup_sbupdate(ump, fs, bp)
9437	struct ufsmount *ump;
9438	struct fs *fs;
9439	struct buf *bp;
9440{
9441	struct sbdep *sbdep;
9442	struct worklist *wk;
9443
9444	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9445	    ("softdep_setup_sbupdate called on non-softdep filesystem"));
9446	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9447		if (wk->wk_type == D_SBDEP)
9448			break;
9449	if (wk != NULL)
9450		return;
9451	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9452	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9453	sbdep->sb_fs = fs;
9454	sbdep->sb_ump = ump;
9455	ACQUIRE_LOCK(ump);
9456	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9457	FREE_LOCK(ump);
9458}
9459
9460/*
9461 * Return the first unlinked inodedep which is ready to be the head of the
9462 * list.  The inodedep and all those after it must have valid next pointers.
9463 */
9464static struct inodedep *
9465first_unlinked_inodedep(ump)
9466	struct ufsmount *ump;
9467{
9468	struct inodedep *inodedep;
9469	struct inodedep *idp;
9470
9471	LOCK_OWNED(ump);
9472	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9473	    inodedep; inodedep = idp) {
9474		if ((inodedep->id_state & UNLINKNEXT) == 0)
9475			return (NULL);
9476		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9477		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9478			break;
9479		if ((inodedep->id_state & UNLINKPREV) == 0)
9480			break;
9481	}
9482	return (inodedep);
9483}
9484
9485/*
9486 * Set the sujfree unlinked head pointer prior to writing a superblock.
9487 */
9488static void
9489initiate_write_sbdep(sbdep)
9490	struct sbdep *sbdep;
9491{
9492	struct inodedep *inodedep;
9493	struct fs *bpfs;
9494	struct fs *fs;
9495
9496	bpfs = sbdep->sb_fs;
9497	fs = sbdep->sb_ump->um_fs;
9498	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9499	if (inodedep) {
9500		fs->fs_sujfree = inodedep->id_ino;
9501		inodedep->id_state |= UNLINKPREV;
9502	} else
9503		fs->fs_sujfree = 0;
9504	bpfs->fs_sujfree = fs->fs_sujfree;
9505}
9506
9507/*
9508 * After a superblock is written determine whether it must be written again
9509 * due to a changing unlinked list head.
9510 */
9511static int
9512handle_written_sbdep(sbdep, bp)
9513	struct sbdep *sbdep;
9514	struct buf *bp;
9515{
9516	struct inodedep *inodedep;
9517	struct mount *mp;
9518	struct fs *fs;
9519
9520	LOCK_OWNED(sbdep->sb_ump);
9521	fs = sbdep->sb_fs;
9522	mp = UFSTOVFS(sbdep->sb_ump);
9523	/*
9524	 * If the superblock doesn't match the in-memory list start over.
9525	 */
9526	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9527	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9528	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9529		bdirty(bp);
9530		return (1);
9531	}
9532	WORKITEM_FREE(sbdep, D_SBDEP);
9533	if (fs->fs_sujfree == 0)
9534		return (0);
9535	/*
9536	 * Now that we have a record of this inode in stable store allow it
9537	 * to be written to free up pending work.  Inodes may see a lot of
9538	 * write activity after they are unlinked which we must not hold up.
9539	 */
9540	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9541		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9542			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9543			    inodedep, inodedep->id_state);
9544		if (inodedep->id_state & UNLINKONLIST)
9545			break;
9546		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9547	}
9548
9549	return (0);
9550}
9551
9552/*
9553 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9554 */
9555static void
9556unlinked_inodedep(mp, inodedep)
9557	struct mount *mp;
9558	struct inodedep *inodedep;
9559{
9560	struct ufsmount *ump;
9561
9562	ump = VFSTOUFS(mp);
9563	LOCK_OWNED(ump);
9564	if (MOUNTEDSUJ(mp) == 0)
9565		return;
9566	ump->um_fs->fs_fmod = 1;
9567	if (inodedep->id_state & UNLINKED)
9568		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9569	inodedep->id_state |= UNLINKED;
9570	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9571}
9572
9573/*
9574 * Remove an inodedep from the unlinked inodedep list.  This may require
9575 * disk writes if the inode has made it that far.
9576 */
9577static void
9578clear_unlinked_inodedep(inodedep)
9579	struct inodedep *inodedep;
9580{
9581	struct ufsmount *ump;
9582	struct inodedep *idp;
9583	struct inodedep *idn;
9584	struct fs *fs;
9585	struct buf *bp;
9586	ino_t ino;
9587	ino_t nino;
9588	ino_t pino;
9589	int error;
9590
9591	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9592	fs = ump->um_fs;
9593	ino = inodedep->id_ino;
9594	error = 0;
9595	for (;;) {
9596		LOCK_OWNED(ump);
9597		KASSERT((inodedep->id_state & UNLINKED) != 0,
9598		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9599		    inodedep));
9600		/*
9601		 * If nothing has yet been written simply remove us from
9602		 * the in memory list and return.  This is the most common
9603		 * case where handle_workitem_remove() loses the final
9604		 * reference.
9605		 */
9606		if ((inodedep->id_state & UNLINKLINKS) == 0)
9607			break;
9608		/*
9609		 * If we have a NEXT pointer and no PREV pointer we can simply
9610		 * clear NEXT's PREV and remove ourselves from the list.  Be
9611		 * careful not to clear PREV if the superblock points at
9612		 * next as well.
9613		 */
9614		idn = TAILQ_NEXT(inodedep, id_unlinked);
9615		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9616			if (idn && fs->fs_sujfree != idn->id_ino)
9617				idn->id_state &= ~UNLINKPREV;
9618			break;
9619		}
9620		/*
9621		 * Here we have an inodedep which is actually linked into
9622		 * the list.  We must remove it by forcing a write to the
9623		 * link before us, whether it be the superblock or an inode.
9624		 * Unfortunately the list may change while we're waiting
9625		 * on the buf lock for either resource so we must loop until
9626		 * we lock the right one.  If both the superblock and an
9627		 * inode point to this inode we must clear the inode first
9628		 * followed by the superblock.
9629		 */
9630		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9631		pino = 0;
9632		if (idp && (idp->id_state & UNLINKNEXT))
9633			pino = idp->id_ino;
9634		FREE_LOCK(ump);
9635		if (pino == 0) {
9636			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9637			    (int)fs->fs_sbsize, 0, 0, 0);
9638		} else {
9639			error = bread(ump->um_devvp,
9640			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9641			    (int)fs->fs_bsize, NOCRED, &bp);
9642			if (error)
9643				brelse(bp);
9644		}
9645		ACQUIRE_LOCK(ump);
9646		if (error)
9647			break;
9648		/* If the list has changed restart the loop. */
9649		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9650		nino = 0;
9651		if (idp && (idp->id_state & UNLINKNEXT))
9652			nino = idp->id_ino;
9653		if (nino != pino ||
9654		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9655			FREE_LOCK(ump);
9656			brelse(bp);
9657			ACQUIRE_LOCK(ump);
9658			continue;
9659		}
9660		nino = 0;
9661		idn = TAILQ_NEXT(inodedep, id_unlinked);
9662		if (idn)
9663			nino = idn->id_ino;
9664		/*
9665		 * Remove us from the in memory list.  After this we cannot
9666		 * access the inodedep.
9667		 */
9668		KASSERT((inodedep->id_state & UNLINKED) != 0,
9669		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9670		    inodedep));
9671		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9672		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9673		FREE_LOCK(ump);
9674		/*
9675		 * The predecessor's next pointer is manually updated here
9676		 * so that the NEXT flag is never cleared for an element
9677		 * that is in the list.
9678		 */
9679		if (pino == 0) {
9680			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9681			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9682			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9683			    bp);
9684		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9685			((struct ufs1_dinode *)bp->b_data +
9686			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9687		else
9688			((struct ufs2_dinode *)bp->b_data +
9689			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9690		/*
9691		 * If the bwrite fails we have no recourse to recover.  The
9692		 * filesystem is corrupted already.
9693		 */
9694		bwrite(bp);
9695		ACQUIRE_LOCK(ump);
9696		/*
9697		 * If the superblock pointer still needs to be cleared force
9698		 * a write here.
9699		 */
9700		if (fs->fs_sujfree == ino) {
9701			FREE_LOCK(ump);
9702			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9703			    (int)fs->fs_sbsize, 0, 0, 0);
9704			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9705			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9706			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9707			    bp);
9708			bwrite(bp);
9709			ACQUIRE_LOCK(ump);
9710		}
9711
9712		if (fs->fs_sujfree != ino)
9713			return;
9714		panic("clear_unlinked_inodedep: Failed to clear free head");
9715	}
9716	if (inodedep->id_ino == fs->fs_sujfree)
9717		panic("clear_unlinked_inodedep: Freeing head of free list");
9718	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9719	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9720	return;
9721}
9722
9723/*
9724 * This workitem decrements the inode's link count.
9725 * If the link count reaches zero, the file is removed.
9726 */
9727static int
9728handle_workitem_remove(dirrem, flags)
9729	struct dirrem *dirrem;
9730	int flags;
9731{
9732	struct inodedep *inodedep;
9733	struct workhead dotdotwk;
9734	struct worklist *wk;
9735	struct ufsmount *ump;
9736	struct mount *mp;
9737	struct vnode *vp;
9738	struct inode *ip;
9739	ino_t oldinum;
9740
9741	if (dirrem->dm_state & ONWORKLIST)
9742		panic("handle_workitem_remove: dirrem %p still on worklist",
9743		    dirrem);
9744	oldinum = dirrem->dm_oldinum;
9745	mp = dirrem->dm_list.wk_mp;
9746	ump = VFSTOUFS(mp);
9747	flags |= LK_EXCLUSIVE;
9748	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9749		return (EBUSY);
9750	ip = VTOI(vp);
9751	ACQUIRE_LOCK(ump);
9752	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9753		panic("handle_workitem_remove: lost inodedep");
9754	if (dirrem->dm_state & ONDEPLIST)
9755		LIST_REMOVE(dirrem, dm_inonext);
9756	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9757	    ("handle_workitem_remove:  Journal entries not written."));
9758
9759	/*
9760	 * Move all dependencies waiting on the remove to complete
9761	 * from the dirrem to the inode inowait list to be completed
9762	 * after the inode has been updated and written to disk.  Any
9763	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9764	 * is removed.
9765	 */
9766	LIST_INIT(&dotdotwk);
9767	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9768		WORKLIST_REMOVE(wk);
9769		if (wk->wk_state & MKDIR_PARENT) {
9770			wk->wk_state &= ~MKDIR_PARENT;
9771			WORKLIST_INSERT(&dotdotwk, wk);
9772			continue;
9773		}
9774		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9775	}
9776	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9777	/*
9778	 * Normal file deletion.
9779	 */
9780	if ((dirrem->dm_state & RMDIR) == 0) {
9781		ip->i_nlink--;
9782		DIP_SET(ip, i_nlink, ip->i_nlink);
9783		ip->i_flag |= IN_CHANGE;
9784		if (ip->i_nlink < ip->i_effnlink)
9785			panic("handle_workitem_remove: bad file delta");
9786		if (ip->i_nlink == 0)
9787			unlinked_inodedep(mp, inodedep);
9788		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9789		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9790		    ("handle_workitem_remove: worklist not empty. %s",
9791		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9792		WORKITEM_FREE(dirrem, D_DIRREM);
9793		FREE_LOCK(ump);
9794		goto out;
9795	}
9796	/*
9797	 * Directory deletion. Decrement reference count for both the
9798	 * just deleted parent directory entry and the reference for ".".
9799	 * Arrange to have the reference count on the parent decremented
9800	 * to account for the loss of "..".
9801	 */
9802	ip->i_nlink -= 2;
9803	DIP_SET(ip, i_nlink, ip->i_nlink);
9804	ip->i_flag |= IN_CHANGE;
9805	if (ip->i_nlink < ip->i_effnlink)
9806		panic("handle_workitem_remove: bad dir delta");
9807	if (ip->i_nlink == 0)
9808		unlinked_inodedep(mp, inodedep);
9809	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9810	/*
9811	 * Rename a directory to a new parent. Since, we are both deleting
9812	 * and creating a new directory entry, the link count on the new
9813	 * directory should not change. Thus we skip the followup dirrem.
9814	 */
9815	if (dirrem->dm_state & DIRCHG) {
9816		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9817		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9818		WORKITEM_FREE(dirrem, D_DIRREM);
9819		FREE_LOCK(ump);
9820		goto out;
9821	}
9822	dirrem->dm_state = ONDEPLIST;
9823	dirrem->dm_oldinum = dirrem->dm_dirinum;
9824	/*
9825	 * Place the dirrem on the parent's diremhd list.
9826	 */
9827	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9828		panic("handle_workitem_remove: lost dir inodedep");
9829	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9830	/*
9831	 * If the allocated inode has never been written to disk, then
9832	 * the on-disk inode is zero'ed and we can remove the file
9833	 * immediately.  When journaling if the inode has been marked
9834	 * unlinked and not DEPCOMPLETE we know it can never be written.
9835	 */
9836	inodedep_lookup(mp, oldinum, 0, &inodedep);
9837	if (inodedep == NULL ||
9838	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9839	    check_inode_unwritten(inodedep)) {
9840		FREE_LOCK(ump);
9841		vput(vp);
9842		return handle_workitem_remove(dirrem, flags);
9843	}
9844	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9845	FREE_LOCK(ump);
9846	ip->i_flag |= IN_CHANGE;
9847out:
9848	ffs_update(vp, 0);
9849	vput(vp);
9850	return (0);
9851}
9852
9853/*
9854 * Inode de-allocation dependencies.
9855 *
9856 * When an inode's link count is reduced to zero, it can be de-allocated. We
9857 * found it convenient to postpone de-allocation until after the inode is
9858 * written to disk with its new link count (zero).  At this point, all of the
9859 * on-disk inode's block pointers are nullified and, with careful dependency
9860 * list ordering, all dependencies related to the inode will be satisfied and
9861 * the corresponding dependency structures de-allocated.  So, if/when the
9862 * inode is reused, there will be no mixing of old dependencies with new
9863 * ones.  This artificial dependency is set up by the block de-allocation
9864 * procedure above (softdep_setup_freeblocks) and completed by the
9865 * following procedure.
9866 */
9867static void
9868handle_workitem_freefile(freefile)
9869	struct freefile *freefile;
9870{
9871	struct workhead wkhd;
9872	struct fs *fs;
9873	struct inodedep *idp;
9874	struct ufsmount *ump;
9875	int error;
9876
9877	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9878	fs = ump->um_fs;
9879#ifdef DEBUG
9880	ACQUIRE_LOCK(ump);
9881	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9882	FREE_LOCK(ump);
9883	if (error)
9884		panic("handle_workitem_freefile: inodedep %p survived", idp);
9885#endif
9886	UFS_LOCK(ump);
9887	fs->fs_pendinginodes -= 1;
9888	UFS_UNLOCK(ump);
9889	LIST_INIT(&wkhd);
9890	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9891	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9892	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9893		softdep_error("handle_workitem_freefile", error);
9894	ACQUIRE_LOCK(ump);
9895	WORKITEM_FREE(freefile, D_FREEFILE);
9896	FREE_LOCK(ump);
9897}
9898
9899
9900/*
9901 * Helper function which unlinks marker element from work list and returns
9902 * the next element on the list.
9903 */
9904static __inline struct worklist *
9905markernext(struct worklist *marker)
9906{
9907	struct worklist *next;
9908
9909	next = LIST_NEXT(marker, wk_list);
9910	LIST_REMOVE(marker, wk_list);
9911	return next;
9912}
9913
9914/*
9915 * Disk writes.
9916 *
9917 * The dependency structures constructed above are most actively used when file
9918 * system blocks are written to disk.  No constraints are placed on when a
9919 * block can be written, but unsatisfied update dependencies are made safe by
9920 * modifying (or replacing) the source memory for the duration of the disk
9921 * write.  When the disk write completes, the memory block is again brought
9922 * up-to-date.
9923 *
9924 * In-core inode structure reclamation.
9925 *
9926 * Because there are a finite number of "in-core" inode structures, they are
9927 * reused regularly.  By transferring all inode-related dependencies to the
9928 * in-memory inode block and indexing them separately (via "inodedep"s), we
9929 * can allow "in-core" inode structures to be reused at any time and avoid
9930 * any increase in contention.
9931 *
9932 * Called just before entering the device driver to initiate a new disk I/O.
9933 * The buffer must be locked, thus, no I/O completion operations can occur
9934 * while we are manipulating its associated dependencies.
9935 */
9936static void
9937softdep_disk_io_initiation(bp)
9938	struct buf *bp;		/* structure describing disk write to occur */
9939{
9940	struct worklist *wk;
9941	struct worklist marker;
9942	struct inodedep *inodedep;
9943	struct freeblks *freeblks;
9944	struct jblkdep *jblkdep;
9945	struct newblk *newblk;
9946	struct ufsmount *ump;
9947
9948	/*
9949	 * We only care about write operations. There should never
9950	 * be dependencies for reads.
9951	 */
9952	if (bp->b_iocmd != BIO_WRITE)
9953		panic("softdep_disk_io_initiation: not write");
9954
9955	if (bp->b_vflags & BV_BKGRDINPROG)
9956		panic("softdep_disk_io_initiation: Writing buffer with "
9957		    "background write in progress: %p", bp);
9958
9959	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
9960		return;
9961	ump = VFSTOUFS(wk->wk_mp);
9962
9963	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9964	PHOLD(curproc);			/* Don't swap out kernel stack */
9965	ACQUIRE_LOCK(ump);
9966	/*
9967	 * Do any necessary pre-I/O processing.
9968	 */
9969	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9970	     wk = markernext(&marker)) {
9971		LIST_INSERT_AFTER(wk, &marker, wk_list);
9972		switch (wk->wk_type) {
9973
9974		case D_PAGEDEP:
9975			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9976			continue;
9977
9978		case D_INODEDEP:
9979			inodedep = WK_INODEDEP(wk);
9980			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9981				initiate_write_inodeblock_ufs1(inodedep, bp);
9982			else
9983				initiate_write_inodeblock_ufs2(inodedep, bp);
9984			continue;
9985
9986		case D_INDIRDEP:
9987			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9988			continue;
9989
9990		case D_BMSAFEMAP:
9991			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9992			continue;
9993
9994		case D_JSEG:
9995			WK_JSEG(wk)->js_buf = NULL;
9996			continue;
9997
9998		case D_FREEBLKS:
9999			freeblks = WK_FREEBLKS(wk);
10000			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
10001			/*
10002			 * We have to wait for the freeblks to be journaled
10003			 * before we can write an inodeblock with updated
10004			 * pointers.  Be careful to arrange the marker so
10005			 * we revisit the freeblks if it's not removed by
10006			 * the first jwait().
10007			 */
10008			if (jblkdep != NULL) {
10009				LIST_REMOVE(&marker, wk_list);
10010				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10011				jwait(&jblkdep->jb_list, MNT_WAIT);
10012			}
10013			continue;
10014		case D_ALLOCDIRECT:
10015		case D_ALLOCINDIR:
10016			/*
10017			 * We have to wait for the jnewblk to be journaled
10018			 * before we can write to a block if the contents
10019			 * may be confused with an earlier file's indirect
10020			 * at recovery time.  Handle the marker as described
10021			 * above.
10022			 */
10023			newblk = WK_NEWBLK(wk);
10024			if (newblk->nb_jnewblk != NULL &&
10025			    indirblk_lookup(newblk->nb_list.wk_mp,
10026			    newblk->nb_newblkno)) {
10027				LIST_REMOVE(&marker, wk_list);
10028				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10029				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10030			}
10031			continue;
10032
10033		case D_SBDEP:
10034			initiate_write_sbdep(WK_SBDEP(wk));
10035			continue;
10036
10037		case D_MKDIR:
10038		case D_FREEWORK:
10039		case D_FREEDEP:
10040		case D_JSEGDEP:
10041			continue;
10042
10043		default:
10044			panic("handle_disk_io_initiation: Unexpected type %s",
10045			    TYPENAME(wk->wk_type));
10046			/* NOTREACHED */
10047		}
10048	}
10049	FREE_LOCK(ump);
10050	PRELE(curproc);			/* Allow swapout of kernel stack */
10051}
10052
10053/*
10054 * Called from within the procedure above to deal with unsatisfied
10055 * allocation dependencies in a directory. The buffer must be locked,
10056 * thus, no I/O completion operations can occur while we are
10057 * manipulating its associated dependencies.
10058 */
10059static void
10060initiate_write_filepage(pagedep, bp)
10061	struct pagedep *pagedep;
10062	struct buf *bp;
10063{
10064	struct jremref *jremref;
10065	struct jmvref *jmvref;
10066	struct dirrem *dirrem;
10067	struct diradd *dap;
10068	struct direct *ep;
10069	int i;
10070
10071	if (pagedep->pd_state & IOSTARTED) {
10072		/*
10073		 * This can only happen if there is a driver that does not
10074		 * understand chaining. Here biodone will reissue the call
10075		 * to strategy for the incomplete buffers.
10076		 */
10077		printf("initiate_write_filepage: already started\n");
10078		return;
10079	}
10080	pagedep->pd_state |= IOSTARTED;
10081	/*
10082	 * Wait for all journal remove dependencies to hit the disk.
10083	 * We can not allow any potentially conflicting directory adds
10084	 * to be visible before removes and rollback is too difficult.
10085	 * The per-filesystem lock may be dropped and re-acquired, however
10086	 * we hold the buf locked so the dependency can not go away.
10087	 */
10088	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10089		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10090			jwait(&jremref->jr_list, MNT_WAIT);
10091	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10092		jwait(&jmvref->jm_list, MNT_WAIT);
10093	for (i = 0; i < DAHASHSZ; i++) {
10094		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10095			ep = (struct direct *)
10096			    ((char *)bp->b_data + dap->da_offset);
10097			if (ep->d_ino != dap->da_newinum)
10098				panic("%s: dir inum %ju != new %ju",
10099				    "initiate_write_filepage",
10100				    (uintmax_t)ep->d_ino,
10101				    (uintmax_t)dap->da_newinum);
10102			if (dap->da_state & DIRCHG)
10103				ep->d_ino = dap->da_previous->dm_oldinum;
10104			else
10105				ep->d_ino = 0;
10106			dap->da_state &= ~ATTACHED;
10107			dap->da_state |= UNDONE;
10108		}
10109	}
10110}
10111
10112/*
10113 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10114 * Note that any bug fixes made to this routine must be done in the
10115 * version found below.
10116 *
10117 * Called from within the procedure above to deal with unsatisfied
10118 * allocation dependencies in an inodeblock. The buffer must be
10119 * locked, thus, no I/O completion operations can occur while we
10120 * are manipulating its associated dependencies.
10121 */
10122static void
10123initiate_write_inodeblock_ufs1(inodedep, bp)
10124	struct inodedep *inodedep;
10125	struct buf *bp;			/* The inode block */
10126{
10127	struct allocdirect *adp, *lastadp;
10128	struct ufs1_dinode *dp;
10129	struct ufs1_dinode *sip;
10130	struct inoref *inoref;
10131	struct ufsmount *ump;
10132	struct fs *fs;
10133	ufs_lbn_t i;
10134#ifdef INVARIANTS
10135	ufs_lbn_t prevlbn = 0;
10136#endif
10137	int deplist;
10138
10139	if (inodedep->id_state & IOSTARTED)
10140		panic("initiate_write_inodeblock_ufs1: already started");
10141	inodedep->id_state |= IOSTARTED;
10142	fs = inodedep->id_fs;
10143	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10144	LOCK_OWNED(ump);
10145	dp = (struct ufs1_dinode *)bp->b_data +
10146	    ino_to_fsbo(fs, inodedep->id_ino);
10147
10148	/*
10149	 * If we're on the unlinked list but have not yet written our
10150	 * next pointer initialize it here.
10151	 */
10152	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10153		struct inodedep *inon;
10154
10155		inon = TAILQ_NEXT(inodedep, id_unlinked);
10156		dp->di_freelink = inon ? inon->id_ino : 0;
10157	}
10158	/*
10159	 * If the bitmap is not yet written, then the allocated
10160	 * inode cannot be written to disk.
10161	 */
10162	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10163		if (inodedep->id_savedino1 != NULL)
10164			panic("initiate_write_inodeblock_ufs1: I/O underway");
10165		FREE_LOCK(ump);
10166		sip = malloc(sizeof(struct ufs1_dinode),
10167		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10168		ACQUIRE_LOCK(ump);
10169		inodedep->id_savedino1 = sip;
10170		*inodedep->id_savedino1 = *dp;
10171		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10172		dp->di_gen = inodedep->id_savedino1->di_gen;
10173		dp->di_freelink = inodedep->id_savedino1->di_freelink;
10174		return;
10175	}
10176	/*
10177	 * If no dependencies, then there is nothing to roll back.
10178	 */
10179	inodedep->id_savedsize = dp->di_size;
10180	inodedep->id_savedextsize = 0;
10181	inodedep->id_savednlink = dp->di_nlink;
10182	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10183	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10184		return;
10185	/*
10186	 * Revert the link count to that of the first unwritten journal entry.
10187	 */
10188	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10189	if (inoref)
10190		dp->di_nlink = inoref->if_nlink;
10191	/*
10192	 * Set the dependencies to busy.
10193	 */
10194	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10195	     adp = TAILQ_NEXT(adp, ad_next)) {
10196#ifdef INVARIANTS
10197		if (deplist != 0 && prevlbn >= adp->ad_offset)
10198			panic("softdep_write_inodeblock: lbn order");
10199		prevlbn = adp->ad_offset;
10200		if (adp->ad_offset < NDADDR &&
10201		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10202			panic("%s: direct pointer #%jd mismatch %d != %jd",
10203			    "softdep_write_inodeblock",
10204			    (intmax_t)adp->ad_offset,
10205			    dp->di_db[adp->ad_offset],
10206			    (intmax_t)adp->ad_newblkno);
10207		if (adp->ad_offset >= NDADDR &&
10208		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10209			panic("%s: indirect pointer #%jd mismatch %d != %jd",
10210			    "softdep_write_inodeblock",
10211			    (intmax_t)adp->ad_offset - NDADDR,
10212			    dp->di_ib[adp->ad_offset - NDADDR],
10213			    (intmax_t)adp->ad_newblkno);
10214		deplist |= 1 << adp->ad_offset;
10215		if ((adp->ad_state & ATTACHED) == 0)
10216			panic("softdep_write_inodeblock: Unknown state 0x%x",
10217			    adp->ad_state);
10218#endif /* INVARIANTS */
10219		adp->ad_state &= ~ATTACHED;
10220		adp->ad_state |= UNDONE;
10221	}
10222	/*
10223	 * The on-disk inode cannot claim to be any larger than the last
10224	 * fragment that has been written. Otherwise, the on-disk inode
10225	 * might have fragments that were not the last block in the file
10226	 * which would corrupt the filesystem.
10227	 */
10228	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10229	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10230		if (adp->ad_offset >= NDADDR)
10231			break;
10232		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10233		/* keep going until hitting a rollback to a frag */
10234		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10235			continue;
10236		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10237		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10238#ifdef INVARIANTS
10239			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10240				panic("softdep_write_inodeblock: lost dep1");
10241#endif /* INVARIANTS */
10242			dp->di_db[i] = 0;
10243		}
10244		for (i = 0; i < NIADDR; i++) {
10245#ifdef INVARIANTS
10246			if (dp->di_ib[i] != 0 &&
10247			    (deplist & ((1 << NDADDR) << i)) == 0)
10248				panic("softdep_write_inodeblock: lost dep2");
10249#endif /* INVARIANTS */
10250			dp->di_ib[i] = 0;
10251		}
10252		return;
10253	}
10254	/*
10255	 * If we have zero'ed out the last allocated block of the file,
10256	 * roll back the size to the last currently allocated block.
10257	 * We know that this last allocated block is a full-sized as
10258	 * we already checked for fragments in the loop above.
10259	 */
10260	if (lastadp != NULL &&
10261	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10262		for (i = lastadp->ad_offset; i >= 0; i--)
10263			if (dp->di_db[i] != 0)
10264				break;
10265		dp->di_size = (i + 1) * fs->fs_bsize;
10266	}
10267	/*
10268	 * The only dependencies are for indirect blocks.
10269	 *
10270	 * The file size for indirect block additions is not guaranteed.
10271	 * Such a guarantee would be non-trivial to achieve. The conventional
10272	 * synchronous write implementation also does not make this guarantee.
10273	 * Fsck should catch and fix discrepancies. Arguably, the file size
10274	 * can be over-estimated without destroying integrity when the file
10275	 * moves into the indirect blocks (i.e., is large). If we want to
10276	 * postpone fsck, we are stuck with this argument.
10277	 */
10278	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10279		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10280}
10281
10282/*
10283 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10284 * Note that any bug fixes made to this routine must be done in the
10285 * version found above.
10286 *
10287 * Called from within the procedure above to deal with unsatisfied
10288 * allocation dependencies in an inodeblock. The buffer must be
10289 * locked, thus, no I/O completion operations can occur while we
10290 * are manipulating its associated dependencies.
10291 */
10292static void
10293initiate_write_inodeblock_ufs2(inodedep, bp)
10294	struct inodedep *inodedep;
10295	struct buf *bp;			/* The inode block */
10296{
10297	struct allocdirect *adp, *lastadp;
10298	struct ufs2_dinode *dp;
10299	struct ufs2_dinode *sip;
10300	struct inoref *inoref;
10301	struct ufsmount *ump;
10302	struct fs *fs;
10303	ufs_lbn_t i;
10304#ifdef INVARIANTS
10305	ufs_lbn_t prevlbn = 0;
10306#endif
10307	int deplist;
10308
10309	if (inodedep->id_state & IOSTARTED)
10310		panic("initiate_write_inodeblock_ufs2: already started");
10311	inodedep->id_state |= IOSTARTED;
10312	fs = inodedep->id_fs;
10313	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10314	LOCK_OWNED(ump);
10315	dp = (struct ufs2_dinode *)bp->b_data +
10316	    ino_to_fsbo(fs, inodedep->id_ino);
10317
10318	/*
10319	 * If we're on the unlinked list but have not yet written our
10320	 * next pointer initialize it here.
10321	 */
10322	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10323		struct inodedep *inon;
10324
10325		inon = TAILQ_NEXT(inodedep, id_unlinked);
10326		dp->di_freelink = inon ? inon->id_ino : 0;
10327	}
10328	/*
10329	 * If the bitmap is not yet written, then the allocated
10330	 * inode cannot be written to disk.
10331	 */
10332	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10333		if (inodedep->id_savedino2 != NULL)
10334			panic("initiate_write_inodeblock_ufs2: I/O underway");
10335		FREE_LOCK(ump);
10336		sip = malloc(sizeof(struct ufs2_dinode),
10337		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10338		ACQUIRE_LOCK(ump);
10339		inodedep->id_savedino2 = sip;
10340		*inodedep->id_savedino2 = *dp;
10341		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10342		dp->di_gen = inodedep->id_savedino2->di_gen;
10343		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10344		return;
10345	}
10346	/*
10347	 * If no dependencies, then there is nothing to roll back.
10348	 */
10349	inodedep->id_savedsize = dp->di_size;
10350	inodedep->id_savedextsize = dp->di_extsize;
10351	inodedep->id_savednlink = dp->di_nlink;
10352	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10353	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10354	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10355		return;
10356	/*
10357	 * Revert the link count to that of the first unwritten journal entry.
10358	 */
10359	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10360	if (inoref)
10361		dp->di_nlink = inoref->if_nlink;
10362
10363	/*
10364	 * Set the ext data dependencies to busy.
10365	 */
10366	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10367	     adp = TAILQ_NEXT(adp, ad_next)) {
10368#ifdef INVARIANTS
10369		if (deplist != 0 && prevlbn >= adp->ad_offset)
10370			panic("softdep_write_inodeblock: lbn order");
10371		prevlbn = adp->ad_offset;
10372		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10373			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10374			    "softdep_write_inodeblock",
10375			    (intmax_t)adp->ad_offset,
10376			    (intmax_t)dp->di_extb[adp->ad_offset],
10377			    (intmax_t)adp->ad_newblkno);
10378		deplist |= 1 << adp->ad_offset;
10379		if ((adp->ad_state & ATTACHED) == 0)
10380			panic("softdep_write_inodeblock: Unknown state 0x%x",
10381			    adp->ad_state);
10382#endif /* INVARIANTS */
10383		adp->ad_state &= ~ATTACHED;
10384		adp->ad_state |= UNDONE;
10385	}
10386	/*
10387	 * The on-disk inode cannot claim to be any larger than the last
10388	 * fragment that has been written. Otherwise, the on-disk inode
10389	 * might have fragments that were not the last block in the ext
10390	 * data which would corrupt the filesystem.
10391	 */
10392	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10393	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10394		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10395		/* keep going until hitting a rollback to a frag */
10396		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10397			continue;
10398		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10399		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10400#ifdef INVARIANTS
10401			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10402				panic("softdep_write_inodeblock: lost dep1");
10403#endif /* INVARIANTS */
10404			dp->di_extb[i] = 0;
10405		}
10406		lastadp = NULL;
10407		break;
10408	}
10409	/*
10410	 * If we have zero'ed out the last allocated block of the ext
10411	 * data, roll back the size to the last currently allocated block.
10412	 * We know that this last allocated block is a full-sized as
10413	 * we already checked for fragments in the loop above.
10414	 */
10415	if (lastadp != NULL &&
10416	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10417		for (i = lastadp->ad_offset; i >= 0; i--)
10418			if (dp->di_extb[i] != 0)
10419				break;
10420		dp->di_extsize = (i + 1) * fs->fs_bsize;
10421	}
10422	/*
10423	 * Set the file data dependencies to busy.
10424	 */
10425	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10426	     adp = TAILQ_NEXT(adp, ad_next)) {
10427#ifdef INVARIANTS
10428		if (deplist != 0 && prevlbn >= adp->ad_offset)
10429			panic("softdep_write_inodeblock: lbn order");
10430		if ((adp->ad_state & ATTACHED) == 0)
10431			panic("inodedep %p and adp %p not attached", inodedep, adp);
10432		prevlbn = adp->ad_offset;
10433		if (adp->ad_offset < NDADDR &&
10434		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10435			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10436			    "softdep_write_inodeblock",
10437			    (intmax_t)adp->ad_offset,
10438			    (intmax_t)dp->di_db[adp->ad_offset],
10439			    (intmax_t)adp->ad_newblkno);
10440		if (adp->ad_offset >= NDADDR &&
10441		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10442			panic("%s indirect pointer #%jd mismatch %jd != %jd",
10443			    "softdep_write_inodeblock:",
10444			    (intmax_t)adp->ad_offset - NDADDR,
10445			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10446			    (intmax_t)adp->ad_newblkno);
10447		deplist |= 1 << adp->ad_offset;
10448		if ((adp->ad_state & ATTACHED) == 0)
10449			panic("softdep_write_inodeblock: Unknown state 0x%x",
10450			    adp->ad_state);
10451#endif /* INVARIANTS */
10452		adp->ad_state &= ~ATTACHED;
10453		adp->ad_state |= UNDONE;
10454	}
10455	/*
10456	 * The on-disk inode cannot claim to be any larger than the last
10457	 * fragment that has been written. Otherwise, the on-disk inode
10458	 * might have fragments that were not the last block in the file
10459	 * which would corrupt the filesystem.
10460	 */
10461	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10462	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10463		if (adp->ad_offset >= NDADDR)
10464			break;
10465		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10466		/* keep going until hitting a rollback to a frag */
10467		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10468			continue;
10469		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10470		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10471#ifdef INVARIANTS
10472			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10473				panic("softdep_write_inodeblock: lost dep2");
10474#endif /* INVARIANTS */
10475			dp->di_db[i] = 0;
10476		}
10477		for (i = 0; i < NIADDR; i++) {
10478#ifdef INVARIANTS
10479			if (dp->di_ib[i] != 0 &&
10480			    (deplist & ((1 << NDADDR) << i)) == 0)
10481				panic("softdep_write_inodeblock: lost dep3");
10482#endif /* INVARIANTS */
10483			dp->di_ib[i] = 0;
10484		}
10485		return;
10486	}
10487	/*
10488	 * If we have zero'ed out the last allocated block of the file,
10489	 * roll back the size to the last currently allocated block.
10490	 * We know that this last allocated block is a full-sized as
10491	 * we already checked for fragments in the loop above.
10492	 */
10493	if (lastadp != NULL &&
10494	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10495		for (i = lastadp->ad_offset; i >= 0; i--)
10496			if (dp->di_db[i] != 0)
10497				break;
10498		dp->di_size = (i + 1) * fs->fs_bsize;
10499	}
10500	/*
10501	 * The only dependencies are for indirect blocks.
10502	 *
10503	 * The file size for indirect block additions is not guaranteed.
10504	 * Such a guarantee would be non-trivial to achieve. The conventional
10505	 * synchronous write implementation also does not make this guarantee.
10506	 * Fsck should catch and fix discrepancies. Arguably, the file size
10507	 * can be over-estimated without destroying integrity when the file
10508	 * moves into the indirect blocks (i.e., is large). If we want to
10509	 * postpone fsck, we are stuck with this argument.
10510	 */
10511	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10512		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10513}
10514
10515/*
10516 * Cancel an indirdep as a result of truncation.  Release all of the
10517 * children allocindirs and place their journal work on the appropriate
10518 * list.
10519 */
10520static void
10521cancel_indirdep(indirdep, bp, freeblks)
10522	struct indirdep *indirdep;
10523	struct buf *bp;
10524	struct freeblks *freeblks;
10525{
10526	struct allocindir *aip;
10527
10528	/*
10529	 * None of the indirect pointers will ever be visible,
10530	 * so they can simply be tossed. GOINGAWAY ensures
10531	 * that allocated pointers will be saved in the buffer
10532	 * cache until they are freed. Note that they will
10533	 * only be able to be found by their physical address
10534	 * since the inode mapping the logical address will
10535	 * be gone. The save buffer used for the safe copy
10536	 * was allocated in setup_allocindir_phase2 using
10537	 * the physical address so it could be used for this
10538	 * purpose. Hence we swap the safe copy with the real
10539	 * copy, allowing the safe copy to be freed and holding
10540	 * on to the real copy for later use in indir_trunc.
10541	 */
10542	if (indirdep->ir_state & GOINGAWAY)
10543		panic("cancel_indirdep: already gone");
10544	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10545		indirdep->ir_state |= DEPCOMPLETE;
10546		LIST_REMOVE(indirdep, ir_next);
10547	}
10548	indirdep->ir_state |= GOINGAWAY;
10549	/*
10550	 * Pass in bp for blocks still have journal writes
10551	 * pending so we can cancel them on their own.
10552	 */
10553	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10554		cancel_allocindir(aip, bp, freeblks, 0);
10555	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10556		cancel_allocindir(aip, NULL, freeblks, 0);
10557	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10558		cancel_allocindir(aip, NULL, freeblks, 0);
10559	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10560		cancel_allocindir(aip, NULL, freeblks, 0);
10561	/*
10562	 * If there are pending partial truncations we need to keep the
10563	 * old block copy around until they complete.  This is because
10564	 * the current b_data is not a perfect superset of the available
10565	 * blocks.
10566	 */
10567	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10568		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10569	else
10570		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10571	WORKLIST_REMOVE(&indirdep->ir_list);
10572	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10573	indirdep->ir_bp = NULL;
10574	indirdep->ir_freeblks = freeblks;
10575}
10576
10577/*
10578 * Free an indirdep once it no longer has new pointers to track.
10579 */
10580static void
10581free_indirdep(indirdep)
10582	struct indirdep *indirdep;
10583{
10584
10585	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10586	    ("free_indirdep: Indir trunc list not empty."));
10587	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10588	    ("free_indirdep: Complete head not empty."));
10589	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10590	    ("free_indirdep: write head not empty."));
10591	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10592	    ("free_indirdep: done head not empty."));
10593	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10594	    ("free_indirdep: deplist head not empty."));
10595	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10596	    ("free_indirdep: %p still on newblk list.", indirdep));
10597	KASSERT(indirdep->ir_saveddata == NULL,
10598	    ("free_indirdep: %p still has saved data.", indirdep));
10599	if (indirdep->ir_state & ONWORKLIST)
10600		WORKLIST_REMOVE(&indirdep->ir_list);
10601	WORKITEM_FREE(indirdep, D_INDIRDEP);
10602}
10603
10604/*
10605 * Called before a write to an indirdep.  This routine is responsible for
10606 * rolling back pointers to a safe state which includes only those
10607 * allocindirs which have been completed.
10608 */
10609static void
10610initiate_write_indirdep(indirdep, bp)
10611	struct indirdep *indirdep;
10612	struct buf *bp;
10613{
10614	struct ufsmount *ump;
10615
10616	indirdep->ir_state |= IOSTARTED;
10617	if (indirdep->ir_state & GOINGAWAY)
10618		panic("disk_io_initiation: indirdep gone");
10619	/*
10620	 * If there are no remaining dependencies, this will be writing
10621	 * the real pointers.
10622	 */
10623	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10624	    TAILQ_EMPTY(&indirdep->ir_trunc))
10625		return;
10626	/*
10627	 * Replace up-to-date version with safe version.
10628	 */
10629	if (indirdep->ir_saveddata == NULL) {
10630		ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10631		LOCK_OWNED(ump);
10632		FREE_LOCK(ump);
10633		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10634		    M_SOFTDEP_FLAGS);
10635		ACQUIRE_LOCK(ump);
10636	}
10637	indirdep->ir_state &= ~ATTACHED;
10638	indirdep->ir_state |= UNDONE;
10639	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10640	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10641	    bp->b_bcount);
10642}
10643
10644/*
10645 * Called when an inode has been cleared in a cg bitmap.  This finally
10646 * eliminates any canceled jaddrefs
10647 */
10648void
10649softdep_setup_inofree(mp, bp, ino, wkhd)
10650	struct mount *mp;
10651	struct buf *bp;
10652	ino_t ino;
10653	struct workhead *wkhd;
10654{
10655	struct worklist *wk, *wkn;
10656	struct inodedep *inodedep;
10657	struct ufsmount *ump;
10658	uint8_t *inosused;
10659	struct cg *cgp;
10660	struct fs *fs;
10661
10662	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10663	    ("softdep_setup_inofree called on non-softdep filesystem"));
10664	ump = VFSTOUFS(mp);
10665	ACQUIRE_LOCK(ump);
10666	fs = ump->um_fs;
10667	cgp = (struct cg *)bp->b_data;
10668	inosused = cg_inosused(cgp);
10669	if (isset(inosused, ino % fs->fs_ipg))
10670		panic("softdep_setup_inofree: inode %ju not freed.",
10671		    (uintmax_t)ino);
10672	if (inodedep_lookup(mp, ino, 0, &inodedep))
10673		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10674		    (uintmax_t)ino, inodedep);
10675	if (wkhd) {
10676		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10677			if (wk->wk_type != D_JADDREF)
10678				continue;
10679			WORKLIST_REMOVE(wk);
10680			/*
10681			 * We can free immediately even if the jaddref
10682			 * isn't attached in a background write as now
10683			 * the bitmaps are reconciled.
10684			 */
10685			wk->wk_state |= COMPLETE | ATTACHED;
10686			free_jaddref(WK_JADDREF(wk));
10687		}
10688		jwork_move(&bp->b_dep, wkhd);
10689	}
10690	FREE_LOCK(ump);
10691}
10692
10693
10694/*
10695 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10696 * map.  Any dependencies waiting for the write to clear are added to the
10697 * buf's list and any jnewblks that are being canceled are discarded
10698 * immediately.
10699 */
10700void
10701softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10702	struct mount *mp;
10703	struct buf *bp;
10704	ufs2_daddr_t blkno;
10705	int frags;
10706	struct workhead *wkhd;
10707{
10708	struct bmsafemap *bmsafemap;
10709	struct jnewblk *jnewblk;
10710	struct ufsmount *ump;
10711	struct worklist *wk;
10712	struct fs *fs;
10713#ifdef SUJ_DEBUG
10714	uint8_t *blksfree;
10715	struct cg *cgp;
10716	ufs2_daddr_t jstart;
10717	ufs2_daddr_t jend;
10718	ufs2_daddr_t end;
10719	long bno;
10720	int i;
10721#endif
10722
10723	CTR3(KTR_SUJ,
10724	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10725	    blkno, frags, wkhd);
10726
10727	ump = VFSTOUFS(mp);
10728	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10729	    ("softdep_setup_blkfree called on non-softdep filesystem"));
10730	ACQUIRE_LOCK(ump);
10731	/* Lookup the bmsafemap so we track when it is dirty. */
10732	fs = ump->um_fs;
10733	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10734	/*
10735	 * Detach any jnewblks which have been canceled.  They must linger
10736	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10737	 * an unjournaled allocation from hitting the disk.
10738	 */
10739	if (wkhd) {
10740		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10741			CTR2(KTR_SUJ,
10742			    "softdep_setup_blkfree: blkno %jd wk type %d",
10743			    blkno, wk->wk_type);
10744			WORKLIST_REMOVE(wk);
10745			if (wk->wk_type != D_JNEWBLK) {
10746				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10747				continue;
10748			}
10749			jnewblk = WK_JNEWBLK(wk);
10750			KASSERT(jnewblk->jn_state & GOINGAWAY,
10751			    ("softdep_setup_blkfree: jnewblk not canceled."));
10752#ifdef SUJ_DEBUG
10753			/*
10754			 * Assert that this block is free in the bitmap
10755			 * before we discard the jnewblk.
10756			 */
10757			cgp = (struct cg *)bp->b_data;
10758			blksfree = cg_blksfree(cgp);
10759			bno = dtogd(fs, jnewblk->jn_blkno);
10760			for (i = jnewblk->jn_oldfrags;
10761			    i < jnewblk->jn_frags; i++) {
10762				if (isset(blksfree, bno + i))
10763					continue;
10764				panic("softdep_setup_blkfree: not free");
10765			}
10766#endif
10767			/*
10768			 * Even if it's not attached we can free immediately
10769			 * as the new bitmap is correct.
10770			 */
10771			wk->wk_state |= COMPLETE | ATTACHED;
10772			free_jnewblk(jnewblk);
10773		}
10774	}
10775
10776#ifdef SUJ_DEBUG
10777	/*
10778	 * Assert that we are not freeing a block which has an outstanding
10779	 * allocation dependency.
10780	 */
10781	fs = VFSTOUFS(mp)->um_fs;
10782	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10783	end = blkno + frags;
10784	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10785		/*
10786		 * Don't match against blocks that will be freed when the
10787		 * background write is done.
10788		 */
10789		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10790		    (COMPLETE | DEPCOMPLETE))
10791			continue;
10792		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10793		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10794		if ((blkno >= jstart && blkno < jend) ||
10795		    (end > jstart && end <= jend)) {
10796			printf("state 0x%X %jd - %d %d dep %p\n",
10797			    jnewblk->jn_state, jnewblk->jn_blkno,
10798			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10799			    jnewblk->jn_dep);
10800			panic("softdep_setup_blkfree: "
10801			    "%jd-%jd(%d) overlaps with %jd-%jd",
10802			    blkno, end, frags, jstart, jend);
10803		}
10804	}
10805#endif
10806	FREE_LOCK(ump);
10807}
10808
10809/*
10810 * Revert a block allocation when the journal record that describes it
10811 * is not yet written.
10812 */
10813static int
10814jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10815	struct jnewblk *jnewblk;
10816	struct fs *fs;
10817	struct cg *cgp;
10818	uint8_t *blksfree;
10819{
10820	ufs1_daddr_t fragno;
10821	long cgbno, bbase;
10822	int frags, blk;
10823	int i;
10824
10825	frags = 0;
10826	cgbno = dtogd(fs, jnewblk->jn_blkno);
10827	/*
10828	 * We have to test which frags need to be rolled back.  We may
10829	 * be operating on a stale copy when doing background writes.
10830	 */
10831	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10832		if (isclr(blksfree, cgbno + i))
10833			frags++;
10834	if (frags == 0)
10835		return (0);
10836	/*
10837	 * This is mostly ffs_blkfree() sans some validation and
10838	 * superblock updates.
10839	 */
10840	if (frags == fs->fs_frag) {
10841		fragno = fragstoblks(fs, cgbno);
10842		ffs_setblock(fs, blksfree, fragno);
10843		ffs_clusteracct(fs, cgp, fragno, 1);
10844		cgp->cg_cs.cs_nbfree++;
10845	} else {
10846		cgbno += jnewblk->jn_oldfrags;
10847		bbase = cgbno - fragnum(fs, cgbno);
10848		/* Decrement the old frags.  */
10849		blk = blkmap(fs, blksfree, bbase);
10850		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10851		/* Deallocate the fragment */
10852		for (i = 0; i < frags; i++)
10853			setbit(blksfree, cgbno + i);
10854		cgp->cg_cs.cs_nffree += frags;
10855		/* Add back in counts associated with the new frags */
10856		blk = blkmap(fs, blksfree, bbase);
10857		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10858		/* If a complete block has been reassembled, account for it. */
10859		fragno = fragstoblks(fs, bbase);
10860		if (ffs_isblock(fs, blksfree, fragno)) {
10861			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10862			ffs_clusteracct(fs, cgp, fragno, 1);
10863			cgp->cg_cs.cs_nbfree++;
10864		}
10865	}
10866	stat_jnewblk++;
10867	jnewblk->jn_state &= ~ATTACHED;
10868	jnewblk->jn_state |= UNDONE;
10869
10870	return (frags);
10871}
10872
10873static void
10874initiate_write_bmsafemap(bmsafemap, bp)
10875	struct bmsafemap *bmsafemap;
10876	struct buf *bp;			/* The cg block. */
10877{
10878	struct jaddref *jaddref;
10879	struct jnewblk *jnewblk;
10880	uint8_t *inosused;
10881	uint8_t *blksfree;
10882	struct cg *cgp;
10883	struct fs *fs;
10884	ino_t ino;
10885
10886	if (bmsafemap->sm_state & IOSTARTED)
10887		return;
10888	bmsafemap->sm_state |= IOSTARTED;
10889	/*
10890	 * Clear any inode allocations which are pending journal writes.
10891	 */
10892	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10893		cgp = (struct cg *)bp->b_data;
10894		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10895		inosused = cg_inosused(cgp);
10896		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10897			ino = jaddref->ja_ino % fs->fs_ipg;
10898			if (isset(inosused, ino)) {
10899				if ((jaddref->ja_mode & IFMT) == IFDIR)
10900					cgp->cg_cs.cs_ndir--;
10901				cgp->cg_cs.cs_nifree++;
10902				clrbit(inosused, ino);
10903				jaddref->ja_state &= ~ATTACHED;
10904				jaddref->ja_state |= UNDONE;
10905				stat_jaddref++;
10906			} else
10907				panic("initiate_write_bmsafemap: inode %ju "
10908				    "marked free", (uintmax_t)jaddref->ja_ino);
10909		}
10910	}
10911	/*
10912	 * Clear any block allocations which are pending journal writes.
10913	 */
10914	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10915		cgp = (struct cg *)bp->b_data;
10916		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10917		blksfree = cg_blksfree(cgp);
10918		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10919			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10920				continue;
10921			panic("initiate_write_bmsafemap: block %jd "
10922			    "marked free", jnewblk->jn_blkno);
10923		}
10924	}
10925	/*
10926	 * Move allocation lists to the written lists so they can be
10927	 * cleared once the block write is complete.
10928	 */
10929	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10930	    inodedep, id_deps);
10931	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10932	    newblk, nb_deps);
10933	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10934	    wk_list);
10935}
10936
10937/*
10938 * This routine is called during the completion interrupt
10939 * service routine for a disk write (from the procedure called
10940 * by the device driver to inform the filesystem caches of
10941 * a request completion).  It should be called early in this
10942 * procedure, before the block is made available to other
10943 * processes or other routines are called.
10944 *
10945 */
10946static void
10947softdep_disk_write_complete(bp)
10948	struct buf *bp;		/* describes the completed disk write */
10949{
10950	struct worklist *wk;
10951	struct worklist *owk;
10952	struct ufsmount *ump;
10953	struct workhead reattach;
10954	struct freeblks *freeblks;
10955	struct buf *sbp;
10956
10957	/*
10958	 * If an error occurred while doing the write, then the data
10959	 * has not hit the disk and the dependencies cannot be unrolled.
10960	 */
10961	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10962		return;
10963	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
10964		return;
10965	ump = VFSTOUFS(wk->wk_mp);
10966	LIST_INIT(&reattach);
10967	/*
10968	 * This lock must not be released anywhere in this code segment.
10969	 */
10970	sbp = NULL;
10971	owk = NULL;
10972	ACQUIRE_LOCK(ump);
10973	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10974		WORKLIST_REMOVE(wk);
10975		atomic_add_long(&dep_write[wk->wk_type], 1);
10976		if (wk == owk)
10977			panic("duplicate worklist: %p\n", wk);
10978		owk = wk;
10979		switch (wk->wk_type) {
10980
10981		case D_PAGEDEP:
10982			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10983				WORKLIST_INSERT(&reattach, wk);
10984			continue;
10985
10986		case D_INODEDEP:
10987			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10988				WORKLIST_INSERT(&reattach, wk);
10989			continue;
10990
10991		case D_BMSAFEMAP:
10992			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10993				WORKLIST_INSERT(&reattach, wk);
10994			continue;
10995
10996		case D_MKDIR:
10997			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10998			continue;
10999
11000		case D_ALLOCDIRECT:
11001			wk->wk_state |= COMPLETE;
11002			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
11003			continue;
11004
11005		case D_ALLOCINDIR:
11006			wk->wk_state |= COMPLETE;
11007			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11008			continue;
11009
11010		case D_INDIRDEP:
11011			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
11012				WORKLIST_INSERT(&reattach, wk);
11013			continue;
11014
11015		case D_FREEBLKS:
11016			wk->wk_state |= COMPLETE;
11017			freeblks = WK_FREEBLKS(wk);
11018			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11019			    LIST_EMPTY(&freeblks->fb_jblkdephd))
11020				add_to_worklist(wk, WK_NODELAY);
11021			continue;
11022
11023		case D_FREEWORK:
11024			handle_written_freework(WK_FREEWORK(wk));
11025			break;
11026
11027		case D_JSEGDEP:
11028			free_jsegdep(WK_JSEGDEP(wk));
11029			continue;
11030
11031		case D_JSEG:
11032			handle_written_jseg(WK_JSEG(wk), bp);
11033			continue;
11034
11035		case D_SBDEP:
11036			if (handle_written_sbdep(WK_SBDEP(wk), bp))
11037				WORKLIST_INSERT(&reattach, wk);
11038			continue;
11039
11040		case D_FREEDEP:
11041			free_freedep(WK_FREEDEP(wk));
11042			continue;
11043
11044		default:
11045			panic("handle_disk_write_complete: Unknown type %s",
11046			    TYPENAME(wk->wk_type));
11047			/* NOTREACHED */
11048		}
11049	}
11050	/*
11051	 * Reattach any requests that must be redone.
11052	 */
11053	while ((wk = LIST_FIRST(&reattach)) != NULL) {
11054		WORKLIST_REMOVE(wk);
11055		WORKLIST_INSERT(&bp->b_dep, wk);
11056	}
11057	FREE_LOCK(ump);
11058	if (sbp)
11059		brelse(sbp);
11060}
11061
11062/*
11063 * Called from within softdep_disk_write_complete above. Note that
11064 * this routine is always called from interrupt level with further
11065 * splbio interrupts blocked.
11066 */
11067static void
11068handle_allocdirect_partdone(adp, wkhd)
11069	struct allocdirect *adp;	/* the completed allocdirect */
11070	struct workhead *wkhd;		/* Work to do when inode is writtne. */
11071{
11072	struct allocdirectlst *listhead;
11073	struct allocdirect *listadp;
11074	struct inodedep *inodedep;
11075	long bsize;
11076
11077	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11078		return;
11079	/*
11080	 * The on-disk inode cannot claim to be any larger than the last
11081	 * fragment that has been written. Otherwise, the on-disk inode
11082	 * might have fragments that were not the last block in the file
11083	 * which would corrupt the filesystem. Thus, we cannot free any
11084	 * allocdirects after one whose ad_oldblkno claims a fragment as
11085	 * these blocks must be rolled back to zero before writing the inode.
11086	 * We check the currently active set of allocdirects in id_inoupdt
11087	 * or id_extupdt as appropriate.
11088	 */
11089	inodedep = adp->ad_inodedep;
11090	bsize = inodedep->id_fs->fs_bsize;
11091	if (adp->ad_state & EXTDATA)
11092		listhead = &inodedep->id_extupdt;
11093	else
11094		listhead = &inodedep->id_inoupdt;
11095	TAILQ_FOREACH(listadp, listhead, ad_next) {
11096		/* found our block */
11097		if (listadp == adp)
11098			break;
11099		/* continue if ad_oldlbn is not a fragment */
11100		if (listadp->ad_oldsize == 0 ||
11101		    listadp->ad_oldsize == bsize)
11102			continue;
11103		/* hit a fragment */
11104		return;
11105	}
11106	/*
11107	 * If we have reached the end of the current list without
11108	 * finding the just finished dependency, then it must be
11109	 * on the future dependency list. Future dependencies cannot
11110	 * be freed until they are moved to the current list.
11111	 */
11112	if (listadp == NULL) {
11113#ifdef DEBUG
11114		if (adp->ad_state & EXTDATA)
11115			listhead = &inodedep->id_newextupdt;
11116		else
11117			listhead = &inodedep->id_newinoupdt;
11118		TAILQ_FOREACH(listadp, listhead, ad_next)
11119			/* found our block */
11120			if (listadp == adp)
11121				break;
11122		if (listadp == NULL)
11123			panic("handle_allocdirect_partdone: lost dep");
11124#endif /* DEBUG */
11125		return;
11126	}
11127	/*
11128	 * If we have found the just finished dependency, then queue
11129	 * it along with anything that follows it that is complete.
11130	 * Since the pointer has not yet been written in the inode
11131	 * as the dependency prevents it, place the allocdirect on the
11132	 * bufwait list where it will be freed once the pointer is
11133	 * valid.
11134	 */
11135	if (wkhd == NULL)
11136		wkhd = &inodedep->id_bufwait;
11137	for (; adp; adp = listadp) {
11138		listadp = TAILQ_NEXT(adp, ad_next);
11139		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11140			return;
11141		TAILQ_REMOVE(listhead, adp, ad_next);
11142		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11143	}
11144}
11145
11146/*
11147 * Called from within softdep_disk_write_complete above.  This routine
11148 * completes successfully written allocindirs.
11149 */
11150static void
11151handle_allocindir_partdone(aip)
11152	struct allocindir *aip;		/* the completed allocindir */
11153{
11154	struct indirdep *indirdep;
11155
11156	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11157		return;
11158	indirdep = aip->ai_indirdep;
11159	LIST_REMOVE(aip, ai_next);
11160	/*
11161	 * Don't set a pointer while the buffer is undergoing IO or while
11162	 * we have active truncations.
11163	 */
11164	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11165		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11166		return;
11167	}
11168	if (indirdep->ir_state & UFS1FMT)
11169		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11170		    aip->ai_newblkno;
11171	else
11172		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11173		    aip->ai_newblkno;
11174	/*
11175	 * Await the pointer write before freeing the allocindir.
11176	 */
11177	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11178}
11179
11180/*
11181 * Release segments held on a jwork list.
11182 */
11183static void
11184handle_jwork(wkhd)
11185	struct workhead *wkhd;
11186{
11187	struct worklist *wk;
11188
11189	while ((wk = LIST_FIRST(wkhd)) != NULL) {
11190		WORKLIST_REMOVE(wk);
11191		switch (wk->wk_type) {
11192		case D_JSEGDEP:
11193			free_jsegdep(WK_JSEGDEP(wk));
11194			continue;
11195		case D_FREEDEP:
11196			free_freedep(WK_FREEDEP(wk));
11197			continue;
11198		case D_FREEFRAG:
11199			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11200			WORKITEM_FREE(wk, D_FREEFRAG);
11201			continue;
11202		case D_FREEWORK:
11203			handle_written_freework(WK_FREEWORK(wk));
11204			continue;
11205		default:
11206			panic("handle_jwork: Unknown type %s\n",
11207			    TYPENAME(wk->wk_type));
11208		}
11209	}
11210}
11211
11212/*
11213 * Handle the bufwait list on an inode when it is safe to release items
11214 * held there.  This normally happens after an inode block is written but
11215 * may be delayed and handled later if there are pending journal items that
11216 * are not yet safe to be released.
11217 */
11218static struct freefile *
11219handle_bufwait(inodedep, refhd)
11220	struct inodedep *inodedep;
11221	struct workhead *refhd;
11222{
11223	struct jaddref *jaddref;
11224	struct freefile *freefile;
11225	struct worklist *wk;
11226
11227	freefile = NULL;
11228	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11229		WORKLIST_REMOVE(wk);
11230		switch (wk->wk_type) {
11231		case D_FREEFILE:
11232			/*
11233			 * We defer adding freefile to the worklist
11234			 * until all other additions have been made to
11235			 * ensure that it will be done after all the
11236			 * old blocks have been freed.
11237			 */
11238			if (freefile != NULL)
11239				panic("handle_bufwait: freefile");
11240			freefile = WK_FREEFILE(wk);
11241			continue;
11242
11243		case D_MKDIR:
11244			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11245			continue;
11246
11247		case D_DIRADD:
11248			diradd_inode_written(WK_DIRADD(wk), inodedep);
11249			continue;
11250
11251		case D_FREEFRAG:
11252			wk->wk_state |= COMPLETE;
11253			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11254				add_to_worklist(wk, 0);
11255			continue;
11256
11257		case D_DIRREM:
11258			wk->wk_state |= COMPLETE;
11259			add_to_worklist(wk, 0);
11260			continue;
11261
11262		case D_ALLOCDIRECT:
11263		case D_ALLOCINDIR:
11264			free_newblk(WK_NEWBLK(wk));
11265			continue;
11266
11267		case D_JNEWBLK:
11268			wk->wk_state |= COMPLETE;
11269			free_jnewblk(WK_JNEWBLK(wk));
11270			continue;
11271
11272		/*
11273		 * Save freed journal segments and add references on
11274		 * the supplied list which will delay their release
11275		 * until the cg bitmap is cleared on disk.
11276		 */
11277		case D_JSEGDEP:
11278			if (refhd == NULL)
11279				free_jsegdep(WK_JSEGDEP(wk));
11280			else
11281				WORKLIST_INSERT(refhd, wk);
11282			continue;
11283
11284		case D_JADDREF:
11285			jaddref = WK_JADDREF(wk);
11286			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11287			    if_deps);
11288			/*
11289			 * Transfer any jaddrefs to the list to be freed with
11290			 * the bitmap if we're handling a removed file.
11291			 */
11292			if (refhd == NULL) {
11293				wk->wk_state |= COMPLETE;
11294				free_jaddref(jaddref);
11295			} else
11296				WORKLIST_INSERT(refhd, wk);
11297			continue;
11298
11299		default:
11300			panic("handle_bufwait: Unknown type %p(%s)",
11301			    wk, TYPENAME(wk->wk_type));
11302			/* NOTREACHED */
11303		}
11304	}
11305	return (freefile);
11306}
11307/*
11308 * Called from within softdep_disk_write_complete above to restore
11309 * in-memory inode block contents to their most up-to-date state. Note
11310 * that this routine is always called from interrupt level with further
11311 * splbio interrupts blocked.
11312 */
11313static int
11314handle_written_inodeblock(inodedep, bp)
11315	struct inodedep *inodedep;
11316	struct buf *bp;		/* buffer containing the inode block */
11317{
11318	struct freefile *freefile;
11319	struct allocdirect *adp, *nextadp;
11320	struct ufs1_dinode *dp1 = NULL;
11321	struct ufs2_dinode *dp2 = NULL;
11322	struct workhead wkhd;
11323	int hadchanges, fstype;
11324	ino_t freelink;
11325
11326	LIST_INIT(&wkhd);
11327	hadchanges = 0;
11328	freefile = NULL;
11329	if ((inodedep->id_state & IOSTARTED) == 0)
11330		panic("handle_written_inodeblock: not started");
11331	inodedep->id_state &= ~IOSTARTED;
11332	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11333		fstype = UFS1;
11334		dp1 = (struct ufs1_dinode *)bp->b_data +
11335		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11336		freelink = dp1->di_freelink;
11337	} else {
11338		fstype = UFS2;
11339		dp2 = (struct ufs2_dinode *)bp->b_data +
11340		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11341		freelink = dp2->di_freelink;
11342	}
11343	/*
11344	 * Leave this inodeblock dirty until it's in the list.
11345	 */
11346	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
11347		struct inodedep *inon;
11348
11349		inon = TAILQ_NEXT(inodedep, id_unlinked);
11350		if ((inon == NULL && freelink == 0) ||
11351		    (inon && inon->id_ino == freelink)) {
11352			if (inon)
11353				inon->id_state |= UNLINKPREV;
11354			inodedep->id_state |= UNLINKNEXT;
11355		}
11356		hadchanges = 1;
11357	}
11358	/*
11359	 * If we had to rollback the inode allocation because of
11360	 * bitmaps being incomplete, then simply restore it.
11361	 * Keep the block dirty so that it will not be reclaimed until
11362	 * all associated dependencies have been cleared and the
11363	 * corresponding updates written to disk.
11364	 */
11365	if (inodedep->id_savedino1 != NULL) {
11366		hadchanges = 1;
11367		if (fstype == UFS1)
11368			*dp1 = *inodedep->id_savedino1;
11369		else
11370			*dp2 = *inodedep->id_savedino2;
11371		free(inodedep->id_savedino1, M_SAVEDINO);
11372		inodedep->id_savedino1 = NULL;
11373		if ((bp->b_flags & B_DELWRI) == 0)
11374			stat_inode_bitmap++;
11375		bdirty(bp);
11376		/*
11377		 * If the inode is clear here and GOINGAWAY it will never
11378		 * be written.  Process the bufwait and clear any pending
11379		 * work which may include the freefile.
11380		 */
11381		if (inodedep->id_state & GOINGAWAY)
11382			goto bufwait;
11383		return (1);
11384	}
11385	inodedep->id_state |= COMPLETE;
11386	/*
11387	 * Roll forward anything that had to be rolled back before
11388	 * the inode could be updated.
11389	 */
11390	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11391		nextadp = TAILQ_NEXT(adp, ad_next);
11392		if (adp->ad_state & ATTACHED)
11393			panic("handle_written_inodeblock: new entry");
11394		if (fstype == UFS1) {
11395			if (adp->ad_offset < NDADDR) {
11396				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11397					panic("%s %s #%jd mismatch %d != %jd",
11398					    "handle_written_inodeblock:",
11399					    "direct pointer",
11400					    (intmax_t)adp->ad_offset,
11401					    dp1->di_db[adp->ad_offset],
11402					    (intmax_t)adp->ad_oldblkno);
11403				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11404			} else {
11405				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11406					panic("%s: %s #%jd allocated as %d",
11407					    "handle_written_inodeblock",
11408					    "indirect pointer",
11409					    (intmax_t)adp->ad_offset - NDADDR,
11410					    dp1->di_ib[adp->ad_offset - NDADDR]);
11411				dp1->di_ib[adp->ad_offset - NDADDR] =
11412				    adp->ad_newblkno;
11413			}
11414		} else {
11415			if (adp->ad_offset < NDADDR) {
11416				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11417					panic("%s: %s #%jd %s %jd != %jd",
11418					    "handle_written_inodeblock",
11419					    "direct pointer",
11420					    (intmax_t)adp->ad_offset, "mismatch",
11421					    (intmax_t)dp2->di_db[adp->ad_offset],
11422					    (intmax_t)adp->ad_oldblkno);
11423				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11424			} else {
11425				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11426					panic("%s: %s #%jd allocated as %jd",
11427					    "handle_written_inodeblock",
11428					    "indirect pointer",
11429					    (intmax_t)adp->ad_offset - NDADDR,
11430					    (intmax_t)
11431					    dp2->di_ib[adp->ad_offset - NDADDR]);
11432				dp2->di_ib[adp->ad_offset - NDADDR] =
11433				    adp->ad_newblkno;
11434			}
11435		}
11436		adp->ad_state &= ~UNDONE;
11437		adp->ad_state |= ATTACHED;
11438		hadchanges = 1;
11439	}
11440	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11441		nextadp = TAILQ_NEXT(adp, ad_next);
11442		if (adp->ad_state & ATTACHED)
11443			panic("handle_written_inodeblock: new entry");
11444		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11445			panic("%s: direct pointers #%jd %s %jd != %jd",
11446			    "handle_written_inodeblock",
11447			    (intmax_t)adp->ad_offset, "mismatch",
11448			    (intmax_t)dp2->di_extb[adp->ad_offset],
11449			    (intmax_t)adp->ad_oldblkno);
11450		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11451		adp->ad_state &= ~UNDONE;
11452		adp->ad_state |= ATTACHED;
11453		hadchanges = 1;
11454	}
11455	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11456		stat_direct_blk_ptrs++;
11457	/*
11458	 * Reset the file size to its most up-to-date value.
11459	 */
11460	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11461		panic("handle_written_inodeblock: bad size");
11462	if (inodedep->id_savednlink > LINK_MAX)
11463		panic("handle_written_inodeblock: Invalid link count "
11464		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11465	if (fstype == UFS1) {
11466		if (dp1->di_nlink != inodedep->id_savednlink) {
11467			dp1->di_nlink = inodedep->id_savednlink;
11468			hadchanges = 1;
11469		}
11470		if (dp1->di_size != inodedep->id_savedsize) {
11471			dp1->di_size = inodedep->id_savedsize;
11472			hadchanges = 1;
11473		}
11474	} else {
11475		if (dp2->di_nlink != inodedep->id_savednlink) {
11476			dp2->di_nlink = inodedep->id_savednlink;
11477			hadchanges = 1;
11478		}
11479		if (dp2->di_size != inodedep->id_savedsize) {
11480			dp2->di_size = inodedep->id_savedsize;
11481			hadchanges = 1;
11482		}
11483		if (dp2->di_extsize != inodedep->id_savedextsize) {
11484			dp2->di_extsize = inodedep->id_savedextsize;
11485			hadchanges = 1;
11486		}
11487	}
11488	inodedep->id_savedsize = -1;
11489	inodedep->id_savedextsize = -1;
11490	inodedep->id_savednlink = -1;
11491	/*
11492	 * If there were any rollbacks in the inode block, then it must be
11493	 * marked dirty so that its will eventually get written back in
11494	 * its correct form.
11495	 */
11496	if (hadchanges)
11497		bdirty(bp);
11498bufwait:
11499	/*
11500	 * Process any allocdirects that completed during the update.
11501	 */
11502	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11503		handle_allocdirect_partdone(adp, &wkhd);
11504	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11505		handle_allocdirect_partdone(adp, &wkhd);
11506	/*
11507	 * Process deallocations that were held pending until the
11508	 * inode had been written to disk. Freeing of the inode
11509	 * is delayed until after all blocks have been freed to
11510	 * avoid creation of new <vfsid, inum, lbn> triples
11511	 * before the old ones have been deleted.  Completely
11512	 * unlinked inodes are not processed until the unlinked
11513	 * inode list is written or the last reference is removed.
11514	 */
11515	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11516		freefile = handle_bufwait(inodedep, NULL);
11517		if (freefile && !LIST_EMPTY(&wkhd)) {
11518			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11519			freefile = NULL;
11520		}
11521	}
11522	/*
11523	 * Move rolled forward dependency completions to the bufwait list
11524	 * now that those that were already written have been processed.
11525	 */
11526	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11527		panic("handle_written_inodeblock: bufwait but no changes");
11528	jwork_move(&inodedep->id_bufwait, &wkhd);
11529
11530	if (freefile != NULL) {
11531		/*
11532		 * If the inode is goingaway it was never written.  Fake up
11533		 * the state here so free_inodedep() can succeed.
11534		 */
11535		if (inodedep->id_state & GOINGAWAY)
11536			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11537		if (free_inodedep(inodedep) == 0)
11538			panic("handle_written_inodeblock: live inodedep %p",
11539			    inodedep);
11540		add_to_worklist(&freefile->fx_list, 0);
11541		return (0);
11542	}
11543
11544	/*
11545	 * If no outstanding dependencies, free it.
11546	 */
11547	if (free_inodedep(inodedep) ||
11548	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11549	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11550	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11551	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11552		return (0);
11553	return (hadchanges);
11554}
11555
11556static int
11557handle_written_indirdep(indirdep, bp, bpp)
11558	struct indirdep *indirdep;
11559	struct buf *bp;
11560	struct buf **bpp;
11561{
11562	struct allocindir *aip;
11563	struct buf *sbp;
11564	int chgs;
11565
11566	if (indirdep->ir_state & GOINGAWAY)
11567		panic("handle_written_indirdep: indirdep gone");
11568	if ((indirdep->ir_state & IOSTARTED) == 0)
11569		panic("handle_written_indirdep: IO not started");
11570	chgs = 0;
11571	/*
11572	 * If there were rollbacks revert them here.
11573	 */
11574	if (indirdep->ir_saveddata) {
11575		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11576		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11577			free(indirdep->ir_saveddata, M_INDIRDEP);
11578			indirdep->ir_saveddata = NULL;
11579		}
11580		chgs = 1;
11581	}
11582	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11583	indirdep->ir_state |= ATTACHED;
11584	/*
11585	 * Move allocindirs with written pointers to the completehd if
11586	 * the indirdep's pointer is not yet written.  Otherwise
11587	 * free them here.
11588	 */
11589	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11590		LIST_REMOVE(aip, ai_next);
11591		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11592			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11593			    ai_next);
11594			newblk_freefrag(&aip->ai_block);
11595			continue;
11596		}
11597		free_newblk(&aip->ai_block);
11598	}
11599	/*
11600	 * Move allocindirs that have finished dependency processing from
11601	 * the done list to the write list after updating the pointers.
11602	 */
11603	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11604		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11605			handle_allocindir_partdone(aip);
11606			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11607				panic("disk_write_complete: not gone");
11608			chgs = 1;
11609		}
11610	}
11611	/*
11612	 * Preserve the indirdep if there were any changes or if it is not
11613	 * yet valid on disk.
11614	 */
11615	if (chgs) {
11616		stat_indir_blk_ptrs++;
11617		bdirty(bp);
11618		return (1);
11619	}
11620	/*
11621	 * If there were no changes we can discard the savedbp and detach
11622	 * ourselves from the buf.  We are only carrying completed pointers
11623	 * in this case.
11624	 */
11625	sbp = indirdep->ir_savebp;
11626	sbp->b_flags |= B_INVAL | B_NOCACHE;
11627	indirdep->ir_savebp = NULL;
11628	indirdep->ir_bp = NULL;
11629	if (*bpp != NULL)
11630		panic("handle_written_indirdep: bp already exists.");
11631	*bpp = sbp;
11632	/*
11633	 * The indirdep may not be freed until its parent points at it.
11634	 */
11635	if (indirdep->ir_state & DEPCOMPLETE)
11636		free_indirdep(indirdep);
11637
11638	return (0);
11639}
11640
11641/*
11642 * Process a diradd entry after its dependent inode has been written.
11643 * This routine must be called with splbio interrupts blocked.
11644 */
11645static void
11646diradd_inode_written(dap, inodedep)
11647	struct diradd *dap;
11648	struct inodedep *inodedep;
11649{
11650
11651	dap->da_state |= COMPLETE;
11652	complete_diradd(dap);
11653	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11654}
11655
11656/*
11657 * Returns true if the bmsafemap will have rollbacks when written.  Must only
11658 * be called with the per-filesystem lock and the buf lock on the cg held.
11659 */
11660static int
11661bmsafemap_backgroundwrite(bmsafemap, bp)
11662	struct bmsafemap *bmsafemap;
11663	struct buf *bp;
11664{
11665	int dirty;
11666
11667	LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11668	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11669	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11670	/*
11671	 * If we're initiating a background write we need to process the
11672	 * rollbacks as they exist now, not as they exist when IO starts.
11673	 * No other consumers will look at the contents of the shadowed
11674	 * buf so this is safe to do here.
11675	 */
11676	if (bp->b_xflags & BX_BKGRDMARKER)
11677		initiate_write_bmsafemap(bmsafemap, bp);
11678
11679	return (dirty);
11680}
11681
11682/*
11683 * Re-apply an allocation when a cg write is complete.
11684 */
11685static int
11686jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11687	struct jnewblk *jnewblk;
11688	struct fs *fs;
11689	struct cg *cgp;
11690	uint8_t *blksfree;
11691{
11692	ufs1_daddr_t fragno;
11693	ufs2_daddr_t blkno;
11694	long cgbno, bbase;
11695	int frags, blk;
11696	int i;
11697
11698	frags = 0;
11699	cgbno = dtogd(fs, jnewblk->jn_blkno);
11700	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11701		if (isclr(blksfree, cgbno + i))
11702			panic("jnewblk_rollforward: re-allocated fragment");
11703		frags++;
11704	}
11705	if (frags == fs->fs_frag) {
11706		blkno = fragstoblks(fs, cgbno);
11707		ffs_clrblock(fs, blksfree, (long)blkno);
11708		ffs_clusteracct(fs, cgp, blkno, -1);
11709		cgp->cg_cs.cs_nbfree--;
11710	} else {
11711		bbase = cgbno - fragnum(fs, cgbno);
11712		cgbno += jnewblk->jn_oldfrags;
11713                /* If a complete block had been reassembled, account for it. */
11714		fragno = fragstoblks(fs, bbase);
11715		if (ffs_isblock(fs, blksfree, fragno)) {
11716			cgp->cg_cs.cs_nffree += fs->fs_frag;
11717			ffs_clusteracct(fs, cgp, fragno, -1);
11718			cgp->cg_cs.cs_nbfree--;
11719		}
11720		/* Decrement the old frags.  */
11721		blk = blkmap(fs, blksfree, bbase);
11722		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11723		/* Allocate the fragment */
11724		for (i = 0; i < frags; i++)
11725			clrbit(blksfree, cgbno + i);
11726		cgp->cg_cs.cs_nffree -= frags;
11727		/* Add back in counts associated with the new frags */
11728		blk = blkmap(fs, blksfree, bbase);
11729		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11730	}
11731	return (frags);
11732}
11733
11734/*
11735 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11736 * changes if it's not a background write.  Set all written dependencies
11737 * to DEPCOMPLETE and free the structure if possible.
11738 */
11739static int
11740handle_written_bmsafemap(bmsafemap, bp)
11741	struct bmsafemap *bmsafemap;
11742	struct buf *bp;
11743{
11744	struct newblk *newblk;
11745	struct inodedep *inodedep;
11746	struct jaddref *jaddref, *jatmp;
11747	struct jnewblk *jnewblk, *jntmp;
11748	struct ufsmount *ump;
11749	uint8_t *inosused;
11750	uint8_t *blksfree;
11751	struct cg *cgp;
11752	struct fs *fs;
11753	ino_t ino;
11754	int foreground;
11755	int chgs;
11756
11757	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11758		panic("initiate_write_bmsafemap: Not started\n");
11759	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11760	chgs = 0;
11761	bmsafemap->sm_state &= ~IOSTARTED;
11762	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11763	/*
11764	 * Release journal work that was waiting on the write.
11765	 */
11766	handle_jwork(&bmsafemap->sm_freewr);
11767
11768	/*
11769	 * Restore unwritten inode allocation pending jaddref writes.
11770	 */
11771	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11772		cgp = (struct cg *)bp->b_data;
11773		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11774		inosused = cg_inosused(cgp);
11775		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11776		    ja_bmdeps, jatmp) {
11777			if ((jaddref->ja_state & UNDONE) == 0)
11778				continue;
11779			ino = jaddref->ja_ino % fs->fs_ipg;
11780			if (isset(inosused, ino))
11781				panic("handle_written_bmsafemap: "
11782				    "re-allocated inode");
11783			/* Do the roll-forward only if it's a real copy. */
11784			if (foreground) {
11785				if ((jaddref->ja_mode & IFMT) == IFDIR)
11786					cgp->cg_cs.cs_ndir++;
11787				cgp->cg_cs.cs_nifree--;
11788				setbit(inosused, ino);
11789				chgs = 1;
11790			}
11791			jaddref->ja_state &= ~UNDONE;
11792			jaddref->ja_state |= ATTACHED;
11793			free_jaddref(jaddref);
11794		}
11795	}
11796	/*
11797	 * Restore any block allocations which are pending journal writes.
11798	 */
11799	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11800		cgp = (struct cg *)bp->b_data;
11801		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11802		blksfree = cg_blksfree(cgp);
11803		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11804		    jntmp) {
11805			if ((jnewblk->jn_state & UNDONE) == 0)
11806				continue;
11807			/* Do the roll-forward only if it's a real copy. */
11808			if (foreground &&
11809			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11810				chgs = 1;
11811			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11812			jnewblk->jn_state |= ATTACHED;
11813			free_jnewblk(jnewblk);
11814		}
11815	}
11816	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11817		newblk->nb_state |= DEPCOMPLETE;
11818		newblk->nb_state &= ~ONDEPLIST;
11819		newblk->nb_bmsafemap = NULL;
11820		LIST_REMOVE(newblk, nb_deps);
11821		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11822			handle_allocdirect_partdone(
11823			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11824		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11825			handle_allocindir_partdone(
11826			    WK_ALLOCINDIR(&newblk->nb_list));
11827		else if (newblk->nb_list.wk_type != D_NEWBLK)
11828			panic("handle_written_bmsafemap: Unexpected type: %s",
11829			    TYPENAME(newblk->nb_list.wk_type));
11830	}
11831	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11832		inodedep->id_state |= DEPCOMPLETE;
11833		inodedep->id_state &= ~ONDEPLIST;
11834		LIST_REMOVE(inodedep, id_deps);
11835		inodedep->id_bmsafemap = NULL;
11836	}
11837	LIST_REMOVE(bmsafemap, sm_next);
11838	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11839	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11840	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11841	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11842	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11843		LIST_REMOVE(bmsafemap, sm_hash);
11844		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11845		return (0);
11846	}
11847	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11848	if (foreground)
11849		bdirty(bp);
11850	return (1);
11851}
11852
11853/*
11854 * Try to free a mkdir dependency.
11855 */
11856static void
11857complete_mkdir(mkdir)
11858	struct mkdir *mkdir;
11859{
11860	struct diradd *dap;
11861
11862	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11863		return;
11864	LIST_REMOVE(mkdir, md_mkdirs);
11865	dap = mkdir->md_diradd;
11866	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11867	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11868		dap->da_state |= DEPCOMPLETE;
11869		complete_diradd(dap);
11870	}
11871	WORKITEM_FREE(mkdir, D_MKDIR);
11872}
11873
11874/*
11875 * Handle the completion of a mkdir dependency.
11876 */
11877static void
11878handle_written_mkdir(mkdir, type)
11879	struct mkdir *mkdir;
11880	int type;
11881{
11882
11883	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11884		panic("handle_written_mkdir: bad type");
11885	mkdir->md_state |= COMPLETE;
11886	complete_mkdir(mkdir);
11887}
11888
11889static int
11890free_pagedep(pagedep)
11891	struct pagedep *pagedep;
11892{
11893	int i;
11894
11895	if (pagedep->pd_state & NEWBLOCK)
11896		return (0);
11897	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11898		return (0);
11899	for (i = 0; i < DAHASHSZ; i++)
11900		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11901			return (0);
11902	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11903		return (0);
11904	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11905		return (0);
11906	if (pagedep->pd_state & ONWORKLIST)
11907		WORKLIST_REMOVE(&pagedep->pd_list);
11908	LIST_REMOVE(pagedep, pd_hash);
11909	WORKITEM_FREE(pagedep, D_PAGEDEP);
11910
11911	return (1);
11912}
11913
11914/*
11915 * Called from within softdep_disk_write_complete above.
11916 * A write operation was just completed. Removed inodes can
11917 * now be freed and associated block pointers may be committed.
11918 * Note that this routine is always called from interrupt level
11919 * with further splbio interrupts blocked.
11920 */
11921static int
11922handle_written_filepage(pagedep, bp)
11923	struct pagedep *pagedep;
11924	struct buf *bp;		/* buffer containing the written page */
11925{
11926	struct dirrem *dirrem;
11927	struct diradd *dap, *nextdap;
11928	struct direct *ep;
11929	int i, chgs;
11930
11931	if ((pagedep->pd_state & IOSTARTED) == 0)
11932		panic("handle_written_filepage: not started");
11933	pagedep->pd_state &= ~IOSTARTED;
11934	/*
11935	 * Process any directory removals that have been committed.
11936	 */
11937	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11938		LIST_REMOVE(dirrem, dm_next);
11939		dirrem->dm_state |= COMPLETE;
11940		dirrem->dm_dirinum = pagedep->pd_ino;
11941		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11942		    ("handle_written_filepage: Journal entries not written."));
11943		add_to_worklist(&dirrem->dm_list, 0);
11944	}
11945	/*
11946	 * Free any directory additions that have been committed.
11947	 * If it is a newly allocated block, we have to wait until
11948	 * the on-disk directory inode claims the new block.
11949	 */
11950	if ((pagedep->pd_state & NEWBLOCK) == 0)
11951		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11952			free_diradd(dap, NULL);
11953	/*
11954	 * Uncommitted directory entries must be restored.
11955	 */
11956	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11957		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11958		     dap = nextdap) {
11959			nextdap = LIST_NEXT(dap, da_pdlist);
11960			if (dap->da_state & ATTACHED)
11961				panic("handle_written_filepage: attached");
11962			ep = (struct direct *)
11963			    ((char *)bp->b_data + dap->da_offset);
11964			ep->d_ino = dap->da_newinum;
11965			dap->da_state &= ~UNDONE;
11966			dap->da_state |= ATTACHED;
11967			chgs = 1;
11968			/*
11969			 * If the inode referenced by the directory has
11970			 * been written out, then the dependency can be
11971			 * moved to the pending list.
11972			 */
11973			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11974				LIST_REMOVE(dap, da_pdlist);
11975				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11976				    da_pdlist);
11977			}
11978		}
11979	}
11980	/*
11981	 * If there were any rollbacks in the directory, then it must be
11982	 * marked dirty so that its will eventually get written back in
11983	 * its correct form.
11984	 */
11985	if (chgs) {
11986		if ((bp->b_flags & B_DELWRI) == 0)
11987			stat_dir_entry++;
11988		bdirty(bp);
11989		return (1);
11990	}
11991	/*
11992	 * If we are not waiting for a new directory block to be
11993	 * claimed by its inode, then the pagedep will be freed.
11994	 * Otherwise it will remain to track any new entries on
11995	 * the page in case they are fsync'ed.
11996	 */
11997	free_pagedep(pagedep);
11998	return (0);
11999}
12000
12001/*
12002 * Writing back in-core inode structures.
12003 *
12004 * The filesystem only accesses an inode's contents when it occupies an
12005 * "in-core" inode structure.  These "in-core" structures are separate from
12006 * the page frames used to cache inode blocks.  Only the latter are
12007 * transferred to/from the disk.  So, when the updated contents of the
12008 * "in-core" inode structure are copied to the corresponding in-memory inode
12009 * block, the dependencies are also transferred.  The following procedure is
12010 * called when copying a dirty "in-core" inode to a cached inode block.
12011 */
12012
12013/*
12014 * Called when an inode is loaded from disk. If the effective link count
12015 * differed from the actual link count when it was last flushed, then we
12016 * need to ensure that the correct effective link count is put back.
12017 */
12018void
12019softdep_load_inodeblock(ip)
12020	struct inode *ip;	/* the "in_core" copy of the inode */
12021{
12022	struct inodedep *inodedep;
12023
12024	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
12025	    ("softdep_load_inodeblock called on non-softdep filesystem"));
12026	/*
12027	 * Check for alternate nlink count.
12028	 */
12029	ip->i_effnlink = ip->i_nlink;
12030	ACQUIRE_LOCK(ip->i_ump);
12031	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
12032	    &inodedep) == 0) {
12033		FREE_LOCK(ip->i_ump);
12034		return;
12035	}
12036	ip->i_effnlink -= inodedep->id_nlinkdelta;
12037	FREE_LOCK(ip->i_ump);
12038}
12039
12040/*
12041 * This routine is called just before the "in-core" inode
12042 * information is to be copied to the in-memory inode block.
12043 * Recall that an inode block contains several inodes. If
12044 * the force flag is set, then the dependencies will be
12045 * cleared so that the update can always be made. Note that
12046 * the buffer is locked when this routine is called, so we
12047 * will never be in the middle of writing the inode block
12048 * to disk.
12049 */
12050void
12051softdep_update_inodeblock(ip, bp, waitfor)
12052	struct inode *ip;	/* the "in_core" copy of the inode */
12053	struct buf *bp;		/* the buffer containing the inode block */
12054	int waitfor;		/* nonzero => update must be allowed */
12055{
12056	struct inodedep *inodedep;
12057	struct inoref *inoref;
12058	struct ufsmount *ump;
12059	struct worklist *wk;
12060	struct mount *mp;
12061	struct buf *ibp;
12062	struct fs *fs;
12063	int error;
12064
12065	ump = ip->i_ump;
12066	mp = UFSTOVFS(ump);
12067	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12068	    ("softdep_update_inodeblock called on non-softdep filesystem"));
12069	fs = ip->i_fs;
12070	/*
12071	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12072	 * does not have access to the in-core ip so must write directly into
12073	 * the inode block buffer when setting freelink.
12074	 */
12075	if (fs->fs_magic == FS_UFS1_MAGIC)
12076		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12077		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12078	else
12079		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12080		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12081	/*
12082	 * If the effective link count is not equal to the actual link
12083	 * count, then we must track the difference in an inodedep while
12084	 * the inode is (potentially) tossed out of the cache. Otherwise,
12085	 * if there is no existing inodedep, then there are no dependencies
12086	 * to track.
12087	 */
12088	ACQUIRE_LOCK(ump);
12089again:
12090	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12091		FREE_LOCK(ump);
12092		if (ip->i_effnlink != ip->i_nlink)
12093			panic("softdep_update_inodeblock: bad link count");
12094		return;
12095	}
12096	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12097		panic("softdep_update_inodeblock: bad delta");
12098	/*
12099	 * If we're flushing all dependencies we must also move any waiting
12100	 * for journal writes onto the bufwait list prior to I/O.
12101	 */
12102	if (waitfor) {
12103		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12104			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12105			    == DEPCOMPLETE) {
12106				jwait(&inoref->if_list, MNT_WAIT);
12107				goto again;
12108			}
12109		}
12110	}
12111	/*
12112	 * Changes have been initiated. Anything depending on these
12113	 * changes cannot occur until this inode has been written.
12114	 */
12115	inodedep->id_state &= ~COMPLETE;
12116	if ((inodedep->id_state & ONWORKLIST) == 0)
12117		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12118	/*
12119	 * Any new dependencies associated with the incore inode must
12120	 * now be moved to the list associated with the buffer holding
12121	 * the in-memory copy of the inode. Once merged process any
12122	 * allocdirects that are completed by the merger.
12123	 */
12124	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12125	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12126		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12127		    NULL);
12128	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12129	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12130		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12131		    NULL);
12132	/*
12133	 * Now that the inode has been pushed into the buffer, the
12134	 * operations dependent on the inode being written to disk
12135	 * can be moved to the id_bufwait so that they will be
12136	 * processed when the buffer I/O completes.
12137	 */
12138	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12139		WORKLIST_REMOVE(wk);
12140		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12141	}
12142	/*
12143	 * Newly allocated inodes cannot be written until the bitmap
12144	 * that allocates them have been written (indicated by
12145	 * DEPCOMPLETE being set in id_state). If we are doing a
12146	 * forced sync (e.g., an fsync on a file), we force the bitmap
12147	 * to be written so that the update can be done.
12148	 */
12149	if (waitfor == 0) {
12150		FREE_LOCK(ump);
12151		return;
12152	}
12153retry:
12154	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12155		FREE_LOCK(ump);
12156		return;
12157	}
12158	ibp = inodedep->id_bmsafemap->sm_buf;
12159	ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12160	if (ibp == NULL) {
12161		/*
12162		 * If ibp came back as NULL, the dependency could have been
12163		 * freed while we slept.  Look it up again, and check to see
12164		 * that it has completed.
12165		 */
12166		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12167			goto retry;
12168		FREE_LOCK(ump);
12169		return;
12170	}
12171	FREE_LOCK(ump);
12172	if ((error = bwrite(ibp)) != 0)
12173		softdep_error("softdep_update_inodeblock: bwrite", error);
12174}
12175
12176/*
12177 * Merge the a new inode dependency list (such as id_newinoupdt) into an
12178 * old inode dependency list (such as id_inoupdt). This routine must be
12179 * called with splbio interrupts blocked.
12180 */
12181static void
12182merge_inode_lists(newlisthead, oldlisthead)
12183	struct allocdirectlst *newlisthead;
12184	struct allocdirectlst *oldlisthead;
12185{
12186	struct allocdirect *listadp, *newadp;
12187
12188	newadp = TAILQ_FIRST(newlisthead);
12189	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12190		if (listadp->ad_offset < newadp->ad_offset) {
12191			listadp = TAILQ_NEXT(listadp, ad_next);
12192			continue;
12193		}
12194		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12195		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12196		if (listadp->ad_offset == newadp->ad_offset) {
12197			allocdirect_merge(oldlisthead, newadp,
12198			    listadp);
12199			listadp = newadp;
12200		}
12201		newadp = TAILQ_FIRST(newlisthead);
12202	}
12203	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12204		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12205		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12206	}
12207}
12208
12209/*
12210 * If we are doing an fsync, then we must ensure that any directory
12211 * entries for the inode have been written after the inode gets to disk.
12212 */
12213int
12214softdep_fsync(vp)
12215	struct vnode *vp;	/* the "in_core" copy of the inode */
12216{
12217	struct inodedep *inodedep;
12218	struct pagedep *pagedep;
12219	struct inoref *inoref;
12220	struct ufsmount *ump;
12221	struct worklist *wk;
12222	struct diradd *dap;
12223	struct mount *mp;
12224	struct vnode *pvp;
12225	struct inode *ip;
12226	struct buf *bp;
12227	struct fs *fs;
12228	struct thread *td = curthread;
12229	int error, flushparent, pagedep_new_block;
12230	ino_t parentino;
12231	ufs_lbn_t lbn;
12232
12233	ip = VTOI(vp);
12234	fs = ip->i_fs;
12235	ump = ip->i_ump;
12236	mp = vp->v_mount;
12237	if (MOUNTEDSOFTDEP(mp) == 0)
12238		return (0);
12239	ACQUIRE_LOCK(ump);
12240restart:
12241	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12242		FREE_LOCK(ump);
12243		return (0);
12244	}
12245	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12246		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12247		    == DEPCOMPLETE) {
12248			jwait(&inoref->if_list, MNT_WAIT);
12249			goto restart;
12250		}
12251	}
12252	if (!LIST_EMPTY(&inodedep->id_inowait) ||
12253	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12254	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12255	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12256	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12257		panic("softdep_fsync: pending ops %p", inodedep);
12258	for (error = 0, flushparent = 0; ; ) {
12259		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12260			break;
12261		if (wk->wk_type != D_DIRADD)
12262			panic("softdep_fsync: Unexpected type %s",
12263			    TYPENAME(wk->wk_type));
12264		dap = WK_DIRADD(wk);
12265		/*
12266		 * Flush our parent if this directory entry has a MKDIR_PARENT
12267		 * dependency or is contained in a newly allocated block.
12268		 */
12269		if (dap->da_state & DIRCHG)
12270			pagedep = dap->da_previous->dm_pagedep;
12271		else
12272			pagedep = dap->da_pagedep;
12273		parentino = pagedep->pd_ino;
12274		lbn = pagedep->pd_lbn;
12275		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12276			panic("softdep_fsync: dirty");
12277		if ((dap->da_state & MKDIR_PARENT) ||
12278		    (pagedep->pd_state & NEWBLOCK))
12279			flushparent = 1;
12280		else
12281			flushparent = 0;
12282		/*
12283		 * If we are being fsync'ed as part of vgone'ing this vnode,
12284		 * then we will not be able to release and recover the
12285		 * vnode below, so we just have to give up on writing its
12286		 * directory entry out. It will eventually be written, just
12287		 * not now, but then the user was not asking to have it
12288		 * written, so we are not breaking any promises.
12289		 */
12290		if (vp->v_iflag & VI_DOOMED)
12291			break;
12292		/*
12293		 * We prevent deadlock by always fetching inodes from the
12294		 * root, moving down the directory tree. Thus, when fetching
12295		 * our parent directory, we first try to get the lock. If
12296		 * that fails, we must unlock ourselves before requesting
12297		 * the lock on our parent. See the comment in ufs_lookup
12298		 * for details on possible races.
12299		 */
12300		FREE_LOCK(ump);
12301		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12302		    FFSV_FORCEINSMQ)) {
12303			error = vfs_busy(mp, MBF_NOWAIT);
12304			if (error != 0) {
12305				vfs_ref(mp);
12306				VOP_UNLOCK(vp, 0);
12307				error = vfs_busy(mp, 0);
12308				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12309				vfs_rel(mp);
12310				if (error != 0)
12311					return (ENOENT);
12312				if (vp->v_iflag & VI_DOOMED) {
12313					vfs_unbusy(mp);
12314					return (ENOENT);
12315				}
12316			}
12317			VOP_UNLOCK(vp, 0);
12318			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12319			    &pvp, FFSV_FORCEINSMQ);
12320			vfs_unbusy(mp);
12321			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12322			if (vp->v_iflag & VI_DOOMED) {
12323				if (error == 0)
12324					vput(pvp);
12325				error = ENOENT;
12326			}
12327			if (error != 0)
12328				return (error);
12329		}
12330		/*
12331		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12332		 * that are contained in direct blocks will be resolved by
12333		 * doing a ffs_update. Pagedeps contained in indirect blocks
12334		 * may require a complete sync'ing of the directory. So, we
12335		 * try the cheap and fast ffs_update first, and if that fails,
12336		 * then we do the slower ffs_syncvnode of the directory.
12337		 */
12338		if (flushparent) {
12339			int locked;
12340
12341			if ((error = ffs_update(pvp, 1)) != 0) {
12342				vput(pvp);
12343				return (error);
12344			}
12345			ACQUIRE_LOCK(ump);
12346			locked = 1;
12347			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12348				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12349					if (wk->wk_type != D_DIRADD)
12350						panic("softdep_fsync: Unexpected type %s",
12351						      TYPENAME(wk->wk_type));
12352					dap = WK_DIRADD(wk);
12353					if (dap->da_state & DIRCHG)
12354						pagedep = dap->da_previous->dm_pagedep;
12355					else
12356						pagedep = dap->da_pagedep;
12357					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12358					FREE_LOCK(ump);
12359					locked = 0;
12360					if (pagedep_new_block && (error =
12361					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12362						vput(pvp);
12363						return (error);
12364					}
12365				}
12366			}
12367			if (locked)
12368				FREE_LOCK(ump);
12369		}
12370		/*
12371		 * Flush directory page containing the inode's name.
12372		 */
12373		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12374		    &bp);
12375		if (error == 0)
12376			error = bwrite(bp);
12377		else
12378			brelse(bp);
12379		vput(pvp);
12380		if (error != 0)
12381			return (error);
12382		ACQUIRE_LOCK(ump);
12383		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12384			break;
12385	}
12386	FREE_LOCK(ump);
12387	return (0);
12388}
12389
12390/*
12391 * Flush all the dirty bitmaps associated with the block device
12392 * before flushing the rest of the dirty blocks so as to reduce
12393 * the number of dependencies that will have to be rolled back.
12394 *
12395 * XXX Unused?
12396 */
12397void
12398softdep_fsync_mountdev(vp)
12399	struct vnode *vp;
12400{
12401	struct buf *bp, *nbp;
12402	struct worklist *wk;
12403	struct bufobj *bo;
12404
12405	if (!vn_isdisk(vp, NULL))
12406		panic("softdep_fsync_mountdev: vnode not a disk");
12407	bo = &vp->v_bufobj;
12408restart:
12409	BO_LOCK(bo);
12410	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12411		/*
12412		 * If it is already scheduled, skip to the next buffer.
12413		 */
12414		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12415			continue;
12416
12417		if ((bp->b_flags & B_DELWRI) == 0)
12418			panic("softdep_fsync_mountdev: not dirty");
12419		/*
12420		 * We are only interested in bitmaps with outstanding
12421		 * dependencies.
12422		 */
12423		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12424		    wk->wk_type != D_BMSAFEMAP ||
12425		    (bp->b_vflags & BV_BKGRDINPROG)) {
12426			BUF_UNLOCK(bp);
12427			continue;
12428		}
12429		BO_UNLOCK(bo);
12430		bremfree(bp);
12431		(void) bawrite(bp);
12432		goto restart;
12433	}
12434	drain_output(vp);
12435	BO_UNLOCK(bo);
12436}
12437
12438/*
12439 * Sync all cylinder groups that were dirty at the time this function is
12440 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12441 * is used to flush freedep activity that may be holding up writes to a
12442 * indirect block.
12443 */
12444static int
12445sync_cgs(mp, waitfor)
12446	struct mount *mp;
12447	int waitfor;
12448{
12449	struct bmsafemap *bmsafemap;
12450	struct bmsafemap *sentinel;
12451	struct ufsmount *ump;
12452	struct buf *bp;
12453	int error;
12454
12455	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12456	sentinel->sm_cg = -1;
12457	ump = VFSTOUFS(mp);
12458	error = 0;
12459	ACQUIRE_LOCK(ump);
12460	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12461	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12462	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12463		/* Skip sentinels and cgs with no work to release. */
12464		if (bmsafemap->sm_cg == -1 ||
12465		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12466		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12467			LIST_REMOVE(sentinel, sm_next);
12468			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12469			continue;
12470		}
12471		/*
12472		 * If we don't get the lock and we're waiting try again, if
12473		 * not move on to the next buf and try to sync it.
12474		 */
12475		bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12476		if (bp == NULL && waitfor == MNT_WAIT)
12477			continue;
12478		LIST_REMOVE(sentinel, sm_next);
12479		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12480		if (bp == NULL)
12481			continue;
12482		FREE_LOCK(ump);
12483		if (waitfor == MNT_NOWAIT)
12484			bawrite(bp);
12485		else
12486			error = bwrite(bp);
12487		ACQUIRE_LOCK(ump);
12488		if (error)
12489			break;
12490	}
12491	LIST_REMOVE(sentinel, sm_next);
12492	FREE_LOCK(ump);
12493	free(sentinel, M_BMSAFEMAP);
12494	return (error);
12495}
12496
12497/*
12498 * This routine is called when we are trying to synchronously flush a
12499 * file. This routine must eliminate any filesystem metadata dependencies
12500 * so that the syncing routine can succeed.
12501 */
12502int
12503softdep_sync_metadata(struct vnode *vp)
12504{
12505	struct inode *ip;
12506	int error;
12507
12508	ip = VTOI(vp);
12509	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
12510	    ("softdep_sync_metadata called on non-softdep filesystem"));
12511	/*
12512	 * Ensure that any direct block dependencies have been cleared,
12513	 * truncations are started, and inode references are journaled.
12514	 */
12515	ACQUIRE_LOCK(ip->i_ump);
12516	/*
12517	 * Write all journal records to prevent rollbacks on devvp.
12518	 */
12519	if (vp->v_type == VCHR)
12520		softdep_flushjournal(vp->v_mount);
12521	error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12522	/*
12523	 * Ensure that all truncates are written so we won't find deps on
12524	 * indirect blocks.
12525	 */
12526	process_truncates(vp);
12527	FREE_LOCK(ip->i_ump);
12528
12529	return (error);
12530}
12531
12532/*
12533 * This routine is called when we are attempting to sync a buf with
12534 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12535 * other IO it can but returns EBUSY if the buffer is not yet able to
12536 * be written.  Dependencies which will not cause rollbacks will always
12537 * return 0.
12538 */
12539int
12540softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12541{
12542	struct indirdep *indirdep;
12543	struct pagedep *pagedep;
12544	struct allocindir *aip;
12545	struct newblk *newblk;
12546	struct ufsmount *ump;
12547	struct buf *nbp;
12548	struct worklist *wk;
12549	int i, error;
12550
12551	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12552	    ("softdep_sync_buf called on non-softdep filesystem"));
12553	/*
12554	 * For VCHR we just don't want to force flush any dependencies that
12555	 * will cause rollbacks.
12556	 */
12557	if (vp->v_type == VCHR) {
12558		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12559			return (EBUSY);
12560		return (0);
12561	}
12562	ump = VTOI(vp)->i_ump;
12563	ACQUIRE_LOCK(ump);
12564	/*
12565	 * As we hold the buffer locked, none of its dependencies
12566	 * will disappear.
12567	 */
12568	error = 0;
12569top:
12570	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12571		switch (wk->wk_type) {
12572
12573		case D_ALLOCDIRECT:
12574		case D_ALLOCINDIR:
12575			newblk = WK_NEWBLK(wk);
12576			if (newblk->nb_jnewblk != NULL) {
12577				if (waitfor == MNT_NOWAIT) {
12578					error = EBUSY;
12579					goto out_unlock;
12580				}
12581				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12582				goto top;
12583			}
12584			if (newblk->nb_state & DEPCOMPLETE ||
12585			    waitfor == MNT_NOWAIT)
12586				continue;
12587			nbp = newblk->nb_bmsafemap->sm_buf;
12588			nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12589			if (nbp == NULL)
12590				goto top;
12591			FREE_LOCK(ump);
12592			if ((error = bwrite(nbp)) != 0)
12593				goto out;
12594			ACQUIRE_LOCK(ump);
12595			continue;
12596
12597		case D_INDIRDEP:
12598			indirdep = WK_INDIRDEP(wk);
12599			if (waitfor == MNT_NOWAIT) {
12600				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12601				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12602					error = EBUSY;
12603					goto out_unlock;
12604				}
12605			}
12606			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12607				panic("softdep_sync_buf: truncation pending.");
12608		restart:
12609			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12610				newblk = (struct newblk *)aip;
12611				if (newblk->nb_jnewblk != NULL) {
12612					jwait(&newblk->nb_jnewblk->jn_list,
12613					    waitfor);
12614					goto restart;
12615				}
12616				if (newblk->nb_state & DEPCOMPLETE)
12617					continue;
12618				nbp = newblk->nb_bmsafemap->sm_buf;
12619				nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12620				if (nbp == NULL)
12621					goto restart;
12622				FREE_LOCK(ump);
12623				if ((error = bwrite(nbp)) != 0)
12624					goto out;
12625				ACQUIRE_LOCK(ump);
12626				goto restart;
12627			}
12628			continue;
12629
12630		case D_PAGEDEP:
12631			/*
12632			 * Only flush directory entries in synchronous passes.
12633			 */
12634			if (waitfor != MNT_WAIT) {
12635				error = EBUSY;
12636				goto out_unlock;
12637			}
12638			/*
12639			 * While syncing snapshots, we must allow recursive
12640			 * lookups.
12641			 */
12642			BUF_AREC(bp);
12643			/*
12644			 * We are trying to sync a directory that may
12645			 * have dependencies on both its own metadata
12646			 * and/or dependencies on the inodes of any
12647			 * recently allocated files. We walk its diradd
12648			 * lists pushing out the associated inode.
12649			 */
12650			pagedep = WK_PAGEDEP(wk);
12651			for (i = 0; i < DAHASHSZ; i++) {
12652				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12653					continue;
12654				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12655				    &pagedep->pd_diraddhd[i]))) {
12656					BUF_NOREC(bp);
12657					goto out_unlock;
12658				}
12659			}
12660			BUF_NOREC(bp);
12661			continue;
12662
12663		case D_FREEWORK:
12664		case D_FREEDEP:
12665		case D_JSEGDEP:
12666		case D_JNEWBLK:
12667			continue;
12668
12669		default:
12670			panic("softdep_sync_buf: Unknown type %s",
12671			    TYPENAME(wk->wk_type));
12672			/* NOTREACHED */
12673		}
12674	}
12675out_unlock:
12676	FREE_LOCK(ump);
12677out:
12678	return (error);
12679}
12680
12681/*
12682 * Flush the dependencies associated with an inodedep.
12683 * Called with splbio blocked.
12684 */
12685static int
12686flush_inodedep_deps(vp, mp, ino)
12687	struct vnode *vp;
12688	struct mount *mp;
12689	ino_t ino;
12690{
12691	struct inodedep *inodedep;
12692	struct inoref *inoref;
12693	struct ufsmount *ump;
12694	int error, waitfor;
12695
12696	/*
12697	 * This work is done in two passes. The first pass grabs most
12698	 * of the buffers and begins asynchronously writing them. The
12699	 * only way to wait for these asynchronous writes is to sleep
12700	 * on the filesystem vnode which may stay busy for a long time
12701	 * if the filesystem is active. So, instead, we make a second
12702	 * pass over the dependencies blocking on each write. In the
12703	 * usual case we will be blocking against a write that we
12704	 * initiated, so when it is done the dependency will have been
12705	 * resolved. Thus the second pass is expected to end quickly.
12706	 * We give a brief window at the top of the loop to allow
12707	 * any pending I/O to complete.
12708	 */
12709	ump = VFSTOUFS(mp);
12710	LOCK_OWNED(ump);
12711	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12712		if (error)
12713			return (error);
12714		FREE_LOCK(ump);
12715		ACQUIRE_LOCK(ump);
12716restart:
12717		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12718			return (0);
12719		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12720			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12721			    == DEPCOMPLETE) {
12722				jwait(&inoref->if_list, MNT_WAIT);
12723				goto restart;
12724			}
12725		}
12726		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12727		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12728		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12729		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12730			continue;
12731		/*
12732		 * If pass2, we are done, otherwise do pass 2.
12733		 */
12734		if (waitfor == MNT_WAIT)
12735			break;
12736		waitfor = MNT_WAIT;
12737	}
12738	/*
12739	 * Try freeing inodedep in case all dependencies have been removed.
12740	 */
12741	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12742		(void) free_inodedep(inodedep);
12743	return (0);
12744}
12745
12746/*
12747 * Flush an inode dependency list.
12748 * Called with splbio blocked.
12749 */
12750static int
12751flush_deplist(listhead, waitfor, errorp)
12752	struct allocdirectlst *listhead;
12753	int waitfor;
12754	int *errorp;
12755{
12756	struct allocdirect *adp;
12757	struct newblk *newblk;
12758	struct ufsmount *ump;
12759	struct buf *bp;
12760
12761	if ((adp = TAILQ_FIRST(listhead)) == NULL)
12762		return (0);
12763	ump = VFSTOUFS(adp->ad_list.wk_mp);
12764	LOCK_OWNED(ump);
12765	TAILQ_FOREACH(adp, listhead, ad_next) {
12766		newblk = (struct newblk *)adp;
12767		if (newblk->nb_jnewblk != NULL) {
12768			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12769			return (1);
12770		}
12771		if (newblk->nb_state & DEPCOMPLETE)
12772			continue;
12773		bp = newblk->nb_bmsafemap->sm_buf;
12774		bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12775		if (bp == NULL) {
12776			if (waitfor == MNT_NOWAIT)
12777				continue;
12778			return (1);
12779		}
12780		FREE_LOCK(ump);
12781		if (waitfor == MNT_NOWAIT)
12782			bawrite(bp);
12783		else
12784			*errorp = bwrite(bp);
12785		ACQUIRE_LOCK(ump);
12786		return (1);
12787	}
12788	return (0);
12789}
12790
12791/*
12792 * Flush dependencies associated with an allocdirect block.
12793 */
12794static int
12795flush_newblk_dep(vp, mp, lbn)
12796	struct vnode *vp;
12797	struct mount *mp;
12798	ufs_lbn_t lbn;
12799{
12800	struct newblk *newblk;
12801	struct ufsmount *ump;
12802	struct bufobj *bo;
12803	struct inode *ip;
12804	struct buf *bp;
12805	ufs2_daddr_t blkno;
12806	int error;
12807
12808	error = 0;
12809	bo = &vp->v_bufobj;
12810	ip = VTOI(vp);
12811	blkno = DIP(ip, i_db[lbn]);
12812	if (blkno == 0)
12813		panic("flush_newblk_dep: Missing block");
12814	ump = VFSTOUFS(mp);
12815	ACQUIRE_LOCK(ump);
12816	/*
12817	 * Loop until all dependencies related to this block are satisfied.
12818	 * We must be careful to restart after each sleep in case a write
12819	 * completes some part of this process for us.
12820	 */
12821	for (;;) {
12822		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12823			FREE_LOCK(ump);
12824			break;
12825		}
12826		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12827			panic("flush_newblk_deps: Bad newblk %p", newblk);
12828		/*
12829		 * Flush the journal.
12830		 */
12831		if (newblk->nb_jnewblk != NULL) {
12832			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12833			continue;
12834		}
12835		/*
12836		 * Write the bitmap dependency.
12837		 */
12838		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12839			bp = newblk->nb_bmsafemap->sm_buf;
12840			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12841			if (bp == NULL)
12842				continue;
12843			FREE_LOCK(ump);
12844			error = bwrite(bp);
12845			if (error)
12846				break;
12847			ACQUIRE_LOCK(ump);
12848			continue;
12849		}
12850		/*
12851		 * Write the buffer.
12852		 */
12853		FREE_LOCK(ump);
12854		BO_LOCK(bo);
12855		bp = gbincore(bo, lbn);
12856		if (bp != NULL) {
12857			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12858			    LK_INTERLOCK, BO_LOCKPTR(bo));
12859			if (error == ENOLCK) {
12860				ACQUIRE_LOCK(ump);
12861				continue; /* Slept, retry */
12862			}
12863			if (error != 0)
12864				break;	/* Failed */
12865			if (bp->b_flags & B_DELWRI) {
12866				bremfree(bp);
12867				error = bwrite(bp);
12868				if (error)
12869					break;
12870			} else
12871				BUF_UNLOCK(bp);
12872		} else
12873			BO_UNLOCK(bo);
12874		/*
12875		 * We have to wait for the direct pointers to
12876		 * point at the newdirblk before the dependency
12877		 * will go away.
12878		 */
12879		error = ffs_update(vp, 1);
12880		if (error)
12881			break;
12882		ACQUIRE_LOCK(ump);
12883	}
12884	return (error);
12885}
12886
12887/*
12888 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12889 * Called with splbio blocked.
12890 */
12891static int
12892flush_pagedep_deps(pvp, mp, diraddhdp)
12893	struct vnode *pvp;
12894	struct mount *mp;
12895	struct diraddhd *diraddhdp;
12896{
12897	struct inodedep *inodedep;
12898	struct inoref *inoref;
12899	struct ufsmount *ump;
12900	struct diradd *dap;
12901	struct vnode *vp;
12902	int error = 0;
12903	struct buf *bp;
12904	ino_t inum;
12905	struct diraddhd unfinished;
12906
12907	LIST_INIT(&unfinished);
12908	ump = VFSTOUFS(mp);
12909	LOCK_OWNED(ump);
12910restart:
12911	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12912		/*
12913		 * Flush ourselves if this directory entry
12914		 * has a MKDIR_PARENT dependency.
12915		 */
12916		if (dap->da_state & MKDIR_PARENT) {
12917			FREE_LOCK(ump);
12918			if ((error = ffs_update(pvp, 1)) != 0)
12919				break;
12920			ACQUIRE_LOCK(ump);
12921			/*
12922			 * If that cleared dependencies, go on to next.
12923			 */
12924			if (dap != LIST_FIRST(diraddhdp))
12925				continue;
12926			/*
12927			 * All MKDIR_PARENT dependencies and all the
12928			 * NEWBLOCK pagedeps that are contained in direct
12929			 * blocks were resolved by doing above ffs_update.
12930			 * Pagedeps contained in indirect blocks may
12931			 * require a complete sync'ing of the directory.
12932			 * We are in the midst of doing a complete sync,
12933			 * so if they are not resolved in this pass we
12934			 * defer them for now as they will be sync'ed by
12935			 * our caller shortly.
12936			 */
12937			LIST_REMOVE(dap, da_pdlist);
12938			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
12939			continue;
12940		}
12941		/*
12942		 * A newly allocated directory must have its "." and
12943		 * ".." entries written out before its name can be
12944		 * committed in its parent.
12945		 */
12946		inum = dap->da_newinum;
12947		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12948			panic("flush_pagedep_deps: lost inode1");
12949		/*
12950		 * Wait for any pending journal adds to complete so we don't
12951		 * cause rollbacks while syncing.
12952		 */
12953		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12954			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12955			    == DEPCOMPLETE) {
12956				jwait(&inoref->if_list, MNT_WAIT);
12957				goto restart;
12958			}
12959		}
12960		if (dap->da_state & MKDIR_BODY) {
12961			FREE_LOCK(ump);
12962			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12963			    FFSV_FORCEINSMQ)))
12964				break;
12965			error = flush_newblk_dep(vp, mp, 0);
12966			/*
12967			 * If we still have the dependency we might need to
12968			 * update the vnode to sync the new link count to
12969			 * disk.
12970			 */
12971			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12972				error = ffs_update(vp, 1);
12973			vput(vp);
12974			if (error != 0)
12975				break;
12976			ACQUIRE_LOCK(ump);
12977			/*
12978			 * If that cleared dependencies, go on to next.
12979			 */
12980			if (dap != LIST_FIRST(diraddhdp))
12981				continue;
12982			if (dap->da_state & MKDIR_BODY) {
12983				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12984				    &inodedep);
12985				panic("flush_pagedep_deps: MKDIR_BODY "
12986				    "inodedep %p dap %p vp %p",
12987				    inodedep, dap, vp);
12988			}
12989		}
12990		/*
12991		 * Flush the inode on which the directory entry depends.
12992		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12993		 * the only remaining dependency is that the updated inode
12994		 * count must get pushed to disk. The inode has already
12995		 * been pushed into its inode buffer (via VOP_UPDATE) at
12996		 * the time of the reference count change. So we need only
12997		 * locate that buffer, ensure that there will be no rollback
12998		 * caused by a bitmap dependency, then write the inode buffer.
12999		 */
13000retry:
13001		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13002			panic("flush_pagedep_deps: lost inode");
13003		/*
13004		 * If the inode still has bitmap dependencies,
13005		 * push them to disk.
13006		 */
13007		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
13008			bp = inodedep->id_bmsafemap->sm_buf;
13009			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13010			if (bp == NULL)
13011				goto retry;
13012			FREE_LOCK(ump);
13013			if ((error = bwrite(bp)) != 0)
13014				break;
13015			ACQUIRE_LOCK(ump);
13016			if (dap != LIST_FIRST(diraddhdp))
13017				continue;
13018		}
13019		/*
13020		 * If the inode is still sitting in a buffer waiting
13021		 * to be written or waiting for the link count to be
13022		 * adjusted update it here to flush it to disk.
13023		 */
13024		if (dap == LIST_FIRST(diraddhdp)) {
13025			FREE_LOCK(ump);
13026			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13027			    FFSV_FORCEINSMQ)))
13028				break;
13029			error = ffs_update(vp, 1);
13030			vput(vp);
13031			if (error)
13032				break;
13033			ACQUIRE_LOCK(ump);
13034		}
13035		/*
13036		 * If we have failed to get rid of all the dependencies
13037		 * then something is seriously wrong.
13038		 */
13039		if (dap == LIST_FIRST(diraddhdp)) {
13040			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13041			panic("flush_pagedep_deps: failed to flush "
13042			    "inodedep %p ino %ju dap %p",
13043			    inodedep, (uintmax_t)inum, dap);
13044		}
13045	}
13046	if (error)
13047		ACQUIRE_LOCK(ump);
13048	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13049		LIST_REMOVE(dap, da_pdlist);
13050		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13051	}
13052	return (error);
13053}
13054
13055/*
13056 * A large burst of file addition or deletion activity can drive the
13057 * memory load excessively high. First attempt to slow things down
13058 * using the techniques below. If that fails, this routine requests
13059 * the offending operations to fall back to running synchronously
13060 * until the memory load returns to a reasonable level.
13061 */
13062int
13063softdep_slowdown(vp)
13064	struct vnode *vp;
13065{
13066	struct ufsmount *ump;
13067	int jlow;
13068	int max_softdeps_hard;
13069
13070	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13071	    ("softdep_slowdown called on non-softdep filesystem"));
13072	ump = VFSTOUFS(vp->v_mount);
13073	ACQUIRE_LOCK(ump);
13074	jlow = 0;
13075	/*
13076	 * Check for journal space if needed.
13077	 */
13078	if (DOINGSUJ(vp)) {
13079		if (journal_space(ump, 0) == 0)
13080			jlow = 1;
13081	}
13082	/*
13083	 * If the system is under its limits and our filesystem is
13084	 * not responsible for more than our share of the usage and
13085	 * we are not low on journal space, then no need to slow down.
13086	 */
13087	max_softdeps_hard = max_softdeps * 11 / 10;
13088	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13089	    dep_current[D_INODEDEP] < max_softdeps_hard &&
13090	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13091	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13092	    ump->softdep_curdeps[D_DIRREM] <
13093	    (max_softdeps_hard / 2) / stat_flush_threads &&
13094	    ump->softdep_curdeps[D_INODEDEP] <
13095	    max_softdeps_hard / stat_flush_threads &&
13096	    ump->softdep_curdeps[D_INDIRDEP] <
13097	    (max_softdeps_hard / 1000) / stat_flush_threads &&
13098	    ump->softdep_curdeps[D_FREEBLKS] <
13099	    max_softdeps_hard / stat_flush_threads) {
13100		FREE_LOCK(ump);
13101  		return (0);
13102	}
13103	/*
13104	 * If the journal is low or our filesystem is over its limit
13105	 * then speedup the cleanup.
13106	 */
13107	if (ump->softdep_curdeps[D_INDIRDEP] <
13108	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13109		softdep_speedup(ump);
13110	stat_sync_limit_hit += 1;
13111	FREE_LOCK(ump);
13112	/*
13113	 * We only slow down the rate at which new dependencies are
13114	 * generated if we are not using journaling. With journaling,
13115	 * the cleanup should always be sufficient to keep things
13116	 * under control.
13117	 */
13118	if (DOINGSUJ(vp))
13119		return (0);
13120	return (1);
13121}
13122
13123/*
13124 * Called by the allocation routines when they are about to fail
13125 * in the hope that we can free up the requested resource (inodes
13126 * or disk space).
13127 *
13128 * First check to see if the work list has anything on it. If it has,
13129 * clean up entries until we successfully free the requested resource.
13130 * Because this process holds inodes locked, we cannot handle any remove
13131 * requests that might block on a locked inode as that could lead to
13132 * deadlock. If the worklist yields none of the requested resource,
13133 * start syncing out vnodes to free up the needed space.
13134 */
13135int
13136softdep_request_cleanup(fs, vp, cred, resource)
13137	struct fs *fs;
13138	struct vnode *vp;
13139	struct ucred *cred;
13140	int resource;
13141{
13142	struct ufsmount *ump;
13143	struct mount *mp;
13144	struct vnode *lvp, *mvp;
13145	long starttime;
13146	ufs2_daddr_t needed;
13147	int error;
13148
13149	/*
13150	 * If we are being called because of a process doing a
13151	 * copy-on-write, then it is not safe to process any
13152	 * worklist items as we will recurse into the copyonwrite
13153	 * routine.  This will result in an incoherent snapshot.
13154	 * If the vnode that we hold is a snapshot, we must avoid
13155	 * handling other resources that could cause deadlock.
13156	 */
13157	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13158		return (0);
13159
13160	if (resource == FLUSH_BLOCKS_WAIT)
13161		stat_cleanup_blkrequests += 1;
13162	else
13163		stat_cleanup_inorequests += 1;
13164
13165	mp = vp->v_mount;
13166	ump = VFSTOUFS(mp);
13167	mtx_assert(UFS_MTX(ump), MA_OWNED);
13168	UFS_UNLOCK(ump);
13169	error = ffs_update(vp, 1);
13170	if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13171		UFS_LOCK(ump);
13172		return (0);
13173	}
13174	/*
13175	 * If we are in need of resources, start by cleaning up
13176	 * any block removals associated with our inode.
13177	 */
13178	ACQUIRE_LOCK(ump);
13179	process_removes(vp);
13180	process_truncates(vp);
13181	FREE_LOCK(ump);
13182	/*
13183	 * Now clean up at least as many resources as we will need.
13184	 *
13185	 * When requested to clean up inodes, the number that are needed
13186	 * is set by the number of simultaneous writers (mnt_writeopcount)
13187	 * plus a bit of slop (2) in case some more writers show up while
13188	 * we are cleaning.
13189	 *
13190	 * When requested to free up space, the amount of space that
13191	 * we need is enough blocks to allocate a full-sized segment
13192	 * (fs_contigsumsize). The number of such segments that will
13193	 * be needed is set by the number of simultaneous writers
13194	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
13195	 * writers show up while we are cleaning.
13196	 *
13197	 * Additionally, if we are unpriviledged and allocating space,
13198	 * we need to ensure that we clean up enough blocks to get the
13199	 * needed number of blocks over the threshhold of the minimum
13200	 * number of blocks required to be kept free by the filesystem
13201	 * (fs_minfree).
13202	 */
13203	if (resource == FLUSH_INODES_WAIT) {
13204		needed = vp->v_mount->mnt_writeopcount + 2;
13205	} else if (resource == FLUSH_BLOCKS_WAIT) {
13206		needed = (vp->v_mount->mnt_writeopcount + 2) *
13207		    fs->fs_contigsumsize;
13208		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
13209			needed += fragstoblks(fs,
13210			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13211			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
13212	} else {
13213		UFS_LOCK(ump);
13214		printf("softdep_request_cleanup: Unknown resource type %d\n",
13215		    resource);
13216		return (0);
13217	}
13218	starttime = time_second;
13219retry:
13220	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13221	    fs->fs_cstotal.cs_nbfree <= needed) ||
13222	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13223	    fs->fs_cstotal.cs_nifree <= needed)) {
13224		ACQUIRE_LOCK(ump);
13225		if (ump->softdep_on_worklist > 0 &&
13226		    process_worklist_item(UFSTOVFS(ump),
13227		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
13228			stat_worklist_push += 1;
13229		FREE_LOCK(ump);
13230	}
13231	/*
13232	 * If we still need resources and there are no more worklist
13233	 * entries to process to obtain them, we have to start flushing
13234	 * the dirty vnodes to force the release of additional requests
13235	 * to the worklist that we can then process to reap addition
13236	 * resources. We walk the vnodes associated with the mount point
13237	 * until we get the needed worklist requests that we can reap.
13238	 */
13239	if ((resource == FLUSH_BLOCKS_WAIT &&
13240	     fs->fs_cstotal.cs_nbfree <= needed) ||
13241	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13242	     fs->fs_cstotal.cs_nifree <= needed)) {
13243		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13244			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13245				VI_UNLOCK(lvp);
13246				continue;
13247			}
13248			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13249			    curthread))
13250				continue;
13251			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
13252				vput(lvp);
13253				continue;
13254			}
13255			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13256			vput(lvp);
13257		}
13258		lvp = ump->um_devvp;
13259		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13260			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
13261			VOP_UNLOCK(lvp, 0);
13262		}
13263		if (ump->softdep_on_worklist > 0) {
13264			stat_cleanup_retries += 1;
13265			goto retry;
13266		}
13267		stat_cleanup_failures += 1;
13268	}
13269	if (time_second - starttime > stat_cleanup_high_delay)
13270		stat_cleanup_high_delay = time_second - starttime;
13271	UFS_LOCK(ump);
13272	return (1);
13273}
13274
13275static bool
13276softdep_excess_inodes(struct ufsmount *ump)
13277{
13278
13279	return (dep_current[D_INODEDEP] > max_softdeps &&
13280	    ump->softdep_curdeps[D_INODEDEP] > max_softdeps /
13281	    stat_flush_threads);
13282}
13283
13284static bool
13285softdep_excess_dirrem(struct ufsmount *ump)
13286{
13287
13288	return (dep_current[D_DIRREM] > max_softdeps / 2 &&
13289	    ump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) /
13290	    stat_flush_threads);
13291}
13292
13293static void
13294schedule_cleanup(struct mount *mp)
13295{
13296	struct ufsmount *ump;
13297	struct thread *td;
13298
13299	ump = VFSTOUFS(mp);
13300	LOCK_OWNED(ump);
13301	FREE_LOCK(ump);
13302	td = curthread;
13303	if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13304	    (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13305		/*
13306		 * No ast is delivered to kernel threads, so nobody
13307		 * would deref the mp.  Some kernel threads
13308		 * explicitely check for AST, e.g. NFS daemon does
13309		 * this in the serving loop.
13310		 */
13311		return;
13312	}
13313	if (td->td_su != NULL)
13314		vfs_rel(td->td_su);
13315	vfs_ref(mp);
13316	td->td_su = mp;
13317	thread_lock(td);
13318	td->td_flags |= TDF_ASTPENDING;
13319	thread_unlock(td);
13320}
13321
13322static void
13323softdep_ast_cleanup_proc(void)
13324{
13325	struct thread *td;
13326	struct mount *mp;
13327	struct ufsmount *ump;
13328	int error;
13329	bool req;
13330
13331	td = curthread;
13332	mp = td->td_su;
13333	if (mp == NULL)
13334		return;
13335	td->td_su = NULL;
13336	error = vfs_busy(mp, MBF_NOWAIT);
13337	vfs_rel(mp);
13338	if (error != 0)
13339		return;
13340	if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13341		ump = VFSTOUFS(mp);
13342		for (;;) {
13343			req = false;
13344			ACQUIRE_LOCK(ump);
13345			if (softdep_excess_inodes(ump)) {
13346				req = true;
13347				request_cleanup(mp, FLUSH_INODES);
13348			}
13349			if (softdep_excess_dirrem(ump)) {
13350				req = true;
13351				request_cleanup(mp, FLUSH_BLOCKS);
13352			}
13353			FREE_LOCK(ump);
13354			if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13355				break;
13356		}
13357	}
13358	vfs_unbusy(mp);
13359}
13360
13361/*
13362 * If memory utilization has gotten too high, deliberately slow things
13363 * down and speed up the I/O processing.
13364 */
13365static int
13366request_cleanup(mp, resource)
13367	struct mount *mp;
13368	int resource;
13369{
13370	struct thread *td = curthread;
13371	struct ufsmount *ump;
13372
13373	ump = VFSTOUFS(mp);
13374	LOCK_OWNED(ump);
13375	/*
13376	 * We never hold up the filesystem syncer or buf daemon.
13377	 */
13378	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13379		return (0);
13380	/*
13381	 * First check to see if the work list has gotten backlogged.
13382	 * If it has, co-opt this process to help clean up two entries.
13383	 * Because this process may hold inodes locked, we cannot
13384	 * handle any remove requests that might block on a locked
13385	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13386	 * to avoid recursively processing the worklist.
13387	 */
13388	if (ump->softdep_on_worklist > max_softdeps / 10) {
13389		td->td_pflags |= TDP_SOFTDEP;
13390		process_worklist_item(mp, 2, LK_NOWAIT);
13391		td->td_pflags &= ~TDP_SOFTDEP;
13392		stat_worklist_push += 2;
13393		return(1);
13394	}
13395	/*
13396	 * Next, we attempt to speed up the syncer process. If that
13397	 * is successful, then we allow the process to continue.
13398	 */
13399	if (softdep_speedup(ump) &&
13400	    resource != FLUSH_BLOCKS_WAIT &&
13401	    resource != FLUSH_INODES_WAIT)
13402		return(0);
13403	/*
13404	 * If we are resource constrained on inode dependencies, try
13405	 * flushing some dirty inodes. Otherwise, we are constrained
13406	 * by file deletions, so try accelerating flushes of directories
13407	 * with removal dependencies. We would like to do the cleanup
13408	 * here, but we probably hold an inode locked at this point and
13409	 * that might deadlock against one that we try to clean. So,
13410	 * the best that we can do is request the syncer daemon to do
13411	 * the cleanup for us.
13412	 */
13413	switch (resource) {
13414
13415	case FLUSH_INODES:
13416	case FLUSH_INODES_WAIT:
13417		ACQUIRE_GBLLOCK(&lk);
13418		stat_ino_limit_push += 1;
13419		req_clear_inodedeps += 1;
13420		FREE_GBLLOCK(&lk);
13421		stat_countp = &stat_ino_limit_hit;
13422		break;
13423
13424	case FLUSH_BLOCKS:
13425	case FLUSH_BLOCKS_WAIT:
13426		ACQUIRE_GBLLOCK(&lk);
13427		stat_blk_limit_push += 1;
13428		req_clear_remove += 1;
13429		FREE_GBLLOCK(&lk);
13430		stat_countp = &stat_blk_limit_hit;
13431		break;
13432
13433	default:
13434		panic("request_cleanup: unknown type");
13435	}
13436	/*
13437	 * Hopefully the syncer daemon will catch up and awaken us.
13438	 * We wait at most tickdelay before proceeding in any case.
13439	 */
13440	ACQUIRE_GBLLOCK(&lk);
13441	FREE_LOCK(ump);
13442	proc_waiting += 1;
13443	if (callout_pending(&softdep_callout) == FALSE)
13444		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13445		    pause_timer, 0);
13446
13447	if ((td->td_pflags & TDP_KTHREAD) == 0)
13448		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13449	proc_waiting -= 1;
13450	FREE_GBLLOCK(&lk);
13451	ACQUIRE_LOCK(ump);
13452	return (1);
13453}
13454
13455/*
13456 * Awaken processes pausing in request_cleanup and clear proc_waiting
13457 * to indicate that there is no longer a timer running. Pause_timer
13458 * will be called with the global softdep mutex (&lk) locked.
13459 */
13460static void
13461pause_timer(arg)
13462	void *arg;
13463{
13464
13465	GBLLOCK_OWNED(&lk);
13466	/*
13467	 * The callout_ API has acquired mtx and will hold it around this
13468	 * function call.
13469	 */
13470	*stat_countp += proc_waiting;
13471	wakeup(&proc_waiting);
13472}
13473
13474/*
13475 * If requested, try removing inode or removal dependencies.
13476 */
13477static void
13478check_clear_deps(mp)
13479	struct mount *mp;
13480{
13481
13482	/*
13483	 * If we are suspended, it may be because of our using
13484	 * too many inodedeps, so help clear them out.
13485	 */
13486	if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13487		clear_inodedeps(mp);
13488	/*
13489	 * General requests for cleanup of backed up dependencies
13490	 */
13491	ACQUIRE_GBLLOCK(&lk);
13492	if (req_clear_inodedeps) {
13493		req_clear_inodedeps -= 1;
13494		FREE_GBLLOCK(&lk);
13495		clear_inodedeps(mp);
13496		ACQUIRE_GBLLOCK(&lk);
13497		wakeup(&proc_waiting);
13498	}
13499	if (req_clear_remove) {
13500		req_clear_remove -= 1;
13501		FREE_GBLLOCK(&lk);
13502		clear_remove(mp);
13503		ACQUIRE_GBLLOCK(&lk);
13504		wakeup(&proc_waiting);
13505	}
13506	FREE_GBLLOCK(&lk);
13507}
13508
13509/*
13510 * Flush out a directory with at least one removal dependency in an effort to
13511 * reduce the number of dirrem, freefile, and freeblks dependency structures.
13512 */
13513static void
13514clear_remove(mp)
13515	struct mount *mp;
13516{
13517	struct pagedep_hashhead *pagedephd;
13518	struct pagedep *pagedep;
13519	struct ufsmount *ump;
13520	struct vnode *vp;
13521	struct bufobj *bo;
13522	int error, cnt;
13523	ino_t ino;
13524
13525	ump = VFSTOUFS(mp);
13526	LOCK_OWNED(ump);
13527
13528	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13529		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13530		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13531			ump->pagedep_nextclean = 0;
13532		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13533			if (LIST_EMPTY(&pagedep->pd_dirremhd))
13534				continue;
13535			ino = pagedep->pd_ino;
13536			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13537				continue;
13538			FREE_LOCK(ump);
13539
13540			/*
13541			 * Let unmount clear deps
13542			 */
13543			error = vfs_busy(mp, MBF_NOWAIT);
13544			if (error != 0)
13545				goto finish_write;
13546			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13547			     FFSV_FORCEINSMQ);
13548			vfs_unbusy(mp);
13549			if (error != 0) {
13550				softdep_error("clear_remove: vget", error);
13551				goto finish_write;
13552			}
13553			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13554				softdep_error("clear_remove: fsync", error);
13555			bo = &vp->v_bufobj;
13556			BO_LOCK(bo);
13557			drain_output(vp);
13558			BO_UNLOCK(bo);
13559			vput(vp);
13560		finish_write:
13561			vn_finished_write(mp);
13562			ACQUIRE_LOCK(ump);
13563			return;
13564		}
13565	}
13566}
13567
13568/*
13569 * Clear out a block of dirty inodes in an effort to reduce
13570 * the number of inodedep dependency structures.
13571 */
13572static void
13573clear_inodedeps(mp)
13574	struct mount *mp;
13575{
13576	struct inodedep_hashhead *inodedephd;
13577	struct inodedep *inodedep;
13578	struct ufsmount *ump;
13579	struct vnode *vp;
13580	struct fs *fs;
13581	int error, cnt;
13582	ino_t firstino, lastino, ino;
13583
13584	ump = VFSTOUFS(mp);
13585	fs = ump->um_fs;
13586	LOCK_OWNED(ump);
13587	/*
13588	 * Pick a random inode dependency to be cleared.
13589	 * We will then gather up all the inodes in its block
13590	 * that have dependencies and flush them out.
13591	 */
13592	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
13593		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
13594		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
13595			ump->inodedep_nextclean = 0;
13596		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13597			break;
13598	}
13599	if (inodedep == NULL)
13600		return;
13601	/*
13602	 * Find the last inode in the block with dependencies.
13603	 */
13604	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13605	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13606		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13607			break;
13608	/*
13609	 * Asynchronously push all but the last inode with dependencies.
13610	 * Synchronously push the last inode with dependencies to ensure
13611	 * that the inode block gets written to free up the inodedeps.
13612	 */
13613	for (ino = firstino; ino <= lastino; ino++) {
13614		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13615			continue;
13616		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13617			continue;
13618		FREE_LOCK(ump);
13619		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13620		if (error != 0) {
13621			vn_finished_write(mp);
13622			ACQUIRE_LOCK(ump);
13623			return;
13624		}
13625		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13626		    FFSV_FORCEINSMQ)) != 0) {
13627			softdep_error("clear_inodedeps: vget", error);
13628			vfs_unbusy(mp);
13629			vn_finished_write(mp);
13630			ACQUIRE_LOCK(ump);
13631			return;
13632		}
13633		vfs_unbusy(mp);
13634		if (ino == lastino) {
13635			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13636				softdep_error("clear_inodedeps: fsync1", error);
13637		} else {
13638			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13639				softdep_error("clear_inodedeps: fsync2", error);
13640			BO_LOCK(&vp->v_bufobj);
13641			drain_output(vp);
13642			BO_UNLOCK(&vp->v_bufobj);
13643		}
13644		vput(vp);
13645		vn_finished_write(mp);
13646		ACQUIRE_LOCK(ump);
13647	}
13648}
13649
13650void
13651softdep_buf_append(bp, wkhd)
13652	struct buf *bp;
13653	struct workhead *wkhd;
13654{
13655	struct worklist *wk;
13656	struct ufsmount *ump;
13657
13658	if ((wk = LIST_FIRST(wkhd)) == NULL)
13659		return;
13660	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13661	    ("softdep_buf_append called on non-softdep filesystem"));
13662	ump = VFSTOUFS(wk->wk_mp);
13663	ACQUIRE_LOCK(ump);
13664	while ((wk = LIST_FIRST(wkhd)) != NULL) {
13665		WORKLIST_REMOVE(wk);
13666		WORKLIST_INSERT(&bp->b_dep, wk);
13667	}
13668	FREE_LOCK(ump);
13669
13670}
13671
13672void
13673softdep_inode_append(ip, cred, wkhd)
13674	struct inode *ip;
13675	struct ucred *cred;
13676	struct workhead *wkhd;
13677{
13678	struct buf *bp;
13679	struct fs *fs;
13680	int error;
13681
13682	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
13683	    ("softdep_inode_append called on non-softdep filesystem"));
13684	fs = ip->i_fs;
13685	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13686	    (int)fs->fs_bsize, cred, &bp);
13687	if (error) {
13688		bqrelse(bp);
13689		softdep_freework(wkhd);
13690		return;
13691	}
13692	softdep_buf_append(bp, wkhd);
13693	bqrelse(bp);
13694}
13695
13696void
13697softdep_freework(wkhd)
13698	struct workhead *wkhd;
13699{
13700	struct worklist *wk;
13701	struct ufsmount *ump;
13702
13703	if ((wk = LIST_FIRST(wkhd)) == NULL)
13704		return;
13705	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13706	    ("softdep_freework called on non-softdep filesystem"));
13707	ump = VFSTOUFS(wk->wk_mp);
13708	ACQUIRE_LOCK(ump);
13709	handle_jwork(wkhd);
13710	FREE_LOCK(ump);
13711}
13712
13713/*
13714 * Function to determine if the buffer has outstanding dependencies
13715 * that will cause a roll-back if the buffer is written. If wantcount
13716 * is set, return number of dependencies, otherwise just yes or no.
13717 */
13718static int
13719softdep_count_dependencies(bp, wantcount)
13720	struct buf *bp;
13721	int wantcount;
13722{
13723	struct worklist *wk;
13724	struct ufsmount *ump;
13725	struct bmsafemap *bmsafemap;
13726	struct freework *freework;
13727	struct inodedep *inodedep;
13728	struct indirdep *indirdep;
13729	struct freeblks *freeblks;
13730	struct allocindir *aip;
13731	struct pagedep *pagedep;
13732	struct dirrem *dirrem;
13733	struct newblk *newblk;
13734	struct mkdir *mkdir;
13735	struct diradd *dap;
13736	int i, retval;
13737
13738	retval = 0;
13739	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
13740		return (0);
13741	ump = VFSTOUFS(wk->wk_mp);
13742	ACQUIRE_LOCK(ump);
13743	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13744		switch (wk->wk_type) {
13745
13746		case D_INODEDEP:
13747			inodedep = WK_INODEDEP(wk);
13748			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13749				/* bitmap allocation dependency */
13750				retval += 1;
13751				if (!wantcount)
13752					goto out;
13753			}
13754			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13755				/* direct block pointer dependency */
13756				retval += 1;
13757				if (!wantcount)
13758					goto out;
13759			}
13760			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13761				/* direct block pointer dependency */
13762				retval += 1;
13763				if (!wantcount)
13764					goto out;
13765			}
13766			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13767				/* Add reference dependency. */
13768				retval += 1;
13769				if (!wantcount)
13770					goto out;
13771			}
13772			continue;
13773
13774		case D_INDIRDEP:
13775			indirdep = WK_INDIRDEP(wk);
13776
13777			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13778				/* indirect truncation dependency */
13779				retval += 1;
13780				if (!wantcount)
13781					goto out;
13782			}
13783
13784			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13785				/* indirect block pointer dependency */
13786				retval += 1;
13787				if (!wantcount)
13788					goto out;
13789			}
13790			continue;
13791
13792		case D_PAGEDEP:
13793			pagedep = WK_PAGEDEP(wk);
13794			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13795				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13796					/* Journal remove ref dependency. */
13797					retval += 1;
13798					if (!wantcount)
13799						goto out;
13800				}
13801			}
13802			for (i = 0; i < DAHASHSZ; i++) {
13803
13804				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13805					/* directory entry dependency */
13806					retval += 1;
13807					if (!wantcount)
13808						goto out;
13809				}
13810			}
13811			continue;
13812
13813		case D_BMSAFEMAP:
13814			bmsafemap = WK_BMSAFEMAP(wk);
13815			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13816				/* Add reference dependency. */
13817				retval += 1;
13818				if (!wantcount)
13819					goto out;
13820			}
13821			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13822				/* Allocate block dependency. */
13823				retval += 1;
13824				if (!wantcount)
13825					goto out;
13826			}
13827			continue;
13828
13829		case D_FREEBLKS:
13830			freeblks = WK_FREEBLKS(wk);
13831			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13832				/* Freeblk journal dependency. */
13833				retval += 1;
13834				if (!wantcount)
13835					goto out;
13836			}
13837			continue;
13838
13839		case D_ALLOCDIRECT:
13840		case D_ALLOCINDIR:
13841			newblk = WK_NEWBLK(wk);
13842			if (newblk->nb_jnewblk) {
13843				/* Journal allocate dependency. */
13844				retval += 1;
13845				if (!wantcount)
13846					goto out;
13847			}
13848			continue;
13849
13850		case D_MKDIR:
13851			mkdir = WK_MKDIR(wk);
13852			if (mkdir->md_jaddref) {
13853				/* Journal reference dependency. */
13854				retval += 1;
13855				if (!wantcount)
13856					goto out;
13857			}
13858			continue;
13859
13860		case D_FREEWORK:
13861		case D_FREEDEP:
13862		case D_JSEGDEP:
13863		case D_JSEG:
13864		case D_SBDEP:
13865			/* never a dependency on these blocks */
13866			continue;
13867
13868		default:
13869			panic("softdep_count_dependencies: Unexpected type %s",
13870			    TYPENAME(wk->wk_type));
13871			/* NOTREACHED */
13872		}
13873	}
13874out:
13875	FREE_LOCK(ump);
13876	return retval;
13877}
13878
13879/*
13880 * Acquire exclusive access to a buffer.
13881 * Must be called with a locked mtx parameter.
13882 * Return acquired buffer or NULL on failure.
13883 */
13884static struct buf *
13885getdirtybuf(bp, lock, waitfor)
13886	struct buf *bp;
13887	struct rwlock *lock;
13888	int waitfor;
13889{
13890	int error;
13891
13892	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13893		if (waitfor != MNT_WAIT)
13894			return (NULL);
13895		error = BUF_LOCK(bp,
13896		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
13897		/*
13898		 * Even if we sucessfully acquire bp here, we have dropped
13899		 * lock, which may violates our guarantee.
13900		 */
13901		if (error == 0)
13902			BUF_UNLOCK(bp);
13903		else if (error != ENOLCK)
13904			panic("getdirtybuf: inconsistent lock: %d", error);
13905		rw_wlock(lock);
13906		return (NULL);
13907	}
13908	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13909		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
13910			rw_wunlock(lock);
13911			BO_LOCK(bp->b_bufobj);
13912			BUF_UNLOCK(bp);
13913			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13914				bp->b_vflags |= BV_BKGRDWAIT;
13915				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
13916				       PRIBIO | PDROP, "getbuf", 0);
13917			} else
13918				BO_UNLOCK(bp->b_bufobj);
13919			rw_wlock(lock);
13920			return (NULL);
13921		}
13922		BUF_UNLOCK(bp);
13923		if (waitfor != MNT_WAIT)
13924			return (NULL);
13925		/*
13926		 * The lock argument must be bp->b_vp's mutex in
13927		 * this case.
13928		 */
13929#ifdef	DEBUG_VFS_LOCKS
13930		if (bp->b_vp->v_type != VCHR)
13931			ASSERT_BO_WLOCKED(bp->b_bufobj);
13932#endif
13933		bp->b_vflags |= BV_BKGRDWAIT;
13934		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
13935		return (NULL);
13936	}
13937	if ((bp->b_flags & B_DELWRI) == 0) {
13938		BUF_UNLOCK(bp);
13939		return (NULL);
13940	}
13941	bremfree(bp);
13942	return (bp);
13943}
13944
13945
13946/*
13947 * Check if it is safe to suspend the file system now.  On entry,
13948 * the vnode interlock for devvp should be held.  Return 0 with
13949 * the mount interlock held if the file system can be suspended now,
13950 * otherwise return EAGAIN with the mount interlock held.
13951 */
13952int
13953softdep_check_suspend(struct mount *mp,
13954		      struct vnode *devvp,
13955		      int softdep_depcnt,
13956		      int softdep_accdepcnt,
13957		      int secondary_writes,
13958		      int secondary_accwrites)
13959{
13960	struct bufobj *bo;
13961	struct ufsmount *ump;
13962	struct inodedep *inodedep;
13963	int error, unlinked;
13964
13965	bo = &devvp->v_bufobj;
13966	ASSERT_BO_WLOCKED(bo);
13967
13968	/*
13969	 * If we are not running with soft updates, then we need only
13970	 * deal with secondary writes as we try to suspend.
13971	 */
13972	if (MOUNTEDSOFTDEP(mp) == 0) {
13973		MNT_ILOCK(mp);
13974		while (mp->mnt_secondary_writes != 0) {
13975			BO_UNLOCK(bo);
13976			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
13977			    (PUSER - 1) | PDROP, "secwr", 0);
13978			BO_LOCK(bo);
13979			MNT_ILOCK(mp);
13980		}
13981
13982		/*
13983		 * Reasons for needing more work before suspend:
13984		 * - Dirty buffers on devvp.
13985		 * - Secondary writes occurred after start of vnode sync loop
13986		 */
13987		error = 0;
13988		if (bo->bo_numoutput > 0 ||
13989		    bo->bo_dirty.bv_cnt > 0 ||
13990		    secondary_writes != 0 ||
13991		    mp->mnt_secondary_writes != 0 ||
13992		    secondary_accwrites != mp->mnt_secondary_accwrites)
13993			error = EAGAIN;
13994		BO_UNLOCK(bo);
13995		return (error);
13996	}
13997
13998	/*
13999	 * If we are running with soft updates, then we need to coordinate
14000	 * with them as we try to suspend.
14001	 */
14002	ump = VFSTOUFS(mp);
14003	for (;;) {
14004		if (!TRY_ACQUIRE_LOCK(ump)) {
14005			BO_UNLOCK(bo);
14006			ACQUIRE_LOCK(ump);
14007			FREE_LOCK(ump);
14008			BO_LOCK(bo);
14009			continue;
14010		}
14011		MNT_ILOCK(mp);
14012		if (mp->mnt_secondary_writes != 0) {
14013			FREE_LOCK(ump);
14014			BO_UNLOCK(bo);
14015			msleep(&mp->mnt_secondary_writes,
14016			       MNT_MTX(mp),
14017			       (PUSER - 1) | PDROP, "secwr", 0);
14018			BO_LOCK(bo);
14019			continue;
14020		}
14021		break;
14022	}
14023
14024	unlinked = 0;
14025	if (MOUNTEDSUJ(mp)) {
14026		for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14027		    inodedep != NULL;
14028		    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14029			if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14030			    UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14031			    UNLINKONLIST) ||
14032			    !check_inodedep_free(inodedep))
14033				continue;
14034			unlinked++;
14035		}
14036	}
14037
14038	/*
14039	 * Reasons for needing more work before suspend:
14040	 * - Dirty buffers on devvp.
14041	 * - Softdep activity occurred after start of vnode sync loop
14042	 * - Secondary writes occurred after start of vnode sync loop
14043	 */
14044	error = 0;
14045	if (bo->bo_numoutput > 0 ||
14046	    bo->bo_dirty.bv_cnt > 0 ||
14047	    softdep_depcnt != unlinked ||
14048	    ump->softdep_deps != unlinked ||
14049	    softdep_accdepcnt != ump->softdep_accdeps ||
14050	    secondary_writes != 0 ||
14051	    mp->mnt_secondary_writes != 0 ||
14052	    secondary_accwrites != mp->mnt_secondary_accwrites)
14053		error = EAGAIN;
14054	FREE_LOCK(ump);
14055	BO_UNLOCK(bo);
14056	return (error);
14057}
14058
14059
14060/*
14061 * Get the number of dependency structures for the file system, both
14062 * the current number and the total number allocated.  These will
14063 * later be used to detect that softdep processing has occurred.
14064 */
14065void
14066softdep_get_depcounts(struct mount *mp,
14067		      int *softdep_depsp,
14068		      int *softdep_accdepsp)
14069{
14070	struct ufsmount *ump;
14071
14072	if (MOUNTEDSOFTDEP(mp) == 0) {
14073		*softdep_depsp = 0;
14074		*softdep_accdepsp = 0;
14075		return;
14076	}
14077	ump = VFSTOUFS(mp);
14078	ACQUIRE_LOCK(ump);
14079	*softdep_depsp = ump->softdep_deps;
14080	*softdep_accdepsp = ump->softdep_accdeps;
14081	FREE_LOCK(ump);
14082}
14083
14084/*
14085 * Wait for pending output on a vnode to complete.
14086 * Must be called with vnode lock and interlock locked.
14087 *
14088 * XXX: Should just be a call to bufobj_wwait().
14089 */
14090static void
14091drain_output(vp)
14092	struct vnode *vp;
14093{
14094	struct bufobj *bo;
14095
14096	bo = &vp->v_bufobj;
14097	ASSERT_VOP_LOCKED(vp, "drain_output");
14098	ASSERT_BO_WLOCKED(bo);
14099
14100	while (bo->bo_numoutput) {
14101		bo->bo_flag |= BO_WWAIT;
14102		msleep((caddr_t)&bo->bo_numoutput,
14103		    BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0);
14104	}
14105}
14106
14107/*
14108 * Called whenever a buffer that is being invalidated or reallocated
14109 * contains dependencies. This should only happen if an I/O error has
14110 * occurred. The routine is called with the buffer locked.
14111 */
14112static void
14113softdep_deallocate_dependencies(bp)
14114	struct buf *bp;
14115{
14116
14117	if ((bp->b_ioflags & BIO_ERROR) == 0)
14118		panic("softdep_deallocate_dependencies: dangling deps");
14119	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14120		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14121	else
14122		printf("softdep_deallocate_dependencies: "
14123		    "got error %d while accessing filesystem\n", bp->b_error);
14124	if (bp->b_error != ENXIO)
14125		panic("softdep_deallocate_dependencies: unrecovered I/O error");
14126}
14127
14128/*
14129 * Function to handle asynchronous write errors in the filesystem.
14130 */
14131static void
14132softdep_error(func, error)
14133	char *func;
14134	int error;
14135{
14136
14137	/* XXX should do something better! */
14138	printf("%s: got error %d while accessing filesystem\n", func, error);
14139}
14140
14141#ifdef DDB
14142
14143static void
14144inodedep_print(struct inodedep *inodedep, int verbose)
14145{
14146	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
14147	    " saveino %p\n",
14148	    inodedep, inodedep->id_fs, inodedep->id_state,
14149	    (intmax_t)inodedep->id_ino,
14150	    (intmax_t)fsbtodb(inodedep->id_fs,
14151	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14152	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
14153	    inodedep->id_savedino1);
14154
14155	if (verbose == 0)
14156		return;
14157
14158	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
14159	    "mkdiradd %p\n",
14160	    LIST_FIRST(&inodedep->id_pendinghd),
14161	    LIST_FIRST(&inodedep->id_bufwait),
14162	    LIST_FIRST(&inodedep->id_inowait),
14163	    TAILQ_FIRST(&inodedep->id_inoreflst),
14164	    inodedep->id_mkdiradd);
14165	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
14166	    TAILQ_FIRST(&inodedep->id_inoupdt),
14167	    TAILQ_FIRST(&inodedep->id_newinoupdt),
14168	    TAILQ_FIRST(&inodedep->id_extupdt),
14169	    TAILQ_FIRST(&inodedep->id_newextupdt));
14170}
14171
14172DB_SHOW_COMMAND(inodedep, db_show_inodedep)
14173{
14174
14175	if (have_addr == 0) {
14176		db_printf("Address required\n");
14177		return;
14178	}
14179	inodedep_print((struct inodedep*)addr, 1);
14180}
14181
14182DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
14183{
14184	struct inodedep_hashhead *inodedephd;
14185	struct inodedep *inodedep;
14186	struct ufsmount *ump;
14187	int cnt;
14188
14189	if (have_addr == 0) {
14190		db_printf("Address required\n");
14191		return;
14192	}
14193	ump = (struct ufsmount *)addr;
14194	for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14195		inodedephd = &ump->inodedep_hashtbl[cnt];
14196		LIST_FOREACH(inodedep, inodedephd, id_hash) {
14197			inodedep_print(inodedep, 0);
14198		}
14199	}
14200}
14201
14202DB_SHOW_COMMAND(worklist, db_show_worklist)
14203{
14204	struct worklist *wk;
14205
14206	if (have_addr == 0) {
14207		db_printf("Address required\n");
14208		return;
14209	}
14210	wk = (struct worklist *)addr;
14211	printf("worklist: %p type %s state 0x%X\n",
14212	    wk, TYPENAME(wk->wk_type), wk->wk_state);
14213}
14214
14215DB_SHOW_COMMAND(workhead, db_show_workhead)
14216{
14217	struct workhead *wkhd;
14218	struct worklist *wk;
14219	int i;
14220
14221	if (have_addr == 0) {
14222		db_printf("Address required\n");
14223		return;
14224	}
14225	wkhd = (struct workhead *)addr;
14226	wk = LIST_FIRST(wkhd);
14227	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
14228		db_printf("worklist: %p type %s state 0x%X",
14229		    wk, TYPENAME(wk->wk_type), wk->wk_state);
14230	if (i == 100)
14231		db_printf("workhead overflow");
14232	printf("\n");
14233}
14234
14235
14236DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
14237{
14238	struct mkdirlist *mkdirlisthd;
14239	struct jaddref *jaddref;
14240	struct diradd *diradd;
14241	struct mkdir *mkdir;
14242
14243	if (have_addr == 0) {
14244		db_printf("Address required\n");
14245		return;
14246	}
14247	mkdirlisthd = (struct mkdirlist *)addr;
14248	LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14249		diradd = mkdir->md_diradd;
14250		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
14251		    mkdir, mkdir->md_state, diradd, diradd->da_state);
14252		if ((jaddref = mkdir->md_jaddref) != NULL)
14253			db_printf(" jaddref %p jaddref state 0x%X",
14254			    jaddref, jaddref->ja_state);
14255		db_printf("\n");
14256	}
14257}
14258
14259/* exported to ffs_vfsops.c */
14260extern void db_print_ffs(struct ufsmount *ump);
14261void
14262db_print_ffs(struct ufsmount *ump)
14263{
14264	db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
14265	    ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
14266	    ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
14267	    ump->softdep_deps, ump->softdep_req);
14268}
14269
14270#endif /* DDB */
14271
14272#endif /* SOFTUPDATES */
14273