ffs_softdep.c revision 256808
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 256808 2013-10-20 21:11:40Z mckusick $");
44
45#include "opt_ffs.h"
46#include "opt_quota.h"
47#include "opt_ddb.h"
48
49/*
50 * For now we want the safety net that the DEBUG flag provides.
51 */
52#ifndef DEBUG
53#define DEBUG
54#endif
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/ktr.h>
64#include <sys/limits.h>
65#include <sys/lock.h>
66#include <sys/malloc.h>
67#include <sys/mount.h>
68#include <sys/mutex.h>
69#include <sys/namei.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/rwlock.h>
73#include <sys/stat.h>
74#include <sys/sysctl.h>
75#include <sys/syslog.h>
76#include <sys/vnode.h>
77#include <sys/conf.h>
78
79#include <ufs/ufs/dir.h>
80#include <ufs/ufs/extattr.h>
81#include <ufs/ufs/quota.h>
82#include <ufs/ufs/inode.h>
83#include <ufs/ufs/ufsmount.h>
84#include <ufs/ffs/fs.h>
85#include <ufs/ffs/softdep.h>
86#include <ufs/ffs/ffs_extern.h>
87#include <ufs/ufs/ufs_extern.h>
88
89#include <vm/vm.h>
90#include <vm/vm_extern.h>
91#include <vm/vm_object.h>
92
93#include <geom/geom.h>
94
95#include <ddb/ddb.h>
96
97#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
98
99#ifndef SOFTUPDATES
100
101int
102softdep_flushfiles(oldmnt, flags, td)
103	struct mount *oldmnt;
104	int flags;
105	struct thread *td;
106{
107
108	panic("softdep_flushfiles called");
109}
110
111int
112softdep_mount(devvp, mp, fs, cred)
113	struct vnode *devvp;
114	struct mount *mp;
115	struct fs *fs;
116	struct ucred *cred;
117{
118
119	return (0);
120}
121
122void
123softdep_initialize()
124{
125
126	return;
127}
128
129void
130softdep_uninitialize()
131{
132
133	return;
134}
135
136void
137softdep_unmount(mp)
138	struct mount *mp;
139{
140
141}
142
143void
144softdep_setup_sbupdate(ump, fs, bp)
145	struct ufsmount *ump;
146	struct fs *fs;
147	struct buf *bp;
148{
149}
150
151void
152softdep_setup_inomapdep(bp, ip, newinum, mode)
153	struct buf *bp;
154	struct inode *ip;
155	ino_t newinum;
156	int mode;
157{
158
159	panic("softdep_setup_inomapdep called");
160}
161
162void
163softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
164	struct buf *bp;
165	struct mount *mp;
166	ufs2_daddr_t newblkno;
167	int frags;
168	int oldfrags;
169{
170
171	panic("softdep_setup_blkmapdep called");
172}
173
174void
175softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
176	struct inode *ip;
177	ufs_lbn_t lbn;
178	ufs2_daddr_t newblkno;
179	ufs2_daddr_t oldblkno;
180	long newsize;
181	long oldsize;
182	struct buf *bp;
183{
184
185	panic("softdep_setup_allocdirect called");
186}
187
188void
189softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
190	struct inode *ip;
191	ufs_lbn_t lbn;
192	ufs2_daddr_t newblkno;
193	ufs2_daddr_t oldblkno;
194	long newsize;
195	long oldsize;
196	struct buf *bp;
197{
198
199	panic("softdep_setup_allocext called");
200}
201
202void
203softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
204	struct inode *ip;
205	ufs_lbn_t lbn;
206	struct buf *bp;
207	int ptrno;
208	ufs2_daddr_t newblkno;
209	ufs2_daddr_t oldblkno;
210	struct buf *nbp;
211{
212
213	panic("softdep_setup_allocindir_page called");
214}
215
216void
217softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
218	struct buf *nbp;
219	struct inode *ip;
220	struct buf *bp;
221	int ptrno;
222	ufs2_daddr_t newblkno;
223{
224
225	panic("softdep_setup_allocindir_meta called");
226}
227
228void
229softdep_journal_freeblocks(ip, cred, length, flags)
230	struct inode *ip;
231	struct ucred *cred;
232	off_t length;
233	int flags;
234{
235
236	panic("softdep_journal_freeblocks called");
237}
238
239void
240softdep_journal_fsync(ip)
241	struct inode *ip;
242{
243
244	panic("softdep_journal_fsync called");
245}
246
247void
248softdep_setup_freeblocks(ip, length, flags)
249	struct inode *ip;
250	off_t length;
251	int flags;
252{
253
254	panic("softdep_setup_freeblocks called");
255}
256
257void
258softdep_freefile(pvp, ino, mode)
259		struct vnode *pvp;
260		ino_t ino;
261		int mode;
262{
263
264	panic("softdep_freefile called");
265}
266
267int
268softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
269	struct buf *bp;
270	struct inode *dp;
271	off_t diroffset;
272	ino_t newinum;
273	struct buf *newdirbp;
274	int isnewblk;
275{
276
277	panic("softdep_setup_directory_add called");
278}
279
280void
281softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
282	struct buf *bp;
283	struct inode *dp;
284	caddr_t base;
285	caddr_t oldloc;
286	caddr_t newloc;
287	int entrysize;
288{
289
290	panic("softdep_change_directoryentry_offset called");
291}
292
293void
294softdep_setup_remove(bp, dp, ip, isrmdir)
295	struct buf *bp;
296	struct inode *dp;
297	struct inode *ip;
298	int isrmdir;
299{
300
301	panic("softdep_setup_remove called");
302}
303
304void
305softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
306	struct buf *bp;
307	struct inode *dp;
308	struct inode *ip;
309	ino_t newinum;
310	int isrmdir;
311{
312
313	panic("softdep_setup_directory_change called");
314}
315
316void
317softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
318	struct mount *mp;
319	struct buf *bp;
320	ufs2_daddr_t blkno;
321	int frags;
322	struct workhead *wkhd;
323{
324
325	panic("%s called", __FUNCTION__);
326}
327
328void
329softdep_setup_inofree(mp, bp, ino, wkhd)
330	struct mount *mp;
331	struct buf *bp;
332	ino_t ino;
333	struct workhead *wkhd;
334{
335
336	panic("%s called", __FUNCTION__);
337}
338
339void
340softdep_setup_unlink(dp, ip)
341	struct inode *dp;
342	struct inode *ip;
343{
344
345	panic("%s called", __FUNCTION__);
346}
347
348void
349softdep_setup_link(dp, ip)
350	struct inode *dp;
351	struct inode *ip;
352{
353
354	panic("%s called", __FUNCTION__);
355}
356
357void
358softdep_revert_link(dp, ip)
359	struct inode *dp;
360	struct inode *ip;
361{
362
363	panic("%s called", __FUNCTION__);
364}
365
366void
367softdep_setup_rmdir(dp, ip)
368	struct inode *dp;
369	struct inode *ip;
370{
371
372	panic("%s called", __FUNCTION__);
373}
374
375void
376softdep_revert_rmdir(dp, ip)
377	struct inode *dp;
378	struct inode *ip;
379{
380
381	panic("%s called", __FUNCTION__);
382}
383
384void
385softdep_setup_create(dp, ip)
386	struct inode *dp;
387	struct inode *ip;
388{
389
390	panic("%s called", __FUNCTION__);
391}
392
393void
394softdep_revert_create(dp, ip)
395	struct inode *dp;
396	struct inode *ip;
397{
398
399	panic("%s called", __FUNCTION__);
400}
401
402void
403softdep_setup_mkdir(dp, ip)
404	struct inode *dp;
405	struct inode *ip;
406{
407
408	panic("%s called", __FUNCTION__);
409}
410
411void
412softdep_revert_mkdir(dp, ip)
413	struct inode *dp;
414	struct inode *ip;
415{
416
417	panic("%s called", __FUNCTION__);
418}
419
420void
421softdep_setup_dotdot_link(dp, ip)
422	struct inode *dp;
423	struct inode *ip;
424{
425
426	panic("%s called", __FUNCTION__);
427}
428
429int
430softdep_prealloc(vp, waitok)
431	struct vnode *vp;
432	int waitok;
433{
434
435	panic("%s called", __FUNCTION__);
436
437	return (0);
438}
439
440int
441softdep_journal_lookup(mp, vpp)
442	struct mount *mp;
443	struct vnode **vpp;
444{
445
446	return (ENOENT);
447}
448
449void
450softdep_change_linkcnt(ip)
451	struct inode *ip;
452{
453
454	panic("softdep_change_linkcnt called");
455}
456
457void
458softdep_load_inodeblock(ip)
459	struct inode *ip;
460{
461
462	panic("softdep_load_inodeblock called");
463}
464
465void
466softdep_update_inodeblock(ip, bp, waitfor)
467	struct inode *ip;
468	struct buf *bp;
469	int waitfor;
470{
471
472	panic("softdep_update_inodeblock called");
473}
474
475int
476softdep_fsync(vp)
477	struct vnode *vp;	/* the "in_core" copy of the inode */
478{
479
480	return (0);
481}
482
483void
484softdep_fsync_mountdev(vp)
485	struct vnode *vp;
486{
487
488	return;
489}
490
491int
492softdep_flushworklist(oldmnt, countp, td)
493	struct mount *oldmnt;
494	int *countp;
495	struct thread *td;
496{
497
498	*countp = 0;
499	return (0);
500}
501
502int
503softdep_sync_metadata(struct vnode *vp)
504{
505
506	return (0);
507}
508
509int
510softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
511{
512
513	return (0);
514}
515
516int
517softdep_slowdown(vp)
518	struct vnode *vp;
519{
520
521	panic("softdep_slowdown called");
522}
523
524int
525softdep_request_cleanup(fs, vp, cred, resource)
526	struct fs *fs;
527	struct vnode *vp;
528	struct ucred *cred;
529	int resource;
530{
531
532	return (0);
533}
534
535int
536softdep_check_suspend(struct mount *mp,
537		      struct vnode *devvp,
538		      int softdep_deps,
539		      int softdep_accdeps,
540		      int secondary_writes,
541		      int secondary_accwrites)
542{
543	struct bufobj *bo;
544	int error;
545
546	(void) softdep_deps,
547	(void) softdep_accdeps;
548
549	bo = &devvp->v_bufobj;
550	ASSERT_BO_WLOCKED(bo);
551
552	MNT_ILOCK(mp);
553	while (mp->mnt_secondary_writes != 0) {
554		BO_UNLOCK(bo);
555		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
556		    (PUSER - 1) | PDROP, "secwr", 0);
557		BO_LOCK(bo);
558		MNT_ILOCK(mp);
559	}
560
561	/*
562	 * Reasons for needing more work before suspend:
563	 * - Dirty buffers on devvp.
564	 * - Secondary writes occurred after start of vnode sync loop
565	 */
566	error = 0;
567	if (bo->bo_numoutput > 0 ||
568	    bo->bo_dirty.bv_cnt > 0 ||
569	    secondary_writes != 0 ||
570	    mp->mnt_secondary_writes != 0 ||
571	    secondary_accwrites != mp->mnt_secondary_accwrites)
572		error = EAGAIN;
573	BO_UNLOCK(bo);
574	return (error);
575}
576
577void
578softdep_get_depcounts(struct mount *mp,
579		      int *softdepactivep,
580		      int *softdepactiveaccp)
581{
582	(void) mp;
583	*softdepactivep = 0;
584	*softdepactiveaccp = 0;
585}
586
587void
588softdep_buf_append(bp, wkhd)
589	struct buf *bp;
590	struct workhead *wkhd;
591{
592
593	panic("softdep_buf_appendwork called");
594}
595
596void
597softdep_inode_append(ip, cred, wkhd)
598	struct inode *ip;
599	struct ucred *cred;
600	struct workhead *wkhd;
601{
602
603	panic("softdep_inode_appendwork called");
604}
605
606void
607softdep_freework(wkhd)
608	struct workhead *wkhd;
609{
610
611	panic("softdep_freework called");
612}
613
614#else
615
616FEATURE(softupdates, "FFS soft-updates support");
617
618/*
619 * These definitions need to be adapted to the system to which
620 * this file is being ported.
621 */
622
623#define M_SOFTDEP_FLAGS	(M_WAITOK)
624
625#define	D_PAGEDEP	0
626#define	D_INODEDEP	1
627#define	D_BMSAFEMAP	2
628#define	D_NEWBLK	3
629#define	D_ALLOCDIRECT	4
630#define	D_INDIRDEP	5
631#define	D_ALLOCINDIR	6
632#define	D_FREEFRAG	7
633#define	D_FREEBLKS	8
634#define	D_FREEFILE	9
635#define	D_DIRADD	10
636#define	D_MKDIR		11
637#define	D_DIRREM	12
638#define	D_NEWDIRBLK	13
639#define	D_FREEWORK	14
640#define	D_FREEDEP	15
641#define	D_JADDREF	16
642#define	D_JREMREF	17
643#define	D_JMVREF	18
644#define	D_JNEWBLK	19
645#define	D_JFREEBLK	20
646#define	D_JFREEFRAG	21
647#define	D_JSEG		22
648#define	D_JSEGDEP	23
649#define	D_SBDEP		24
650#define	D_JTRUNC	25
651#define	D_JFSYNC	26
652#define	D_SENTINEL	27
653#define	D_LAST		D_SENTINEL
654
655unsigned long dep_current[D_LAST + 1];
656unsigned long dep_highuse[D_LAST + 1];
657unsigned long dep_total[D_LAST + 1];
658unsigned long dep_write[D_LAST + 1];
659
660static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
661    "soft updates stats");
662static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
663    "total dependencies allocated");
664static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
665    "high use dependencies allocated");
666static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
667    "current dependencies allocated");
668static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
669    "current dependencies written");
670
671#define	SOFTDEP_TYPE(type, str, long)					\
672    static MALLOC_DEFINE(M_ ## type, #str, long);			\
673    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
674	&dep_total[D_ ## type], 0, "");					\
675    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
676	&dep_current[D_ ## type], 0, "");				\
677    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
678	&dep_highuse[D_ ## type], 0, "");				\
679    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
680	&dep_write[D_ ## type], 0, "");
681
682SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
683SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
684SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
685    "Block or frag allocated from cyl group map");
686SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
687SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
688SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
689SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
690SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
691SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
692SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
693SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
694SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
695SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
696SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
697SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
698SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
699SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
700SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
701SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
702SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
703SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
704SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
705SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
706SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
707SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
708SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
709SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
710
711static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
712
713static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
714static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
715
716/*
717 * translate from workitem type to memory type
718 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
719 */
720static struct malloc_type *memtype[] = {
721	M_PAGEDEP,
722	M_INODEDEP,
723	M_BMSAFEMAP,
724	M_NEWBLK,
725	M_ALLOCDIRECT,
726	M_INDIRDEP,
727	M_ALLOCINDIR,
728	M_FREEFRAG,
729	M_FREEBLKS,
730	M_FREEFILE,
731	M_DIRADD,
732	M_MKDIR,
733	M_DIRREM,
734	M_NEWDIRBLK,
735	M_FREEWORK,
736	M_FREEDEP,
737	M_JADDREF,
738	M_JREMREF,
739	M_JMVREF,
740	M_JNEWBLK,
741	M_JFREEBLK,
742	M_JFREEFRAG,
743	M_JSEG,
744	M_JSEGDEP,
745	M_SBDEP,
746	M_JTRUNC,
747	M_JFSYNC,
748	M_SENTINEL
749};
750
751static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
752
753#define DtoM(type) (memtype[type])
754
755/*
756 * Names of malloc types.
757 */
758#define TYPENAME(type)  \
759	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
760/*
761 * End system adaptation definitions.
762 */
763
764#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
765#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
766
767/*
768 * Forward declarations.
769 */
770struct inodedep_hashhead;
771struct newblk_hashhead;
772struct pagedep_hashhead;
773struct bmsafemap_hashhead;
774
775/*
776 * Private journaling structures.
777 */
778struct jblocks {
779	struct jseglst	jb_segs;	/* TAILQ of current segments. */
780	struct jseg	*jb_writeseg;	/* Next write to complete. */
781	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
782	struct jextent	*jb_extent;	/* Extent array. */
783	uint64_t	jb_nextseq;	/* Next sequence number. */
784	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
785	uint8_t		jb_needseg;	/* Need a forced segment. */
786	uint8_t		jb_suspended;	/* Did journal suspend writes? */
787	int		jb_avail;	/* Available extents. */
788	int		jb_used;	/* Last used extent. */
789	int		jb_head;	/* Allocator head. */
790	int		jb_off;		/* Allocator extent offset. */
791	int		jb_blocks;	/* Total disk blocks covered. */
792	int		jb_free;	/* Total disk blocks free. */
793	int		jb_min;		/* Minimum free space. */
794	int		jb_low;		/* Low on space. */
795	int		jb_age;		/* Insertion time of oldest rec. */
796};
797
798struct jextent {
799	ufs2_daddr_t	je_daddr;	/* Disk block address. */
800	int		je_blocks;	/* Disk block count. */
801};
802
803/*
804 * Internal function prototypes.
805 */
806static	void softdep_error(char *, int);
807static	int softdep_process_worklist(struct mount *, int);
808static	int softdep_waitidle(struct mount *);
809static	void drain_output(struct vnode *);
810static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
811static	void clear_remove(void);
812static	void clear_inodedeps(void);
813static	void unlinked_inodedep(struct mount *, struct inodedep *);
814static	void clear_unlinked_inodedep(struct inodedep *);
815static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
816static	int flush_pagedep_deps(struct vnode *, struct mount *,
817	    struct diraddhd *);
818static	int free_pagedep(struct pagedep *);
819static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
820static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
821static	int flush_deplist(struct allocdirectlst *, int, int *);
822static	int sync_cgs(struct mount *, int);
823static	int handle_written_filepage(struct pagedep *, struct buf *);
824static	int handle_written_sbdep(struct sbdep *, struct buf *);
825static	void initiate_write_sbdep(struct sbdep *);
826static  void diradd_inode_written(struct diradd *, struct inodedep *);
827static	int handle_written_indirdep(struct indirdep *, struct buf *,
828	    struct buf**);
829static	int handle_written_inodeblock(struct inodedep *, struct buf *);
830static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
831	    uint8_t *);
832static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
833static	void handle_written_jaddref(struct jaddref *);
834static	void handle_written_jremref(struct jremref *);
835static	void handle_written_jseg(struct jseg *, struct buf *);
836static	void handle_written_jnewblk(struct jnewblk *);
837static	void handle_written_jblkdep(struct jblkdep *);
838static	void handle_written_jfreefrag(struct jfreefrag *);
839static	void complete_jseg(struct jseg *);
840static	void complete_jsegs(struct jseg *);
841static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
842static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
843static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
844static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
845static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
846static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
847static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
848static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
849static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
850static	inline void inoref_write(struct inoref *, struct jseg *,
851	    struct jrefrec *);
852static	void handle_allocdirect_partdone(struct allocdirect *,
853	    struct workhead *);
854static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
855	    struct workhead *);
856static	void indirdep_complete(struct indirdep *);
857static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
858static	void indirblk_insert(struct freework *);
859static	void indirblk_remove(struct freework *);
860static	void handle_allocindir_partdone(struct allocindir *);
861static	void initiate_write_filepage(struct pagedep *, struct buf *);
862static	void initiate_write_indirdep(struct indirdep*, struct buf *);
863static	void handle_written_mkdir(struct mkdir *, int);
864static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
865	    uint8_t *);
866static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
867static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
868static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
869static	void handle_workitem_freefile(struct freefile *);
870static	int handle_workitem_remove(struct dirrem *, int);
871static	struct dirrem *newdirrem(struct buf *, struct inode *,
872	    struct inode *, int, struct dirrem **);
873static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
874	    struct buf *);
875static	void cancel_indirdep(struct indirdep *, struct buf *,
876	    struct freeblks *);
877static	void free_indirdep(struct indirdep *);
878static	void free_diradd(struct diradd *, struct workhead *);
879static	void merge_diradd(struct inodedep *, struct diradd *);
880static	void complete_diradd(struct diradd *);
881static	struct diradd *diradd_lookup(struct pagedep *, int);
882static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
883	    struct jremref *);
884static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
885	    struct jremref *);
886static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
887	    struct jremref *, struct jremref *);
888static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
889	    struct jremref *);
890static	void cancel_allocindir(struct allocindir *, struct buf *bp,
891	    struct freeblks *, int);
892static	int setup_trunc_indir(struct freeblks *, struct inode *,
893	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
894static	void complete_trunc_indir(struct freework *);
895static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
896	    int);
897static	void complete_mkdir(struct mkdir *);
898static	void free_newdirblk(struct newdirblk *);
899static	void free_jremref(struct jremref *);
900static	void free_jaddref(struct jaddref *);
901static	void free_jsegdep(struct jsegdep *);
902static	void free_jsegs(struct jblocks *);
903static	void rele_jseg(struct jseg *);
904static	void free_jseg(struct jseg *, struct jblocks *);
905static	void free_jnewblk(struct jnewblk *);
906static	void free_jblkdep(struct jblkdep *);
907static	void free_jfreefrag(struct jfreefrag *);
908static	void free_freedep(struct freedep *);
909static	void journal_jremref(struct dirrem *, struct jremref *,
910	    struct inodedep *);
911static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
912static	int cancel_jaddref(struct jaddref *, struct inodedep *,
913	    struct workhead *);
914static	void cancel_jfreefrag(struct jfreefrag *);
915static	inline void setup_freedirect(struct freeblks *, struct inode *,
916	    int, int);
917static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
918static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
919	    ufs_lbn_t, int);
920static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
921static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
922static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
923static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
924static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
925static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
926	    int, int);
927static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
928static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
929static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
930static	void newblk_freefrag(struct newblk*);
931static	void free_newblk(struct newblk *);
932static	void cancel_allocdirect(struct allocdirectlst *,
933	    struct allocdirect *, struct freeblks *);
934static	int check_inode_unwritten(struct inodedep *);
935static	int free_inodedep(struct inodedep *);
936static	void freework_freeblock(struct freework *);
937static	void freework_enqueue(struct freework *);
938static	int handle_workitem_freeblocks(struct freeblks *, int);
939static	int handle_complete_freeblocks(struct freeblks *, int);
940static	void handle_workitem_indirblk(struct freework *);
941static	void handle_written_freework(struct freework *);
942static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
943static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
944	    struct workhead *);
945static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
946	    struct inodedep *, struct allocindir *, ufs_lbn_t);
947static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
948	    ufs2_daddr_t, ufs_lbn_t);
949static	void handle_workitem_freefrag(struct freefrag *);
950static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
951	    ufs_lbn_t);
952static	void allocdirect_merge(struct allocdirectlst *,
953	    struct allocdirect *, struct allocdirect *);
954static	struct freefrag *allocindir_merge(struct allocindir *,
955	    struct allocindir *);
956static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
957	    struct bmsafemap **);
958static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
959	    int cg, struct bmsafemap *);
960static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
961	    int, struct newblk **);
962static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
963static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
964	    struct inodedep **);
965static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
966static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
967	    int, struct pagedep **);
968static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
969	    struct mount *mp, int, struct pagedep **);
970static	void pause_timer(void *);
971static	int request_cleanup(struct mount *, int);
972static	int process_worklist_item(struct mount *, int, int);
973static	void process_removes(struct vnode *);
974static	void process_truncates(struct vnode *);
975static	void jwork_move(struct workhead *, struct workhead *);
976static	void jwork_insert(struct workhead *, struct jsegdep *);
977static	void add_to_worklist(struct worklist *, int);
978static	void wake_worklist(struct worklist *);
979static	void wait_worklist(struct worklist *, char *);
980static	void remove_from_worklist(struct worklist *);
981static	void softdep_flush(void);
982static	void softdep_flushjournal(struct mount *);
983static	int softdep_speedup(void);
984static	void worklist_speedup(void);
985static	int journal_mount(struct mount *, struct fs *, struct ucred *);
986static	void journal_unmount(struct mount *);
987static	int journal_space(struct ufsmount *, int);
988static	void journal_suspend(struct ufsmount *);
989static	int journal_unsuspend(struct ufsmount *ump);
990static	void softdep_prelink(struct vnode *, struct vnode *);
991static	void add_to_journal(struct worklist *);
992static	void remove_from_journal(struct worklist *);
993static	void softdep_process_journal(struct mount *, struct worklist *, int);
994static	struct jremref *newjremref(struct dirrem *, struct inode *,
995	    struct inode *ip, off_t, nlink_t);
996static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
997	    uint16_t);
998static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
999	    uint16_t);
1000static	inline struct jsegdep *inoref_jseg(struct inoref *);
1001static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
1002static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
1003	    ufs2_daddr_t, int);
1004static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
1005static	void move_newblock_dep(struct jaddref *, struct inodedep *);
1006static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
1007static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
1008	    ufs2_daddr_t, long, ufs_lbn_t);
1009static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
1010	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
1011static	int jwait(struct worklist *, int);
1012static	struct inodedep *inodedep_lookup_ip(struct inode *);
1013static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
1014static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
1015static	void handle_jwork(struct workhead *);
1016static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
1017	    struct mkdir **);
1018static	struct jblocks *jblocks_create(void);
1019static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
1020static	void jblocks_free(struct jblocks *, struct mount *, int);
1021static	void jblocks_destroy(struct jblocks *);
1022static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
1023
1024/*
1025 * Exported softdep operations.
1026 */
1027static	void softdep_disk_io_initiation(struct buf *);
1028static	void softdep_disk_write_complete(struct buf *);
1029static	void softdep_deallocate_dependencies(struct buf *);
1030static	int softdep_count_dependencies(struct buf *bp, int);
1031
1032static struct rwlock lk;
1033RW_SYSINIT(softdep_lock, &lk, "Softdep Lock");
1034
1035#define TRY_ACQUIRE_LOCK(lk)		rw_try_wlock(lk)
1036#define ACQUIRE_LOCK(lk)		rw_wlock(lk)
1037#define FREE_LOCK(lk)			rw_wunlock(lk)
1038
1039#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
1040#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)
1041
1042/*
1043 * Worklist queue management.
1044 * These routines require that the lock be held.
1045 */
1046#ifndef /* NOT */ DEBUG
1047#define WORKLIST_INSERT(head, item) do {	\
1048	(item)->wk_state |= ONWORKLIST;		\
1049	LIST_INSERT_HEAD(head, item, wk_list);	\
1050} while (0)
1051#define WORKLIST_REMOVE(item) do {		\
1052	(item)->wk_state &= ~ONWORKLIST;	\
1053	LIST_REMOVE(item, wk_list);		\
1054} while (0)
1055#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1056#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1057
1058#else /* DEBUG */
1059static	void worklist_insert(struct workhead *, struct worklist *, int);
1060static	void worklist_remove(struct worklist *, int);
1061
1062#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1063#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1064#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1065#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1066
1067static void
1068worklist_insert(head, item, locked)
1069	struct workhead *head;
1070	struct worklist *item;
1071	int locked;
1072{
1073
1074	if (locked)
1075		rw_assert(&lk, RA_WLOCKED);
1076	if (item->wk_state & ONWORKLIST)
1077		panic("worklist_insert: %p %s(0x%X) already on list",
1078		    item, TYPENAME(item->wk_type), item->wk_state);
1079	item->wk_state |= ONWORKLIST;
1080	LIST_INSERT_HEAD(head, item, wk_list);
1081}
1082
1083static void
1084worklist_remove(item, locked)
1085	struct worklist *item;
1086	int locked;
1087{
1088
1089	if (locked)
1090		rw_assert(&lk, RA_WLOCKED);
1091	if ((item->wk_state & ONWORKLIST) == 0)
1092		panic("worklist_remove: %p %s(0x%X) not on list",
1093		    item, TYPENAME(item->wk_type), item->wk_state);
1094	item->wk_state &= ~ONWORKLIST;
1095	LIST_REMOVE(item, wk_list);
1096}
1097#endif /* DEBUG */
1098
1099/*
1100 * Merge two jsegdeps keeping only the oldest one as newer references
1101 * can't be discarded until after older references.
1102 */
1103static inline struct jsegdep *
1104jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1105{
1106	struct jsegdep *swp;
1107
1108	if (two == NULL)
1109		return (one);
1110
1111	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1112		swp = one;
1113		one = two;
1114		two = swp;
1115	}
1116	WORKLIST_REMOVE(&two->jd_list);
1117	free_jsegdep(two);
1118
1119	return (one);
1120}
1121
1122/*
1123 * If two freedeps are compatible free one to reduce list size.
1124 */
1125static inline struct freedep *
1126freedep_merge(struct freedep *one, struct freedep *two)
1127{
1128	if (two == NULL)
1129		return (one);
1130
1131	if (one->fd_freework == two->fd_freework) {
1132		WORKLIST_REMOVE(&two->fd_list);
1133		free_freedep(two);
1134	}
1135	return (one);
1136}
1137
1138/*
1139 * Move journal work from one list to another.  Duplicate freedeps and
1140 * jsegdeps are coalesced to keep the lists as small as possible.
1141 */
1142static void
1143jwork_move(dst, src)
1144	struct workhead *dst;
1145	struct workhead *src;
1146{
1147	struct freedep *freedep;
1148	struct jsegdep *jsegdep;
1149	struct worklist *wkn;
1150	struct worklist *wk;
1151
1152	KASSERT(dst != src,
1153	    ("jwork_move: dst == src"));
1154	freedep = NULL;
1155	jsegdep = NULL;
1156	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1157		if (wk->wk_type == D_JSEGDEP)
1158			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1159		if (wk->wk_type == D_FREEDEP)
1160			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1161	}
1162
1163	rw_assert(&lk, RA_WLOCKED);
1164	while ((wk = LIST_FIRST(src)) != NULL) {
1165		WORKLIST_REMOVE(wk);
1166		WORKLIST_INSERT(dst, wk);
1167		if (wk->wk_type == D_JSEGDEP) {
1168			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1169			continue;
1170		}
1171		if (wk->wk_type == D_FREEDEP)
1172			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1173	}
1174}
1175
1176static void
1177jwork_insert(dst, jsegdep)
1178	struct workhead *dst;
1179	struct jsegdep *jsegdep;
1180{
1181	struct jsegdep *jsegdepn;
1182	struct worklist *wk;
1183
1184	LIST_FOREACH(wk, dst, wk_list)
1185		if (wk->wk_type == D_JSEGDEP)
1186			break;
1187	if (wk == NULL) {
1188		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1189		return;
1190	}
1191	jsegdepn = WK_JSEGDEP(wk);
1192	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1193		WORKLIST_REMOVE(wk);
1194		free_jsegdep(jsegdepn);
1195		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1196	} else
1197		free_jsegdep(jsegdep);
1198}
1199
1200/*
1201 * Routines for tracking and managing workitems.
1202 */
1203static	void workitem_free(struct worklist *, int);
1204static	void workitem_alloc(struct worklist *, int, struct mount *);
1205static	void workitem_reassign(struct worklist *, int);
1206
1207#define	WORKITEM_FREE(item, type) \
1208	workitem_free((struct worklist *)(item), (type))
1209#define	WORKITEM_REASSIGN(item, type) \
1210	workitem_reassign((struct worklist *)(item), (type))
1211
1212static void
1213workitem_free(item, type)
1214	struct worklist *item;
1215	int type;
1216{
1217	struct ufsmount *ump;
1218	rw_assert(&lk, RA_WLOCKED);
1219
1220#ifdef DEBUG
1221	if (item->wk_state & ONWORKLIST)
1222		panic("workitem_free: %s(0x%X) still on list",
1223		    TYPENAME(item->wk_type), item->wk_state);
1224	if (item->wk_type != type && type != D_NEWBLK)
1225		panic("workitem_free: type mismatch %s != %s",
1226		    TYPENAME(item->wk_type), TYPENAME(type));
1227#endif
1228	if (item->wk_state & IOWAITING)
1229		wakeup(item);
1230	ump = VFSTOUFS(item->wk_mp);
1231	KASSERT(ump->softdep_deps > 0,
1232	    ("workitem_free: %s: softdep_deps going negative",
1233	    ump->um_fs->fs_fsmnt));
1234	if (--ump->softdep_deps == 0 && ump->softdep_req)
1235		wakeup(&ump->softdep_deps);
1236	KASSERT(dep_current[item->wk_type] > 0,
1237	    ("workitem_free: %s: dep_current[%s] going negative",
1238	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1239	dep_current[item->wk_type]--;
1240	free(item, DtoM(type));
1241}
1242
1243static void
1244workitem_alloc(item, type, mp)
1245	struct worklist *item;
1246	int type;
1247	struct mount *mp;
1248{
1249	struct ufsmount *ump;
1250
1251	item->wk_type = type;
1252	item->wk_mp = mp;
1253	item->wk_state = 0;
1254
1255	ump = VFSTOUFS(mp);
1256	ACQUIRE_LOCK(&lk);
1257	dep_current[type]++;
1258	if (dep_current[type] > dep_highuse[type])
1259		dep_highuse[type] = dep_current[type];
1260	dep_total[type]++;
1261	ump->softdep_deps++;
1262	ump->softdep_accdeps++;
1263	FREE_LOCK(&lk);
1264}
1265
1266static void
1267workitem_reassign(item, newtype)
1268	struct worklist *item;
1269	int newtype;
1270{
1271
1272	KASSERT(dep_current[item->wk_type] > 0,
1273	    ("workitem_reassign: %s: dep_current[%s] going negative",
1274	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1275	dep_current[item->wk_type]--;
1276	dep_current[newtype]++;
1277	if (dep_current[newtype] > dep_highuse[newtype])
1278		dep_highuse[newtype] = dep_current[newtype];
1279	dep_total[newtype]++;
1280	item->wk_type = newtype;
1281}
1282
1283/*
1284 * Workitem queue management
1285 */
1286static int max_softdeps;	/* maximum number of structs before slowdown */
1287static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1288static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1289static int proc_waiting;	/* tracks whether we have a timeout posted */
1290static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1291static struct callout softdep_callout;
1292static int req_pending;
1293static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1294static int req_clear_remove;	/* syncer process flush some freeblks */
1295static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1296
1297/*
1298 * runtime statistics
1299 */
1300static int stat_worklist_push;	/* number of worklist cleanups */
1301static int stat_blk_limit_push;	/* number of times block limit neared */
1302static int stat_ino_limit_push;	/* number of times inode limit neared */
1303static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1304static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1305static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1306static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1307static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1308static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1309static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1310static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1311static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1312static int stat_journal_min;	/* Times hit journal min threshold */
1313static int stat_journal_low;	/* Times hit journal low threshold */
1314static int stat_journal_wait;	/* Times blocked in jwait(). */
1315static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1316static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1317static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1318static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1319static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1320static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1321static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1322static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1323static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1324
1325SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1326    &max_softdeps, 0, "");
1327SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1328    &tickdelay, 0, "");
1329SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1330    &maxindirdeps, 0, "");
1331SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1332    &stat_worklist_push, 0,"");
1333SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1334    &stat_blk_limit_push, 0,"");
1335SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1336    &stat_ino_limit_push, 0,"");
1337SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1338    &stat_blk_limit_hit, 0, "");
1339SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1340    &stat_ino_limit_hit, 0, "");
1341SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1342    &stat_sync_limit_hit, 0, "");
1343SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1344    &stat_indir_blk_ptrs, 0, "");
1345SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1346    &stat_inode_bitmap, 0, "");
1347SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1348    &stat_direct_blk_ptrs, 0, "");
1349SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1350    &stat_dir_entry, 0, "");
1351SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1352    &stat_jaddref, 0, "");
1353SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1354    &stat_jnewblk, 0, "");
1355SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1356    &stat_journal_low, 0, "");
1357SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1358    &stat_journal_min, 0, "");
1359SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1360    &stat_journal_wait, 0, "");
1361SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1362    &stat_jwait_filepage, 0, "");
1363SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1364    &stat_jwait_freeblks, 0, "");
1365SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1366    &stat_jwait_inode, 0, "");
1367SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1368    &stat_jwait_newblk, 0, "");
1369SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1370    &stat_cleanup_blkrequests, 0, "");
1371SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1372    &stat_cleanup_inorequests, 0, "");
1373SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1374    &stat_cleanup_high_delay, 0, "");
1375SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1376    &stat_cleanup_retries, 0, "");
1377SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1378    &stat_cleanup_failures, 0, "");
1379SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1380    &softdep_flushcache, 0, "");
1381
1382SYSCTL_DECL(_vfs_ffs);
1383
1384LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1385static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1386
1387static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1388SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1389	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1390
1391static struct proc *softdepproc;
1392static struct kproc_desc softdep_kp = {
1393	"softdepflush",
1394	softdep_flush,
1395	&softdepproc
1396};
1397SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1398    &softdep_kp);
1399
1400static void
1401softdep_flush(void)
1402{
1403	struct mount *nmp;
1404	struct mount *mp;
1405	struct ufsmount *ump;
1406	struct thread *td;
1407	int remaining;
1408	int progress;
1409
1410	td = curthread;
1411	td->td_pflags |= TDP_NORUNNINGBUF;
1412
1413	for (;;) {
1414		kproc_suspend_check(softdepproc);
1415		ACQUIRE_LOCK(&lk);
1416		/*
1417		 * If requested, try removing inode or removal dependencies.
1418		 */
1419		if (req_clear_inodedeps) {
1420			clear_inodedeps();
1421			req_clear_inodedeps -= 1;
1422			wakeup_one(&proc_waiting);
1423		}
1424		if (req_clear_remove) {
1425			clear_remove();
1426			req_clear_remove -= 1;
1427			wakeup_one(&proc_waiting);
1428		}
1429		FREE_LOCK(&lk);
1430		remaining = progress = 0;
1431		mtx_lock(&mountlist_mtx);
1432		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1433			nmp = TAILQ_NEXT(mp, mnt_list);
1434			if (MOUNTEDSOFTDEP(mp) == 0)
1435				continue;
1436			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1437				continue;
1438			progress += softdep_process_worklist(mp, 0);
1439			ump = VFSTOUFS(mp);
1440			remaining += ump->softdep_on_worklist;
1441			mtx_lock(&mountlist_mtx);
1442			nmp = TAILQ_NEXT(mp, mnt_list);
1443			vfs_unbusy(mp);
1444		}
1445		mtx_unlock(&mountlist_mtx);
1446		if (remaining && progress)
1447			continue;
1448		ACQUIRE_LOCK(&lk);
1449		if (!req_pending)
1450			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1451		req_pending = 0;
1452		FREE_LOCK(&lk);
1453	}
1454}
1455
1456static void
1457worklist_speedup(void)
1458{
1459	rw_assert(&lk, RA_WLOCKED);
1460	if (req_pending == 0) {
1461		req_pending = 1;
1462		wakeup(&req_pending);
1463	}
1464}
1465
1466static int
1467softdep_speedup(void)
1468{
1469
1470	worklist_speedup();
1471	bd_speedup();
1472	return speedup_syncer();
1473}
1474
1475/*
1476 * Add an item to the end of the work queue.
1477 * This routine requires that the lock be held.
1478 * This is the only routine that adds items to the list.
1479 * The following routine is the only one that removes items
1480 * and does so in order from first to last.
1481 */
1482
1483#define	WK_HEAD		0x0001	/* Add to HEAD. */
1484#define	WK_NODELAY	0x0002	/* Process immediately. */
1485
1486static void
1487add_to_worklist(wk, flags)
1488	struct worklist *wk;
1489	int flags;
1490{
1491	struct ufsmount *ump;
1492
1493	rw_assert(&lk, RA_WLOCKED);
1494	ump = VFSTOUFS(wk->wk_mp);
1495	if (wk->wk_state & ONWORKLIST)
1496		panic("add_to_worklist: %s(0x%X) already on list",
1497		    TYPENAME(wk->wk_type), wk->wk_state);
1498	wk->wk_state |= ONWORKLIST;
1499	if (ump->softdep_on_worklist == 0) {
1500		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1501		ump->softdep_worklist_tail = wk;
1502	} else if (flags & WK_HEAD) {
1503		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1504	} else {
1505		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1506		ump->softdep_worklist_tail = wk;
1507	}
1508	ump->softdep_on_worklist += 1;
1509	if (flags & WK_NODELAY)
1510		worklist_speedup();
1511}
1512
1513/*
1514 * Remove the item to be processed. If we are removing the last
1515 * item on the list, we need to recalculate the tail pointer.
1516 */
1517static void
1518remove_from_worklist(wk)
1519	struct worklist *wk;
1520{
1521	struct ufsmount *ump;
1522
1523	ump = VFSTOUFS(wk->wk_mp);
1524	WORKLIST_REMOVE(wk);
1525	if (ump->softdep_worklist_tail == wk)
1526		ump->softdep_worklist_tail =
1527		    (struct worklist *)wk->wk_list.le_prev;
1528	ump->softdep_on_worklist -= 1;
1529}
1530
1531static void
1532wake_worklist(wk)
1533	struct worklist *wk;
1534{
1535	if (wk->wk_state & IOWAITING) {
1536		wk->wk_state &= ~IOWAITING;
1537		wakeup(wk);
1538	}
1539}
1540
1541static void
1542wait_worklist(wk, wmesg)
1543	struct worklist *wk;
1544	char *wmesg;
1545{
1546
1547	wk->wk_state |= IOWAITING;
1548	msleep(wk, &lk, PVM, wmesg, 0);
1549}
1550
1551/*
1552 * Process that runs once per second to handle items in the background queue.
1553 *
1554 * Note that we ensure that everything is done in the order in which they
1555 * appear in the queue. The code below depends on this property to ensure
1556 * that blocks of a file are freed before the inode itself is freed. This
1557 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1558 * until all the old ones have been purged from the dependency lists.
1559 */
1560static int
1561softdep_process_worklist(mp, full)
1562	struct mount *mp;
1563	int full;
1564{
1565	int cnt, matchcnt;
1566	struct ufsmount *ump;
1567	long starttime;
1568
1569	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1570	/*
1571	 * Record the process identifier of our caller so that we can give
1572	 * this process preferential treatment in request_cleanup below.
1573	 */
1574	matchcnt = 0;
1575	ump = VFSTOUFS(mp);
1576	ACQUIRE_LOCK(&lk);
1577	starttime = time_second;
1578	softdep_process_journal(mp, NULL, full?MNT_WAIT:0);
1579	while (ump->softdep_on_worklist > 0) {
1580		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1581			break;
1582		else
1583			matchcnt += cnt;
1584		/*
1585		 * If requested, try removing inode or removal dependencies.
1586		 */
1587		if (req_clear_inodedeps) {
1588			clear_inodedeps();
1589			req_clear_inodedeps -= 1;
1590			wakeup_one(&proc_waiting);
1591		}
1592		if (req_clear_remove) {
1593			clear_remove();
1594			req_clear_remove -= 1;
1595			wakeup_one(&proc_waiting);
1596		}
1597		/*
1598		 * We do not generally want to stop for buffer space, but if
1599		 * we are really being a buffer hog, we will stop and wait.
1600		 */
1601		if (should_yield()) {
1602			FREE_LOCK(&lk);
1603			kern_yield(PRI_USER);
1604			bwillwrite();
1605			ACQUIRE_LOCK(&lk);
1606		}
1607		/*
1608		 * Never allow processing to run for more than one
1609		 * second. Otherwise the other mountpoints may get
1610		 * excessively backlogged.
1611		 */
1612		if (!full && starttime != time_second)
1613			break;
1614	}
1615	if (full == 0)
1616		journal_unsuspend(ump);
1617	FREE_LOCK(&lk);
1618	return (matchcnt);
1619}
1620
1621/*
1622 * Process all removes associated with a vnode if we are running out of
1623 * journal space.  Any other process which attempts to flush these will
1624 * be unable as we have the vnodes locked.
1625 */
1626static void
1627process_removes(vp)
1628	struct vnode *vp;
1629{
1630	struct inodedep *inodedep;
1631	struct dirrem *dirrem;
1632	struct mount *mp;
1633	ino_t inum;
1634
1635	rw_assert(&lk, RA_WLOCKED);
1636
1637	mp = vp->v_mount;
1638	inum = VTOI(vp)->i_number;
1639	for (;;) {
1640top:
1641		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1642			return;
1643		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1644			/*
1645			 * If another thread is trying to lock this vnode
1646			 * it will fail but we must wait for it to do so
1647			 * before we can proceed.
1648			 */
1649			if (dirrem->dm_state & INPROGRESS) {
1650				wait_worklist(&dirrem->dm_list, "pwrwait");
1651				goto top;
1652			}
1653			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1654			    (COMPLETE | ONWORKLIST))
1655				break;
1656		}
1657		if (dirrem == NULL)
1658			return;
1659		remove_from_worklist(&dirrem->dm_list);
1660		FREE_LOCK(&lk);
1661		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1662			panic("process_removes: suspended filesystem");
1663		handle_workitem_remove(dirrem, 0);
1664		vn_finished_secondary_write(mp);
1665		ACQUIRE_LOCK(&lk);
1666	}
1667}
1668
1669/*
1670 * Process all truncations associated with a vnode if we are running out
1671 * of journal space.  This is called when the vnode lock is already held
1672 * and no other process can clear the truncation.  This function returns
1673 * a value greater than zero if it did any work.
1674 */
1675static void
1676process_truncates(vp)
1677	struct vnode *vp;
1678{
1679	struct inodedep *inodedep;
1680	struct freeblks *freeblks;
1681	struct mount *mp;
1682	ino_t inum;
1683	int cgwait;
1684
1685	rw_assert(&lk, RA_WLOCKED);
1686
1687	mp = vp->v_mount;
1688	inum = VTOI(vp)->i_number;
1689	for (;;) {
1690		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1691			return;
1692		cgwait = 0;
1693		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1694			/* Journal entries not yet written.  */
1695			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1696				jwait(&LIST_FIRST(
1697				    &freeblks->fb_jblkdephd)->jb_list,
1698				    MNT_WAIT);
1699				break;
1700			}
1701			/* Another thread is executing this item. */
1702			if (freeblks->fb_state & INPROGRESS) {
1703				wait_worklist(&freeblks->fb_list, "ptrwait");
1704				break;
1705			}
1706			/* Freeblks is waiting on a inode write. */
1707			if ((freeblks->fb_state & COMPLETE) == 0) {
1708				FREE_LOCK(&lk);
1709				ffs_update(vp, 1);
1710				ACQUIRE_LOCK(&lk);
1711				break;
1712			}
1713			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1714			    (ALLCOMPLETE | ONWORKLIST)) {
1715				remove_from_worklist(&freeblks->fb_list);
1716				freeblks->fb_state |= INPROGRESS;
1717				FREE_LOCK(&lk);
1718				if (vn_start_secondary_write(NULL, &mp,
1719				    V_NOWAIT))
1720					panic("process_truncates: "
1721					    "suspended filesystem");
1722				handle_workitem_freeblocks(freeblks, 0);
1723				vn_finished_secondary_write(mp);
1724				ACQUIRE_LOCK(&lk);
1725				break;
1726			}
1727			if (freeblks->fb_cgwait)
1728				cgwait++;
1729		}
1730		if (cgwait) {
1731			FREE_LOCK(&lk);
1732			sync_cgs(mp, MNT_WAIT);
1733			ffs_sync_snap(mp, MNT_WAIT);
1734			ACQUIRE_LOCK(&lk);
1735			continue;
1736		}
1737		if (freeblks == NULL)
1738			break;
1739	}
1740	return;
1741}
1742
1743/*
1744 * Process one item on the worklist.
1745 */
1746static int
1747process_worklist_item(mp, target, flags)
1748	struct mount *mp;
1749	int target;
1750	int flags;
1751{
1752	struct worklist sentinel;
1753	struct worklist *wk;
1754	struct ufsmount *ump;
1755	int matchcnt;
1756	int error;
1757
1758	rw_assert(&lk, RA_WLOCKED);
1759	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1760	/*
1761	 * If we are being called because of a process doing a
1762	 * copy-on-write, then it is not safe to write as we may
1763	 * recurse into the copy-on-write routine.
1764	 */
1765	if (curthread->td_pflags & TDP_COWINPROGRESS)
1766		return (-1);
1767	PHOLD(curproc);	/* Don't let the stack go away. */
1768	ump = VFSTOUFS(mp);
1769	matchcnt = 0;
1770	sentinel.wk_mp = NULL;
1771	sentinel.wk_type = D_SENTINEL;
1772	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1773	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1774	    wk = LIST_NEXT(&sentinel, wk_list)) {
1775		if (wk->wk_type == D_SENTINEL) {
1776			LIST_REMOVE(&sentinel, wk_list);
1777			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1778			continue;
1779		}
1780		if (wk->wk_state & INPROGRESS)
1781			panic("process_worklist_item: %p already in progress.",
1782			    wk);
1783		wk->wk_state |= INPROGRESS;
1784		remove_from_worklist(wk);
1785		FREE_LOCK(&lk);
1786		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1787			panic("process_worklist_item: suspended filesystem");
1788		switch (wk->wk_type) {
1789		case D_DIRREM:
1790			/* removal of a directory entry */
1791			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1792			break;
1793
1794		case D_FREEBLKS:
1795			/* releasing blocks and/or fragments from a file */
1796			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1797			    flags);
1798			break;
1799
1800		case D_FREEFRAG:
1801			/* releasing a fragment when replaced as a file grows */
1802			handle_workitem_freefrag(WK_FREEFRAG(wk));
1803			error = 0;
1804			break;
1805
1806		case D_FREEFILE:
1807			/* releasing an inode when its link count drops to 0 */
1808			handle_workitem_freefile(WK_FREEFILE(wk));
1809			error = 0;
1810			break;
1811
1812		default:
1813			panic("%s_process_worklist: Unknown type %s",
1814			    "softdep", TYPENAME(wk->wk_type));
1815			/* NOTREACHED */
1816		}
1817		vn_finished_secondary_write(mp);
1818		ACQUIRE_LOCK(&lk);
1819		if (error == 0) {
1820			if (++matchcnt == target)
1821				break;
1822			continue;
1823		}
1824		/*
1825		 * We have to retry the worklist item later.  Wake up any
1826		 * waiters who may be able to complete it immediately and
1827		 * add the item back to the head so we don't try to execute
1828		 * it again.
1829		 */
1830		wk->wk_state &= ~INPROGRESS;
1831		wake_worklist(wk);
1832		add_to_worklist(wk, WK_HEAD);
1833	}
1834	LIST_REMOVE(&sentinel, wk_list);
1835	/* Sentinal could've become the tail from remove_from_worklist. */
1836	if (ump->softdep_worklist_tail == &sentinel)
1837		ump->softdep_worklist_tail =
1838		    (struct worklist *)sentinel.wk_list.le_prev;
1839	PRELE(curproc);
1840	return (matchcnt);
1841}
1842
1843/*
1844 * Move dependencies from one buffer to another.
1845 */
1846int
1847softdep_move_dependencies(oldbp, newbp)
1848	struct buf *oldbp;
1849	struct buf *newbp;
1850{
1851	struct worklist *wk, *wktail;
1852	int dirty;
1853
1854	dirty = 0;
1855	wktail = NULL;
1856	ACQUIRE_LOCK(&lk);
1857	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1858		LIST_REMOVE(wk, wk_list);
1859		if (wk->wk_type == D_BMSAFEMAP &&
1860		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1861			dirty = 1;
1862		if (wktail == 0)
1863			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1864		else
1865			LIST_INSERT_AFTER(wktail, wk, wk_list);
1866		wktail = wk;
1867	}
1868	FREE_LOCK(&lk);
1869
1870	return (dirty);
1871}
1872
1873/*
1874 * Purge the work list of all items associated with a particular mount point.
1875 */
1876int
1877softdep_flushworklist(oldmnt, countp, td)
1878	struct mount *oldmnt;
1879	int *countp;
1880	struct thread *td;
1881{
1882	struct vnode *devvp;
1883	int count, error = 0;
1884	struct ufsmount *ump;
1885
1886	/*
1887	 * Alternately flush the block device associated with the mount
1888	 * point and process any dependencies that the flushing
1889	 * creates. We continue until no more worklist dependencies
1890	 * are found.
1891	 */
1892	*countp = 0;
1893	ump = VFSTOUFS(oldmnt);
1894	devvp = ump->um_devvp;
1895	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1896		*countp += count;
1897		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1898		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1899		VOP_UNLOCK(devvp, 0);
1900		if (error)
1901			break;
1902	}
1903	return (error);
1904}
1905
1906static int
1907softdep_waitidle(struct mount *mp)
1908{
1909	struct ufsmount *ump;
1910	int error;
1911	int i;
1912
1913	ump = VFSTOUFS(mp);
1914	ACQUIRE_LOCK(&lk);
1915	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1916		ump->softdep_req = 1;
1917		if (ump->softdep_on_worklist)
1918			panic("softdep_waitidle: work added after flush.");
1919		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1920	}
1921	ump->softdep_req = 0;
1922	FREE_LOCK(&lk);
1923	error = 0;
1924	if (i == 10) {
1925		error = EBUSY;
1926		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1927		    mp);
1928	}
1929
1930	return (error);
1931}
1932
1933/*
1934 * Flush all vnodes and worklist items associated with a specified mount point.
1935 */
1936int
1937softdep_flushfiles(oldmnt, flags, td)
1938	struct mount *oldmnt;
1939	int flags;
1940	struct thread *td;
1941{
1942#ifdef QUOTA
1943	struct ufsmount *ump;
1944	int i;
1945#endif
1946	int error, early, depcount, loopcnt, retry_flush_count, retry;
1947	int morework;
1948
1949	loopcnt = 10;
1950	retry_flush_count = 3;
1951retry_flush:
1952	error = 0;
1953
1954	/*
1955	 * Alternately flush the vnodes associated with the mount
1956	 * point and process any dependencies that the flushing
1957	 * creates. In theory, this loop can happen at most twice,
1958	 * but we give it a few extra just to be sure.
1959	 */
1960	for (; loopcnt > 0; loopcnt--) {
1961		/*
1962		 * Do another flush in case any vnodes were brought in
1963		 * as part of the cleanup operations.
1964		 */
1965		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1966		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1967		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1968			break;
1969		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1970		    depcount == 0)
1971			break;
1972	}
1973	/*
1974	 * If we are unmounting then it is an error to fail. If we
1975	 * are simply trying to downgrade to read-only, then filesystem
1976	 * activity can keep us busy forever, so we just fail with EBUSY.
1977	 */
1978	if (loopcnt == 0) {
1979		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1980			panic("softdep_flushfiles: looping");
1981		error = EBUSY;
1982	}
1983	if (!error)
1984		error = softdep_waitidle(oldmnt);
1985	if (!error) {
1986		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1987			retry = 0;
1988			MNT_ILOCK(oldmnt);
1989			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1990			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1991			morework = oldmnt->mnt_nvnodelistsize > 0;
1992#ifdef QUOTA
1993			ump = VFSTOUFS(oldmnt);
1994			UFS_LOCK(ump);
1995			for (i = 0; i < MAXQUOTAS; i++) {
1996				if (ump->um_quotas[i] != NULLVP)
1997					morework = 1;
1998			}
1999			UFS_UNLOCK(ump);
2000#endif
2001			if (morework) {
2002				if (--retry_flush_count > 0) {
2003					retry = 1;
2004					loopcnt = 3;
2005				} else
2006					error = EBUSY;
2007			}
2008			MNT_IUNLOCK(oldmnt);
2009			if (retry)
2010				goto retry_flush;
2011		}
2012	}
2013	return (error);
2014}
2015
2016/*
2017 * Structure hashing.
2018 *
2019 * There are three types of structures that can be looked up:
2020 *	1) pagedep structures identified by mount point, inode number,
2021 *	   and logical block.
2022 *	2) inodedep structures identified by mount point and inode number.
2023 *	3) newblk structures identified by mount point and
2024 *	   physical block number.
2025 *
2026 * The "pagedep" and "inodedep" dependency structures are hashed
2027 * separately from the file blocks and inodes to which they correspond.
2028 * This separation helps when the in-memory copy of an inode or
2029 * file block must be replaced. It also obviates the need to access
2030 * an inode or file page when simply updating (or de-allocating)
2031 * dependency structures. Lookup of newblk structures is needed to
2032 * find newly allocated blocks when trying to associate them with
2033 * their allocdirect or allocindir structure.
2034 *
2035 * The lookup routines optionally create and hash a new instance when
2036 * an existing entry is not found.
2037 */
2038#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2039#define NODELAY		0x0002	/* cannot do background work */
2040
2041/*
2042 * Structures and routines associated with pagedep caching.
2043 */
2044LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
2045u_long	pagedep_hash;		/* size of hash table - 1 */
2046#define	PAGEDEP_HASH(mp, inum, lbn) \
2047	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
2048	    pagedep_hash])
2049
2050static int
2051pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
2052	struct pagedep_hashhead *pagedephd;
2053	ino_t ino;
2054	ufs_lbn_t lbn;
2055	struct mount *mp;
2056	int flags;
2057	struct pagedep **pagedeppp;
2058{
2059	struct pagedep *pagedep;
2060
2061	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2062		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn &&
2063		    mp == pagedep->pd_list.wk_mp) {
2064			*pagedeppp = pagedep;
2065			return (1);
2066		}
2067	}
2068	*pagedeppp = NULL;
2069	return (0);
2070}
2071/*
2072 * Look up a pagedep. Return 1 if found, 0 otherwise.
2073 * If not found, allocate if DEPALLOC flag is passed.
2074 * Found or allocated entry is returned in pagedeppp.
2075 * This routine must be called with splbio interrupts blocked.
2076 */
2077static int
2078pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2079	struct mount *mp;
2080	struct buf *bp;
2081	ino_t ino;
2082	ufs_lbn_t lbn;
2083	int flags;
2084	struct pagedep **pagedeppp;
2085{
2086	struct pagedep *pagedep;
2087	struct pagedep_hashhead *pagedephd;
2088	struct worklist *wk;
2089	int ret;
2090	int i;
2091
2092	rw_assert(&lk, RA_WLOCKED);
2093	if (bp) {
2094		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2095			if (wk->wk_type == D_PAGEDEP) {
2096				*pagedeppp = WK_PAGEDEP(wk);
2097				return (1);
2098			}
2099		}
2100	}
2101	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
2102	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2103	if (ret) {
2104		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2105			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2106		return (1);
2107	}
2108	if ((flags & DEPALLOC) == 0)
2109		return (0);
2110	FREE_LOCK(&lk);
2111	pagedep = malloc(sizeof(struct pagedep),
2112	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2113	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2114	ACQUIRE_LOCK(&lk);
2115	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2116	if (*pagedeppp) {
2117		/*
2118		 * This should never happen since we only create pagedeps
2119		 * with the vnode lock held.  Could be an assert.
2120		 */
2121		WORKITEM_FREE(pagedep, D_PAGEDEP);
2122		return (ret);
2123	}
2124	pagedep->pd_ino = ino;
2125	pagedep->pd_lbn = lbn;
2126	LIST_INIT(&pagedep->pd_dirremhd);
2127	LIST_INIT(&pagedep->pd_pendinghd);
2128	for (i = 0; i < DAHASHSZ; i++)
2129		LIST_INIT(&pagedep->pd_diraddhd[i]);
2130	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2131	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2132	*pagedeppp = pagedep;
2133	return (0);
2134}
2135
2136/*
2137 * Structures and routines associated with inodedep caching.
2138 */
2139LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
2140static u_long	inodedep_hash;	/* size of hash table - 1 */
2141#define	INODEDEP_HASH(fs, inum) \
2142      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
2143
2144static int
2145inodedep_find(inodedephd, fs, inum, inodedeppp)
2146	struct inodedep_hashhead *inodedephd;
2147	struct fs *fs;
2148	ino_t inum;
2149	struct inodedep **inodedeppp;
2150{
2151	struct inodedep *inodedep;
2152
2153	LIST_FOREACH(inodedep, inodedephd, id_hash)
2154		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
2155			break;
2156	if (inodedep) {
2157		*inodedeppp = inodedep;
2158		return (1);
2159	}
2160	*inodedeppp = NULL;
2161
2162	return (0);
2163}
2164/*
2165 * Look up an inodedep. Return 1 if found, 0 if not found.
2166 * If not found, allocate if DEPALLOC flag is passed.
2167 * Found or allocated entry is returned in inodedeppp.
2168 * This routine must be called with splbio interrupts blocked.
2169 */
2170static int
2171inodedep_lookup(mp, inum, flags, inodedeppp)
2172	struct mount *mp;
2173	ino_t inum;
2174	int flags;
2175	struct inodedep **inodedeppp;
2176{
2177	struct inodedep *inodedep;
2178	struct inodedep_hashhead *inodedephd;
2179	struct fs *fs;
2180
2181	rw_assert(&lk, RA_WLOCKED);
2182	fs = VFSTOUFS(mp)->um_fs;
2183	inodedephd = INODEDEP_HASH(fs, inum);
2184
2185	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
2186		return (1);
2187	if ((flags & DEPALLOC) == 0)
2188		return (0);
2189	/*
2190	 * If we are over our limit, try to improve the situation.
2191	 */
2192	if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
2193		request_cleanup(mp, FLUSH_INODES);
2194	FREE_LOCK(&lk);
2195	inodedep = malloc(sizeof(struct inodedep),
2196		M_INODEDEP, M_SOFTDEP_FLAGS);
2197	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2198	ACQUIRE_LOCK(&lk);
2199	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
2200		WORKITEM_FREE(inodedep, D_INODEDEP);
2201		return (1);
2202	}
2203	inodedep->id_fs = fs;
2204	inodedep->id_ino = inum;
2205	inodedep->id_state = ALLCOMPLETE;
2206	inodedep->id_nlinkdelta = 0;
2207	inodedep->id_savedino1 = NULL;
2208	inodedep->id_savedsize = -1;
2209	inodedep->id_savedextsize = -1;
2210	inodedep->id_savednlink = -1;
2211	inodedep->id_bmsafemap = NULL;
2212	inodedep->id_mkdiradd = NULL;
2213	LIST_INIT(&inodedep->id_dirremhd);
2214	LIST_INIT(&inodedep->id_pendinghd);
2215	LIST_INIT(&inodedep->id_inowait);
2216	LIST_INIT(&inodedep->id_bufwait);
2217	TAILQ_INIT(&inodedep->id_inoreflst);
2218	TAILQ_INIT(&inodedep->id_inoupdt);
2219	TAILQ_INIT(&inodedep->id_newinoupdt);
2220	TAILQ_INIT(&inodedep->id_extupdt);
2221	TAILQ_INIT(&inodedep->id_newextupdt);
2222	TAILQ_INIT(&inodedep->id_freeblklst);
2223	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2224	*inodedeppp = inodedep;
2225	return (0);
2226}
2227
2228/*
2229 * Structures and routines associated with newblk caching.
2230 */
2231LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
2232u_long	newblk_hash;		/* size of hash table - 1 */
2233#define	NEWBLK_HASH(fs, inum) \
2234	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
2235
2236static int
2237newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
2238	struct newblk_hashhead *newblkhd;
2239	struct mount *mp;
2240	ufs2_daddr_t newblkno;
2241	int flags;
2242	struct newblk **newblkpp;
2243{
2244	struct newblk *newblk;
2245
2246	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2247		if (newblkno != newblk->nb_newblkno)
2248			continue;
2249		if (mp != newblk->nb_list.wk_mp)
2250			continue;
2251		/*
2252		 * If we're creating a new dependency don't match those that
2253		 * have already been converted to allocdirects.  This is for
2254		 * a frag extend.
2255		 */
2256		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2257			continue;
2258		break;
2259	}
2260	if (newblk) {
2261		*newblkpp = newblk;
2262		return (1);
2263	}
2264	*newblkpp = NULL;
2265	return (0);
2266}
2267
2268/*
2269 * Look up a newblk. Return 1 if found, 0 if not found.
2270 * If not found, allocate if DEPALLOC flag is passed.
2271 * Found or allocated entry is returned in newblkpp.
2272 */
2273static int
2274newblk_lookup(mp, newblkno, flags, newblkpp)
2275	struct mount *mp;
2276	ufs2_daddr_t newblkno;
2277	int flags;
2278	struct newblk **newblkpp;
2279{
2280	struct newblk *newblk;
2281	struct newblk_hashhead *newblkhd;
2282
2283	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
2284	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
2285		return (1);
2286	if ((flags & DEPALLOC) == 0)
2287		return (0);
2288	FREE_LOCK(&lk);
2289	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2290	    M_SOFTDEP_FLAGS | M_ZERO);
2291	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2292	ACQUIRE_LOCK(&lk);
2293	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
2294		WORKITEM_FREE(newblk, D_NEWBLK);
2295		return (1);
2296	}
2297	newblk->nb_freefrag = NULL;
2298	LIST_INIT(&newblk->nb_indirdeps);
2299	LIST_INIT(&newblk->nb_newdirblk);
2300	LIST_INIT(&newblk->nb_jwork);
2301	newblk->nb_state = ATTACHED;
2302	newblk->nb_newblkno = newblkno;
2303	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2304	*newblkpp = newblk;
2305	return (0);
2306}
2307
2308/*
2309 * Structures and routines associated with freed indirect block caching.
2310 */
2311struct freeworklst *indir_hashtbl;
2312u_long	indir_hash;		/* size of hash table - 1 */
2313#define	INDIR_HASH(mp, blkno) \
2314	(&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])
2315
2316/*
2317 * Lookup an indirect block in the indir hash table.  The freework is
2318 * removed and potentially freed.  The caller must do a blocking journal
2319 * write before writing to the blkno.
2320 */
2321static int
2322indirblk_lookup(mp, blkno)
2323	struct mount *mp;
2324	ufs2_daddr_t blkno;
2325{
2326	struct freework *freework;
2327	struct freeworklst *wkhd;
2328
2329	wkhd = INDIR_HASH(mp, blkno);
2330	TAILQ_FOREACH(freework, wkhd, fw_next) {
2331		if (freework->fw_blkno != blkno)
2332			continue;
2333		if (freework->fw_list.wk_mp != mp)
2334			continue;
2335		indirblk_remove(freework);
2336		return (1);
2337	}
2338	return (0);
2339}
2340
2341/*
2342 * Insert an indirect block represented by freework into the indirblk
2343 * hash table so that it may prevent the block from being re-used prior
2344 * to the journal being written.
2345 */
2346static void
2347indirblk_insert(freework)
2348	struct freework *freework;
2349{
2350	struct jblocks *jblocks;
2351	struct jseg *jseg;
2352
2353	jblocks = VFSTOUFS(freework->fw_list.wk_mp)->softdep_jblocks;
2354	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2355	if (jseg == NULL)
2356		return;
2357
2358	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2359	TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp,
2360	    freework->fw_blkno), freework, fw_next);
2361	freework->fw_state &= ~DEPCOMPLETE;
2362}
2363
2364static void
2365indirblk_remove(freework)
2366	struct freework *freework;
2367{
2368
2369	LIST_REMOVE(freework, fw_segs);
2370	TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp,
2371	    freework->fw_blkno), freework, fw_next);
2372	freework->fw_state |= DEPCOMPLETE;
2373	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2374		WORKITEM_FREE(freework, D_FREEWORK);
2375}
2376
2377/*
2378 * Executed during filesystem system initialization before
2379 * mounting any filesystems.
2380 */
2381void
2382softdep_initialize()
2383{
2384	int i;
2385
2386	LIST_INIT(&mkdirlisthd);
2387	max_softdeps = desiredvnodes * 4;
2388	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
2389	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
2390	newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK, &newblk_hash);
2391	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
2392	i = 1 << (ffs(desiredvnodes / 10) - 1);
2393	indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK,
2394	    M_WAITOK);
2395	indir_hash = i - 1;
2396	for (i = 0; i <= indir_hash; i++)
2397		TAILQ_INIT(&indir_hashtbl[i]);
2398
2399	/* initialise bioops hack */
2400	bioops.io_start = softdep_disk_io_initiation;
2401	bioops.io_complete = softdep_disk_write_complete;
2402	bioops.io_deallocate = softdep_deallocate_dependencies;
2403	bioops.io_countdeps = softdep_count_dependencies;
2404
2405	/* Initialize the callout with an mtx. */
2406	callout_init_mtx(&softdep_callout, &lk, 0);
2407}
2408
2409/*
2410 * Executed after all filesystems have been unmounted during
2411 * filesystem module unload.
2412 */
2413void
2414softdep_uninitialize()
2415{
2416
2417	callout_drain(&softdep_callout);
2418	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2419	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2420	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2421	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2422	free(indir_hashtbl, M_FREEWORK);
2423}
2424
2425/*
2426 * Called at mount time to notify the dependency code that a
2427 * filesystem wishes to use it.
2428 */
2429int
2430softdep_mount(devvp, mp, fs, cred)
2431	struct vnode *devvp;
2432	struct mount *mp;
2433	struct fs *fs;
2434	struct ucred *cred;
2435{
2436	struct csum_total cstotal;
2437	struct ufsmount *ump;
2438	struct cg *cgp;
2439	struct buf *bp;
2440	int error, cyl;
2441
2442	MNT_ILOCK(mp);
2443	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2444	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2445		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2446			MNTK_SOFTDEP | MNTK_NOASYNC;
2447	}
2448	MNT_IUNLOCK(mp);
2449	ump = VFSTOUFS(mp);
2450	LIST_INIT(&ump->softdep_workitem_pending);
2451	LIST_INIT(&ump->softdep_journal_pending);
2452	TAILQ_INIT(&ump->softdep_unlinked);
2453	LIST_INIT(&ump->softdep_dirtycg);
2454	ump->softdep_worklist_tail = NULL;
2455	ump->softdep_on_worklist = 0;
2456	ump->softdep_deps = 0;
2457	if ((fs->fs_flags & FS_SUJ) &&
2458	    (error = journal_mount(mp, fs, cred)) != 0) {
2459		printf("Failed to start journal: %d\n", error);
2460		return (error);
2461	}
2462	/*
2463	 * When doing soft updates, the counters in the
2464	 * superblock may have gotten out of sync. Recomputation
2465	 * can take a long time and can be deferred for background
2466	 * fsck.  However, the old behavior of scanning the cylinder
2467	 * groups and recalculating them at mount time is available
2468	 * by setting vfs.ffs.compute_summary_at_mount to one.
2469	 */
2470	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2471		return (0);
2472	bzero(&cstotal, sizeof cstotal);
2473	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2474		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2475		    fs->fs_cgsize, cred, &bp)) != 0) {
2476			brelse(bp);
2477			return (error);
2478		}
2479		cgp = (struct cg *)bp->b_data;
2480		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2481		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2482		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2483		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2484		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2485		brelse(bp);
2486	}
2487#ifdef DEBUG
2488	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2489		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2490#endif
2491	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2492	return (0);
2493}
2494
2495void
2496softdep_unmount(mp)
2497	struct mount *mp;
2498{
2499
2500	MNT_ILOCK(mp);
2501	mp->mnt_flag &= ~MNT_SOFTDEP;
2502	if (MOUNTEDSUJ(mp) == 0) {
2503		MNT_IUNLOCK(mp);
2504		return;
2505	}
2506	mp->mnt_flag &= ~MNT_SUJ;
2507	MNT_IUNLOCK(mp);
2508	journal_unmount(mp);
2509}
2510
2511static struct jblocks *
2512jblocks_create(void)
2513{
2514	struct jblocks *jblocks;
2515
2516	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2517	TAILQ_INIT(&jblocks->jb_segs);
2518	jblocks->jb_avail = 10;
2519	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2520	    M_JBLOCKS, M_WAITOK | M_ZERO);
2521
2522	return (jblocks);
2523}
2524
2525static ufs2_daddr_t
2526jblocks_alloc(jblocks, bytes, actual)
2527	struct jblocks *jblocks;
2528	int bytes;
2529	int *actual;
2530{
2531	ufs2_daddr_t daddr;
2532	struct jextent *jext;
2533	int freecnt;
2534	int blocks;
2535
2536	blocks = bytes / DEV_BSIZE;
2537	jext = &jblocks->jb_extent[jblocks->jb_head];
2538	freecnt = jext->je_blocks - jblocks->jb_off;
2539	if (freecnt == 0) {
2540		jblocks->jb_off = 0;
2541		if (++jblocks->jb_head > jblocks->jb_used)
2542			jblocks->jb_head = 0;
2543		jext = &jblocks->jb_extent[jblocks->jb_head];
2544		freecnt = jext->je_blocks;
2545	}
2546	if (freecnt > blocks)
2547		freecnt = blocks;
2548	*actual = freecnt * DEV_BSIZE;
2549	daddr = jext->je_daddr + jblocks->jb_off;
2550	jblocks->jb_off += freecnt;
2551	jblocks->jb_free -= freecnt;
2552
2553	return (daddr);
2554}
2555
2556static void
2557jblocks_free(jblocks, mp, bytes)
2558	struct jblocks *jblocks;
2559	struct mount *mp;
2560	int bytes;
2561{
2562
2563	jblocks->jb_free += bytes / DEV_BSIZE;
2564	if (jblocks->jb_suspended)
2565		worklist_speedup();
2566	wakeup(jblocks);
2567}
2568
2569static void
2570jblocks_destroy(jblocks)
2571	struct jblocks *jblocks;
2572{
2573
2574	if (jblocks->jb_extent)
2575		free(jblocks->jb_extent, M_JBLOCKS);
2576	free(jblocks, M_JBLOCKS);
2577}
2578
2579static void
2580jblocks_add(jblocks, daddr, blocks)
2581	struct jblocks *jblocks;
2582	ufs2_daddr_t daddr;
2583	int blocks;
2584{
2585	struct jextent *jext;
2586
2587	jblocks->jb_blocks += blocks;
2588	jblocks->jb_free += blocks;
2589	jext = &jblocks->jb_extent[jblocks->jb_used];
2590	/* Adding the first block. */
2591	if (jext->je_daddr == 0) {
2592		jext->je_daddr = daddr;
2593		jext->je_blocks = blocks;
2594		return;
2595	}
2596	/* Extending the last extent. */
2597	if (jext->je_daddr + jext->je_blocks == daddr) {
2598		jext->je_blocks += blocks;
2599		return;
2600	}
2601	/* Adding a new extent. */
2602	if (++jblocks->jb_used == jblocks->jb_avail) {
2603		jblocks->jb_avail *= 2;
2604		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2605		    M_JBLOCKS, M_WAITOK | M_ZERO);
2606		memcpy(jext, jblocks->jb_extent,
2607		    sizeof(struct jextent) * jblocks->jb_used);
2608		free(jblocks->jb_extent, M_JBLOCKS);
2609		jblocks->jb_extent = jext;
2610	}
2611	jext = &jblocks->jb_extent[jblocks->jb_used];
2612	jext->je_daddr = daddr;
2613	jext->je_blocks = blocks;
2614	return;
2615}
2616
2617int
2618softdep_journal_lookup(mp, vpp)
2619	struct mount *mp;
2620	struct vnode **vpp;
2621{
2622	struct componentname cnp;
2623	struct vnode *dvp;
2624	ino_t sujournal;
2625	int error;
2626
2627	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2628	if (error)
2629		return (error);
2630	bzero(&cnp, sizeof(cnp));
2631	cnp.cn_nameiop = LOOKUP;
2632	cnp.cn_flags = ISLASTCN;
2633	cnp.cn_thread = curthread;
2634	cnp.cn_cred = curthread->td_ucred;
2635	cnp.cn_pnbuf = SUJ_FILE;
2636	cnp.cn_nameptr = SUJ_FILE;
2637	cnp.cn_namelen = strlen(SUJ_FILE);
2638	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2639	vput(dvp);
2640	if (error != 0)
2641		return (error);
2642	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2643	return (error);
2644}
2645
2646/*
2647 * Open and verify the journal file.
2648 */
2649static int
2650journal_mount(mp, fs, cred)
2651	struct mount *mp;
2652	struct fs *fs;
2653	struct ucred *cred;
2654{
2655	struct jblocks *jblocks;
2656	struct vnode *vp;
2657	struct inode *ip;
2658	ufs2_daddr_t blkno;
2659	int bcount;
2660	int error;
2661	int i;
2662
2663	error = softdep_journal_lookup(mp, &vp);
2664	if (error != 0) {
2665		printf("Failed to find journal.  Use tunefs to create one\n");
2666		return (error);
2667	}
2668	ip = VTOI(vp);
2669	if (ip->i_size < SUJ_MIN) {
2670		error = ENOSPC;
2671		goto out;
2672	}
2673	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2674	jblocks = jblocks_create();
2675	for (i = 0; i < bcount; i++) {
2676		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2677		if (error)
2678			break;
2679		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2680	}
2681	if (error) {
2682		jblocks_destroy(jblocks);
2683		goto out;
2684	}
2685	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2686	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2687	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2688out:
2689	if (error == 0) {
2690		MNT_ILOCK(mp);
2691		mp->mnt_flag |= MNT_SUJ;
2692		mp->mnt_flag &= ~MNT_SOFTDEP;
2693		MNT_IUNLOCK(mp);
2694		/*
2695		 * Only validate the journal contents if the
2696		 * filesystem is clean, otherwise we write the logs
2697		 * but they'll never be used.  If the filesystem was
2698		 * still dirty when we mounted it the journal is
2699		 * invalid and a new journal can only be valid if it
2700		 * starts from a clean mount.
2701		 */
2702		if (fs->fs_clean) {
2703			DIP_SET(ip, i_modrev, fs->fs_mtime);
2704			ip->i_flags |= IN_MODIFIED;
2705			ffs_update(vp, 1);
2706		}
2707	}
2708	vput(vp);
2709	return (error);
2710}
2711
2712static void
2713journal_unmount(mp)
2714	struct mount *mp;
2715{
2716	struct ufsmount *ump;
2717
2718	ump = VFSTOUFS(mp);
2719	if (ump->softdep_jblocks)
2720		jblocks_destroy(ump->softdep_jblocks);
2721	ump->softdep_jblocks = NULL;
2722}
2723
2724/*
2725 * Called when a journal record is ready to be written.  Space is allocated
2726 * and the journal entry is created when the journal is flushed to stable
2727 * store.
2728 */
2729static void
2730add_to_journal(wk)
2731	struct worklist *wk;
2732{
2733	struct ufsmount *ump;
2734
2735	rw_assert(&lk, RA_WLOCKED);
2736	ump = VFSTOUFS(wk->wk_mp);
2737	if (wk->wk_state & ONWORKLIST)
2738		panic("add_to_journal: %s(0x%X) already on list",
2739		    TYPENAME(wk->wk_type), wk->wk_state);
2740	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2741	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2742		ump->softdep_jblocks->jb_age = ticks;
2743		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2744	} else
2745		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2746	ump->softdep_journal_tail = wk;
2747	ump->softdep_on_journal += 1;
2748}
2749
2750/*
2751 * Remove an arbitrary item for the journal worklist maintain the tail
2752 * pointer.  This happens when a new operation obviates the need to
2753 * journal an old operation.
2754 */
2755static void
2756remove_from_journal(wk)
2757	struct worklist *wk;
2758{
2759	struct ufsmount *ump;
2760
2761	rw_assert(&lk, RA_WLOCKED);
2762	ump = VFSTOUFS(wk->wk_mp);
2763#ifdef SUJ_DEBUG
2764	{
2765		struct worklist *wkn;
2766
2767		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2768			if (wkn == wk)
2769				break;
2770		if (wkn == NULL)
2771			panic("remove_from_journal: %p is not in journal", wk);
2772	}
2773#endif
2774	/*
2775	 * We emulate a TAILQ to save space in most structures which do not
2776	 * require TAILQ semantics.  Here we must update the tail position
2777	 * when removing the tail which is not the final entry. This works
2778	 * only if the worklist linkage are at the beginning of the structure.
2779	 */
2780	if (ump->softdep_journal_tail == wk)
2781		ump->softdep_journal_tail =
2782		    (struct worklist *)wk->wk_list.le_prev;
2783
2784	WORKLIST_REMOVE(wk);
2785	ump->softdep_on_journal -= 1;
2786}
2787
2788/*
2789 * Check for journal space as well as dependency limits so the prelink
2790 * code can throttle both journaled and non-journaled filesystems.
2791 * Threshold is 0 for low and 1 for min.
2792 */
2793static int
2794journal_space(ump, thresh)
2795	struct ufsmount *ump;
2796	int thresh;
2797{
2798	struct jblocks *jblocks;
2799	int avail;
2800
2801	jblocks = ump->softdep_jblocks;
2802	if (jblocks == NULL)
2803		return (1);
2804	/*
2805	 * We use a tighter restriction here to prevent request_cleanup()
2806	 * running in threads from running into locks we currently hold.
2807	 */
2808	if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9)
2809		return (0);
2810	if (thresh)
2811		thresh = jblocks->jb_min;
2812	else
2813		thresh = jblocks->jb_low;
2814	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2815	avail = jblocks->jb_free - avail;
2816
2817	return (avail > thresh);
2818}
2819
2820static void
2821journal_suspend(ump)
2822	struct ufsmount *ump;
2823{
2824	struct jblocks *jblocks;
2825	struct mount *mp;
2826
2827	mp = UFSTOVFS(ump);
2828	jblocks = ump->softdep_jblocks;
2829	MNT_ILOCK(mp);
2830	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2831		stat_journal_min++;
2832		mp->mnt_kern_flag |= MNTK_SUSPEND;
2833		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2834	}
2835	jblocks->jb_suspended = 1;
2836	MNT_IUNLOCK(mp);
2837}
2838
2839static int
2840journal_unsuspend(struct ufsmount *ump)
2841{
2842	struct jblocks *jblocks;
2843	struct mount *mp;
2844
2845	mp = UFSTOVFS(ump);
2846	jblocks = ump->softdep_jblocks;
2847
2848	if (jblocks != NULL && jblocks->jb_suspended &&
2849	    journal_space(ump, jblocks->jb_min)) {
2850		jblocks->jb_suspended = 0;
2851		FREE_LOCK(&lk);
2852		mp->mnt_susp_owner = curthread;
2853		vfs_write_resume(mp, 0);
2854		ACQUIRE_LOCK(&lk);
2855		return (1);
2856	}
2857	return (0);
2858}
2859
2860/*
2861 * Called before any allocation function to be certain that there is
2862 * sufficient space in the journal prior to creating any new records.
2863 * Since in the case of block allocation we may have multiple locked
2864 * buffers at the time of the actual allocation we can not block
2865 * when the journal records are created.  Doing so would create a deadlock
2866 * if any of these buffers needed to be flushed to reclaim space.  Instead
2867 * we require a sufficiently large amount of available space such that
2868 * each thread in the system could have passed this allocation check and
2869 * still have sufficient free space.  With 20% of a minimum journal size
2870 * of 1MB we have 6553 records available.
2871 */
2872int
2873softdep_prealloc(vp, waitok)
2874	struct vnode *vp;
2875	int waitok;
2876{
2877	struct ufsmount *ump;
2878
2879	/*
2880	 * Nothing to do if we are not running journaled soft updates.
2881	 * If we currently hold the snapshot lock, we must avoid handling
2882	 * other resources that could cause deadlock.
2883	 */
2884	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2885		return (0);
2886	ump = VFSTOUFS(vp->v_mount);
2887	ACQUIRE_LOCK(&lk);
2888	if (journal_space(ump, 0)) {
2889		FREE_LOCK(&lk);
2890		return (0);
2891	}
2892	stat_journal_low++;
2893	FREE_LOCK(&lk);
2894	if (waitok == MNT_NOWAIT)
2895		return (ENOSPC);
2896	/*
2897	 * Attempt to sync this vnode once to flush any journal
2898	 * work attached to it.
2899	 */
2900	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2901		ffs_syncvnode(vp, waitok, 0);
2902	ACQUIRE_LOCK(&lk);
2903	process_removes(vp);
2904	process_truncates(vp);
2905	if (journal_space(ump, 0) == 0) {
2906		softdep_speedup();
2907		if (journal_space(ump, 1) == 0)
2908			journal_suspend(ump);
2909	}
2910	FREE_LOCK(&lk);
2911
2912	return (0);
2913}
2914
2915/*
2916 * Before adjusting a link count on a vnode verify that we have sufficient
2917 * journal space.  If not, process operations that depend on the currently
2918 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2919 * and softdep flush threads can not acquire these locks to reclaim space.
2920 */
2921static void
2922softdep_prelink(dvp, vp)
2923	struct vnode *dvp;
2924	struct vnode *vp;
2925{
2926	struct ufsmount *ump;
2927
2928	ump = VFSTOUFS(dvp->v_mount);
2929	rw_assert(&lk, RA_WLOCKED);
2930	/*
2931	 * Nothing to do if we have sufficient journal space.
2932	 * If we currently hold the snapshot lock, we must avoid
2933	 * handling other resources that could cause deadlock.
2934	 */
2935	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
2936		return;
2937	stat_journal_low++;
2938	FREE_LOCK(&lk);
2939	if (vp)
2940		ffs_syncvnode(vp, MNT_NOWAIT, 0);
2941	ffs_syncvnode(dvp, MNT_WAIT, 0);
2942	ACQUIRE_LOCK(&lk);
2943	/* Process vp before dvp as it may create .. removes. */
2944	if (vp) {
2945		process_removes(vp);
2946		process_truncates(vp);
2947	}
2948	process_removes(dvp);
2949	process_truncates(dvp);
2950	softdep_speedup();
2951	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
2952	if (journal_space(ump, 0) == 0) {
2953		softdep_speedup();
2954		if (journal_space(ump, 1) == 0)
2955			journal_suspend(ump);
2956	}
2957}
2958
2959static void
2960jseg_write(ump, jseg, data)
2961	struct ufsmount *ump;
2962	struct jseg *jseg;
2963	uint8_t *data;
2964{
2965	struct jsegrec *rec;
2966
2967	rec = (struct jsegrec *)data;
2968	rec->jsr_seq = jseg->js_seq;
2969	rec->jsr_oldest = jseg->js_oldseq;
2970	rec->jsr_cnt = jseg->js_cnt;
2971	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2972	rec->jsr_crc = 0;
2973	rec->jsr_time = ump->um_fs->fs_mtime;
2974}
2975
2976static inline void
2977inoref_write(inoref, jseg, rec)
2978	struct inoref *inoref;
2979	struct jseg *jseg;
2980	struct jrefrec *rec;
2981{
2982
2983	inoref->if_jsegdep->jd_seg = jseg;
2984	rec->jr_ino = inoref->if_ino;
2985	rec->jr_parent = inoref->if_parent;
2986	rec->jr_nlink = inoref->if_nlink;
2987	rec->jr_mode = inoref->if_mode;
2988	rec->jr_diroff = inoref->if_diroff;
2989}
2990
2991static void
2992jaddref_write(jaddref, jseg, data)
2993	struct jaddref *jaddref;
2994	struct jseg *jseg;
2995	uint8_t *data;
2996{
2997	struct jrefrec *rec;
2998
2999	rec = (struct jrefrec *)data;
3000	rec->jr_op = JOP_ADDREF;
3001	inoref_write(&jaddref->ja_ref, jseg, rec);
3002}
3003
3004static void
3005jremref_write(jremref, jseg, data)
3006	struct jremref *jremref;
3007	struct jseg *jseg;
3008	uint8_t *data;
3009{
3010	struct jrefrec *rec;
3011
3012	rec = (struct jrefrec *)data;
3013	rec->jr_op = JOP_REMREF;
3014	inoref_write(&jremref->jr_ref, jseg, rec);
3015}
3016
3017static void
3018jmvref_write(jmvref, jseg, data)
3019	struct jmvref *jmvref;
3020	struct jseg *jseg;
3021	uint8_t *data;
3022{
3023	struct jmvrec *rec;
3024
3025	rec = (struct jmvrec *)data;
3026	rec->jm_op = JOP_MVREF;
3027	rec->jm_ino = jmvref->jm_ino;
3028	rec->jm_parent = jmvref->jm_parent;
3029	rec->jm_oldoff = jmvref->jm_oldoff;
3030	rec->jm_newoff = jmvref->jm_newoff;
3031}
3032
3033static void
3034jnewblk_write(jnewblk, jseg, data)
3035	struct jnewblk *jnewblk;
3036	struct jseg *jseg;
3037	uint8_t *data;
3038{
3039	struct jblkrec *rec;
3040
3041	jnewblk->jn_jsegdep->jd_seg = jseg;
3042	rec = (struct jblkrec *)data;
3043	rec->jb_op = JOP_NEWBLK;
3044	rec->jb_ino = jnewblk->jn_ino;
3045	rec->jb_blkno = jnewblk->jn_blkno;
3046	rec->jb_lbn = jnewblk->jn_lbn;
3047	rec->jb_frags = jnewblk->jn_frags;
3048	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3049}
3050
3051static void
3052jfreeblk_write(jfreeblk, jseg, data)
3053	struct jfreeblk *jfreeblk;
3054	struct jseg *jseg;
3055	uint8_t *data;
3056{
3057	struct jblkrec *rec;
3058
3059	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3060	rec = (struct jblkrec *)data;
3061	rec->jb_op = JOP_FREEBLK;
3062	rec->jb_ino = jfreeblk->jf_ino;
3063	rec->jb_blkno = jfreeblk->jf_blkno;
3064	rec->jb_lbn = jfreeblk->jf_lbn;
3065	rec->jb_frags = jfreeblk->jf_frags;
3066	rec->jb_oldfrags = 0;
3067}
3068
3069static void
3070jfreefrag_write(jfreefrag, jseg, data)
3071	struct jfreefrag *jfreefrag;
3072	struct jseg *jseg;
3073	uint8_t *data;
3074{
3075	struct jblkrec *rec;
3076
3077	jfreefrag->fr_jsegdep->jd_seg = jseg;
3078	rec = (struct jblkrec *)data;
3079	rec->jb_op = JOP_FREEBLK;
3080	rec->jb_ino = jfreefrag->fr_ino;
3081	rec->jb_blkno = jfreefrag->fr_blkno;
3082	rec->jb_lbn = jfreefrag->fr_lbn;
3083	rec->jb_frags = jfreefrag->fr_frags;
3084	rec->jb_oldfrags = 0;
3085}
3086
3087static void
3088jtrunc_write(jtrunc, jseg, data)
3089	struct jtrunc *jtrunc;
3090	struct jseg *jseg;
3091	uint8_t *data;
3092{
3093	struct jtrncrec *rec;
3094
3095	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3096	rec = (struct jtrncrec *)data;
3097	rec->jt_op = JOP_TRUNC;
3098	rec->jt_ino = jtrunc->jt_ino;
3099	rec->jt_size = jtrunc->jt_size;
3100	rec->jt_extsize = jtrunc->jt_extsize;
3101}
3102
3103static void
3104jfsync_write(jfsync, jseg, data)
3105	struct jfsync *jfsync;
3106	struct jseg *jseg;
3107	uint8_t *data;
3108{
3109	struct jtrncrec *rec;
3110
3111	rec = (struct jtrncrec *)data;
3112	rec->jt_op = JOP_SYNC;
3113	rec->jt_ino = jfsync->jfs_ino;
3114	rec->jt_size = jfsync->jfs_size;
3115	rec->jt_extsize = jfsync->jfs_extsize;
3116}
3117
3118static void
3119softdep_flushjournal(mp)
3120	struct mount *mp;
3121{
3122	struct jblocks *jblocks;
3123	struct ufsmount *ump;
3124
3125	if (MOUNTEDSUJ(mp) == 0)
3126		return;
3127	ump = VFSTOUFS(mp);
3128	jblocks = ump->softdep_jblocks;
3129	ACQUIRE_LOCK(&lk);
3130	while (ump->softdep_on_journal) {
3131		jblocks->jb_needseg = 1;
3132		softdep_process_journal(mp, NULL, MNT_WAIT);
3133	}
3134	FREE_LOCK(&lk);
3135}
3136
3137static void softdep_synchronize_completed(struct bio *);
3138static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3139
3140static void
3141softdep_synchronize_completed(bp)
3142        struct bio *bp;
3143{
3144	struct jseg *oldest;
3145	struct jseg *jseg;
3146
3147	/*
3148	 * caller1 marks the last segment written before we issued the
3149	 * synchronize cache.
3150	 */
3151	jseg = bp->bio_caller1;
3152	oldest = NULL;
3153	ACQUIRE_LOCK(&lk);
3154	/*
3155	 * Mark all the journal entries waiting on the synchronize cache
3156	 * as completed so they may continue on.
3157	 */
3158	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3159		jseg->js_state |= COMPLETE;
3160		oldest = jseg;
3161		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3162	}
3163	/*
3164	 * Restart deferred journal entry processing from the oldest
3165	 * completed jseg.
3166	 */
3167	if (oldest)
3168		complete_jsegs(oldest);
3169
3170	FREE_LOCK(&lk);
3171	g_destroy_bio(bp);
3172}
3173
3174/*
3175 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3176 * barriers.  The journal must be written prior to any blocks that depend
3177 * on it and the journal can not be released until the blocks have be
3178 * written.  This code handles both barriers simultaneously.
3179 */
3180static void
3181softdep_synchronize(bp, ump, caller1)
3182	struct bio *bp;
3183	struct ufsmount *ump;
3184	void *caller1;
3185{
3186
3187	bp->bio_cmd = BIO_FLUSH;
3188	bp->bio_flags |= BIO_ORDERED;
3189	bp->bio_data = NULL;
3190	bp->bio_offset = ump->um_cp->provider->mediasize;
3191	bp->bio_length = 0;
3192	bp->bio_done = softdep_synchronize_completed;
3193	bp->bio_caller1 = caller1;
3194	g_io_request(bp,
3195	    (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3196}
3197
3198/*
3199 * Flush some journal records to disk.
3200 */
3201static void
3202softdep_process_journal(mp, needwk, flags)
3203	struct mount *mp;
3204	struct worklist *needwk;
3205	int flags;
3206{
3207	struct jblocks *jblocks;
3208	struct ufsmount *ump;
3209	struct worklist *wk;
3210	struct jseg *jseg;
3211	struct buf *bp;
3212	struct bio *bio;
3213	uint8_t *data;
3214	struct fs *fs;
3215	int shouldflush;
3216	int segwritten;
3217	int jrecmin;	/* Minimum records per block. */
3218	int jrecmax;	/* Maximum records per block. */
3219	int size;
3220	int cnt;
3221	int off;
3222	int devbsize;
3223
3224	if (MOUNTEDSUJ(mp) == 0)
3225		return;
3226	shouldflush = softdep_flushcache;
3227	bio = NULL;
3228	jseg = NULL;
3229	ump = VFSTOUFS(mp);
3230	fs = ump->um_fs;
3231	jblocks = ump->softdep_jblocks;
3232	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3233	/*
3234	 * We write anywhere between a disk block and fs block.  The upper
3235	 * bound is picked to prevent buffer cache fragmentation and limit
3236	 * processing time per I/O.
3237	 */
3238	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3239	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3240	segwritten = 0;
3241	for (;;) {
3242		cnt = ump->softdep_on_journal;
3243		/*
3244		 * Criteria for writing a segment:
3245		 * 1) We have a full block.
3246		 * 2) We're called from jwait() and haven't found the
3247		 *    journal item yet.
3248		 * 3) Always write if needseg is set.
3249		 * 4) If we are called from process_worklist and have
3250		 *    not yet written anything we write a partial block
3251		 *    to enforce a 1 second maximum latency on journal
3252		 *    entries.
3253		 */
3254		if (cnt < (jrecmax - 1) && needwk == NULL &&
3255		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3256			break;
3257		cnt++;
3258		/*
3259		 * Verify some free journal space.  softdep_prealloc() should
3260	 	 * guarantee that we don't run out so this is indicative of
3261		 * a problem with the flow control.  Try to recover
3262		 * gracefully in any event.
3263		 */
3264		while (jblocks->jb_free == 0) {
3265			if (flags != MNT_WAIT)
3266				break;
3267			printf("softdep: Out of journal space!\n");
3268			softdep_speedup();
3269			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
3270		}
3271		FREE_LOCK(&lk);
3272		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3273		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3274		LIST_INIT(&jseg->js_entries);
3275		LIST_INIT(&jseg->js_indirs);
3276		jseg->js_state = ATTACHED;
3277		if (shouldflush == 0)
3278			jseg->js_state |= COMPLETE;
3279		else if (bio == NULL)
3280			bio = g_alloc_bio();
3281		jseg->js_jblocks = jblocks;
3282		bp = geteblk(fs->fs_bsize, 0);
3283		ACQUIRE_LOCK(&lk);
3284		/*
3285		 * If there was a race while we were allocating the block
3286		 * and jseg the entry we care about was likely written.
3287		 * We bail out in both the WAIT and NOWAIT case and assume
3288		 * the caller will loop if the entry it cares about is
3289		 * not written.
3290		 */
3291		cnt = ump->softdep_on_journal;
3292		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3293			bp->b_flags |= B_INVAL | B_NOCACHE;
3294			WORKITEM_FREE(jseg, D_JSEG);
3295			FREE_LOCK(&lk);
3296			brelse(bp);
3297			ACQUIRE_LOCK(&lk);
3298			break;
3299		}
3300		/*
3301		 * Calculate the disk block size required for the available
3302		 * records rounded to the min size.
3303		 */
3304		if (cnt == 0)
3305			size = devbsize;
3306		else if (cnt < jrecmax)
3307			size = howmany(cnt, jrecmin) * devbsize;
3308		else
3309			size = fs->fs_bsize;
3310		/*
3311		 * Allocate a disk block for this journal data and account
3312		 * for truncation of the requested size if enough contiguous
3313		 * space was not available.
3314		 */
3315		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3316		bp->b_lblkno = bp->b_blkno;
3317		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3318		bp->b_bcount = size;
3319		bp->b_flags &= ~B_INVAL;
3320		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3321		/*
3322		 * Initialize our jseg with cnt records.  Assign the next
3323		 * sequence number to it and link it in-order.
3324		 */
3325		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3326		jseg->js_buf = bp;
3327		jseg->js_cnt = cnt;
3328		jseg->js_refs = cnt + 1;	/* Self ref. */
3329		jseg->js_size = size;
3330		jseg->js_seq = jblocks->jb_nextseq++;
3331		if (jblocks->jb_oldestseg == NULL)
3332			jblocks->jb_oldestseg = jseg;
3333		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3334		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3335		if (jblocks->jb_writeseg == NULL)
3336			jblocks->jb_writeseg = jseg;
3337		/*
3338		 * Start filling in records from the pending list.
3339		 */
3340		data = bp->b_data;
3341		off = 0;
3342		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3343		    != NULL) {
3344			if (cnt == 0)
3345				break;
3346			/* Place a segment header on every device block. */
3347			if ((off % devbsize) == 0) {
3348				jseg_write(ump, jseg, data);
3349				off += JREC_SIZE;
3350				data = bp->b_data + off;
3351			}
3352			if (wk == needwk)
3353				needwk = NULL;
3354			remove_from_journal(wk);
3355			wk->wk_state |= INPROGRESS;
3356			WORKLIST_INSERT(&jseg->js_entries, wk);
3357			switch (wk->wk_type) {
3358			case D_JADDREF:
3359				jaddref_write(WK_JADDREF(wk), jseg, data);
3360				break;
3361			case D_JREMREF:
3362				jremref_write(WK_JREMREF(wk), jseg, data);
3363				break;
3364			case D_JMVREF:
3365				jmvref_write(WK_JMVREF(wk), jseg, data);
3366				break;
3367			case D_JNEWBLK:
3368				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3369				break;
3370			case D_JFREEBLK:
3371				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3372				break;
3373			case D_JFREEFRAG:
3374				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3375				break;
3376			case D_JTRUNC:
3377				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3378				break;
3379			case D_JFSYNC:
3380				jfsync_write(WK_JFSYNC(wk), jseg, data);
3381				break;
3382			default:
3383				panic("process_journal: Unknown type %s",
3384				    TYPENAME(wk->wk_type));
3385				/* NOTREACHED */
3386			}
3387			off += JREC_SIZE;
3388			data = bp->b_data + off;
3389			cnt--;
3390		}
3391		/*
3392		 * Write this one buffer and continue.
3393		 */
3394		segwritten = 1;
3395		jblocks->jb_needseg = 0;
3396		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3397		FREE_LOCK(&lk);
3398		pbgetvp(ump->um_devvp, bp);
3399		/*
3400		 * We only do the blocking wait once we find the journal
3401		 * entry we're looking for.
3402		 */
3403		if (needwk == NULL && flags == MNT_WAIT)
3404			bwrite(bp);
3405		else
3406			bawrite(bp);
3407		ACQUIRE_LOCK(&lk);
3408	}
3409	/*
3410	 * If we wrote a segment issue a synchronize cache so the journal
3411	 * is reflected on disk before the data is written.  Since reclaiming
3412	 * journal space also requires writing a journal record this
3413	 * process also enforces a barrier before reclamation.
3414	 */
3415	if (segwritten && shouldflush) {
3416		softdep_synchronize(bio, ump,
3417		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3418	} else if (bio)
3419		g_destroy_bio(bio);
3420	/*
3421	 * If we've suspended the filesystem because we ran out of journal
3422	 * space either try to sync it here to make some progress or
3423	 * unsuspend it if we already have.
3424	 */
3425	if (flags == 0 && jblocks->jb_suspended) {
3426		if (journal_unsuspend(ump))
3427			return;
3428		FREE_LOCK(&lk);
3429		VFS_SYNC(mp, MNT_NOWAIT);
3430		ffs_sbupdate(ump, MNT_WAIT, 0);
3431		ACQUIRE_LOCK(&lk);
3432	}
3433}
3434
3435/*
3436 * Complete a jseg, allowing all dependencies awaiting journal writes
3437 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3438 * structures so that the journal segment can be freed to reclaim space.
3439 */
3440static void
3441complete_jseg(jseg)
3442	struct jseg *jseg;
3443{
3444	struct worklist *wk;
3445	struct jmvref *jmvref;
3446	int waiting;
3447#ifdef INVARIANTS
3448	int i = 0;
3449#endif
3450
3451	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3452		WORKLIST_REMOVE(wk);
3453		waiting = wk->wk_state & IOWAITING;
3454		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3455		wk->wk_state |= COMPLETE;
3456		KASSERT(i++ < jseg->js_cnt,
3457		    ("handle_written_jseg: overflow %d >= %d",
3458		    i - 1, jseg->js_cnt));
3459		switch (wk->wk_type) {
3460		case D_JADDREF:
3461			handle_written_jaddref(WK_JADDREF(wk));
3462			break;
3463		case D_JREMREF:
3464			handle_written_jremref(WK_JREMREF(wk));
3465			break;
3466		case D_JMVREF:
3467			rele_jseg(jseg);	/* No jsegdep. */
3468			jmvref = WK_JMVREF(wk);
3469			LIST_REMOVE(jmvref, jm_deps);
3470			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3471				free_pagedep(jmvref->jm_pagedep);
3472			WORKITEM_FREE(jmvref, D_JMVREF);
3473			break;
3474		case D_JNEWBLK:
3475			handle_written_jnewblk(WK_JNEWBLK(wk));
3476			break;
3477		case D_JFREEBLK:
3478			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3479			break;
3480		case D_JTRUNC:
3481			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3482			break;
3483		case D_JFSYNC:
3484			rele_jseg(jseg);	/* No jsegdep. */
3485			WORKITEM_FREE(wk, D_JFSYNC);
3486			break;
3487		case D_JFREEFRAG:
3488			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3489			break;
3490		default:
3491			panic("handle_written_jseg: Unknown type %s",
3492			    TYPENAME(wk->wk_type));
3493			/* NOTREACHED */
3494		}
3495		if (waiting)
3496			wakeup(wk);
3497	}
3498	/* Release the self reference so the structure may be freed. */
3499	rele_jseg(jseg);
3500}
3501
3502/*
3503 * Determine which jsegs are ready for completion processing.  Waits for
3504 * synchronize cache to complete as well as forcing in-order completion
3505 * of journal entries.
3506 */
3507static void
3508complete_jsegs(jseg)
3509	struct jseg *jseg;
3510{
3511	struct jblocks *jblocks;
3512	struct jseg *jsegn;
3513
3514	jblocks = jseg->js_jblocks;
3515	/*
3516	 * Don't allow out of order completions.  If this isn't the first
3517	 * block wait for it to write before we're done.
3518	 */
3519	if (jseg != jblocks->jb_writeseg)
3520		return;
3521	/* Iterate through available jsegs processing their entries. */
3522	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3523		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3524		jsegn = TAILQ_NEXT(jseg, js_next);
3525		complete_jseg(jseg);
3526		jseg = jsegn;
3527	}
3528	jblocks->jb_writeseg = jseg;
3529	/*
3530	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3531	 */
3532	free_jsegs(jblocks);
3533}
3534
3535/*
3536 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3537 * the final completions.
3538 */
3539static void
3540handle_written_jseg(jseg, bp)
3541	struct jseg *jseg;
3542	struct buf *bp;
3543{
3544
3545	if (jseg->js_refs == 0)
3546		panic("handle_written_jseg: No self-reference on %p", jseg);
3547	jseg->js_state |= DEPCOMPLETE;
3548	/*
3549	 * We'll never need this buffer again, set flags so it will be
3550	 * discarded.
3551	 */
3552	bp->b_flags |= B_INVAL | B_NOCACHE;
3553	pbrelvp(bp);
3554	complete_jsegs(jseg);
3555}
3556
3557static inline struct jsegdep *
3558inoref_jseg(inoref)
3559	struct inoref *inoref;
3560{
3561	struct jsegdep *jsegdep;
3562
3563	jsegdep = inoref->if_jsegdep;
3564	inoref->if_jsegdep = NULL;
3565
3566	return (jsegdep);
3567}
3568
3569/*
3570 * Called once a jremref has made it to stable store.  The jremref is marked
3571 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3572 * for the jremref to complete will be awoken by free_jremref.
3573 */
3574static void
3575handle_written_jremref(jremref)
3576	struct jremref *jremref;
3577{
3578	struct inodedep *inodedep;
3579	struct jsegdep *jsegdep;
3580	struct dirrem *dirrem;
3581
3582	/* Grab the jsegdep. */
3583	jsegdep = inoref_jseg(&jremref->jr_ref);
3584	/*
3585	 * Remove us from the inoref list.
3586	 */
3587	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3588	    0, &inodedep) == 0)
3589		panic("handle_written_jremref: Lost inodedep");
3590	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3591	/*
3592	 * Complete the dirrem.
3593	 */
3594	dirrem = jremref->jr_dirrem;
3595	jremref->jr_dirrem = NULL;
3596	LIST_REMOVE(jremref, jr_deps);
3597	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3598	jwork_insert(&dirrem->dm_jwork, jsegdep);
3599	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3600	    (dirrem->dm_state & COMPLETE) != 0)
3601		add_to_worklist(&dirrem->dm_list, 0);
3602	free_jremref(jremref);
3603}
3604
3605/*
3606 * Called once a jaddref has made it to stable store.  The dependency is
3607 * marked complete and any dependent structures are added to the inode
3608 * bufwait list to be completed as soon as it is written.  If a bitmap write
3609 * depends on this entry we move the inode into the inodedephd of the
3610 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3611 */
3612static void
3613handle_written_jaddref(jaddref)
3614	struct jaddref *jaddref;
3615{
3616	struct jsegdep *jsegdep;
3617	struct inodedep *inodedep;
3618	struct diradd *diradd;
3619	struct mkdir *mkdir;
3620
3621	/* Grab the jsegdep. */
3622	jsegdep = inoref_jseg(&jaddref->ja_ref);
3623	mkdir = NULL;
3624	diradd = NULL;
3625	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3626	    0, &inodedep) == 0)
3627		panic("handle_written_jaddref: Lost inodedep.");
3628	if (jaddref->ja_diradd == NULL)
3629		panic("handle_written_jaddref: No dependency");
3630	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3631		diradd = jaddref->ja_diradd;
3632		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3633	} else if (jaddref->ja_state & MKDIR_PARENT) {
3634		mkdir = jaddref->ja_mkdir;
3635		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3636	} else if (jaddref->ja_state & MKDIR_BODY)
3637		mkdir = jaddref->ja_mkdir;
3638	else
3639		panic("handle_written_jaddref: Unknown dependency %p",
3640		    jaddref->ja_diradd);
3641	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3642	/*
3643	 * Remove us from the inode list.
3644	 */
3645	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3646	/*
3647	 * The mkdir may be waiting on the jaddref to clear before freeing.
3648	 */
3649	if (mkdir) {
3650		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3651		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3652		    TYPENAME(mkdir->md_list.wk_type)));
3653		mkdir->md_jaddref = NULL;
3654		diradd = mkdir->md_diradd;
3655		mkdir->md_state |= DEPCOMPLETE;
3656		complete_mkdir(mkdir);
3657	}
3658	jwork_insert(&diradd->da_jwork, jsegdep);
3659	if (jaddref->ja_state & NEWBLOCK) {
3660		inodedep->id_state |= ONDEPLIST;
3661		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3662		    inodedep, id_deps);
3663	}
3664	free_jaddref(jaddref);
3665}
3666
3667/*
3668 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3669 * is placed in the bmsafemap to await notification of a written bitmap.  If
3670 * the operation was canceled we add the segdep to the appropriate
3671 * dependency to free the journal space once the canceling operation
3672 * completes.
3673 */
3674static void
3675handle_written_jnewblk(jnewblk)
3676	struct jnewblk *jnewblk;
3677{
3678	struct bmsafemap *bmsafemap;
3679	struct freefrag *freefrag;
3680	struct freework *freework;
3681	struct jsegdep *jsegdep;
3682	struct newblk *newblk;
3683
3684	/* Grab the jsegdep. */
3685	jsegdep = jnewblk->jn_jsegdep;
3686	jnewblk->jn_jsegdep = NULL;
3687	if (jnewblk->jn_dep == NULL)
3688		panic("handle_written_jnewblk: No dependency for the segdep.");
3689	switch (jnewblk->jn_dep->wk_type) {
3690	case D_NEWBLK:
3691	case D_ALLOCDIRECT:
3692	case D_ALLOCINDIR:
3693		/*
3694		 * Add the written block to the bmsafemap so it can
3695		 * be notified when the bitmap is on disk.
3696		 */
3697		newblk = WK_NEWBLK(jnewblk->jn_dep);
3698		newblk->nb_jnewblk = NULL;
3699		if ((newblk->nb_state & GOINGAWAY) == 0) {
3700			bmsafemap = newblk->nb_bmsafemap;
3701			newblk->nb_state |= ONDEPLIST;
3702			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3703			    nb_deps);
3704		}
3705		jwork_insert(&newblk->nb_jwork, jsegdep);
3706		break;
3707	case D_FREEFRAG:
3708		/*
3709		 * A newblock being removed by a freefrag when replaced by
3710		 * frag extension.
3711		 */
3712		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3713		freefrag->ff_jdep = NULL;
3714		jwork_insert(&freefrag->ff_jwork, jsegdep);
3715		break;
3716	case D_FREEWORK:
3717		/*
3718		 * A direct block was removed by truncate.
3719		 */
3720		freework = WK_FREEWORK(jnewblk->jn_dep);
3721		freework->fw_jnewblk = NULL;
3722		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3723		break;
3724	default:
3725		panic("handle_written_jnewblk: Unknown type %d.",
3726		    jnewblk->jn_dep->wk_type);
3727	}
3728	jnewblk->jn_dep = NULL;
3729	free_jnewblk(jnewblk);
3730}
3731
3732/*
3733 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3734 * an in-flight allocation that has not yet been committed.  Divorce us
3735 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3736 * to the worklist.
3737 */
3738static void
3739cancel_jfreefrag(jfreefrag)
3740	struct jfreefrag *jfreefrag;
3741{
3742	struct freefrag *freefrag;
3743
3744	if (jfreefrag->fr_jsegdep) {
3745		free_jsegdep(jfreefrag->fr_jsegdep);
3746		jfreefrag->fr_jsegdep = NULL;
3747	}
3748	freefrag = jfreefrag->fr_freefrag;
3749	jfreefrag->fr_freefrag = NULL;
3750	free_jfreefrag(jfreefrag);
3751	freefrag->ff_state |= DEPCOMPLETE;
3752	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3753}
3754
3755/*
3756 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3757 */
3758static void
3759free_jfreefrag(jfreefrag)
3760	struct jfreefrag *jfreefrag;
3761{
3762
3763	if (jfreefrag->fr_state & INPROGRESS)
3764		WORKLIST_REMOVE(&jfreefrag->fr_list);
3765	else if (jfreefrag->fr_state & ONWORKLIST)
3766		remove_from_journal(&jfreefrag->fr_list);
3767	if (jfreefrag->fr_freefrag != NULL)
3768		panic("free_jfreefrag:  Still attached to a freefrag.");
3769	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3770}
3771
3772/*
3773 * Called when the journal write for a jfreefrag completes.  The parent
3774 * freefrag is added to the worklist if this completes its dependencies.
3775 */
3776static void
3777handle_written_jfreefrag(jfreefrag)
3778	struct jfreefrag *jfreefrag;
3779{
3780	struct jsegdep *jsegdep;
3781	struct freefrag *freefrag;
3782
3783	/* Grab the jsegdep. */
3784	jsegdep = jfreefrag->fr_jsegdep;
3785	jfreefrag->fr_jsegdep = NULL;
3786	freefrag = jfreefrag->fr_freefrag;
3787	if (freefrag == NULL)
3788		panic("handle_written_jfreefrag: No freefrag.");
3789	freefrag->ff_state |= DEPCOMPLETE;
3790	freefrag->ff_jdep = NULL;
3791	jwork_insert(&freefrag->ff_jwork, jsegdep);
3792	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3793		add_to_worklist(&freefrag->ff_list, 0);
3794	jfreefrag->fr_freefrag = NULL;
3795	free_jfreefrag(jfreefrag);
3796}
3797
3798/*
3799 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3800 * is removed from the freeblks list of pending journal writes and the
3801 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3802 * have been reclaimed.
3803 */
3804static void
3805handle_written_jblkdep(jblkdep)
3806	struct jblkdep *jblkdep;
3807{
3808	struct freeblks *freeblks;
3809	struct jsegdep *jsegdep;
3810
3811	/* Grab the jsegdep. */
3812	jsegdep = jblkdep->jb_jsegdep;
3813	jblkdep->jb_jsegdep = NULL;
3814	freeblks = jblkdep->jb_freeblks;
3815	LIST_REMOVE(jblkdep, jb_deps);
3816	jwork_insert(&freeblks->fb_jwork, jsegdep);
3817	/*
3818	 * If the freeblks is all journaled, we can add it to the worklist.
3819	 */
3820	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3821	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3822		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3823
3824	free_jblkdep(jblkdep);
3825}
3826
3827static struct jsegdep *
3828newjsegdep(struct worklist *wk)
3829{
3830	struct jsegdep *jsegdep;
3831
3832	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3833	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3834	jsegdep->jd_seg = NULL;
3835
3836	return (jsegdep);
3837}
3838
3839static struct jmvref *
3840newjmvref(dp, ino, oldoff, newoff)
3841	struct inode *dp;
3842	ino_t ino;
3843	off_t oldoff;
3844	off_t newoff;
3845{
3846	struct jmvref *jmvref;
3847
3848	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3849	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3850	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3851	jmvref->jm_parent = dp->i_number;
3852	jmvref->jm_ino = ino;
3853	jmvref->jm_oldoff = oldoff;
3854	jmvref->jm_newoff = newoff;
3855
3856	return (jmvref);
3857}
3858
3859/*
3860 * Allocate a new jremref that tracks the removal of ip from dp with the
3861 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3862 * DEPCOMPLETE as we have all the information required for the journal write
3863 * and the directory has already been removed from the buffer.  The caller
3864 * is responsible for linking the jremref into the pagedep and adding it
3865 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3866 * a DOTDOT addition so handle_workitem_remove() can properly assign
3867 * the jsegdep when we're done.
3868 */
3869static struct jremref *
3870newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3871    off_t diroff, nlink_t nlink)
3872{
3873	struct jremref *jremref;
3874
3875	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3876	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3877	jremref->jr_state = ATTACHED;
3878	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3879	   nlink, ip->i_mode);
3880	jremref->jr_dirrem = dirrem;
3881
3882	return (jremref);
3883}
3884
3885static inline void
3886newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3887    nlink_t nlink, uint16_t mode)
3888{
3889
3890	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3891	inoref->if_diroff = diroff;
3892	inoref->if_ino = ino;
3893	inoref->if_parent = parent;
3894	inoref->if_nlink = nlink;
3895	inoref->if_mode = mode;
3896}
3897
3898/*
3899 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3900 * directory offset may not be known until later.  The caller is responsible
3901 * adding the entry to the journal when this information is available.  nlink
3902 * should be the link count prior to the addition and mode is only required
3903 * to have the correct FMT.
3904 */
3905static struct jaddref *
3906newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3907    uint16_t mode)
3908{
3909	struct jaddref *jaddref;
3910
3911	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3912	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3913	jaddref->ja_state = ATTACHED;
3914	jaddref->ja_mkdir = NULL;
3915	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3916
3917	return (jaddref);
3918}
3919
3920/*
3921 * Create a new free dependency for a freework.  The caller is responsible
3922 * for adjusting the reference count when it has the lock held.  The freedep
3923 * will track an outstanding bitmap write that will ultimately clear the
3924 * freework to continue.
3925 */
3926static struct freedep *
3927newfreedep(struct freework *freework)
3928{
3929	struct freedep *freedep;
3930
3931	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3932	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3933	freedep->fd_freework = freework;
3934
3935	return (freedep);
3936}
3937
3938/*
3939 * Free a freedep structure once the buffer it is linked to is written.  If
3940 * this is the last reference to the freework schedule it for completion.
3941 */
3942static void
3943free_freedep(freedep)
3944	struct freedep *freedep;
3945{
3946	struct freework *freework;
3947
3948	freework = freedep->fd_freework;
3949	freework->fw_freeblks->fb_cgwait--;
3950	if (--freework->fw_ref == 0)
3951		freework_enqueue(freework);
3952	WORKITEM_FREE(freedep, D_FREEDEP);
3953}
3954
3955/*
3956 * Allocate a new freework structure that may be a level in an indirect
3957 * when parent is not NULL or a top level block when it is.  The top level
3958 * freework structures are allocated without lk held and before the freeblks
3959 * is visible outside of softdep_setup_freeblocks().
3960 */
3961static struct freework *
3962newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
3963	struct ufsmount *ump;
3964	struct freeblks *freeblks;
3965	struct freework *parent;
3966	ufs_lbn_t lbn;
3967	ufs2_daddr_t nb;
3968	int frags;
3969	int off;
3970	int journal;
3971{
3972	struct freework *freework;
3973
3974	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3975	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3976	freework->fw_state = ATTACHED;
3977	freework->fw_jnewblk = NULL;
3978	freework->fw_freeblks = freeblks;
3979	freework->fw_parent = parent;
3980	freework->fw_lbn = lbn;
3981	freework->fw_blkno = nb;
3982	freework->fw_frags = frags;
3983	freework->fw_indir = NULL;
3984	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
3985		? 0 : NINDIR(ump->um_fs) + 1;
3986	freework->fw_start = freework->fw_off = off;
3987	if (journal)
3988		newjfreeblk(freeblks, lbn, nb, frags);
3989	if (parent == NULL) {
3990		ACQUIRE_LOCK(&lk);
3991		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
3992		freeblks->fb_ref++;
3993		FREE_LOCK(&lk);
3994	}
3995
3996	return (freework);
3997}
3998
3999/*
4000 * Eliminate a jfreeblk for a block that does not need journaling.
4001 */
4002static void
4003cancel_jfreeblk(freeblks, blkno)
4004	struct freeblks *freeblks;
4005	ufs2_daddr_t blkno;
4006{
4007	struct jfreeblk *jfreeblk;
4008	struct jblkdep *jblkdep;
4009
4010	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4011		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4012			continue;
4013		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4014		if (jfreeblk->jf_blkno == blkno)
4015			break;
4016	}
4017	if (jblkdep == NULL)
4018		return;
4019	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4020	free_jsegdep(jblkdep->jb_jsegdep);
4021	LIST_REMOVE(jblkdep, jb_deps);
4022	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4023}
4024
4025/*
4026 * Allocate a new jfreeblk to journal top level block pointer when truncating
4027 * a file.  The caller must add this to the worklist when lk is held.
4028 */
4029static struct jfreeblk *
4030newjfreeblk(freeblks, lbn, blkno, frags)
4031	struct freeblks *freeblks;
4032	ufs_lbn_t lbn;
4033	ufs2_daddr_t blkno;
4034	int frags;
4035{
4036	struct jfreeblk *jfreeblk;
4037
4038	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4039	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4040	    freeblks->fb_list.wk_mp);
4041	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4042	jfreeblk->jf_dep.jb_freeblks = freeblks;
4043	jfreeblk->jf_ino = freeblks->fb_inum;
4044	jfreeblk->jf_lbn = lbn;
4045	jfreeblk->jf_blkno = blkno;
4046	jfreeblk->jf_frags = frags;
4047	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4048
4049	return (jfreeblk);
4050}
4051
4052/*
4053 * Allocate a new jtrunc to track a partial truncation.
4054 */
4055static struct jtrunc *
4056newjtrunc(freeblks, size, extsize)
4057	struct freeblks *freeblks;
4058	off_t size;
4059	int extsize;
4060{
4061	struct jtrunc *jtrunc;
4062
4063	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4064	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4065	    freeblks->fb_list.wk_mp);
4066	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4067	jtrunc->jt_dep.jb_freeblks = freeblks;
4068	jtrunc->jt_ino = freeblks->fb_inum;
4069	jtrunc->jt_size = size;
4070	jtrunc->jt_extsize = extsize;
4071	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4072
4073	return (jtrunc);
4074}
4075
4076/*
4077 * If we're canceling a new bitmap we have to search for another ref
4078 * to move into the bmsafemap dep.  This might be better expressed
4079 * with another structure.
4080 */
4081static void
4082move_newblock_dep(jaddref, inodedep)
4083	struct jaddref *jaddref;
4084	struct inodedep *inodedep;
4085{
4086	struct inoref *inoref;
4087	struct jaddref *jaddrefn;
4088
4089	jaddrefn = NULL;
4090	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4091	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4092		if ((jaddref->ja_state & NEWBLOCK) &&
4093		    inoref->if_list.wk_type == D_JADDREF) {
4094			jaddrefn = (struct jaddref *)inoref;
4095			break;
4096		}
4097	}
4098	if (jaddrefn == NULL)
4099		return;
4100	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4101	jaddrefn->ja_state |= jaddref->ja_state &
4102	    (ATTACHED | UNDONE | NEWBLOCK);
4103	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4104	jaddref->ja_state |= ATTACHED;
4105	LIST_REMOVE(jaddref, ja_bmdeps);
4106	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4107	    ja_bmdeps);
4108}
4109
4110/*
4111 * Cancel a jaddref either before it has been written or while it is being
4112 * written.  This happens when a link is removed before the add reaches
4113 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4114 * and inode to prevent the link count or bitmap from reaching the disk
4115 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4116 * required.
4117 *
4118 * Returns 1 if the canceled addref requires journaling of the remove and
4119 * 0 otherwise.
4120 */
4121static int
4122cancel_jaddref(jaddref, inodedep, wkhd)
4123	struct jaddref *jaddref;
4124	struct inodedep *inodedep;
4125	struct workhead *wkhd;
4126{
4127	struct inoref *inoref;
4128	struct jsegdep *jsegdep;
4129	int needsj;
4130
4131	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4132	    ("cancel_jaddref: Canceling complete jaddref"));
4133	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4134		needsj = 1;
4135	else
4136		needsj = 0;
4137	if (inodedep == NULL)
4138		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4139		    0, &inodedep) == 0)
4140			panic("cancel_jaddref: Lost inodedep");
4141	/*
4142	 * We must adjust the nlink of any reference operation that follows
4143	 * us so that it is consistent with the in-memory reference.  This
4144	 * ensures that inode nlink rollbacks always have the correct link.
4145	 */
4146	if (needsj == 0) {
4147		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4148		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4149			if (inoref->if_state & GOINGAWAY)
4150				break;
4151			inoref->if_nlink--;
4152		}
4153	}
4154	jsegdep = inoref_jseg(&jaddref->ja_ref);
4155	if (jaddref->ja_state & NEWBLOCK)
4156		move_newblock_dep(jaddref, inodedep);
4157	wake_worklist(&jaddref->ja_list);
4158	jaddref->ja_mkdir = NULL;
4159	if (jaddref->ja_state & INPROGRESS) {
4160		jaddref->ja_state &= ~INPROGRESS;
4161		WORKLIST_REMOVE(&jaddref->ja_list);
4162		jwork_insert(wkhd, jsegdep);
4163	} else {
4164		free_jsegdep(jsegdep);
4165		if (jaddref->ja_state & DEPCOMPLETE)
4166			remove_from_journal(&jaddref->ja_list);
4167	}
4168	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4169	/*
4170	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4171	 * can arrange for them to be freed with the bitmap.  Otherwise we
4172	 * no longer need this addref attached to the inoreflst and it
4173	 * will incorrectly adjust nlink if we leave it.
4174	 */
4175	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4176		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4177		    if_deps);
4178		jaddref->ja_state |= COMPLETE;
4179		free_jaddref(jaddref);
4180		return (needsj);
4181	}
4182	/*
4183	 * Leave the head of the list for jsegdeps for fast merging.
4184	 */
4185	if (LIST_FIRST(wkhd) != NULL) {
4186		jaddref->ja_state |= ONWORKLIST;
4187		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4188	} else
4189		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4190
4191	return (needsj);
4192}
4193
4194/*
4195 * Attempt to free a jaddref structure when some work completes.  This
4196 * should only succeed once the entry is written and all dependencies have
4197 * been notified.
4198 */
4199static void
4200free_jaddref(jaddref)
4201	struct jaddref *jaddref;
4202{
4203
4204	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4205		return;
4206	if (jaddref->ja_ref.if_jsegdep)
4207		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4208		    jaddref, jaddref->ja_state);
4209	if (jaddref->ja_state & NEWBLOCK)
4210		LIST_REMOVE(jaddref, ja_bmdeps);
4211	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4212		panic("free_jaddref: Bad state %p(0x%X)",
4213		    jaddref, jaddref->ja_state);
4214	if (jaddref->ja_mkdir != NULL)
4215		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4216	WORKITEM_FREE(jaddref, D_JADDREF);
4217}
4218
4219/*
4220 * Free a jremref structure once it has been written or discarded.
4221 */
4222static void
4223free_jremref(jremref)
4224	struct jremref *jremref;
4225{
4226
4227	if (jremref->jr_ref.if_jsegdep)
4228		free_jsegdep(jremref->jr_ref.if_jsegdep);
4229	if (jremref->jr_state & INPROGRESS)
4230		panic("free_jremref: IO still pending");
4231	WORKITEM_FREE(jremref, D_JREMREF);
4232}
4233
4234/*
4235 * Free a jnewblk structure.
4236 */
4237static void
4238free_jnewblk(jnewblk)
4239	struct jnewblk *jnewblk;
4240{
4241
4242	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4243		return;
4244	LIST_REMOVE(jnewblk, jn_deps);
4245	if (jnewblk->jn_dep != NULL)
4246		panic("free_jnewblk: Dependency still attached.");
4247	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4248}
4249
4250/*
4251 * Cancel a jnewblk which has been been made redundant by frag extension.
4252 */
4253static void
4254cancel_jnewblk(jnewblk, wkhd)
4255	struct jnewblk *jnewblk;
4256	struct workhead *wkhd;
4257{
4258	struct jsegdep *jsegdep;
4259
4260	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4261	jsegdep = jnewblk->jn_jsegdep;
4262	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4263		panic("cancel_jnewblk: Invalid state");
4264	jnewblk->jn_jsegdep  = NULL;
4265	jnewblk->jn_dep = NULL;
4266	jnewblk->jn_state |= GOINGAWAY;
4267	if (jnewblk->jn_state & INPROGRESS) {
4268		jnewblk->jn_state &= ~INPROGRESS;
4269		WORKLIST_REMOVE(&jnewblk->jn_list);
4270		jwork_insert(wkhd, jsegdep);
4271	} else {
4272		free_jsegdep(jsegdep);
4273		remove_from_journal(&jnewblk->jn_list);
4274	}
4275	wake_worklist(&jnewblk->jn_list);
4276	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4277}
4278
4279static void
4280free_jblkdep(jblkdep)
4281	struct jblkdep *jblkdep;
4282{
4283
4284	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4285		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4286	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4287		WORKITEM_FREE(jblkdep, D_JTRUNC);
4288	else
4289		panic("free_jblkdep: Unexpected type %s",
4290		    TYPENAME(jblkdep->jb_list.wk_type));
4291}
4292
4293/*
4294 * Free a single jseg once it is no longer referenced in memory or on
4295 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4296 * to disappear.
4297 */
4298static void
4299free_jseg(jseg, jblocks)
4300	struct jseg *jseg;
4301	struct jblocks *jblocks;
4302{
4303	struct freework *freework;
4304
4305	/*
4306	 * Free freework structures that were lingering to indicate freed
4307	 * indirect blocks that forced journal write ordering on reallocate.
4308	 */
4309	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4310		indirblk_remove(freework);
4311	if (jblocks->jb_oldestseg == jseg)
4312		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4313	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4314	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4315	KASSERT(LIST_EMPTY(&jseg->js_entries),
4316	    ("free_jseg: Freed jseg has valid entries."));
4317	WORKITEM_FREE(jseg, D_JSEG);
4318}
4319
4320/*
4321 * Free all jsegs that meet the criteria for being reclaimed and update
4322 * oldestseg.
4323 */
4324static void
4325free_jsegs(jblocks)
4326	struct jblocks *jblocks;
4327{
4328	struct jseg *jseg;
4329
4330	/*
4331	 * Free only those jsegs which have none allocated before them to
4332	 * preserve the journal space ordering.
4333	 */
4334	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4335		/*
4336		 * Only reclaim space when nothing depends on this journal
4337		 * set and another set has written that it is no longer
4338		 * valid.
4339		 */
4340		if (jseg->js_refs != 0) {
4341			jblocks->jb_oldestseg = jseg;
4342			return;
4343		}
4344		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4345			break;
4346		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4347			break;
4348		/*
4349		 * We can free jsegs that didn't write entries when
4350		 * oldestwrseq == js_seq.
4351		 */
4352		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4353		    jseg->js_cnt != 0)
4354			break;
4355		free_jseg(jseg, jblocks);
4356	}
4357	/*
4358	 * If we exited the loop above we still must discover the
4359	 * oldest valid segment.
4360	 */
4361	if (jseg)
4362		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4363		     jseg = TAILQ_NEXT(jseg, js_next))
4364			if (jseg->js_refs != 0)
4365				break;
4366	jblocks->jb_oldestseg = jseg;
4367	/*
4368	 * The journal has no valid records but some jsegs may still be
4369	 * waiting on oldestwrseq to advance.  We force a small record
4370	 * out to permit these lingering records to be reclaimed.
4371	 */
4372	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4373		jblocks->jb_needseg = 1;
4374}
4375
4376/*
4377 * Release one reference to a jseg and free it if the count reaches 0.  This
4378 * should eventually reclaim journal space as well.
4379 */
4380static void
4381rele_jseg(jseg)
4382	struct jseg *jseg;
4383{
4384
4385	KASSERT(jseg->js_refs > 0,
4386	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4387	if (--jseg->js_refs != 0)
4388		return;
4389	free_jsegs(jseg->js_jblocks);
4390}
4391
4392/*
4393 * Release a jsegdep and decrement the jseg count.
4394 */
4395static void
4396free_jsegdep(jsegdep)
4397	struct jsegdep *jsegdep;
4398{
4399
4400	if (jsegdep->jd_seg)
4401		rele_jseg(jsegdep->jd_seg);
4402	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4403}
4404
4405/*
4406 * Wait for a journal item to make it to disk.  Initiate journal processing
4407 * if required.
4408 */
4409static int
4410jwait(wk, waitfor)
4411	struct worklist *wk;
4412	int waitfor;
4413{
4414
4415	/*
4416	 * Blocking journal waits cause slow synchronous behavior.  Record
4417	 * stats on the frequency of these blocking operations.
4418	 */
4419	if (waitfor == MNT_WAIT) {
4420		stat_journal_wait++;
4421		switch (wk->wk_type) {
4422		case D_JREMREF:
4423		case D_JMVREF:
4424			stat_jwait_filepage++;
4425			break;
4426		case D_JTRUNC:
4427		case D_JFREEBLK:
4428			stat_jwait_freeblks++;
4429			break;
4430		case D_JNEWBLK:
4431			stat_jwait_newblk++;
4432			break;
4433		case D_JADDREF:
4434			stat_jwait_inode++;
4435			break;
4436		default:
4437			break;
4438		}
4439	}
4440	/*
4441	 * If IO has not started we process the journal.  We can't mark the
4442	 * worklist item as IOWAITING because we drop the lock while
4443	 * processing the journal and the worklist entry may be freed after
4444	 * this point.  The caller may call back in and re-issue the request.
4445	 */
4446	if ((wk->wk_state & INPROGRESS) == 0) {
4447		softdep_process_journal(wk->wk_mp, wk, waitfor);
4448		if (waitfor != MNT_WAIT)
4449			return (EBUSY);
4450		return (0);
4451	}
4452	if (waitfor != MNT_WAIT)
4453		return (EBUSY);
4454	wait_worklist(wk, "jwait");
4455	return (0);
4456}
4457
4458/*
4459 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4460 * appropriate.  This is a convenience function to reduce duplicate code
4461 * for the setup and revert functions below.
4462 */
4463static struct inodedep *
4464inodedep_lookup_ip(ip)
4465	struct inode *ip;
4466{
4467	struct inodedep *inodedep;
4468	int dflags;
4469
4470	KASSERT(ip->i_nlink >= ip->i_effnlink,
4471	    ("inodedep_lookup_ip: bad delta"));
4472	dflags = DEPALLOC;
4473	if (IS_SNAPSHOT(ip))
4474		dflags |= NODELAY;
4475	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4476	    &inodedep);
4477	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4478	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4479
4480	return (inodedep);
4481}
4482
4483/*
4484 * Called prior to creating a new inode and linking it to a directory.  The
4485 * jaddref structure must already be allocated by softdep_setup_inomapdep
4486 * and it is discovered here so we can initialize the mode and update
4487 * nlinkdelta.
4488 */
4489void
4490softdep_setup_create(dp, ip)
4491	struct inode *dp;
4492	struct inode *ip;
4493{
4494	struct inodedep *inodedep;
4495	struct jaddref *jaddref;
4496	struct vnode *dvp;
4497
4498	KASSERT(ip->i_nlink == 1,
4499	    ("softdep_setup_create: Invalid link count."));
4500	dvp = ITOV(dp);
4501	ACQUIRE_LOCK(&lk);
4502	inodedep = inodedep_lookup_ip(ip);
4503	if (DOINGSUJ(dvp)) {
4504		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4505		    inoreflst);
4506		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4507		    ("softdep_setup_create: No addref structure present."));
4508	}
4509	softdep_prelink(dvp, NULL);
4510	FREE_LOCK(&lk);
4511}
4512
4513/*
4514 * Create a jaddref structure to track the addition of a DOTDOT link when
4515 * we are reparenting an inode as part of a rename.  This jaddref will be
4516 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4517 * non-journaling softdep.
4518 */
4519void
4520softdep_setup_dotdot_link(dp, ip)
4521	struct inode *dp;
4522	struct inode *ip;
4523{
4524	struct inodedep *inodedep;
4525	struct jaddref *jaddref;
4526	struct vnode *dvp;
4527	struct vnode *vp;
4528
4529	dvp = ITOV(dp);
4530	vp = ITOV(ip);
4531	jaddref = NULL;
4532	/*
4533	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4534	 * is used as a normal link would be.
4535	 */
4536	if (DOINGSUJ(dvp))
4537		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4538		    dp->i_effnlink - 1, dp->i_mode);
4539	ACQUIRE_LOCK(&lk);
4540	inodedep = inodedep_lookup_ip(dp);
4541	if (jaddref)
4542		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4543		    if_deps);
4544	softdep_prelink(dvp, ITOV(ip));
4545	FREE_LOCK(&lk);
4546}
4547
4548/*
4549 * Create a jaddref structure to track a new link to an inode.  The directory
4550 * offset is not known until softdep_setup_directory_add or
4551 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4552 * softdep.
4553 */
4554void
4555softdep_setup_link(dp, ip)
4556	struct inode *dp;
4557	struct inode *ip;
4558{
4559	struct inodedep *inodedep;
4560	struct jaddref *jaddref;
4561	struct vnode *dvp;
4562
4563	dvp = ITOV(dp);
4564	jaddref = NULL;
4565	if (DOINGSUJ(dvp))
4566		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4567		    ip->i_mode);
4568	ACQUIRE_LOCK(&lk);
4569	inodedep = inodedep_lookup_ip(ip);
4570	if (jaddref)
4571		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4572		    if_deps);
4573	softdep_prelink(dvp, ITOV(ip));
4574	FREE_LOCK(&lk);
4575}
4576
4577/*
4578 * Called to create the jaddref structures to track . and .. references as
4579 * well as lookup and further initialize the incomplete jaddref created
4580 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4581 * nlinkdelta for non-journaling softdep.
4582 */
4583void
4584softdep_setup_mkdir(dp, ip)
4585	struct inode *dp;
4586	struct inode *ip;
4587{
4588	struct inodedep *inodedep;
4589	struct jaddref *dotdotaddref;
4590	struct jaddref *dotaddref;
4591	struct jaddref *jaddref;
4592	struct vnode *dvp;
4593
4594	dvp = ITOV(dp);
4595	dotaddref = dotdotaddref = NULL;
4596	if (DOINGSUJ(dvp)) {
4597		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4598		    ip->i_mode);
4599		dotaddref->ja_state |= MKDIR_BODY;
4600		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4601		    dp->i_effnlink - 1, dp->i_mode);
4602		dotdotaddref->ja_state |= MKDIR_PARENT;
4603	}
4604	ACQUIRE_LOCK(&lk);
4605	inodedep = inodedep_lookup_ip(ip);
4606	if (DOINGSUJ(dvp)) {
4607		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4608		    inoreflst);
4609		KASSERT(jaddref != NULL,
4610		    ("softdep_setup_mkdir: No addref structure present."));
4611		KASSERT(jaddref->ja_parent == dp->i_number,
4612		    ("softdep_setup_mkdir: bad parent %ju",
4613		    (uintmax_t)jaddref->ja_parent));
4614		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4615		    if_deps);
4616	}
4617	inodedep = inodedep_lookup_ip(dp);
4618	if (DOINGSUJ(dvp))
4619		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4620		    &dotdotaddref->ja_ref, if_deps);
4621	softdep_prelink(ITOV(dp), NULL);
4622	FREE_LOCK(&lk);
4623}
4624
4625/*
4626 * Called to track nlinkdelta of the inode and parent directories prior to
4627 * unlinking a directory.
4628 */
4629void
4630softdep_setup_rmdir(dp, ip)
4631	struct inode *dp;
4632	struct inode *ip;
4633{
4634	struct vnode *dvp;
4635
4636	dvp = ITOV(dp);
4637	ACQUIRE_LOCK(&lk);
4638	(void) inodedep_lookup_ip(ip);
4639	(void) inodedep_lookup_ip(dp);
4640	softdep_prelink(dvp, ITOV(ip));
4641	FREE_LOCK(&lk);
4642}
4643
4644/*
4645 * Called to track nlinkdelta of the inode and parent directories prior to
4646 * unlink.
4647 */
4648void
4649softdep_setup_unlink(dp, ip)
4650	struct inode *dp;
4651	struct inode *ip;
4652{
4653	struct vnode *dvp;
4654
4655	dvp = ITOV(dp);
4656	ACQUIRE_LOCK(&lk);
4657	(void) inodedep_lookup_ip(ip);
4658	(void) inodedep_lookup_ip(dp);
4659	softdep_prelink(dvp, ITOV(ip));
4660	FREE_LOCK(&lk);
4661}
4662
4663/*
4664 * Called to release the journal structures created by a failed non-directory
4665 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4666 */
4667void
4668softdep_revert_create(dp, ip)
4669	struct inode *dp;
4670	struct inode *ip;
4671{
4672	struct inodedep *inodedep;
4673	struct jaddref *jaddref;
4674	struct vnode *dvp;
4675
4676	dvp = ITOV(dp);
4677	ACQUIRE_LOCK(&lk);
4678	inodedep = inodedep_lookup_ip(ip);
4679	if (DOINGSUJ(dvp)) {
4680		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4681		    inoreflst);
4682		KASSERT(jaddref->ja_parent == dp->i_number,
4683		    ("softdep_revert_create: addref parent mismatch"));
4684		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4685	}
4686	FREE_LOCK(&lk);
4687}
4688
4689/*
4690 * Called to release the journal structures created by a failed link
4691 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4692 */
4693void
4694softdep_revert_link(dp, ip)
4695	struct inode *dp;
4696	struct inode *ip;
4697{
4698	struct inodedep *inodedep;
4699	struct jaddref *jaddref;
4700	struct vnode *dvp;
4701
4702	dvp = ITOV(dp);
4703	ACQUIRE_LOCK(&lk);
4704	inodedep = inodedep_lookup_ip(ip);
4705	if (DOINGSUJ(dvp)) {
4706		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4707		    inoreflst);
4708		KASSERT(jaddref->ja_parent == dp->i_number,
4709		    ("softdep_revert_link: addref parent mismatch"));
4710		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4711	}
4712	FREE_LOCK(&lk);
4713}
4714
4715/*
4716 * Called to release the journal structures created by a failed mkdir
4717 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4718 */
4719void
4720softdep_revert_mkdir(dp, ip)
4721	struct inode *dp;
4722	struct inode *ip;
4723{
4724	struct inodedep *inodedep;
4725	struct jaddref *jaddref;
4726	struct jaddref *dotaddref;
4727	struct vnode *dvp;
4728
4729	dvp = ITOV(dp);
4730
4731	ACQUIRE_LOCK(&lk);
4732	inodedep = inodedep_lookup_ip(dp);
4733	if (DOINGSUJ(dvp)) {
4734		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4735		    inoreflst);
4736		KASSERT(jaddref->ja_parent == ip->i_number,
4737		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4738		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4739	}
4740	inodedep = inodedep_lookup_ip(ip);
4741	if (DOINGSUJ(dvp)) {
4742		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4743		    inoreflst);
4744		KASSERT(jaddref->ja_parent == dp->i_number,
4745		    ("softdep_revert_mkdir: addref parent mismatch"));
4746		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4747		    inoreflst, if_deps);
4748		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4749		KASSERT(dotaddref->ja_parent == ip->i_number,
4750		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4751		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4752	}
4753	FREE_LOCK(&lk);
4754}
4755
4756/*
4757 * Called to correct nlinkdelta after a failed rmdir.
4758 */
4759void
4760softdep_revert_rmdir(dp, ip)
4761	struct inode *dp;
4762	struct inode *ip;
4763{
4764
4765	ACQUIRE_LOCK(&lk);
4766	(void) inodedep_lookup_ip(ip);
4767	(void) inodedep_lookup_ip(dp);
4768	FREE_LOCK(&lk);
4769}
4770
4771/*
4772 * Protecting the freemaps (or bitmaps).
4773 *
4774 * To eliminate the need to execute fsck before mounting a filesystem
4775 * after a power failure, one must (conservatively) guarantee that the
4776 * on-disk copy of the bitmaps never indicate that a live inode or block is
4777 * free.  So, when a block or inode is allocated, the bitmap should be
4778 * updated (on disk) before any new pointers.  When a block or inode is
4779 * freed, the bitmap should not be updated until all pointers have been
4780 * reset.  The latter dependency is handled by the delayed de-allocation
4781 * approach described below for block and inode de-allocation.  The former
4782 * dependency is handled by calling the following procedure when a block or
4783 * inode is allocated. When an inode is allocated an "inodedep" is created
4784 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4785 * Each "inodedep" is also inserted into the hash indexing structure so
4786 * that any additional link additions can be made dependent on the inode
4787 * allocation.
4788 *
4789 * The ufs filesystem maintains a number of free block counts (e.g., per
4790 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4791 * in addition to the bitmaps.  These counts are used to improve efficiency
4792 * during allocation and therefore must be consistent with the bitmaps.
4793 * There is no convenient way to guarantee post-crash consistency of these
4794 * counts with simple update ordering, for two main reasons: (1) The counts
4795 * and bitmaps for a single cylinder group block are not in the same disk
4796 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4797 * be written and the other not.  (2) Some of the counts are located in the
4798 * superblock rather than the cylinder group block. So, we focus our soft
4799 * updates implementation on protecting the bitmaps. When mounting a
4800 * filesystem, we recompute the auxiliary counts from the bitmaps.
4801 */
4802
4803/*
4804 * Called just after updating the cylinder group block to allocate an inode.
4805 */
4806void
4807softdep_setup_inomapdep(bp, ip, newinum, mode)
4808	struct buf *bp;		/* buffer for cylgroup block with inode map */
4809	struct inode *ip;	/* inode related to allocation */
4810	ino_t newinum;		/* new inode number being allocated */
4811	int mode;
4812{
4813	struct inodedep *inodedep;
4814	struct bmsafemap *bmsafemap;
4815	struct jaddref *jaddref;
4816	struct mount *mp;
4817	struct fs *fs;
4818
4819	mp = UFSTOVFS(ip->i_ump);
4820	fs = ip->i_ump->um_fs;
4821	jaddref = NULL;
4822
4823	/*
4824	 * Allocate the journal reference add structure so that the bitmap
4825	 * can be dependent on it.
4826	 */
4827	if (MOUNTEDSUJ(mp)) {
4828		jaddref = newjaddref(ip, newinum, 0, 0, mode);
4829		jaddref->ja_state |= NEWBLOCK;
4830	}
4831
4832	/*
4833	 * Create a dependency for the newly allocated inode.
4834	 * Panic if it already exists as something is seriously wrong.
4835	 * Otherwise add it to the dependency list for the buffer holding
4836	 * the cylinder group map from which it was allocated.
4837	 *
4838	 * We have to preallocate a bmsafemap entry in case it is needed
4839	 * in bmsafemap_lookup since once we allocate the inodedep, we
4840	 * have to finish initializing it before we can FREE_LOCK().
4841	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
4842	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
4843	 * creating the inodedep as it can be freed during the time
4844	 * that we FREE_LOCK() while allocating the inodedep. We must
4845	 * call workitem_alloc() before entering the locked section as
4846	 * it also acquires the lock and we must avoid trying doing so
4847	 * recursively.
4848	 */
4849	bmsafemap = malloc(sizeof(struct bmsafemap),
4850	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4851	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4852	ACQUIRE_LOCK(&lk);
4853	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
4854		panic("softdep_setup_inomapdep: dependency %p for new"
4855		    "inode already exists", inodedep);
4856	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
4857	if (jaddref) {
4858		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4859		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4860		    if_deps);
4861	} else {
4862		inodedep->id_state |= ONDEPLIST;
4863		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4864	}
4865	inodedep->id_bmsafemap = bmsafemap;
4866	inodedep->id_state &= ~DEPCOMPLETE;
4867	FREE_LOCK(&lk);
4868}
4869
4870/*
4871 * Called just after updating the cylinder group block to
4872 * allocate block or fragment.
4873 */
4874void
4875softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4876	struct buf *bp;		/* buffer for cylgroup block with block map */
4877	struct mount *mp;	/* filesystem doing allocation */
4878	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4879	int frags;		/* Number of fragments. */
4880	int oldfrags;		/* Previous number of fragments for extend. */
4881{
4882	struct newblk *newblk;
4883	struct bmsafemap *bmsafemap;
4884	struct jnewblk *jnewblk;
4885	struct fs *fs;
4886
4887	fs = VFSTOUFS(mp)->um_fs;
4888	jnewblk = NULL;
4889	/*
4890	 * Create a dependency for the newly allocated block.
4891	 * Add it to the dependency list for the buffer holding
4892	 * the cylinder group map from which it was allocated.
4893	 */
4894	if (MOUNTEDSUJ(mp)) {
4895		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4896		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4897		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4898		jnewblk->jn_state = ATTACHED;
4899		jnewblk->jn_blkno = newblkno;
4900		jnewblk->jn_frags = frags;
4901		jnewblk->jn_oldfrags = oldfrags;
4902#ifdef SUJ_DEBUG
4903		{
4904			struct cg *cgp;
4905			uint8_t *blksfree;
4906			long bno;
4907			int i;
4908
4909			cgp = (struct cg *)bp->b_data;
4910			blksfree = cg_blksfree(cgp);
4911			bno = dtogd(fs, jnewblk->jn_blkno);
4912			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4913			    i++) {
4914				if (isset(blksfree, bno + i))
4915					panic("softdep_setup_blkmapdep: "
4916					    "free fragment %d from %d-%d "
4917					    "state 0x%X dep %p", i,
4918					    jnewblk->jn_oldfrags,
4919					    jnewblk->jn_frags,
4920					    jnewblk->jn_state,
4921					    jnewblk->jn_dep);
4922			}
4923		}
4924#endif
4925	}
4926
4927	CTR3(KTR_SUJ,
4928	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
4929	    newblkno, frags, oldfrags);
4930	ACQUIRE_LOCK(&lk);
4931	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4932		panic("softdep_setup_blkmapdep: found block");
4933	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4934	    dtog(fs, newblkno), NULL);
4935	if (jnewblk) {
4936		jnewblk->jn_dep = (struct worklist *)newblk;
4937		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4938	} else {
4939		newblk->nb_state |= ONDEPLIST;
4940		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4941	}
4942	newblk->nb_bmsafemap = bmsafemap;
4943	newblk->nb_jnewblk = jnewblk;
4944	FREE_LOCK(&lk);
4945}
4946
4947#define	BMSAFEMAP_HASH(fs, cg) \
4948      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4949
4950static int
4951bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4952	struct bmsafemap_hashhead *bmsafemaphd;
4953	struct mount *mp;
4954	int cg;
4955	struct bmsafemap **bmsafemapp;
4956{
4957	struct bmsafemap *bmsafemap;
4958
4959	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4960		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4961			break;
4962	if (bmsafemap) {
4963		*bmsafemapp = bmsafemap;
4964		return (1);
4965	}
4966	*bmsafemapp = NULL;
4967
4968	return (0);
4969}
4970
4971/*
4972 * Find the bmsafemap associated with a cylinder group buffer.
4973 * If none exists, create one. The buffer must be locked when
4974 * this routine is called and this routine must be called with
4975 * the softdep lock held. To avoid giving up the lock while
4976 * allocating a new bmsafemap, a preallocated bmsafemap may be
4977 * provided. If it is provided but not needed, it is freed.
4978 */
4979static struct bmsafemap *
4980bmsafemap_lookup(mp, bp, cg, newbmsafemap)
4981	struct mount *mp;
4982	struct buf *bp;
4983	int cg;
4984	struct bmsafemap *newbmsafemap;
4985{
4986	struct bmsafemap_hashhead *bmsafemaphd;
4987	struct bmsafemap *bmsafemap, *collision;
4988	struct worklist *wk;
4989	struct fs *fs;
4990
4991	rw_assert(&lk, RA_WLOCKED);
4992	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
4993	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4994		if (wk->wk_type == D_BMSAFEMAP) {
4995			if (newbmsafemap)
4996				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
4997			return (WK_BMSAFEMAP(wk));
4998		}
4999	}
5000	fs = VFSTOUFS(mp)->um_fs;
5001	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
5002	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) {
5003		if (newbmsafemap)
5004			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5005		return (bmsafemap);
5006	}
5007	if (newbmsafemap) {
5008		bmsafemap = newbmsafemap;
5009	} else {
5010		FREE_LOCK(&lk);
5011		bmsafemap = malloc(sizeof(struct bmsafemap),
5012			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5013		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5014		ACQUIRE_LOCK(&lk);
5015	}
5016	bmsafemap->sm_buf = bp;
5017	LIST_INIT(&bmsafemap->sm_inodedephd);
5018	LIST_INIT(&bmsafemap->sm_inodedepwr);
5019	LIST_INIT(&bmsafemap->sm_newblkhd);
5020	LIST_INIT(&bmsafemap->sm_newblkwr);
5021	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5022	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5023	LIST_INIT(&bmsafemap->sm_freehd);
5024	LIST_INIT(&bmsafemap->sm_freewr);
5025	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
5026		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5027		return (collision);
5028	}
5029	bmsafemap->sm_cg = cg;
5030	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5031	LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next);
5032	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5033	return (bmsafemap);
5034}
5035
5036/*
5037 * Direct block allocation dependencies.
5038 *
5039 * When a new block is allocated, the corresponding disk locations must be
5040 * initialized (with zeros or new data) before the on-disk inode points to
5041 * them.  Also, the freemap from which the block was allocated must be
5042 * updated (on disk) before the inode's pointer. These two dependencies are
5043 * independent of each other and are needed for all file blocks and indirect
5044 * blocks that are pointed to directly by the inode.  Just before the
5045 * "in-core" version of the inode is updated with a newly allocated block
5046 * number, a procedure (below) is called to setup allocation dependency
5047 * structures.  These structures are removed when the corresponding
5048 * dependencies are satisfied or when the block allocation becomes obsolete
5049 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5050 * fragment that gets upgraded).  All of these cases are handled in
5051 * procedures described later.
5052 *
5053 * When a file extension causes a fragment to be upgraded, either to a larger
5054 * fragment or to a full block, the on-disk location may change (if the
5055 * previous fragment could not simply be extended). In this case, the old
5056 * fragment must be de-allocated, but not until after the inode's pointer has
5057 * been updated. In most cases, this is handled by later procedures, which
5058 * will construct a "freefrag" structure to be added to the workitem queue
5059 * when the inode update is complete (or obsolete).  The main exception to
5060 * this is when an allocation occurs while a pending allocation dependency
5061 * (for the same block pointer) remains.  This case is handled in the main
5062 * allocation dependency setup procedure by immediately freeing the
5063 * unreferenced fragments.
5064 */
5065void
5066softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5067	struct inode *ip;	/* inode to which block is being added */
5068	ufs_lbn_t off;		/* block pointer within inode */
5069	ufs2_daddr_t newblkno;	/* disk block number being added */
5070	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5071	long newsize;		/* size of new block */
5072	long oldsize;		/* size of new block */
5073	struct buf *bp;		/* bp for allocated block */
5074{
5075	struct allocdirect *adp, *oldadp;
5076	struct allocdirectlst *adphead;
5077	struct freefrag *freefrag;
5078	struct inodedep *inodedep;
5079	struct pagedep *pagedep;
5080	struct jnewblk *jnewblk;
5081	struct newblk *newblk;
5082	struct mount *mp;
5083	ufs_lbn_t lbn;
5084
5085	lbn = bp->b_lblkno;
5086	mp = UFSTOVFS(ip->i_ump);
5087	if (oldblkno && oldblkno != newblkno)
5088		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5089	else
5090		freefrag = NULL;
5091
5092	CTR6(KTR_SUJ,
5093	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5094	    "off %jd newsize %ld oldsize %d",
5095	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5096	ACQUIRE_LOCK(&lk);
5097	if (off >= NDADDR) {
5098		if (lbn > 0)
5099			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5100			    lbn, off);
5101		/* allocating an indirect block */
5102		if (oldblkno != 0)
5103			panic("softdep_setup_allocdirect: non-zero indir");
5104	} else {
5105		if (off != lbn)
5106			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5107			    lbn, off);
5108		/*
5109		 * Allocating a direct block.
5110		 *
5111		 * If we are allocating a directory block, then we must
5112		 * allocate an associated pagedep to track additions and
5113		 * deletions.
5114		 */
5115		if ((ip->i_mode & IFMT) == IFDIR)
5116			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5117			    &pagedep);
5118	}
5119	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5120		panic("softdep_setup_allocdirect: lost block");
5121	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5122	    ("softdep_setup_allocdirect: newblk already initialized"));
5123	/*
5124	 * Convert the newblk to an allocdirect.
5125	 */
5126	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5127	adp = (struct allocdirect *)newblk;
5128	newblk->nb_freefrag = freefrag;
5129	adp->ad_offset = off;
5130	adp->ad_oldblkno = oldblkno;
5131	adp->ad_newsize = newsize;
5132	adp->ad_oldsize = oldsize;
5133
5134	/*
5135	 * Finish initializing the journal.
5136	 */
5137	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5138		jnewblk->jn_ino = ip->i_number;
5139		jnewblk->jn_lbn = lbn;
5140		add_to_journal(&jnewblk->jn_list);
5141	}
5142	if (freefrag && freefrag->ff_jdep != NULL &&
5143	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5144		add_to_journal(freefrag->ff_jdep);
5145	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5146	adp->ad_inodedep = inodedep;
5147
5148	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5149	/*
5150	 * The list of allocdirects must be kept in sorted and ascending
5151	 * order so that the rollback routines can quickly determine the
5152	 * first uncommitted block (the size of the file stored on disk
5153	 * ends at the end of the lowest committed fragment, or if there
5154	 * are no fragments, at the end of the highest committed block).
5155	 * Since files generally grow, the typical case is that the new
5156	 * block is to be added at the end of the list. We speed this
5157	 * special case by checking against the last allocdirect in the
5158	 * list before laboriously traversing the list looking for the
5159	 * insertion point.
5160	 */
5161	adphead = &inodedep->id_newinoupdt;
5162	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5163	if (oldadp == NULL || oldadp->ad_offset <= off) {
5164		/* insert at end of list */
5165		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5166		if (oldadp != NULL && oldadp->ad_offset == off)
5167			allocdirect_merge(adphead, adp, oldadp);
5168		FREE_LOCK(&lk);
5169		return;
5170	}
5171	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5172		if (oldadp->ad_offset >= off)
5173			break;
5174	}
5175	if (oldadp == NULL)
5176		panic("softdep_setup_allocdirect: lost entry");
5177	/* insert in middle of list */
5178	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5179	if (oldadp->ad_offset == off)
5180		allocdirect_merge(adphead, adp, oldadp);
5181
5182	FREE_LOCK(&lk);
5183}
5184
5185/*
5186 * Merge a newer and older journal record to be stored either in a
5187 * newblock or freefrag.  This handles aggregating journal records for
5188 * fragment allocation into a second record as well as replacing a
5189 * journal free with an aborted journal allocation.  A segment for the
5190 * oldest record will be placed on wkhd if it has been written.  If not
5191 * the segment for the newer record will suffice.
5192 */
5193static struct worklist *
5194jnewblk_merge(new, old, wkhd)
5195	struct worklist *new;
5196	struct worklist *old;
5197	struct workhead *wkhd;
5198{
5199	struct jnewblk *njnewblk;
5200	struct jnewblk *jnewblk;
5201
5202	/* Handle NULLs to simplify callers. */
5203	if (new == NULL)
5204		return (old);
5205	if (old == NULL)
5206		return (new);
5207	/* Replace a jfreefrag with a jnewblk. */
5208	if (new->wk_type == D_JFREEFRAG) {
5209		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5210			panic("jnewblk_merge: blkno mismatch: %p, %p",
5211			    old, new);
5212		cancel_jfreefrag(WK_JFREEFRAG(new));
5213		return (old);
5214	}
5215	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5216		panic("jnewblk_merge: Bad type: old %d new %d\n",
5217		    old->wk_type, new->wk_type);
5218	/*
5219	 * Handle merging of two jnewblk records that describe
5220	 * different sets of fragments in the same block.
5221	 */
5222	jnewblk = WK_JNEWBLK(old);
5223	njnewblk = WK_JNEWBLK(new);
5224	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5225		panic("jnewblk_merge: Merging disparate blocks.");
5226	/*
5227	 * The record may be rolled back in the cg.
5228	 */
5229	if (jnewblk->jn_state & UNDONE) {
5230		jnewblk->jn_state &= ~UNDONE;
5231		njnewblk->jn_state |= UNDONE;
5232		njnewblk->jn_state &= ~ATTACHED;
5233	}
5234	/*
5235	 * We modify the newer addref and free the older so that if neither
5236	 * has been written the most up-to-date copy will be on disk.  If
5237	 * both have been written but rolled back we only temporarily need
5238	 * one of them to fix the bits when the cg write completes.
5239	 */
5240	jnewblk->jn_state |= ATTACHED | COMPLETE;
5241	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5242	cancel_jnewblk(jnewblk, wkhd);
5243	WORKLIST_REMOVE(&jnewblk->jn_list);
5244	free_jnewblk(jnewblk);
5245	return (new);
5246}
5247
5248/*
5249 * Replace an old allocdirect dependency with a newer one.
5250 * This routine must be called with splbio interrupts blocked.
5251 */
5252static void
5253allocdirect_merge(adphead, newadp, oldadp)
5254	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5255	struct allocdirect *newadp;	/* allocdirect being added */
5256	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5257{
5258	struct worklist *wk;
5259	struct freefrag *freefrag;
5260
5261	freefrag = NULL;
5262	rw_assert(&lk, RA_WLOCKED);
5263	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5264	    newadp->ad_oldsize != oldadp->ad_newsize ||
5265	    newadp->ad_offset >= NDADDR)
5266		panic("%s %jd != new %jd || old size %ld != new %ld",
5267		    "allocdirect_merge: old blkno",
5268		    (intmax_t)newadp->ad_oldblkno,
5269		    (intmax_t)oldadp->ad_newblkno,
5270		    newadp->ad_oldsize, oldadp->ad_newsize);
5271	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5272	newadp->ad_oldsize = oldadp->ad_oldsize;
5273	/*
5274	 * If the old dependency had a fragment to free or had never
5275	 * previously had a block allocated, then the new dependency
5276	 * can immediately post its freefrag and adopt the old freefrag.
5277	 * This action is done by swapping the freefrag dependencies.
5278	 * The new dependency gains the old one's freefrag, and the
5279	 * old one gets the new one and then immediately puts it on
5280	 * the worklist when it is freed by free_newblk. It is
5281	 * not possible to do this swap when the old dependency had a
5282	 * non-zero size but no previous fragment to free. This condition
5283	 * arises when the new block is an extension of the old block.
5284	 * Here, the first part of the fragment allocated to the new
5285	 * dependency is part of the block currently claimed on disk by
5286	 * the old dependency, so cannot legitimately be freed until the
5287	 * conditions for the new dependency are fulfilled.
5288	 */
5289	freefrag = newadp->ad_freefrag;
5290	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5291		newadp->ad_freefrag = oldadp->ad_freefrag;
5292		oldadp->ad_freefrag = freefrag;
5293	}
5294	/*
5295	 * If we are tracking a new directory-block allocation,
5296	 * move it from the old allocdirect to the new allocdirect.
5297	 */
5298	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5299		WORKLIST_REMOVE(wk);
5300		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5301			panic("allocdirect_merge: extra newdirblk");
5302		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5303	}
5304	TAILQ_REMOVE(adphead, oldadp, ad_next);
5305	/*
5306	 * We need to move any journal dependencies over to the freefrag
5307	 * that releases this block if it exists.  Otherwise we are
5308	 * extending an existing block and we'll wait until that is
5309	 * complete to release the journal space and extend the
5310	 * new journal to cover this old space as well.
5311	 */
5312	if (freefrag == NULL) {
5313		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5314			panic("allocdirect_merge: %jd != %jd",
5315			    oldadp->ad_newblkno, newadp->ad_newblkno);
5316		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5317		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5318		    &oldadp->ad_block.nb_jnewblk->jn_list,
5319		    &newadp->ad_block.nb_jwork);
5320		oldadp->ad_block.nb_jnewblk = NULL;
5321		cancel_newblk(&oldadp->ad_block, NULL,
5322		    &newadp->ad_block.nb_jwork);
5323	} else {
5324		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5325		    &freefrag->ff_list, &freefrag->ff_jwork);
5326		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5327		    &freefrag->ff_jwork);
5328	}
5329	free_newblk(&oldadp->ad_block);
5330}
5331
5332/*
5333 * Allocate a jfreefrag structure to journal a single block free.
5334 */
5335static struct jfreefrag *
5336newjfreefrag(freefrag, ip, blkno, size, lbn)
5337	struct freefrag *freefrag;
5338	struct inode *ip;
5339	ufs2_daddr_t blkno;
5340	long size;
5341	ufs_lbn_t lbn;
5342{
5343	struct jfreefrag *jfreefrag;
5344	struct fs *fs;
5345
5346	fs = ip->i_fs;
5347	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5348	    M_SOFTDEP_FLAGS);
5349	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5350	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5351	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5352	jfreefrag->fr_ino = ip->i_number;
5353	jfreefrag->fr_lbn = lbn;
5354	jfreefrag->fr_blkno = blkno;
5355	jfreefrag->fr_frags = numfrags(fs, size);
5356	jfreefrag->fr_freefrag = freefrag;
5357
5358	return (jfreefrag);
5359}
5360
5361/*
5362 * Allocate a new freefrag structure.
5363 */
5364static struct freefrag *
5365newfreefrag(ip, blkno, size, lbn)
5366	struct inode *ip;
5367	ufs2_daddr_t blkno;
5368	long size;
5369	ufs_lbn_t lbn;
5370{
5371	struct freefrag *freefrag;
5372	struct fs *fs;
5373
5374	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5375	    ip->i_number, blkno, size, lbn);
5376	fs = ip->i_fs;
5377	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5378		panic("newfreefrag: frag size");
5379	freefrag = malloc(sizeof(struct freefrag),
5380	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5381	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5382	freefrag->ff_state = ATTACHED;
5383	LIST_INIT(&freefrag->ff_jwork);
5384	freefrag->ff_inum = ip->i_number;
5385	freefrag->ff_vtype = ITOV(ip)->v_type;
5386	freefrag->ff_blkno = blkno;
5387	freefrag->ff_fragsize = size;
5388
5389	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5390		freefrag->ff_jdep = (struct worklist *)
5391		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5392	} else {
5393		freefrag->ff_state |= DEPCOMPLETE;
5394		freefrag->ff_jdep = NULL;
5395	}
5396
5397	return (freefrag);
5398}
5399
5400/*
5401 * This workitem de-allocates fragments that were replaced during
5402 * file block allocation.
5403 */
5404static void
5405handle_workitem_freefrag(freefrag)
5406	struct freefrag *freefrag;
5407{
5408	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5409	struct workhead wkhd;
5410
5411	CTR3(KTR_SUJ,
5412	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5413	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5414	/*
5415	 * It would be illegal to add new completion items to the
5416	 * freefrag after it was schedule to be done so it must be
5417	 * safe to modify the list head here.
5418	 */
5419	LIST_INIT(&wkhd);
5420	ACQUIRE_LOCK(&lk);
5421	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5422	/*
5423	 * If the journal has not been written we must cancel it here.
5424	 */
5425	if (freefrag->ff_jdep) {
5426		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5427			panic("handle_workitem_freefrag: Unexpected type %d\n",
5428			    freefrag->ff_jdep->wk_type);
5429		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5430	}
5431	FREE_LOCK(&lk);
5432	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5433	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5434	ACQUIRE_LOCK(&lk);
5435	WORKITEM_FREE(freefrag, D_FREEFRAG);
5436	FREE_LOCK(&lk);
5437}
5438
5439/*
5440 * Set up a dependency structure for an external attributes data block.
5441 * This routine follows much of the structure of softdep_setup_allocdirect.
5442 * See the description of softdep_setup_allocdirect above for details.
5443 */
5444void
5445softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5446	struct inode *ip;
5447	ufs_lbn_t off;
5448	ufs2_daddr_t newblkno;
5449	ufs2_daddr_t oldblkno;
5450	long newsize;
5451	long oldsize;
5452	struct buf *bp;
5453{
5454	struct allocdirect *adp, *oldadp;
5455	struct allocdirectlst *adphead;
5456	struct freefrag *freefrag;
5457	struct inodedep *inodedep;
5458	struct jnewblk *jnewblk;
5459	struct newblk *newblk;
5460	struct mount *mp;
5461	ufs_lbn_t lbn;
5462
5463	if (off >= NXADDR)
5464		panic("softdep_setup_allocext: lbn %lld > NXADDR",
5465		    (long long)off);
5466
5467	lbn = bp->b_lblkno;
5468	mp = UFSTOVFS(ip->i_ump);
5469	if (oldblkno && oldblkno != newblkno)
5470		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5471	else
5472		freefrag = NULL;
5473
5474	ACQUIRE_LOCK(&lk);
5475	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5476		panic("softdep_setup_allocext: lost block");
5477	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5478	    ("softdep_setup_allocext: newblk already initialized"));
5479	/*
5480	 * Convert the newblk to an allocdirect.
5481	 */
5482	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5483	adp = (struct allocdirect *)newblk;
5484	newblk->nb_freefrag = freefrag;
5485	adp->ad_offset = off;
5486	adp->ad_oldblkno = oldblkno;
5487	adp->ad_newsize = newsize;
5488	adp->ad_oldsize = oldsize;
5489	adp->ad_state |=  EXTDATA;
5490
5491	/*
5492	 * Finish initializing the journal.
5493	 */
5494	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5495		jnewblk->jn_ino = ip->i_number;
5496		jnewblk->jn_lbn = lbn;
5497		add_to_journal(&jnewblk->jn_list);
5498	}
5499	if (freefrag && freefrag->ff_jdep != NULL &&
5500	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5501		add_to_journal(freefrag->ff_jdep);
5502	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5503	adp->ad_inodedep = inodedep;
5504
5505	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5506	/*
5507	 * The list of allocdirects must be kept in sorted and ascending
5508	 * order so that the rollback routines can quickly determine the
5509	 * first uncommitted block (the size of the file stored on disk
5510	 * ends at the end of the lowest committed fragment, or if there
5511	 * are no fragments, at the end of the highest committed block).
5512	 * Since files generally grow, the typical case is that the new
5513	 * block is to be added at the end of the list. We speed this
5514	 * special case by checking against the last allocdirect in the
5515	 * list before laboriously traversing the list looking for the
5516	 * insertion point.
5517	 */
5518	adphead = &inodedep->id_newextupdt;
5519	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5520	if (oldadp == NULL || oldadp->ad_offset <= off) {
5521		/* insert at end of list */
5522		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5523		if (oldadp != NULL && oldadp->ad_offset == off)
5524			allocdirect_merge(adphead, adp, oldadp);
5525		FREE_LOCK(&lk);
5526		return;
5527	}
5528	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5529		if (oldadp->ad_offset >= off)
5530			break;
5531	}
5532	if (oldadp == NULL)
5533		panic("softdep_setup_allocext: lost entry");
5534	/* insert in middle of list */
5535	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5536	if (oldadp->ad_offset == off)
5537		allocdirect_merge(adphead, adp, oldadp);
5538	FREE_LOCK(&lk);
5539}
5540
5541/*
5542 * Indirect block allocation dependencies.
5543 *
5544 * The same dependencies that exist for a direct block also exist when
5545 * a new block is allocated and pointed to by an entry in a block of
5546 * indirect pointers. The undo/redo states described above are also
5547 * used here. Because an indirect block contains many pointers that
5548 * may have dependencies, a second copy of the entire in-memory indirect
5549 * block is kept. The buffer cache copy is always completely up-to-date.
5550 * The second copy, which is used only as a source for disk writes,
5551 * contains only the safe pointers (i.e., those that have no remaining
5552 * update dependencies). The second copy is freed when all pointers
5553 * are safe. The cache is not allowed to replace indirect blocks with
5554 * pending update dependencies. If a buffer containing an indirect
5555 * block with dependencies is written, these routines will mark it
5556 * dirty again. It can only be successfully written once all the
5557 * dependencies are removed. The ffs_fsync routine in conjunction with
5558 * softdep_sync_metadata work together to get all the dependencies
5559 * removed so that a file can be successfully written to disk. Three
5560 * procedures are used when setting up indirect block pointer
5561 * dependencies. The division is necessary because of the organization
5562 * of the "balloc" routine and because of the distinction between file
5563 * pages and file metadata blocks.
5564 */
5565
5566/*
5567 * Allocate a new allocindir structure.
5568 */
5569static struct allocindir *
5570newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5571	struct inode *ip;	/* inode for file being extended */
5572	int ptrno;		/* offset of pointer in indirect block */
5573	ufs2_daddr_t newblkno;	/* disk block number being added */
5574	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5575	ufs_lbn_t lbn;
5576{
5577	struct newblk *newblk;
5578	struct allocindir *aip;
5579	struct freefrag *freefrag;
5580	struct jnewblk *jnewblk;
5581
5582	if (oldblkno)
5583		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5584	else
5585		freefrag = NULL;
5586	ACQUIRE_LOCK(&lk);
5587	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5588		panic("new_allocindir: lost block");
5589	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5590	    ("newallocindir: newblk already initialized"));
5591	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5592	newblk->nb_freefrag = freefrag;
5593	aip = (struct allocindir *)newblk;
5594	aip->ai_offset = ptrno;
5595	aip->ai_oldblkno = oldblkno;
5596	aip->ai_lbn = lbn;
5597	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5598		jnewblk->jn_ino = ip->i_number;
5599		jnewblk->jn_lbn = lbn;
5600		add_to_journal(&jnewblk->jn_list);
5601	}
5602	if (freefrag && freefrag->ff_jdep != NULL &&
5603	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5604		add_to_journal(freefrag->ff_jdep);
5605	return (aip);
5606}
5607
5608/*
5609 * Called just before setting an indirect block pointer
5610 * to a newly allocated file page.
5611 */
5612void
5613softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5614	struct inode *ip;	/* inode for file being extended */
5615	ufs_lbn_t lbn;		/* allocated block number within file */
5616	struct buf *bp;		/* buffer with indirect blk referencing page */
5617	int ptrno;		/* offset of pointer in indirect block */
5618	ufs2_daddr_t newblkno;	/* disk block number being added */
5619	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5620	struct buf *nbp;	/* buffer holding allocated page */
5621{
5622	struct inodedep *inodedep;
5623	struct freefrag *freefrag;
5624	struct allocindir *aip;
5625	struct pagedep *pagedep;
5626	struct mount *mp;
5627	int dflags;
5628
5629	if (lbn != nbp->b_lblkno)
5630		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5631		    lbn, bp->b_lblkno);
5632	CTR4(KTR_SUJ,
5633	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5634	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5635	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5636	mp = UFSTOVFS(ip->i_ump);
5637	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5638	dflags = DEPALLOC;
5639	if (IS_SNAPSHOT(ip))
5640		dflags |= NODELAY;
5641	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5642	/*
5643	 * If we are allocating a directory page, then we must
5644	 * allocate an associated pagedep to track additions and
5645	 * deletions.
5646	 */
5647	if ((ip->i_mode & IFMT) == IFDIR)
5648		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5649	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5650	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5651	FREE_LOCK(&lk);
5652	if (freefrag)
5653		handle_workitem_freefrag(freefrag);
5654}
5655
5656/*
5657 * Called just before setting an indirect block pointer to a
5658 * newly allocated indirect block.
5659 */
5660void
5661softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5662	struct buf *nbp;	/* newly allocated indirect block */
5663	struct inode *ip;	/* inode for file being extended */
5664	struct buf *bp;		/* indirect block referencing allocated block */
5665	int ptrno;		/* offset of pointer in indirect block */
5666	ufs2_daddr_t newblkno;	/* disk block number being added */
5667{
5668	struct inodedep *inodedep;
5669	struct allocindir *aip;
5670	ufs_lbn_t lbn;
5671	int dflags;
5672
5673	CTR3(KTR_SUJ,
5674	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5675	    ip->i_number, newblkno, ptrno);
5676	lbn = nbp->b_lblkno;
5677	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5678	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5679	dflags = DEPALLOC;
5680	if (IS_SNAPSHOT(ip))
5681		dflags |= NODELAY;
5682	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5683	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5684	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5685		panic("softdep_setup_allocindir_meta: Block already existed");
5686	FREE_LOCK(&lk);
5687}
5688
5689static void
5690indirdep_complete(indirdep)
5691	struct indirdep *indirdep;
5692{
5693	struct allocindir *aip;
5694
5695	LIST_REMOVE(indirdep, ir_next);
5696	indirdep->ir_state |= DEPCOMPLETE;
5697
5698	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5699		LIST_REMOVE(aip, ai_next);
5700		free_newblk(&aip->ai_block);
5701	}
5702	/*
5703	 * If this indirdep is not attached to a buf it was simply waiting
5704	 * on completion to clear completehd.  free_indirdep() asserts
5705	 * that nothing is dangling.
5706	 */
5707	if ((indirdep->ir_state & ONWORKLIST) == 0)
5708		free_indirdep(indirdep);
5709}
5710
5711static struct indirdep *
5712indirdep_lookup(mp, ip, bp)
5713	struct mount *mp;
5714	struct inode *ip;
5715	struct buf *bp;
5716{
5717	struct indirdep *indirdep, *newindirdep;
5718	struct newblk *newblk;
5719	struct worklist *wk;
5720	struct fs *fs;
5721	ufs2_daddr_t blkno;
5722
5723	rw_assert(&lk, RA_WLOCKED);
5724	indirdep = NULL;
5725	newindirdep = NULL;
5726	fs = ip->i_fs;
5727	for (;;) {
5728		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5729			if (wk->wk_type != D_INDIRDEP)
5730				continue;
5731			indirdep = WK_INDIRDEP(wk);
5732			break;
5733		}
5734		/* Found on the buffer worklist, no new structure to free. */
5735		if (indirdep != NULL && newindirdep == NULL)
5736			return (indirdep);
5737		if (indirdep != NULL && newindirdep != NULL)
5738			panic("indirdep_lookup: simultaneous create");
5739		/* None found on the buffer and a new structure is ready. */
5740		if (indirdep == NULL && newindirdep != NULL)
5741			break;
5742		/* None found and no new structure available. */
5743		FREE_LOCK(&lk);
5744		newindirdep = malloc(sizeof(struct indirdep),
5745		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5746		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5747		newindirdep->ir_state = ATTACHED;
5748		if (ip->i_ump->um_fstype == UFS1)
5749			newindirdep->ir_state |= UFS1FMT;
5750		TAILQ_INIT(&newindirdep->ir_trunc);
5751		newindirdep->ir_saveddata = NULL;
5752		LIST_INIT(&newindirdep->ir_deplisthd);
5753		LIST_INIT(&newindirdep->ir_donehd);
5754		LIST_INIT(&newindirdep->ir_writehd);
5755		LIST_INIT(&newindirdep->ir_completehd);
5756		if (bp->b_blkno == bp->b_lblkno) {
5757			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5758			    NULL, NULL);
5759			bp->b_blkno = blkno;
5760		}
5761		newindirdep->ir_freeblks = NULL;
5762		newindirdep->ir_savebp =
5763		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5764		newindirdep->ir_bp = bp;
5765		BUF_KERNPROC(newindirdep->ir_savebp);
5766		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5767		ACQUIRE_LOCK(&lk);
5768	}
5769	indirdep = newindirdep;
5770	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5771	/*
5772	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5773	 * that we don't free dependencies until the pointers are valid.
5774	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5775	 * than using the hash.
5776	 */
5777	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5778		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5779	else
5780		indirdep->ir_state |= DEPCOMPLETE;
5781	return (indirdep);
5782}
5783
5784/*
5785 * Called to finish the allocation of the "aip" allocated
5786 * by one of the two routines above.
5787 */
5788static struct freefrag *
5789setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5790	struct buf *bp;		/* in-memory copy of the indirect block */
5791	struct inode *ip;	/* inode for file being extended */
5792	struct inodedep *inodedep; /* Inodedep for ip */
5793	struct allocindir *aip;	/* allocindir allocated by the above routines */
5794	ufs_lbn_t lbn;		/* Logical block number for this block. */
5795{
5796	struct fs *fs;
5797	struct indirdep *indirdep;
5798	struct allocindir *oldaip;
5799	struct freefrag *freefrag;
5800	struct mount *mp;
5801
5802	rw_assert(&lk, RA_WLOCKED);
5803	mp = UFSTOVFS(ip->i_ump);
5804	fs = ip->i_fs;
5805	if (bp->b_lblkno >= 0)
5806		panic("setup_allocindir_phase2: not indir blk");
5807	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
5808	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
5809	indirdep = indirdep_lookup(mp, ip, bp);
5810	KASSERT(indirdep->ir_savebp != NULL,
5811	    ("setup_allocindir_phase2 NULL ir_savebp"));
5812	aip->ai_indirdep = indirdep;
5813	/*
5814	 * Check for an unwritten dependency for this indirect offset.  If
5815	 * there is, merge the old dependency into the new one.  This happens
5816	 * as a result of reallocblk only.
5817	 */
5818	freefrag = NULL;
5819	if (aip->ai_oldblkno != 0) {
5820		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
5821			if (oldaip->ai_offset == aip->ai_offset) {
5822				freefrag = allocindir_merge(aip, oldaip);
5823				goto done;
5824			}
5825		}
5826		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
5827			if (oldaip->ai_offset == aip->ai_offset) {
5828				freefrag = allocindir_merge(aip, oldaip);
5829				goto done;
5830			}
5831		}
5832	}
5833done:
5834	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5835	return (freefrag);
5836}
5837
5838/*
5839 * Merge two allocindirs which refer to the same block.  Move newblock
5840 * dependencies and setup the freefrags appropriately.
5841 */
5842static struct freefrag *
5843allocindir_merge(aip, oldaip)
5844	struct allocindir *aip;
5845	struct allocindir *oldaip;
5846{
5847	struct freefrag *freefrag;
5848	struct worklist *wk;
5849
5850	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5851		panic("allocindir_merge: blkno");
5852	aip->ai_oldblkno = oldaip->ai_oldblkno;
5853	freefrag = aip->ai_freefrag;
5854	aip->ai_freefrag = oldaip->ai_freefrag;
5855	oldaip->ai_freefrag = NULL;
5856	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
5857	/*
5858	 * If we are tracking a new directory-block allocation,
5859	 * move it from the old allocindir to the new allocindir.
5860	 */
5861	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5862		WORKLIST_REMOVE(wk);
5863		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5864			panic("allocindir_merge: extra newdirblk");
5865		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
5866	}
5867	/*
5868	 * We can skip journaling for this freefrag and just complete
5869	 * any pending journal work for the allocindir that is being
5870	 * removed after the freefrag completes.
5871	 */
5872	if (freefrag->ff_jdep)
5873		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
5874	LIST_REMOVE(oldaip, ai_next);
5875	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
5876	    &freefrag->ff_list, &freefrag->ff_jwork);
5877	free_newblk(&oldaip->ai_block);
5878
5879	return (freefrag);
5880}
5881
5882static inline void
5883setup_freedirect(freeblks, ip, i, needj)
5884	struct freeblks *freeblks;
5885	struct inode *ip;
5886	int i;
5887	int needj;
5888{
5889	ufs2_daddr_t blkno;
5890	int frags;
5891
5892	blkno = DIP(ip, i_db[i]);
5893	if (blkno == 0)
5894		return;
5895	DIP_SET(ip, i_db[i], 0);
5896	frags = sblksize(ip->i_fs, ip->i_size, i);
5897	frags = numfrags(ip->i_fs, frags);
5898	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
5899}
5900
5901static inline void
5902setup_freeext(freeblks, ip, i, needj)
5903	struct freeblks *freeblks;
5904	struct inode *ip;
5905	int i;
5906	int needj;
5907{
5908	ufs2_daddr_t blkno;
5909	int frags;
5910
5911	blkno = ip->i_din2->di_extb[i];
5912	if (blkno == 0)
5913		return;
5914	ip->i_din2->di_extb[i] = 0;
5915	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
5916	frags = numfrags(ip->i_fs, frags);
5917	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
5918}
5919
5920static inline void
5921setup_freeindir(freeblks, ip, i, lbn, needj)
5922	struct freeblks *freeblks;
5923	struct inode *ip;
5924	int i;
5925	ufs_lbn_t lbn;
5926	int needj;
5927{
5928	ufs2_daddr_t blkno;
5929
5930	blkno = DIP(ip, i_ib[i]);
5931	if (blkno == 0)
5932		return;
5933	DIP_SET(ip, i_ib[i], 0);
5934	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
5935	    0, needj);
5936}
5937
5938static inline struct freeblks *
5939newfreeblks(mp, ip)
5940	struct mount *mp;
5941	struct inode *ip;
5942{
5943	struct freeblks *freeblks;
5944
5945	freeblks = malloc(sizeof(struct freeblks),
5946		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5947	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5948	LIST_INIT(&freeblks->fb_jblkdephd);
5949	LIST_INIT(&freeblks->fb_jwork);
5950	freeblks->fb_ref = 0;
5951	freeblks->fb_cgwait = 0;
5952	freeblks->fb_state = ATTACHED;
5953	freeblks->fb_uid = ip->i_uid;
5954	freeblks->fb_inum = ip->i_number;
5955	freeblks->fb_vtype = ITOV(ip)->v_type;
5956	freeblks->fb_modrev = DIP(ip, i_modrev);
5957	freeblks->fb_devvp = ip->i_devvp;
5958	freeblks->fb_chkcnt = 0;
5959	freeblks->fb_len = 0;
5960
5961	return (freeblks);
5962}
5963
5964static void
5965trunc_indirdep(indirdep, freeblks, bp, off)
5966	struct indirdep *indirdep;
5967	struct freeblks *freeblks;
5968	struct buf *bp;
5969	int off;
5970{
5971	struct allocindir *aip, *aipn;
5972
5973	/*
5974	 * The first set of allocindirs won't be in savedbp.
5975	 */
5976	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
5977		if (aip->ai_offset > off)
5978			cancel_allocindir(aip, bp, freeblks, 1);
5979	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
5980		if (aip->ai_offset > off)
5981			cancel_allocindir(aip, bp, freeblks, 1);
5982	/*
5983	 * These will exist in savedbp.
5984	 */
5985	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
5986		if (aip->ai_offset > off)
5987			cancel_allocindir(aip, NULL, freeblks, 0);
5988	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
5989		if (aip->ai_offset > off)
5990			cancel_allocindir(aip, NULL, freeblks, 0);
5991}
5992
5993/*
5994 * Follow the chain of indirects down to lastlbn creating a freework
5995 * structure for each.  This will be used to start indir_trunc() at
5996 * the right offset and create the journal records for the parrtial
5997 * truncation.  A second step will handle the truncated dependencies.
5998 */
5999static int
6000setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6001	struct freeblks *freeblks;
6002	struct inode *ip;
6003	ufs_lbn_t lbn;
6004	ufs_lbn_t lastlbn;
6005	ufs2_daddr_t blkno;
6006{
6007	struct indirdep *indirdep;
6008	struct indirdep *indirn;
6009	struct freework *freework;
6010	struct newblk *newblk;
6011	struct mount *mp;
6012	struct buf *bp;
6013	uint8_t *start;
6014	uint8_t *end;
6015	ufs_lbn_t lbnadd;
6016	int level;
6017	int error;
6018	int off;
6019
6020
6021	freework = NULL;
6022	if (blkno == 0)
6023		return (0);
6024	mp = freeblks->fb_list.wk_mp;
6025	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6026	if ((bp->b_flags & B_CACHE) == 0) {
6027		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6028		bp->b_iocmd = BIO_READ;
6029		bp->b_flags &= ~B_INVAL;
6030		bp->b_ioflags &= ~BIO_ERROR;
6031		vfs_busy_pages(bp, 0);
6032		bp->b_iooffset = dbtob(bp->b_blkno);
6033		bstrategy(bp);
6034		curthread->td_ru.ru_inblock++;
6035		error = bufwait(bp);
6036		if (error) {
6037			brelse(bp);
6038			return (error);
6039		}
6040	}
6041	level = lbn_level(lbn);
6042	lbnadd = lbn_offset(ip->i_fs, level);
6043	/*
6044	 * Compute the offset of the last block we want to keep.  Store
6045	 * in the freework the first block we want to completely free.
6046	 */
6047	off = (lastlbn - -(lbn + level)) / lbnadd;
6048	if (off + 1 == NINDIR(ip->i_fs))
6049		goto nowork;
6050	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
6051	    0);
6052	/*
6053	 * Link the freework into the indirdep.  This will prevent any new
6054	 * allocations from proceeding until we are finished with the
6055	 * truncate and the block is written.
6056	 */
6057	ACQUIRE_LOCK(&lk);
6058	indirdep = indirdep_lookup(mp, ip, bp);
6059	if (indirdep->ir_freeblks)
6060		panic("setup_trunc_indir: indirdep already truncated.");
6061	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6062	freework->fw_indir = indirdep;
6063	/*
6064	 * Cancel any allocindirs that will not make it to disk.
6065	 * We have to do this for all copies of the indirdep that
6066	 * live on this newblk.
6067	 */
6068	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6069		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
6070		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6071			trunc_indirdep(indirn, freeblks, bp, off);
6072	} else
6073		trunc_indirdep(indirdep, freeblks, bp, off);
6074	FREE_LOCK(&lk);
6075	/*
6076	 * Creation is protected by the buf lock. The saveddata is only
6077	 * needed if a full truncation follows a partial truncation but it
6078	 * is difficult to allocate in that case so we fetch it anyway.
6079	 */
6080	if (indirdep->ir_saveddata == NULL)
6081		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6082		    M_SOFTDEP_FLAGS);
6083nowork:
6084	/* Fetch the blkno of the child and the zero start offset. */
6085	if (ip->i_ump->um_fstype == UFS1) {
6086		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6087		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6088	} else {
6089		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6090		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6091	}
6092	if (freework) {
6093		/* Zero the truncated pointers. */
6094		end = bp->b_data + bp->b_bcount;
6095		bzero(start, end - start);
6096		bdwrite(bp);
6097	} else
6098		bqrelse(bp);
6099	if (level == 0)
6100		return (0);
6101	lbn++; /* adjust level */
6102	lbn -= (off * lbnadd);
6103	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6104}
6105
6106/*
6107 * Complete the partial truncation of an indirect block setup by
6108 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6109 * copy and writes them to disk before the freeblks is allowed to complete.
6110 */
6111static void
6112complete_trunc_indir(freework)
6113	struct freework *freework;
6114{
6115	struct freework *fwn;
6116	struct indirdep *indirdep;
6117	struct buf *bp;
6118	uintptr_t start;
6119	int count;
6120
6121	indirdep = freework->fw_indir;
6122	for (;;) {
6123		bp = indirdep->ir_bp;
6124		/* See if the block was discarded. */
6125		if (bp == NULL)
6126			break;
6127		/* Inline part of getdirtybuf().  We dont want bremfree. */
6128		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6129			break;
6130		if (BUF_LOCK(bp,
6131		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0)
6132			BUF_UNLOCK(bp);
6133		ACQUIRE_LOCK(&lk);
6134	}
6135	rw_assert(&lk, RA_WLOCKED);
6136	freework->fw_state |= DEPCOMPLETE;
6137	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6138	/*
6139	 * Zero the pointers in the saved copy.
6140	 */
6141	if (indirdep->ir_state & UFS1FMT)
6142		start = sizeof(ufs1_daddr_t);
6143	else
6144		start = sizeof(ufs2_daddr_t);
6145	start *= freework->fw_start;
6146	count = indirdep->ir_savebp->b_bcount - start;
6147	start += (uintptr_t)indirdep->ir_savebp->b_data;
6148	bzero((char *)start, count);
6149	/*
6150	 * We need to start the next truncation in the list if it has not
6151	 * been started yet.
6152	 */
6153	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6154	if (fwn != NULL) {
6155		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6156			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6157		if ((fwn->fw_state & ONWORKLIST) == 0)
6158			freework_enqueue(fwn);
6159	}
6160	/*
6161	 * If bp is NULL the block was fully truncated, restore
6162	 * the saved block list otherwise free it if it is no
6163	 * longer needed.
6164	 */
6165	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6166		if (bp == NULL)
6167			bcopy(indirdep->ir_saveddata,
6168			    indirdep->ir_savebp->b_data,
6169			    indirdep->ir_savebp->b_bcount);
6170		free(indirdep->ir_saveddata, M_INDIRDEP);
6171		indirdep->ir_saveddata = NULL;
6172	}
6173	/*
6174	 * When bp is NULL there is a full truncation pending.  We
6175	 * must wait for this full truncation to be journaled before
6176	 * we can release this freework because the disk pointers will
6177	 * never be written as zero.
6178	 */
6179	if (bp == NULL)  {
6180		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6181			handle_written_freework(freework);
6182		else
6183			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6184			   &freework->fw_list);
6185	} else {
6186		/* Complete when the real copy is written. */
6187		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6188		BUF_UNLOCK(bp);
6189	}
6190}
6191
6192/*
6193 * Calculate the number of blocks we are going to release where datablocks
6194 * is the current total and length is the new file size.
6195 */
6196static ufs2_daddr_t
6197blkcount(fs, datablocks, length)
6198	struct fs *fs;
6199	ufs2_daddr_t datablocks;
6200	off_t length;
6201{
6202	off_t totblks, numblks;
6203
6204	totblks = 0;
6205	numblks = howmany(length, fs->fs_bsize);
6206	if (numblks <= NDADDR) {
6207		totblks = howmany(length, fs->fs_fsize);
6208		goto out;
6209	}
6210        totblks = blkstofrags(fs, numblks);
6211	numblks -= NDADDR;
6212	/*
6213	 * Count all single, then double, then triple indirects required.
6214	 * Subtracting one indirects worth of blocks for each pass
6215	 * acknowledges one of each pointed to by the inode.
6216	 */
6217	for (;;) {
6218		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6219		numblks -= NINDIR(fs);
6220		if (numblks <= 0)
6221			break;
6222		numblks = howmany(numblks, NINDIR(fs));
6223	}
6224out:
6225	totblks = fsbtodb(fs, totblks);
6226	/*
6227	 * Handle sparse files.  We can't reclaim more blocks than the inode
6228	 * references.  We will correct it later in handle_complete_freeblks()
6229	 * when we know the real count.
6230	 */
6231	if (totblks > datablocks)
6232		return (0);
6233	return (datablocks - totblks);
6234}
6235
6236/*
6237 * Handle freeblocks for journaled softupdate filesystems.
6238 *
6239 * Contrary to normal softupdates, we must preserve the block pointers in
6240 * indirects until their subordinates are free.  This is to avoid journaling
6241 * every block that is freed which may consume more space than the journal
6242 * itself.  The recovery program will see the free block journals at the
6243 * base of the truncated area and traverse them to reclaim space.  The
6244 * pointers in the inode may be cleared immediately after the journal
6245 * records are written because each direct and indirect pointer in the
6246 * inode is recorded in a journal.  This permits full truncation to proceed
6247 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6248 *
6249 * The algorithm is as follows:
6250 * 1) Traverse the in-memory state and create journal entries to release
6251 *    the relevant blocks and full indirect trees.
6252 * 2) Traverse the indirect block chain adding partial truncation freework
6253 *    records to indirects in the path to lastlbn.  The freework will
6254 *    prevent new allocation dependencies from being satisfied in this
6255 *    indirect until the truncation completes.
6256 * 3) Read and lock the inode block, performing an update with the new size
6257 *    and pointers.  This prevents truncated data from becoming valid on
6258 *    disk through step 4.
6259 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6260 *    eliminate journal work for those records that do not require it.
6261 * 5) Schedule the journal records to be written followed by the inode block.
6262 * 6) Allocate any necessary frags for the end of file.
6263 * 7) Zero any partially truncated blocks.
6264 *
6265 * From this truncation proceeds asynchronously using the freework and
6266 * indir_trunc machinery.  The file will not be extended again into a
6267 * partially truncated indirect block until all work is completed but
6268 * the normal dependency mechanism ensures that it is rolled back/forward
6269 * as appropriate.  Further truncation may occur without delay and is
6270 * serialized in indir_trunc().
6271 */
6272void
6273softdep_journal_freeblocks(ip, cred, length, flags)
6274	struct inode *ip;	/* The inode whose length is to be reduced */
6275	struct ucred *cred;
6276	off_t length;		/* The new length for the file */
6277	int flags;		/* IO_EXT and/or IO_NORMAL */
6278{
6279	struct freeblks *freeblks, *fbn;
6280	struct worklist *wk, *wkn;
6281	struct inodedep *inodedep;
6282	struct jblkdep *jblkdep;
6283	struct allocdirect *adp, *adpn;
6284	struct fs *fs;
6285	struct buf *bp;
6286	struct vnode *vp;
6287	struct mount *mp;
6288	ufs2_daddr_t extblocks, datablocks;
6289	ufs_lbn_t tmpval, lbn, lastlbn;
6290	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6291
6292	fs = ip->i_fs;
6293	mp = UFSTOVFS(ip->i_ump);
6294	vp = ITOV(ip);
6295	needj = 1;
6296	iboff = -1;
6297	allocblock = 0;
6298	extblocks = 0;
6299	datablocks = 0;
6300	frags = 0;
6301	freeblks = newfreeblks(mp, ip);
6302	ACQUIRE_LOCK(&lk);
6303	/*
6304	 * If we're truncating a removed file that will never be written
6305	 * we don't need to journal the block frees.  The canceled journals
6306	 * for the allocations will suffice.
6307	 */
6308	dflags = DEPALLOC;
6309	if (IS_SNAPSHOT(ip))
6310		dflags |= NODELAY;
6311	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6312	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6313	    length == 0)
6314		needj = 0;
6315	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6316	    ip->i_number, length, needj);
6317	FREE_LOCK(&lk);
6318	/*
6319	 * Calculate the lbn that we are truncating to.  This results in -1
6320	 * if we're truncating the 0 bytes.  So it is the last lbn we want
6321	 * to keep, not the first lbn we want to truncate.
6322	 */
6323	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6324	lastoff = blkoff(fs, length);
6325	/*
6326	 * Compute frags we are keeping in lastlbn.  0 means all.
6327	 */
6328	if (lastlbn >= 0 && lastlbn < NDADDR) {
6329		frags = fragroundup(fs, lastoff);
6330		/* adp offset of last valid allocdirect. */
6331		iboff = lastlbn;
6332	} else if (lastlbn > 0)
6333		iboff = NDADDR;
6334	if (fs->fs_magic == FS_UFS2_MAGIC)
6335		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6336	/*
6337	 * Handle normal data blocks and indirects.  This section saves
6338	 * values used after the inode update to complete frag and indirect
6339	 * truncation.
6340	 */
6341	if ((flags & IO_NORMAL) != 0) {
6342		/*
6343		 * Handle truncation of whole direct and indirect blocks.
6344		 */
6345		for (i = iboff + 1; i < NDADDR; i++)
6346			setup_freedirect(freeblks, ip, i, needj);
6347		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6348		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6349			/* Release a whole indirect tree. */
6350			if (lbn > lastlbn) {
6351				setup_freeindir(freeblks, ip, i, -lbn -i,
6352				    needj);
6353				continue;
6354			}
6355			iboff = i + NDADDR;
6356			/*
6357			 * Traverse partially truncated indirect tree.
6358			 */
6359			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6360				setup_trunc_indir(freeblks, ip, -lbn - i,
6361				    lastlbn, DIP(ip, i_ib[i]));
6362		}
6363		/*
6364		 * Handle partial truncation to a frag boundary.
6365		 */
6366		if (frags) {
6367			ufs2_daddr_t blkno;
6368			long oldfrags;
6369
6370			oldfrags = blksize(fs, ip, lastlbn);
6371			blkno = DIP(ip, i_db[lastlbn]);
6372			if (blkno && oldfrags != frags) {
6373				oldfrags -= frags;
6374				oldfrags = numfrags(ip->i_fs, oldfrags);
6375				blkno += numfrags(ip->i_fs, frags);
6376				newfreework(ip->i_ump, freeblks, NULL, lastlbn,
6377				    blkno, oldfrags, 0, needj);
6378			} else if (blkno == 0)
6379				allocblock = 1;
6380		}
6381		/*
6382		 * Add a journal record for partial truncate if we are
6383		 * handling indirect blocks.  Non-indirects need no extra
6384		 * journaling.
6385		 */
6386		if (length != 0 && lastlbn >= NDADDR) {
6387			ip->i_flag |= IN_TRUNCATED;
6388			newjtrunc(freeblks, length, 0);
6389		}
6390		ip->i_size = length;
6391		DIP_SET(ip, i_size, ip->i_size);
6392		datablocks = DIP(ip, i_blocks) - extblocks;
6393		if (length != 0)
6394			datablocks = blkcount(ip->i_fs, datablocks, length);
6395		freeblks->fb_len = length;
6396	}
6397	if ((flags & IO_EXT) != 0) {
6398		for (i = 0; i < NXADDR; i++)
6399			setup_freeext(freeblks, ip, i, needj);
6400		ip->i_din2->di_extsize = 0;
6401		datablocks += extblocks;
6402	}
6403#ifdef QUOTA
6404	/* Reference the quotas in case the block count is wrong in the end. */
6405	quotaref(vp, freeblks->fb_quota);
6406	(void) chkdq(ip, -datablocks, NOCRED, 0);
6407#endif
6408	freeblks->fb_chkcnt = -datablocks;
6409	UFS_LOCK(ip->i_ump);
6410	fs->fs_pendingblocks += datablocks;
6411	UFS_UNLOCK(ip->i_ump);
6412	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6413	/*
6414	 * Handle truncation of incomplete alloc direct dependencies.  We
6415	 * hold the inode block locked to prevent incomplete dependencies
6416	 * from reaching the disk while we are eliminating those that
6417	 * have been truncated.  This is a partially inlined ffs_update().
6418	 */
6419	ufs_itimes(vp);
6420	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6421	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6422	    (int)fs->fs_bsize, cred, &bp);
6423	if (error) {
6424		brelse(bp);
6425		softdep_error("softdep_journal_freeblocks", error);
6426		return;
6427	}
6428	if (bp->b_bufsize == fs->fs_bsize)
6429		bp->b_flags |= B_CLUSTEROK;
6430	softdep_update_inodeblock(ip, bp, 0);
6431	if (ip->i_ump->um_fstype == UFS1)
6432		*((struct ufs1_dinode *)bp->b_data +
6433		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6434	else
6435		*((struct ufs2_dinode *)bp->b_data +
6436		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6437	ACQUIRE_LOCK(&lk);
6438	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6439	if ((inodedep->id_state & IOSTARTED) != 0)
6440		panic("softdep_setup_freeblocks: inode busy");
6441	/*
6442	 * Add the freeblks structure to the list of operations that
6443	 * must await the zero'ed inode being written to disk. If we
6444	 * still have a bitmap dependency (needj), then the inode
6445	 * has never been written to disk, so we can process the
6446	 * freeblks below once we have deleted the dependencies.
6447	 */
6448	if (needj)
6449		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6450	else
6451		freeblks->fb_state |= COMPLETE;
6452	if ((flags & IO_NORMAL) != 0) {
6453		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6454			if (adp->ad_offset > iboff)
6455				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6456				    freeblks);
6457			/*
6458			 * Truncate the allocdirect.  We could eliminate
6459			 * or modify journal records as well.
6460			 */
6461			else if (adp->ad_offset == iboff && frags)
6462				adp->ad_newsize = frags;
6463		}
6464	}
6465	if ((flags & IO_EXT) != 0)
6466		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6467			cancel_allocdirect(&inodedep->id_extupdt, adp,
6468			    freeblks);
6469	/*
6470	 * Scan the bufwait list for newblock dependencies that will never
6471	 * make it to disk.
6472	 */
6473	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6474		if (wk->wk_type != D_ALLOCDIRECT)
6475			continue;
6476		adp = WK_ALLOCDIRECT(wk);
6477		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6478		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6479			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6480			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6481			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6482		}
6483	}
6484	/*
6485	 * Add journal work.
6486	 */
6487	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6488		add_to_journal(&jblkdep->jb_list);
6489	FREE_LOCK(&lk);
6490	bdwrite(bp);
6491	/*
6492	 * Truncate dependency structures beyond length.
6493	 */
6494	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6495	/*
6496	 * This is only set when we need to allocate a fragment because
6497	 * none existed at the end of a frag-sized file.  It handles only
6498	 * allocating a new, zero filled block.
6499	 */
6500	if (allocblock) {
6501		ip->i_size = length - lastoff;
6502		DIP_SET(ip, i_size, ip->i_size);
6503		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6504		if (error != 0) {
6505			softdep_error("softdep_journal_freeblks", error);
6506			return;
6507		}
6508		ip->i_size = length;
6509		DIP_SET(ip, i_size, length);
6510		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6511		allocbuf(bp, frags);
6512		ffs_update(vp, 0);
6513		bawrite(bp);
6514	} else if (lastoff != 0 && vp->v_type != VDIR) {
6515		int size;
6516
6517		/*
6518		 * Zero the end of a truncated frag or block.
6519		 */
6520		size = sblksize(fs, length, lastlbn);
6521		error = bread(vp, lastlbn, size, cred, &bp);
6522		if (error) {
6523			softdep_error("softdep_journal_freeblks", error);
6524			return;
6525		}
6526		bzero((char *)bp->b_data + lastoff, size - lastoff);
6527		bawrite(bp);
6528
6529	}
6530	ACQUIRE_LOCK(&lk);
6531	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6532	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6533	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6534	/*
6535	 * We zero earlier truncations so they don't erroneously
6536	 * update i_blocks.
6537	 */
6538	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6539		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6540			fbn->fb_len = 0;
6541	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6542	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6543		freeblks->fb_state |= INPROGRESS;
6544	else
6545		freeblks = NULL;
6546	FREE_LOCK(&lk);
6547	if (freeblks)
6548		handle_workitem_freeblocks(freeblks, 0);
6549	trunc_pages(ip, length, extblocks, flags);
6550
6551}
6552
6553/*
6554 * Flush a JOP_SYNC to the journal.
6555 */
6556void
6557softdep_journal_fsync(ip)
6558	struct inode *ip;
6559{
6560	struct jfsync *jfsync;
6561
6562	if ((ip->i_flag & IN_TRUNCATED) == 0)
6563		return;
6564	ip->i_flag &= ~IN_TRUNCATED;
6565	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6566	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6567	jfsync->jfs_size = ip->i_size;
6568	jfsync->jfs_ino = ip->i_number;
6569	ACQUIRE_LOCK(&lk);
6570	add_to_journal(&jfsync->jfs_list);
6571	jwait(&jfsync->jfs_list, MNT_WAIT);
6572	FREE_LOCK(&lk);
6573}
6574
6575/*
6576 * Block de-allocation dependencies.
6577 *
6578 * When blocks are de-allocated, the on-disk pointers must be nullified before
6579 * the blocks are made available for use by other files.  (The true
6580 * requirement is that old pointers must be nullified before new on-disk
6581 * pointers are set.  We chose this slightly more stringent requirement to
6582 * reduce complexity.) Our implementation handles this dependency by updating
6583 * the inode (or indirect block) appropriately but delaying the actual block
6584 * de-allocation (i.e., freemap and free space count manipulation) until
6585 * after the updated versions reach stable storage.  After the disk is
6586 * updated, the blocks can be safely de-allocated whenever it is convenient.
6587 * This implementation handles only the common case of reducing a file's
6588 * length to zero. Other cases are handled by the conventional synchronous
6589 * write approach.
6590 *
6591 * The ffs implementation with which we worked double-checks
6592 * the state of the block pointers and file size as it reduces
6593 * a file's length.  Some of this code is replicated here in our
6594 * soft updates implementation.  The freeblks->fb_chkcnt field is
6595 * used to transfer a part of this information to the procedure
6596 * that eventually de-allocates the blocks.
6597 *
6598 * This routine should be called from the routine that shortens
6599 * a file's length, before the inode's size or block pointers
6600 * are modified. It will save the block pointer information for
6601 * later release and zero the inode so that the calling routine
6602 * can release it.
6603 */
6604void
6605softdep_setup_freeblocks(ip, length, flags)
6606	struct inode *ip;	/* The inode whose length is to be reduced */
6607	off_t length;		/* The new length for the file */
6608	int flags;		/* IO_EXT and/or IO_NORMAL */
6609{
6610	struct ufs1_dinode *dp1;
6611	struct ufs2_dinode *dp2;
6612	struct freeblks *freeblks;
6613	struct inodedep *inodedep;
6614	struct allocdirect *adp;
6615	struct buf *bp;
6616	struct fs *fs;
6617	ufs2_daddr_t extblocks, datablocks;
6618	struct mount *mp;
6619	int i, delay, error, dflags;
6620	ufs_lbn_t tmpval;
6621	ufs_lbn_t lbn;
6622
6623	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6624	    ip->i_number, length);
6625	fs = ip->i_fs;
6626	mp = UFSTOVFS(ip->i_ump);
6627	if (length != 0)
6628		panic("softdep_setup_freeblocks: non-zero length");
6629	freeblks = newfreeblks(mp, ip);
6630	extblocks = 0;
6631	datablocks = 0;
6632	if (fs->fs_magic == FS_UFS2_MAGIC)
6633		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6634	if ((flags & IO_NORMAL) != 0) {
6635		for (i = 0; i < NDADDR; i++)
6636			setup_freedirect(freeblks, ip, i, 0);
6637		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6638		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6639			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6640		ip->i_size = 0;
6641		DIP_SET(ip, i_size, 0);
6642		datablocks = DIP(ip, i_blocks) - extblocks;
6643	}
6644	if ((flags & IO_EXT) != 0) {
6645		for (i = 0; i < NXADDR; i++)
6646			setup_freeext(freeblks, ip, i, 0);
6647		ip->i_din2->di_extsize = 0;
6648		datablocks += extblocks;
6649	}
6650#ifdef QUOTA
6651	/* Reference the quotas in case the block count is wrong in the end. */
6652	quotaref(ITOV(ip), freeblks->fb_quota);
6653	(void) chkdq(ip, -datablocks, NOCRED, 0);
6654#endif
6655	freeblks->fb_chkcnt = -datablocks;
6656	UFS_LOCK(ip->i_ump);
6657	fs->fs_pendingblocks += datablocks;
6658	UFS_UNLOCK(ip->i_ump);
6659	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6660	/*
6661	 * Push the zero'ed inode to to its disk buffer so that we are free
6662	 * to delete its dependencies below. Once the dependencies are gone
6663	 * the buffer can be safely released.
6664	 */
6665	if ((error = bread(ip->i_devvp,
6666	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6667	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6668		brelse(bp);
6669		softdep_error("softdep_setup_freeblocks", error);
6670	}
6671	if (ip->i_ump->um_fstype == UFS1) {
6672		dp1 = ((struct ufs1_dinode *)bp->b_data +
6673		    ino_to_fsbo(fs, ip->i_number));
6674		ip->i_din1->di_freelink = dp1->di_freelink;
6675		*dp1 = *ip->i_din1;
6676	} else {
6677		dp2 = ((struct ufs2_dinode *)bp->b_data +
6678		    ino_to_fsbo(fs, ip->i_number));
6679		ip->i_din2->di_freelink = dp2->di_freelink;
6680		*dp2 = *ip->i_din2;
6681	}
6682	/*
6683	 * Find and eliminate any inode dependencies.
6684	 */
6685	ACQUIRE_LOCK(&lk);
6686	dflags = DEPALLOC;
6687	if (IS_SNAPSHOT(ip))
6688		dflags |= NODELAY;
6689	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6690	if ((inodedep->id_state & IOSTARTED) != 0)
6691		panic("softdep_setup_freeblocks: inode busy");
6692	/*
6693	 * Add the freeblks structure to the list of operations that
6694	 * must await the zero'ed inode being written to disk. If we
6695	 * still have a bitmap dependency (delay == 0), then the inode
6696	 * has never been written to disk, so we can process the
6697	 * freeblks below once we have deleted the dependencies.
6698	 */
6699	delay = (inodedep->id_state & DEPCOMPLETE);
6700	if (delay)
6701		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6702	else
6703		freeblks->fb_state |= COMPLETE;
6704	/*
6705	 * Because the file length has been truncated to zero, any
6706	 * pending block allocation dependency structures associated
6707	 * with this inode are obsolete and can simply be de-allocated.
6708	 * We must first merge the two dependency lists to get rid of
6709	 * any duplicate freefrag structures, then purge the merged list.
6710	 * If we still have a bitmap dependency, then the inode has never
6711	 * been written to disk, so we can free any fragments without delay.
6712	 */
6713	if (flags & IO_NORMAL) {
6714		merge_inode_lists(&inodedep->id_newinoupdt,
6715		    &inodedep->id_inoupdt);
6716		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6717			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6718			    freeblks);
6719	}
6720	if (flags & IO_EXT) {
6721		merge_inode_lists(&inodedep->id_newextupdt,
6722		    &inodedep->id_extupdt);
6723		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6724			cancel_allocdirect(&inodedep->id_extupdt, adp,
6725			    freeblks);
6726	}
6727	FREE_LOCK(&lk);
6728	bdwrite(bp);
6729	trunc_dependencies(ip, freeblks, -1, 0, flags);
6730	ACQUIRE_LOCK(&lk);
6731	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6732		(void) free_inodedep(inodedep);
6733	freeblks->fb_state |= DEPCOMPLETE;
6734	/*
6735	 * If the inode with zeroed block pointers is now on disk
6736	 * we can start freeing blocks.
6737	 */
6738	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6739		freeblks->fb_state |= INPROGRESS;
6740	else
6741		freeblks = NULL;
6742	FREE_LOCK(&lk);
6743	if (freeblks)
6744		handle_workitem_freeblocks(freeblks, 0);
6745	trunc_pages(ip, length, extblocks, flags);
6746}
6747
6748/*
6749 * Eliminate pages from the page cache that back parts of this inode and
6750 * adjust the vnode pager's idea of our size.  This prevents stale data
6751 * from hanging around in the page cache.
6752 */
6753static void
6754trunc_pages(ip, length, extblocks, flags)
6755	struct inode *ip;
6756	off_t length;
6757	ufs2_daddr_t extblocks;
6758	int flags;
6759{
6760	struct vnode *vp;
6761	struct fs *fs;
6762	ufs_lbn_t lbn;
6763	off_t end, extend;
6764
6765	vp = ITOV(ip);
6766	fs = ip->i_fs;
6767	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6768	if ((flags & IO_EXT) != 0)
6769		vn_pages_remove(vp, extend, 0);
6770	if ((flags & IO_NORMAL) == 0)
6771		return;
6772	BO_LOCK(&vp->v_bufobj);
6773	drain_output(vp);
6774	BO_UNLOCK(&vp->v_bufobj);
6775	/*
6776	 * The vnode pager eliminates file pages we eliminate indirects
6777	 * below.
6778	 */
6779	vnode_pager_setsize(vp, length);
6780	/*
6781	 * Calculate the end based on the last indirect we want to keep.  If
6782	 * the block extends into indirects we can just use the negative of
6783	 * its lbn.  Doubles and triples exist at lower numbers so we must
6784	 * be careful not to remove those, if they exist.  double and triple
6785	 * indirect lbns do not overlap with others so it is not important
6786	 * to verify how many levels are required.
6787	 */
6788	lbn = lblkno(fs, length);
6789	if (lbn >= NDADDR) {
6790		/* Calculate the virtual lbn of the triple indirect. */
6791		lbn = -lbn - (NIADDR - 1);
6792		end = OFF_TO_IDX(lblktosize(fs, lbn));
6793	} else
6794		end = extend;
6795	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
6796}
6797
6798/*
6799 * See if the buf bp is in the range eliminated by truncation.
6800 */
6801static int
6802trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
6803	struct buf *bp;
6804	int *blkoffp;
6805	ufs_lbn_t lastlbn;
6806	int lastoff;
6807	int flags;
6808{
6809	ufs_lbn_t lbn;
6810
6811	*blkoffp = 0;
6812	/* Only match ext/normal blocks as appropriate. */
6813	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
6814	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
6815		return (0);
6816	/* ALTDATA is always a full truncation. */
6817	if ((bp->b_xflags & BX_ALTDATA) != 0)
6818		return (1);
6819	/* -1 is full truncation. */
6820	if (lastlbn == -1)
6821		return (1);
6822	/*
6823	 * If this is a partial truncate we only want those
6824	 * blocks and indirect blocks that cover the range
6825	 * we're after.
6826	 */
6827	lbn = bp->b_lblkno;
6828	if (lbn < 0)
6829		lbn = -(lbn + lbn_level(lbn));
6830	if (lbn < lastlbn)
6831		return (0);
6832	/* Here we only truncate lblkno if it's partial. */
6833	if (lbn == lastlbn) {
6834		if (lastoff == 0)
6835			return (0);
6836		*blkoffp = lastoff;
6837	}
6838	return (1);
6839}
6840
6841/*
6842 * Eliminate any dependencies that exist in memory beyond lblkno:off
6843 */
6844static void
6845trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
6846	struct inode *ip;
6847	struct freeblks *freeblks;
6848	ufs_lbn_t lastlbn;
6849	int lastoff;
6850	int flags;
6851{
6852	struct bufobj *bo;
6853	struct vnode *vp;
6854	struct buf *bp;
6855	struct fs *fs;
6856	int blkoff;
6857
6858	/*
6859	 * We must wait for any I/O in progress to finish so that
6860	 * all potential buffers on the dirty list will be visible.
6861	 * Once they are all there, walk the list and get rid of
6862	 * any dependencies.
6863	 */
6864	fs = ip->i_fs;
6865	vp = ITOV(ip);
6866	bo = &vp->v_bufobj;
6867	BO_LOCK(bo);
6868	drain_output(vp);
6869	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
6870		bp->b_vflags &= ~BV_SCANNED;
6871restart:
6872	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
6873		if (bp->b_vflags & BV_SCANNED)
6874			continue;
6875		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6876			bp->b_vflags |= BV_SCANNED;
6877			continue;
6878		}
6879		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
6880			goto restart;
6881		BO_UNLOCK(bo);
6882		if (deallocate_dependencies(bp, freeblks, blkoff))
6883			bqrelse(bp);
6884		else
6885			brelse(bp);
6886		BO_LOCK(bo);
6887		goto restart;
6888	}
6889	/*
6890	 * Now do the work of vtruncbuf while also matching indirect blocks.
6891	 */
6892	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
6893		bp->b_vflags &= ~BV_SCANNED;
6894cleanrestart:
6895	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
6896		if (bp->b_vflags & BV_SCANNED)
6897			continue;
6898		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6899			bp->b_vflags |= BV_SCANNED;
6900			continue;
6901		}
6902		if (BUF_LOCK(bp,
6903		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6904		    BO_LOCKPTR(bo)) == ENOLCK) {
6905			BO_LOCK(bo);
6906			goto cleanrestart;
6907		}
6908		bp->b_vflags |= BV_SCANNED;
6909		bremfree(bp);
6910		if (blkoff != 0) {
6911			allocbuf(bp, blkoff);
6912			bqrelse(bp);
6913		} else {
6914			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
6915			brelse(bp);
6916		}
6917		BO_LOCK(bo);
6918		goto cleanrestart;
6919	}
6920	drain_output(vp);
6921	BO_UNLOCK(bo);
6922}
6923
6924static int
6925cancel_pagedep(pagedep, freeblks, blkoff)
6926	struct pagedep *pagedep;
6927	struct freeblks *freeblks;
6928	int blkoff;
6929{
6930	struct jremref *jremref;
6931	struct jmvref *jmvref;
6932	struct dirrem *dirrem, *tmp;
6933	int i;
6934
6935	/*
6936	 * Copy any directory remove dependencies to the list
6937	 * to be processed after the freeblks proceeds.  If
6938	 * directory entry never made it to disk they
6939	 * can be dumped directly onto the work list.
6940	 */
6941	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
6942		/* Skip this directory removal if it is intended to remain. */
6943		if (dirrem->dm_offset < blkoff)
6944			continue;
6945		/*
6946		 * If there are any dirrems we wait for the journal write
6947		 * to complete and then restart the buf scan as the lock
6948		 * has been dropped.
6949		 */
6950		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
6951			jwait(&jremref->jr_list, MNT_WAIT);
6952			return (ERESTART);
6953		}
6954		LIST_REMOVE(dirrem, dm_next);
6955		dirrem->dm_dirinum = pagedep->pd_ino;
6956		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
6957	}
6958	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
6959		jwait(&jmvref->jm_list, MNT_WAIT);
6960		return (ERESTART);
6961	}
6962	/*
6963	 * When we're partially truncating a pagedep we just want to flush
6964	 * journal entries and return.  There can not be any adds in the
6965	 * truncated portion of the directory and newblk must remain if
6966	 * part of the block remains.
6967	 */
6968	if (blkoff != 0) {
6969		struct diradd *dap;
6970
6971		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
6972			if (dap->da_offset > blkoff)
6973				panic("cancel_pagedep: diradd %p off %d > %d",
6974				    dap, dap->da_offset, blkoff);
6975		for (i = 0; i < DAHASHSZ; i++)
6976			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
6977				if (dap->da_offset > blkoff)
6978					panic("cancel_pagedep: diradd %p off %d > %d",
6979					    dap, dap->da_offset, blkoff);
6980		return (0);
6981	}
6982	/*
6983	 * There should be no directory add dependencies present
6984	 * as the directory could not be truncated until all
6985	 * children were removed.
6986	 */
6987	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
6988	    ("deallocate_dependencies: pendinghd != NULL"));
6989	for (i = 0; i < DAHASHSZ; i++)
6990		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
6991		    ("deallocate_dependencies: diraddhd != NULL"));
6992	if ((pagedep->pd_state & NEWBLOCK) != 0)
6993		free_newdirblk(pagedep->pd_newdirblk);
6994	if (free_pagedep(pagedep) == 0)
6995		panic("Failed to free pagedep %p", pagedep);
6996	return (0);
6997}
6998
6999/*
7000 * Reclaim any dependency structures from a buffer that is about to
7001 * be reallocated to a new vnode. The buffer must be locked, thus,
7002 * no I/O completion operations can occur while we are manipulating
7003 * its associated dependencies. The mutex is held so that other I/O's
7004 * associated with related dependencies do not occur.
7005 */
7006static int
7007deallocate_dependencies(bp, freeblks, off)
7008	struct buf *bp;
7009	struct freeblks *freeblks;
7010	int off;
7011{
7012	struct indirdep *indirdep;
7013	struct pagedep *pagedep;
7014	struct allocdirect *adp;
7015	struct worklist *wk, *wkn;
7016
7017	ACQUIRE_LOCK(&lk);
7018	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7019		switch (wk->wk_type) {
7020		case D_INDIRDEP:
7021			indirdep = WK_INDIRDEP(wk);
7022			if (bp->b_lblkno >= 0 ||
7023			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7024				panic("deallocate_dependencies: not indir");
7025			cancel_indirdep(indirdep, bp, freeblks);
7026			continue;
7027
7028		case D_PAGEDEP:
7029			pagedep = WK_PAGEDEP(wk);
7030			if (cancel_pagedep(pagedep, freeblks, off)) {
7031				FREE_LOCK(&lk);
7032				return (ERESTART);
7033			}
7034			continue;
7035
7036		case D_ALLOCINDIR:
7037			/*
7038			 * Simply remove the allocindir, we'll find it via
7039			 * the indirdep where we can clear pointers if
7040			 * needed.
7041			 */
7042			WORKLIST_REMOVE(wk);
7043			continue;
7044
7045		case D_FREEWORK:
7046			/*
7047			 * A truncation is waiting for the zero'd pointers
7048			 * to be written.  It can be freed when the freeblks
7049			 * is journaled.
7050			 */
7051			WORKLIST_REMOVE(wk);
7052			wk->wk_state |= ONDEPLIST;
7053			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7054			break;
7055
7056		case D_ALLOCDIRECT:
7057			adp = WK_ALLOCDIRECT(wk);
7058			if (off != 0)
7059				continue;
7060			/* FALLTHROUGH */
7061		default:
7062			panic("deallocate_dependencies: Unexpected type %s",
7063			    TYPENAME(wk->wk_type));
7064			/* NOTREACHED */
7065		}
7066	}
7067	FREE_LOCK(&lk);
7068	/*
7069	 * Don't throw away this buf, we were partially truncating and
7070	 * some deps may always remain.
7071	 */
7072	if (off) {
7073		allocbuf(bp, off);
7074		bp->b_vflags |= BV_SCANNED;
7075		return (EBUSY);
7076	}
7077	bp->b_flags |= B_INVAL | B_NOCACHE;
7078
7079	return (0);
7080}
7081
7082/*
7083 * An allocdirect is being canceled due to a truncate.  We must make sure
7084 * the journal entry is released in concert with the blkfree that releases
7085 * the storage.  Completed journal entries must not be released until the
7086 * space is no longer pointed to by the inode or in the bitmap.
7087 */
7088static void
7089cancel_allocdirect(adphead, adp, freeblks)
7090	struct allocdirectlst *adphead;
7091	struct allocdirect *adp;
7092	struct freeblks *freeblks;
7093{
7094	struct freework *freework;
7095	struct newblk *newblk;
7096	struct worklist *wk;
7097
7098	TAILQ_REMOVE(adphead, adp, ad_next);
7099	newblk = (struct newblk *)adp;
7100	freework = NULL;
7101	/*
7102	 * Find the correct freework structure.
7103	 */
7104	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7105		if (wk->wk_type != D_FREEWORK)
7106			continue;
7107		freework = WK_FREEWORK(wk);
7108		if (freework->fw_blkno == newblk->nb_newblkno)
7109			break;
7110	}
7111	if (freework == NULL)
7112		panic("cancel_allocdirect: Freework not found");
7113	/*
7114	 * If a newblk exists at all we still have the journal entry that
7115	 * initiated the allocation so we do not need to journal the free.
7116	 */
7117	cancel_jfreeblk(freeblks, freework->fw_blkno);
7118	/*
7119	 * If the journal hasn't been written the jnewblk must be passed
7120	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7121	 * this by linking the journal dependency into the freework to be
7122	 * freed when freework_freeblock() is called.  If the journal has
7123	 * been written we can simply reclaim the journal space when the
7124	 * freeblks work is complete.
7125	 */
7126	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7127	    &freeblks->fb_jwork);
7128	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7129}
7130
7131
7132/*
7133 * Cancel a new block allocation.  May be an indirect or direct block.  We
7134 * remove it from various lists and return any journal record that needs to
7135 * be resolved by the caller.
7136 *
7137 * A special consideration is made for indirects which were never pointed
7138 * at on disk and will never be found once this block is released.
7139 */
7140static struct jnewblk *
7141cancel_newblk(newblk, wk, wkhd)
7142	struct newblk *newblk;
7143	struct worklist *wk;
7144	struct workhead *wkhd;
7145{
7146	struct jnewblk *jnewblk;
7147
7148	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7149
7150	newblk->nb_state |= GOINGAWAY;
7151	/*
7152	 * Previously we traversed the completedhd on each indirdep
7153	 * attached to this newblk to cancel them and gather journal
7154	 * work.  Since we need only the oldest journal segment and
7155	 * the lowest point on the tree will always have the oldest
7156	 * journal segment we are free to release the segments
7157	 * of any subordinates and may leave the indirdep list to
7158	 * indirdep_complete() when this newblk is freed.
7159	 */
7160	if (newblk->nb_state & ONDEPLIST) {
7161		newblk->nb_state &= ~ONDEPLIST;
7162		LIST_REMOVE(newblk, nb_deps);
7163	}
7164	if (newblk->nb_state & ONWORKLIST)
7165		WORKLIST_REMOVE(&newblk->nb_list);
7166	/*
7167	 * If the journal entry hasn't been written we save a pointer to
7168	 * the dependency that frees it until it is written or the
7169	 * superseding operation completes.
7170	 */
7171	jnewblk = newblk->nb_jnewblk;
7172	if (jnewblk != NULL && wk != NULL) {
7173		newblk->nb_jnewblk = NULL;
7174		jnewblk->jn_dep = wk;
7175	}
7176	if (!LIST_EMPTY(&newblk->nb_jwork))
7177		jwork_move(wkhd, &newblk->nb_jwork);
7178	/*
7179	 * When truncating we must free the newdirblk early to remove
7180	 * the pagedep from the hash before returning.
7181	 */
7182	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7183		free_newdirblk(WK_NEWDIRBLK(wk));
7184	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7185		panic("cancel_newblk: extra newdirblk");
7186
7187	return (jnewblk);
7188}
7189
7190/*
7191 * Schedule the freefrag associated with a newblk to be released once
7192 * the pointers are written and the previous block is no longer needed.
7193 */
7194static void
7195newblk_freefrag(newblk)
7196	struct newblk *newblk;
7197{
7198	struct freefrag *freefrag;
7199
7200	if (newblk->nb_freefrag == NULL)
7201		return;
7202	freefrag = newblk->nb_freefrag;
7203	newblk->nb_freefrag = NULL;
7204	freefrag->ff_state |= COMPLETE;
7205	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7206		add_to_worklist(&freefrag->ff_list, 0);
7207}
7208
7209/*
7210 * Free a newblk. Generate a new freefrag work request if appropriate.
7211 * This must be called after the inode pointer and any direct block pointers
7212 * are valid or fully removed via truncate or frag extension.
7213 */
7214static void
7215free_newblk(newblk)
7216	struct newblk *newblk;
7217{
7218	struct indirdep *indirdep;
7219	struct worklist *wk;
7220
7221	KASSERT(newblk->nb_jnewblk == NULL,
7222	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7223	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7224	    ("free_newblk: unclaimed newblk"));
7225	rw_assert(&lk, RA_WLOCKED);
7226	newblk_freefrag(newblk);
7227	if (newblk->nb_state & ONDEPLIST)
7228		LIST_REMOVE(newblk, nb_deps);
7229	if (newblk->nb_state & ONWORKLIST)
7230		WORKLIST_REMOVE(&newblk->nb_list);
7231	LIST_REMOVE(newblk, nb_hash);
7232	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7233		free_newdirblk(WK_NEWDIRBLK(wk));
7234	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7235		panic("free_newblk: extra newdirblk");
7236	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7237		indirdep_complete(indirdep);
7238	handle_jwork(&newblk->nb_jwork);
7239	WORKITEM_FREE(newblk, D_NEWBLK);
7240}
7241
7242/*
7243 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7244 * This routine must be called with splbio interrupts blocked.
7245 */
7246static void
7247free_newdirblk(newdirblk)
7248	struct newdirblk *newdirblk;
7249{
7250	struct pagedep *pagedep;
7251	struct diradd *dap;
7252	struct worklist *wk;
7253
7254	rw_assert(&lk, RA_WLOCKED);
7255	WORKLIST_REMOVE(&newdirblk->db_list);
7256	/*
7257	 * If the pagedep is still linked onto the directory buffer
7258	 * dependency chain, then some of the entries on the
7259	 * pd_pendinghd list may not be committed to disk yet. In
7260	 * this case, we will simply clear the NEWBLOCK flag and
7261	 * let the pd_pendinghd list be processed when the pagedep
7262	 * is next written. If the pagedep is no longer on the buffer
7263	 * dependency chain, then all the entries on the pd_pending
7264	 * list are committed to disk and we can free them here.
7265	 */
7266	pagedep = newdirblk->db_pagedep;
7267	pagedep->pd_state &= ~NEWBLOCK;
7268	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7269		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7270			free_diradd(dap, NULL);
7271		/*
7272		 * If no dependencies remain, the pagedep will be freed.
7273		 */
7274		free_pagedep(pagedep);
7275	}
7276	/* Should only ever be one item in the list. */
7277	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7278		WORKLIST_REMOVE(wk);
7279		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7280	}
7281	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7282}
7283
7284/*
7285 * Prepare an inode to be freed. The actual free operation is not
7286 * done until the zero'ed inode has been written to disk.
7287 */
7288void
7289softdep_freefile(pvp, ino, mode)
7290	struct vnode *pvp;
7291	ino_t ino;
7292	int mode;
7293{
7294	struct inode *ip = VTOI(pvp);
7295	struct inodedep *inodedep;
7296	struct freefile *freefile;
7297	struct freeblks *freeblks;
7298
7299	/*
7300	 * This sets up the inode de-allocation dependency.
7301	 */
7302	freefile = malloc(sizeof(struct freefile),
7303		M_FREEFILE, M_SOFTDEP_FLAGS);
7304	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7305	freefile->fx_mode = mode;
7306	freefile->fx_oldinum = ino;
7307	freefile->fx_devvp = ip->i_devvp;
7308	LIST_INIT(&freefile->fx_jwork);
7309	UFS_LOCK(ip->i_ump);
7310	ip->i_fs->fs_pendinginodes += 1;
7311	UFS_UNLOCK(ip->i_ump);
7312
7313	/*
7314	 * If the inodedep does not exist, then the zero'ed inode has
7315	 * been written to disk. If the allocated inode has never been
7316	 * written to disk, then the on-disk inode is zero'ed. In either
7317	 * case we can free the file immediately.  If the journal was
7318	 * canceled before being written the inode will never make it to
7319	 * disk and we must send the canceled journal entrys to
7320	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7321	 * Any blocks waiting on the inode to write can be safely freed
7322	 * here as it will never been written.
7323	 */
7324	ACQUIRE_LOCK(&lk);
7325	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7326	if (inodedep) {
7327		/*
7328		 * Clear out freeblks that no longer need to reference
7329		 * this inode.
7330		 */
7331		while ((freeblks =
7332		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7333			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7334			    fb_next);
7335			freeblks->fb_state &= ~ONDEPLIST;
7336		}
7337		/*
7338		 * Remove this inode from the unlinked list.
7339		 */
7340		if (inodedep->id_state & UNLINKED) {
7341			/*
7342			 * Save the journal work to be freed with the bitmap
7343			 * before we clear UNLINKED.  Otherwise it can be lost
7344			 * if the inode block is written.
7345			 */
7346			handle_bufwait(inodedep, &freefile->fx_jwork);
7347			clear_unlinked_inodedep(inodedep);
7348			/* Re-acquire inodedep as we've dropped lk. */
7349			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7350		}
7351	}
7352	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7353		FREE_LOCK(&lk);
7354		handle_workitem_freefile(freefile);
7355		return;
7356	}
7357	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7358		inodedep->id_state |= GOINGAWAY;
7359	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7360	FREE_LOCK(&lk);
7361	if (ip->i_number == ino)
7362		ip->i_flag |= IN_MODIFIED;
7363}
7364
7365/*
7366 * Check to see if an inode has never been written to disk. If
7367 * so free the inodedep and return success, otherwise return failure.
7368 * This routine must be called with splbio interrupts blocked.
7369 *
7370 * If we still have a bitmap dependency, then the inode has never
7371 * been written to disk. Drop the dependency as it is no longer
7372 * necessary since the inode is being deallocated. We set the
7373 * ALLCOMPLETE flags since the bitmap now properly shows that the
7374 * inode is not allocated. Even if the inode is actively being
7375 * written, it has been rolled back to its zero'ed state, so we
7376 * are ensured that a zero inode is what is on the disk. For short
7377 * lived files, this change will usually result in removing all the
7378 * dependencies from the inode so that it can be freed immediately.
7379 */
7380static int
7381check_inode_unwritten(inodedep)
7382	struct inodedep *inodedep;
7383{
7384
7385	rw_assert(&lk, RA_WLOCKED);
7386
7387	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7388	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7389	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7390	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7391	    !LIST_EMPTY(&inodedep->id_inowait) ||
7392	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7393	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7394	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7395	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7396	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7397	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7398	    inodedep->id_mkdiradd != NULL ||
7399	    inodedep->id_nlinkdelta != 0)
7400		return (0);
7401	/*
7402	 * Another process might be in initiate_write_inodeblock_ufs[12]
7403	 * trying to allocate memory without holding "Softdep Lock".
7404	 */
7405	if ((inodedep->id_state & IOSTARTED) != 0 &&
7406	    inodedep->id_savedino1 == NULL)
7407		return (0);
7408
7409	if (inodedep->id_state & ONDEPLIST)
7410		LIST_REMOVE(inodedep, id_deps);
7411	inodedep->id_state &= ~ONDEPLIST;
7412	inodedep->id_state |= ALLCOMPLETE;
7413	inodedep->id_bmsafemap = NULL;
7414	if (inodedep->id_state & ONWORKLIST)
7415		WORKLIST_REMOVE(&inodedep->id_list);
7416	if (inodedep->id_savedino1 != NULL) {
7417		free(inodedep->id_savedino1, M_SAVEDINO);
7418		inodedep->id_savedino1 = NULL;
7419	}
7420	if (free_inodedep(inodedep) == 0)
7421		panic("check_inode_unwritten: busy inode");
7422	return (1);
7423}
7424
7425/*
7426 * Try to free an inodedep structure. Return 1 if it could be freed.
7427 */
7428static int
7429free_inodedep(inodedep)
7430	struct inodedep *inodedep;
7431{
7432
7433	rw_assert(&lk, RA_WLOCKED);
7434	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7435	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7436	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7437	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7438	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7439	    !LIST_EMPTY(&inodedep->id_inowait) ||
7440	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7441	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7442	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7443	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7444	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7445	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7446	    inodedep->id_mkdiradd != NULL ||
7447	    inodedep->id_nlinkdelta != 0 ||
7448	    inodedep->id_savedino1 != NULL)
7449		return (0);
7450	if (inodedep->id_state & ONDEPLIST)
7451		LIST_REMOVE(inodedep, id_deps);
7452	LIST_REMOVE(inodedep, id_hash);
7453	WORKITEM_FREE(inodedep, D_INODEDEP);
7454	return (1);
7455}
7456
7457/*
7458 * Free the block referenced by a freework structure.  The parent freeblks
7459 * structure is released and completed when the final cg bitmap reaches
7460 * the disk.  This routine may be freeing a jnewblk which never made it to
7461 * disk in which case we do not have to wait as the operation is undone
7462 * in memory immediately.
7463 */
7464static void
7465freework_freeblock(freework)
7466	struct freework *freework;
7467{
7468	struct freeblks *freeblks;
7469	struct jnewblk *jnewblk;
7470	struct ufsmount *ump;
7471	struct workhead wkhd;
7472	struct fs *fs;
7473	int bsize;
7474	int needj;
7475
7476	rw_assert(&lk, RA_WLOCKED);
7477	/*
7478	 * Handle partial truncate separately.
7479	 */
7480	if (freework->fw_indir) {
7481		complete_trunc_indir(freework);
7482		return;
7483	}
7484	freeblks = freework->fw_freeblks;
7485	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7486	fs = ump->um_fs;
7487	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7488	bsize = lfragtosize(fs, freework->fw_frags);
7489	LIST_INIT(&wkhd);
7490	/*
7491	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7492	 * on the indirblk hashtable and prevents premature freeing.
7493	 */
7494	freework->fw_state |= DEPCOMPLETE;
7495	/*
7496	 * SUJ needs to wait for the segment referencing freed indirect
7497	 * blocks to expire so that we know the checker will not confuse
7498	 * a re-allocated indirect block with its old contents.
7499	 */
7500	if (needj && freework->fw_lbn <= -NDADDR)
7501		indirblk_insert(freework);
7502	/*
7503	 * If we are canceling an existing jnewblk pass it to the free
7504	 * routine, otherwise pass the freeblk which will ultimately
7505	 * release the freeblks.  If we're not journaling, we can just
7506	 * free the freeblks immediately.
7507	 */
7508	jnewblk = freework->fw_jnewblk;
7509	if (jnewblk != NULL) {
7510		cancel_jnewblk(jnewblk, &wkhd);
7511		needj = 0;
7512	} else if (needj) {
7513		freework->fw_state |= DELAYEDFREE;
7514		freeblks->fb_cgwait++;
7515		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7516	}
7517	FREE_LOCK(&lk);
7518	freeblks_free(ump, freeblks, btodb(bsize));
7519	CTR4(KTR_SUJ,
7520	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
7521	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7522	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7523	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7524	ACQUIRE_LOCK(&lk);
7525	/*
7526	 * The jnewblk will be discarded and the bits in the map never
7527	 * made it to disk.  We can immediately free the freeblk.
7528	 */
7529	if (needj == 0)
7530		handle_written_freework(freework);
7531}
7532
7533/*
7534 * We enqueue freework items that need processing back on the freeblks and
7535 * add the freeblks to the worklist.  This makes it easier to find all work
7536 * required to flush a truncation in process_truncates().
7537 */
7538static void
7539freework_enqueue(freework)
7540	struct freework *freework;
7541{
7542	struct freeblks *freeblks;
7543
7544	freeblks = freework->fw_freeblks;
7545	if ((freework->fw_state & INPROGRESS) == 0)
7546		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7547	if ((freeblks->fb_state &
7548	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7549	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7550		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7551}
7552
7553/*
7554 * Start, continue, or finish the process of freeing an indirect block tree.
7555 * The free operation may be paused at any point with fw_off containing the
7556 * offset to restart from.  This enables us to implement some flow control
7557 * for large truncates which may fan out and generate a huge number of
7558 * dependencies.
7559 */
7560static void
7561handle_workitem_indirblk(freework)
7562	struct freework *freework;
7563{
7564	struct freeblks *freeblks;
7565	struct ufsmount *ump;
7566	struct fs *fs;
7567
7568	freeblks = freework->fw_freeblks;
7569	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7570	fs = ump->um_fs;
7571	if (freework->fw_state & DEPCOMPLETE) {
7572		handle_written_freework(freework);
7573		return;
7574	}
7575	if (freework->fw_off == NINDIR(fs)) {
7576		freework_freeblock(freework);
7577		return;
7578	}
7579	freework->fw_state |= INPROGRESS;
7580	FREE_LOCK(&lk);
7581	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7582	    freework->fw_lbn);
7583	ACQUIRE_LOCK(&lk);
7584}
7585
7586/*
7587 * Called when a freework structure attached to a cg buf is written.  The
7588 * ref on either the parent or the freeblks structure is released and
7589 * the freeblks is added back to the worklist if there is more work to do.
7590 */
7591static void
7592handle_written_freework(freework)
7593	struct freework *freework;
7594{
7595	struct freeblks *freeblks;
7596	struct freework *parent;
7597
7598	freeblks = freework->fw_freeblks;
7599	parent = freework->fw_parent;
7600	if (freework->fw_state & DELAYEDFREE)
7601		freeblks->fb_cgwait--;
7602	freework->fw_state |= COMPLETE;
7603	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7604		WORKITEM_FREE(freework, D_FREEWORK);
7605	if (parent) {
7606		if (--parent->fw_ref == 0)
7607			freework_enqueue(parent);
7608		return;
7609	}
7610	if (--freeblks->fb_ref != 0)
7611		return;
7612	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7613	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7614		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7615}
7616
7617/*
7618 * This workitem routine performs the block de-allocation.
7619 * The workitem is added to the pending list after the updated
7620 * inode block has been written to disk.  As mentioned above,
7621 * checks regarding the number of blocks de-allocated (compared
7622 * to the number of blocks allocated for the file) are also
7623 * performed in this function.
7624 */
7625static int
7626handle_workitem_freeblocks(freeblks, flags)
7627	struct freeblks *freeblks;
7628	int flags;
7629{
7630	struct freework *freework;
7631	struct newblk *newblk;
7632	struct allocindir *aip;
7633	struct ufsmount *ump;
7634	struct worklist *wk;
7635
7636	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7637	    ("handle_workitem_freeblocks: Journal entries not written."));
7638	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7639	ACQUIRE_LOCK(&lk);
7640	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7641		WORKLIST_REMOVE(wk);
7642		switch (wk->wk_type) {
7643		case D_DIRREM:
7644			wk->wk_state |= COMPLETE;
7645			add_to_worklist(wk, 0);
7646			continue;
7647
7648		case D_ALLOCDIRECT:
7649			free_newblk(WK_NEWBLK(wk));
7650			continue;
7651
7652		case D_ALLOCINDIR:
7653			aip = WK_ALLOCINDIR(wk);
7654			freework = NULL;
7655			if (aip->ai_state & DELAYEDFREE) {
7656				FREE_LOCK(&lk);
7657				freework = newfreework(ump, freeblks, NULL,
7658				    aip->ai_lbn, aip->ai_newblkno,
7659				    ump->um_fs->fs_frag, 0, 0);
7660				ACQUIRE_LOCK(&lk);
7661			}
7662			newblk = WK_NEWBLK(wk);
7663			if (newblk->nb_jnewblk) {
7664				freework->fw_jnewblk = newblk->nb_jnewblk;
7665				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7666				newblk->nb_jnewblk = NULL;
7667			}
7668			free_newblk(newblk);
7669			continue;
7670
7671		case D_FREEWORK:
7672			freework = WK_FREEWORK(wk);
7673			if (freework->fw_lbn <= -NDADDR)
7674				handle_workitem_indirblk(freework);
7675			else
7676				freework_freeblock(freework);
7677			continue;
7678		default:
7679			panic("handle_workitem_freeblocks: Unknown type %s",
7680			    TYPENAME(wk->wk_type));
7681		}
7682	}
7683	if (freeblks->fb_ref != 0) {
7684		freeblks->fb_state &= ~INPROGRESS;
7685		wake_worklist(&freeblks->fb_list);
7686		freeblks = NULL;
7687	}
7688	FREE_LOCK(&lk);
7689	if (freeblks)
7690		return handle_complete_freeblocks(freeblks, flags);
7691	return (0);
7692}
7693
7694/*
7695 * Handle completion of block free via truncate.  This allows fs_pending
7696 * to track the actual free block count more closely than if we only updated
7697 * it at the end.  We must be careful to handle cases where the block count
7698 * on free was incorrect.
7699 */
7700static void
7701freeblks_free(ump, freeblks, blocks)
7702	struct ufsmount *ump;
7703	struct freeblks *freeblks;
7704	int blocks;
7705{
7706	struct fs *fs;
7707	ufs2_daddr_t remain;
7708
7709	UFS_LOCK(ump);
7710	remain = -freeblks->fb_chkcnt;
7711	freeblks->fb_chkcnt += blocks;
7712	if (remain > 0) {
7713		if (remain < blocks)
7714			blocks = remain;
7715		fs = ump->um_fs;
7716		fs->fs_pendingblocks -= blocks;
7717	}
7718	UFS_UNLOCK(ump);
7719}
7720
7721/*
7722 * Once all of the freework workitems are complete we can retire the
7723 * freeblocks dependency and any journal work awaiting completion.  This
7724 * can not be called until all other dependencies are stable on disk.
7725 */
7726static int
7727handle_complete_freeblocks(freeblks, flags)
7728	struct freeblks *freeblks;
7729	int flags;
7730{
7731	struct inodedep *inodedep;
7732	struct inode *ip;
7733	struct vnode *vp;
7734	struct fs *fs;
7735	struct ufsmount *ump;
7736	ufs2_daddr_t spare;
7737
7738	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7739	fs = ump->um_fs;
7740	flags = LK_EXCLUSIVE | flags;
7741	spare = freeblks->fb_chkcnt;
7742
7743	/*
7744	 * If we did not release the expected number of blocks we may have
7745	 * to adjust the inode block count here.  Only do so if it wasn't
7746	 * a truncation to zero and the modrev still matches.
7747	 */
7748	if (spare && freeblks->fb_len != 0) {
7749		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7750		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7751			return (EBUSY);
7752		ip = VTOI(vp);
7753		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7754			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7755			ip->i_flag |= IN_CHANGE;
7756			/*
7757			 * We must wait so this happens before the
7758			 * journal is reclaimed.
7759			 */
7760			ffs_update(vp, 1);
7761		}
7762		vput(vp);
7763	}
7764	if (spare < 0) {
7765		UFS_LOCK(ump);
7766		fs->fs_pendingblocks += spare;
7767		UFS_UNLOCK(ump);
7768	}
7769#ifdef QUOTA
7770	/* Handle spare. */
7771	if (spare)
7772		quotaadj(freeblks->fb_quota, ump, -spare);
7773	quotarele(freeblks->fb_quota);
7774#endif
7775	ACQUIRE_LOCK(&lk);
7776	if (freeblks->fb_state & ONDEPLIST) {
7777		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7778		    0, &inodedep);
7779		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
7780		freeblks->fb_state &= ~ONDEPLIST;
7781		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
7782			free_inodedep(inodedep);
7783	}
7784	/*
7785	 * All of the freeblock deps must be complete prior to this call
7786	 * so it's now safe to complete earlier outstanding journal entries.
7787	 */
7788	handle_jwork(&freeblks->fb_jwork);
7789	WORKITEM_FREE(freeblks, D_FREEBLKS);
7790	FREE_LOCK(&lk);
7791	return (0);
7792}
7793
7794/*
7795 * Release blocks associated with the freeblks and stored in the indirect
7796 * block dbn. If level is greater than SINGLE, the block is an indirect block
7797 * and recursive calls to indirtrunc must be used to cleanse other indirect
7798 * blocks.
7799 *
7800 * This handles partial and complete truncation of blocks.  Partial is noted
7801 * with goingaway == 0.  In this case the freework is completed after the
7802 * zero'd indirects are written to disk.  For full truncation the freework
7803 * is completed after the block is freed.
7804 */
7805static void
7806indir_trunc(freework, dbn, lbn)
7807	struct freework *freework;
7808	ufs2_daddr_t dbn;
7809	ufs_lbn_t lbn;
7810{
7811	struct freework *nfreework;
7812	struct workhead wkhd;
7813	struct freeblks *freeblks;
7814	struct buf *bp;
7815	struct fs *fs;
7816	struct indirdep *indirdep;
7817	struct ufsmount *ump;
7818	ufs1_daddr_t *bap1 = 0;
7819	ufs2_daddr_t nb, nnb, *bap2 = 0;
7820	ufs_lbn_t lbnadd, nlbn;
7821	int i, nblocks, ufs1fmt;
7822	int freedblocks;
7823	int goingaway;
7824	int freedeps;
7825	int needj;
7826	int level;
7827	int cnt;
7828
7829	freeblks = freework->fw_freeblks;
7830	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7831	fs = ump->um_fs;
7832	/*
7833	 * Get buffer of block pointers to be freed.  There are three cases:
7834	 *
7835	 * 1) Partial truncate caches the indirdep pointer in the freework
7836	 *    which provides us a back copy to the save bp which holds the
7837	 *    pointers we want to clear.  When this completes the zero
7838	 *    pointers are written to the real copy.
7839	 * 2) The indirect is being completely truncated, cancel_indirdep()
7840	 *    eliminated the real copy and placed the indirdep on the saved
7841	 *    copy.  The indirdep and buf are discarded when this completes.
7842	 * 3) The indirect was not in memory, we read a copy off of the disk
7843	 *    using the devvp and drop and invalidate the buffer when we're
7844	 *    done.
7845	 */
7846	goingaway = 1;
7847	indirdep = NULL;
7848	if (freework->fw_indir != NULL) {
7849		goingaway = 0;
7850		indirdep = freework->fw_indir;
7851		bp = indirdep->ir_savebp;
7852		if (bp == NULL || bp->b_blkno != dbn)
7853			panic("indir_trunc: Bad saved buf %p blkno %jd",
7854			    bp, (intmax_t)dbn);
7855	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
7856		/*
7857		 * The lock prevents the buf dep list from changing and
7858	 	 * indirects on devvp should only ever have one dependency.
7859		 */
7860		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
7861		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
7862			panic("indir_trunc: Bad indirdep %p from buf %p",
7863			    indirdep, bp);
7864	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
7865	    NOCRED, &bp) != 0) {
7866		brelse(bp);
7867		return;
7868	}
7869	ACQUIRE_LOCK(&lk);
7870	/* Protects against a race with complete_trunc_indir(). */
7871	freework->fw_state &= ~INPROGRESS;
7872	/*
7873	 * If we have an indirdep we need to enforce the truncation order
7874	 * and discard it when it is complete.
7875	 */
7876	if (indirdep) {
7877		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
7878		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
7879			/*
7880			 * Add the complete truncate to the list on the
7881			 * indirdep to enforce in-order processing.
7882			 */
7883			if (freework->fw_indir == NULL)
7884				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
7885				    freework, fw_next);
7886			FREE_LOCK(&lk);
7887			return;
7888		}
7889		/*
7890		 * If we're goingaway, free the indirdep.  Otherwise it will
7891		 * linger until the write completes.
7892		 */
7893		if (goingaway) {
7894			free_indirdep(indirdep);
7895			ump->um_numindirdeps -= 1;
7896		}
7897	}
7898	FREE_LOCK(&lk);
7899	/* Initialize pointers depending on block size. */
7900	if (ump->um_fstype == UFS1) {
7901		bap1 = (ufs1_daddr_t *)bp->b_data;
7902		nb = bap1[freework->fw_off];
7903		ufs1fmt = 1;
7904	} else {
7905		bap2 = (ufs2_daddr_t *)bp->b_data;
7906		nb = bap2[freework->fw_off];
7907		ufs1fmt = 0;
7908	}
7909	level = lbn_level(lbn);
7910	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
7911	lbnadd = lbn_offset(fs, level);
7912	nblocks = btodb(fs->fs_bsize);
7913	nfreework = freework;
7914	freedeps = 0;
7915	cnt = 0;
7916	/*
7917	 * Reclaim blocks.  Traverses into nested indirect levels and
7918	 * arranges for the current level to be freed when subordinates
7919	 * are free when journaling.
7920	 */
7921	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
7922		if (i != NINDIR(fs) - 1) {
7923			if (ufs1fmt)
7924				nnb = bap1[i+1];
7925			else
7926				nnb = bap2[i+1];
7927		} else
7928			nnb = 0;
7929		if (nb == 0)
7930			continue;
7931		cnt++;
7932		if (level != 0) {
7933			nlbn = (lbn + 1) - (i * lbnadd);
7934			if (needj != 0) {
7935				nfreework = newfreework(ump, freeblks, freework,
7936				    nlbn, nb, fs->fs_frag, 0, 0);
7937				freedeps++;
7938			}
7939			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
7940		} else {
7941			struct freedep *freedep;
7942
7943			/*
7944			 * Attempt to aggregate freedep dependencies for
7945			 * all blocks being released to the same CG.
7946			 */
7947			LIST_INIT(&wkhd);
7948			if (needj != 0 &&
7949			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
7950				freedep = newfreedep(freework);
7951				WORKLIST_INSERT_UNLOCKED(&wkhd,
7952				    &freedep->fd_list);
7953				freedeps++;
7954			}
7955			CTR3(KTR_SUJ,
7956			    "indir_trunc: ino %d blkno %jd size %ld",
7957			    freeblks->fb_inum, nb, fs->fs_bsize);
7958			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
7959			    fs->fs_bsize, freeblks->fb_inum,
7960			    freeblks->fb_vtype, &wkhd);
7961		}
7962	}
7963	if (goingaway) {
7964		bp->b_flags |= B_INVAL | B_NOCACHE;
7965		brelse(bp);
7966	}
7967	freedblocks = 0;
7968	if (level == 0)
7969		freedblocks = (nblocks * cnt);
7970	if (needj == 0)
7971		freedblocks += nblocks;
7972	freeblks_free(ump, freeblks, freedblocks);
7973	/*
7974	 * If we are journaling set up the ref counts and offset so this
7975	 * indirect can be completed when its children are free.
7976	 */
7977	if (needj) {
7978		ACQUIRE_LOCK(&lk);
7979		freework->fw_off = i;
7980		freework->fw_ref += freedeps;
7981		freework->fw_ref -= NINDIR(fs) + 1;
7982		if (level == 0)
7983			freeblks->fb_cgwait += freedeps;
7984		if (freework->fw_ref == 0)
7985			freework_freeblock(freework);
7986		FREE_LOCK(&lk);
7987		return;
7988	}
7989	/*
7990	 * If we're not journaling we can free the indirect now.
7991	 */
7992	dbn = dbtofsb(fs, dbn);
7993	CTR3(KTR_SUJ,
7994	    "indir_trunc 2: ino %d blkno %jd size %ld",
7995	    freeblks->fb_inum, dbn, fs->fs_bsize);
7996	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
7997	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
7998	/* Non SUJ softdep does single-threaded truncations. */
7999	if (freework->fw_blkno == dbn) {
8000		freework->fw_state |= ALLCOMPLETE;
8001		ACQUIRE_LOCK(&lk);
8002		handle_written_freework(freework);
8003		FREE_LOCK(&lk);
8004	}
8005	return;
8006}
8007
8008/*
8009 * Cancel an allocindir when it is removed via truncation.  When bp is not
8010 * NULL the indirect never appeared on disk and is scheduled to be freed
8011 * independently of the indir so we can more easily track journal work.
8012 */
8013static void
8014cancel_allocindir(aip, bp, freeblks, trunc)
8015	struct allocindir *aip;
8016	struct buf *bp;
8017	struct freeblks *freeblks;
8018	int trunc;
8019{
8020	struct indirdep *indirdep;
8021	struct freefrag *freefrag;
8022	struct newblk *newblk;
8023
8024	newblk = (struct newblk *)aip;
8025	LIST_REMOVE(aip, ai_next);
8026	/*
8027	 * We must eliminate the pointer in bp if it must be freed on its
8028	 * own due to partial truncate or pending journal work.
8029	 */
8030	if (bp && (trunc || newblk->nb_jnewblk)) {
8031		/*
8032		 * Clear the pointer and mark the aip to be freed
8033		 * directly if it never existed on disk.
8034		 */
8035		aip->ai_state |= DELAYEDFREE;
8036		indirdep = aip->ai_indirdep;
8037		if (indirdep->ir_state & UFS1FMT)
8038			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8039		else
8040			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8041	}
8042	/*
8043	 * When truncating the previous pointer will be freed via
8044	 * savedbp.  Eliminate the freefrag which would dup free.
8045	 */
8046	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8047		newblk->nb_freefrag = NULL;
8048		if (freefrag->ff_jdep)
8049			cancel_jfreefrag(
8050			    WK_JFREEFRAG(freefrag->ff_jdep));
8051		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8052		WORKITEM_FREE(freefrag, D_FREEFRAG);
8053	}
8054	/*
8055	 * If the journal hasn't been written the jnewblk must be passed
8056	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8057	 * this by leaving the journal dependency on the newblk to be freed
8058	 * when a freework is created in handle_workitem_freeblocks().
8059	 */
8060	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8061	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8062}
8063
8064/*
8065 * Create the mkdir dependencies for . and .. in a new directory.  Link them
8066 * in to a newdirblk so any subsequent additions are tracked properly.  The
8067 * caller is responsible for adding the mkdir1 dependency to the journal
8068 * and updating id_mkdiradd.  This function returns with lk held.
8069 */
8070static struct mkdir *
8071setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8072	struct diradd *dap;
8073	ino_t newinum;
8074	ino_t dinum;
8075	struct buf *newdirbp;
8076	struct mkdir **mkdirp;
8077{
8078	struct newblk *newblk;
8079	struct pagedep *pagedep;
8080	struct inodedep *inodedep;
8081	struct newdirblk *newdirblk = 0;
8082	struct mkdir *mkdir1, *mkdir2;
8083	struct worklist *wk;
8084	struct jaddref *jaddref;
8085	struct mount *mp;
8086
8087	mp = dap->da_list.wk_mp;
8088	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8089	    M_SOFTDEP_FLAGS);
8090	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8091	LIST_INIT(&newdirblk->db_mkdir);
8092	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8093	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8094	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8095	mkdir1->md_diradd = dap;
8096	mkdir1->md_jaddref = NULL;
8097	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8098	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8099	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8100	mkdir2->md_diradd = dap;
8101	mkdir2->md_jaddref = NULL;
8102	if (MOUNTEDSUJ(mp) == 0) {
8103		mkdir1->md_state |= DEPCOMPLETE;
8104		mkdir2->md_state |= DEPCOMPLETE;
8105	}
8106	/*
8107	 * Dependency on "." and ".." being written to disk.
8108	 */
8109	mkdir1->md_buf = newdirbp;
8110	ACQUIRE_LOCK(&lk);
8111	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
8112	/*
8113	 * We must link the pagedep, allocdirect, and newdirblk for
8114	 * the initial file page so the pointer to the new directory
8115	 * is not written until the directory contents are live and
8116	 * any subsequent additions are not marked live until the
8117	 * block is reachable via the inode.
8118	 */
8119	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8120		panic("setup_newdir: lost pagedep");
8121	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8122		if (wk->wk_type == D_ALLOCDIRECT)
8123			break;
8124	if (wk == NULL)
8125		panic("setup_newdir: lost allocdirect");
8126	if (pagedep->pd_state & NEWBLOCK)
8127		panic("setup_newdir: NEWBLOCK already set");
8128	newblk = WK_NEWBLK(wk);
8129	pagedep->pd_state |= NEWBLOCK;
8130	pagedep->pd_newdirblk = newdirblk;
8131	newdirblk->db_pagedep = pagedep;
8132	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8133	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8134	/*
8135	 * Look up the inodedep for the parent directory so that we
8136	 * can link mkdir2 into the pending dotdot jaddref or
8137	 * the inode write if there is none.  If the inode is
8138	 * ALLCOMPLETE and no jaddref is present all dependencies have
8139	 * been satisfied and mkdir2 can be freed.
8140	 */
8141	inodedep_lookup(mp, dinum, 0, &inodedep);
8142	if (MOUNTEDSUJ(mp)) {
8143		if (inodedep == NULL)
8144			panic("setup_newdir: Lost parent.");
8145		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8146		    inoreflst);
8147		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8148		    (jaddref->ja_state & MKDIR_PARENT),
8149		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8150		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
8151		mkdir2->md_jaddref = jaddref;
8152		jaddref->ja_mkdir = mkdir2;
8153	} else if (inodedep == NULL ||
8154	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8155		dap->da_state &= ~MKDIR_PARENT;
8156		WORKITEM_FREE(mkdir2, D_MKDIR);
8157		mkdir2 = NULL;
8158	} else {
8159		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
8160		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8161	}
8162	*mkdirp = mkdir2;
8163
8164	return (mkdir1);
8165}
8166
8167/*
8168 * Directory entry addition dependencies.
8169 *
8170 * When adding a new directory entry, the inode (with its incremented link
8171 * count) must be written to disk before the directory entry's pointer to it.
8172 * Also, if the inode is newly allocated, the corresponding freemap must be
8173 * updated (on disk) before the directory entry's pointer. These requirements
8174 * are met via undo/redo on the directory entry's pointer, which consists
8175 * simply of the inode number.
8176 *
8177 * As directory entries are added and deleted, the free space within a
8178 * directory block can become fragmented.  The ufs filesystem will compact
8179 * a fragmented directory block to make space for a new entry. When this
8180 * occurs, the offsets of previously added entries change. Any "diradd"
8181 * dependency structures corresponding to these entries must be updated with
8182 * the new offsets.
8183 */
8184
8185/*
8186 * This routine is called after the in-memory inode's link
8187 * count has been incremented, but before the directory entry's
8188 * pointer to the inode has been set.
8189 */
8190int
8191softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8192	struct buf *bp;		/* buffer containing directory block */
8193	struct inode *dp;	/* inode for directory */
8194	off_t diroffset;	/* offset of new entry in directory */
8195	ino_t newinum;		/* inode referenced by new directory entry */
8196	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8197	int isnewblk;		/* entry is in a newly allocated block */
8198{
8199	int offset;		/* offset of new entry within directory block */
8200	ufs_lbn_t lbn;		/* block in directory containing new entry */
8201	struct fs *fs;
8202	struct diradd *dap;
8203	struct newblk *newblk;
8204	struct pagedep *pagedep;
8205	struct inodedep *inodedep;
8206	struct newdirblk *newdirblk = 0;
8207	struct mkdir *mkdir1, *mkdir2;
8208	struct jaddref *jaddref;
8209	struct mount *mp;
8210	int isindir;
8211
8212	/*
8213	 * Whiteouts have no dependencies.
8214	 */
8215	if (newinum == WINO) {
8216		if (newdirbp != NULL)
8217			bdwrite(newdirbp);
8218		return (0);
8219	}
8220	jaddref = NULL;
8221	mkdir1 = mkdir2 = NULL;
8222	mp = UFSTOVFS(dp->i_ump);
8223	fs = dp->i_fs;
8224	lbn = lblkno(fs, diroffset);
8225	offset = blkoff(fs, diroffset);
8226	dap = malloc(sizeof(struct diradd), M_DIRADD,
8227		M_SOFTDEP_FLAGS|M_ZERO);
8228	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8229	dap->da_offset = offset;
8230	dap->da_newinum = newinum;
8231	dap->da_state = ATTACHED;
8232	LIST_INIT(&dap->da_jwork);
8233	isindir = bp->b_lblkno >= NDADDR;
8234	if (isnewblk &&
8235	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8236		newdirblk = malloc(sizeof(struct newdirblk),
8237		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8238		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8239		LIST_INIT(&newdirblk->db_mkdir);
8240	}
8241	/*
8242	 * If we're creating a new directory setup the dependencies and set
8243	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8244	 * we can move on.
8245	 */
8246	if (newdirbp == NULL) {
8247		dap->da_state |= DEPCOMPLETE;
8248		ACQUIRE_LOCK(&lk);
8249	} else {
8250		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8251		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8252		    &mkdir2);
8253	}
8254	/*
8255	 * Link into parent directory pagedep to await its being written.
8256	 */
8257	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8258#ifdef DEBUG
8259	if (diradd_lookup(pagedep, offset) != NULL)
8260		panic("softdep_setup_directory_add: %p already at off %d\n",
8261		    diradd_lookup(pagedep, offset), offset);
8262#endif
8263	dap->da_pagedep = pagedep;
8264	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8265	    da_pdlist);
8266	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8267	/*
8268	 * If we're journaling, link the diradd into the jaddref so it
8269	 * may be completed after the journal entry is written.  Otherwise,
8270	 * link the diradd into its inodedep.  If the inode is not yet
8271	 * written place it on the bufwait list, otherwise do the post-inode
8272	 * write processing to put it on the id_pendinghd list.
8273	 */
8274	if (MOUNTEDSUJ(mp)) {
8275		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8276		    inoreflst);
8277		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8278		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8279		jaddref->ja_diroff = diroffset;
8280		jaddref->ja_diradd = dap;
8281		add_to_journal(&jaddref->ja_list);
8282	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8283		diradd_inode_written(dap, inodedep);
8284	else
8285		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8286	/*
8287	 * Add the journal entries for . and .. links now that the primary
8288	 * link is written.
8289	 */
8290	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8291		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8292		    inoreflst, if_deps);
8293		KASSERT(jaddref != NULL &&
8294		    jaddref->ja_ino == jaddref->ja_parent &&
8295		    (jaddref->ja_state & MKDIR_BODY),
8296		    ("softdep_setup_directory_add: bad dot jaddref %p",
8297		    jaddref));
8298		mkdir1->md_jaddref = jaddref;
8299		jaddref->ja_mkdir = mkdir1;
8300		/*
8301		 * It is important that the dotdot journal entry
8302		 * is added prior to the dot entry since dot writes
8303		 * both the dot and dotdot links.  These both must
8304		 * be added after the primary link for the journal
8305		 * to remain consistent.
8306		 */
8307		add_to_journal(&mkdir2->md_jaddref->ja_list);
8308		add_to_journal(&jaddref->ja_list);
8309	}
8310	/*
8311	 * If we are adding a new directory remember this diradd so that if
8312	 * we rename it we can keep the dot and dotdot dependencies.  If
8313	 * we are adding a new name for an inode that has a mkdiradd we
8314	 * must be in rename and we have to move the dot and dotdot
8315	 * dependencies to this new name.  The old name is being orphaned
8316	 * soon.
8317	 */
8318	if (mkdir1 != NULL) {
8319		if (inodedep->id_mkdiradd != NULL)
8320			panic("softdep_setup_directory_add: Existing mkdir");
8321		inodedep->id_mkdiradd = dap;
8322	} else if (inodedep->id_mkdiradd)
8323		merge_diradd(inodedep, dap);
8324	if (newdirblk) {
8325		/*
8326		 * There is nothing to do if we are already tracking
8327		 * this block.
8328		 */
8329		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8330			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8331			FREE_LOCK(&lk);
8332			return (0);
8333		}
8334		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8335		    == 0)
8336			panic("softdep_setup_directory_add: lost entry");
8337		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8338		pagedep->pd_state |= NEWBLOCK;
8339		pagedep->pd_newdirblk = newdirblk;
8340		newdirblk->db_pagedep = pagedep;
8341		FREE_LOCK(&lk);
8342		/*
8343		 * If we extended into an indirect signal direnter to sync.
8344		 */
8345		if (isindir)
8346			return (1);
8347		return (0);
8348	}
8349	FREE_LOCK(&lk);
8350	return (0);
8351}
8352
8353/*
8354 * This procedure is called to change the offset of a directory
8355 * entry when compacting a directory block which must be owned
8356 * exclusively by the caller. Note that the actual entry movement
8357 * must be done in this procedure to ensure that no I/O completions
8358 * occur while the move is in progress.
8359 */
8360void
8361softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8362	struct buf *bp;		/* Buffer holding directory block. */
8363	struct inode *dp;	/* inode for directory */
8364	caddr_t base;		/* address of dp->i_offset */
8365	caddr_t oldloc;		/* address of old directory location */
8366	caddr_t newloc;		/* address of new directory location */
8367	int entrysize;		/* size of directory entry */
8368{
8369	int offset, oldoffset, newoffset;
8370	struct pagedep *pagedep;
8371	struct jmvref *jmvref;
8372	struct diradd *dap;
8373	struct direct *de;
8374	struct mount *mp;
8375	ufs_lbn_t lbn;
8376	int flags;
8377
8378	mp = UFSTOVFS(dp->i_ump);
8379	de = (struct direct *)oldloc;
8380	jmvref = NULL;
8381	flags = 0;
8382	/*
8383	 * Moves are always journaled as it would be too complex to
8384	 * determine if any affected adds or removes are present in the
8385	 * journal.
8386	 */
8387	if (MOUNTEDSUJ(mp)) {
8388		flags = DEPALLOC;
8389		jmvref = newjmvref(dp, de->d_ino,
8390		    dp->i_offset + (oldloc - base),
8391		    dp->i_offset + (newloc - base));
8392	}
8393	lbn = lblkno(dp->i_fs, dp->i_offset);
8394	offset = blkoff(dp->i_fs, dp->i_offset);
8395	oldoffset = offset + (oldloc - base);
8396	newoffset = offset + (newloc - base);
8397	ACQUIRE_LOCK(&lk);
8398	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8399		goto done;
8400	dap = diradd_lookup(pagedep, oldoffset);
8401	if (dap) {
8402		dap->da_offset = newoffset;
8403		newoffset = DIRADDHASH(newoffset);
8404		oldoffset = DIRADDHASH(oldoffset);
8405		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8406		    newoffset != oldoffset) {
8407			LIST_REMOVE(dap, da_pdlist);
8408			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8409			    dap, da_pdlist);
8410		}
8411	}
8412done:
8413	if (jmvref) {
8414		jmvref->jm_pagedep = pagedep;
8415		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8416		add_to_journal(&jmvref->jm_list);
8417	}
8418	bcopy(oldloc, newloc, entrysize);
8419	FREE_LOCK(&lk);
8420}
8421
8422/*
8423 * Move the mkdir dependencies and journal work from one diradd to another
8424 * when renaming a directory.  The new name must depend on the mkdir deps
8425 * completing as the old name did.  Directories can only have one valid link
8426 * at a time so one must be canonical.
8427 */
8428static void
8429merge_diradd(inodedep, newdap)
8430	struct inodedep *inodedep;
8431	struct diradd *newdap;
8432{
8433	struct diradd *olddap;
8434	struct mkdir *mkdir, *nextmd;
8435	short state;
8436
8437	olddap = inodedep->id_mkdiradd;
8438	inodedep->id_mkdiradd = newdap;
8439	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8440		newdap->da_state &= ~DEPCOMPLETE;
8441		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8442			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8443			if (mkdir->md_diradd != olddap)
8444				continue;
8445			mkdir->md_diradd = newdap;
8446			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8447			newdap->da_state |= state;
8448			olddap->da_state &= ~state;
8449			if ((olddap->da_state &
8450			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8451				break;
8452		}
8453		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8454			panic("merge_diradd: unfound ref");
8455	}
8456	/*
8457	 * Any mkdir related journal items are not safe to be freed until
8458	 * the new name is stable.
8459	 */
8460	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8461	olddap->da_state |= DEPCOMPLETE;
8462	complete_diradd(olddap);
8463}
8464
8465/*
8466 * Move the diradd to the pending list when all diradd dependencies are
8467 * complete.
8468 */
8469static void
8470complete_diradd(dap)
8471	struct diradd *dap;
8472{
8473	struct pagedep *pagedep;
8474
8475	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8476		if (dap->da_state & DIRCHG)
8477			pagedep = dap->da_previous->dm_pagedep;
8478		else
8479			pagedep = dap->da_pagedep;
8480		LIST_REMOVE(dap, da_pdlist);
8481		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8482	}
8483}
8484
8485/*
8486 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8487 * add entries and conditonally journal the remove.
8488 */
8489static void
8490cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8491	struct diradd *dap;
8492	struct dirrem *dirrem;
8493	struct jremref *jremref;
8494	struct jremref *dotremref;
8495	struct jremref *dotdotremref;
8496{
8497	struct inodedep *inodedep;
8498	struct jaddref *jaddref;
8499	struct inoref *inoref;
8500	struct mkdir *mkdir;
8501
8502	/*
8503	 * If no remove references were allocated we're on a non-journaled
8504	 * filesystem and can skip the cancel step.
8505	 */
8506	if (jremref == NULL) {
8507		free_diradd(dap, NULL);
8508		return;
8509	}
8510	/*
8511	 * Cancel the primary name an free it if it does not require
8512	 * journaling.
8513	 */
8514	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8515	    0, &inodedep) != 0) {
8516		/* Abort the addref that reference this diradd.  */
8517		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8518			if (inoref->if_list.wk_type != D_JADDREF)
8519				continue;
8520			jaddref = (struct jaddref *)inoref;
8521			if (jaddref->ja_diradd != dap)
8522				continue;
8523			if (cancel_jaddref(jaddref, inodedep,
8524			    &dirrem->dm_jwork) == 0) {
8525				free_jremref(jremref);
8526				jremref = NULL;
8527			}
8528			break;
8529		}
8530	}
8531	/*
8532	 * Cancel subordinate names and free them if they do not require
8533	 * journaling.
8534	 */
8535	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8536		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
8537			if (mkdir->md_diradd != dap)
8538				continue;
8539			if ((jaddref = mkdir->md_jaddref) == NULL)
8540				continue;
8541			mkdir->md_jaddref = NULL;
8542			if (mkdir->md_state & MKDIR_PARENT) {
8543				if (cancel_jaddref(jaddref, NULL,
8544				    &dirrem->dm_jwork) == 0) {
8545					free_jremref(dotdotremref);
8546					dotdotremref = NULL;
8547				}
8548			} else {
8549				if (cancel_jaddref(jaddref, inodedep,
8550				    &dirrem->dm_jwork) == 0) {
8551					free_jremref(dotremref);
8552					dotremref = NULL;
8553				}
8554			}
8555		}
8556	}
8557
8558	if (jremref)
8559		journal_jremref(dirrem, jremref, inodedep);
8560	if (dotremref)
8561		journal_jremref(dirrem, dotremref, inodedep);
8562	if (dotdotremref)
8563		journal_jremref(dirrem, dotdotremref, NULL);
8564	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8565	free_diradd(dap, &dirrem->dm_jwork);
8566}
8567
8568/*
8569 * Free a diradd dependency structure. This routine must be called
8570 * with splbio interrupts blocked.
8571 */
8572static void
8573free_diradd(dap, wkhd)
8574	struct diradd *dap;
8575	struct workhead *wkhd;
8576{
8577	struct dirrem *dirrem;
8578	struct pagedep *pagedep;
8579	struct inodedep *inodedep;
8580	struct mkdir *mkdir, *nextmd;
8581
8582	rw_assert(&lk, RA_WLOCKED);
8583	LIST_REMOVE(dap, da_pdlist);
8584	if (dap->da_state & ONWORKLIST)
8585		WORKLIST_REMOVE(&dap->da_list);
8586	if ((dap->da_state & DIRCHG) == 0) {
8587		pagedep = dap->da_pagedep;
8588	} else {
8589		dirrem = dap->da_previous;
8590		pagedep = dirrem->dm_pagedep;
8591		dirrem->dm_dirinum = pagedep->pd_ino;
8592		dirrem->dm_state |= COMPLETE;
8593		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8594			add_to_worklist(&dirrem->dm_list, 0);
8595	}
8596	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8597	    0, &inodedep) != 0)
8598		if (inodedep->id_mkdiradd == dap)
8599			inodedep->id_mkdiradd = NULL;
8600	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8601		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8602			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8603			if (mkdir->md_diradd != dap)
8604				continue;
8605			dap->da_state &=
8606			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8607			LIST_REMOVE(mkdir, md_mkdirs);
8608			if (mkdir->md_state & ONWORKLIST)
8609				WORKLIST_REMOVE(&mkdir->md_list);
8610			if (mkdir->md_jaddref != NULL)
8611				panic("free_diradd: Unexpected jaddref");
8612			WORKITEM_FREE(mkdir, D_MKDIR);
8613			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8614				break;
8615		}
8616		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8617			panic("free_diradd: unfound ref");
8618	}
8619	if (inodedep)
8620		free_inodedep(inodedep);
8621	/*
8622	 * Free any journal segments waiting for the directory write.
8623	 */
8624	handle_jwork(&dap->da_jwork);
8625	WORKITEM_FREE(dap, D_DIRADD);
8626}
8627
8628/*
8629 * Directory entry removal dependencies.
8630 *
8631 * When removing a directory entry, the entry's inode pointer must be
8632 * zero'ed on disk before the corresponding inode's link count is decremented
8633 * (possibly freeing the inode for re-use). This dependency is handled by
8634 * updating the directory entry but delaying the inode count reduction until
8635 * after the directory block has been written to disk. After this point, the
8636 * inode count can be decremented whenever it is convenient.
8637 */
8638
8639/*
8640 * This routine should be called immediately after removing
8641 * a directory entry.  The inode's link count should not be
8642 * decremented by the calling procedure -- the soft updates
8643 * code will do this task when it is safe.
8644 */
8645void
8646softdep_setup_remove(bp, dp, ip, isrmdir)
8647	struct buf *bp;		/* buffer containing directory block */
8648	struct inode *dp;	/* inode for the directory being modified */
8649	struct inode *ip;	/* inode for directory entry being removed */
8650	int isrmdir;		/* indicates if doing RMDIR */
8651{
8652	struct dirrem *dirrem, *prevdirrem;
8653	struct inodedep *inodedep;
8654	int direct;
8655
8656	/*
8657	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8658	 * newdirrem() to setup the full directory remove which requires
8659	 * isrmdir > 1.
8660	 */
8661	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8662	/*
8663	 * Add the dirrem to the inodedep's pending remove list for quick
8664	 * discovery later.
8665	 */
8666	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8667	    &inodedep) == 0)
8668		panic("softdep_setup_remove: Lost inodedep.");
8669	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8670	dirrem->dm_state |= ONDEPLIST;
8671	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8672
8673	/*
8674	 * If the COMPLETE flag is clear, then there were no active
8675	 * entries and we want to roll back to a zeroed entry until
8676	 * the new inode is committed to disk. If the COMPLETE flag is
8677	 * set then we have deleted an entry that never made it to
8678	 * disk. If the entry we deleted resulted from a name change,
8679	 * then the old name still resides on disk. We cannot delete
8680	 * its inode (returned to us in prevdirrem) until the zeroed
8681	 * directory entry gets to disk. The new inode has never been
8682	 * referenced on the disk, so can be deleted immediately.
8683	 */
8684	if ((dirrem->dm_state & COMPLETE) == 0) {
8685		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8686		    dm_next);
8687		FREE_LOCK(&lk);
8688	} else {
8689		if (prevdirrem != NULL)
8690			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8691			    prevdirrem, dm_next);
8692		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8693		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8694		FREE_LOCK(&lk);
8695		if (direct)
8696			handle_workitem_remove(dirrem, 0);
8697	}
8698}
8699
8700/*
8701 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
8702 * pd_pendinghd list of a pagedep.
8703 */
8704static struct diradd *
8705diradd_lookup(pagedep, offset)
8706	struct pagedep *pagedep;
8707	int offset;
8708{
8709	struct diradd *dap;
8710
8711	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8712		if (dap->da_offset == offset)
8713			return (dap);
8714	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8715		if (dap->da_offset == offset)
8716			return (dap);
8717	return (NULL);
8718}
8719
8720/*
8721 * Search for a .. diradd dependency in a directory that is being removed.
8722 * If the directory was renamed to a new parent we have a diradd rather
8723 * than a mkdir for the .. entry.  We need to cancel it now before
8724 * it is found in truncate().
8725 */
8726static struct jremref *
8727cancel_diradd_dotdot(ip, dirrem, jremref)
8728	struct inode *ip;
8729	struct dirrem *dirrem;
8730	struct jremref *jremref;
8731{
8732	struct pagedep *pagedep;
8733	struct diradd *dap;
8734	struct worklist *wk;
8735
8736	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8737	    &pagedep) == 0)
8738		return (jremref);
8739	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8740	if (dap == NULL)
8741		return (jremref);
8742	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8743	/*
8744	 * Mark any journal work as belonging to the parent so it is freed
8745	 * with the .. reference.
8746	 */
8747	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8748		wk->wk_state |= MKDIR_PARENT;
8749	return (NULL);
8750}
8751
8752/*
8753 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
8754 * replace it with a dirrem/diradd pair as a result of re-parenting a
8755 * directory.  This ensures that we don't simultaneously have a mkdir and
8756 * a diradd for the same .. entry.
8757 */
8758static struct jremref *
8759cancel_mkdir_dotdot(ip, dirrem, jremref)
8760	struct inode *ip;
8761	struct dirrem *dirrem;
8762	struct jremref *jremref;
8763{
8764	struct inodedep *inodedep;
8765	struct jaddref *jaddref;
8766	struct mkdir *mkdir;
8767	struct diradd *dap;
8768
8769	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8770	    &inodedep) == 0)
8771		return (jremref);
8772	dap = inodedep->id_mkdiradd;
8773	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
8774		return (jremref);
8775	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
8776	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
8777		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
8778			break;
8779	if (mkdir == NULL)
8780		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
8781	if ((jaddref = mkdir->md_jaddref) != NULL) {
8782		mkdir->md_jaddref = NULL;
8783		jaddref->ja_state &= ~MKDIR_PARENT;
8784		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
8785		    &inodedep) == 0)
8786			panic("cancel_mkdir_dotdot: Lost parent inodedep");
8787		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
8788			journal_jremref(dirrem, jremref, inodedep);
8789			jremref = NULL;
8790		}
8791	}
8792	if (mkdir->md_state & ONWORKLIST)
8793		WORKLIST_REMOVE(&mkdir->md_list);
8794	mkdir->md_state |= ALLCOMPLETE;
8795	complete_mkdir(mkdir);
8796	return (jremref);
8797}
8798
8799static void
8800journal_jremref(dirrem, jremref, inodedep)
8801	struct dirrem *dirrem;
8802	struct jremref *jremref;
8803	struct inodedep *inodedep;
8804{
8805
8806	if (inodedep == NULL)
8807		if (inodedep_lookup(jremref->jr_list.wk_mp,
8808		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
8809			panic("journal_jremref: Lost inodedep");
8810	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
8811	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
8812	add_to_journal(&jremref->jr_list);
8813}
8814
8815static void
8816dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
8817	struct dirrem *dirrem;
8818	struct jremref *jremref;
8819	struct jremref *dotremref;
8820	struct jremref *dotdotremref;
8821{
8822	struct inodedep *inodedep;
8823
8824
8825	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
8826	    &inodedep) == 0)
8827		panic("dirrem_journal: Lost inodedep");
8828	journal_jremref(dirrem, jremref, inodedep);
8829	if (dotremref)
8830		journal_jremref(dirrem, dotremref, inodedep);
8831	if (dotdotremref)
8832		journal_jremref(dirrem, dotdotremref, NULL);
8833}
8834
8835/*
8836 * Allocate a new dirrem if appropriate and return it along with
8837 * its associated pagedep. Called without a lock, returns with lock.
8838 */
8839static struct dirrem *
8840newdirrem(bp, dp, ip, isrmdir, prevdirremp)
8841	struct buf *bp;		/* buffer containing directory block */
8842	struct inode *dp;	/* inode for the directory being modified */
8843	struct inode *ip;	/* inode for directory entry being removed */
8844	int isrmdir;		/* indicates if doing RMDIR */
8845	struct dirrem **prevdirremp; /* previously referenced inode, if any */
8846{
8847	int offset;
8848	ufs_lbn_t lbn;
8849	struct diradd *dap;
8850	struct dirrem *dirrem;
8851	struct pagedep *pagedep;
8852	struct jremref *jremref;
8853	struct jremref *dotremref;
8854	struct jremref *dotdotremref;
8855	struct vnode *dvp;
8856
8857	/*
8858	 * Whiteouts have no deletion dependencies.
8859	 */
8860	if (ip == NULL)
8861		panic("newdirrem: whiteout");
8862	dvp = ITOV(dp);
8863	/*
8864	 * If we are over our limit, try to improve the situation.
8865	 * Limiting the number of dirrem structures will also limit
8866	 * the number of freefile and freeblks structures.
8867	 */
8868	ACQUIRE_LOCK(&lk);
8869	if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2)
8870		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
8871	FREE_LOCK(&lk);
8872	dirrem = malloc(sizeof(struct dirrem),
8873		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
8874	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
8875	LIST_INIT(&dirrem->dm_jremrefhd);
8876	LIST_INIT(&dirrem->dm_jwork);
8877	dirrem->dm_state = isrmdir ? RMDIR : 0;
8878	dirrem->dm_oldinum = ip->i_number;
8879	*prevdirremp = NULL;
8880	/*
8881	 * Allocate remove reference structures to track journal write
8882	 * dependencies.  We will always have one for the link and
8883	 * when doing directories we will always have one more for dot.
8884	 * When renaming a directory we skip the dotdot link change so
8885	 * this is not needed.
8886	 */
8887	jremref = dotremref = dotdotremref = NULL;
8888	if (DOINGSUJ(dvp)) {
8889		if (isrmdir) {
8890			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8891			    ip->i_effnlink + 2);
8892			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
8893			    ip->i_effnlink + 1);
8894			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
8895			    dp->i_effnlink + 1);
8896			dotdotremref->jr_state |= MKDIR_PARENT;
8897		} else
8898			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8899			    ip->i_effnlink + 1);
8900	}
8901	ACQUIRE_LOCK(&lk);
8902	lbn = lblkno(dp->i_fs, dp->i_offset);
8903	offset = blkoff(dp->i_fs, dp->i_offset);
8904	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
8905	    &pagedep);
8906	dirrem->dm_pagedep = pagedep;
8907	dirrem->dm_offset = offset;
8908	/*
8909	 * If we're renaming a .. link to a new directory, cancel any
8910	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
8911	 * the jremref is preserved for any potential diradd in this
8912	 * location.  This can not coincide with a rmdir.
8913	 */
8914	if (dp->i_offset == DOTDOT_OFFSET) {
8915		if (isrmdir)
8916			panic("newdirrem: .. directory change during remove?");
8917		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
8918	}
8919	/*
8920	 * If we're removing a directory search for the .. dependency now and
8921	 * cancel it.  Any pending journal work will be added to the dirrem
8922	 * to be completed when the workitem remove completes.
8923	 */
8924	if (isrmdir)
8925		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
8926	/*
8927	 * Check for a diradd dependency for the same directory entry.
8928	 * If present, then both dependencies become obsolete and can
8929	 * be de-allocated.
8930	 */
8931	dap = diradd_lookup(pagedep, offset);
8932	if (dap == NULL) {
8933		/*
8934		 * Link the jremref structures into the dirrem so they are
8935		 * written prior to the pagedep.
8936		 */
8937		if (jremref)
8938			dirrem_journal(dirrem, jremref, dotremref,
8939			    dotdotremref);
8940		return (dirrem);
8941	}
8942	/*
8943	 * Must be ATTACHED at this point.
8944	 */
8945	if ((dap->da_state & ATTACHED) == 0)
8946		panic("newdirrem: not ATTACHED");
8947	if (dap->da_newinum != ip->i_number)
8948		panic("newdirrem: inum %ju should be %ju",
8949		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
8950	/*
8951	 * If we are deleting a changed name that never made it to disk,
8952	 * then return the dirrem describing the previous inode (which
8953	 * represents the inode currently referenced from this entry on disk).
8954	 */
8955	if ((dap->da_state & DIRCHG) != 0) {
8956		*prevdirremp = dap->da_previous;
8957		dap->da_state &= ~DIRCHG;
8958		dap->da_pagedep = pagedep;
8959	}
8960	/*
8961	 * We are deleting an entry that never made it to disk.
8962	 * Mark it COMPLETE so we can delete its inode immediately.
8963	 */
8964	dirrem->dm_state |= COMPLETE;
8965	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
8966#ifdef SUJ_DEBUG
8967	if (isrmdir == 0) {
8968		struct worklist *wk;
8969
8970		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8971			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
8972				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
8973	}
8974#endif
8975
8976	return (dirrem);
8977}
8978
8979/*
8980 * Directory entry change dependencies.
8981 *
8982 * Changing an existing directory entry requires that an add operation
8983 * be completed first followed by a deletion. The semantics for the addition
8984 * are identical to the description of adding a new entry above except
8985 * that the rollback is to the old inode number rather than zero. Once
8986 * the addition dependency is completed, the removal is done as described
8987 * in the removal routine above.
8988 */
8989
8990/*
8991 * This routine should be called immediately after changing
8992 * a directory entry.  The inode's link count should not be
8993 * decremented by the calling procedure -- the soft updates
8994 * code will perform this task when it is safe.
8995 */
8996void
8997softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
8998	struct buf *bp;		/* buffer containing directory block */
8999	struct inode *dp;	/* inode for the directory being modified */
9000	struct inode *ip;	/* inode for directory entry being removed */
9001	ino_t newinum;		/* new inode number for changed entry */
9002	int isrmdir;		/* indicates if doing RMDIR */
9003{
9004	int offset;
9005	struct diradd *dap = NULL;
9006	struct dirrem *dirrem, *prevdirrem;
9007	struct pagedep *pagedep;
9008	struct inodedep *inodedep;
9009	struct jaddref *jaddref;
9010	struct mount *mp;
9011
9012	offset = blkoff(dp->i_fs, dp->i_offset);
9013	mp = UFSTOVFS(dp->i_ump);
9014
9015	/*
9016	 * Whiteouts do not need diradd dependencies.
9017	 */
9018	if (newinum != WINO) {
9019		dap = malloc(sizeof(struct diradd),
9020		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9021		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9022		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9023		dap->da_offset = offset;
9024		dap->da_newinum = newinum;
9025		LIST_INIT(&dap->da_jwork);
9026	}
9027
9028	/*
9029	 * Allocate a new dirrem and ACQUIRE_LOCK.
9030	 */
9031	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9032	pagedep = dirrem->dm_pagedep;
9033	/*
9034	 * The possible values for isrmdir:
9035	 *	0 - non-directory file rename
9036	 *	1 - directory rename within same directory
9037	 *   inum - directory rename to new directory of given inode number
9038	 * When renaming to a new directory, we are both deleting and
9039	 * creating a new directory entry, so the link count on the new
9040	 * directory should not change. Thus we do not need the followup
9041	 * dirrem which is usually done in handle_workitem_remove. We set
9042	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9043	 * followup dirrem.
9044	 */
9045	if (isrmdir > 1)
9046		dirrem->dm_state |= DIRCHG;
9047
9048	/*
9049	 * Whiteouts have no additional dependencies,
9050	 * so just put the dirrem on the correct list.
9051	 */
9052	if (newinum == WINO) {
9053		if ((dirrem->dm_state & COMPLETE) == 0) {
9054			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9055			    dm_next);
9056		} else {
9057			dirrem->dm_dirinum = pagedep->pd_ino;
9058			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9059				add_to_worklist(&dirrem->dm_list, 0);
9060		}
9061		FREE_LOCK(&lk);
9062		return;
9063	}
9064	/*
9065	 * Add the dirrem to the inodedep's pending remove list for quick
9066	 * discovery later.  A valid nlinkdelta ensures that this lookup
9067	 * will not fail.
9068	 */
9069	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9070		panic("softdep_setup_directory_change: Lost inodedep.");
9071	dirrem->dm_state |= ONDEPLIST;
9072	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9073
9074	/*
9075	 * If the COMPLETE flag is clear, then there were no active
9076	 * entries and we want to roll back to the previous inode until
9077	 * the new inode is committed to disk. If the COMPLETE flag is
9078	 * set, then we have deleted an entry that never made it to disk.
9079	 * If the entry we deleted resulted from a name change, then the old
9080	 * inode reference still resides on disk. Any rollback that we do
9081	 * needs to be to that old inode (returned to us in prevdirrem). If
9082	 * the entry we deleted resulted from a create, then there is
9083	 * no entry on the disk, so we want to roll back to zero rather
9084	 * than the uncommitted inode. In either of the COMPLETE cases we
9085	 * want to immediately free the unwritten and unreferenced inode.
9086	 */
9087	if ((dirrem->dm_state & COMPLETE) == 0) {
9088		dap->da_previous = dirrem;
9089	} else {
9090		if (prevdirrem != NULL) {
9091			dap->da_previous = prevdirrem;
9092		} else {
9093			dap->da_state &= ~DIRCHG;
9094			dap->da_pagedep = pagedep;
9095		}
9096		dirrem->dm_dirinum = pagedep->pd_ino;
9097		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9098			add_to_worklist(&dirrem->dm_list, 0);
9099	}
9100	/*
9101	 * Lookup the jaddref for this journal entry.  We must finish
9102	 * initializing it and make the diradd write dependent on it.
9103	 * If we're not journaling, put it on the id_bufwait list if the
9104	 * inode is not yet written. If it is written, do the post-inode
9105	 * write processing to put it on the id_pendinghd list.
9106	 */
9107	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
9108	if (MOUNTEDSUJ(mp)) {
9109		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9110		    inoreflst);
9111		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9112		    ("softdep_setup_directory_change: bad jaddref %p",
9113		    jaddref));
9114		jaddref->ja_diroff = dp->i_offset;
9115		jaddref->ja_diradd = dap;
9116		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9117		    dap, da_pdlist);
9118		add_to_journal(&jaddref->ja_list);
9119	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9120		dap->da_state |= COMPLETE;
9121		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9122		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9123	} else {
9124		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9125		    dap, da_pdlist);
9126		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9127	}
9128	/*
9129	 * If we're making a new name for a directory that has not been
9130	 * committed when need to move the dot and dotdot references to
9131	 * this new name.
9132	 */
9133	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9134		merge_diradd(inodedep, dap);
9135	FREE_LOCK(&lk);
9136}
9137
9138/*
9139 * Called whenever the link count on an inode is changed.
9140 * It creates an inode dependency so that the new reference(s)
9141 * to the inode cannot be committed to disk until the updated
9142 * inode has been written.
9143 */
9144void
9145softdep_change_linkcnt(ip)
9146	struct inode *ip;	/* the inode with the increased link count */
9147{
9148	struct inodedep *inodedep;
9149	int dflags;
9150
9151	ACQUIRE_LOCK(&lk);
9152	dflags = DEPALLOC;
9153	if (IS_SNAPSHOT(ip))
9154		dflags |= NODELAY;
9155	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
9156	if (ip->i_nlink < ip->i_effnlink)
9157		panic("softdep_change_linkcnt: bad delta");
9158	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9159	FREE_LOCK(&lk);
9160}
9161
9162/*
9163 * Attach a sbdep dependency to the superblock buf so that we can keep
9164 * track of the head of the linked list of referenced but unlinked inodes.
9165 */
9166void
9167softdep_setup_sbupdate(ump, fs, bp)
9168	struct ufsmount *ump;
9169	struct fs *fs;
9170	struct buf *bp;
9171{
9172	struct sbdep *sbdep;
9173	struct worklist *wk;
9174
9175	if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0)
9176		return;
9177	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9178		if (wk->wk_type == D_SBDEP)
9179			break;
9180	if (wk != NULL)
9181		return;
9182	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9183	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9184	sbdep->sb_fs = fs;
9185	sbdep->sb_ump = ump;
9186	ACQUIRE_LOCK(&lk);
9187	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9188	FREE_LOCK(&lk);
9189}
9190
9191/*
9192 * Return the first unlinked inodedep which is ready to be the head of the
9193 * list.  The inodedep and all those after it must have valid next pointers.
9194 */
9195static struct inodedep *
9196first_unlinked_inodedep(ump)
9197	struct ufsmount *ump;
9198{
9199	struct inodedep *inodedep;
9200	struct inodedep *idp;
9201
9202	rw_assert(&lk, RA_WLOCKED);
9203	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9204	    inodedep; inodedep = idp) {
9205		if ((inodedep->id_state & UNLINKNEXT) == 0)
9206			return (NULL);
9207		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9208		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9209			break;
9210		if ((inodedep->id_state & UNLINKPREV) == 0)
9211			break;
9212	}
9213	return (inodedep);
9214}
9215
9216/*
9217 * Set the sujfree unlinked head pointer prior to writing a superblock.
9218 */
9219static void
9220initiate_write_sbdep(sbdep)
9221	struct sbdep *sbdep;
9222{
9223	struct inodedep *inodedep;
9224	struct fs *bpfs;
9225	struct fs *fs;
9226
9227	bpfs = sbdep->sb_fs;
9228	fs = sbdep->sb_ump->um_fs;
9229	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9230	if (inodedep) {
9231		fs->fs_sujfree = inodedep->id_ino;
9232		inodedep->id_state |= UNLINKPREV;
9233	} else
9234		fs->fs_sujfree = 0;
9235	bpfs->fs_sujfree = fs->fs_sujfree;
9236}
9237
9238/*
9239 * After a superblock is written determine whether it must be written again
9240 * due to a changing unlinked list head.
9241 */
9242static int
9243handle_written_sbdep(sbdep, bp)
9244	struct sbdep *sbdep;
9245	struct buf *bp;
9246{
9247	struct inodedep *inodedep;
9248	struct mount *mp;
9249	struct fs *fs;
9250
9251	rw_assert(&lk, RA_WLOCKED);
9252	fs = sbdep->sb_fs;
9253	mp = UFSTOVFS(sbdep->sb_ump);
9254	/*
9255	 * If the superblock doesn't match the in-memory list start over.
9256	 */
9257	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9258	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9259	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9260		bdirty(bp);
9261		return (1);
9262	}
9263	WORKITEM_FREE(sbdep, D_SBDEP);
9264	if (fs->fs_sujfree == 0)
9265		return (0);
9266	/*
9267	 * Now that we have a record of this inode in stable store allow it
9268	 * to be written to free up pending work.  Inodes may see a lot of
9269	 * write activity after they are unlinked which we must not hold up.
9270	 */
9271	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9272		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9273			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9274			    inodedep, inodedep->id_state);
9275		if (inodedep->id_state & UNLINKONLIST)
9276			break;
9277		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9278	}
9279
9280	return (0);
9281}
9282
9283/*
9284 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9285 */
9286static void
9287unlinked_inodedep(mp, inodedep)
9288	struct mount *mp;
9289	struct inodedep *inodedep;
9290{
9291	struct ufsmount *ump;
9292
9293	rw_assert(&lk, RA_WLOCKED);
9294	if (MOUNTEDSUJ(mp) == 0)
9295		return;
9296	ump = VFSTOUFS(mp);
9297	ump->um_fs->fs_fmod = 1;
9298	if (inodedep->id_state & UNLINKED)
9299		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9300	inodedep->id_state |= UNLINKED;
9301	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9302}
9303
9304/*
9305 * Remove an inodedep from the unlinked inodedep list.  This may require
9306 * disk writes if the inode has made it that far.
9307 */
9308static void
9309clear_unlinked_inodedep(inodedep)
9310	struct inodedep *inodedep;
9311{
9312	struct ufsmount *ump;
9313	struct inodedep *idp;
9314	struct inodedep *idn;
9315	struct fs *fs;
9316	struct buf *bp;
9317	ino_t ino;
9318	ino_t nino;
9319	ino_t pino;
9320	int error;
9321
9322	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9323	fs = ump->um_fs;
9324	ino = inodedep->id_ino;
9325	error = 0;
9326	for (;;) {
9327		rw_assert(&lk, RA_WLOCKED);
9328		KASSERT((inodedep->id_state & UNLINKED) != 0,
9329		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9330		    inodedep));
9331		/*
9332		 * If nothing has yet been written simply remove us from
9333		 * the in memory list and return.  This is the most common
9334		 * case where handle_workitem_remove() loses the final
9335		 * reference.
9336		 */
9337		if ((inodedep->id_state & UNLINKLINKS) == 0)
9338			break;
9339		/*
9340		 * If we have a NEXT pointer and no PREV pointer we can simply
9341		 * clear NEXT's PREV and remove ourselves from the list.  Be
9342		 * careful not to clear PREV if the superblock points at
9343		 * next as well.
9344		 */
9345		idn = TAILQ_NEXT(inodedep, id_unlinked);
9346		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9347			if (idn && fs->fs_sujfree != idn->id_ino)
9348				idn->id_state &= ~UNLINKPREV;
9349			break;
9350		}
9351		/*
9352		 * Here we have an inodedep which is actually linked into
9353		 * the list.  We must remove it by forcing a write to the
9354		 * link before us, whether it be the superblock or an inode.
9355		 * Unfortunately the list may change while we're waiting
9356		 * on the buf lock for either resource so we must loop until
9357		 * we lock the right one.  If both the superblock and an
9358		 * inode point to this inode we must clear the inode first
9359		 * followed by the superblock.
9360		 */
9361		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9362		pino = 0;
9363		if (idp && (idp->id_state & UNLINKNEXT))
9364			pino = idp->id_ino;
9365		FREE_LOCK(&lk);
9366		if (pino == 0) {
9367			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9368			    (int)fs->fs_sbsize, 0, 0, 0);
9369		} else {
9370			error = bread(ump->um_devvp,
9371			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9372			    (int)fs->fs_bsize, NOCRED, &bp);
9373			if (error)
9374				brelse(bp);
9375		}
9376		ACQUIRE_LOCK(&lk);
9377		if (error)
9378			break;
9379		/* If the list has changed restart the loop. */
9380		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9381		nino = 0;
9382		if (idp && (idp->id_state & UNLINKNEXT))
9383			nino = idp->id_ino;
9384		if (nino != pino ||
9385		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9386			FREE_LOCK(&lk);
9387			brelse(bp);
9388			ACQUIRE_LOCK(&lk);
9389			continue;
9390		}
9391		nino = 0;
9392		idn = TAILQ_NEXT(inodedep, id_unlinked);
9393		if (idn)
9394			nino = idn->id_ino;
9395		/*
9396		 * Remove us from the in memory list.  After this we cannot
9397		 * access the inodedep.
9398		 */
9399		KASSERT((inodedep->id_state & UNLINKED) != 0,
9400		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9401		    inodedep));
9402		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9403		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9404		FREE_LOCK(&lk);
9405		/*
9406		 * The predecessor's next pointer is manually updated here
9407		 * so that the NEXT flag is never cleared for an element
9408		 * that is in the list.
9409		 */
9410		if (pino == 0) {
9411			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9412			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9413			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9414			    bp);
9415		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9416			((struct ufs1_dinode *)bp->b_data +
9417			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9418		else
9419			((struct ufs2_dinode *)bp->b_data +
9420			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9421		/*
9422		 * If the bwrite fails we have no recourse to recover.  The
9423		 * filesystem is corrupted already.
9424		 */
9425		bwrite(bp);
9426		ACQUIRE_LOCK(&lk);
9427		/*
9428		 * If the superblock pointer still needs to be cleared force
9429		 * a write here.
9430		 */
9431		if (fs->fs_sujfree == ino) {
9432			FREE_LOCK(&lk);
9433			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9434			    (int)fs->fs_sbsize, 0, 0, 0);
9435			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9436			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9437			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9438			    bp);
9439			bwrite(bp);
9440			ACQUIRE_LOCK(&lk);
9441		}
9442
9443		if (fs->fs_sujfree != ino)
9444			return;
9445		panic("clear_unlinked_inodedep: Failed to clear free head");
9446	}
9447	if (inodedep->id_ino == fs->fs_sujfree)
9448		panic("clear_unlinked_inodedep: Freeing head of free list");
9449	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9450	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9451	return;
9452}
9453
9454/*
9455 * This workitem decrements the inode's link count.
9456 * If the link count reaches zero, the file is removed.
9457 */
9458static int
9459handle_workitem_remove(dirrem, flags)
9460	struct dirrem *dirrem;
9461	int flags;
9462{
9463	struct inodedep *inodedep;
9464	struct workhead dotdotwk;
9465	struct worklist *wk;
9466	struct ufsmount *ump;
9467	struct mount *mp;
9468	struct vnode *vp;
9469	struct inode *ip;
9470	ino_t oldinum;
9471
9472	if (dirrem->dm_state & ONWORKLIST)
9473		panic("handle_workitem_remove: dirrem %p still on worklist",
9474		    dirrem);
9475	oldinum = dirrem->dm_oldinum;
9476	mp = dirrem->dm_list.wk_mp;
9477	ump = VFSTOUFS(mp);
9478	flags |= LK_EXCLUSIVE;
9479	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9480		return (EBUSY);
9481	ip = VTOI(vp);
9482	ACQUIRE_LOCK(&lk);
9483	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9484		panic("handle_workitem_remove: lost inodedep");
9485	if (dirrem->dm_state & ONDEPLIST)
9486		LIST_REMOVE(dirrem, dm_inonext);
9487	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9488	    ("handle_workitem_remove:  Journal entries not written."));
9489
9490	/*
9491	 * Move all dependencies waiting on the remove to complete
9492	 * from the dirrem to the inode inowait list to be completed
9493	 * after the inode has been updated and written to disk.  Any
9494	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9495	 * is removed.
9496	 */
9497	LIST_INIT(&dotdotwk);
9498	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9499		WORKLIST_REMOVE(wk);
9500		if (wk->wk_state & MKDIR_PARENT) {
9501			wk->wk_state &= ~MKDIR_PARENT;
9502			WORKLIST_INSERT(&dotdotwk, wk);
9503			continue;
9504		}
9505		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9506	}
9507	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9508	/*
9509	 * Normal file deletion.
9510	 */
9511	if ((dirrem->dm_state & RMDIR) == 0) {
9512		ip->i_nlink--;
9513		DIP_SET(ip, i_nlink, ip->i_nlink);
9514		ip->i_flag |= IN_CHANGE;
9515		if (ip->i_nlink < ip->i_effnlink)
9516			panic("handle_workitem_remove: bad file delta");
9517		if (ip->i_nlink == 0)
9518			unlinked_inodedep(mp, inodedep);
9519		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9520		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9521		    ("handle_workitem_remove: worklist not empty. %s",
9522		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9523		WORKITEM_FREE(dirrem, D_DIRREM);
9524		FREE_LOCK(&lk);
9525		goto out;
9526	}
9527	/*
9528	 * Directory deletion. Decrement reference count for both the
9529	 * just deleted parent directory entry and the reference for ".".
9530	 * Arrange to have the reference count on the parent decremented
9531	 * to account for the loss of "..".
9532	 */
9533	ip->i_nlink -= 2;
9534	DIP_SET(ip, i_nlink, ip->i_nlink);
9535	ip->i_flag |= IN_CHANGE;
9536	if (ip->i_nlink < ip->i_effnlink)
9537		panic("handle_workitem_remove: bad dir delta");
9538	if (ip->i_nlink == 0)
9539		unlinked_inodedep(mp, inodedep);
9540	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9541	/*
9542	 * Rename a directory to a new parent. Since, we are both deleting
9543	 * and creating a new directory entry, the link count on the new
9544	 * directory should not change. Thus we skip the followup dirrem.
9545	 */
9546	if (dirrem->dm_state & DIRCHG) {
9547		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9548		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9549		WORKITEM_FREE(dirrem, D_DIRREM);
9550		FREE_LOCK(&lk);
9551		goto out;
9552	}
9553	dirrem->dm_state = ONDEPLIST;
9554	dirrem->dm_oldinum = dirrem->dm_dirinum;
9555	/*
9556	 * Place the dirrem on the parent's diremhd list.
9557	 */
9558	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9559		panic("handle_workitem_remove: lost dir inodedep");
9560	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9561	/*
9562	 * If the allocated inode has never been written to disk, then
9563	 * the on-disk inode is zero'ed and we can remove the file
9564	 * immediately.  When journaling if the inode has been marked
9565	 * unlinked and not DEPCOMPLETE we know it can never be written.
9566	 */
9567	inodedep_lookup(mp, oldinum, 0, &inodedep);
9568	if (inodedep == NULL ||
9569	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9570	    check_inode_unwritten(inodedep)) {
9571		FREE_LOCK(&lk);
9572		vput(vp);
9573		return handle_workitem_remove(dirrem, flags);
9574	}
9575	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9576	FREE_LOCK(&lk);
9577	ip->i_flag |= IN_CHANGE;
9578out:
9579	ffs_update(vp, 0);
9580	vput(vp);
9581	return (0);
9582}
9583
9584/*
9585 * Inode de-allocation dependencies.
9586 *
9587 * When an inode's link count is reduced to zero, it can be de-allocated. We
9588 * found it convenient to postpone de-allocation until after the inode is
9589 * written to disk with its new link count (zero).  At this point, all of the
9590 * on-disk inode's block pointers are nullified and, with careful dependency
9591 * list ordering, all dependencies related to the inode will be satisfied and
9592 * the corresponding dependency structures de-allocated.  So, if/when the
9593 * inode is reused, there will be no mixing of old dependencies with new
9594 * ones.  This artificial dependency is set up by the block de-allocation
9595 * procedure above (softdep_setup_freeblocks) and completed by the
9596 * following procedure.
9597 */
9598static void
9599handle_workitem_freefile(freefile)
9600	struct freefile *freefile;
9601{
9602	struct workhead wkhd;
9603	struct fs *fs;
9604	struct inodedep *idp;
9605	struct ufsmount *ump;
9606	int error;
9607
9608	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9609	fs = ump->um_fs;
9610#ifdef DEBUG
9611	ACQUIRE_LOCK(&lk);
9612	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9613	FREE_LOCK(&lk);
9614	if (error)
9615		panic("handle_workitem_freefile: inodedep %p survived", idp);
9616#endif
9617	UFS_LOCK(ump);
9618	fs->fs_pendinginodes -= 1;
9619	UFS_UNLOCK(ump);
9620	LIST_INIT(&wkhd);
9621	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9622	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9623	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9624		softdep_error("handle_workitem_freefile", error);
9625	ACQUIRE_LOCK(&lk);
9626	WORKITEM_FREE(freefile, D_FREEFILE);
9627	FREE_LOCK(&lk);
9628}
9629
9630
9631/*
9632 * Helper function which unlinks marker element from work list and returns
9633 * the next element on the list.
9634 */
9635static __inline struct worklist *
9636markernext(struct worklist *marker)
9637{
9638	struct worklist *next;
9639
9640	next = LIST_NEXT(marker, wk_list);
9641	LIST_REMOVE(marker, wk_list);
9642	return next;
9643}
9644
9645/*
9646 * Disk writes.
9647 *
9648 * The dependency structures constructed above are most actively used when file
9649 * system blocks are written to disk.  No constraints are placed on when a
9650 * block can be written, but unsatisfied update dependencies are made safe by
9651 * modifying (or replacing) the source memory for the duration of the disk
9652 * write.  When the disk write completes, the memory block is again brought
9653 * up-to-date.
9654 *
9655 * In-core inode structure reclamation.
9656 *
9657 * Because there are a finite number of "in-core" inode structures, they are
9658 * reused regularly.  By transferring all inode-related dependencies to the
9659 * in-memory inode block and indexing them separately (via "inodedep"s), we
9660 * can allow "in-core" inode structures to be reused at any time and avoid
9661 * any increase in contention.
9662 *
9663 * Called just before entering the device driver to initiate a new disk I/O.
9664 * The buffer must be locked, thus, no I/O completion operations can occur
9665 * while we are manipulating its associated dependencies.
9666 */
9667static void
9668softdep_disk_io_initiation(bp)
9669	struct buf *bp;		/* structure describing disk write to occur */
9670{
9671	struct worklist *wk;
9672	struct worklist marker;
9673	struct inodedep *inodedep;
9674	struct freeblks *freeblks;
9675	struct jblkdep *jblkdep;
9676	struct newblk *newblk;
9677
9678	/*
9679	 * We only care about write operations. There should never
9680	 * be dependencies for reads.
9681	 */
9682	if (bp->b_iocmd != BIO_WRITE)
9683		panic("softdep_disk_io_initiation: not write");
9684
9685	if (bp->b_vflags & BV_BKGRDINPROG)
9686		panic("softdep_disk_io_initiation: Writing buffer with "
9687		    "background write in progress: %p", bp);
9688
9689	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9690	PHOLD(curproc);			/* Don't swap out kernel stack */
9691
9692	ACQUIRE_LOCK(&lk);
9693	/*
9694	 * Do any necessary pre-I/O processing.
9695	 */
9696	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9697	     wk = markernext(&marker)) {
9698		LIST_INSERT_AFTER(wk, &marker, wk_list);
9699		switch (wk->wk_type) {
9700
9701		case D_PAGEDEP:
9702			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9703			continue;
9704
9705		case D_INODEDEP:
9706			inodedep = WK_INODEDEP(wk);
9707			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9708				initiate_write_inodeblock_ufs1(inodedep, bp);
9709			else
9710				initiate_write_inodeblock_ufs2(inodedep, bp);
9711			continue;
9712
9713		case D_INDIRDEP:
9714			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9715			continue;
9716
9717		case D_BMSAFEMAP:
9718			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9719			continue;
9720
9721		case D_JSEG:
9722			WK_JSEG(wk)->js_buf = NULL;
9723			continue;
9724
9725		case D_FREEBLKS:
9726			freeblks = WK_FREEBLKS(wk);
9727			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9728			/*
9729			 * We have to wait for the freeblks to be journaled
9730			 * before we can write an inodeblock with updated
9731			 * pointers.  Be careful to arrange the marker so
9732			 * we revisit the freeblks if it's not removed by
9733			 * the first jwait().
9734			 */
9735			if (jblkdep != NULL) {
9736				LIST_REMOVE(&marker, wk_list);
9737				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9738				jwait(&jblkdep->jb_list, MNT_WAIT);
9739			}
9740			continue;
9741		case D_ALLOCDIRECT:
9742		case D_ALLOCINDIR:
9743			/*
9744			 * We have to wait for the jnewblk to be journaled
9745			 * before we can write to a block if the contents
9746			 * may be confused with an earlier file's indirect
9747			 * at recovery time.  Handle the marker as described
9748			 * above.
9749			 */
9750			newblk = WK_NEWBLK(wk);
9751			if (newblk->nb_jnewblk != NULL &&
9752			    indirblk_lookup(newblk->nb_list.wk_mp,
9753			    newblk->nb_newblkno)) {
9754				LIST_REMOVE(&marker, wk_list);
9755				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9756				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
9757			}
9758			continue;
9759
9760		case D_SBDEP:
9761			initiate_write_sbdep(WK_SBDEP(wk));
9762			continue;
9763
9764		case D_MKDIR:
9765		case D_FREEWORK:
9766		case D_FREEDEP:
9767		case D_JSEGDEP:
9768			continue;
9769
9770		default:
9771			panic("handle_disk_io_initiation: Unexpected type %s",
9772			    TYPENAME(wk->wk_type));
9773			/* NOTREACHED */
9774		}
9775	}
9776	FREE_LOCK(&lk);
9777	PRELE(curproc);			/* Allow swapout of kernel stack */
9778}
9779
9780/*
9781 * Called from within the procedure above to deal with unsatisfied
9782 * allocation dependencies in a directory. The buffer must be locked,
9783 * thus, no I/O completion operations can occur while we are
9784 * manipulating its associated dependencies.
9785 */
9786static void
9787initiate_write_filepage(pagedep, bp)
9788	struct pagedep *pagedep;
9789	struct buf *bp;
9790{
9791	struct jremref *jremref;
9792	struct jmvref *jmvref;
9793	struct dirrem *dirrem;
9794	struct diradd *dap;
9795	struct direct *ep;
9796	int i;
9797
9798	if (pagedep->pd_state & IOSTARTED) {
9799		/*
9800		 * This can only happen if there is a driver that does not
9801		 * understand chaining. Here biodone will reissue the call
9802		 * to strategy for the incomplete buffers.
9803		 */
9804		printf("initiate_write_filepage: already started\n");
9805		return;
9806	}
9807	pagedep->pd_state |= IOSTARTED;
9808	/*
9809	 * Wait for all journal remove dependencies to hit the disk.
9810	 * We can not allow any potentially conflicting directory adds
9811	 * to be visible before removes and rollback is too difficult.
9812	 * lk may be dropped and re-acquired, however we hold the buf
9813	 * locked so the dependency can not go away.
9814	 */
9815	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
9816		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
9817			jwait(&jremref->jr_list, MNT_WAIT);
9818	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
9819		jwait(&jmvref->jm_list, MNT_WAIT);
9820	for (i = 0; i < DAHASHSZ; i++) {
9821		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
9822			ep = (struct direct *)
9823			    ((char *)bp->b_data + dap->da_offset);
9824			if (ep->d_ino != dap->da_newinum)
9825				panic("%s: dir inum %ju != new %ju",
9826				    "initiate_write_filepage",
9827				    (uintmax_t)ep->d_ino,
9828				    (uintmax_t)dap->da_newinum);
9829			if (dap->da_state & DIRCHG)
9830				ep->d_ino = dap->da_previous->dm_oldinum;
9831			else
9832				ep->d_ino = 0;
9833			dap->da_state &= ~ATTACHED;
9834			dap->da_state |= UNDONE;
9835		}
9836	}
9837}
9838
9839/*
9840 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
9841 * Note that any bug fixes made to this routine must be done in the
9842 * version found below.
9843 *
9844 * Called from within the procedure above to deal with unsatisfied
9845 * allocation dependencies in an inodeblock. The buffer must be
9846 * locked, thus, no I/O completion operations can occur while we
9847 * are manipulating its associated dependencies.
9848 */
9849static void
9850initiate_write_inodeblock_ufs1(inodedep, bp)
9851	struct inodedep *inodedep;
9852	struct buf *bp;			/* The inode block */
9853{
9854	struct allocdirect *adp, *lastadp;
9855	struct ufs1_dinode *dp;
9856	struct ufs1_dinode *sip;
9857	struct inoref *inoref;
9858	struct fs *fs;
9859	ufs_lbn_t i;
9860#ifdef INVARIANTS
9861	ufs_lbn_t prevlbn = 0;
9862#endif
9863	int deplist;
9864
9865	if (inodedep->id_state & IOSTARTED)
9866		panic("initiate_write_inodeblock_ufs1: already started");
9867	inodedep->id_state |= IOSTARTED;
9868	fs = inodedep->id_fs;
9869	dp = (struct ufs1_dinode *)bp->b_data +
9870	    ino_to_fsbo(fs, inodedep->id_ino);
9871
9872	/*
9873	 * If we're on the unlinked list but have not yet written our
9874	 * next pointer initialize it here.
9875	 */
9876	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9877		struct inodedep *inon;
9878
9879		inon = TAILQ_NEXT(inodedep, id_unlinked);
9880		dp->di_freelink = inon ? inon->id_ino : 0;
9881	}
9882	/*
9883	 * If the bitmap is not yet written, then the allocated
9884	 * inode cannot be written to disk.
9885	 */
9886	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
9887		if (inodedep->id_savedino1 != NULL)
9888			panic("initiate_write_inodeblock_ufs1: I/O underway");
9889		FREE_LOCK(&lk);
9890		sip = malloc(sizeof(struct ufs1_dinode),
9891		    M_SAVEDINO, M_SOFTDEP_FLAGS);
9892		ACQUIRE_LOCK(&lk);
9893		inodedep->id_savedino1 = sip;
9894		*inodedep->id_savedino1 = *dp;
9895		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
9896		dp->di_gen = inodedep->id_savedino1->di_gen;
9897		dp->di_freelink = inodedep->id_savedino1->di_freelink;
9898		return;
9899	}
9900	/*
9901	 * If no dependencies, then there is nothing to roll back.
9902	 */
9903	inodedep->id_savedsize = dp->di_size;
9904	inodedep->id_savedextsize = 0;
9905	inodedep->id_savednlink = dp->di_nlink;
9906	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
9907	    TAILQ_EMPTY(&inodedep->id_inoreflst))
9908		return;
9909	/*
9910	 * Revert the link count to that of the first unwritten journal entry.
9911	 */
9912	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
9913	if (inoref)
9914		dp->di_nlink = inoref->if_nlink;
9915	/*
9916	 * Set the dependencies to busy.
9917	 */
9918	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9919	     adp = TAILQ_NEXT(adp, ad_next)) {
9920#ifdef INVARIANTS
9921		if (deplist != 0 && prevlbn >= adp->ad_offset)
9922			panic("softdep_write_inodeblock: lbn order");
9923		prevlbn = adp->ad_offset;
9924		if (adp->ad_offset < NDADDR &&
9925		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
9926			panic("%s: direct pointer #%jd mismatch %d != %jd",
9927			    "softdep_write_inodeblock",
9928			    (intmax_t)adp->ad_offset,
9929			    dp->di_db[adp->ad_offset],
9930			    (intmax_t)adp->ad_newblkno);
9931		if (adp->ad_offset >= NDADDR &&
9932		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
9933			panic("%s: indirect pointer #%jd mismatch %d != %jd",
9934			    "softdep_write_inodeblock",
9935			    (intmax_t)adp->ad_offset - NDADDR,
9936			    dp->di_ib[adp->ad_offset - NDADDR],
9937			    (intmax_t)adp->ad_newblkno);
9938		deplist |= 1 << adp->ad_offset;
9939		if ((adp->ad_state & ATTACHED) == 0)
9940			panic("softdep_write_inodeblock: Unknown state 0x%x",
9941			    adp->ad_state);
9942#endif /* INVARIANTS */
9943		adp->ad_state &= ~ATTACHED;
9944		adp->ad_state |= UNDONE;
9945	}
9946	/*
9947	 * The on-disk inode cannot claim to be any larger than the last
9948	 * fragment that has been written. Otherwise, the on-disk inode
9949	 * might have fragments that were not the last block in the file
9950	 * which would corrupt the filesystem.
9951	 */
9952	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9953	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9954		if (adp->ad_offset >= NDADDR)
9955			break;
9956		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
9957		/* keep going until hitting a rollback to a frag */
9958		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
9959			continue;
9960		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
9961		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
9962#ifdef INVARIANTS
9963			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
9964				panic("softdep_write_inodeblock: lost dep1");
9965#endif /* INVARIANTS */
9966			dp->di_db[i] = 0;
9967		}
9968		for (i = 0; i < NIADDR; i++) {
9969#ifdef INVARIANTS
9970			if (dp->di_ib[i] != 0 &&
9971			    (deplist & ((1 << NDADDR) << i)) == 0)
9972				panic("softdep_write_inodeblock: lost dep2");
9973#endif /* INVARIANTS */
9974			dp->di_ib[i] = 0;
9975		}
9976		return;
9977	}
9978	/*
9979	 * If we have zero'ed out the last allocated block of the file,
9980	 * roll back the size to the last currently allocated block.
9981	 * We know that this last allocated block is a full-sized as
9982	 * we already checked for fragments in the loop above.
9983	 */
9984	if (lastadp != NULL &&
9985	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
9986		for (i = lastadp->ad_offset; i >= 0; i--)
9987			if (dp->di_db[i] != 0)
9988				break;
9989		dp->di_size = (i + 1) * fs->fs_bsize;
9990	}
9991	/*
9992	 * The only dependencies are for indirect blocks.
9993	 *
9994	 * The file size for indirect block additions is not guaranteed.
9995	 * Such a guarantee would be non-trivial to achieve. The conventional
9996	 * synchronous write implementation also does not make this guarantee.
9997	 * Fsck should catch and fix discrepancies. Arguably, the file size
9998	 * can be over-estimated without destroying integrity when the file
9999	 * moves into the indirect blocks (i.e., is large). If we want to
10000	 * postpone fsck, we are stuck with this argument.
10001	 */
10002	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10003		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10004}
10005
10006/*
10007 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10008 * Note that any bug fixes made to this routine must be done in the
10009 * version found above.
10010 *
10011 * Called from within the procedure above to deal with unsatisfied
10012 * allocation dependencies in an inodeblock. The buffer must be
10013 * locked, thus, no I/O completion operations can occur while we
10014 * are manipulating its associated dependencies.
10015 */
10016static void
10017initiate_write_inodeblock_ufs2(inodedep, bp)
10018	struct inodedep *inodedep;
10019	struct buf *bp;			/* The inode block */
10020{
10021	struct allocdirect *adp, *lastadp;
10022	struct ufs2_dinode *dp;
10023	struct ufs2_dinode *sip;
10024	struct inoref *inoref;
10025	struct fs *fs;
10026	ufs_lbn_t i;
10027#ifdef INVARIANTS
10028	ufs_lbn_t prevlbn = 0;
10029#endif
10030	int deplist;
10031
10032	if (inodedep->id_state & IOSTARTED)
10033		panic("initiate_write_inodeblock_ufs2: already started");
10034	inodedep->id_state |= IOSTARTED;
10035	fs = inodedep->id_fs;
10036	dp = (struct ufs2_dinode *)bp->b_data +
10037	    ino_to_fsbo(fs, inodedep->id_ino);
10038
10039	/*
10040	 * If we're on the unlinked list but have not yet written our
10041	 * next pointer initialize it here.
10042	 */
10043	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10044		struct inodedep *inon;
10045
10046		inon = TAILQ_NEXT(inodedep, id_unlinked);
10047		dp->di_freelink = inon ? inon->id_ino : 0;
10048	}
10049	/*
10050	 * If the bitmap is not yet written, then the allocated
10051	 * inode cannot be written to disk.
10052	 */
10053	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10054		if (inodedep->id_savedino2 != NULL)
10055			panic("initiate_write_inodeblock_ufs2: I/O underway");
10056		FREE_LOCK(&lk);
10057		sip = malloc(sizeof(struct ufs2_dinode),
10058		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10059		ACQUIRE_LOCK(&lk);
10060		inodedep->id_savedino2 = sip;
10061		*inodedep->id_savedino2 = *dp;
10062		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10063		dp->di_gen = inodedep->id_savedino2->di_gen;
10064		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10065		return;
10066	}
10067	/*
10068	 * If no dependencies, then there is nothing to roll back.
10069	 */
10070	inodedep->id_savedsize = dp->di_size;
10071	inodedep->id_savedextsize = dp->di_extsize;
10072	inodedep->id_savednlink = dp->di_nlink;
10073	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10074	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10075	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10076		return;
10077	/*
10078	 * Revert the link count to that of the first unwritten journal entry.
10079	 */
10080	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10081	if (inoref)
10082		dp->di_nlink = inoref->if_nlink;
10083
10084	/*
10085	 * Set the ext data dependencies to busy.
10086	 */
10087	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10088	     adp = TAILQ_NEXT(adp, ad_next)) {
10089#ifdef INVARIANTS
10090		if (deplist != 0 && prevlbn >= adp->ad_offset)
10091			panic("softdep_write_inodeblock: lbn order");
10092		prevlbn = adp->ad_offset;
10093		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10094			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10095			    "softdep_write_inodeblock",
10096			    (intmax_t)adp->ad_offset,
10097			    (intmax_t)dp->di_extb[adp->ad_offset],
10098			    (intmax_t)adp->ad_newblkno);
10099		deplist |= 1 << adp->ad_offset;
10100		if ((adp->ad_state & ATTACHED) == 0)
10101			panic("softdep_write_inodeblock: Unknown state 0x%x",
10102			    adp->ad_state);
10103#endif /* INVARIANTS */
10104		adp->ad_state &= ~ATTACHED;
10105		adp->ad_state |= UNDONE;
10106	}
10107	/*
10108	 * The on-disk inode cannot claim to be any larger than the last
10109	 * fragment that has been written. Otherwise, the on-disk inode
10110	 * might have fragments that were not the last block in the ext
10111	 * data which would corrupt the filesystem.
10112	 */
10113	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10114	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10115		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10116		/* keep going until hitting a rollback to a frag */
10117		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10118			continue;
10119		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10120		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10121#ifdef INVARIANTS
10122			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10123				panic("softdep_write_inodeblock: lost dep1");
10124#endif /* INVARIANTS */
10125			dp->di_extb[i] = 0;
10126		}
10127		lastadp = NULL;
10128		break;
10129	}
10130	/*
10131	 * If we have zero'ed out the last allocated block of the ext
10132	 * data, roll back the size to the last currently allocated block.
10133	 * We know that this last allocated block is a full-sized as
10134	 * we already checked for fragments in the loop above.
10135	 */
10136	if (lastadp != NULL &&
10137	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10138		for (i = lastadp->ad_offset; i >= 0; i--)
10139			if (dp->di_extb[i] != 0)
10140				break;
10141		dp->di_extsize = (i + 1) * fs->fs_bsize;
10142	}
10143	/*
10144	 * Set the file data dependencies to busy.
10145	 */
10146	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10147	     adp = TAILQ_NEXT(adp, ad_next)) {
10148#ifdef INVARIANTS
10149		if (deplist != 0 && prevlbn >= adp->ad_offset)
10150			panic("softdep_write_inodeblock: lbn order");
10151		if ((adp->ad_state & ATTACHED) == 0)
10152			panic("inodedep %p and adp %p not attached", inodedep, adp);
10153		prevlbn = adp->ad_offset;
10154		if (adp->ad_offset < NDADDR &&
10155		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10156			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10157			    "softdep_write_inodeblock",
10158			    (intmax_t)adp->ad_offset,
10159			    (intmax_t)dp->di_db[adp->ad_offset],
10160			    (intmax_t)adp->ad_newblkno);
10161		if (adp->ad_offset >= NDADDR &&
10162		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10163			panic("%s indirect pointer #%jd mismatch %jd != %jd",
10164			    "softdep_write_inodeblock:",
10165			    (intmax_t)adp->ad_offset - NDADDR,
10166			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10167			    (intmax_t)adp->ad_newblkno);
10168		deplist |= 1 << adp->ad_offset;
10169		if ((adp->ad_state & ATTACHED) == 0)
10170			panic("softdep_write_inodeblock: Unknown state 0x%x",
10171			    adp->ad_state);
10172#endif /* INVARIANTS */
10173		adp->ad_state &= ~ATTACHED;
10174		adp->ad_state |= UNDONE;
10175	}
10176	/*
10177	 * The on-disk inode cannot claim to be any larger than the last
10178	 * fragment that has been written. Otherwise, the on-disk inode
10179	 * might have fragments that were not the last block in the file
10180	 * which would corrupt the filesystem.
10181	 */
10182	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10183	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10184		if (adp->ad_offset >= NDADDR)
10185			break;
10186		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10187		/* keep going until hitting a rollback to a frag */
10188		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10189			continue;
10190		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10191		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10192#ifdef INVARIANTS
10193			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10194				panic("softdep_write_inodeblock: lost dep2");
10195#endif /* INVARIANTS */
10196			dp->di_db[i] = 0;
10197		}
10198		for (i = 0; i < NIADDR; i++) {
10199#ifdef INVARIANTS
10200			if (dp->di_ib[i] != 0 &&
10201			    (deplist & ((1 << NDADDR) << i)) == 0)
10202				panic("softdep_write_inodeblock: lost dep3");
10203#endif /* INVARIANTS */
10204			dp->di_ib[i] = 0;
10205		}
10206		return;
10207	}
10208	/*
10209	 * If we have zero'ed out the last allocated block of the file,
10210	 * roll back the size to the last currently allocated block.
10211	 * We know that this last allocated block is a full-sized as
10212	 * we already checked for fragments in the loop above.
10213	 */
10214	if (lastadp != NULL &&
10215	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10216		for (i = lastadp->ad_offset; i >= 0; i--)
10217			if (dp->di_db[i] != 0)
10218				break;
10219		dp->di_size = (i + 1) * fs->fs_bsize;
10220	}
10221	/*
10222	 * The only dependencies are for indirect blocks.
10223	 *
10224	 * The file size for indirect block additions is not guaranteed.
10225	 * Such a guarantee would be non-trivial to achieve. The conventional
10226	 * synchronous write implementation also does not make this guarantee.
10227	 * Fsck should catch and fix discrepancies. Arguably, the file size
10228	 * can be over-estimated without destroying integrity when the file
10229	 * moves into the indirect blocks (i.e., is large). If we want to
10230	 * postpone fsck, we are stuck with this argument.
10231	 */
10232	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10233		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10234}
10235
10236/*
10237 * Cancel an indirdep as a result of truncation.  Release all of the
10238 * children allocindirs and place their journal work on the appropriate
10239 * list.
10240 */
10241static void
10242cancel_indirdep(indirdep, bp, freeblks)
10243	struct indirdep *indirdep;
10244	struct buf *bp;
10245	struct freeblks *freeblks;
10246{
10247	struct allocindir *aip;
10248
10249	/*
10250	 * None of the indirect pointers will ever be visible,
10251	 * so they can simply be tossed. GOINGAWAY ensures
10252	 * that allocated pointers will be saved in the buffer
10253	 * cache until they are freed. Note that they will
10254	 * only be able to be found by their physical address
10255	 * since the inode mapping the logical address will
10256	 * be gone. The save buffer used for the safe copy
10257	 * was allocated in setup_allocindir_phase2 using
10258	 * the physical address so it could be used for this
10259	 * purpose. Hence we swap the safe copy with the real
10260	 * copy, allowing the safe copy to be freed and holding
10261	 * on to the real copy for later use in indir_trunc.
10262	 */
10263	if (indirdep->ir_state & GOINGAWAY)
10264		panic("cancel_indirdep: already gone");
10265	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10266		indirdep->ir_state |= DEPCOMPLETE;
10267		LIST_REMOVE(indirdep, ir_next);
10268	}
10269	indirdep->ir_state |= GOINGAWAY;
10270	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
10271	/*
10272	 * Pass in bp for blocks still have journal writes
10273	 * pending so we can cancel them on their own.
10274	 */
10275	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10276		cancel_allocindir(aip, bp, freeblks, 0);
10277	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10278		cancel_allocindir(aip, NULL, freeblks, 0);
10279	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10280		cancel_allocindir(aip, NULL, freeblks, 0);
10281	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10282		cancel_allocindir(aip, NULL, freeblks, 0);
10283	/*
10284	 * If there are pending partial truncations we need to keep the
10285	 * old block copy around until they complete.  This is because
10286	 * the current b_data is not a perfect superset of the available
10287	 * blocks.
10288	 */
10289	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10290		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10291	else
10292		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10293	WORKLIST_REMOVE(&indirdep->ir_list);
10294	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10295	indirdep->ir_bp = NULL;
10296	indirdep->ir_freeblks = freeblks;
10297}
10298
10299/*
10300 * Free an indirdep once it no longer has new pointers to track.
10301 */
10302static void
10303free_indirdep(indirdep)
10304	struct indirdep *indirdep;
10305{
10306
10307	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10308	    ("free_indirdep: Indir trunc list not empty."));
10309	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10310	    ("free_indirdep: Complete head not empty."));
10311	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10312	    ("free_indirdep: write head not empty."));
10313	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10314	    ("free_indirdep: done head not empty."));
10315	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10316	    ("free_indirdep: deplist head not empty."));
10317	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10318	    ("free_indirdep: %p still on newblk list.", indirdep));
10319	KASSERT(indirdep->ir_saveddata == NULL,
10320	    ("free_indirdep: %p still has saved data.", indirdep));
10321	if (indirdep->ir_state & ONWORKLIST)
10322		WORKLIST_REMOVE(&indirdep->ir_list);
10323	WORKITEM_FREE(indirdep, D_INDIRDEP);
10324}
10325
10326/*
10327 * Called before a write to an indirdep.  This routine is responsible for
10328 * rolling back pointers to a safe state which includes only those
10329 * allocindirs which have been completed.
10330 */
10331static void
10332initiate_write_indirdep(indirdep, bp)
10333	struct indirdep *indirdep;
10334	struct buf *bp;
10335{
10336
10337	indirdep->ir_state |= IOSTARTED;
10338	if (indirdep->ir_state & GOINGAWAY)
10339		panic("disk_io_initiation: indirdep gone");
10340	/*
10341	 * If there are no remaining dependencies, this will be writing
10342	 * the real pointers.
10343	 */
10344	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10345	    TAILQ_EMPTY(&indirdep->ir_trunc))
10346		return;
10347	/*
10348	 * Replace up-to-date version with safe version.
10349	 */
10350	if (indirdep->ir_saveddata == NULL) {
10351		FREE_LOCK(&lk);
10352		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10353		    M_SOFTDEP_FLAGS);
10354		ACQUIRE_LOCK(&lk);
10355	}
10356	indirdep->ir_state &= ~ATTACHED;
10357	indirdep->ir_state |= UNDONE;
10358	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10359	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10360	    bp->b_bcount);
10361}
10362
10363/*
10364 * Called when an inode has been cleared in a cg bitmap.  This finally
10365 * eliminates any canceled jaddrefs
10366 */
10367void
10368softdep_setup_inofree(mp, bp, ino, wkhd)
10369	struct mount *mp;
10370	struct buf *bp;
10371	ino_t ino;
10372	struct workhead *wkhd;
10373{
10374	struct worklist *wk, *wkn;
10375	struct inodedep *inodedep;
10376	uint8_t *inosused;
10377	struct cg *cgp;
10378	struct fs *fs;
10379
10380	ACQUIRE_LOCK(&lk);
10381	fs = VFSTOUFS(mp)->um_fs;
10382	cgp = (struct cg *)bp->b_data;
10383	inosused = cg_inosused(cgp);
10384	if (isset(inosused, ino % fs->fs_ipg))
10385		panic("softdep_setup_inofree: inode %ju not freed.",
10386		    (uintmax_t)ino);
10387	if (inodedep_lookup(mp, ino, 0, &inodedep))
10388		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10389		    (uintmax_t)ino, inodedep);
10390	if (wkhd) {
10391		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10392			if (wk->wk_type != D_JADDREF)
10393				continue;
10394			WORKLIST_REMOVE(wk);
10395			/*
10396			 * We can free immediately even if the jaddref
10397			 * isn't attached in a background write as now
10398			 * the bitmaps are reconciled.
10399		 	 */
10400			wk->wk_state |= COMPLETE | ATTACHED;
10401			free_jaddref(WK_JADDREF(wk));
10402		}
10403		jwork_move(&bp->b_dep, wkhd);
10404	}
10405	FREE_LOCK(&lk);
10406}
10407
10408
10409/*
10410 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10411 * map.  Any dependencies waiting for the write to clear are added to the
10412 * buf's list and any jnewblks that are being canceled are discarded
10413 * immediately.
10414 */
10415void
10416softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10417	struct mount *mp;
10418	struct buf *bp;
10419	ufs2_daddr_t blkno;
10420	int frags;
10421	struct workhead *wkhd;
10422{
10423	struct bmsafemap *bmsafemap;
10424	struct jnewblk *jnewblk;
10425	struct worklist *wk;
10426	struct fs *fs;
10427#ifdef SUJ_DEBUG
10428	uint8_t *blksfree;
10429	struct cg *cgp;
10430	ufs2_daddr_t jstart;
10431	ufs2_daddr_t jend;
10432	ufs2_daddr_t end;
10433	long bno;
10434	int i;
10435#endif
10436
10437	CTR3(KTR_SUJ,
10438	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10439	    blkno, frags, wkhd);
10440
10441	ACQUIRE_LOCK(&lk);
10442	/* Lookup the bmsafemap so we track when it is dirty. */
10443	fs = VFSTOUFS(mp)->um_fs;
10444	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10445	/*
10446	 * Detach any jnewblks which have been canceled.  They must linger
10447	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10448	 * an unjournaled allocation from hitting the disk.
10449	 */
10450	if (wkhd) {
10451		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10452			CTR2(KTR_SUJ,
10453			    "softdep_setup_blkfree: blkno %jd wk type %d",
10454			    blkno, wk->wk_type);
10455			WORKLIST_REMOVE(wk);
10456			if (wk->wk_type != D_JNEWBLK) {
10457				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10458				continue;
10459			}
10460			jnewblk = WK_JNEWBLK(wk);
10461			KASSERT(jnewblk->jn_state & GOINGAWAY,
10462			    ("softdep_setup_blkfree: jnewblk not canceled."));
10463#ifdef SUJ_DEBUG
10464			/*
10465			 * Assert that this block is free in the bitmap
10466			 * before we discard the jnewblk.
10467			 */
10468			cgp = (struct cg *)bp->b_data;
10469			blksfree = cg_blksfree(cgp);
10470			bno = dtogd(fs, jnewblk->jn_blkno);
10471			for (i = jnewblk->jn_oldfrags;
10472			    i < jnewblk->jn_frags; i++) {
10473				if (isset(blksfree, bno + i))
10474					continue;
10475				panic("softdep_setup_blkfree: not free");
10476			}
10477#endif
10478			/*
10479			 * Even if it's not attached we can free immediately
10480			 * as the new bitmap is correct.
10481			 */
10482			wk->wk_state |= COMPLETE | ATTACHED;
10483			free_jnewblk(jnewblk);
10484		}
10485	}
10486
10487#ifdef SUJ_DEBUG
10488	/*
10489	 * Assert that we are not freeing a block which has an outstanding
10490	 * allocation dependency.
10491	 */
10492	fs = VFSTOUFS(mp)->um_fs;
10493	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10494	end = blkno + frags;
10495	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10496		/*
10497		 * Don't match against blocks that will be freed when the
10498		 * background write is done.
10499		 */
10500		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10501		    (COMPLETE | DEPCOMPLETE))
10502			continue;
10503		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10504		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10505		if ((blkno >= jstart && blkno < jend) ||
10506		    (end > jstart && end <= jend)) {
10507			printf("state 0x%X %jd - %d %d dep %p\n",
10508			    jnewblk->jn_state, jnewblk->jn_blkno,
10509			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10510			    jnewblk->jn_dep);
10511			panic("softdep_setup_blkfree: "
10512			    "%jd-%jd(%d) overlaps with %jd-%jd",
10513			    blkno, end, frags, jstart, jend);
10514		}
10515	}
10516#endif
10517	FREE_LOCK(&lk);
10518}
10519
10520/*
10521 * Revert a block allocation when the journal record that describes it
10522 * is not yet written.
10523 */
10524int
10525jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10526	struct jnewblk *jnewblk;
10527	struct fs *fs;
10528	struct cg *cgp;
10529	uint8_t *blksfree;
10530{
10531	ufs1_daddr_t fragno;
10532	long cgbno, bbase;
10533	int frags, blk;
10534	int i;
10535
10536	frags = 0;
10537	cgbno = dtogd(fs, jnewblk->jn_blkno);
10538	/*
10539	 * We have to test which frags need to be rolled back.  We may
10540	 * be operating on a stale copy when doing background writes.
10541	 */
10542	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10543		if (isclr(blksfree, cgbno + i))
10544			frags++;
10545	if (frags == 0)
10546		return (0);
10547	/*
10548	 * This is mostly ffs_blkfree() sans some validation and
10549	 * superblock updates.
10550	 */
10551	if (frags == fs->fs_frag) {
10552		fragno = fragstoblks(fs, cgbno);
10553		ffs_setblock(fs, blksfree, fragno);
10554		ffs_clusteracct(fs, cgp, fragno, 1);
10555		cgp->cg_cs.cs_nbfree++;
10556	} else {
10557		cgbno += jnewblk->jn_oldfrags;
10558		bbase = cgbno - fragnum(fs, cgbno);
10559		/* Decrement the old frags.  */
10560		blk = blkmap(fs, blksfree, bbase);
10561		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10562		/* Deallocate the fragment */
10563		for (i = 0; i < frags; i++)
10564			setbit(blksfree, cgbno + i);
10565		cgp->cg_cs.cs_nffree += frags;
10566		/* Add back in counts associated with the new frags */
10567		blk = blkmap(fs, blksfree, bbase);
10568		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10569                /* If a complete block has been reassembled, account for it. */
10570		fragno = fragstoblks(fs, bbase);
10571		if (ffs_isblock(fs, blksfree, fragno)) {
10572			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10573			ffs_clusteracct(fs, cgp, fragno, 1);
10574			cgp->cg_cs.cs_nbfree++;
10575		}
10576	}
10577	stat_jnewblk++;
10578	jnewblk->jn_state &= ~ATTACHED;
10579	jnewblk->jn_state |= UNDONE;
10580
10581	return (frags);
10582}
10583
10584static void
10585initiate_write_bmsafemap(bmsafemap, bp)
10586	struct bmsafemap *bmsafemap;
10587	struct buf *bp;			/* The cg block. */
10588{
10589	struct jaddref *jaddref;
10590	struct jnewblk *jnewblk;
10591	uint8_t *inosused;
10592	uint8_t *blksfree;
10593	struct cg *cgp;
10594	struct fs *fs;
10595	ino_t ino;
10596
10597	if (bmsafemap->sm_state & IOSTARTED)
10598		return;
10599	bmsafemap->sm_state |= IOSTARTED;
10600	/*
10601	 * Clear any inode allocations which are pending journal writes.
10602	 */
10603	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10604		cgp = (struct cg *)bp->b_data;
10605		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10606		inosused = cg_inosused(cgp);
10607		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10608			ino = jaddref->ja_ino % fs->fs_ipg;
10609			if (isset(inosused, ino)) {
10610				if ((jaddref->ja_mode & IFMT) == IFDIR)
10611					cgp->cg_cs.cs_ndir--;
10612				cgp->cg_cs.cs_nifree++;
10613				clrbit(inosused, ino);
10614				jaddref->ja_state &= ~ATTACHED;
10615				jaddref->ja_state |= UNDONE;
10616				stat_jaddref++;
10617			} else
10618				panic("initiate_write_bmsafemap: inode %ju "
10619				    "marked free", (uintmax_t)jaddref->ja_ino);
10620		}
10621	}
10622	/*
10623	 * Clear any block allocations which are pending journal writes.
10624	 */
10625	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10626		cgp = (struct cg *)bp->b_data;
10627		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10628		blksfree = cg_blksfree(cgp);
10629		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10630			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10631				continue;
10632			panic("initiate_write_bmsafemap: block %jd "
10633			    "marked free", jnewblk->jn_blkno);
10634		}
10635	}
10636	/*
10637	 * Move allocation lists to the written lists so they can be
10638	 * cleared once the block write is complete.
10639	 */
10640	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10641	    inodedep, id_deps);
10642	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10643	    newblk, nb_deps);
10644	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10645	    wk_list);
10646}
10647
10648/*
10649 * This routine is called during the completion interrupt
10650 * service routine for a disk write (from the procedure called
10651 * by the device driver to inform the filesystem caches of
10652 * a request completion).  It should be called early in this
10653 * procedure, before the block is made available to other
10654 * processes or other routines are called.
10655 *
10656 */
10657static void
10658softdep_disk_write_complete(bp)
10659	struct buf *bp;		/* describes the completed disk write */
10660{
10661	struct worklist *wk;
10662	struct worklist *owk;
10663	struct workhead reattach;
10664	struct freeblks *freeblks;
10665	struct buf *sbp;
10666
10667	/*
10668	 * If an error occurred while doing the write, then the data
10669	 * has not hit the disk and the dependencies cannot be unrolled.
10670	 */
10671	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10672		return;
10673	LIST_INIT(&reattach);
10674	/*
10675	 * This lock must not be released anywhere in this code segment.
10676	 */
10677	sbp = NULL;
10678	owk = NULL;
10679	ACQUIRE_LOCK(&lk);
10680	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10681		WORKLIST_REMOVE(wk);
10682		dep_write[wk->wk_type]++;
10683		if (wk == owk)
10684			panic("duplicate worklist: %p\n", wk);
10685		owk = wk;
10686		switch (wk->wk_type) {
10687
10688		case D_PAGEDEP:
10689			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10690				WORKLIST_INSERT(&reattach, wk);
10691			continue;
10692
10693		case D_INODEDEP:
10694			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10695				WORKLIST_INSERT(&reattach, wk);
10696			continue;
10697
10698		case D_BMSAFEMAP:
10699			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10700				WORKLIST_INSERT(&reattach, wk);
10701			continue;
10702
10703		case D_MKDIR:
10704			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10705			continue;
10706
10707		case D_ALLOCDIRECT:
10708			wk->wk_state |= COMPLETE;
10709			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10710			continue;
10711
10712		case D_ALLOCINDIR:
10713			wk->wk_state |= COMPLETE;
10714			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
10715			continue;
10716
10717		case D_INDIRDEP:
10718			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
10719				WORKLIST_INSERT(&reattach, wk);
10720			continue;
10721
10722		case D_FREEBLKS:
10723			wk->wk_state |= COMPLETE;
10724			freeblks = WK_FREEBLKS(wk);
10725			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
10726			    LIST_EMPTY(&freeblks->fb_jblkdephd))
10727				add_to_worklist(wk, WK_NODELAY);
10728			continue;
10729
10730		case D_FREEWORK:
10731			handle_written_freework(WK_FREEWORK(wk));
10732			break;
10733
10734		case D_JSEGDEP:
10735			free_jsegdep(WK_JSEGDEP(wk));
10736			continue;
10737
10738		case D_JSEG:
10739			handle_written_jseg(WK_JSEG(wk), bp);
10740			continue;
10741
10742		case D_SBDEP:
10743			if (handle_written_sbdep(WK_SBDEP(wk), bp))
10744				WORKLIST_INSERT(&reattach, wk);
10745			continue;
10746
10747		case D_FREEDEP:
10748			free_freedep(WK_FREEDEP(wk));
10749			continue;
10750
10751		default:
10752			panic("handle_disk_write_complete: Unknown type %s",
10753			    TYPENAME(wk->wk_type));
10754			/* NOTREACHED */
10755		}
10756	}
10757	/*
10758	 * Reattach any requests that must be redone.
10759	 */
10760	while ((wk = LIST_FIRST(&reattach)) != NULL) {
10761		WORKLIST_REMOVE(wk);
10762		WORKLIST_INSERT(&bp->b_dep, wk);
10763	}
10764	FREE_LOCK(&lk);
10765	if (sbp)
10766		brelse(sbp);
10767}
10768
10769/*
10770 * Called from within softdep_disk_write_complete above. Note that
10771 * this routine is always called from interrupt level with further
10772 * splbio interrupts blocked.
10773 */
10774static void
10775handle_allocdirect_partdone(adp, wkhd)
10776	struct allocdirect *adp;	/* the completed allocdirect */
10777	struct workhead *wkhd;		/* Work to do when inode is writtne. */
10778{
10779	struct allocdirectlst *listhead;
10780	struct allocdirect *listadp;
10781	struct inodedep *inodedep;
10782	long bsize;
10783
10784	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10785		return;
10786	/*
10787	 * The on-disk inode cannot claim to be any larger than the last
10788	 * fragment that has been written. Otherwise, the on-disk inode
10789	 * might have fragments that were not the last block in the file
10790	 * which would corrupt the filesystem. Thus, we cannot free any
10791	 * allocdirects after one whose ad_oldblkno claims a fragment as
10792	 * these blocks must be rolled back to zero before writing the inode.
10793	 * We check the currently active set of allocdirects in id_inoupdt
10794	 * or id_extupdt as appropriate.
10795	 */
10796	inodedep = adp->ad_inodedep;
10797	bsize = inodedep->id_fs->fs_bsize;
10798	if (adp->ad_state & EXTDATA)
10799		listhead = &inodedep->id_extupdt;
10800	else
10801		listhead = &inodedep->id_inoupdt;
10802	TAILQ_FOREACH(listadp, listhead, ad_next) {
10803		/* found our block */
10804		if (listadp == adp)
10805			break;
10806		/* continue if ad_oldlbn is not a fragment */
10807		if (listadp->ad_oldsize == 0 ||
10808		    listadp->ad_oldsize == bsize)
10809			continue;
10810		/* hit a fragment */
10811		return;
10812	}
10813	/*
10814	 * If we have reached the end of the current list without
10815	 * finding the just finished dependency, then it must be
10816	 * on the future dependency list. Future dependencies cannot
10817	 * be freed until they are moved to the current list.
10818	 */
10819	if (listadp == NULL) {
10820#ifdef DEBUG
10821		if (adp->ad_state & EXTDATA)
10822			listhead = &inodedep->id_newextupdt;
10823		else
10824			listhead = &inodedep->id_newinoupdt;
10825		TAILQ_FOREACH(listadp, listhead, ad_next)
10826			/* found our block */
10827			if (listadp == adp)
10828				break;
10829		if (listadp == NULL)
10830			panic("handle_allocdirect_partdone: lost dep");
10831#endif /* DEBUG */
10832		return;
10833	}
10834	/*
10835	 * If we have found the just finished dependency, then queue
10836	 * it along with anything that follows it that is complete.
10837	 * Since the pointer has not yet been written in the inode
10838	 * as the dependency prevents it, place the allocdirect on the
10839	 * bufwait list where it will be freed once the pointer is
10840	 * valid.
10841	 */
10842	if (wkhd == NULL)
10843		wkhd = &inodedep->id_bufwait;
10844	for (; adp; adp = listadp) {
10845		listadp = TAILQ_NEXT(adp, ad_next);
10846		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10847			return;
10848		TAILQ_REMOVE(listhead, adp, ad_next);
10849		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
10850	}
10851}
10852
10853/*
10854 * Called from within softdep_disk_write_complete above.  This routine
10855 * completes successfully written allocindirs.
10856 */
10857static void
10858handle_allocindir_partdone(aip)
10859	struct allocindir *aip;		/* the completed allocindir */
10860{
10861	struct indirdep *indirdep;
10862
10863	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
10864		return;
10865	indirdep = aip->ai_indirdep;
10866	LIST_REMOVE(aip, ai_next);
10867	/*
10868	 * Don't set a pointer while the buffer is undergoing IO or while
10869	 * we have active truncations.
10870	 */
10871	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
10872		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
10873		return;
10874	}
10875	if (indirdep->ir_state & UFS1FMT)
10876		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10877		    aip->ai_newblkno;
10878	else
10879		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10880		    aip->ai_newblkno;
10881	/*
10882	 * Await the pointer write before freeing the allocindir.
10883	 */
10884	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
10885}
10886
10887/*
10888 * Release segments held on a jwork list.
10889 */
10890static void
10891handle_jwork(wkhd)
10892	struct workhead *wkhd;
10893{
10894	struct worklist *wk;
10895
10896	while ((wk = LIST_FIRST(wkhd)) != NULL) {
10897		WORKLIST_REMOVE(wk);
10898		switch (wk->wk_type) {
10899		case D_JSEGDEP:
10900			free_jsegdep(WK_JSEGDEP(wk));
10901			continue;
10902		case D_FREEDEP:
10903			free_freedep(WK_FREEDEP(wk));
10904			continue;
10905		case D_FREEFRAG:
10906			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
10907			WORKITEM_FREE(wk, D_FREEFRAG);
10908			continue;
10909		case D_FREEWORK:
10910			handle_written_freework(WK_FREEWORK(wk));
10911			continue;
10912		default:
10913			panic("handle_jwork: Unknown type %s\n",
10914			    TYPENAME(wk->wk_type));
10915		}
10916	}
10917}
10918
10919/*
10920 * Handle the bufwait list on an inode when it is safe to release items
10921 * held there.  This normally happens after an inode block is written but
10922 * may be delayed and handled later if there are pending journal items that
10923 * are not yet safe to be released.
10924 */
10925static struct freefile *
10926handle_bufwait(inodedep, refhd)
10927	struct inodedep *inodedep;
10928	struct workhead *refhd;
10929{
10930	struct jaddref *jaddref;
10931	struct freefile *freefile;
10932	struct worklist *wk;
10933
10934	freefile = NULL;
10935	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
10936		WORKLIST_REMOVE(wk);
10937		switch (wk->wk_type) {
10938		case D_FREEFILE:
10939			/*
10940			 * We defer adding freefile to the worklist
10941			 * until all other additions have been made to
10942			 * ensure that it will be done after all the
10943			 * old blocks have been freed.
10944			 */
10945			if (freefile != NULL)
10946				panic("handle_bufwait: freefile");
10947			freefile = WK_FREEFILE(wk);
10948			continue;
10949
10950		case D_MKDIR:
10951			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
10952			continue;
10953
10954		case D_DIRADD:
10955			diradd_inode_written(WK_DIRADD(wk), inodedep);
10956			continue;
10957
10958		case D_FREEFRAG:
10959			wk->wk_state |= COMPLETE;
10960			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
10961				add_to_worklist(wk, 0);
10962			continue;
10963
10964		case D_DIRREM:
10965			wk->wk_state |= COMPLETE;
10966			add_to_worklist(wk, 0);
10967			continue;
10968
10969		case D_ALLOCDIRECT:
10970		case D_ALLOCINDIR:
10971			free_newblk(WK_NEWBLK(wk));
10972			continue;
10973
10974		case D_JNEWBLK:
10975			wk->wk_state |= COMPLETE;
10976			free_jnewblk(WK_JNEWBLK(wk));
10977			continue;
10978
10979		/*
10980		 * Save freed journal segments and add references on
10981		 * the supplied list which will delay their release
10982		 * until the cg bitmap is cleared on disk.
10983		 */
10984		case D_JSEGDEP:
10985			if (refhd == NULL)
10986				free_jsegdep(WK_JSEGDEP(wk));
10987			else
10988				WORKLIST_INSERT(refhd, wk);
10989			continue;
10990
10991		case D_JADDREF:
10992			jaddref = WK_JADDREF(wk);
10993			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
10994			    if_deps);
10995			/*
10996			 * Transfer any jaddrefs to the list to be freed with
10997			 * the bitmap if we're handling a removed file.
10998			 */
10999			if (refhd == NULL) {
11000				wk->wk_state |= COMPLETE;
11001				free_jaddref(jaddref);
11002			} else
11003				WORKLIST_INSERT(refhd, wk);
11004			continue;
11005
11006		default:
11007			panic("handle_bufwait: Unknown type %p(%s)",
11008			    wk, TYPENAME(wk->wk_type));
11009			/* NOTREACHED */
11010		}
11011	}
11012	return (freefile);
11013}
11014/*
11015 * Called from within softdep_disk_write_complete above to restore
11016 * in-memory inode block contents to their most up-to-date state. Note
11017 * that this routine is always called from interrupt level with further
11018 * splbio interrupts blocked.
11019 */
11020static int
11021handle_written_inodeblock(inodedep, bp)
11022	struct inodedep *inodedep;
11023	struct buf *bp;		/* buffer containing the inode block */
11024{
11025	struct freefile *freefile;
11026	struct allocdirect *adp, *nextadp;
11027	struct ufs1_dinode *dp1 = NULL;
11028	struct ufs2_dinode *dp2 = NULL;
11029	struct workhead wkhd;
11030	int hadchanges, fstype;
11031	ino_t freelink;
11032
11033	LIST_INIT(&wkhd);
11034	hadchanges = 0;
11035	freefile = NULL;
11036	if ((inodedep->id_state & IOSTARTED) == 0)
11037		panic("handle_written_inodeblock: not started");
11038	inodedep->id_state &= ~IOSTARTED;
11039	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11040		fstype = UFS1;
11041		dp1 = (struct ufs1_dinode *)bp->b_data +
11042		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11043		freelink = dp1->di_freelink;
11044	} else {
11045		fstype = UFS2;
11046		dp2 = (struct ufs2_dinode *)bp->b_data +
11047		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11048		freelink = dp2->di_freelink;
11049	}
11050	/*
11051	 * Leave this inodeblock dirty until it's in the list.
11052	 */
11053	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
11054		struct inodedep *inon;
11055
11056		inon = TAILQ_NEXT(inodedep, id_unlinked);
11057		if ((inon == NULL && freelink == 0) ||
11058		    (inon && inon->id_ino == freelink)) {
11059			if (inon)
11060				inon->id_state |= UNLINKPREV;
11061			inodedep->id_state |= UNLINKNEXT;
11062		}
11063		hadchanges = 1;
11064	}
11065	/*
11066	 * If we had to rollback the inode allocation because of
11067	 * bitmaps being incomplete, then simply restore it.
11068	 * Keep the block dirty so that it will not be reclaimed until
11069	 * all associated dependencies have been cleared and the
11070	 * corresponding updates written to disk.
11071	 */
11072	if (inodedep->id_savedino1 != NULL) {
11073		hadchanges = 1;
11074		if (fstype == UFS1)
11075			*dp1 = *inodedep->id_savedino1;
11076		else
11077			*dp2 = *inodedep->id_savedino2;
11078		free(inodedep->id_savedino1, M_SAVEDINO);
11079		inodedep->id_savedino1 = NULL;
11080		if ((bp->b_flags & B_DELWRI) == 0)
11081			stat_inode_bitmap++;
11082		bdirty(bp);
11083		/*
11084		 * If the inode is clear here and GOINGAWAY it will never
11085		 * be written.  Process the bufwait and clear any pending
11086		 * work which may include the freefile.
11087		 */
11088		if (inodedep->id_state & GOINGAWAY)
11089			goto bufwait;
11090		return (1);
11091	}
11092	inodedep->id_state |= COMPLETE;
11093	/*
11094	 * Roll forward anything that had to be rolled back before
11095	 * the inode could be updated.
11096	 */
11097	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11098		nextadp = TAILQ_NEXT(adp, ad_next);
11099		if (adp->ad_state & ATTACHED)
11100			panic("handle_written_inodeblock: new entry");
11101		if (fstype == UFS1) {
11102			if (adp->ad_offset < NDADDR) {
11103				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11104					panic("%s %s #%jd mismatch %d != %jd",
11105					    "handle_written_inodeblock:",
11106					    "direct pointer",
11107					    (intmax_t)adp->ad_offset,
11108					    dp1->di_db[adp->ad_offset],
11109					    (intmax_t)adp->ad_oldblkno);
11110				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11111			} else {
11112				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11113					panic("%s: %s #%jd allocated as %d",
11114					    "handle_written_inodeblock",
11115					    "indirect pointer",
11116					    (intmax_t)adp->ad_offset - NDADDR,
11117					    dp1->di_ib[adp->ad_offset - NDADDR]);
11118				dp1->di_ib[adp->ad_offset - NDADDR] =
11119				    adp->ad_newblkno;
11120			}
11121		} else {
11122			if (adp->ad_offset < NDADDR) {
11123				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11124					panic("%s: %s #%jd %s %jd != %jd",
11125					    "handle_written_inodeblock",
11126					    "direct pointer",
11127					    (intmax_t)adp->ad_offset, "mismatch",
11128					    (intmax_t)dp2->di_db[adp->ad_offset],
11129					    (intmax_t)adp->ad_oldblkno);
11130				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11131			} else {
11132				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11133					panic("%s: %s #%jd allocated as %jd",
11134					    "handle_written_inodeblock",
11135					    "indirect pointer",
11136					    (intmax_t)adp->ad_offset - NDADDR,
11137					    (intmax_t)
11138					    dp2->di_ib[adp->ad_offset - NDADDR]);
11139				dp2->di_ib[adp->ad_offset - NDADDR] =
11140				    adp->ad_newblkno;
11141			}
11142		}
11143		adp->ad_state &= ~UNDONE;
11144		adp->ad_state |= ATTACHED;
11145		hadchanges = 1;
11146	}
11147	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11148		nextadp = TAILQ_NEXT(adp, ad_next);
11149		if (adp->ad_state & ATTACHED)
11150			panic("handle_written_inodeblock: new entry");
11151		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11152			panic("%s: direct pointers #%jd %s %jd != %jd",
11153			    "handle_written_inodeblock",
11154			    (intmax_t)adp->ad_offset, "mismatch",
11155			    (intmax_t)dp2->di_extb[adp->ad_offset],
11156			    (intmax_t)adp->ad_oldblkno);
11157		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11158		adp->ad_state &= ~UNDONE;
11159		adp->ad_state |= ATTACHED;
11160		hadchanges = 1;
11161	}
11162	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11163		stat_direct_blk_ptrs++;
11164	/*
11165	 * Reset the file size to its most up-to-date value.
11166	 */
11167	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11168		panic("handle_written_inodeblock: bad size");
11169	if (inodedep->id_savednlink > LINK_MAX)
11170		panic("handle_written_inodeblock: Invalid link count "
11171		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11172	if (fstype == UFS1) {
11173		if (dp1->di_nlink != inodedep->id_savednlink) {
11174			dp1->di_nlink = inodedep->id_savednlink;
11175			hadchanges = 1;
11176		}
11177		if (dp1->di_size != inodedep->id_savedsize) {
11178			dp1->di_size = inodedep->id_savedsize;
11179			hadchanges = 1;
11180		}
11181	} else {
11182		if (dp2->di_nlink != inodedep->id_savednlink) {
11183			dp2->di_nlink = inodedep->id_savednlink;
11184			hadchanges = 1;
11185		}
11186		if (dp2->di_size != inodedep->id_savedsize) {
11187			dp2->di_size = inodedep->id_savedsize;
11188			hadchanges = 1;
11189		}
11190		if (dp2->di_extsize != inodedep->id_savedextsize) {
11191			dp2->di_extsize = inodedep->id_savedextsize;
11192			hadchanges = 1;
11193		}
11194	}
11195	inodedep->id_savedsize = -1;
11196	inodedep->id_savedextsize = -1;
11197	inodedep->id_savednlink = -1;
11198	/*
11199	 * If there were any rollbacks in the inode block, then it must be
11200	 * marked dirty so that its will eventually get written back in
11201	 * its correct form.
11202	 */
11203	if (hadchanges)
11204		bdirty(bp);
11205bufwait:
11206	/*
11207	 * Process any allocdirects that completed during the update.
11208	 */
11209	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11210		handle_allocdirect_partdone(adp, &wkhd);
11211	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11212		handle_allocdirect_partdone(adp, &wkhd);
11213	/*
11214	 * Process deallocations that were held pending until the
11215	 * inode had been written to disk. Freeing of the inode
11216	 * is delayed until after all blocks have been freed to
11217	 * avoid creation of new <vfsid, inum, lbn> triples
11218	 * before the old ones have been deleted.  Completely
11219	 * unlinked inodes are not processed until the unlinked
11220	 * inode list is written or the last reference is removed.
11221	 */
11222	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11223		freefile = handle_bufwait(inodedep, NULL);
11224		if (freefile && !LIST_EMPTY(&wkhd)) {
11225			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11226			freefile = NULL;
11227		}
11228	}
11229	/*
11230	 * Move rolled forward dependency completions to the bufwait list
11231	 * now that those that were already written have been processed.
11232	 */
11233	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11234		panic("handle_written_inodeblock: bufwait but no changes");
11235	jwork_move(&inodedep->id_bufwait, &wkhd);
11236
11237	if (freefile != NULL) {
11238		/*
11239		 * If the inode is goingaway it was never written.  Fake up
11240		 * the state here so free_inodedep() can succeed.
11241		 */
11242		if (inodedep->id_state & GOINGAWAY)
11243			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11244		if (free_inodedep(inodedep) == 0)
11245			panic("handle_written_inodeblock: live inodedep %p",
11246			    inodedep);
11247		add_to_worklist(&freefile->fx_list, 0);
11248		return (0);
11249	}
11250
11251	/*
11252	 * If no outstanding dependencies, free it.
11253	 */
11254	if (free_inodedep(inodedep) ||
11255	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11256	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11257	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11258	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11259		return (0);
11260	return (hadchanges);
11261}
11262
11263static int
11264handle_written_indirdep(indirdep, bp, bpp)
11265	struct indirdep *indirdep;
11266	struct buf *bp;
11267	struct buf **bpp;
11268{
11269	struct allocindir *aip;
11270	struct buf *sbp;
11271	int chgs;
11272
11273	if (indirdep->ir_state & GOINGAWAY)
11274		panic("handle_written_indirdep: indirdep gone");
11275	if ((indirdep->ir_state & IOSTARTED) == 0)
11276		panic("handle_written_indirdep: IO not started");
11277	chgs = 0;
11278	/*
11279	 * If there were rollbacks revert them here.
11280	 */
11281	if (indirdep->ir_saveddata) {
11282		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11283		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11284			free(indirdep->ir_saveddata, M_INDIRDEP);
11285			indirdep->ir_saveddata = NULL;
11286		}
11287		chgs = 1;
11288	}
11289	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11290	indirdep->ir_state |= ATTACHED;
11291	/*
11292	 * Move allocindirs with written pointers to the completehd if
11293	 * the indirdep's pointer is not yet written.  Otherwise
11294	 * free them here.
11295	 */
11296	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11297		LIST_REMOVE(aip, ai_next);
11298		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11299			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11300			    ai_next);
11301			newblk_freefrag(&aip->ai_block);
11302			continue;
11303		}
11304		free_newblk(&aip->ai_block);
11305	}
11306	/*
11307	 * Move allocindirs that have finished dependency processing from
11308	 * the done list to the write list after updating the pointers.
11309	 */
11310	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11311		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11312			handle_allocindir_partdone(aip);
11313			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11314				panic("disk_write_complete: not gone");
11315			chgs = 1;
11316		}
11317	}
11318	/*
11319	 * Preserve the indirdep if there were any changes or if it is not
11320	 * yet valid on disk.
11321	 */
11322	if (chgs) {
11323		stat_indir_blk_ptrs++;
11324		bdirty(bp);
11325		return (1);
11326	}
11327	/*
11328	 * If there were no changes we can discard the savedbp and detach
11329	 * ourselves from the buf.  We are only carrying completed pointers
11330	 * in this case.
11331	 */
11332	sbp = indirdep->ir_savebp;
11333	sbp->b_flags |= B_INVAL | B_NOCACHE;
11334	indirdep->ir_savebp = NULL;
11335	indirdep->ir_bp = NULL;
11336	if (*bpp != NULL)
11337		panic("handle_written_indirdep: bp already exists.");
11338	*bpp = sbp;
11339	/*
11340	 * The indirdep may not be freed until its parent points at it.
11341	 */
11342	if (indirdep->ir_state & DEPCOMPLETE)
11343		free_indirdep(indirdep);
11344
11345	return (0);
11346}
11347
11348/*
11349 * Process a diradd entry after its dependent inode has been written.
11350 * This routine must be called with splbio interrupts blocked.
11351 */
11352static void
11353diradd_inode_written(dap, inodedep)
11354	struct diradd *dap;
11355	struct inodedep *inodedep;
11356{
11357
11358	dap->da_state |= COMPLETE;
11359	complete_diradd(dap);
11360	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11361}
11362
11363/*
11364 * Returns true if the bmsafemap will have rollbacks when written.  Must
11365 * only be called with lk and the buf lock on the cg held.
11366 */
11367static int
11368bmsafemap_backgroundwrite(bmsafemap, bp)
11369	struct bmsafemap *bmsafemap;
11370	struct buf *bp;
11371{
11372	int dirty;
11373
11374	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11375	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11376	/*
11377	 * If we're initiating a background write we need to process the
11378	 * rollbacks as they exist now, not as they exist when IO starts.
11379	 * No other consumers will look at the contents of the shadowed
11380	 * buf so this is safe to do here.
11381	 */
11382	if (bp->b_xflags & BX_BKGRDMARKER)
11383		initiate_write_bmsafemap(bmsafemap, bp);
11384
11385	return (dirty);
11386}
11387
11388/*
11389 * Re-apply an allocation when a cg write is complete.
11390 */
11391static int
11392jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11393	struct jnewblk *jnewblk;
11394	struct fs *fs;
11395	struct cg *cgp;
11396	uint8_t *blksfree;
11397{
11398	ufs1_daddr_t fragno;
11399	ufs2_daddr_t blkno;
11400	long cgbno, bbase;
11401	int frags, blk;
11402	int i;
11403
11404	frags = 0;
11405	cgbno = dtogd(fs, jnewblk->jn_blkno);
11406	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11407		if (isclr(blksfree, cgbno + i))
11408			panic("jnewblk_rollforward: re-allocated fragment");
11409		frags++;
11410	}
11411	if (frags == fs->fs_frag) {
11412		blkno = fragstoblks(fs, cgbno);
11413		ffs_clrblock(fs, blksfree, (long)blkno);
11414		ffs_clusteracct(fs, cgp, blkno, -1);
11415		cgp->cg_cs.cs_nbfree--;
11416	} else {
11417		bbase = cgbno - fragnum(fs, cgbno);
11418		cgbno += jnewblk->jn_oldfrags;
11419                /* If a complete block had been reassembled, account for it. */
11420		fragno = fragstoblks(fs, bbase);
11421		if (ffs_isblock(fs, blksfree, fragno)) {
11422			cgp->cg_cs.cs_nffree += fs->fs_frag;
11423			ffs_clusteracct(fs, cgp, fragno, -1);
11424			cgp->cg_cs.cs_nbfree--;
11425		}
11426		/* Decrement the old frags.  */
11427		blk = blkmap(fs, blksfree, bbase);
11428		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11429		/* Allocate the fragment */
11430		for (i = 0; i < frags; i++)
11431			clrbit(blksfree, cgbno + i);
11432		cgp->cg_cs.cs_nffree -= frags;
11433		/* Add back in counts associated with the new frags */
11434		blk = blkmap(fs, blksfree, bbase);
11435		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11436	}
11437	return (frags);
11438}
11439
11440/*
11441 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11442 * changes if it's not a background write.  Set all written dependencies
11443 * to DEPCOMPLETE and free the structure if possible.
11444 */
11445static int
11446handle_written_bmsafemap(bmsafemap, bp)
11447	struct bmsafemap *bmsafemap;
11448	struct buf *bp;
11449{
11450	struct newblk *newblk;
11451	struct inodedep *inodedep;
11452	struct jaddref *jaddref, *jatmp;
11453	struct jnewblk *jnewblk, *jntmp;
11454	struct ufsmount *ump;
11455	uint8_t *inosused;
11456	uint8_t *blksfree;
11457	struct cg *cgp;
11458	struct fs *fs;
11459	ino_t ino;
11460	int foreground;
11461	int chgs;
11462
11463	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11464		panic("initiate_write_bmsafemap: Not started\n");
11465	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11466	chgs = 0;
11467	bmsafemap->sm_state &= ~IOSTARTED;
11468	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11469	/*
11470	 * Release journal work that was waiting on the write.
11471	 */
11472	handle_jwork(&bmsafemap->sm_freewr);
11473
11474	/*
11475	 * Restore unwritten inode allocation pending jaddref writes.
11476	 */
11477	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11478		cgp = (struct cg *)bp->b_data;
11479		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11480		inosused = cg_inosused(cgp);
11481		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11482		    ja_bmdeps, jatmp) {
11483			if ((jaddref->ja_state & UNDONE) == 0)
11484				continue;
11485			ino = jaddref->ja_ino % fs->fs_ipg;
11486			if (isset(inosused, ino))
11487				panic("handle_written_bmsafemap: "
11488				    "re-allocated inode");
11489			/* Do the roll-forward only if it's a real copy. */
11490			if (foreground) {
11491				if ((jaddref->ja_mode & IFMT) == IFDIR)
11492					cgp->cg_cs.cs_ndir++;
11493				cgp->cg_cs.cs_nifree--;
11494				setbit(inosused, ino);
11495				chgs = 1;
11496			}
11497			jaddref->ja_state &= ~UNDONE;
11498			jaddref->ja_state |= ATTACHED;
11499			free_jaddref(jaddref);
11500		}
11501	}
11502	/*
11503	 * Restore any block allocations which are pending journal writes.
11504	 */
11505	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11506		cgp = (struct cg *)bp->b_data;
11507		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11508		blksfree = cg_blksfree(cgp);
11509		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11510		    jntmp) {
11511			if ((jnewblk->jn_state & UNDONE) == 0)
11512				continue;
11513			/* Do the roll-forward only if it's a real copy. */
11514			if (foreground &&
11515			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11516				chgs = 1;
11517			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11518			jnewblk->jn_state |= ATTACHED;
11519			free_jnewblk(jnewblk);
11520		}
11521	}
11522	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11523		newblk->nb_state |= DEPCOMPLETE;
11524		newblk->nb_state &= ~ONDEPLIST;
11525		newblk->nb_bmsafemap = NULL;
11526		LIST_REMOVE(newblk, nb_deps);
11527		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11528			handle_allocdirect_partdone(
11529			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11530		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11531			handle_allocindir_partdone(
11532			    WK_ALLOCINDIR(&newblk->nb_list));
11533		else if (newblk->nb_list.wk_type != D_NEWBLK)
11534			panic("handle_written_bmsafemap: Unexpected type: %s",
11535			    TYPENAME(newblk->nb_list.wk_type));
11536	}
11537	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11538		inodedep->id_state |= DEPCOMPLETE;
11539		inodedep->id_state &= ~ONDEPLIST;
11540		LIST_REMOVE(inodedep, id_deps);
11541		inodedep->id_bmsafemap = NULL;
11542	}
11543	LIST_REMOVE(bmsafemap, sm_next);
11544	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11545	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11546	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11547	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11548	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11549		LIST_REMOVE(bmsafemap, sm_hash);
11550		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11551		return (0);
11552	}
11553	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11554	if (foreground)
11555		bdirty(bp);
11556	return (1);
11557}
11558
11559/*
11560 * Try to free a mkdir dependency.
11561 */
11562static void
11563complete_mkdir(mkdir)
11564	struct mkdir *mkdir;
11565{
11566	struct diradd *dap;
11567
11568	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11569		return;
11570	LIST_REMOVE(mkdir, md_mkdirs);
11571	dap = mkdir->md_diradd;
11572	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11573	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11574		dap->da_state |= DEPCOMPLETE;
11575		complete_diradd(dap);
11576	}
11577	WORKITEM_FREE(mkdir, D_MKDIR);
11578}
11579
11580/*
11581 * Handle the completion of a mkdir dependency.
11582 */
11583static void
11584handle_written_mkdir(mkdir, type)
11585	struct mkdir *mkdir;
11586	int type;
11587{
11588
11589	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11590		panic("handle_written_mkdir: bad type");
11591	mkdir->md_state |= COMPLETE;
11592	complete_mkdir(mkdir);
11593}
11594
11595static int
11596free_pagedep(pagedep)
11597	struct pagedep *pagedep;
11598{
11599	int i;
11600
11601	if (pagedep->pd_state & NEWBLOCK)
11602		return (0);
11603	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11604		return (0);
11605	for (i = 0; i < DAHASHSZ; i++)
11606		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11607			return (0);
11608	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11609		return (0);
11610	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11611		return (0);
11612	if (pagedep->pd_state & ONWORKLIST)
11613		WORKLIST_REMOVE(&pagedep->pd_list);
11614	LIST_REMOVE(pagedep, pd_hash);
11615	WORKITEM_FREE(pagedep, D_PAGEDEP);
11616
11617	return (1);
11618}
11619
11620/*
11621 * Called from within softdep_disk_write_complete above.
11622 * A write operation was just completed. Removed inodes can
11623 * now be freed and associated block pointers may be committed.
11624 * Note that this routine is always called from interrupt level
11625 * with further splbio interrupts blocked.
11626 */
11627static int
11628handle_written_filepage(pagedep, bp)
11629	struct pagedep *pagedep;
11630	struct buf *bp;		/* buffer containing the written page */
11631{
11632	struct dirrem *dirrem;
11633	struct diradd *dap, *nextdap;
11634	struct direct *ep;
11635	int i, chgs;
11636
11637	if ((pagedep->pd_state & IOSTARTED) == 0)
11638		panic("handle_written_filepage: not started");
11639	pagedep->pd_state &= ~IOSTARTED;
11640	/*
11641	 * Process any directory removals that have been committed.
11642	 */
11643	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11644		LIST_REMOVE(dirrem, dm_next);
11645		dirrem->dm_state |= COMPLETE;
11646		dirrem->dm_dirinum = pagedep->pd_ino;
11647		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11648		    ("handle_written_filepage: Journal entries not written."));
11649		add_to_worklist(&dirrem->dm_list, 0);
11650	}
11651	/*
11652	 * Free any directory additions that have been committed.
11653	 * If it is a newly allocated block, we have to wait until
11654	 * the on-disk directory inode claims the new block.
11655	 */
11656	if ((pagedep->pd_state & NEWBLOCK) == 0)
11657		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11658			free_diradd(dap, NULL);
11659	/*
11660	 * Uncommitted directory entries must be restored.
11661	 */
11662	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11663		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11664		     dap = nextdap) {
11665			nextdap = LIST_NEXT(dap, da_pdlist);
11666			if (dap->da_state & ATTACHED)
11667				panic("handle_written_filepage: attached");
11668			ep = (struct direct *)
11669			    ((char *)bp->b_data + dap->da_offset);
11670			ep->d_ino = dap->da_newinum;
11671			dap->da_state &= ~UNDONE;
11672			dap->da_state |= ATTACHED;
11673			chgs = 1;
11674			/*
11675			 * If the inode referenced by the directory has
11676			 * been written out, then the dependency can be
11677			 * moved to the pending list.
11678			 */
11679			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11680				LIST_REMOVE(dap, da_pdlist);
11681				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11682				    da_pdlist);
11683			}
11684		}
11685	}
11686	/*
11687	 * If there were any rollbacks in the directory, then it must be
11688	 * marked dirty so that its will eventually get written back in
11689	 * its correct form.
11690	 */
11691	if (chgs) {
11692		if ((bp->b_flags & B_DELWRI) == 0)
11693			stat_dir_entry++;
11694		bdirty(bp);
11695		return (1);
11696	}
11697	/*
11698	 * If we are not waiting for a new directory block to be
11699	 * claimed by its inode, then the pagedep will be freed.
11700	 * Otherwise it will remain to track any new entries on
11701	 * the page in case they are fsync'ed.
11702	 */
11703	free_pagedep(pagedep);
11704	return (0);
11705}
11706
11707/*
11708 * Writing back in-core inode structures.
11709 *
11710 * The filesystem only accesses an inode's contents when it occupies an
11711 * "in-core" inode structure.  These "in-core" structures are separate from
11712 * the page frames used to cache inode blocks.  Only the latter are
11713 * transferred to/from the disk.  So, when the updated contents of the
11714 * "in-core" inode structure are copied to the corresponding in-memory inode
11715 * block, the dependencies are also transferred.  The following procedure is
11716 * called when copying a dirty "in-core" inode to a cached inode block.
11717 */
11718
11719/*
11720 * Called when an inode is loaded from disk. If the effective link count
11721 * differed from the actual link count when it was last flushed, then we
11722 * need to ensure that the correct effective link count is put back.
11723 */
11724void
11725softdep_load_inodeblock(ip)
11726	struct inode *ip;	/* the "in_core" copy of the inode */
11727{
11728	struct inodedep *inodedep;
11729
11730	/*
11731	 * Check for alternate nlink count.
11732	 */
11733	ip->i_effnlink = ip->i_nlink;
11734	ACQUIRE_LOCK(&lk);
11735	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
11736	    &inodedep) == 0) {
11737		FREE_LOCK(&lk);
11738		return;
11739	}
11740	ip->i_effnlink -= inodedep->id_nlinkdelta;
11741	FREE_LOCK(&lk);
11742}
11743
11744/*
11745 * This routine is called just before the "in-core" inode
11746 * information is to be copied to the in-memory inode block.
11747 * Recall that an inode block contains several inodes. If
11748 * the force flag is set, then the dependencies will be
11749 * cleared so that the update can always be made. Note that
11750 * the buffer is locked when this routine is called, so we
11751 * will never be in the middle of writing the inode block
11752 * to disk.
11753 */
11754void
11755softdep_update_inodeblock(ip, bp, waitfor)
11756	struct inode *ip;	/* the "in_core" copy of the inode */
11757	struct buf *bp;		/* the buffer containing the inode block */
11758	int waitfor;		/* nonzero => update must be allowed */
11759{
11760	struct inodedep *inodedep;
11761	struct inoref *inoref;
11762	struct worklist *wk;
11763	struct mount *mp;
11764	struct buf *ibp;
11765	struct fs *fs;
11766	int error;
11767
11768	mp = UFSTOVFS(ip->i_ump);
11769	fs = ip->i_fs;
11770	/*
11771	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
11772	 * does not have access to the in-core ip so must write directly into
11773	 * the inode block buffer when setting freelink.
11774	 */
11775	if (fs->fs_magic == FS_UFS1_MAGIC)
11776		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
11777		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11778	else
11779		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
11780		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11781	/*
11782	 * If the effective link count is not equal to the actual link
11783	 * count, then we must track the difference in an inodedep while
11784	 * the inode is (potentially) tossed out of the cache. Otherwise,
11785	 * if there is no existing inodedep, then there are no dependencies
11786	 * to track.
11787	 */
11788	ACQUIRE_LOCK(&lk);
11789again:
11790	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11791		FREE_LOCK(&lk);
11792		if (ip->i_effnlink != ip->i_nlink)
11793			panic("softdep_update_inodeblock: bad link count");
11794		return;
11795	}
11796	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
11797		panic("softdep_update_inodeblock: bad delta");
11798	/*
11799	 * If we're flushing all dependencies we must also move any waiting
11800	 * for journal writes onto the bufwait list prior to I/O.
11801	 */
11802	if (waitfor) {
11803		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11804			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11805			    == DEPCOMPLETE) {
11806				jwait(&inoref->if_list, MNT_WAIT);
11807				goto again;
11808			}
11809		}
11810	}
11811	/*
11812	 * Changes have been initiated. Anything depending on these
11813	 * changes cannot occur until this inode has been written.
11814	 */
11815	inodedep->id_state &= ~COMPLETE;
11816	if ((inodedep->id_state & ONWORKLIST) == 0)
11817		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
11818	/*
11819	 * Any new dependencies associated with the incore inode must
11820	 * now be moved to the list associated with the buffer holding
11821	 * the in-memory copy of the inode. Once merged process any
11822	 * allocdirects that are completed by the merger.
11823	 */
11824	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
11825	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
11826		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
11827		    NULL);
11828	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
11829	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
11830		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
11831		    NULL);
11832	/*
11833	 * Now that the inode has been pushed into the buffer, the
11834	 * operations dependent on the inode being written to disk
11835	 * can be moved to the id_bufwait so that they will be
11836	 * processed when the buffer I/O completes.
11837	 */
11838	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
11839		WORKLIST_REMOVE(wk);
11840		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
11841	}
11842	/*
11843	 * Newly allocated inodes cannot be written until the bitmap
11844	 * that allocates them have been written (indicated by
11845	 * DEPCOMPLETE being set in id_state). If we are doing a
11846	 * forced sync (e.g., an fsync on a file), we force the bitmap
11847	 * to be written so that the update can be done.
11848	 */
11849	if (waitfor == 0) {
11850		FREE_LOCK(&lk);
11851		return;
11852	}
11853retry:
11854	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
11855		FREE_LOCK(&lk);
11856		return;
11857	}
11858	ibp = inodedep->id_bmsafemap->sm_buf;
11859	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
11860	if (ibp == NULL) {
11861		/*
11862		 * If ibp came back as NULL, the dependency could have been
11863		 * freed while we slept.  Look it up again, and check to see
11864		 * that it has completed.
11865		 */
11866		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
11867			goto retry;
11868		FREE_LOCK(&lk);
11869		return;
11870	}
11871	FREE_LOCK(&lk);
11872	if ((error = bwrite(ibp)) != 0)
11873		softdep_error("softdep_update_inodeblock: bwrite", error);
11874}
11875
11876/*
11877 * Merge the a new inode dependency list (such as id_newinoupdt) into an
11878 * old inode dependency list (such as id_inoupdt). This routine must be
11879 * called with splbio interrupts blocked.
11880 */
11881static void
11882merge_inode_lists(newlisthead, oldlisthead)
11883	struct allocdirectlst *newlisthead;
11884	struct allocdirectlst *oldlisthead;
11885{
11886	struct allocdirect *listadp, *newadp;
11887
11888	newadp = TAILQ_FIRST(newlisthead);
11889	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
11890		if (listadp->ad_offset < newadp->ad_offset) {
11891			listadp = TAILQ_NEXT(listadp, ad_next);
11892			continue;
11893		}
11894		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11895		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
11896		if (listadp->ad_offset == newadp->ad_offset) {
11897			allocdirect_merge(oldlisthead, newadp,
11898			    listadp);
11899			listadp = newadp;
11900		}
11901		newadp = TAILQ_FIRST(newlisthead);
11902	}
11903	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
11904		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11905		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
11906	}
11907}
11908
11909/*
11910 * If we are doing an fsync, then we must ensure that any directory
11911 * entries for the inode have been written after the inode gets to disk.
11912 */
11913int
11914softdep_fsync(vp)
11915	struct vnode *vp;	/* the "in_core" copy of the inode */
11916{
11917	struct inodedep *inodedep;
11918	struct pagedep *pagedep;
11919	struct inoref *inoref;
11920	struct worklist *wk;
11921	struct diradd *dap;
11922	struct mount *mp;
11923	struct vnode *pvp;
11924	struct inode *ip;
11925	struct buf *bp;
11926	struct fs *fs;
11927	struct thread *td = curthread;
11928	int error, flushparent, pagedep_new_block;
11929	ino_t parentino;
11930	ufs_lbn_t lbn;
11931
11932	ip = VTOI(vp);
11933	fs = ip->i_fs;
11934	mp = vp->v_mount;
11935	ACQUIRE_LOCK(&lk);
11936restart:
11937	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11938		FREE_LOCK(&lk);
11939		return (0);
11940	}
11941	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11942		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11943		    == DEPCOMPLETE) {
11944			jwait(&inoref->if_list, MNT_WAIT);
11945			goto restart;
11946		}
11947	}
11948	if (!LIST_EMPTY(&inodedep->id_inowait) ||
11949	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
11950	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
11951	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
11952	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
11953		panic("softdep_fsync: pending ops %p", inodedep);
11954	for (error = 0, flushparent = 0; ; ) {
11955		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
11956			break;
11957		if (wk->wk_type != D_DIRADD)
11958			panic("softdep_fsync: Unexpected type %s",
11959			    TYPENAME(wk->wk_type));
11960		dap = WK_DIRADD(wk);
11961		/*
11962		 * Flush our parent if this directory entry has a MKDIR_PARENT
11963		 * dependency or is contained in a newly allocated block.
11964		 */
11965		if (dap->da_state & DIRCHG)
11966			pagedep = dap->da_previous->dm_pagedep;
11967		else
11968			pagedep = dap->da_pagedep;
11969		parentino = pagedep->pd_ino;
11970		lbn = pagedep->pd_lbn;
11971		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
11972			panic("softdep_fsync: dirty");
11973		if ((dap->da_state & MKDIR_PARENT) ||
11974		    (pagedep->pd_state & NEWBLOCK))
11975			flushparent = 1;
11976		else
11977			flushparent = 0;
11978		/*
11979		 * If we are being fsync'ed as part of vgone'ing this vnode,
11980		 * then we will not be able to release and recover the
11981		 * vnode below, so we just have to give up on writing its
11982		 * directory entry out. It will eventually be written, just
11983		 * not now, but then the user was not asking to have it
11984		 * written, so we are not breaking any promises.
11985		 */
11986		if (vp->v_iflag & VI_DOOMED)
11987			break;
11988		/*
11989		 * We prevent deadlock by always fetching inodes from the
11990		 * root, moving down the directory tree. Thus, when fetching
11991		 * our parent directory, we first try to get the lock. If
11992		 * that fails, we must unlock ourselves before requesting
11993		 * the lock on our parent. See the comment in ufs_lookup
11994		 * for details on possible races.
11995		 */
11996		FREE_LOCK(&lk);
11997		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
11998		    FFSV_FORCEINSMQ)) {
11999			error = vfs_busy(mp, MBF_NOWAIT);
12000			if (error != 0) {
12001				vfs_ref(mp);
12002				VOP_UNLOCK(vp, 0);
12003				error = vfs_busy(mp, 0);
12004				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12005				vfs_rel(mp);
12006				if (error != 0)
12007					return (ENOENT);
12008				if (vp->v_iflag & VI_DOOMED) {
12009					vfs_unbusy(mp);
12010					return (ENOENT);
12011				}
12012			}
12013			VOP_UNLOCK(vp, 0);
12014			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12015			    &pvp, FFSV_FORCEINSMQ);
12016			vfs_unbusy(mp);
12017			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12018			if (vp->v_iflag & VI_DOOMED) {
12019				if (error == 0)
12020					vput(pvp);
12021				error = ENOENT;
12022			}
12023			if (error != 0)
12024				return (error);
12025		}
12026		/*
12027		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12028		 * that are contained in direct blocks will be resolved by
12029		 * doing a ffs_update. Pagedeps contained in indirect blocks
12030		 * may require a complete sync'ing of the directory. So, we
12031		 * try the cheap and fast ffs_update first, and if that fails,
12032		 * then we do the slower ffs_syncvnode of the directory.
12033		 */
12034		if (flushparent) {
12035			int locked;
12036
12037			if ((error = ffs_update(pvp, 1)) != 0) {
12038				vput(pvp);
12039				return (error);
12040			}
12041			ACQUIRE_LOCK(&lk);
12042			locked = 1;
12043			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12044				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12045					if (wk->wk_type != D_DIRADD)
12046						panic("softdep_fsync: Unexpected type %s",
12047						      TYPENAME(wk->wk_type));
12048					dap = WK_DIRADD(wk);
12049					if (dap->da_state & DIRCHG)
12050						pagedep = dap->da_previous->dm_pagedep;
12051					else
12052						pagedep = dap->da_pagedep;
12053					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12054					FREE_LOCK(&lk);
12055					locked = 0;
12056					if (pagedep_new_block && (error =
12057					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12058						vput(pvp);
12059						return (error);
12060					}
12061				}
12062			}
12063			if (locked)
12064				FREE_LOCK(&lk);
12065		}
12066		/*
12067		 * Flush directory page containing the inode's name.
12068		 */
12069		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12070		    &bp);
12071		if (error == 0)
12072			error = bwrite(bp);
12073		else
12074			brelse(bp);
12075		vput(pvp);
12076		if (error != 0)
12077			return (error);
12078		ACQUIRE_LOCK(&lk);
12079		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12080			break;
12081	}
12082	FREE_LOCK(&lk);
12083	return (0);
12084}
12085
12086/*
12087 * Flush all the dirty bitmaps associated with the block device
12088 * before flushing the rest of the dirty blocks so as to reduce
12089 * the number of dependencies that will have to be rolled back.
12090 *
12091 * XXX Unused?
12092 */
12093void
12094softdep_fsync_mountdev(vp)
12095	struct vnode *vp;
12096{
12097	struct buf *bp, *nbp;
12098	struct worklist *wk;
12099	struct bufobj *bo;
12100
12101	if (!vn_isdisk(vp, NULL))
12102		panic("softdep_fsync_mountdev: vnode not a disk");
12103	bo = &vp->v_bufobj;
12104restart:
12105	BO_LOCK(bo);
12106	ACQUIRE_LOCK(&lk);
12107	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12108		/*
12109		 * If it is already scheduled, skip to the next buffer.
12110		 */
12111		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12112			continue;
12113
12114		if ((bp->b_flags & B_DELWRI) == 0)
12115			panic("softdep_fsync_mountdev: not dirty");
12116		/*
12117		 * We are only interested in bitmaps with outstanding
12118		 * dependencies.
12119		 */
12120		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12121		    wk->wk_type != D_BMSAFEMAP ||
12122		    (bp->b_vflags & BV_BKGRDINPROG)) {
12123			BUF_UNLOCK(bp);
12124			continue;
12125		}
12126		FREE_LOCK(&lk);
12127		BO_UNLOCK(bo);
12128		bremfree(bp);
12129		(void) bawrite(bp);
12130		goto restart;
12131	}
12132	FREE_LOCK(&lk);
12133	drain_output(vp);
12134	BO_UNLOCK(bo);
12135}
12136
12137/*
12138 * Sync all cylinder groups that were dirty at the time this function is
12139 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12140 * is used to flush freedep activity that may be holding up writes to a
12141 * indirect block.
12142 */
12143static int
12144sync_cgs(mp, waitfor)
12145	struct mount *mp;
12146	int waitfor;
12147{
12148	struct bmsafemap *bmsafemap;
12149	struct bmsafemap *sentinel;
12150	struct ufsmount *ump;
12151	struct buf *bp;
12152	int error;
12153
12154	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12155	sentinel->sm_cg = -1;
12156	ump = VFSTOUFS(mp);
12157	error = 0;
12158	ACQUIRE_LOCK(&lk);
12159	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12160	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12161	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12162		/* Skip sentinels and cgs with no work to release. */
12163		if (bmsafemap->sm_cg == -1 ||
12164		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12165		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12166			LIST_REMOVE(sentinel, sm_next);
12167			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12168			continue;
12169		}
12170		/*
12171		 * If we don't get the lock and we're waiting try again, if
12172		 * not move on to the next buf and try to sync it.
12173		 */
12174		bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor);
12175		if (bp == NULL && waitfor == MNT_WAIT)
12176			continue;
12177		LIST_REMOVE(sentinel, sm_next);
12178		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12179		if (bp == NULL)
12180			continue;
12181		FREE_LOCK(&lk);
12182		if (waitfor == MNT_NOWAIT)
12183			bawrite(bp);
12184		else
12185			error = bwrite(bp);
12186		ACQUIRE_LOCK(&lk);
12187		if (error)
12188			break;
12189	}
12190	LIST_REMOVE(sentinel, sm_next);
12191	FREE_LOCK(&lk);
12192	free(sentinel, M_BMSAFEMAP);
12193	return (error);
12194}
12195
12196/*
12197 * This routine is called when we are trying to synchronously flush a
12198 * file. This routine must eliminate any filesystem metadata dependencies
12199 * so that the syncing routine can succeed.
12200 */
12201int
12202softdep_sync_metadata(struct vnode *vp)
12203{
12204	int error;
12205
12206	/*
12207	 * Ensure that any direct block dependencies have been cleared,
12208	 * truncations are started, and inode references are journaled.
12209	 */
12210	ACQUIRE_LOCK(&lk);
12211	/*
12212	 * Write all journal records to prevent rollbacks on devvp.
12213	 */
12214	if (vp->v_type == VCHR)
12215		softdep_flushjournal(vp->v_mount);
12216	error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number);
12217	/*
12218	 * Ensure that all truncates are written so we won't find deps on
12219	 * indirect blocks.
12220	 */
12221	process_truncates(vp);
12222	FREE_LOCK(&lk);
12223
12224	return (error);
12225}
12226
12227/*
12228 * This routine is called when we are attempting to sync a buf with
12229 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12230 * other IO it can but returns EBUSY if the buffer is not yet able to
12231 * be written.  Dependencies which will not cause rollbacks will always
12232 * return 0.
12233 */
12234int
12235softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12236{
12237	struct indirdep *indirdep;
12238	struct pagedep *pagedep;
12239	struct allocindir *aip;
12240	struct newblk *newblk;
12241	struct buf *nbp;
12242	struct worklist *wk;
12243	int i, error;
12244
12245	/*
12246	 * For VCHR we just don't want to force flush any dependencies that
12247	 * will cause rollbacks.
12248	 */
12249	if (vp->v_type == VCHR) {
12250		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12251			return (EBUSY);
12252		return (0);
12253	}
12254	ACQUIRE_LOCK(&lk);
12255	/*
12256	 * As we hold the buffer locked, none of its dependencies
12257	 * will disappear.
12258	 */
12259	error = 0;
12260top:
12261	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12262		switch (wk->wk_type) {
12263
12264		case D_ALLOCDIRECT:
12265		case D_ALLOCINDIR:
12266			newblk = WK_NEWBLK(wk);
12267			if (newblk->nb_jnewblk != NULL) {
12268				if (waitfor == MNT_NOWAIT) {
12269					error = EBUSY;
12270					goto out_unlock;
12271				}
12272				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12273				goto top;
12274			}
12275			if (newblk->nb_state & DEPCOMPLETE ||
12276			    waitfor == MNT_NOWAIT)
12277				continue;
12278			nbp = newblk->nb_bmsafemap->sm_buf;
12279			nbp = getdirtybuf(nbp, &lk, waitfor);
12280			if (nbp == NULL)
12281				goto top;
12282			FREE_LOCK(&lk);
12283			if ((error = bwrite(nbp)) != 0)
12284				goto out;
12285			ACQUIRE_LOCK(&lk);
12286			continue;
12287
12288		case D_INDIRDEP:
12289			indirdep = WK_INDIRDEP(wk);
12290			if (waitfor == MNT_NOWAIT) {
12291				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12292				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12293					error = EBUSY;
12294					goto out_unlock;
12295				}
12296			}
12297			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12298				panic("softdep_sync_buf: truncation pending.");
12299		restart:
12300			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12301				newblk = (struct newblk *)aip;
12302				if (newblk->nb_jnewblk != NULL) {
12303					jwait(&newblk->nb_jnewblk->jn_list,
12304					    waitfor);
12305					goto restart;
12306				}
12307				if (newblk->nb_state & DEPCOMPLETE)
12308					continue;
12309				nbp = newblk->nb_bmsafemap->sm_buf;
12310				nbp = getdirtybuf(nbp, &lk, waitfor);
12311				if (nbp == NULL)
12312					goto restart;
12313				FREE_LOCK(&lk);
12314				if ((error = bwrite(nbp)) != 0)
12315					goto out;
12316				ACQUIRE_LOCK(&lk);
12317				goto restart;
12318			}
12319			continue;
12320
12321		case D_PAGEDEP:
12322			/*
12323			 * Only flush directory entries in synchronous passes.
12324			 */
12325			if (waitfor != MNT_WAIT) {
12326				error = EBUSY;
12327				goto out_unlock;
12328			}
12329			/*
12330			 * While syncing snapshots, we must allow recursive
12331			 * lookups.
12332			 */
12333			BUF_AREC(bp);
12334			/*
12335			 * We are trying to sync a directory that may
12336			 * have dependencies on both its own metadata
12337			 * and/or dependencies on the inodes of any
12338			 * recently allocated files. We walk its diradd
12339			 * lists pushing out the associated inode.
12340			 */
12341			pagedep = WK_PAGEDEP(wk);
12342			for (i = 0; i < DAHASHSZ; i++) {
12343				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12344					continue;
12345				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12346				    &pagedep->pd_diraddhd[i]))) {
12347					BUF_NOREC(bp);
12348					goto out_unlock;
12349				}
12350			}
12351			BUF_NOREC(bp);
12352			continue;
12353
12354		case D_FREEWORK:
12355		case D_FREEDEP:
12356		case D_JSEGDEP:
12357		case D_JNEWBLK:
12358			continue;
12359
12360		default:
12361			panic("softdep_sync_buf: Unknown type %s",
12362			    TYPENAME(wk->wk_type));
12363			/* NOTREACHED */
12364		}
12365	}
12366out_unlock:
12367	FREE_LOCK(&lk);
12368out:
12369	return (error);
12370}
12371
12372/*
12373 * Flush the dependencies associated with an inodedep.
12374 * Called with splbio blocked.
12375 */
12376static int
12377flush_inodedep_deps(vp, mp, ino)
12378	struct vnode *vp;
12379	struct mount *mp;
12380	ino_t ino;
12381{
12382	struct inodedep *inodedep;
12383	struct inoref *inoref;
12384	int error, waitfor;
12385
12386	/*
12387	 * This work is done in two passes. The first pass grabs most
12388	 * of the buffers and begins asynchronously writing them. The
12389	 * only way to wait for these asynchronous writes is to sleep
12390	 * on the filesystem vnode which may stay busy for a long time
12391	 * if the filesystem is active. So, instead, we make a second
12392	 * pass over the dependencies blocking on each write. In the
12393	 * usual case we will be blocking against a write that we
12394	 * initiated, so when it is done the dependency will have been
12395	 * resolved. Thus the second pass is expected to end quickly.
12396	 * We give a brief window at the top of the loop to allow
12397	 * any pending I/O to complete.
12398	 */
12399	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12400		if (error)
12401			return (error);
12402		FREE_LOCK(&lk);
12403		ACQUIRE_LOCK(&lk);
12404restart:
12405		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12406			return (0);
12407		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12408			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12409			    == DEPCOMPLETE) {
12410				jwait(&inoref->if_list, MNT_WAIT);
12411				goto restart;
12412			}
12413		}
12414		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12415		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12416		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12417		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12418			continue;
12419		/*
12420		 * If pass2, we are done, otherwise do pass 2.
12421		 */
12422		if (waitfor == MNT_WAIT)
12423			break;
12424		waitfor = MNT_WAIT;
12425	}
12426	/*
12427	 * Try freeing inodedep in case all dependencies have been removed.
12428	 */
12429	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12430		(void) free_inodedep(inodedep);
12431	return (0);
12432}
12433
12434/*
12435 * Flush an inode dependency list.
12436 * Called with splbio blocked.
12437 */
12438static int
12439flush_deplist(listhead, waitfor, errorp)
12440	struct allocdirectlst *listhead;
12441	int waitfor;
12442	int *errorp;
12443{
12444	struct allocdirect *adp;
12445	struct newblk *newblk;
12446	struct buf *bp;
12447
12448	rw_assert(&lk, RA_WLOCKED);
12449	TAILQ_FOREACH(adp, listhead, ad_next) {
12450		newblk = (struct newblk *)adp;
12451		if (newblk->nb_jnewblk != NULL) {
12452			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12453			return (1);
12454		}
12455		if (newblk->nb_state & DEPCOMPLETE)
12456			continue;
12457		bp = newblk->nb_bmsafemap->sm_buf;
12458		bp = getdirtybuf(bp, &lk, waitfor);
12459		if (bp == NULL) {
12460			if (waitfor == MNT_NOWAIT)
12461				continue;
12462			return (1);
12463		}
12464		FREE_LOCK(&lk);
12465		if (waitfor == MNT_NOWAIT)
12466			bawrite(bp);
12467		else
12468			*errorp = bwrite(bp);
12469		ACQUIRE_LOCK(&lk);
12470		return (1);
12471	}
12472	return (0);
12473}
12474
12475/*
12476 * Flush dependencies associated with an allocdirect block.
12477 */
12478static int
12479flush_newblk_dep(vp, mp, lbn)
12480	struct vnode *vp;
12481	struct mount *mp;
12482	ufs_lbn_t lbn;
12483{
12484	struct newblk *newblk;
12485	struct bufobj *bo;
12486	struct inode *ip;
12487	struct buf *bp;
12488	ufs2_daddr_t blkno;
12489	int error;
12490
12491	error = 0;
12492	bo = &vp->v_bufobj;
12493	ip = VTOI(vp);
12494	blkno = DIP(ip, i_db[lbn]);
12495	if (blkno == 0)
12496		panic("flush_newblk_dep: Missing block");
12497	ACQUIRE_LOCK(&lk);
12498	/*
12499	 * Loop until all dependencies related to this block are satisfied.
12500	 * We must be careful to restart after each sleep in case a write
12501	 * completes some part of this process for us.
12502	 */
12503	for (;;) {
12504		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12505			FREE_LOCK(&lk);
12506			break;
12507		}
12508		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12509			panic("flush_newblk_deps: Bad newblk %p", newblk);
12510		/*
12511		 * Flush the journal.
12512		 */
12513		if (newblk->nb_jnewblk != NULL) {
12514			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12515			continue;
12516		}
12517		/*
12518		 * Write the bitmap dependency.
12519		 */
12520		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12521			bp = newblk->nb_bmsafemap->sm_buf;
12522			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12523			if (bp == NULL)
12524				continue;
12525			FREE_LOCK(&lk);
12526			error = bwrite(bp);
12527			if (error)
12528				break;
12529			ACQUIRE_LOCK(&lk);
12530			continue;
12531		}
12532		/*
12533		 * Write the buffer.
12534		 */
12535		FREE_LOCK(&lk);
12536		BO_LOCK(bo);
12537		bp = gbincore(bo, lbn);
12538		if (bp != NULL) {
12539			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12540			    LK_INTERLOCK, BO_LOCKPTR(bo));
12541			if (error == ENOLCK) {
12542				ACQUIRE_LOCK(&lk);
12543				continue; /* Slept, retry */
12544			}
12545			if (error != 0)
12546				break;	/* Failed */
12547			if (bp->b_flags & B_DELWRI) {
12548				bremfree(bp);
12549				error = bwrite(bp);
12550				if (error)
12551					break;
12552			} else
12553				BUF_UNLOCK(bp);
12554		} else
12555			BO_UNLOCK(bo);
12556		/*
12557		 * We have to wait for the direct pointers to
12558		 * point at the newdirblk before the dependency
12559		 * will go away.
12560		 */
12561		error = ffs_update(vp, 1);
12562		if (error)
12563			break;
12564		ACQUIRE_LOCK(&lk);
12565	}
12566	return (error);
12567}
12568
12569/*
12570 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12571 * Called with splbio blocked.
12572 */
12573static int
12574flush_pagedep_deps(pvp, mp, diraddhdp)
12575	struct vnode *pvp;
12576	struct mount *mp;
12577	struct diraddhd *diraddhdp;
12578{
12579	struct inodedep *inodedep;
12580	struct inoref *inoref;
12581	struct ufsmount *ump;
12582	struct diradd *dap;
12583	struct vnode *vp;
12584	int error = 0;
12585	struct buf *bp;
12586	ino_t inum;
12587
12588	ump = VFSTOUFS(mp);
12589restart:
12590	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12591		/*
12592		 * Flush ourselves if this directory entry
12593		 * has a MKDIR_PARENT dependency.
12594		 */
12595		if (dap->da_state & MKDIR_PARENT) {
12596			FREE_LOCK(&lk);
12597			if ((error = ffs_update(pvp, 1)) != 0)
12598				break;
12599			ACQUIRE_LOCK(&lk);
12600			/*
12601			 * If that cleared dependencies, go on to next.
12602			 */
12603			if (dap != LIST_FIRST(diraddhdp))
12604				continue;
12605			if (dap->da_state & MKDIR_PARENT)
12606				panic("flush_pagedep_deps: MKDIR_PARENT");
12607		}
12608		/*
12609		 * A newly allocated directory must have its "." and
12610		 * ".." entries written out before its name can be
12611		 * committed in its parent.
12612		 */
12613		inum = dap->da_newinum;
12614		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12615			panic("flush_pagedep_deps: lost inode1");
12616		/*
12617		 * Wait for any pending journal adds to complete so we don't
12618		 * cause rollbacks while syncing.
12619		 */
12620		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12621			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12622			    == DEPCOMPLETE) {
12623				jwait(&inoref->if_list, MNT_WAIT);
12624				goto restart;
12625			}
12626		}
12627		if (dap->da_state & MKDIR_BODY) {
12628			FREE_LOCK(&lk);
12629			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12630			    FFSV_FORCEINSMQ)))
12631				break;
12632			error = flush_newblk_dep(vp, mp, 0);
12633			/*
12634			 * If we still have the dependency we might need to
12635			 * update the vnode to sync the new link count to
12636			 * disk.
12637			 */
12638			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12639				error = ffs_update(vp, 1);
12640			vput(vp);
12641			if (error != 0)
12642				break;
12643			ACQUIRE_LOCK(&lk);
12644			/*
12645			 * If that cleared dependencies, go on to next.
12646			 */
12647			if (dap != LIST_FIRST(diraddhdp))
12648				continue;
12649			if (dap->da_state & MKDIR_BODY) {
12650				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12651				    &inodedep);
12652				panic("flush_pagedep_deps: MKDIR_BODY "
12653				    "inodedep %p dap %p vp %p",
12654				    inodedep, dap, vp);
12655			}
12656		}
12657		/*
12658		 * Flush the inode on which the directory entry depends.
12659		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12660		 * the only remaining dependency is that the updated inode
12661		 * count must get pushed to disk. The inode has already
12662		 * been pushed into its inode buffer (via VOP_UPDATE) at
12663		 * the time of the reference count change. So we need only
12664		 * locate that buffer, ensure that there will be no rollback
12665		 * caused by a bitmap dependency, then write the inode buffer.
12666		 */
12667retry:
12668		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12669			panic("flush_pagedep_deps: lost inode");
12670		/*
12671		 * If the inode still has bitmap dependencies,
12672		 * push them to disk.
12673		 */
12674		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
12675			bp = inodedep->id_bmsafemap->sm_buf;
12676			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12677			if (bp == NULL)
12678				goto retry;
12679			FREE_LOCK(&lk);
12680			if ((error = bwrite(bp)) != 0)
12681				break;
12682			ACQUIRE_LOCK(&lk);
12683			if (dap != LIST_FIRST(diraddhdp))
12684				continue;
12685		}
12686		/*
12687		 * If the inode is still sitting in a buffer waiting
12688		 * to be written or waiting for the link count to be
12689		 * adjusted update it here to flush it to disk.
12690		 */
12691		if (dap == LIST_FIRST(diraddhdp)) {
12692			FREE_LOCK(&lk);
12693			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12694			    FFSV_FORCEINSMQ)))
12695				break;
12696			error = ffs_update(vp, 1);
12697			vput(vp);
12698			if (error)
12699				break;
12700			ACQUIRE_LOCK(&lk);
12701		}
12702		/*
12703		 * If we have failed to get rid of all the dependencies
12704		 * then something is seriously wrong.
12705		 */
12706		if (dap == LIST_FIRST(diraddhdp)) {
12707			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
12708			panic("flush_pagedep_deps: failed to flush "
12709			    "inodedep %p ino %ju dap %p",
12710			    inodedep, (uintmax_t)inum, dap);
12711		}
12712	}
12713	if (error)
12714		ACQUIRE_LOCK(&lk);
12715	return (error);
12716}
12717
12718/*
12719 * A large burst of file addition or deletion activity can drive the
12720 * memory load excessively high. First attempt to slow things down
12721 * using the techniques below. If that fails, this routine requests
12722 * the offending operations to fall back to running synchronously
12723 * until the memory load returns to a reasonable level.
12724 */
12725int
12726softdep_slowdown(vp)
12727	struct vnode *vp;
12728{
12729	struct ufsmount *ump;
12730	int jlow;
12731	int max_softdeps_hard;
12732
12733	ACQUIRE_LOCK(&lk);
12734	jlow = 0;
12735	/*
12736	 * Check for journal space if needed.
12737	 */
12738	if (DOINGSUJ(vp)) {
12739		ump = VFSTOUFS(vp->v_mount);
12740		if (journal_space(ump, 0) == 0)
12741			jlow = 1;
12742	}
12743	max_softdeps_hard = max_softdeps * 11 / 10;
12744	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
12745	    dep_current[D_INODEDEP] < max_softdeps_hard &&
12746	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
12747	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
12748		FREE_LOCK(&lk);
12749  		return (0);
12750	}
12751	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
12752		softdep_speedup();
12753	stat_sync_limit_hit += 1;
12754	FREE_LOCK(&lk);
12755	if (DOINGSUJ(vp))
12756		return (0);
12757	return (1);
12758}
12759
12760/*
12761 * Called by the allocation routines when they are about to fail
12762 * in the hope that we can free up the requested resource (inodes
12763 * or disk space).
12764 *
12765 * First check to see if the work list has anything on it. If it has,
12766 * clean up entries until we successfully free the requested resource.
12767 * Because this process holds inodes locked, we cannot handle any remove
12768 * requests that might block on a locked inode as that could lead to
12769 * deadlock. If the worklist yields none of the requested resource,
12770 * start syncing out vnodes to free up the needed space.
12771 */
12772int
12773softdep_request_cleanup(fs, vp, cred, resource)
12774	struct fs *fs;
12775	struct vnode *vp;
12776	struct ucred *cred;
12777	int resource;
12778{
12779	struct ufsmount *ump;
12780	struct mount *mp;
12781	struct vnode *lvp, *mvp;
12782	long starttime;
12783	ufs2_daddr_t needed;
12784	int error;
12785
12786	/*
12787	 * If we are being called because of a process doing a
12788	 * copy-on-write, then it is not safe to process any
12789	 * worklist items as we will recurse into the copyonwrite
12790	 * routine.  This will result in an incoherent snapshot.
12791	 * If the vnode that we hold is a snapshot, we must avoid
12792	 * handling other resources that could cause deadlock.
12793	 */
12794	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
12795		return (0);
12796
12797	if (resource == FLUSH_BLOCKS_WAIT)
12798		stat_cleanup_blkrequests += 1;
12799	else
12800		stat_cleanup_inorequests += 1;
12801
12802	mp = vp->v_mount;
12803	ump = VFSTOUFS(mp);
12804	mtx_assert(UFS_MTX(ump), MA_OWNED);
12805	UFS_UNLOCK(ump);
12806	error = ffs_update(vp, 1);
12807	if (error != 0) {
12808		UFS_LOCK(ump);
12809		return (0);
12810	}
12811	/*
12812	 * If we are in need of resources, consider pausing for
12813	 * tickdelay to give ourselves some breathing room.
12814	 */
12815	ACQUIRE_LOCK(&lk);
12816	process_removes(vp);
12817	process_truncates(vp);
12818	request_cleanup(UFSTOVFS(ump), resource);
12819	FREE_LOCK(&lk);
12820	/*
12821	 * Now clean up at least as many resources as we will need.
12822	 *
12823	 * When requested to clean up inodes, the number that are needed
12824	 * is set by the number of simultaneous writers (mnt_writeopcount)
12825	 * plus a bit of slop (2) in case some more writers show up while
12826	 * we are cleaning.
12827	 *
12828	 * When requested to free up space, the amount of space that
12829	 * we need is enough blocks to allocate a full-sized segment
12830	 * (fs_contigsumsize). The number of such segments that will
12831	 * be needed is set by the number of simultaneous writers
12832	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
12833	 * writers show up while we are cleaning.
12834	 *
12835	 * Additionally, if we are unpriviledged and allocating space,
12836	 * we need to ensure that we clean up enough blocks to get the
12837	 * needed number of blocks over the threshhold of the minimum
12838	 * number of blocks required to be kept free by the filesystem
12839	 * (fs_minfree).
12840	 */
12841	if (resource == FLUSH_INODES_WAIT) {
12842		needed = vp->v_mount->mnt_writeopcount + 2;
12843	} else if (resource == FLUSH_BLOCKS_WAIT) {
12844		needed = (vp->v_mount->mnt_writeopcount + 2) *
12845		    fs->fs_contigsumsize;
12846		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
12847			needed += fragstoblks(fs,
12848			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
12849			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
12850	} else {
12851		UFS_LOCK(ump);
12852		printf("softdep_request_cleanup: Unknown resource type %d\n",
12853		    resource);
12854		return (0);
12855	}
12856	starttime = time_second;
12857retry:
12858	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
12859	    fs->fs_cstotal.cs_nbfree <= needed) ||
12860	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12861	    fs->fs_cstotal.cs_nifree <= needed)) {
12862		ACQUIRE_LOCK(&lk);
12863		if (ump->softdep_on_worklist > 0 &&
12864		    process_worklist_item(UFSTOVFS(ump),
12865		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
12866			stat_worklist_push += 1;
12867		FREE_LOCK(&lk);
12868	}
12869	/*
12870	 * If we still need resources and there are no more worklist
12871	 * entries to process to obtain them, we have to start flushing
12872	 * the dirty vnodes to force the release of additional requests
12873	 * to the worklist that we can then process to reap addition
12874	 * resources. We walk the vnodes associated with the mount point
12875	 * until we get the needed worklist requests that we can reap.
12876	 */
12877	if ((resource == FLUSH_BLOCKS_WAIT &&
12878	     fs->fs_cstotal.cs_nbfree <= needed) ||
12879	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12880	     fs->fs_cstotal.cs_nifree <= needed)) {
12881		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
12882			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
12883				VI_UNLOCK(lvp);
12884				continue;
12885			}
12886			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
12887			    curthread))
12888				continue;
12889			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
12890				vput(lvp);
12891				continue;
12892			}
12893			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
12894			vput(lvp);
12895		}
12896		lvp = ump->um_devvp;
12897		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
12898			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
12899			VOP_UNLOCK(lvp, 0);
12900		}
12901		if (ump->softdep_on_worklist > 0) {
12902			stat_cleanup_retries += 1;
12903			goto retry;
12904		}
12905		stat_cleanup_failures += 1;
12906	}
12907	if (time_second - starttime > stat_cleanup_high_delay)
12908		stat_cleanup_high_delay = time_second - starttime;
12909	UFS_LOCK(ump);
12910	return (1);
12911}
12912
12913/*
12914 * If memory utilization has gotten too high, deliberately slow things
12915 * down and speed up the I/O processing.
12916 */
12917extern struct thread *syncertd;
12918static int
12919request_cleanup(mp, resource)
12920	struct mount *mp;
12921	int resource;
12922{
12923	struct thread *td = curthread;
12924	struct ufsmount *ump;
12925
12926	rw_assert(&lk, RA_WLOCKED);
12927	/*
12928	 * We never hold up the filesystem syncer or buf daemon.
12929	 */
12930	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
12931		return (0);
12932	ump = VFSTOUFS(mp);
12933	/*
12934	 * First check to see if the work list has gotten backlogged.
12935	 * If it has, co-opt this process to help clean up two entries.
12936	 * Because this process may hold inodes locked, we cannot
12937	 * handle any remove requests that might block on a locked
12938	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
12939	 * to avoid recursively processing the worklist.
12940	 */
12941	if (ump->softdep_on_worklist > max_softdeps / 10) {
12942		td->td_pflags |= TDP_SOFTDEP;
12943		process_worklist_item(mp, 2, LK_NOWAIT);
12944		td->td_pflags &= ~TDP_SOFTDEP;
12945		stat_worklist_push += 2;
12946		return(1);
12947	}
12948	/*
12949	 * Next, we attempt to speed up the syncer process. If that
12950	 * is successful, then we allow the process to continue.
12951	 */
12952	if (softdep_speedup() &&
12953	    resource != FLUSH_BLOCKS_WAIT &&
12954	    resource != FLUSH_INODES_WAIT)
12955		return(0);
12956	/*
12957	 * If we are resource constrained on inode dependencies, try
12958	 * flushing some dirty inodes. Otherwise, we are constrained
12959	 * by file deletions, so try accelerating flushes of directories
12960	 * with removal dependencies. We would like to do the cleanup
12961	 * here, but we probably hold an inode locked at this point and
12962	 * that might deadlock against one that we try to clean. So,
12963	 * the best that we can do is request the syncer daemon to do
12964	 * the cleanup for us.
12965	 */
12966	switch (resource) {
12967
12968	case FLUSH_INODES:
12969	case FLUSH_INODES_WAIT:
12970		stat_ino_limit_push += 1;
12971		req_clear_inodedeps += 1;
12972		stat_countp = &stat_ino_limit_hit;
12973		break;
12974
12975	case FLUSH_BLOCKS:
12976	case FLUSH_BLOCKS_WAIT:
12977		stat_blk_limit_push += 1;
12978		req_clear_remove += 1;
12979		stat_countp = &stat_blk_limit_hit;
12980		break;
12981
12982	default:
12983		panic("request_cleanup: unknown type");
12984	}
12985	/*
12986	 * Hopefully the syncer daemon will catch up and awaken us.
12987	 * We wait at most tickdelay before proceeding in any case.
12988	 */
12989	proc_waiting += 1;
12990	if (callout_pending(&softdep_callout) == FALSE)
12991		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
12992		    pause_timer, 0);
12993
12994	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
12995	proc_waiting -= 1;
12996	return (1);
12997}
12998
12999/*
13000 * Awaken processes pausing in request_cleanup and clear proc_waiting
13001 * to indicate that there is no longer a timer running.
13002 */
13003static void
13004pause_timer(arg)
13005	void *arg;
13006{
13007
13008	/*
13009	 * The callout_ API has acquired mtx and will hold it around this
13010	 * function call.
13011	 */
13012	*stat_countp += 1;
13013	wakeup_one(&proc_waiting);
13014	if (proc_waiting > 0)
13015		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13016		    pause_timer, 0);
13017}
13018
13019/*
13020 * Flush out a directory with at least one removal dependency in an effort to
13021 * reduce the number of dirrem, freefile, and freeblks dependency structures.
13022 */
13023static void
13024clear_remove(void)
13025{
13026	struct pagedep_hashhead *pagedephd;
13027	struct pagedep *pagedep;
13028	static int next = 0;
13029	struct mount *mp;
13030	struct vnode *vp;
13031	struct bufobj *bo;
13032	int error, cnt;
13033	ino_t ino;
13034
13035	rw_assert(&lk, RA_WLOCKED);
13036
13037	for (cnt = 0; cnt <= pagedep_hash; cnt++) {
13038		pagedephd = &pagedep_hashtbl[next++];
13039		if (next > pagedep_hash)
13040			next = 0;
13041		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13042			if (LIST_EMPTY(&pagedep->pd_dirremhd))
13043				continue;
13044			mp = pagedep->pd_list.wk_mp;
13045			ino = pagedep->pd_ino;
13046			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13047				continue;
13048			FREE_LOCK(&lk);
13049
13050			/*
13051			 * Let unmount clear deps
13052			 */
13053			error = vfs_busy(mp, MBF_NOWAIT);
13054			if (error != 0)
13055				goto finish_write;
13056			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13057			     FFSV_FORCEINSMQ);
13058			vfs_unbusy(mp);
13059			if (error != 0) {
13060				softdep_error("clear_remove: vget", error);
13061				goto finish_write;
13062			}
13063			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13064				softdep_error("clear_remove: fsync", error);
13065			bo = &vp->v_bufobj;
13066			BO_LOCK(bo);
13067			drain_output(vp);
13068			BO_UNLOCK(bo);
13069			vput(vp);
13070		finish_write:
13071			vn_finished_write(mp);
13072			ACQUIRE_LOCK(&lk);
13073			return;
13074		}
13075	}
13076}
13077
13078/*
13079 * Clear out a block of dirty inodes in an effort to reduce
13080 * the number of inodedep dependency structures.
13081 */
13082static void
13083clear_inodedeps(void)
13084{
13085	struct inodedep_hashhead *inodedephd;
13086	struct inodedep *inodedep;
13087	static int next = 0;
13088	struct mount *mp;
13089	struct vnode *vp;
13090	struct fs *fs;
13091	int error, cnt;
13092	ino_t firstino, lastino, ino;
13093
13094	rw_assert(&lk, RA_WLOCKED);
13095	/*
13096	 * Pick a random inode dependency to be cleared.
13097	 * We will then gather up all the inodes in its block
13098	 * that have dependencies and flush them out.
13099	 */
13100	for (cnt = 0; cnt <= inodedep_hash; cnt++) {
13101		inodedephd = &inodedep_hashtbl[next++];
13102		if (next > inodedep_hash)
13103			next = 0;
13104		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13105			break;
13106	}
13107	if (inodedep == NULL)
13108		return;
13109	fs = inodedep->id_fs;
13110	mp = inodedep->id_list.wk_mp;
13111	/*
13112	 * Find the last inode in the block with dependencies.
13113	 */
13114	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13115	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13116		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13117			break;
13118	/*
13119	 * Asynchronously push all but the last inode with dependencies.
13120	 * Synchronously push the last inode with dependencies to ensure
13121	 * that the inode block gets written to free up the inodedeps.
13122	 */
13123	for (ino = firstino; ino <= lastino; ino++) {
13124		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13125			continue;
13126		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13127			continue;
13128		FREE_LOCK(&lk);
13129		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13130		if (error != 0) {
13131			vn_finished_write(mp);
13132			ACQUIRE_LOCK(&lk);
13133			return;
13134		}
13135		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13136		    FFSV_FORCEINSMQ)) != 0) {
13137			softdep_error("clear_inodedeps: vget", error);
13138			vfs_unbusy(mp);
13139			vn_finished_write(mp);
13140			ACQUIRE_LOCK(&lk);
13141			return;
13142		}
13143		vfs_unbusy(mp);
13144		if (ino == lastino) {
13145			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13146				softdep_error("clear_inodedeps: fsync1", error);
13147		} else {
13148			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13149				softdep_error("clear_inodedeps: fsync2", error);
13150			BO_LOCK(&vp->v_bufobj);
13151			drain_output(vp);
13152			BO_UNLOCK(&vp->v_bufobj);
13153		}
13154		vput(vp);
13155		vn_finished_write(mp);
13156		ACQUIRE_LOCK(&lk);
13157	}
13158}
13159
13160void
13161softdep_buf_append(bp, wkhd)
13162	struct buf *bp;
13163	struct workhead *wkhd;
13164{
13165	struct worklist *wk;
13166
13167	ACQUIRE_LOCK(&lk);
13168	while ((wk = LIST_FIRST(wkhd)) != NULL) {
13169		WORKLIST_REMOVE(wk);
13170		WORKLIST_INSERT(&bp->b_dep, wk);
13171	}
13172	FREE_LOCK(&lk);
13173
13174}
13175
13176void
13177softdep_inode_append(ip, cred, wkhd)
13178	struct inode *ip;
13179	struct ucred *cred;
13180	struct workhead *wkhd;
13181{
13182	struct buf *bp;
13183	struct fs *fs;
13184	int error;
13185
13186	fs = ip->i_fs;
13187	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13188	    (int)fs->fs_bsize, cred, &bp);
13189	if (error) {
13190		bqrelse(bp);
13191		softdep_freework(wkhd);
13192		return;
13193	}
13194	softdep_buf_append(bp, wkhd);
13195	bqrelse(bp);
13196}
13197
13198void
13199softdep_freework(wkhd)
13200	struct workhead *wkhd;
13201{
13202
13203	ACQUIRE_LOCK(&lk);
13204	handle_jwork(wkhd);
13205	FREE_LOCK(&lk);
13206}
13207
13208/*
13209 * Function to determine if the buffer has outstanding dependencies
13210 * that will cause a roll-back if the buffer is written. If wantcount
13211 * is set, return number of dependencies, otherwise just yes or no.
13212 */
13213static int
13214softdep_count_dependencies(bp, wantcount)
13215	struct buf *bp;
13216	int wantcount;
13217{
13218	struct worklist *wk;
13219	struct bmsafemap *bmsafemap;
13220	struct freework *freework;
13221	struct inodedep *inodedep;
13222	struct indirdep *indirdep;
13223	struct freeblks *freeblks;
13224	struct allocindir *aip;
13225	struct pagedep *pagedep;
13226	struct dirrem *dirrem;
13227	struct newblk *newblk;
13228	struct mkdir *mkdir;
13229	struct diradd *dap;
13230	int i, retval;
13231
13232	retval = 0;
13233	ACQUIRE_LOCK(&lk);
13234	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13235		switch (wk->wk_type) {
13236
13237		case D_INODEDEP:
13238			inodedep = WK_INODEDEP(wk);
13239			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13240				/* bitmap allocation dependency */
13241				retval += 1;
13242				if (!wantcount)
13243					goto out;
13244			}
13245			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13246				/* direct block pointer dependency */
13247				retval += 1;
13248				if (!wantcount)
13249					goto out;
13250			}
13251			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13252				/* direct block pointer dependency */
13253				retval += 1;
13254				if (!wantcount)
13255					goto out;
13256			}
13257			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13258				/* Add reference dependency. */
13259				retval += 1;
13260				if (!wantcount)
13261					goto out;
13262			}
13263			continue;
13264
13265		case D_INDIRDEP:
13266			indirdep = WK_INDIRDEP(wk);
13267
13268			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13269				/* indirect truncation dependency */
13270				retval += 1;
13271				if (!wantcount)
13272					goto out;
13273			}
13274
13275			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13276				/* indirect block pointer dependency */
13277				retval += 1;
13278				if (!wantcount)
13279					goto out;
13280			}
13281			continue;
13282
13283		case D_PAGEDEP:
13284			pagedep = WK_PAGEDEP(wk);
13285			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13286				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13287					/* Journal remove ref dependency. */
13288					retval += 1;
13289					if (!wantcount)
13290						goto out;
13291				}
13292			}
13293			for (i = 0; i < DAHASHSZ; i++) {
13294
13295				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13296					/* directory entry dependency */
13297					retval += 1;
13298					if (!wantcount)
13299						goto out;
13300				}
13301			}
13302			continue;
13303
13304		case D_BMSAFEMAP:
13305			bmsafemap = WK_BMSAFEMAP(wk);
13306			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13307				/* Add reference dependency. */
13308				retval += 1;
13309				if (!wantcount)
13310					goto out;
13311			}
13312			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13313				/* Allocate block dependency. */
13314				retval += 1;
13315				if (!wantcount)
13316					goto out;
13317			}
13318			continue;
13319
13320		case D_FREEBLKS:
13321			freeblks = WK_FREEBLKS(wk);
13322			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13323				/* Freeblk journal dependency. */
13324				retval += 1;
13325				if (!wantcount)
13326					goto out;
13327			}
13328			continue;
13329
13330		case D_ALLOCDIRECT:
13331		case D_ALLOCINDIR:
13332			newblk = WK_NEWBLK(wk);
13333			if (newblk->nb_jnewblk) {
13334				/* Journal allocate dependency. */
13335				retval += 1;
13336				if (!wantcount)
13337					goto out;
13338			}
13339			continue;
13340
13341		case D_MKDIR:
13342			mkdir = WK_MKDIR(wk);
13343			if (mkdir->md_jaddref) {
13344				/* Journal reference dependency. */
13345				retval += 1;
13346				if (!wantcount)
13347					goto out;
13348			}
13349			continue;
13350
13351		case D_FREEWORK:
13352		case D_FREEDEP:
13353		case D_JSEGDEP:
13354		case D_JSEG:
13355		case D_SBDEP:
13356			/* never a dependency on these blocks */
13357			continue;
13358
13359		default:
13360			panic("softdep_count_dependencies: Unexpected type %s",
13361			    TYPENAME(wk->wk_type));
13362			/* NOTREACHED */
13363		}
13364	}
13365out:
13366	FREE_LOCK(&lk);
13367	return retval;
13368}
13369
13370/*
13371 * Acquire exclusive access to a buffer.
13372 * Must be called with a locked mtx parameter.
13373 * Return acquired buffer or NULL on failure.
13374 */
13375static struct buf *
13376getdirtybuf(bp, lock, waitfor)
13377	struct buf *bp;
13378	struct rwlock *lock;
13379	int waitfor;
13380{
13381	int error;
13382
13383	rw_assert(lock, RA_WLOCKED);
13384	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13385		if (waitfor != MNT_WAIT)
13386			return (NULL);
13387		error = BUF_LOCK(bp,
13388		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
13389		/*
13390		 * Even if we sucessfully acquire bp here, we have dropped
13391		 * lock, which may violates our guarantee.
13392		 */
13393		if (error == 0)
13394			BUF_UNLOCK(bp);
13395		else if (error != ENOLCK)
13396			panic("getdirtybuf: inconsistent lock: %d", error);
13397		rw_wlock(lock);
13398		return (NULL);
13399	}
13400	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13401		if (lock == &lk && waitfor == MNT_WAIT) {
13402			rw_wunlock(lock);
13403			BO_LOCK(bp->b_bufobj);
13404			BUF_UNLOCK(bp);
13405			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13406				bp->b_vflags |= BV_BKGRDWAIT;
13407				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
13408				       PRIBIO | PDROP, "getbuf", 0);
13409			} else
13410				BO_UNLOCK(bp->b_bufobj);
13411			rw_wlock(lock);
13412			return (NULL);
13413		}
13414		BUF_UNLOCK(bp);
13415		if (waitfor != MNT_WAIT)
13416			return (NULL);
13417		/*
13418		 * The lock argument must be bp->b_vp's mutex in
13419		 * this case.
13420		 */
13421#ifdef	DEBUG_VFS_LOCKS
13422		if (bp->b_vp->v_type != VCHR)
13423			ASSERT_BO_WLOCKED(bp->b_bufobj);
13424#endif
13425		bp->b_vflags |= BV_BKGRDWAIT;
13426		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
13427		return (NULL);
13428	}
13429	if ((bp->b_flags & B_DELWRI) == 0) {
13430		BUF_UNLOCK(bp);
13431		return (NULL);
13432	}
13433	bremfree(bp);
13434	return (bp);
13435}
13436
13437
13438/*
13439 * Check if it is safe to suspend the file system now.  On entry,
13440 * the vnode interlock for devvp should be held.  Return 0 with
13441 * the mount interlock held if the file system can be suspended now,
13442 * otherwise return EAGAIN with the mount interlock held.
13443 */
13444int
13445softdep_check_suspend(struct mount *mp,
13446		      struct vnode *devvp,
13447		      int softdep_deps,
13448		      int softdep_accdeps,
13449		      int secondary_writes,
13450		      int secondary_accwrites)
13451{
13452	struct bufobj *bo;
13453	struct ufsmount *ump;
13454	int error;
13455
13456	ump = VFSTOUFS(mp);
13457	bo = &devvp->v_bufobj;
13458	ASSERT_BO_WLOCKED(bo);
13459
13460	for (;;) {
13461		if (!TRY_ACQUIRE_LOCK(&lk)) {
13462			BO_UNLOCK(bo);
13463			ACQUIRE_LOCK(&lk);
13464			FREE_LOCK(&lk);
13465			BO_LOCK(bo);
13466			continue;
13467		}
13468		MNT_ILOCK(mp);
13469		if (mp->mnt_secondary_writes != 0) {
13470			FREE_LOCK(&lk);
13471			BO_UNLOCK(bo);
13472			msleep(&mp->mnt_secondary_writes,
13473			       MNT_MTX(mp),
13474			       (PUSER - 1) | PDROP, "secwr", 0);
13475			BO_LOCK(bo);
13476			continue;
13477		}
13478		break;
13479	}
13480
13481	/*
13482	 * Reasons for needing more work before suspend:
13483	 * - Dirty buffers on devvp.
13484	 * - Softdep activity occurred after start of vnode sync loop
13485	 * - Secondary writes occurred after start of vnode sync loop
13486	 */
13487	error = 0;
13488	if (bo->bo_numoutput > 0 ||
13489	    bo->bo_dirty.bv_cnt > 0 ||
13490	    softdep_deps != 0 ||
13491	    ump->softdep_deps != 0 ||
13492	    softdep_accdeps != ump->softdep_accdeps ||
13493	    secondary_writes != 0 ||
13494	    mp->mnt_secondary_writes != 0 ||
13495	    secondary_accwrites != mp->mnt_secondary_accwrites)
13496		error = EAGAIN;
13497	FREE_LOCK(&lk);
13498	BO_UNLOCK(bo);
13499	return (error);
13500}
13501
13502
13503/*
13504 * Get the number of dependency structures for the file system, both
13505 * the current number and the total number allocated.  These will
13506 * later be used to detect that softdep processing has occurred.
13507 */
13508void
13509softdep_get_depcounts(struct mount *mp,
13510		      int *softdep_depsp,
13511		      int *softdep_accdepsp)
13512{
13513	struct ufsmount *ump;
13514
13515	ump = VFSTOUFS(mp);
13516	ACQUIRE_LOCK(&lk);
13517	*softdep_depsp = ump->softdep_deps;
13518	*softdep_accdepsp = ump->softdep_accdeps;
13519	FREE_LOCK(&lk);
13520}
13521
13522/*
13523 * Wait for pending output on a vnode to complete.
13524 * Must be called with vnode lock and interlock locked.
13525 *
13526 * XXX: Should just be a call to bufobj_wwait().
13527 */
13528static void
13529drain_output(vp)
13530	struct vnode *vp;
13531{
13532	struct bufobj *bo;
13533
13534	bo = &vp->v_bufobj;
13535	ASSERT_VOP_LOCKED(vp, "drain_output");
13536	ASSERT_BO_WLOCKED(bo);
13537
13538	while (bo->bo_numoutput) {
13539		bo->bo_flag |= BO_WWAIT;
13540		msleep((caddr_t)&bo->bo_numoutput,
13541		    BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0);
13542	}
13543}
13544
13545/*
13546 * Called whenever a buffer that is being invalidated or reallocated
13547 * contains dependencies. This should only happen if an I/O error has
13548 * occurred. The routine is called with the buffer locked.
13549 */
13550static void
13551softdep_deallocate_dependencies(bp)
13552	struct buf *bp;
13553{
13554
13555	if ((bp->b_ioflags & BIO_ERROR) == 0)
13556		panic("softdep_deallocate_dependencies: dangling deps");
13557	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
13558		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
13559	else
13560		printf("softdep_deallocate_dependencies: "
13561		    "got error %d while accessing filesystem\n", bp->b_error);
13562	if (bp->b_error != ENXIO)
13563		panic("softdep_deallocate_dependencies: unrecovered I/O error");
13564}
13565
13566/*
13567 * Function to handle asynchronous write errors in the filesystem.
13568 */
13569static void
13570softdep_error(func, error)
13571	char *func;
13572	int error;
13573{
13574
13575	/* XXX should do something better! */
13576	printf("%s: got error %d while accessing filesystem\n", func, error);
13577}
13578
13579#ifdef DDB
13580
13581static void
13582inodedep_print(struct inodedep *inodedep, int verbose)
13583{
13584	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
13585	    " saveino %p\n",
13586	    inodedep, inodedep->id_fs, inodedep->id_state,
13587	    (intmax_t)inodedep->id_ino,
13588	    (intmax_t)fsbtodb(inodedep->id_fs,
13589	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
13590	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
13591	    inodedep->id_savedino1);
13592
13593	if (verbose == 0)
13594		return;
13595
13596	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
13597	    "mkdiradd %p\n",
13598	    LIST_FIRST(&inodedep->id_pendinghd),
13599	    LIST_FIRST(&inodedep->id_bufwait),
13600	    LIST_FIRST(&inodedep->id_inowait),
13601	    TAILQ_FIRST(&inodedep->id_inoreflst),
13602	    inodedep->id_mkdiradd);
13603	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
13604	    TAILQ_FIRST(&inodedep->id_inoupdt),
13605	    TAILQ_FIRST(&inodedep->id_newinoupdt),
13606	    TAILQ_FIRST(&inodedep->id_extupdt),
13607	    TAILQ_FIRST(&inodedep->id_newextupdt));
13608}
13609
13610DB_SHOW_COMMAND(inodedep, db_show_inodedep)
13611{
13612
13613	if (have_addr == 0) {
13614		db_printf("Address required\n");
13615		return;
13616	}
13617	inodedep_print((struct inodedep*)addr, 1);
13618}
13619
13620DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
13621{
13622	struct inodedep_hashhead *inodedephd;
13623	struct inodedep *inodedep;
13624	struct fs *fs;
13625	int cnt;
13626
13627	fs = have_addr ? (struct fs *)addr : NULL;
13628	for (cnt = 0; cnt < inodedep_hash; cnt++) {
13629		inodedephd = &inodedep_hashtbl[cnt];
13630		LIST_FOREACH(inodedep, inodedephd, id_hash) {
13631			if (fs != NULL && fs != inodedep->id_fs)
13632				continue;
13633			inodedep_print(inodedep, 0);
13634		}
13635	}
13636}
13637
13638DB_SHOW_COMMAND(worklist, db_show_worklist)
13639{
13640	struct worklist *wk;
13641
13642	if (have_addr == 0) {
13643		db_printf("Address required\n");
13644		return;
13645	}
13646	wk = (struct worklist *)addr;
13647	printf("worklist: %p type %s state 0x%X\n",
13648	    wk, TYPENAME(wk->wk_type), wk->wk_state);
13649}
13650
13651DB_SHOW_COMMAND(workhead, db_show_workhead)
13652{
13653	struct workhead *wkhd;
13654	struct worklist *wk;
13655	int i;
13656
13657	if (have_addr == 0) {
13658		db_printf("Address required\n");
13659		return;
13660	}
13661	wkhd = (struct workhead *)addr;
13662	wk = LIST_FIRST(wkhd);
13663	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
13664		db_printf("worklist: %p type %s state 0x%X",
13665		    wk, TYPENAME(wk->wk_type), wk->wk_state);
13666	if (i == 100)
13667		db_printf("workhead overflow");
13668	printf("\n");
13669}
13670
13671
13672DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
13673{
13674	struct jaddref *jaddref;
13675	struct diradd *diradd;
13676	struct mkdir *mkdir;
13677
13678	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
13679		diradd = mkdir->md_diradd;
13680		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
13681		    mkdir, mkdir->md_state, diradd, diradd->da_state);
13682		if ((jaddref = mkdir->md_jaddref) != NULL)
13683			db_printf(" jaddref %p jaddref state 0x%X",
13684			    jaddref, jaddref->ja_state);
13685		db_printf("\n");
13686	}
13687}
13688
13689/* exported to ffs_vfsops.c */
13690extern void db_print_ffs(struct ufsmount *ump);
13691void
13692db_print_ffs(struct ufsmount *ump)
13693{
13694	db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
13695	    ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
13696	    ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
13697	    ump->softdep_deps, ump->softdep_req);
13698}
13699
13700#endif /* DDB */
13701
13702#endif /* SOFTUPDATES */
13703