1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD$");
44
45#include "opt_ffs.h"
46#include "opt_quota.h"
47#include "opt_ddb.h"
48
49/*
50 * For now we want the safety net that the DEBUG flag provides.
51 */
52#ifndef DEBUG
53#define DEBUG
54#endif
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/ktr.h>
64#include <sys/limits.h>
65#include <sys/lock.h>
66#include <sys/malloc.h>
67#include <sys/mount.h>
68#include <sys/mutex.h>
69#include <sys/namei.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/stat.h>
73#include <sys/sysctl.h>
74#include <sys/syslog.h>
75#include <sys/vnode.h>
76#include <sys/conf.h>
77
78#include <ufs/ufs/dir.h>
79#include <ufs/ufs/extattr.h>
80#include <ufs/ufs/quota.h>
81#include <ufs/ufs/inode.h>
82#include <ufs/ufs/ufsmount.h>
83#include <ufs/ffs/fs.h>
84#include <ufs/ffs/softdep.h>
85#include <ufs/ffs/ffs_extern.h>
86#include <ufs/ufs/ufs_extern.h>
87
88#include <vm/vm.h>
89#include <vm/vm_extern.h>
90#include <vm/vm_object.h>
91
92#include <geom/geom.h>
93
94#include <ddb/ddb.h>
95
96#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
97
98#ifndef SOFTUPDATES
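/*
 * Stub versions of the soft updates interface for kernels built without
 * SOFTUPDATES.  Most of these routines panic if they are ever called.
 */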
99
100int
101softdep_flushfiles(oldmnt, flags, td)
102	struct mount *oldmnt;
103	int flags;
104	struct thread *td;
105{
106
107	panic("softdep_flushfiles called");
108}
109
110int
111softdep_mount(devvp, mp, fs, cred)
112	struct vnode *devvp;
113	struct mount *mp;
114	struct fs *fs;
115	struct ucred *cred;
116{
117
118	return (0);
119}
120
121void
122softdep_initialize()
123{
124
125	return;
126}
127
128void
129softdep_uninitialize()
130{
131
132	return;
133}
134
135void
136softdep_unmount(mp)
137	struct mount *mp;
138{
139
140}
141
142void
143softdep_setup_sbupdate(ump, fs, bp)
144	struct ufsmount *ump;
145	struct fs *fs;
146	struct buf *bp;
147{
148}
149
150void
151softdep_setup_inomapdep(bp, ip, newinum, mode)
152	struct buf *bp;
153	struct inode *ip;
154	ino_t newinum;
155	int mode;
156{
157
158	panic("softdep_setup_inomapdep called");
159}
160
161void
162softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
163	struct buf *bp;
164	struct mount *mp;
165	ufs2_daddr_t newblkno;
166	int frags;
167	int oldfrags;
168{
169
170	panic("softdep_setup_blkmapdep called");
171}
172
173void
174softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
175	struct inode *ip;
176	ufs_lbn_t lbn;
177	ufs2_daddr_t newblkno;
178	ufs2_daddr_t oldblkno;
179	long newsize;
180	long oldsize;
181	struct buf *bp;
182{
183
184	panic("softdep_setup_allocdirect called");
185}
186
187void
188softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
189	struct inode *ip;
190	ufs_lbn_t lbn;
191	ufs2_daddr_t newblkno;
192	ufs2_daddr_t oldblkno;
193	long newsize;
194	long oldsize;
195	struct buf *bp;
196{
197
198	panic("softdep_setup_allocext called");
199}
200
201void
202softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
203	struct inode *ip;
204	ufs_lbn_t lbn;
205	struct buf *bp;
206	int ptrno;
207	ufs2_daddr_t newblkno;
208	ufs2_daddr_t oldblkno;
209	struct buf *nbp;
210{
211
212	panic("softdep_setup_allocindir_page called");
213}
214
215void
216softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
217	struct buf *nbp;
218	struct inode *ip;
219	struct buf *bp;
220	int ptrno;
221	ufs2_daddr_t newblkno;
222{
223
224	panic("softdep_setup_allocindir_meta called");
225}
226
227void
228softdep_journal_freeblocks(ip, cred, length, flags)
229	struct inode *ip;
230	struct ucred *cred;
231	off_t length;
232	int flags;
233{
234
235	panic("softdep_journal_freeblocks called");
236}
237
238void
239softdep_journal_fsync(ip)
240	struct inode *ip;
241{
242
243	panic("softdep_journal_fsync called");
244}
245
246void
247softdep_setup_freeblocks(ip, length, flags)
248	struct inode *ip;
249	off_t length;
250	int flags;
251{
252
253	panic("softdep_setup_freeblocks called");
254}
255
256void
257softdep_freefile(pvp, ino, mode)
258	struct vnode *pvp;
259	ino_t ino;
260	int mode;
261{
262
263	panic("softdep_freefile called");
264}
265
266int
267softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
268	struct buf *bp;
269	struct inode *dp;
270	off_t diroffset;
271	ino_t newinum;
272	struct buf *newdirbp;
273	int isnewblk;
274{
275
276	panic("softdep_setup_directory_add called");
277}
278
279void
280softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
281	struct buf *bp;
282	struct inode *dp;
283	caddr_t base;
284	caddr_t oldloc;
285	caddr_t newloc;
286	int entrysize;
287{
288
289	panic("softdep_change_directoryentry_offset called");
290}
291
292void
293softdep_setup_remove(bp, dp, ip, isrmdir)
294	struct buf *bp;
295	struct inode *dp;
296	struct inode *ip;
297	int isrmdir;
298{
299
300	panic("softdep_setup_remove called");
301}
302
303void
304softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
305	struct buf *bp;
306	struct inode *dp;
307	struct inode *ip;
308	ino_t newinum;
309	int isrmdir;
310{
311
312	panic("softdep_setup_directory_change called");
313}
314
315void
316softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
317	struct mount *mp;
318	struct buf *bp;
319	ufs2_daddr_t blkno;
320	int frags;
321	struct workhead *wkhd;
322{
323
324	panic("%s called", __FUNCTION__);
325}
326
327void
328softdep_setup_inofree(mp, bp, ino, wkhd)
329	struct mount *mp;
330	struct buf *bp;
331	ino_t ino;
332	struct workhead *wkhd;
333{
334
335	panic("%s called", __FUNCTION__);
336}
337
338void
339softdep_setup_unlink(dp, ip)
340	struct inode *dp;
341	struct inode *ip;
342{
343
344	panic("%s called", __FUNCTION__);
345}
346
347void
348softdep_setup_link(dp, ip)
349	struct inode *dp;
350	struct inode *ip;
351{
352
353	panic("%s called", __FUNCTION__);
354}
355
356void
357softdep_revert_link(dp, ip)
358	struct inode *dp;
359	struct inode *ip;
360{
361
362	panic("%s called", __FUNCTION__);
363}
364
365void
366softdep_setup_rmdir(dp, ip)
367	struct inode *dp;
368	struct inode *ip;
369{
370
371	panic("%s called", __FUNCTION__);
372}
373
374void
375softdep_revert_rmdir(dp, ip)
376	struct inode *dp;
377	struct inode *ip;
378{
379
380	panic("%s called", __FUNCTION__);
381}
382
383void
384softdep_setup_create(dp, ip)
385	struct inode *dp;
386	struct inode *ip;
387{
388
389	panic("%s called", __FUNCTION__);
390}
391
392void
393softdep_revert_create(dp, ip)
394	struct inode *dp;
395	struct inode *ip;
396{
397
398	panic("%s called", __FUNCTION__);
399}
400
401void
402softdep_setup_mkdir(dp, ip)
403	struct inode *dp;
404	struct inode *ip;
405{
406
407	panic("%s called", __FUNCTION__);
408}
409
410void
411softdep_revert_mkdir(dp, ip)
412	struct inode *dp;
413	struct inode *ip;
414{
415
416	panic("%s called", __FUNCTION__);
417}
418
419void
420softdep_setup_dotdot_link(dp, ip)
421	struct inode *dp;
422	struct inode *ip;
423{
424
425	panic("%s called", __FUNCTION__);
426}
427
428int
429softdep_prealloc(vp, waitok)
430	struct vnode *vp;
431	int waitok;
432{
433
434	panic("%s called", __FUNCTION__);
435
436	return (0);
437}
438
439int
440softdep_journal_lookup(mp, vpp)
441	struct mount *mp;
442	struct vnode **vpp;
443{
444
445	return (ENOENT);
446}
447
448void
449softdep_change_linkcnt(ip)
450	struct inode *ip;
451{
452
453	panic("softdep_change_linkcnt called");
454}
455
456void
457softdep_load_inodeblock(ip)
458	struct inode *ip;
459{
460
461	panic("softdep_load_inodeblock called");
462}
463
464void
465softdep_update_inodeblock(ip, bp, waitfor)
466	struct inode *ip;
467	struct buf *bp;
468	int waitfor;
469{
470
471	panic("softdep_update_inodeblock called");
472}
473
474int
475softdep_fsync(vp)
476	struct vnode *vp;	/* the "in_core" copy of the inode */
477{
478
479	return (0);
480}
481
482void
483softdep_fsync_mountdev(vp)
484	struct vnode *vp;
485{
486
487	return;
488}
489
490int
491softdep_flushworklist(oldmnt, countp, td)
492	struct mount *oldmnt;
493	int *countp;
494	struct thread *td;
495{
496
497	*countp = 0;
498	return (0);
499}
500
501int
502softdep_sync_metadata(struct vnode *vp)
503{
504
505	return (0);
506}
507
508int
509softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
510{
511
512	return (0);
513}
514
515int
516softdep_slowdown(vp)
517	struct vnode *vp;
518{
519
520	panic("softdep_slowdown called");
521}
522
523void
524softdep_releasefile(ip)
525	struct inode *ip;	/* inode with zero effective link count */
526{
527
528	panic("softdep_releasefile called");
529}
530
531int
532softdep_request_cleanup(fs, vp, cred, resource)
533	struct fs *fs;
534	struct vnode *vp;
535	struct ucred *cred;
536	int resource;
537{
538
539	return (0);
540}
541
542int
543softdep_check_suspend(struct mount *mp,
544		      struct vnode *devvp,
545		      int softdep_deps,
546		      int softdep_accdeps,
547		      int secondary_writes,
548		      int secondary_accwrites)
549{
550	struct bufobj *bo;
551	int error;
552
553	(void) softdep_deps;
554	(void) softdep_accdeps;
555
556	bo = &devvp->v_bufobj;
557	ASSERT_BO_LOCKED(bo);
558
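	/*
	 * Wait for any secondary writes in progress to drain before
	 * deciding whether the filesystem can be suspended.
	 */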
559	MNT_ILOCK(mp);
560	while (mp->mnt_secondary_writes != 0) {
561		BO_UNLOCK(bo);
562		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
563		    (PUSER - 1) | PDROP, "secwr", 0);
564		BO_LOCK(bo);
565		MNT_ILOCK(mp);
566	}
567
568	/*
569	 * Reasons for needing more work before suspend:
570	 * - Dirty buffers on devvp.
571	 * - Secondary writes occurred after start of vnode sync loop
572	 */
573	error = 0;
574	if (bo->bo_numoutput > 0 ||
575	    bo->bo_dirty.bv_cnt > 0 ||
576	    secondary_writes != 0 ||
577	    mp->mnt_secondary_writes != 0 ||
578	    secondary_accwrites != mp->mnt_secondary_accwrites)
579		error = EAGAIN;
580	BO_UNLOCK(bo);
581	return (error);
582}
583
584void
585softdep_get_depcounts(struct mount *mp,
586		      int *softdepactivep,
587		      int *softdepactiveaccp)
588{
589	(void) mp;
590	*softdepactivep = 0;
591	*softdepactiveaccp = 0;
592}
593
594void
595softdep_buf_append(bp, wkhd)
596	struct buf *bp;
597	struct workhead *wkhd;
598{
599
600	panic("softdep_buf_append called");
601}
602
603void
604softdep_inode_append(ip, cred, wkhd)
605	struct inode *ip;
606	struct ucred *cred;
607	struct workhead *wkhd;
608{
609
610	panic("softdep_inode_append called");
611}
612
613void
614softdep_freework(wkhd)
615	struct workhead *wkhd;
616{
617
618	panic("softdep_freework called");
619}
620
621#else
622
623FEATURE(softupdates, "FFS soft-updates support");
624
625/*
626 * These definitions need to be adapted to the system to which
627 * this file is being ported.
628 */
629
630#define M_SOFTDEP_FLAGS	(M_WAITOK)
631
632#define	D_PAGEDEP	0
633#define	D_INODEDEP	1
634#define	D_BMSAFEMAP	2
635#define	D_NEWBLK	3
636#define	D_ALLOCDIRECT	4
637#define	D_INDIRDEP	5
638#define	D_ALLOCINDIR	6
639#define	D_FREEFRAG	7
640#define	D_FREEBLKS	8
641#define	D_FREEFILE	9
642#define	D_DIRADD	10
643#define	D_MKDIR		11
644#define	D_DIRREM	12
645#define	D_NEWDIRBLK	13
646#define	D_FREEWORK	14
647#define	D_FREEDEP	15
648#define	D_JADDREF	16
649#define	D_JREMREF	17
650#define	D_JMVREF	18
651#define	D_JNEWBLK	19
652#define	D_JFREEBLK	20
653#define	D_JFREEFRAG	21
654#define	D_JSEG		22
655#define	D_JSEGDEP	23
656#define	D_SBDEP		24
657#define	D_JTRUNC	25
658#define	D_JFSYNC	26
659#define	D_SENTINEL	27
660#define	D_LAST		D_SENTINEL
661
662unsigned long dep_current[D_LAST + 1];
663unsigned long dep_highuse[D_LAST + 1];
664unsigned long dep_total[D_LAST + 1];
665unsigned long dep_write[D_LAST + 1];
666
667static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
668    "soft updates stats");
669static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
670    "total dependencies allocated");
671static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
672    "high use dependencies allocated");
673static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
674    "current dependencies allocated");
675static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
676    "current dependencies written");
677
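/*
 * For each dependency type, declare its malloc type and the sysctl
 * counters exported under the total, current, highuse and write nodes
 * defined above.
 */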
678#define	SOFTDEP_TYPE(type, str, long)					\
679    static MALLOC_DEFINE(M_ ## type, #str, long);			\
680    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
681	&dep_total[D_ ## type], 0, "");					\
682    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
683	&dep_current[D_ ## type], 0, "");				\
684    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
685	&dep_highuse[D_ ## type], 0, "");				\
686    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
687	&dep_write[D_ ## type], 0, "");
688
689SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
690SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
691SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
692    "Block or frag allocated from cyl group map");
693SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
694SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
695SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
696SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
697SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
698SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
699SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
700SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
701SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
702SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
703SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
704SOFTDEP_TYPE(FREEWORK, freework, "Free an inode block");
705SOFTDEP_TYPE(FREEDEP, freedep, "Track a block free");
706SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
707SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
708SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
709SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
710SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
711SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
712SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
713SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
714SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
715SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
716SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
717
718static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
719
720static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
721static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
722
723/*
724 * translate from workitem type to memory type
725 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
726 */
727static struct malloc_type *memtype[] = {
728	M_PAGEDEP,
729	M_INODEDEP,
730	M_BMSAFEMAP,
731	M_NEWBLK,
732	M_ALLOCDIRECT,
733	M_INDIRDEP,
734	M_ALLOCINDIR,
735	M_FREEFRAG,
736	M_FREEBLKS,
737	M_FREEFILE,
738	M_DIRADD,
739	M_MKDIR,
740	M_DIRREM,
741	M_NEWDIRBLK,
742	M_FREEWORK,
743	M_FREEDEP,
744	M_JADDREF,
745	M_JREMREF,
746	M_JMVREF,
747	M_JNEWBLK,
748	M_JFREEBLK,
749	M_JFREEFRAG,
750	M_JSEG,
751	M_JSEGDEP,
752	M_SBDEP,
753	M_JTRUNC,
754	M_JFSYNC,
755	M_SENTINEL
756};
757
758static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
759
760#define DtoM(type) (memtype[type])
761
762/*
763 * Names of malloc types.
764 */
765#define TYPENAME(type)  \
766	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
767/*
768 * End system adaptation definitions.
769 */
770
771#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
772#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
773
774/*
775 * Forward declarations.
776 */
777struct inodedep_hashhead;
778struct newblk_hashhead;
779struct pagedep_hashhead;
780struct bmsafemap_hashhead;
781
782/*
783 * Private journaling structures.
784 */
785struct jblocks {
786	struct jseglst	jb_segs;	/* TAILQ of current segments. */
787	struct jseg	*jb_writeseg;	/* Next write to complete. */
788	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
789	struct jextent	*jb_extent;	/* Extent array. */
790	uint64_t	jb_nextseq;	/* Next sequence number. */
791	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
792	uint8_t		jb_needseg;	/* Need a forced segment. */
793	uint8_t		jb_suspended;	/* Did journal suspend writes? */
794	int		jb_avail;	/* Available extents. */
795	int		jb_used;	/* Last used extent. */
796	int		jb_head;	/* Allocator head. */
797	int		jb_off;		/* Allocator extent offset. */
798	int		jb_blocks;	/* Total disk blocks covered. */
799	int		jb_free;	/* Total disk blocks free. */
800	int		jb_min;		/* Minimum free space. */
801	int		jb_low;		/* Low on space. */
802	int		jb_age;		/* Insertion time of oldest rec. */
803};
804
805struct jextent {
806	ufs2_daddr_t	je_daddr;	/* Disk block address. */
807	int		je_blocks;	/* Disk block count. */
808};
809
810/*
811 * Internal function prototypes.
812 */
813static	void softdep_error(char *, int);
814static	void drain_output(struct vnode *);
815static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
816static	void clear_remove(struct thread *);
817static	void clear_inodedeps(struct thread *);
818static	void unlinked_inodedep(struct mount *, struct inodedep *);
819static	void clear_unlinked_inodedep(struct inodedep *);
820static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
821static	int flush_pagedep_deps(struct vnode *, struct mount *,
822	    struct diraddhd *);
823static	int free_pagedep(struct pagedep *);
824static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
825static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
826static	int flush_deplist(struct allocdirectlst *, int, int *);
827static	int sync_cgs(struct mount *, int);
828static	int handle_written_filepage(struct pagedep *, struct buf *);
829static	int handle_written_sbdep(struct sbdep *, struct buf *);
830static	void initiate_write_sbdep(struct sbdep *);
831static	void diradd_inode_written(struct diradd *, struct inodedep *);
832static	int handle_written_indirdep(struct indirdep *, struct buf *,
833	    struct buf **);
834static	int handle_written_inodeblock(struct inodedep *, struct buf *);
835static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
836	    uint8_t *);
837static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
838static	void handle_written_jaddref(struct jaddref *);
839static	void handle_written_jremref(struct jremref *);
840static	void handle_written_jseg(struct jseg *, struct buf *);
841static	void handle_written_jnewblk(struct jnewblk *);
842static	void handle_written_jblkdep(struct jblkdep *);
843static	void handle_written_jfreefrag(struct jfreefrag *);
844static	void complete_jseg(struct jseg *);
845static	void complete_jsegs(struct jseg *);
846static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
847static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
848static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
849static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
850static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
851static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
852static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
853static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
854static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
855static	inline void inoref_write(struct inoref *, struct jseg *,
856	    struct jrefrec *);
857static	void handle_allocdirect_partdone(struct allocdirect *,
858	    struct workhead *);
859static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
860	    struct workhead *);
861static	void indirdep_complete(struct indirdep *);
862static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
863static	void indirblk_insert(struct freework *);
864static	void indirblk_remove(struct freework *);
865static	void handle_allocindir_partdone(struct allocindir *);
866static	void initiate_write_filepage(struct pagedep *, struct buf *);
867static	void initiate_write_indirdep(struct indirdep *, struct buf *);
868static	void handle_written_mkdir(struct mkdir *, int);
869static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
870	    uint8_t *);
871static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
872static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
873static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
874static	void handle_workitem_freefile(struct freefile *);
875static	int handle_workitem_remove(struct dirrem *, int);
876static	struct dirrem *newdirrem(struct buf *, struct inode *,
877	    struct inode *, int, struct dirrem **);
878static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
879	    struct buf *);
880static	void cancel_indirdep(struct indirdep *, struct buf *,
881	    struct freeblks *);
882static	void free_indirdep(struct indirdep *);
883static	void free_diradd(struct diradd *, struct workhead *);
884static	void merge_diradd(struct inodedep *, struct diradd *);
885static	void complete_diradd(struct diradd *);
886static	struct diradd *diradd_lookup(struct pagedep *, int);
887static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
888	    struct jremref *);
889static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
890	    struct jremref *);
891static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
892	    struct jremref *, struct jremref *);
893static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
894	    struct jremref *);
895static	void cancel_allocindir(struct allocindir *, struct buf *bp,
896	    struct freeblks *, int);
897static	int setup_trunc_indir(struct freeblks *, struct inode *,
898	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
899static	void complete_trunc_indir(struct freework *);
900static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
901	    int);
902static	void complete_mkdir(struct mkdir *);
903static	void free_newdirblk(struct newdirblk *);
904static	void free_jremref(struct jremref *);
905static	void free_jaddref(struct jaddref *);
906static	void free_jsegdep(struct jsegdep *);
907static	void free_jsegs(struct jblocks *);
908static	void rele_jseg(struct jseg *);
909static	void free_jseg(struct jseg *, struct jblocks *);
910static	void free_jnewblk(struct jnewblk *);
911static	void free_jblkdep(struct jblkdep *);
912static	void free_jfreefrag(struct jfreefrag *);
913static	void free_freedep(struct freedep *);
914static	void journal_jremref(struct dirrem *, struct jremref *,
915	    struct inodedep *);
916static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
917static	int cancel_jaddref(struct jaddref *, struct inodedep *,
918	    struct workhead *);
919static	void cancel_jfreefrag(struct jfreefrag *);
920static	inline void setup_freedirect(struct freeblks *, struct inode *,
921	    int, int);
922static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
923static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
924	    ufs_lbn_t, int);
925static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
926static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
927static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
928ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
929static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
930static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
931	    int, int);
932static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
933static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
934static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
935static	void newblk_freefrag(struct newblk *);
936static	void free_newblk(struct newblk *);
937static	void cancel_allocdirect(struct allocdirectlst *,
938	    struct allocdirect *, struct freeblks *);
939static	int check_inode_unwritten(struct inodedep *);
940static	int free_inodedep(struct inodedep *);
941static	void freework_freeblock(struct freework *);
942static	void freework_enqueue(struct freework *);
943static	int handle_workitem_freeblocks(struct freeblks *, int);
944static	int handle_complete_freeblocks(struct freeblks *, int);
945static	void handle_workitem_indirblk(struct freework *);
946static	void handle_written_freework(struct freework *);
947static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
948static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
949	    struct workhead *);
950static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
951	    struct inodedep *, struct allocindir *, ufs_lbn_t);
952static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
953	    ufs2_daddr_t, ufs_lbn_t);
954static	void handle_workitem_freefrag(struct freefrag *);
955static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
956	    ufs_lbn_t);
957static	void allocdirect_merge(struct allocdirectlst *,
958	    struct allocdirect *, struct allocdirect *);
959static	struct freefrag *allocindir_merge(struct allocindir *,
960	    struct allocindir *);
961static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
962	    struct bmsafemap **);
963static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
964	    int cg, struct bmsafemap *);
965static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
966	    int, struct newblk **);
967static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
968static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
969	    struct inodedep **);
970static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
971static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
972	    int, struct pagedep **);
973static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
974	    struct mount *mp, int, struct pagedep **);
975static	void pause_timer(void *);
976static	int request_cleanup(struct mount *, int);
977static	int process_worklist_item(struct mount *, int, int);
978static	void process_removes(struct vnode *);
979static	void process_truncates(struct vnode *);
980static	void jwork_move(struct workhead *, struct workhead *);
981static	void jwork_insert(struct workhead *, struct jsegdep *);
982static	void add_to_worklist(struct worklist *, int);
983static	void wake_worklist(struct worklist *);
984static	void wait_worklist(struct worklist *, char *);
985static	void remove_from_worklist(struct worklist *);
986static	void softdep_flush(void);
987static	void softdep_flushjournal(struct mount *);
988static	int softdep_speedup(void);
989static	void worklist_speedup(void);
990static	int journal_mount(struct mount *, struct fs *, struct ucred *);
991static	void journal_unmount(struct mount *);
992static	int journal_space(struct ufsmount *, int);
993static	void journal_suspend(struct ufsmount *);
994static	int journal_unsuspend(struct ufsmount *ump);
995static	void softdep_prelink(struct vnode *, struct vnode *);
996static	void add_to_journal(struct worklist *);
997static	void remove_from_journal(struct worklist *);
998static	void softdep_process_journal(struct mount *, struct worklist *, int);
999static	struct jremref *newjremref(struct dirrem *, struct inode *,
1000	    struct inode *ip, off_t, nlink_t);
1001static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
1002	    uint16_t);
1003static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
1004	    uint16_t);
1005static	inline struct jsegdep *inoref_jseg(struct inoref *);
1006static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
1007static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
1008	    ufs2_daddr_t, int);
1009static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
1010static	void move_newblock_dep(struct jaddref *, struct inodedep *);
1011static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
1012static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
1013	    ufs2_daddr_t, long, ufs_lbn_t);
1014static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
1015	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
1016static	int jwait(struct worklist *, int);
1017static	struct inodedep *inodedep_lookup_ip(struct inode *);
1018static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
1019static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
1020static	void handle_jwork(struct workhead *);
1021static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
1022	    struct mkdir **);
1023static	struct jblocks *jblocks_create(void);
1024static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
1025static	void jblocks_free(struct jblocks *, struct mount *, int);
1026static	void jblocks_destroy(struct jblocks *);
1027static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
1028
1029/*
1030 * Exported softdep operations.
1031 */
1032static	void softdep_disk_io_initiation(struct buf *);
1033static	void softdep_disk_write_complete(struct buf *);
1034static	void softdep_deallocate_dependencies(struct buf *);
1035static	int softdep_count_dependencies(struct buf *bp, int);
1036
1037static struct mtx lk;
1038MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
1039
1040#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
1041#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
1042#define FREE_LOCK(lk)			mtx_unlock(lk)
1043
1044#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
1045#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)
1046
1047/*
1048 * Worklist queue management.
1049 * These routines require that the lock be held.
1050 */
1051#ifndef /* NOT */ DEBUG
1052#define WORKLIST_INSERT(head, item) do {	\
1053	(item)->wk_state |= ONWORKLIST;		\
1054	LIST_INSERT_HEAD(head, item, wk_list);	\
1055} while (0)
1056#define WORKLIST_REMOVE(item) do {		\
1057	(item)->wk_state &= ~ONWORKLIST;	\
1058	LIST_REMOVE(item, wk_list);		\
1059} while (0)
1060#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1061#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1062
1063#else /* DEBUG */
1064static	void worklist_insert(struct workhead *, struct worklist *, int);
1065static	void worklist_remove(struct worklist *, int);
1066
1067#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1068#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1069#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1070#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1071
1072static void
1073worklist_insert(head, item, locked)
1074	struct workhead *head;
1075	struct worklist *item;
1076	int locked;
1077{
1078
1079	if (locked)
1080		mtx_assert(&lk, MA_OWNED);
1081	if (item->wk_state & ONWORKLIST)
1082		panic("worklist_insert: %p %s(0x%X) already on list",
1083		    item, TYPENAME(item->wk_type), item->wk_state);
1084	item->wk_state |= ONWORKLIST;
1085	LIST_INSERT_HEAD(head, item, wk_list);
1086}
1087
1088static void
1089worklist_remove(item, locked)
1090	struct worklist *item;
1091	int locked;
1092{
1093
1094	if (locked)
1095		mtx_assert(&lk, MA_OWNED);
1096	if ((item->wk_state & ONWORKLIST) == 0)
1097		panic("worklist_remove: %p %s(0x%X) not on list",
1098		    item, TYPENAME(item->wk_type), item->wk_state);
1099	item->wk_state &= ~ONWORKLIST;
1100	LIST_REMOVE(item, wk_list);
1101}
1102#endif /* DEBUG */
1103
1104/*
1105 * Merge two jsegdeps, keeping only the oldest one, as newer references
1106 * cannot be discarded until after older ones.
1107 */
1108static inline struct jsegdep *
1109jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1110{
1111	struct jsegdep *swp;
1112
1113	if (two == NULL)
1114		return (one);
1115
1116	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1117		swp = one;
1118		one = two;
1119		two = swp;
1120	}
1121	WORKLIST_REMOVE(&two->jd_list);
1122	free_jsegdep(two);
1123
1124	return (one);
1125}
1126
1127/*
1128 * If two freedeps are compatible free one to reduce list size.
1129 */
1130static inline struct freedep *
1131freedep_merge(struct freedep *one, struct freedep *two)
1132{
1133	if (two == NULL)
1134		return (one);
1135
1136	if (one->fd_freework == two->fd_freework) {
1137		WORKLIST_REMOVE(&two->fd_list);
1138		free_freedep(two);
1139	}
1140	return (one);
1141}
1142
1143/*
1144 * Move journal work from one list to another.  Duplicate freedeps and
1145 * jsegdeps are coalesced to keep the lists as small as possible.
1146 */
1147static void
1148jwork_move(dst, src)
1149	struct workhead *dst;
1150	struct workhead *src;
1151{
1152	struct freedep *freedep;
1153	struct jsegdep *jsegdep;
1154	struct worklist *wkn;
1155	struct worklist *wk;
1156
1157	KASSERT(dst != src,
1158	    ("jwork_move: dst == src"));
1159	freedep = NULL;
1160	jsegdep = NULL;
1161	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1162		if (wk->wk_type == D_JSEGDEP)
1163			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1164		if (wk->wk_type == D_FREEDEP)
1165			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1166	}
1167
1168	mtx_assert(&lk, MA_OWNED);
1169	while ((wk = LIST_FIRST(src)) != NULL) {
1170		WORKLIST_REMOVE(wk);
1171		WORKLIST_INSERT(dst, wk);
1172		if (wk->wk_type == D_JSEGDEP) {
1173			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1174			continue;
1175		}
1176		if (wk->wk_type == D_FREEDEP)
1177			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1178	}
1179}
1180
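/*
 * Insert a single jsegdep on a work list, keeping only the entry whose
 * journal segment is oldest if one is already present.
 */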
1181static void
1182jwork_insert(dst, jsegdep)
1183	struct workhead *dst;
1184	struct jsegdep *jsegdep;
1185{
1186	struct jsegdep *jsegdepn;
1187	struct worklist *wk;
1188
1189	LIST_FOREACH(wk, dst, wk_list)
1190		if (wk->wk_type == D_JSEGDEP)
1191			break;
1192	if (wk == NULL) {
1193		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1194		return;
1195	}
1196	jsegdepn = WK_JSEGDEP(wk);
1197	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1198		WORKLIST_REMOVE(wk);
1199		free_jsegdep(jsegdepn);
1200		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1201	} else
1202		free_jsegdep(jsegdep);
1203}
1204
1205/*
1206 * Routines for tracking and managing workitems.
1207 */
1208static	void workitem_free(struct worklist *, int);
1209static	void workitem_alloc(struct worklist *, int, struct mount *);
1210static	void workitem_reassign(struct worklist *, int);
1211
1212#define	WORKITEM_FREE(item, type) \
1213	workitem_free((struct worklist *)(item), (type))
1214#define	WORKITEM_REASSIGN(item, type) \
1215	workitem_reassign((struct worklist *)(item), (type))
1216
1217static void
1218workitem_free(item, type)
1219	struct worklist *item;
1220	int type;
1221{
1222	struct ufsmount *ump;
1223	mtx_assert(&lk, MA_OWNED);
1224
1225#ifdef DEBUG
1226	if (item->wk_state & ONWORKLIST)
1227		panic("workitem_free: %s(0x%X) still on list",
1228		    TYPENAME(item->wk_type), item->wk_state);
1229	if (item->wk_type != type && type != D_NEWBLK)
1230		panic("workitem_free: type mismatch %s != %s",
1231		    TYPENAME(item->wk_type), TYPENAME(type));
1232#endif
1233	if (item->wk_state & IOWAITING)
1234		wakeup(item);
1235	ump = VFSTOUFS(item->wk_mp);
1236	KASSERT(ump->softdep_deps > 0,
1237	    ("workitem_free: %s: softdep_deps going negative",
1238	    ump->um_fs->fs_fsmnt));
1239	if (--ump->softdep_deps == 0 && ump->softdep_req)
1240		wakeup(&ump->softdep_deps);
1241	KASSERT(dep_current[item->wk_type] > 0,
1242	    ("workitem_free: %s: dep_current[%s] going negative",
1243	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1244	dep_current[item->wk_type]--;
1245	free(item, DtoM(type));
1246}
1247
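/*
 * Initialize a freshly allocated work item and charge it to the
 * per-type and per-mount dependency statistics.
 */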
1248static void
1249workitem_alloc(item, type, mp)
1250	struct worklist *item;
1251	int type;
1252	struct mount *mp;
1253{
1254	struct ufsmount *ump;
1255
1256	item->wk_type = type;
1257	item->wk_mp = mp;
1258	item->wk_state = 0;
1259
1260	ump = VFSTOUFS(mp);
1261	ACQUIRE_LOCK(&lk);
1262	dep_current[type]++;
1263	if (dep_current[type] > dep_highuse[type])
1264		dep_highuse[type] = dep_current[type];
1265	dep_total[type]++;
1266	ump->softdep_deps++;
1267	ump->softdep_accdeps++;
1268	FREE_LOCK(&lk);
1269}
1270
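/*
 * Change the type of a work item, moving its statistics from the old
 * type to the new one.
 */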
1271static void
1272workitem_reassign(item, newtype)
1273	struct worklist *item;
1274	int newtype;
1275{
1276
1277	KASSERT(dep_current[item->wk_type] > 0,
1278	    ("workitem_reassign: %s: dep_current[%s] going negative",
1279	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1280	dep_current[item->wk_type]--;
1281	dep_current[newtype]++;
1282	if (dep_current[newtype] > dep_highuse[newtype])
1283		dep_highuse[newtype] = dep_current[newtype];
1284	dep_total[newtype]++;
1285	item->wk_type = newtype;
1286}
1287
1288/*
1289 * Workitem queue management
1290 */
1291static int max_softdeps;	/* maximum number of structs before slowdown */
1292static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1293static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1294static int proc_waiting;	/* tracks whether we have a timeout posted */
1295static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1296static struct callout softdep_callout;
1297static int req_pending;
1298static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1299static int req_clear_remove;	/* syncer process flush some freeblks */
1300static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1301
1302/*
1303 * runtime statistics
1304 */
1305static int stat_worklist_push;	/* number of worklist cleanups */
1306static int stat_blk_limit_push;	/* number of times block limit neared */
1307static int stat_ino_limit_push;	/* number of times inode limit neared */
1308static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1309static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1310static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1311static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1312static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1313static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1314static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1315static int stat_jaddref;	/* bufs redirtied as ino bitmap cannot write */
1316static int stat_jnewblk;	/* bufs redirtied as blk bitmap cannot write */
1317static int stat_journal_min;	/* Times hit journal min threshold */
1318static int stat_journal_low;	/* Times hit journal low threshold */
1319static int stat_journal_wait;	/* Times blocked in jwait(). */
1320static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1321static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1322static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1323static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1324static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1325static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1326static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1327static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1328static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1329
1330SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1331    &max_softdeps, 0, "");
1332SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1333    &tickdelay, 0, "");
1334SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1335    &maxindirdeps, 0, "");
1336SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1337    &stat_worklist_push, 0,"");
1338SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1339    &stat_blk_limit_push, 0,"");
1340SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1341    &stat_ino_limit_push, 0,"");
1342SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1343    &stat_blk_limit_hit, 0, "");
1344SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1345    &stat_ino_limit_hit, 0, "");
1346SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1347    &stat_sync_limit_hit, 0, "");
1348SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1349    &stat_indir_blk_ptrs, 0, "");
1350SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1351    &stat_inode_bitmap, 0, "");
1352SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1353    &stat_direct_blk_ptrs, 0, "");
1354SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1355    &stat_dir_entry, 0, "");
1356SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1357    &stat_jaddref, 0, "");
1358SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1359    &stat_jnewblk, 0, "");
1360SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1361    &stat_journal_low, 0, "");
1362SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1363    &stat_journal_min, 0, "");
1364SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1365    &stat_journal_wait, 0, "");
1366SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1367    &stat_jwait_filepage, 0, "");
1368SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1369    &stat_jwait_freeblks, 0, "");
1370SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1371    &stat_jwait_inode, 0, "");
1372SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1373    &stat_jwait_newblk, 0, "");
1374SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1375    &stat_cleanup_blkrequests, 0, "");
1376SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1377    &stat_cleanup_inorequests, 0, "");
1378SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1379    &stat_cleanup_high_delay, 0, "");
1380SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1381    &stat_cleanup_retries, 0, "");
1382SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1383    &stat_cleanup_failures, 0, "");
1384SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1385    &softdep_flushcache, 0, "");
1386
1387SYSCTL_DECL(_vfs_ffs);
1388
1389LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1390static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1391
1392static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1393SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1394	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1395
1396static struct proc *softdepproc;
1397static struct kproc_desc softdep_kp = {
1398	"softdepflush",
1399	softdep_flush,
1400	&softdepproc
1401};
1402SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1403    &softdep_kp);
1404
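/*
 * Body of the "softdepflush" kernel process: service requests to clear
 * inode and removal dependencies, then walk the mount list processing
 * the pending worklist of each soft-updates mount.
 */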
1405static void
1406softdep_flush(void)
1407{
1408	struct mount *nmp;
1409	struct mount *mp;
1410	struct ufsmount *ump;
1411	struct thread *td;
1412	int remaining;
1413	int progress;
1414	int vfslocked;
1415
1416	td = curthread;
1417	td->td_pflags |= TDP_NORUNNINGBUF;
1418
1419	for (;;) {
1420		kproc_suspend_check(softdepproc);
1421		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
1422		ACQUIRE_LOCK(&lk);
1423		/*
1424		 * If requested, try removing inode or removal dependencies.
1425		 */
1426		if (req_clear_inodedeps) {
1427			clear_inodedeps(td);
1428			req_clear_inodedeps -= 1;
1429			wakeup_one(&proc_waiting);
1430		}
1431		if (req_clear_remove) {
1432			clear_remove(td);
1433			req_clear_remove -= 1;
1434			wakeup_one(&proc_waiting);
1435		}
1436		FREE_LOCK(&lk);
1437		VFS_UNLOCK_GIANT(vfslocked);
1438		remaining = progress = 0;
1439		mtx_lock(&mountlist_mtx);
1440		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1441			nmp = TAILQ_NEXT(mp, mnt_list);
1442			if (MOUNTEDSOFTDEP(mp) == 0)
1443				continue;
1444			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1445				continue;
1446			vfslocked = VFS_LOCK_GIANT(mp);
1447			progress += softdep_process_worklist(mp, 0);
1448			ump = VFSTOUFS(mp);
1449			remaining += ump->softdep_on_worklist;
1450			VFS_UNLOCK_GIANT(vfslocked);
1451			mtx_lock(&mountlist_mtx);
1452			nmp = TAILQ_NEXT(mp, mnt_list);
1453			vfs_unbusy(mp);
1454		}
1455		mtx_unlock(&mountlist_mtx);
1456		if (remaining && progress)
1457			continue;
1458		ACQUIRE_LOCK(&lk);
1459		if (!req_pending)
1460			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1461		req_pending = 0;
1462		FREE_LOCK(&lk);
1463	}
1464}
1465
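/*
 * Wake the softdep flushing thread if a flush request is not already
 * pending.
 */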
1466static void
1467worklist_speedup(void)
1468{
1469	mtx_assert(&lk, MA_OWNED);
1470	if (req_pending == 0) {
1471		req_pending = 1;
1472		wakeup(&req_pending);
1473	}
1474}
1475
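/*
 * Nudge the softdep flusher, the buf daemon and the syncer into running
 * sooner.
 */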
1476static int
1477softdep_speedup(void)
1478{
1479
1480	worklist_speedup();
1481	bd_speedup();
1482	return (speedup_syncer());
1483}
1484
1485/*
1486 * Add an item to the end of the work queue.
1487 * This routine requires that the lock be held.
1488 * This is the only routine that adds items to the list.
1489 * The following routine is the only one that removes items
1490 * and does so in order from first to last.
1491 */
1492
1493#define	WK_HEAD		0x0001	/* Add to HEAD. */
1494#define	WK_NODELAY	0x0002	/* Process immediately. */
1495
1496static void
1497add_to_worklist(wk, flags)
1498	struct worklist *wk;
1499	int flags;
1500{
1501	struct ufsmount *ump;
1502
1503	mtx_assert(&lk, MA_OWNED);
1504	ump = VFSTOUFS(wk->wk_mp);
1505	if (wk->wk_state & ONWORKLIST)
1506		panic("add_to_worklist: %s(0x%X) already on list",
1507		    TYPENAME(wk->wk_type), wk->wk_state);
1508	wk->wk_state |= ONWORKLIST;
1509	if (ump->softdep_on_worklist == 0) {
1510		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1511		ump->softdep_worklist_tail = wk;
1512	} else if (flags & WK_HEAD) {
1513		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1514	} else {
1515		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1516		ump->softdep_worklist_tail = wk;
1517	}
1518	ump->softdep_on_worklist += 1;
1519	if (flags & WK_NODELAY)
1520		worklist_speedup();
1521}
1522
1523/*
1524 * Remove the item to be processed. If we are removing the last
1525 * item on the list, we need to recalculate the tail pointer.
1526 */
1527static void
1528remove_from_worklist(wk)
1529	struct worklist *wk;
1530{
1531	struct ufsmount *ump;
1532
1533	ump = VFSTOUFS(wk->wk_mp);
1534	WORKLIST_REMOVE(wk);
1535	if (ump->softdep_worklist_tail == wk)
1536		ump->softdep_worklist_tail =
1537		    (struct worklist *)wk->wk_list.le_prev;
1538	ump->softdep_on_worklist -= 1;
1539}
1540
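/*
 * Wake any thread sleeping in wait_worklist() on this work item.
 */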
1541static void
1542wake_worklist(wk)
1543	struct worklist *wk;
1544{
1545	if (wk->wk_state & IOWAITING) {
1546		wk->wk_state &= ~IOWAITING;
1547		wakeup(wk);
1548	}
1549}
1550
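/*
 * Sleep until the work item is awoken by wake_worklist().
 */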
1551static void
1552wait_worklist(wk, wmesg)
1553	struct worklist *wk;
1554	char *wmesg;
1555{
1556
1557	wk->wk_state |= IOWAITING;
1558	msleep(wk, &lk, PVM, wmesg, 0);
1559}
1560
1561/*
1562 * Process that runs once per second to handle items in the background queue.
1563 *
1564 * Note that we ensure that everything is done in the order in which they
1565 * appear in the queue. The code below depends on this property to ensure
1566 * that blocks of a file are freed before the inode itself is freed. This
1567 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1568 * until all the old ones have been purged from the dependency lists.
1569 */
1570int
1571softdep_process_worklist(mp, full)
1572	struct mount *mp;
1573	int full;
1574{
1575	struct thread *td = curthread;
1576	int cnt, matchcnt;
1577	struct ufsmount *ump;
1578	long starttime;
1579
1580	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1581	/*
1582	 * Record the process identifier of our caller so that we can give
1583	 * this process preferential treatment in request_cleanup below.
1584	 */
1585	matchcnt = 0;
1586	ump = VFSTOUFS(mp);
1587	ACQUIRE_LOCK(&lk);
1588	starttime = time_second;
1589	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1590	while (ump->softdep_on_worklist > 0) {
1591		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1592			break;
1593		else
1594			matchcnt += cnt;
1595		/*
1596		 * If requested, try removing inode or removal dependencies.
1597		 */
1598		if (req_clear_inodedeps) {
1599			clear_inodedeps(td);
1600			req_clear_inodedeps -= 1;
1601			wakeup_one(&proc_waiting);
1602		}
1603		if (req_clear_remove) {
1604			clear_remove(td);
1605			req_clear_remove -= 1;
1606			wakeup_one(&proc_waiting);
1607		}
1608		/*
1609		 * We do not generally want to stop for buffer space, but if
1610		 * we are really being a buffer hog, we will stop and wait.
1611		 */
1612		if (should_yield()) {
1613			FREE_LOCK(&lk);
1614			kern_yield(PRI_UNCHANGED);
1615			bwillwrite();
1616			ACQUIRE_LOCK(&lk);
1617		}
1618		/*
1619		 * Never allow processing to run for more than one
1620		 * second. Otherwise the other mountpoints may get
1621		 * excessively backlogged.
1622		 */
1623		if (!full && starttime != time_second)
1624			break;
1625	}
1626	if (full == 0)
1627		journal_unsuspend(ump);
1628	FREE_LOCK(&lk);
1629	return (matchcnt);
1630}
1631
1632/*
1633 * Process all removes associated with a vnode if we are running out of
1634 * journal space.  Any other process which attempts to flush these will
1635 * be unable as we have the vnodes locked.
1636 */
1637static void
1638process_removes(vp)
1639	struct vnode *vp;
1640{
1641	struct inodedep *inodedep;
1642	struct dirrem *dirrem;
1643	struct mount *mp;
1644	ino_t inum;
1645
1646	mtx_assert(&lk, MA_OWNED);
1647
1648	mp = vp->v_mount;
1649	inum = VTOI(vp)->i_number;
1650	for (;;) {
1651top:
1652		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1653			return;
1654		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1655			/*
1656			 * If another thread is trying to lock this vnode
1657			 * it will fail but we must wait for it to do so
1658			 * before we can proceed.
1659			 */
1660			if (dirrem->dm_state & INPROGRESS) {
1661				wait_worklist(&dirrem->dm_list, "pwrwait");
1662				goto top;
1663			}
1664			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1665			    (COMPLETE | ONWORKLIST))
1666				break;
1667		}
1668		if (dirrem == NULL)
1669			return;
1670		remove_from_worklist(&dirrem->dm_list);
1671		FREE_LOCK(&lk);
1672		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1673			panic("process_removes: suspended filesystem");
1674		handle_workitem_remove(dirrem, 0);
1675		vn_finished_secondary_write(mp);
1676		ACQUIRE_LOCK(&lk);
1677	}
1678}
1679
1680/*
1681 * Process all truncations associated with a vnode if we are running out
1682 * of journal space.  This is called when the vnode lock is already held
1683 * and no other process can clear the truncation.
1685 */
1686static void
1687process_truncates(vp)
1688	struct vnode *vp;
1689{
1690	struct inodedep *inodedep;
1691	struct freeblks *freeblks;
1692	struct mount *mp;
1693	ino_t inum;
1694	int cgwait;
1695
1696	mtx_assert(&lk, MA_OWNED);
1697
1698	mp = vp->v_mount;
1699	inum = VTOI(vp)->i_number;
1700	for (;;) {
1701		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1702			return;
1703		cgwait = 0;
1704		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1705			/* Journal entries not yet written.  */
1706			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1707				jwait(&LIST_FIRST(
1708				    &freeblks->fb_jblkdephd)->jb_list,
1709				    MNT_WAIT);
1710				break;
1711			}
1712			/* Another thread is executing this item. */
1713			if (freeblks->fb_state & INPROGRESS) {
1714				wait_worklist(&freeblks->fb_list, "ptrwait");
1715				break;
1716			}
1717			/* Freeblks is waiting on an inode write. */
1718			if ((freeblks->fb_state & COMPLETE) == 0) {
1719				FREE_LOCK(&lk);
1720				ffs_update(vp, 1);
1721				ACQUIRE_LOCK(&lk);
1722				break;
1723			}
1724			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1725			    (ALLCOMPLETE | ONWORKLIST)) {
1726				remove_from_worklist(&freeblks->fb_list);
1727				freeblks->fb_state |= INPROGRESS;
1728				FREE_LOCK(&lk);
1729				if (vn_start_secondary_write(NULL, &mp,
1730				    V_NOWAIT))
1731					panic("process_truncates: "
1732					    "suspended filesystem");
1733				handle_workitem_freeblocks(freeblks, 0);
1734				vn_finished_secondary_write(mp);
1735				ACQUIRE_LOCK(&lk);
1736				break;
1737			}
1738			if (freeblks->fb_cgwait)
1739				cgwait++;
1740		}
1741		if (cgwait) {
1742			FREE_LOCK(&lk);
1743			sync_cgs(mp, MNT_WAIT);
1744			ffs_sync_snap(mp, MNT_WAIT);
1745			ACQUIRE_LOCK(&lk);
1746			continue;
1747		}
1748		if (freeblks == NULL)
1749			break;
1750	}
1751	return;
1752}
1753
1754/*
1755 * Process one item on the worklist.
1756 */
1757static int
1758process_worklist_item(mp, target, flags)
1759	struct mount *mp;
1760	int target;
1761	int flags;
1762{
1763	struct worklist sentinel;
1764	struct worklist *wk;
1765	struct ufsmount *ump;
1766	int matchcnt;
1767	int error;
1768
1769	mtx_assert(&lk, MA_OWNED);
1770	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1771	/*
1772	 * If we are being called because of a process doing a
1773	 * copy-on-write, then it is not safe to write as we may
1774	 * recurse into the copy-on-write routine.
1775	 */
1776	if (curthread->td_pflags & TDP_COWINPROGRESS)
1777		return (-1);
1778	PHOLD(curproc);	/* Don't let the stack go away. */
1779	ump = VFSTOUFS(mp);
1780	matchcnt = 0;
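	/*
	 * Insert a sentinel to mark our place in the worklist while items
	 * are processed with the lock dropped; sentinels belonging to
	 * other threads are stepped over below.
	 */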
1781	sentinel.wk_mp = NULL;
1782	sentinel.wk_type = D_SENTINEL;
1783	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1784	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1785	    wk = LIST_NEXT(&sentinel, wk_list)) {
1786		if (wk->wk_type == D_SENTINEL) {
1787			LIST_REMOVE(&sentinel, wk_list);
1788			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1789			continue;
1790		}
1791		if (wk->wk_state & INPROGRESS)
1792			panic("process_worklist_item: %p already in progress.",
1793			    wk);
1794		wk->wk_state |= INPROGRESS;
1795		remove_from_worklist(wk);
1796		FREE_LOCK(&lk);
1797		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1798			panic("process_worklist_item: suspended filesystem");
1799		switch (wk->wk_type) {
1800		case D_DIRREM:
1801			/* removal of a directory entry */
1802			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1803			break;
1804
1805		case D_FREEBLKS:
1806			/* releasing blocks and/or fragments from a file */
1807			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1808			    flags);
1809			break;
1810
1811		case D_FREEFRAG:
1812			/* releasing a fragment when replaced as a file grows */
1813			handle_workitem_freefrag(WK_FREEFRAG(wk));
1814			error = 0;
1815			break;
1816
1817		case D_FREEFILE:
1818			/* releasing an inode when its link count drops to 0 */
1819			handle_workitem_freefile(WK_FREEFILE(wk));
1820			error = 0;
1821			break;
1822
1823		default:
1824			panic("%s_process_worklist: Unknown type %s",
1825			    "softdep", TYPENAME(wk->wk_type));
1826			/* NOTREACHED */
1827		}
1828		vn_finished_secondary_write(mp);
1829		ACQUIRE_LOCK(&lk);
1830		if (error == 0) {
1831			if (++matchcnt == target)
1832				break;
1833			continue;
1834		}
1835		/*
1836		 * We have to retry the worklist item later.  Wake up any
1837		 * waiters who may be able to complete it immediately and
1838		 * add the item back to the head so we don't try to execute
1839		 * it again.
1840		 */
1841		wk->wk_state &= ~INPROGRESS;
1842		wake_worklist(wk);
1843		add_to_worklist(wk, WK_HEAD);
1844	}
1845	LIST_REMOVE(&sentinel, wk_list);
	/* Sentinel could have become the tail from remove_from_worklist. */
1847	if (ump->softdep_worklist_tail == &sentinel)
1848		ump->softdep_worklist_tail =
1849		    (struct worklist *)sentinel.wk_list.le_prev;
1850	PRELE(curproc);
1851	return (matchcnt);
1852}
1853
1854/*
1855 * Move dependencies from one buffer to another.
1856 */
1857int
1858softdep_move_dependencies(oldbp, newbp)
1859	struct buf *oldbp;
1860	struct buf *newbp;
1861{
1862	struct worklist *wk, *wktail;
1863	int dirty;
1864
1865	dirty = 0;
1866	wktail = NULL;
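	/*
	 * Walk the old buffer's dependency list, appending each item to the
	 * new buffer so that the original ordering is preserved.
	 */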
1867	ACQUIRE_LOCK(&lk);
1868	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1869		LIST_REMOVE(wk, wk_list);
1870		if (wk->wk_type == D_BMSAFEMAP &&
1871		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1872			dirty = 1;
		if (wktail == NULL)
1874			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1875		else
1876			LIST_INSERT_AFTER(wktail, wk, wk_list);
1877		wktail = wk;
1878	}
1879	FREE_LOCK(&lk);
1880
1881	return (dirty);
1882}
1883
1884/*
1885 * Purge the work list of all items associated with a particular mount point.
1886 */
1887int
1888softdep_flushworklist(oldmnt, countp, td)
1889	struct mount *oldmnt;
1890	int *countp;
1891	struct thread *td;
1892{
1893	struct vnode *devvp;
1894	int count, error = 0;
1895	struct ufsmount *ump;
1896
1897	/*
1898	 * Alternately flush the block device associated with the mount
1899	 * point and process any dependencies that the flushing
1900	 * creates. We continue until no more worklist dependencies
1901	 * are found.
1902	 */
1903	*countp = 0;
1904	ump = VFSTOUFS(oldmnt);
1905	devvp = ump->um_devvp;
1906	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1907		*countp += count;
1908		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1909		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1910		VOP_UNLOCK(devvp, 0);
1911		if (error)
1912			break;
1913	}
1914	return (error);
1915}
1916
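/*
 * Wait for pending soft dependency work on a mount point to drain.  The
 * worklist is expected to be empty already; we sleep briefly, up to ten
 * times, for in-flight dependencies to complete and return EBUSY if any
 * remain.
 */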
1917int
1918softdep_waitidle(struct mount *mp)
1919{
1920	struct ufsmount *ump;
1921	int error;
1922	int i;
1923
1924	ump = VFSTOUFS(mp);
1925	ACQUIRE_LOCK(&lk);
1926	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1927		ump->softdep_req = 1;
1928		if (ump->softdep_on_worklist)
1929			panic("softdep_waitidle: work added after flush.");
1930		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1931	}
1932	ump->softdep_req = 0;
1933	FREE_LOCK(&lk);
1934	error = 0;
1935	if (i == 10) {
1936		error = EBUSY;
1937		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1938		    mp);
1939	}
1940
1941	return (error);
1942}
1943
1944/*
1945 * Flush all vnodes and worklist items associated with a specified mount point.
1946 */
1947int
1948softdep_flushfiles(oldmnt, flags, td)
1949	struct mount *oldmnt;
1950	int flags;
1951	struct thread *td;
1952{
1953#ifdef QUOTA
1954	struct ufsmount *ump;
1955	int i;
1956#endif
1957	int error, early, depcount, loopcnt, retry_flush_count, retry;
1958	int morework;
1959
1960	loopcnt = 10;
1961	retry_flush_count = 3;
1962retry_flush:
1963	error = 0;
1964
1965	/*
1966	 * Alternately flush the vnodes associated with the mount
1967	 * point and process any dependencies that the flushing
1968	 * creates. In theory, this loop can happen at most twice,
1969	 * but we give it a few extra just to be sure.
1970	 */
1971	for (; loopcnt > 0; loopcnt--) {
1972		/*
1973		 * Do another flush in case any vnodes were brought in
1974		 * as part of the cleanup operations.
1975		 */
1976		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1977		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1978		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1979			break;
1980		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1981		    depcount == 0)
1982			break;
1983	}
1984	/*
1985	 * If we are unmounting then it is an error to fail. If we
1986	 * are simply trying to downgrade to read-only, then filesystem
1987	 * activity can keep us busy forever, so we just fail with EBUSY.
1988	 */
1989	if (loopcnt == 0) {
1990		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1991			panic("softdep_flushfiles: looping");
1992		error = EBUSY;
1993	}
1994	if (!error)
1995		error = softdep_waitidle(oldmnt);
1996	if (!error) {
1997		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1998			retry = 0;
1999			MNT_ILOCK(oldmnt);
2000			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
2001			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
2002			morework = oldmnt->mnt_nvnodelistsize > 0;
2003#ifdef QUOTA
2004			ump = VFSTOUFS(oldmnt);
2005			UFS_LOCK(ump);
2006			for (i = 0; i < MAXQUOTAS; i++) {
2007				if (ump->um_quotas[i] != NULLVP)
2008					morework = 1;
2009			}
2010			UFS_UNLOCK(ump);
2011#endif
2012			if (morework) {
2013				if (--retry_flush_count > 0) {
2014					retry = 1;
2015					loopcnt = 3;
2016				} else
2017					error = EBUSY;
2018			}
2019			MNT_IUNLOCK(oldmnt);
2020			if (retry)
2021				goto retry_flush;
2022		}
2023	}
2024	return (error);
2025}
2026
2027/*
2028 * Structure hashing.
2029 *
2030 * There are three types of structures that can be looked up:
2031 *	1) pagedep structures identified by mount point, inode number,
2032 *	   and logical block.
2033 *	2) inodedep structures identified by mount point and inode number.
2034 *	3) newblk structures identified by mount point and
2035 *	   physical block number.
2036 *
2037 * The "pagedep" and "inodedep" dependency structures are hashed
2038 * separately from the file blocks and inodes to which they correspond.
2039 * This separation helps when the in-memory copy of an inode or
2040 * file block must be replaced. It also obviates the need to access
2041 * an inode or file page when simply updating (or de-allocating)
2042 * dependency structures. Lookup of newblk structures is needed to
2043 * find newly allocated blocks when trying to associate them with
2044 * their allocdirect or allocindir structure.
2045 *
2046 * The lookup routines optionally create and hash a new instance when
2047 * an existing entry is not found.
2048 */
2049#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2050#define NODELAY		0x0002	/* cannot do background work */
2051
2052/*
2053 * Structures and routines associated with pagedep caching.
2054 */
2055LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
2056u_long	pagedep_hash;		/* size of hash table - 1 */
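/*
 * The mount pointer is shifted right to discard its low, mostly constant
 * bits before being mixed with the inode number and logical block number.
 * The sum is masked by the table size minus one, so the table size must be
 * a power of two (as hashinit() provides).
 */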
2057#define	PAGEDEP_HASH(mp, inum, lbn) \
2058	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
2059	    pagedep_hash])
2060
2061static int
2062pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
2063	struct pagedep_hashhead *pagedephd;
2064	ino_t ino;
2065	ufs_lbn_t lbn;
2066	struct mount *mp;
2067	int flags;
2068	struct pagedep **pagedeppp;
2069{
2070	struct pagedep *pagedep;
2071
2072	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2073		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn &&
2074		    mp == pagedep->pd_list.wk_mp) {
2075			*pagedeppp = pagedep;
2076			return (1);
2077		}
2078	}
2079	*pagedeppp = NULL;
2080	return (0);
2081}
2082/*
2083 * Look up a pagedep. Return 1 if found, 0 otherwise.
2084 * If not found, allocate if DEPALLOC flag is passed.
2085 * Found or allocated entry is returned in pagedeppp.
2086 * This routine must be called with splbio interrupts blocked.
2087 */
2088static int
2089pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2090	struct mount *mp;
2091	struct buf *bp;
2092	ino_t ino;
2093	ufs_lbn_t lbn;
2094	int flags;
2095	struct pagedep **pagedeppp;
2096{
2097	struct pagedep *pagedep;
2098	struct pagedep_hashhead *pagedephd;
2099	struct worklist *wk;
2100	int ret;
2101	int i;
2102
2103	mtx_assert(&lk, MA_OWNED);
2104	if (bp) {
2105		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2106			if (wk->wk_type == D_PAGEDEP) {
2107				*pagedeppp = WK_PAGEDEP(wk);
2108				return (1);
2109			}
2110		}
2111	}
2112	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
2113	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2114	if (ret) {
2115		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2116			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2117		return (1);
2118	}
2119	if ((flags & DEPALLOC) == 0)
2120		return (0);
2121	FREE_LOCK(&lk);
2122	pagedep = malloc(sizeof(struct pagedep),
2123	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2124	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2125	ACQUIRE_LOCK(&lk);
2126	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2127	if (*pagedeppp) {
2128		/*
2129		 * This should never happen since we only create pagedeps
2130		 * with the vnode lock held.  Could be an assert.
2131		 */
2132		WORKITEM_FREE(pagedep, D_PAGEDEP);
2133		return (ret);
2134	}
2135	pagedep->pd_ino = ino;
2136	pagedep->pd_lbn = lbn;
2137	LIST_INIT(&pagedep->pd_dirremhd);
2138	LIST_INIT(&pagedep->pd_pendinghd);
2139	for (i = 0; i < DAHASHSZ; i++)
2140		LIST_INIT(&pagedep->pd_diraddhd[i]);
2141	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2142	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2143	*pagedeppp = pagedep;
2144	return (0);
2145}
2146
2147/*
2148 * Structures and routines associated with inodedep caching.
2149 */
2150LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
2151static u_long	inodedep_hash;	/* size of hash table - 1 */
2152#define	INODEDEP_HASH(fs, inum) \
2153      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
2154
2155static int
2156inodedep_find(inodedephd, fs, inum, inodedeppp)
2157	struct inodedep_hashhead *inodedephd;
2158	struct fs *fs;
2159	ino_t inum;
2160	struct inodedep **inodedeppp;
2161{
2162	struct inodedep *inodedep;
2163
2164	LIST_FOREACH(inodedep, inodedephd, id_hash)
2165		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
2166			break;
2167	if (inodedep) {
2168		*inodedeppp = inodedep;
2169		return (1);
2170	}
2171	*inodedeppp = NULL;
2172
2173	return (0);
2174}
2175/*
2176 * Look up an inodedep. Return 1 if found, 0 if not found.
2177 * If not found, allocate if DEPALLOC flag is passed.
2178 * Found or allocated entry is returned in inodedeppp.
2179 * This routine must be called with splbio interrupts blocked.
2180 */
2181static int
2182inodedep_lookup(mp, inum, flags, inodedeppp)
2183	struct mount *mp;
2184	ino_t inum;
2185	int flags;
2186	struct inodedep **inodedeppp;
2187{
2188	struct inodedep *inodedep;
2189	struct inodedep_hashhead *inodedephd;
2190	struct fs *fs;
2191
2192	mtx_assert(&lk, MA_OWNED);
2193	fs = VFSTOUFS(mp)->um_fs;
2194	inodedephd = INODEDEP_HASH(fs, inum);
2195
2196	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
2197		return (1);
2198	if ((flags & DEPALLOC) == 0)
2199		return (0);
2200	/*
2201	 * If we are over our limit, try to improve the situation.
2202	 */
2203	if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
2204		request_cleanup(mp, FLUSH_INODES);
2205	FREE_LOCK(&lk);
2206	inodedep = malloc(sizeof(struct inodedep),
2207		M_INODEDEP, M_SOFTDEP_FLAGS);
2208	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2209	ACQUIRE_LOCK(&lk);
2210	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
2211		WORKITEM_FREE(inodedep, D_INODEDEP);
2212		return (1);
2213	}
2214	inodedep->id_fs = fs;
2215	inodedep->id_ino = inum;
2216	inodedep->id_state = ALLCOMPLETE;
2217	inodedep->id_nlinkdelta = 0;
2218	inodedep->id_savedino1 = NULL;
2219	inodedep->id_savedsize = -1;
2220	inodedep->id_savedextsize = -1;
2221	inodedep->id_savednlink = -1;
2222	inodedep->id_bmsafemap = NULL;
2223	inodedep->id_mkdiradd = NULL;
2224	LIST_INIT(&inodedep->id_dirremhd);
2225	LIST_INIT(&inodedep->id_pendinghd);
2226	LIST_INIT(&inodedep->id_inowait);
2227	LIST_INIT(&inodedep->id_bufwait);
2228	TAILQ_INIT(&inodedep->id_inoreflst);
2229	TAILQ_INIT(&inodedep->id_inoupdt);
2230	TAILQ_INIT(&inodedep->id_newinoupdt);
2231	TAILQ_INIT(&inodedep->id_extupdt);
2232	TAILQ_INIT(&inodedep->id_newextupdt);
2233	TAILQ_INIT(&inodedep->id_freeblklst);
2234	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2235	*inodedeppp = inodedep;
2236	return (0);
2237}
2238
2239/*
2240 * Structures and routines associated with newblk caching.
2241 */
2242LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
2243u_long	newblk_hash;		/* size of hash table - 1 */
2244#define	NEWBLK_HASH(fs, inum) \
2245	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
2246
2247static int
2248newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
2249	struct newblk_hashhead *newblkhd;
2250	struct mount *mp;
2251	ufs2_daddr_t newblkno;
2252	int flags;
2253	struct newblk **newblkpp;
2254{
2255	struct newblk *newblk;
2256
2257	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2258		if (newblkno != newblk->nb_newblkno)
2259			continue;
2260		if (mp != newblk->nb_list.wk_mp)
2261			continue;
2262		/*
		 * If we're creating a new dependency, don't match those that
2264		 * have already been converted to allocdirects.  This is for
2265		 * a frag extend.
2266		 */
2267		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2268			continue;
2269		break;
2270	}
2271	if (newblk) {
2272		*newblkpp = newblk;
2273		return (1);
2274	}
2275	*newblkpp = NULL;
2276	return (0);
2277}
2278
2279/*
2280 * Look up a newblk. Return 1 if found, 0 if not found.
2281 * If not found, allocate if DEPALLOC flag is passed.
2282 * Found or allocated entry is returned in newblkpp.
2283 */
2284static int
2285newblk_lookup(mp, newblkno, flags, newblkpp)
2286	struct mount *mp;
2287	ufs2_daddr_t newblkno;
2288	int flags;
2289	struct newblk **newblkpp;
2290{
2291	struct newblk *newblk;
2292	struct newblk_hashhead *newblkhd;
2293
2294	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
2295	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
2296		return (1);
2297	if ((flags & DEPALLOC) == 0)
2298		return (0);
2299	FREE_LOCK(&lk);
2300	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2301	    M_SOFTDEP_FLAGS | M_ZERO);
2302	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2303	ACQUIRE_LOCK(&lk);
2304	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
2305		WORKITEM_FREE(newblk, D_NEWBLK);
2306		return (1);
2307	}
2308	newblk->nb_freefrag = NULL;
2309	LIST_INIT(&newblk->nb_indirdeps);
2310	LIST_INIT(&newblk->nb_newdirblk);
2311	LIST_INIT(&newblk->nb_jwork);
2312	newblk->nb_state = ATTACHED;
2313	newblk->nb_newblkno = newblkno;
2314	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2315	*newblkpp = newblk;
2316	return (0);
2317}
2318
2319/*
2320 * Structures and routines associated with freed indirect block caching.
2321 */
2322struct freeworklst *indir_hashtbl;
2323u_long	indir_hash;		/* size of hash table - 1 */
2324#define	INDIR_HASH(mp, blkno) \
2325	(&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])
2326
2327/*
2328 * Lookup an indirect block in the indir hash table.  The freework is
2329 * removed and potentially freed.  The caller must do a blocking journal
2330 * write before writing to the blkno.
2331 */
2332static int
2333indirblk_lookup(mp, blkno)
2334	struct mount *mp;
2335	ufs2_daddr_t blkno;
2336{
2337	struct freework *freework;
2338	struct freeworklst *wkhd;
2339
2340	wkhd = INDIR_HASH(mp, blkno);
2341	TAILQ_FOREACH(freework, wkhd, fw_next) {
2342		if (freework->fw_blkno != blkno)
2343			continue;
2344		if (freework->fw_list.wk_mp != mp)
2345			continue;
2346		indirblk_remove(freework);
2347		return (1);
2348	}
2349	return (0);
2350}
2351
2352/*
2353 * Insert an indirect block represented by freework into the indirblk
2354 * hash table so that it may prevent the block from being re-used prior
2355 * to the journal being written.
2356 */
2357static void
2358indirblk_insert(freework)
2359	struct freework *freework;
2360{
2361	struct jblocks *jblocks;
2362	struct jseg *jseg;
2363
2364	jblocks = VFSTOUFS(freework->fw_list.wk_mp)->softdep_jblocks;
2365	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2366	if (jseg == NULL)
2367		return;
2368
2369	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2370	TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp,
2371	    freework->fw_blkno), freework, fw_next);
2372	freework->fw_state &= ~DEPCOMPLETE;
2373}
2374
2375static void
2376indirblk_remove(freework)
2377	struct freework *freework;
2378{
2379
2380	LIST_REMOVE(freework, fw_segs);
2381	TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp,
2382	    freework->fw_blkno), freework, fw_next);
2383	freework->fw_state |= DEPCOMPLETE;
2384	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2385		WORKITEM_FREE(freework, D_FREEWORK);
2386}
2387
2388/*
 * Executed during filesystem initialization before
2390 * mounting any filesystems.
2391 */
2392void
2393softdep_initialize()
2394{
2395	int i;
2396
2397	LIST_INIT(&mkdirlisthd);
2398	max_softdeps = desiredvnodes * 4;
2399	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
2400	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
2401	newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK, &newblk_hash);
2402	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
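	/*
	 * The indir hash table is sized by hand rather than via hashinit(),
	 * so force the size to a power of two here to keep the INDIR_HASH
	 * mask valid.
	 */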
2403	i = 1 << (ffs(desiredvnodes / 10) - 1);
2404	indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK,
2405	    M_WAITOK);
2406	indir_hash = i - 1;
2407	for (i = 0; i <= indir_hash; i++)
2408		TAILQ_INIT(&indir_hashtbl[i]);
2409
	/* Initialize the bioops hack. */
2411	bioops.io_start = softdep_disk_io_initiation;
2412	bioops.io_complete = softdep_disk_write_complete;
2413	bioops.io_deallocate = softdep_deallocate_dependencies;
2414	bioops.io_countdeps = softdep_count_dependencies;
2415
2416	/* Initialize the callout with an mtx. */
2417	callout_init_mtx(&softdep_callout, &lk, 0);
2418}
2419
2420/*
2421 * Executed after all filesystems have been unmounted during
2422 * filesystem module unload.
2423 */
2424void
2425softdep_uninitialize()
2426{
2427
2428	callout_drain(&softdep_callout);
2429	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2430	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2431	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2432	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2433	free(indir_hashtbl, M_FREEWORK);
2434}
2435
2436/*
2437 * Called at mount time to notify the dependency code that a
2438 * filesystem wishes to use it.
2439 */
2440int
2441softdep_mount(devvp, mp, fs, cred)
2442	struct vnode *devvp;
2443	struct mount *mp;
2444	struct fs *fs;
2445	struct ucred *cred;
2446{
2447	struct csum_total cstotal;
2448	struct ufsmount *ump;
2449	struct cg *cgp;
2450	struct buf *bp;
2451	int error, cyl;
2452
2453	MNT_ILOCK(mp);
2454	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2455	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2456		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2457			MNTK_SOFTDEP | MNTK_NOASYNC;
2458	}
2459	MNT_IUNLOCK(mp);
2460	ump = VFSTOUFS(mp);
2461	LIST_INIT(&ump->softdep_workitem_pending);
2462	LIST_INIT(&ump->softdep_journal_pending);
2463	TAILQ_INIT(&ump->softdep_unlinked);
2464	LIST_INIT(&ump->softdep_dirtycg);
2465	ump->softdep_worklist_tail = NULL;
2466	ump->softdep_on_worklist = 0;
2467	ump->softdep_deps = 0;
2468	if ((fs->fs_flags & FS_SUJ) &&
2469	    (error = journal_mount(mp, fs, cred)) != 0) {
2470		printf("Failed to start journal: %d\n", error);
2471		return (error);
2472	}
2473	/*
2474	 * When doing soft updates, the counters in the
2475	 * superblock may have gotten out of sync. Recomputation
2476	 * can take a long time and can be deferred for background
2477	 * fsck.  However, the old behavior of scanning the cylinder
2478	 * groups and recalculating them at mount time is available
2479	 * by setting vfs.ffs.compute_summary_at_mount to one.
2480	 */
2481	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2482		return (0);
2483	bzero(&cstotal, sizeof cstotal);
2484	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2485		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2486		    fs->fs_cgsize, cred, &bp)) != 0) {
2487			brelse(bp);
2488			return (error);
2489		}
2490		cgp = (struct cg *)bp->b_data;
2491		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2492		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2493		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2494		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2495		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2496		brelse(bp);
2497	}
2498#ifdef DEBUG
2499	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2500		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2501#endif
2502	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2503	return (0);
2504}
2505
2506void
2507softdep_unmount(mp)
2508	struct mount *mp;
2509{
2510
2511	MNT_ILOCK(mp);
2512	mp->mnt_flag &= ~MNT_SOFTDEP;
2513	if (MOUNTEDSUJ(mp) == 0) {
2514		MNT_IUNLOCK(mp);
2515		return;
2516	}
2517	mp->mnt_flag &= ~MNT_SUJ;
2518	MNT_IUNLOCK(mp);
2519	journal_unmount(mp);
2520}
2521
2522static struct jblocks *
2523jblocks_create(void)
2524{
2525	struct jblocks *jblocks;
2526
2527	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2528	TAILQ_INIT(&jblocks->jb_segs);
2529	jblocks->jb_avail = 10;
2530	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2531	    M_JBLOCKS, M_WAITOK | M_ZERO);
2532
2533	return (jblocks);
2534}
2535
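/*
 * Allocate journal space from the extent at the head position, advancing
 * to the next extent when the current one is exhausted.  The space
 * actually granted is returned in *actual and may be smaller than the
 * request when the extent lacks enough contiguous room.
 */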
2536static ufs2_daddr_t
2537jblocks_alloc(jblocks, bytes, actual)
2538	struct jblocks *jblocks;
2539	int bytes;
2540	int *actual;
2541{
2542	ufs2_daddr_t daddr;
2543	struct jextent *jext;
2544	int freecnt;
2545	int blocks;
2546
2547	blocks = bytes / DEV_BSIZE;
2548	jext = &jblocks->jb_extent[jblocks->jb_head];
2549	freecnt = jext->je_blocks - jblocks->jb_off;
2550	if (freecnt == 0) {
2551		jblocks->jb_off = 0;
2552		if (++jblocks->jb_head > jblocks->jb_used)
2553			jblocks->jb_head = 0;
2554		jext = &jblocks->jb_extent[jblocks->jb_head];
2555		freecnt = jext->je_blocks;
2556	}
2557	if (freecnt > blocks)
2558		freecnt = blocks;
2559	*actual = freecnt * DEV_BSIZE;
2560	daddr = jext->je_daddr + jblocks->jb_off;
2561	jblocks->jb_off += freecnt;
2562	jblocks->jb_free -= freecnt;
2563
2564	return (daddr);
2565}
2566
2567static void
2568jblocks_free(jblocks, mp, bytes)
2569	struct jblocks *jblocks;
2570	struct mount *mp;
2571	int bytes;
2572{
2573
2574	jblocks->jb_free += bytes / DEV_BSIZE;
2575	if (jblocks->jb_suspended)
2576		worklist_speedup();
2577	wakeup(jblocks);
2578}
2579
2580static void
2581jblocks_destroy(jblocks)
2582	struct jblocks *jblocks;
2583{
2584
2585	if (jblocks->jb_extent)
2586		free(jblocks->jb_extent, M_JBLOCKS);
2587	free(jblocks, M_JBLOCKS);
2588}
2589
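/*
 * Record a run of journal blocks.  The run is merged into the last extent
 * when it is physically contiguous; otherwise a new extent is started,
 * doubling the extent array if it is full.
 */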
2590static void
2591jblocks_add(jblocks, daddr, blocks)
2592	struct jblocks *jblocks;
2593	ufs2_daddr_t daddr;
2594	int blocks;
2595{
2596	struct jextent *jext;
2597
2598	jblocks->jb_blocks += blocks;
2599	jblocks->jb_free += blocks;
2600	jext = &jblocks->jb_extent[jblocks->jb_used];
2601	/* Adding the first block. */
2602	if (jext->je_daddr == 0) {
2603		jext->je_daddr = daddr;
2604		jext->je_blocks = blocks;
2605		return;
2606	}
2607	/* Extending the last extent. */
2608	if (jext->je_daddr + jext->je_blocks == daddr) {
2609		jext->je_blocks += blocks;
2610		return;
2611	}
2612	/* Adding a new extent. */
2613	if (++jblocks->jb_used == jblocks->jb_avail) {
2614		jblocks->jb_avail *= 2;
2615		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2616		    M_JBLOCKS, M_WAITOK | M_ZERO);
2617		memcpy(jext, jblocks->jb_extent,
2618		    sizeof(struct jextent) * jblocks->jb_used);
2619		free(jblocks->jb_extent, M_JBLOCKS);
2620		jblocks->jb_extent = jext;
2621	}
2622	jext = &jblocks->jb_extent[jblocks->jb_used];
2623	jext->je_daddr = daddr;
2624	jext->je_blocks = blocks;
2625	return;
2626}
2627
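/*
 * Resolve the journal file (SUJ_FILE) by looking its name up in the root
 * directory of the filesystem and return its vnode, locked, in *vpp.
 */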
2628int
2629softdep_journal_lookup(mp, vpp)
2630	struct mount *mp;
2631	struct vnode **vpp;
2632{
2633	struct componentname cnp;
2634	struct vnode *dvp;
2635	ino_t sujournal;
2636	int error;
2637
2638	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2639	if (error)
2640		return (error);
2641	bzero(&cnp, sizeof(cnp));
2642	cnp.cn_nameiop = LOOKUP;
2643	cnp.cn_flags = ISLASTCN;
2644	cnp.cn_thread = curthread;
2645	cnp.cn_cred = curthread->td_ucred;
2646	cnp.cn_pnbuf = SUJ_FILE;
2647	cnp.cn_nameptr = SUJ_FILE;
2648	cnp.cn_namelen = strlen(SUJ_FILE);
2649	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2650	vput(dvp);
2651	if (error != 0)
2652		return (error);
2653	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2654	return (error);
2655}
2656
2657/*
2658 * Open and verify the journal file.
2659 */
2660static int
2661journal_mount(mp, fs, cred)
2662	struct mount *mp;
2663	struct fs *fs;
2664	struct ucred *cred;
2665{
2666	struct jblocks *jblocks;
2667	struct vnode *vp;
2668	struct inode *ip;
2669	ufs2_daddr_t blkno;
2670	int bcount;
2671	int error;
2672	int i;
2673
2674	error = softdep_journal_lookup(mp, &vp);
2675	if (error != 0) {
2676		printf("Failed to find journal.  Use tunefs to create one\n");
2677		return (error);
2678	}
2679	ip = VTOI(vp);
2680	if (ip->i_size < SUJ_MIN) {
2681		error = ENOSPC;
2682		goto out;
2683	}
2684	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2685	jblocks = jblocks_create();
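	/*
	 * Record the physical location of each of the journal file's
	 * logical blocks so journal writes can later go straight to the
	 * device.
	 */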
2686	for (i = 0; i < bcount; i++) {
2687		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2688		if (error)
2689			break;
2690		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2691	}
2692	if (error) {
2693		jblocks_destroy(jblocks);
2694		goto out;
2695	}
2696	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2697	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2698	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2699out:
2700	if (error == 0) {
2701		MNT_ILOCK(mp);
2702		mp->mnt_flag |= MNT_SUJ;
2703		mp->mnt_flag &= ~MNT_SOFTDEP;
2704		MNT_IUNLOCK(mp);
2705		/*
2706		 * Only validate the journal contents if the
2707		 * filesystem is clean, otherwise we write the logs
2708		 * but they'll never be used.  If the filesystem was
		 * still dirty when we mounted it, the journal is
2710		 * invalid and a new journal can only be valid if it
2711		 * starts from a clean mount.
2712		 */
2713		if (fs->fs_clean) {
2714			DIP_SET(ip, i_modrev, fs->fs_mtime);
2715			ip->i_flags |= IN_MODIFIED;
2716			ffs_update(vp, 1);
2717		}
2718	}
2719	vput(vp);
2720	return (error);
2721}
2722
2723static void
2724journal_unmount(mp)
2725	struct mount *mp;
2726{
2727	struct ufsmount *ump;
2728
2729	ump = VFSTOUFS(mp);
2730	if (ump->softdep_jblocks)
2731		jblocks_destroy(ump->softdep_jblocks);
2732	ump->softdep_jblocks = NULL;
2733}
2734
2735/*
2736 * Called when a journal record is ready to be written.  Space is allocated
2737 * and the journal entry is created when the journal is flushed to stable
2738 * store.
2739 */
2740static void
2741add_to_journal(wk)
2742	struct worklist *wk;
2743{
2744	struct ufsmount *ump;
2745
2746	mtx_assert(&lk, MA_OWNED);
2747	ump = VFSTOUFS(wk->wk_mp);
2748	if (wk->wk_state & ONWORKLIST)
2749		panic("add_to_journal: %s(0x%X) already on list",
2750		    TYPENAME(wk->wk_type), wk->wk_state);
2751	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2752	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2753		ump->softdep_jblocks->jb_age = ticks;
2754		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2755	} else
2756		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2757	ump->softdep_journal_tail = wk;
2758	ump->softdep_on_journal += 1;
2759}
2760
2761/*
 * Remove an arbitrary item from the journal worklist, maintaining the tail
2763 * pointer.  This happens when a new operation obviates the need to
2764 * journal an old operation.
2765 */
2766static void
2767remove_from_journal(wk)
2768	struct worklist *wk;
2769{
2770	struct ufsmount *ump;
2771
2772	mtx_assert(&lk, MA_OWNED);
2773	ump = VFSTOUFS(wk->wk_mp);
2774#ifdef SUJ_DEBUG
2775	{
2776		struct worklist *wkn;
2777
2778		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2779			if (wkn == wk)
2780				break;
2781		if (wkn == NULL)
2782			panic("remove_from_journal: %p is not in journal", wk);
2783	}
2784#endif
2785	/*
2786	 * We emulate a TAILQ to save space in most structures which do not
	 * require TAILQ semantics.  Here we must update the tail position when
	 * removing the entry that is currently the tail.  This works only if
	 * the worklist linkage is at the beginning of the structure.
2790	 */
2791	if (ump->softdep_journal_tail == wk)
2792		ump->softdep_journal_tail =
2793		    (struct worklist *)wk->wk_list.le_prev;
2794
2795	WORKLIST_REMOVE(wk);
2796	ump->softdep_on_journal -= 1;
2797}
2798
2799/*
2800 * Check for journal space as well as dependency limits so the prelink
2801 * code can throttle both journaled and non-journaled filesystems.
2802 * Threshold is 0 for low and 1 for min.
2803 */
2804static int
2805journal_space(ump, thresh)
2806	struct ufsmount *ump;
2807	int thresh;
2808{
2809	struct jblocks *jblocks;
2810	int avail;
2811
2812	jblocks = ump->softdep_jblocks;
2813	if (jblocks == NULL)
2814		return (1);
2815	/*
2816	 * We use a tighter restriction here to prevent request_cleanup()
	 * in other threads from running into locks we currently hold.
2818	 */
2819	if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9)
2820		return (0);
2821	if (thresh)
2822		thresh = jblocks->jb_min;
2823	else
2824		thresh = jblocks->jb_low;
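	/*
	 * Charge the journal records still waiting to be written, converted
	 * to device blocks, against the free count before comparing with
	 * the threshold.
	 */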
2825	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2826	avail = jblocks->jb_free - avail;
2827
2828	return (avail > thresh);
2829}
2830
2831static void
2832journal_suspend(ump)
2833	struct ufsmount *ump;
2834{
2835	struct jblocks *jblocks;
2836	struct mount *mp;
2837
2838	mp = UFSTOVFS(ump);
2839	jblocks = ump->softdep_jblocks;
2840	MNT_ILOCK(mp);
2841	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2842		stat_journal_min++;
2843		mp->mnt_kern_flag |= MNTK_SUSPEND;
2844		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2845	}
2846	jblocks->jb_suspended = 1;
2847	MNT_IUNLOCK(mp);
2848}
2849
2850static int
2851journal_unsuspend(struct ufsmount *ump)
2852{
2853	struct jblocks *jblocks;
2854	struct mount *mp;
2855
2856	mp = UFSTOVFS(ump);
2857	jblocks = ump->softdep_jblocks;
2858
2859	if (jblocks != NULL && jblocks->jb_suspended &&
2860	    journal_space(ump, jblocks->jb_min)) {
2861		jblocks->jb_suspended = 0;
2862		FREE_LOCK(&lk);
2863		mp->mnt_susp_owner = curthread;
2864		vfs_write_resume(mp);
2865		ACQUIRE_LOCK(&lk);
2866		return (1);
2867	}
2868	return (0);
2869}
2870
2871/*
2872 * Called before any allocation function to be certain that there is
2873 * sufficient space in the journal prior to creating any new records.
 * Since, in the case of block allocation, we may have multiple locked
 * buffers at the time of the actual allocation, we can not block when
 * the journal records are created.  Doing so would create a deadlock
2877 * if any of these buffers needed to be flushed to reclaim space.  Instead
2878 * we require a sufficiently large amount of available space such that
2879 * each thread in the system could have passed this allocation check and
2880 * still have sufficient free space.  With 20% of a minimum journal size
2881 * of 1MB we have 6553 records available.
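 * (The arithmetic assumes the 32-byte journal record size: 20% of 1MB is
 * 209715 bytes, or roughly 6553 records.)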
2882 */
2883int
2884softdep_prealloc(vp, waitok)
2885	struct vnode *vp;
2886	int waitok;
2887{
2888	struct ufsmount *ump;
2889
2890	/*
2891	 * Nothing to do if we are not running journaled soft updates.
2892	 * If we currently hold the snapshot lock, we must avoid handling
2893	 * other resources that could cause deadlock.
2894	 */
2895	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2896		return (0);
2897	ump = VFSTOUFS(vp->v_mount);
2898	ACQUIRE_LOCK(&lk);
2899	if (journal_space(ump, 0)) {
2900		FREE_LOCK(&lk);
2901		return (0);
2902	}
2903	stat_journal_low++;
2904	FREE_LOCK(&lk);
2905	if (waitok == MNT_NOWAIT)
2906		return (ENOSPC);
2907	/*
2908	 * Attempt to sync this vnode once to flush any journal
2909	 * work attached to it.
2910	 */
2911	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2912		ffs_syncvnode(vp, waitok, 0);
2913	ACQUIRE_LOCK(&lk);
2914	process_removes(vp);
2915	process_truncates(vp);
2916	if (journal_space(ump, 0) == 0) {
2917		softdep_speedup();
2918		if (journal_space(ump, 1) == 0)
2919			journal_suspend(ump);
2920	}
2921	FREE_LOCK(&lk);
2922
2923	return (0);
2924}
2925
2926/*
2927 * Before adjusting a link count on a vnode verify that we have sufficient
2928 * journal space.  If not, process operations that depend on the currently
2929 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2930 * and softdep flush threads can not acquire these locks to reclaim space.
2931 */
2932static void
2933softdep_prelink(dvp, vp)
2934	struct vnode *dvp;
2935	struct vnode *vp;
2936{
2937	struct ufsmount *ump;
2938
2939	ump = VFSTOUFS(dvp->v_mount);
2940	mtx_assert(&lk, MA_OWNED);
2941	/*
2942	 * Nothing to do if we have sufficient journal space.
2943	 * If we currently hold the snapshot lock, we must avoid
2944	 * handling other resources that could cause deadlock.
2945	 */
2946	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
2947		return;
2948	stat_journal_low++;
2949	FREE_LOCK(&lk);
2950	if (vp)
2951		ffs_syncvnode(vp, MNT_NOWAIT, 0);
2952	ffs_syncvnode(dvp, MNT_WAIT, 0);
2953	ACQUIRE_LOCK(&lk);
2954	/* Process vp before dvp as it may create .. removes. */
2955	if (vp) {
2956		process_removes(vp);
2957		process_truncates(vp);
2958	}
2959	process_removes(dvp);
2960	process_truncates(dvp);
2961	softdep_speedup();
2962	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
2963	if (journal_space(ump, 0) == 0) {
2964		softdep_speedup();
2965		if (journal_space(ump, 1) == 0)
2966			journal_suspend(ump);
2967	}
2968}
2969
2970static void
2971jseg_write(ump, jseg, data)
2972	struct ufsmount *ump;
2973	struct jseg *jseg;
2974	uint8_t *data;
2975{
2976	struct jsegrec *rec;
2977
2978	rec = (struct jsegrec *)data;
2979	rec->jsr_seq = jseg->js_seq;
2980	rec->jsr_oldest = jseg->js_oldseq;
2981	rec->jsr_cnt = jseg->js_cnt;
2982	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2983	rec->jsr_crc = 0;
2984	rec->jsr_time = ump->um_fs->fs_mtime;
2985}
2986
2987static inline void
2988inoref_write(inoref, jseg, rec)
2989	struct inoref *inoref;
2990	struct jseg *jseg;
2991	struct jrefrec *rec;
2992{
2993
2994	inoref->if_jsegdep->jd_seg = jseg;
2995	rec->jr_ino = inoref->if_ino;
2996	rec->jr_parent = inoref->if_parent;
2997	rec->jr_nlink = inoref->if_nlink;
2998	rec->jr_mode = inoref->if_mode;
2999	rec->jr_diroff = inoref->if_diroff;
3000}
3001
3002static void
3003jaddref_write(jaddref, jseg, data)
3004	struct jaddref *jaddref;
3005	struct jseg *jseg;
3006	uint8_t *data;
3007{
3008	struct jrefrec *rec;
3009
3010	rec = (struct jrefrec *)data;
3011	rec->jr_op = JOP_ADDREF;
3012	inoref_write(&jaddref->ja_ref, jseg, rec);
3013}
3014
3015static void
3016jremref_write(jremref, jseg, data)
3017	struct jremref *jremref;
3018	struct jseg *jseg;
3019	uint8_t *data;
3020{
3021	struct jrefrec *rec;
3022
3023	rec = (struct jrefrec *)data;
3024	rec->jr_op = JOP_REMREF;
3025	inoref_write(&jremref->jr_ref, jseg, rec);
3026}
3027
3028static void
3029jmvref_write(jmvref, jseg, data)
3030	struct jmvref *jmvref;
3031	struct jseg *jseg;
3032	uint8_t *data;
3033{
3034	struct jmvrec *rec;
3035
3036	rec = (struct jmvrec *)data;
3037	rec->jm_op = JOP_MVREF;
3038	rec->jm_ino = jmvref->jm_ino;
3039	rec->jm_parent = jmvref->jm_parent;
3040	rec->jm_oldoff = jmvref->jm_oldoff;
3041	rec->jm_newoff = jmvref->jm_newoff;
3042}
3043
3044static void
3045jnewblk_write(jnewblk, jseg, data)
3046	struct jnewblk *jnewblk;
3047	struct jseg *jseg;
3048	uint8_t *data;
3049{
3050	struct jblkrec *rec;
3051
3052	jnewblk->jn_jsegdep->jd_seg = jseg;
3053	rec = (struct jblkrec *)data;
3054	rec->jb_op = JOP_NEWBLK;
3055	rec->jb_ino = jnewblk->jn_ino;
3056	rec->jb_blkno = jnewblk->jn_blkno;
3057	rec->jb_lbn = jnewblk->jn_lbn;
3058	rec->jb_frags = jnewblk->jn_frags;
3059	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3060}
3061
3062static void
3063jfreeblk_write(jfreeblk, jseg, data)
3064	struct jfreeblk *jfreeblk;
3065	struct jseg *jseg;
3066	uint8_t *data;
3067{
3068	struct jblkrec *rec;
3069
3070	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3071	rec = (struct jblkrec *)data;
3072	rec->jb_op = JOP_FREEBLK;
3073	rec->jb_ino = jfreeblk->jf_ino;
3074	rec->jb_blkno = jfreeblk->jf_blkno;
3075	rec->jb_lbn = jfreeblk->jf_lbn;
3076	rec->jb_frags = jfreeblk->jf_frags;
3077	rec->jb_oldfrags = 0;
3078}
3079
3080static void
3081jfreefrag_write(jfreefrag, jseg, data)
3082	struct jfreefrag *jfreefrag;
3083	struct jseg *jseg;
3084	uint8_t *data;
3085{
3086	struct jblkrec *rec;
3087
3088	jfreefrag->fr_jsegdep->jd_seg = jseg;
3089	rec = (struct jblkrec *)data;
3090	rec->jb_op = JOP_FREEBLK;
3091	rec->jb_ino = jfreefrag->fr_ino;
3092	rec->jb_blkno = jfreefrag->fr_blkno;
3093	rec->jb_lbn = jfreefrag->fr_lbn;
3094	rec->jb_frags = jfreefrag->fr_frags;
3095	rec->jb_oldfrags = 0;
3096}
3097
3098static void
3099jtrunc_write(jtrunc, jseg, data)
3100	struct jtrunc *jtrunc;
3101	struct jseg *jseg;
3102	uint8_t *data;
3103{
3104	struct jtrncrec *rec;
3105
3106	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3107	rec = (struct jtrncrec *)data;
3108	rec->jt_op = JOP_TRUNC;
3109	rec->jt_ino = jtrunc->jt_ino;
3110	rec->jt_size = jtrunc->jt_size;
3111	rec->jt_extsize = jtrunc->jt_extsize;
3112}
3113
3114static void
3115jfsync_write(jfsync, jseg, data)
3116	struct jfsync *jfsync;
3117	struct jseg *jseg;
3118	uint8_t *data;
3119{
3120	struct jtrncrec *rec;
3121
3122	rec = (struct jtrncrec *)data;
3123	rec->jt_op = JOP_SYNC;
3124	rec->jt_ino = jfsync->jfs_ino;
3125	rec->jt_size = jfsync->jfs_size;
3126	rec->jt_extsize = jfsync->jfs_extsize;
3127}
3128
3129static void
3130softdep_flushjournal(mp)
3131	struct mount *mp;
3132{
3133	struct jblocks *jblocks;
3134	struct ufsmount *ump;
3135
3136	if (MOUNTEDSUJ(mp) == 0)
3137		return;
3138	ump = VFSTOUFS(mp);
3139	jblocks = ump->softdep_jblocks;
3140	ACQUIRE_LOCK(&lk);
3141	while (ump->softdep_on_journal) {
3142		jblocks->jb_needseg = 1;
3143		softdep_process_journal(mp, NULL, MNT_WAIT);
3144	}
3145	FREE_LOCK(&lk);
3146}
3147
3148static void softdep_synchronize_completed(struct bio *);
3149static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3150
3151static void
3152softdep_synchronize_completed(bp)
3153        struct bio *bp;
3154{
3155	struct jseg *oldest;
3156	struct jseg *jseg;
3157
3158	/*
3159	 * caller1 marks the last segment written before we issued the
3160	 * synchronize cache.
3161	 */
3162	jseg = bp->bio_caller1;
3163	oldest = NULL;
3164	ACQUIRE_LOCK(&lk);
3165	/*
3166	 * Mark all the journal entries waiting on the synchronize cache
3167	 * as completed so they may continue on.
3168	 */
3169	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3170		jseg->js_state |= COMPLETE;
3171		oldest = jseg;
3172		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3173	}
3174	/*
3175	 * Restart deferred journal entry processing from the oldest
3176	 * completed jseg.
3177	 */
3178	if (oldest)
3179		complete_jsegs(oldest);
3180
3181	FREE_LOCK(&lk);
3182	g_destroy_bio(bp);
3183}
3184
3185/*
3186 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3187 * barriers.  The journal must be written prior to any blocks that depend
 * on it, and the journal can not be released until the blocks have been
3189 * written.  This code handles both barriers simultaneously.
3190 */
3191static void
3192softdep_synchronize(bp, ump, caller1)
3193	struct bio *bp;
3194	struct ufsmount *ump;
3195	void *caller1;
3196{
3197
3198	bp->bio_cmd = BIO_FLUSH;
3199	bp->bio_flags |= BIO_ORDERED;
3200	bp->bio_data = NULL;
3201	bp->bio_offset = ump->um_cp->provider->mediasize;
3202	bp->bio_length = 0;
3203	bp->bio_done = softdep_synchronize_completed;
3204	bp->bio_caller1 = caller1;
3205	g_io_request(bp,
3206	    (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3207}
3208
3209/*
3210 * Flush some journal records to disk.
3211 */
3212static void
3213softdep_process_journal(mp, needwk, flags)
3214	struct mount *mp;
3215	struct worklist *needwk;
3216	int flags;
3217{
3218	struct jblocks *jblocks;
3219	struct ufsmount *ump;
3220	struct worklist *wk;
3221	struct jseg *jseg;
3222	struct buf *bp;
3223	struct bio *bio;
3224	uint8_t *data;
3225	struct fs *fs;
3226	int shouldflush;
3227	int segwritten;
3228	int jrecmin;	/* Minimum records per block. */
3229	int jrecmax;	/* Maximum records per block. */
3230	int size;
3231	int cnt;
3232	int off;
3233	int devbsize;
3234
3235	if (MOUNTEDSUJ(mp) == 0)
3236		return;
3237	shouldflush = softdep_flushcache;
3238	bio = NULL;
3239	jseg = NULL;
3240	ump = VFSTOUFS(mp);
3241	fs = ump->um_fs;
3242	jblocks = ump->softdep_jblocks;
3243	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3244	/*
	 * We write anywhere between a disk block and an fs block.  The upper
3246	 * bound is picked to prevent buffer cache fragmentation and limit
3247	 * processing time per I/O.
3248	 */
3249	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3250	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
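	/*
	 * For example, with 512-byte device blocks and 32-byte journal
	 * records, each device block holds 15 records after its segment
	 * header, so a 32k filesystem block holds at most 960.
	 */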
3251	segwritten = 0;
3252	for (;;) {
3253		cnt = ump->softdep_on_journal;
3254		/*
3255		 * Criteria for writing a segment:
3256		 * 1) We have a full block.
3257		 * 2) We're called from jwait() and haven't found the
3258		 *    journal item yet.
3259		 * 3) Always write if needseg is set.
3260		 * 4) If we are called from process_worklist and have
3261		 *    not yet written anything we write a partial block
3262		 *    to enforce a 1 second maximum latency on journal
3263		 *    entries.
3264		 */
3265		if (cnt < (jrecmax - 1) && needwk == NULL &&
3266		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3267			break;
3268		cnt++;
3269		/*
3270		 * Verify some free journal space.  softdep_prealloc() should
		 * guarantee that we don't run out, so this is indicative of
3272		 * a problem with the flow control.  Try to recover
3273		 * gracefully in any event.
3274		 */
3275		while (jblocks->jb_free == 0) {
3276			if (flags != MNT_WAIT)
3277				break;
3278			printf("softdep: Out of journal space!\n");
3279			softdep_speedup();
3280			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
3281		}
3282		FREE_LOCK(&lk);
3283		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3284		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3285		LIST_INIT(&jseg->js_entries);
3286		LIST_INIT(&jseg->js_indirs);
3287		jseg->js_state = ATTACHED;
3288		if (shouldflush == 0)
3289			jseg->js_state |= COMPLETE;
3290		else if (bio == NULL)
3291			bio = g_alloc_bio();
3292		jseg->js_jblocks = jblocks;
3293		bp = geteblk(fs->fs_bsize, 0);
3294		ACQUIRE_LOCK(&lk);
3295		/*
3296		 * If there was a race while we were allocating the block
		 * and jseg, the entry we care about was likely written.
3298		 * We bail out in both the WAIT and NOWAIT case and assume
3299		 * the caller will loop if the entry it cares about is
3300		 * not written.
3301		 */
3302		cnt = ump->softdep_on_journal;
3303		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3304			bp->b_flags |= B_INVAL | B_NOCACHE;
3305			WORKITEM_FREE(jseg, D_JSEG);
3306			FREE_LOCK(&lk);
3307			brelse(bp);
3308			ACQUIRE_LOCK(&lk);
3309			break;
3310		}
3311		/*
3312		 * Calculate the disk block size required for the available
3313		 * records rounded to the min size.
3314		 */
3315		if (cnt == 0)
3316			size = devbsize;
3317		else if (cnt < jrecmax)
3318			size = howmany(cnt, jrecmin) * devbsize;
3319		else
3320			size = fs->fs_bsize;
3321		/*
3322		 * Allocate a disk block for this journal data and account
3323		 * for truncation of the requested size if enough contiguous
3324		 * space was not available.
3325		 */
3326		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3327		bp->b_lblkno = bp->b_blkno;
3328		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3329		bp->b_bcount = size;
3330		bp->b_bufobj = &ump->um_devvp->v_bufobj;
3331		bp->b_flags &= ~B_INVAL;
3332		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3333		/*
3334		 * Initialize our jseg with cnt records.  Assign the next
3335		 * sequence number to it and link it in-order.
3336		 */
3337		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3338		jseg->js_buf = bp;
3339		jseg->js_cnt = cnt;
3340		jseg->js_refs = cnt + 1;	/* Self ref. */
3341		jseg->js_size = size;
3342		jseg->js_seq = jblocks->jb_nextseq++;
3343		if (jblocks->jb_oldestseg == NULL)
3344			jblocks->jb_oldestseg = jseg;
3345		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3346		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3347		if (jblocks->jb_writeseg == NULL)
3348			jblocks->jb_writeseg = jseg;
3349		/*
3350		 * Start filling in records from the pending list.
3351		 */
3352		data = bp->b_data;
3353		off = 0;
3354		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3355		    != NULL) {
3356			if (cnt == 0)
3357				break;
3358			/* Place a segment header on every device block. */
3359			if ((off % devbsize) == 0) {
3360				jseg_write(ump, jseg, data);
3361				off += JREC_SIZE;
3362				data = bp->b_data + off;
3363			}
3364			if (wk == needwk)
3365				needwk = NULL;
3366			remove_from_journal(wk);
3367			wk->wk_state |= INPROGRESS;
3368			WORKLIST_INSERT(&jseg->js_entries, wk);
3369			switch (wk->wk_type) {
3370			case D_JADDREF:
3371				jaddref_write(WK_JADDREF(wk), jseg, data);
3372				break;
3373			case D_JREMREF:
3374				jremref_write(WK_JREMREF(wk), jseg, data);
3375				break;
3376			case D_JMVREF:
3377				jmvref_write(WK_JMVREF(wk), jseg, data);
3378				break;
3379			case D_JNEWBLK:
3380				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3381				break;
3382			case D_JFREEBLK:
3383				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3384				break;
3385			case D_JFREEFRAG:
3386				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3387				break;
3388			case D_JTRUNC:
3389				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3390				break;
3391			case D_JFSYNC:
3392				jfsync_write(WK_JFSYNC(wk), jseg, data);
3393				break;
3394			default:
3395				panic("process_journal: Unknown type %s",
3396				    TYPENAME(wk->wk_type));
3397				/* NOTREACHED */
3398			}
3399			off += JREC_SIZE;
3400			data = bp->b_data + off;
3401			cnt--;
3402		}
3403		/*
3404		 * Write this one buffer and continue.
3405		 */
3406		segwritten = 1;
3407		jblocks->jb_needseg = 0;
3408		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3409		FREE_LOCK(&lk);
3410		BO_LOCK(bp->b_bufobj);
3411		bgetvp(ump->um_devvp, bp);
3412		BO_UNLOCK(bp->b_bufobj);
3413		/*
3414		 * We only do the blocking wait once we find the journal
3415		 * entry we're looking for.
3416		 */
3417		if (needwk == NULL && flags == MNT_WAIT)
3418			bwrite(bp);
3419		else
3420			bawrite(bp);
3421		ACQUIRE_LOCK(&lk);
3422	}
3423	/*
	 * If we wrote a segment, issue a synchronize cache so the journal
3425	 * is reflected on disk before the data is written.  Since reclaiming
3426	 * journal space also requires writing a journal record this
3427	 * process also enforces a barrier before reclamation.
3428	 */
3429	if (segwritten && shouldflush) {
3430		softdep_synchronize(bio, ump,
3431		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3432	} else if (bio)
3433		g_destroy_bio(bio);
3434	/*
3435	 * If we've suspended the filesystem because we ran out of journal
	 * space, either try to sync it here to make some progress or
3437	 * unsuspend it if we already have.
3438	 */
3439	if (flags == 0 && jblocks->jb_suspended) {
3440		if (journal_unsuspend(ump))
3441			return;
3442		FREE_LOCK(&lk);
3443		VFS_SYNC(mp, MNT_NOWAIT);
3444		ffs_sbupdate(ump, MNT_WAIT, 0);
3445		ACQUIRE_LOCK(&lk);
3446	}
3447}
3448
3449/*
3450 * Complete a jseg, allowing all dependencies awaiting journal writes
3451 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3452 * structures so that the journal segment can be freed to reclaim space.
3453 */
3454static void
3455complete_jseg(jseg)
3456	struct jseg *jseg;
3457{
3458	struct worklist *wk;
3459	struct jmvref *jmvref;
3460	int waiting;
3461#ifdef INVARIANTS
3462	int i = 0;
3463#endif
3464
3465	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3466		WORKLIST_REMOVE(wk);
3467		waiting = wk->wk_state & IOWAITING;
3468		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3469		wk->wk_state |= COMPLETE;
3470		KASSERT(i++ < jseg->js_cnt,
3471		    ("handle_written_jseg: overflow %d >= %d",
3472		    i - 1, jseg->js_cnt));
3473		switch (wk->wk_type) {
3474		case D_JADDREF:
3475			handle_written_jaddref(WK_JADDREF(wk));
3476			break;
3477		case D_JREMREF:
3478			handle_written_jremref(WK_JREMREF(wk));
3479			break;
3480		case D_JMVREF:
3481			rele_jseg(jseg);	/* No jsegdep. */
3482			jmvref = WK_JMVREF(wk);
3483			LIST_REMOVE(jmvref, jm_deps);
3484			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3485				free_pagedep(jmvref->jm_pagedep);
3486			WORKITEM_FREE(jmvref, D_JMVREF);
3487			break;
3488		case D_JNEWBLK:
3489			handle_written_jnewblk(WK_JNEWBLK(wk));
3490			break;
3491		case D_JFREEBLK:
3492			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3493			break;
3494		case D_JTRUNC:
3495			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3496			break;
3497		case D_JFSYNC:
3498			rele_jseg(jseg);	/* No jsegdep. */
3499			WORKITEM_FREE(wk, D_JFSYNC);
3500			break;
3501		case D_JFREEFRAG:
3502			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3503			break;
3504		default:
3505			panic("handle_written_jseg: Unknown type %s",
3506			    TYPENAME(wk->wk_type));
3507			/* NOTREACHED */
3508		}
3509		if (waiting)
3510			wakeup(wk);
3511	}
3512	/* Release the self reference so the structure may be freed. */
3513	rele_jseg(jseg);
3514}
3515
3516/*
3517 * Determine which jsegs are ready for completion processing.  Waits for
3518 * synchronize cache to complete as well as forcing in-order completion
3519 * of journal entries.
3520 */
3521static void
3522complete_jsegs(jseg)
3523	struct jseg *jseg;
3524{
3525	struct jblocks *jblocks;
3526	struct jseg *jsegn;
3527
3528	jblocks = jseg->js_jblocks;
3529	/*
3530	 * Don't allow out of order completions.  If this isn't the first
	 * block, wait for it to write before we're done.
3532	 */
3533	if (jseg != jblocks->jb_writeseg)
3534		return;
3535	/* Iterate through available jsegs processing their entries. */
3536	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3537		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3538		jsegn = TAILQ_NEXT(jseg, js_next);
3539		complete_jseg(jseg);
3540		jseg = jsegn;
3541	}
3542	jblocks->jb_writeseg = jseg;
3543	/*
3544	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3545	 */
3546	free_jsegs(jblocks);
3547}
3548
3549/*
3550 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3551 * the final completions.
3552 */
3553static void
3554handle_written_jseg(jseg, bp)
3555	struct jseg *jseg;
3556	struct buf *bp;
3557{
3558
3559	if (jseg->js_refs == 0)
3560		panic("handle_written_jseg: No self-reference on %p", jseg);
3561	jseg->js_state |= DEPCOMPLETE;
3562	/*
3563	 * We'll never need this buffer again, set flags so it will be
3564	 * discarded.
3565	 */
3566	bp->b_flags |= B_INVAL | B_NOCACHE;
3567	complete_jsegs(jseg);
3568}
3569
3570static inline struct jsegdep *
3571inoref_jseg(inoref)
3572	struct inoref *inoref;
3573{
3574	struct jsegdep *jsegdep;
3575
3576	jsegdep = inoref->if_jsegdep;
3577	inoref->if_jsegdep = NULL;
3578
3579	return (jsegdep);
3580}
3581
3582/*
3583 * Called once a jremref has made it to stable store.  The jremref is marked
3584 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3585 * for the jremref to complete will be awoken by free_jremref.
3586 */
3587static void
3588handle_written_jremref(jremref)
3589	struct jremref *jremref;
3590{
3591	struct inodedep *inodedep;
3592	struct jsegdep *jsegdep;
3593	struct dirrem *dirrem;
3594
3595	/* Grab the jsegdep. */
3596	jsegdep = inoref_jseg(&jremref->jr_ref);
3597	/*
3598	 * Remove us from the inoref list.
3599	 */
3600	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3601	    0, &inodedep) == 0)
3602		panic("handle_written_jremref: Lost inodedep");
3603	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3604	/*
3605	 * Complete the dirrem.
3606	 */
3607	dirrem = jremref->jr_dirrem;
3608	jremref->jr_dirrem = NULL;
3609	LIST_REMOVE(jremref, jr_deps);
3610	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3611	jwork_insert(&dirrem->dm_jwork, jsegdep);
3612	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3613	    (dirrem->dm_state & COMPLETE) != 0)
3614		add_to_worklist(&dirrem->dm_list, 0);
3615	free_jremref(jremref);
3616}
3617
3618/*
3619 * Called once a jaddref has made it to stable store.  The dependency is
3620 * marked complete and any dependent structures are added to the inode
3621 * bufwait list to be completed as soon as it is written.  If a bitmap write
3622 * depends on this entry we move the inode into the inodedephd of the
3623 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3624 */
3625static void
3626handle_written_jaddref(jaddref)
3627	struct jaddref *jaddref;
3628{
3629	struct jsegdep *jsegdep;
3630	struct inodedep *inodedep;
3631	struct diradd *diradd;
3632	struct mkdir *mkdir;
3633
3634	/* Grab the jsegdep. */
3635	jsegdep = inoref_jseg(&jaddref->ja_ref);
3636	mkdir = NULL;
3637	diradd = NULL;
3638	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3639	    0, &inodedep) == 0)
3640		panic("handle_written_jaddref: Lost inodedep.");
3641	if (jaddref->ja_diradd == NULL)
3642		panic("handle_written_jaddref: No dependency");
3643	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3644		diradd = jaddref->ja_diradd;
3645		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3646	} else if (jaddref->ja_state & MKDIR_PARENT) {
3647		mkdir = jaddref->ja_mkdir;
3648		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3649	} else if (jaddref->ja_state & MKDIR_BODY)
3650		mkdir = jaddref->ja_mkdir;
3651	else
3652		panic("handle_written_jaddref: Unknown dependency %p",
3653		    jaddref->ja_diradd);
3654	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3655	/*
3656	 * Remove us from the inode list.
3657	 */
3658	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3659	/*
3660	 * The mkdir may be waiting on the jaddref to clear before freeing.
3661	 */
3662	if (mkdir) {
3663		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3664		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3665		    TYPENAME(mkdir->md_list.wk_type)));
3666		mkdir->md_jaddref = NULL;
3667		diradd = mkdir->md_diradd;
3668		mkdir->md_state |= DEPCOMPLETE;
3669		complete_mkdir(mkdir);
3670	}
3671	jwork_insert(&diradd->da_jwork, jsegdep);
3672	if (jaddref->ja_state & NEWBLOCK) {
3673		inodedep->id_state |= ONDEPLIST;
3674		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3675		    inodedep, id_deps);
3676	}
3677	free_jaddref(jaddref);
3678}
3679
3680/*
3681 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3682 * is placed in the bmsafemap to await notification of a written bitmap.  If
3683 * the operation was canceled we add the segdep to the appropriate
3684 * dependency to free the journal space once the canceling operation
3685 * completes.
3686 */
3687static void
3688handle_written_jnewblk(jnewblk)
3689	struct jnewblk *jnewblk;
3690{
3691	struct bmsafemap *bmsafemap;
3692	struct freefrag *freefrag;
3693	struct freework *freework;
3694	struct jsegdep *jsegdep;
3695	struct newblk *newblk;
3696
3697	/* Grab the jsegdep. */
3698	jsegdep = jnewblk->jn_jsegdep;
3699	jnewblk->jn_jsegdep = NULL;
3700	if (jnewblk->jn_dep == NULL)
3701		panic("handle_written_jnewblk: No dependency for the segdep.");
3702	switch (jnewblk->jn_dep->wk_type) {
3703	case D_NEWBLK:
3704	case D_ALLOCDIRECT:
3705	case D_ALLOCINDIR:
3706		/*
3707		 * Add the written block to the bmsafemap so it can
3708		 * be notified when the bitmap is on disk.
3709		 */
3710		newblk = WK_NEWBLK(jnewblk->jn_dep);
3711		newblk->nb_jnewblk = NULL;
3712		if ((newblk->nb_state & GOINGAWAY) == 0) {
3713			bmsafemap = newblk->nb_bmsafemap;
3714			newblk->nb_state |= ONDEPLIST;
3715			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3716			    nb_deps);
3717		}
3718		jwork_insert(&newblk->nb_jwork, jsegdep);
3719		break;
3720	case D_FREEFRAG:
3721		/*
3722		 * A newblock being removed by a freefrag when replaced by
3723		 * frag extension.
3724		 */
3725		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3726		freefrag->ff_jdep = NULL;
3727		jwork_insert(&freefrag->ff_jwork, jsegdep);
3728		break;
3729	case D_FREEWORK:
3730		/*
3731		 * A direct block was removed by truncate.
3732		 */
3733		freework = WK_FREEWORK(jnewblk->jn_dep);
3734		freework->fw_jnewblk = NULL;
3735		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3736		break;
3737	default:
3738		panic("handle_written_jnewblk: Unknown type %d.",
3739		    jnewblk->jn_dep->wk_type);
3740	}
3741	jnewblk->jn_dep = NULL;
3742	free_jnewblk(jnewblk);
3743}
3744
3745/*
3746 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3747 * an in-flight allocation that has not yet been committed.  Divorce us
3748 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3749 * to the worklist.
3750 */
3751static void
3752cancel_jfreefrag(jfreefrag)
3753	struct jfreefrag *jfreefrag;
3754{
3755	struct freefrag *freefrag;
3756
3757	if (jfreefrag->fr_jsegdep) {
3758		free_jsegdep(jfreefrag->fr_jsegdep);
3759		jfreefrag->fr_jsegdep = NULL;
3760	}
3761	freefrag = jfreefrag->fr_freefrag;
3762	jfreefrag->fr_freefrag = NULL;
3763	free_jfreefrag(jfreefrag);
3764	freefrag->ff_state |= DEPCOMPLETE;
3765	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3766}
3767
3768/*
3769 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3770 */
3771static void
3772free_jfreefrag(jfreefrag)
3773	struct jfreefrag *jfreefrag;
3774{
3775
3776	if (jfreefrag->fr_state & INPROGRESS)
3777		WORKLIST_REMOVE(&jfreefrag->fr_list);
3778	else if (jfreefrag->fr_state & ONWORKLIST)
3779		remove_from_journal(&jfreefrag->fr_list);
3780	if (jfreefrag->fr_freefrag != NULL)
3781		panic("free_jfreefrag: Still attached to a freefrag.");
3782	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3783}
3784
3785/*
3786 * Called when the journal write for a jfreefrag completes.  The parent
3787 * freefrag is added to the worklist if this completes its dependencies.
3788 */
3789static void
3790handle_written_jfreefrag(jfreefrag)
3791	struct jfreefrag *jfreefrag;
3792{
3793	struct jsegdep *jsegdep;
3794	struct freefrag *freefrag;
3795
3796	/* Grab the jsegdep. */
3797	jsegdep = jfreefrag->fr_jsegdep;
3798	jfreefrag->fr_jsegdep = NULL;
3799	freefrag = jfreefrag->fr_freefrag;
3800	if (freefrag == NULL)
3801		panic("handle_written_jfreefrag: No freefrag.");
3802	freefrag->ff_state |= DEPCOMPLETE;
3803	freefrag->ff_jdep = NULL;
3804	jwork_insert(&freefrag->ff_jwork, jsegdep);
3805	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3806		add_to_worklist(&freefrag->ff_list, 0);
3807	jfreefrag->fr_freefrag = NULL;
3808	free_jfreefrag(jfreefrag);
3809}
3810
3811/*
3812 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3813 * is removed from the freeblks list of pending journal writes and the
3814 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3815 * have been reclaimed.
3816 */
3817static void
3818handle_written_jblkdep(jblkdep)
3819	struct jblkdep *jblkdep;
3820{
3821	struct freeblks *freeblks;
3822	struct jsegdep *jsegdep;
3823
3824	/* Grab the jsegdep. */
3825	jsegdep = jblkdep->jb_jsegdep;
3826	jblkdep->jb_jsegdep = NULL;
3827	freeblks = jblkdep->jb_freeblks;
3828	LIST_REMOVE(jblkdep, jb_deps);
3829	jwork_insert(&freeblks->fb_jwork, jsegdep);
3830	/*
3831	 * If the freeblks is all journaled, we can add it to the worklist.
3832	 */
3833	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3834	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3835		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3836
3837	free_jblkdep(jblkdep);
3838}
3839
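/*
 * Allocate a jsegdep to record a dependency on the journal segment that
 * will eventually hold a record.  The segment is not known until the
 * record is written into the journal, so jd_seg starts out NULL.
 */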
3840static struct jsegdep *
3841newjsegdep(struct worklist *wk)
3842{
3843	struct jsegdep *jsegdep;
3844
3845	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3846	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3847	jsegdep->jd_seg = NULL;
3848
3849	return (jsegdep);
3850}
3851
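/*
 * Allocate a jmvref to journal the move of the directory entry for 'ino'
 * within directory 'dp' from offset 'oldoff' to 'newoff'.  A jmvref has
 * no other dependencies, so it is created ATTACHED and DEPCOMPLETE.
 */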
3852static struct jmvref *
3853newjmvref(dp, ino, oldoff, newoff)
3854	struct inode *dp;
3855	ino_t ino;
3856	off_t oldoff;
3857	off_t newoff;
3858{
3859	struct jmvref *jmvref;
3860
3861	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3862	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3863	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3864	jmvref->jm_parent = dp->i_number;
3865	jmvref->jm_ino = ino;
3866	jmvref->jm_oldoff = oldoff;
3867	jmvref->jm_newoff = newoff;
3868
3869	return (jmvref);
3870}
3871
3872/*
3873 * Allocate a new jremref that tracks the removal of ip from dp with the
3874 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3875 * DEPCOMPLETE as we have all the information required for the journal write
3876 * and the directory has already been removed from the buffer.  The caller
3877 * is responsible for linking the jremref into the pagedep and adding it
3878 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3879 * a DOTDOT addition so handle_workitem_remove() can properly assign
3880 * the jsegdep when we're done.
3881 */
3882static struct jremref *
3883newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3884    off_t diroff, nlink_t nlink)
3885{
3886	struct jremref *jremref;
3887
3888	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3889	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3890	jremref->jr_state = ATTACHED;
3891	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3892	   nlink, ip->i_mode);
3893	jremref->jr_dirrem = dirrem;
3894
3895	return (jremref);
3896}
3897
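/*
 * Initialize the fields shared by jaddref and jremref records and allocate
 * the jsegdep that will track the journal segment holding the record.
 */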
3898static inline void
3899newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3900    nlink_t nlink, uint16_t mode)
3901{
3902
3903	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3904	inoref->if_diroff = diroff;
3905	inoref->if_ino = ino;
3906	inoref->if_parent = parent;
3907	inoref->if_nlink = nlink;
3908	inoref->if_mode = mode;
3909}
3910
3911/*
3912 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3913 * directory offset may not be known until later.  The caller is responsible
3914 * for adding the entry to the journal when this information is available.  nlink
3915 * should be the link count prior to the addition and mode is only required
3916 * to have the correct FMT.
3917 */
3918static struct jaddref *
3919newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3920    uint16_t mode)
3921{
3922	struct jaddref *jaddref;
3923
3924	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3925	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3926	jaddref->ja_state = ATTACHED;
3927	jaddref->ja_mkdir = NULL;
3928	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3929
3930	return (jaddref);
3931}
3932
3933/*
3934 * Create a new free dependency for a freework.  The caller is responsible
3935 * for adjusting the reference count when it has the lock held.  The freedep
3936 * will track an outstanding bitmap write that will ultimately clear the
3937 * freework to continue.
3938 */
3939static struct freedep *
3940newfreedep(struct freework *freework)
3941{
3942	struct freedep *freedep;
3943
3944	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3945	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3946	freedep->fd_freework = freework;
3947
3948	return (freedep);
3949}
3950
3951/*
3952 * Free a freedep structure once the buffer it is linked to is written.  If
3953 * this is the last reference to the freework schedule it for completion.
3954 */
3955static void
3956free_freedep(freedep)
3957	struct freedep *freedep;
3958{
3959	struct freework *freework;
3960
3961	freework = freedep->fd_freework;
3962	freework->fw_freeblks->fb_cgwait--;
3963	if (--freework->fw_ref == 0)
3964		freework_enqueue(freework);
3965	WORKITEM_FREE(freedep, D_FREEDEP);
3966}
3967
3968/*
3969 * Allocate a new freework structure that represents a level of an indirect
3970 * hierarchy when parent is not NULL, or a top level block when parent is
3971 * NULL.  The top level freework structures are allocated without lk held
3972 * and before the freeblks is visible outside of softdep_setup_freeblocks().
3973 */
3974static struct freework *
3975newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
3976	struct ufsmount *ump;
3977	struct freeblks *freeblks;
3978	struct freework *parent;
3979	ufs_lbn_t lbn;
3980	ufs2_daddr_t nb;
3981	int frags;
3982	int off;
3983	int journal;
3984{
3985	struct freework *freework;
3986
3987	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3988	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3989	freework->fw_state = ATTACHED;
3990	freework->fw_jnewblk = NULL;
3991	freework->fw_freeblks = freeblks;
3992	freework->fw_parent = parent;
3993	freework->fw_lbn = lbn;
3994	freework->fw_blkno = nb;
3995	freework->fw_frags = frags;
3996	freework->fw_indir = NULL;
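	/*
	 * Only journaled indirect blocks (lbn < -NXADDR) carry a reference
	 * count; it starts at NINDIR(fs) + 1 and gates freework_enqueue()
	 * as the references are dropped (see free_freedep()).
	 */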
3997	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
3998		? 0 : NINDIR(ump->um_fs) + 1;
3999	freework->fw_start = freework->fw_off = off;
4000	if (journal)
4001		newjfreeblk(freeblks, lbn, nb, frags);
4002	if (parent == NULL) {
4003		ACQUIRE_LOCK(&lk);
4004		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4005		freeblks->fb_ref++;
4006		FREE_LOCK(&lk);
4007	}
4008
4009	return (freework);
4010}
4011
4012/*
4013 * Eliminate a jfreeblk for a block that does not need journaling.
4014 */
4015static void
4016cancel_jfreeblk(freeblks, blkno)
4017	struct freeblks *freeblks;
4018	ufs2_daddr_t blkno;
4019{
4020	struct jfreeblk *jfreeblk;
4021	struct jblkdep *jblkdep;
4022
4023	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4024		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4025			continue;
4026		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4027		if (jfreeblk->jf_blkno == blkno)
4028			break;
4029	}
4030	if (jblkdep == NULL)
4031		return;
4032	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4033	free_jsegdep(jblkdep->jb_jsegdep);
4034	LIST_REMOVE(jblkdep, jb_deps);
4035	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4036}
4037
4038/*
4039 * Allocate a new jfreeblk to journal top level block pointer when truncating
4040 * a file.  The caller must add this to the worklist when lk is held.
4041 */
4042static struct jfreeblk *
4043newjfreeblk(freeblks, lbn, blkno, frags)
4044	struct freeblks *freeblks;
4045	ufs_lbn_t lbn;
4046	ufs2_daddr_t blkno;
4047	int frags;
4048{
4049	struct jfreeblk *jfreeblk;
4050
4051	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4052	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4053	    freeblks->fb_list.wk_mp);
4054	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4055	jfreeblk->jf_dep.jb_freeblks = freeblks;
4056	jfreeblk->jf_ino = freeblks->fb_inum;
4057	jfreeblk->jf_lbn = lbn;
4058	jfreeblk->jf_blkno = blkno;
4059	jfreeblk->jf_frags = frags;
4060	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4061
4062	return (jfreeblk);
4063}
4064
4065/*
4066 * Allocate a new jtrunc to track a partial truncation.
4067 */
4068static struct jtrunc *
4069newjtrunc(freeblks, size, extsize)
4070	struct freeblks *freeblks;
4071	off_t size;
4072	int extsize;
4073{
4074	struct jtrunc *jtrunc;
4075
4076	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4077	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4078	    freeblks->fb_list.wk_mp);
4079	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4080	jtrunc->jt_dep.jb_freeblks = freeblks;
4081	jtrunc->jt_ino = freeblks->fb_inum;
4082	jtrunc->jt_size = size;
4083	jtrunc->jt_extsize = extsize;
4084	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4085
4086	return (jtrunc);
4087}
4088
4089/*
4090 * If we're canceling a new bitmap we have to search for another ref
4091 * to move into the bmsafemap dep.  This might be better expressed
4092 * with another structure.
4093 */
4094static void
4095move_newblock_dep(jaddref, inodedep)
4096	struct jaddref *jaddref;
4097	struct inodedep *inodedep;
4098{
4099	struct inoref *inoref;
4100	struct jaddref *jaddrefn;
4101
4102	jaddrefn = NULL;
4103	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4104	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4105		if ((jaddref->ja_state & NEWBLOCK) &&
4106		    inoref->if_list.wk_type == D_JADDREF) {
4107			jaddrefn = (struct jaddref *)inoref;
4108			break;
4109		}
4110	}
4111	if (jaddrefn == NULL)
4112		return;
4113	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4114	jaddrefn->ja_state |= jaddref->ja_state &
4115	    (ATTACHED | UNDONE | NEWBLOCK);
4116	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4117	jaddref->ja_state |= ATTACHED;
4118	LIST_REMOVE(jaddref, ja_bmdeps);
4119	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4120	    ja_bmdeps);
4121}
4122
4123/*
4124 * Cancel a jaddref either before it has been written or while it is being
4125 * written.  This happens when a link is removed before the add reaches
4126 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4127 * and inode to prevent the link count or bitmap from reaching the disk
4128 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4129 * required.
4130 *
4131 * Returns 1 if the canceled addref requires journaling of the remove and
4132 * 0 otherwise.
4133 */
4134static int
4135cancel_jaddref(jaddref, inodedep, wkhd)
4136	struct jaddref *jaddref;
4137	struct inodedep *inodedep;
4138	struct workhead *wkhd;
4139{
4140	struct inoref *inoref;
4141	struct jsegdep *jsegdep;
4142	int needsj;
4143
4144	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4145	    ("cancel_jaddref: Canceling complete jaddref"));
4146	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4147		needsj = 1;
4148	else
4149		needsj = 0;
4150	if (inodedep == NULL)
4151		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4152		    0, &inodedep) == 0)
4153			panic("cancel_jaddref: Lost inodedep");
4154	/*
4155	 * We must adjust the nlink of any reference operation that follows
4156	 * us so that it is consistent with the in-memory reference.  This
4157	 * ensures that inode nlink rollbacks always have the correct link.
4158	 */
4159	if (needsj == 0) {
4160		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4161		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4162			if (inoref->if_state & GOINGAWAY)
4163				break;
4164			inoref->if_nlink--;
4165		}
4166	}
4167	jsegdep = inoref_jseg(&jaddref->ja_ref);
4168	if (jaddref->ja_state & NEWBLOCK)
4169		move_newblock_dep(jaddref, inodedep);
4170	wake_worklist(&jaddref->ja_list);
4171	jaddref->ja_mkdir = NULL;
4172	if (jaddref->ja_state & INPROGRESS) {
4173		jaddref->ja_state &= ~INPROGRESS;
4174		WORKLIST_REMOVE(&jaddref->ja_list);
4175		jwork_insert(wkhd, jsegdep);
4176	} else {
4177		free_jsegdep(jsegdep);
4178		if (jaddref->ja_state & DEPCOMPLETE)
4179			remove_from_journal(&jaddref->ja_list);
4180	}
4181	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4182	/*
4183	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4184	 * can arrange for them to be freed with the bitmap.  Otherwise we
4185	 * no longer need this addref attached to the inoreflst and it
4186	 * will incorrectly adjust nlink if we leave it.
4187	 */
4188	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4189		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4190		    if_deps);
4191		jaddref->ja_state |= COMPLETE;
4192		free_jaddref(jaddref);
4193		return (needsj);
4194	}
4195	/*
4196	 * Leave the head of the list for jsegdeps for fast merging.
4197	 */
4198	if (LIST_FIRST(wkhd) != NULL) {
4199		jaddref->ja_state |= ONWORKLIST;
4200		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4201	} else
4202		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4203
4204	return (needsj);
4205}
4206
4207/*
4208 * Attempt to free a jaddref structure when some work completes.  This
4209 * should only succeed once the entry is written and all dependencies have
4210 * been notified.
4211 */
4212static void
4213free_jaddref(jaddref)
4214	struct jaddref *jaddref;
4215{
4216
4217	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4218		return;
4219	if (jaddref->ja_ref.if_jsegdep)
4220		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4221		    jaddref, jaddref->ja_state);
4222	if (jaddref->ja_state & NEWBLOCK)
4223		LIST_REMOVE(jaddref, ja_bmdeps);
4224	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4225		panic("free_jaddref: Bad state %p(0x%X)",
4226		    jaddref, jaddref->ja_state);
4227	if (jaddref->ja_mkdir != NULL)
4228		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4229	WORKITEM_FREE(jaddref, D_JADDREF);
4230}
4231
4232/*
4233 * Free a jremref structure once it has been written or discarded.
4234 */
4235static void
4236free_jremref(jremref)
4237	struct jremref *jremref;
4238{
4239
4240	if (jremref->jr_ref.if_jsegdep)
4241		free_jsegdep(jremref->jr_ref.if_jsegdep);
4242	if (jremref->jr_state & INPROGRESS)
4243		panic("free_jremref: IO still pending");
4244	WORKITEM_FREE(jremref, D_JREMREF);
4245}
4246
4247/*
4248 * Free a jnewblk structure.
4249 */
4250static void
4251free_jnewblk(jnewblk)
4252	struct jnewblk *jnewblk;
4253{
4254
4255	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4256		return;
4257	LIST_REMOVE(jnewblk, jn_deps);
4258	if (jnewblk->jn_dep != NULL)
4259		panic("free_jnewblk: Dependency still attached.");
4260	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4261}
4262
4263/*
4264 * Cancel a jnewblk which has been made redundant by frag extension.
4265 */
4266static void
4267cancel_jnewblk(jnewblk, wkhd)
4268	struct jnewblk *jnewblk;
4269	struct workhead *wkhd;
4270{
4271	struct jsegdep *jsegdep;
4272
4273	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4274	jsegdep = jnewblk->jn_jsegdep;
4275	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4276		panic("cancel_jnewblk: Invalid state");
4277	jnewblk->jn_jsegdep = NULL;
4278	jnewblk->jn_dep = NULL;
4279	jnewblk->jn_state |= GOINGAWAY;
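	/*
	 * If the journal write is already in flight the jsegdep is handed to
	 * the caller's work list; otherwise the record is pulled from the
	 * journal and the jsegdep is released immediately.
	 */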
4280	if (jnewblk->jn_state & INPROGRESS) {
4281		jnewblk->jn_state &= ~INPROGRESS;
4282		WORKLIST_REMOVE(&jnewblk->jn_list);
4283		jwork_insert(wkhd, jsegdep);
4284	} else {
4285		free_jsegdep(jsegdep);
4286		remove_from_journal(&jnewblk->jn_list);
4287	}
4288	wake_worklist(&jnewblk->jn_list);
4289	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4290}
4291
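/*
 * Free a jblkdep, dispatching on whether it is embedded in a jfreeblk or
 * a jtrunc.
 */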
4292static void
4293free_jblkdep(jblkdep)
4294	struct jblkdep *jblkdep;
4295{
4296
4297	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4298		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4299	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4300		WORKITEM_FREE(jblkdep, D_JTRUNC);
4301	else
4302		panic("free_jblkdep: Unexpected type %s",
4303		    TYPENAME(jblkdep->jb_list.wk_type));
4304}
4305
4306/*
4307 * Free a single jseg once it is no longer referenced in memory or on
4308 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4309 * to disappear.
4310 */
4311static void
4312free_jseg(jseg, jblocks)
4313	struct jseg *jseg;
4314	struct jblocks *jblocks;
4315{
4316	struct freework *freework;
4317
4318	/*
4319	 * Free freework structures that were lingering to indicate freed
4320	 * indirect blocks that forced journal write ordering on reallocate.
4321	 */
4322	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4323		indirblk_remove(freework);
4324	if (jblocks->jb_oldestseg == jseg)
4325		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4326	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4327	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4328	KASSERT(LIST_EMPTY(&jseg->js_entries),
4329	    ("free_jseg: Freed jseg has valid entries."));
4330	WORKITEM_FREE(jseg, D_JSEG);
4331}
4332
4333/*
4334 * Free all jsegs that meet the criteria for being reclaimed and update
4335 * oldestseg.
4336 */
4337static void
4338free_jsegs(jblocks)
4339	struct jblocks *jblocks;
4340{
4341	struct jseg *jseg;
4342
4343	/*
4344	 * Free only those jsegs which have none allocated before them to
4345	 * preserve the journal space ordering.
4346	 */
4347	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4348		/*
4349		 * Only reclaim space when nothing depends on this journal
4350		 * set and another set has written that it is no longer
4351		 * valid.
4352		 */
4353		if (jseg->js_refs != 0) {
4354			jblocks->jb_oldestseg = jseg;
4355			return;
4356		}
4357		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4358			break;
4359		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4360			break;
4361		/*
4362		 * We can free jsegs that didn't write entries when
4363		 * oldestwrseq == js_seq.
4364		 */
4365		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4366		    jseg->js_cnt != 0)
4367			break;
4368		free_jseg(jseg, jblocks);
4369	}
4370	/*
4371	 * If we exited the loop above we still must discover the
4372	 * oldest valid segment.
4373	 */
4374	if (jseg)
4375		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4376		     jseg = TAILQ_NEXT(jseg, js_next))
4377			if (jseg->js_refs != 0)
4378				break;
4379	jblocks->jb_oldestseg = jseg;
4380	/*
4381	 * The journal has no valid records but some jsegs may still be
4382	 * waiting on oldestwrseq to advance.  We force a small record
4383	 * out to permit these lingering records to be reclaimed.
4384	 */
4385	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4386		jblocks->jb_needseg = 1;
4387}
4388
4389/*
4390 * Release one reference to a jseg and free it if the count reaches 0.  This
4391 * should eventually reclaim journal space as well.
4392 */
4393static void
4394rele_jseg(jseg)
4395	struct jseg *jseg;
4396{
4397
4398	KASSERT(jseg->js_refs > 0,
4399	    ("rele_jseg: Invalid refcnt %d", jseg->js_refs));
4400	if (--jseg->js_refs != 0)
4401		return;
4402	free_jsegs(jseg->js_jblocks);
4403}
4404
4405/*
4406 * Release a jsegdep and decrement the jseg count.
4407 */
4408static void
4409free_jsegdep(jsegdep)
4410	struct jsegdep *jsegdep;
4411{
4412
4413	if (jsegdep->jd_seg)
4414		rele_jseg(jsegdep->jd_seg);
4415	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4416}
4417
4418/*
4419 * Wait for a journal item to make it to disk.  Initiate journal processing
4420 * if required.
4421 */
4422static int
4423jwait(wk, waitfor)
4424	struct worklist *wk;
4425	int waitfor;
4426{
4427
4428	/*
4429	 * Blocking journal waits cause slow synchronous behavior.  Record
4430	 * stats on the frequency of these blocking operations.
4431	 */
4432	if (waitfor == MNT_WAIT) {
4433		stat_journal_wait++;
4434		switch (wk->wk_type) {
4435		case D_JREMREF:
4436		case D_JMVREF:
4437			stat_jwait_filepage++;
4438			break;
4439		case D_JTRUNC:
4440		case D_JFREEBLK:
4441			stat_jwait_freeblks++;
4442			break;
4443		case D_JNEWBLK:
4444			stat_jwait_newblk++;
4445			break;
4446		case D_JADDREF:
4447			stat_jwait_inode++;
4448			break;
4449		default:
4450			break;
4451		}
4452	}
4453	/*
4454	 * If IO has not started we process the journal.  We can't mark the
4455	 * worklist item as IOWAITING because we drop the lock while
4456	 * processing the journal and the worklist entry may be freed after
4457	 * this point.  The caller may call back in and re-issue the request.
4458	 */
4459	if ((wk->wk_state & INPROGRESS) == 0) {
4460		softdep_process_journal(wk->wk_mp, wk, waitfor);
4461		if (waitfor != MNT_WAIT)
4462			return (EBUSY);
4463		return (0);
4464	}
4465	if (waitfor != MNT_WAIT)
4466		return (EBUSY);
4467	wait_worklist(wk, "jwait");
4468	return (0);
4469}
4470
4471/*
4472 * Look up an inodedep based on an inode pointer and set the nlinkdelta as
4473 * appropriate.  This is a convenience function to reduce duplicate code
4474 * for the setup and revert functions below.
4475 */
4476static struct inodedep *
4477inodedep_lookup_ip(ip)
4478	struct inode *ip;
4479{
4480	struct inodedep *inodedep;
4481	int dflags;
4482
4483	KASSERT(ip->i_nlink >= ip->i_effnlink,
4484	    ("inodedep_lookup_ip: bad delta"));
4485	dflags = DEPALLOC;
4486	if (IS_SNAPSHOT(ip))
4487		dflags |= NODELAY;
4488	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4489	    &inodedep);
4490	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4491	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4492
4493	return (inodedep);
4494}
4495
4496/*
4497 * Called prior to creating a new inode and linking it to a directory.  The
4498 * jaddref structure must already be allocated by softdep_setup_inomapdep
4499 * and it is discovered here so we can initialize the mode and update
4500 * nlinkdelta.
4501 */
4502void
4503softdep_setup_create(dp, ip)
4504	struct inode *dp;
4505	struct inode *ip;
4506{
4507	struct inodedep *inodedep;
4508	struct jaddref *jaddref;
4509	struct vnode *dvp;
4510
4511	KASSERT(ip->i_nlink == 1,
4512	    ("softdep_setup_create: Invalid link count."));
4513	dvp = ITOV(dp);
4514	ACQUIRE_LOCK(&lk);
4515	inodedep = inodedep_lookup_ip(ip);
4516	if (DOINGSUJ(dvp)) {
4517		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4518		    inoreflst);
4519		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4520		    ("softdep_setup_create: No addref structure present."));
4521	}
4522	softdep_prelink(dvp, NULL);
4523	FREE_LOCK(&lk);
4524}
4525
4526/*
4527 * Create a jaddref structure to track the addition of a DOTDOT link when
4528 * we are reparenting an inode as part of a rename.  This jaddref will be
4529 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4530 * non-journaling softdep.
4531 */
4532void
4533softdep_setup_dotdot_link(dp, ip)
4534	struct inode *dp;
4535	struct inode *ip;
4536{
4537	struct inodedep *inodedep;
4538	struct jaddref *jaddref;
4539	struct vnode *dvp;
4540	struct vnode *vp;
4541
4542	dvp = ITOV(dp);
4543	vp = ITOV(ip);
4544	jaddref = NULL;
4545	/*
4546	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4547	 * is used as a normal link would be.
4548	 */
4549	if (DOINGSUJ(dvp))
4550		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4551		    dp->i_effnlink - 1, dp->i_mode);
4552	ACQUIRE_LOCK(&lk);
4553	inodedep = inodedep_lookup_ip(dp);
4554	if (jaddref)
4555		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4556		    if_deps);
4557	softdep_prelink(dvp, ITOV(ip));
4558	FREE_LOCK(&lk);
4559}
4560
4561/*
4562 * Create a jaddref structure to track a new link to an inode.  The directory
4563 * offset is not known until softdep_setup_directory_add or
4564 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4565 * softdep.
4566 */
4567void
4568softdep_setup_link(dp, ip)
4569	struct inode *dp;
4570	struct inode *ip;
4571{
4572	struct inodedep *inodedep;
4573	struct jaddref *jaddref;
4574	struct vnode *dvp;
4575
4576	dvp = ITOV(dp);
4577	jaddref = NULL;
4578	if (DOINGSUJ(dvp))
4579		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4580		    ip->i_mode);
4581	ACQUIRE_LOCK(&lk);
4582	inodedep = inodedep_lookup_ip(ip);
4583	if (jaddref)
4584		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4585		    if_deps);
4586	softdep_prelink(dvp, ITOV(ip));
4587	FREE_LOCK(&lk);
4588}
4589
4590/*
4591 * Called to create the jaddref structures to track . and .. references as
4592 * well as to look up and further initialize the incomplete jaddref created
4593 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4594 * nlinkdelta for non-journaling softdep.
4595 */
4596void
4597softdep_setup_mkdir(dp, ip)
4598	struct inode *dp;
4599	struct inode *ip;
4600{
4601	struct inodedep *inodedep;
4602	struct jaddref *dotdotaddref;
4603	struct jaddref *dotaddref;
4604	struct jaddref *jaddref;
4605	struct vnode *dvp;
4606
4607	dvp = ITOV(dp);
4608	dotaddref = dotdotaddref = NULL;
4609	if (DOINGSUJ(dvp)) {
4610		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4611		    ip->i_mode);
4612		dotaddref->ja_state |= MKDIR_BODY;
4613		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4614		    dp->i_effnlink - 1, dp->i_mode);
4615		dotdotaddref->ja_state |= MKDIR_PARENT;
4616	}
4617	ACQUIRE_LOCK(&lk);
4618	inodedep = inodedep_lookup_ip(ip);
4619	if (DOINGSUJ(dvp)) {
4620		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4621		    inoreflst);
4622		KASSERT(jaddref != NULL,
4623		    ("softdep_setup_mkdir: No addref structure present."));
4624		KASSERT(jaddref->ja_parent == dp->i_number,
4625		    ("softdep_setup_mkdir: bad parent %d",
4626		    jaddref->ja_parent));
4627		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4628		    if_deps);
4629	}
4630	inodedep = inodedep_lookup_ip(dp);
4631	if (DOINGSUJ(dvp))
4632		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4633		    &dotdotaddref->ja_ref, if_deps);
4634	softdep_prelink(ITOV(dp), NULL);
4635	FREE_LOCK(&lk);
4636}
4637
4638/*
4639 * Called to track nlinkdelta of the inode and parent directories prior to
4640 * unlinking a directory.
4641 */
4642void
4643softdep_setup_rmdir(dp, ip)
4644	struct inode *dp;
4645	struct inode *ip;
4646{
4647	struct vnode *dvp;
4648
4649	dvp = ITOV(dp);
4650	ACQUIRE_LOCK(&lk);
4651	(void) inodedep_lookup_ip(ip);
4652	(void) inodedep_lookup_ip(dp);
4653	softdep_prelink(dvp, ITOV(ip));
4654	FREE_LOCK(&lk);
4655}
4656
4657/*
4658 * Called to track nlinkdelta of the inode and parent directories prior to
4659 * unlink.
4660 */
4661void
4662softdep_setup_unlink(dp, ip)
4663	struct inode *dp;
4664	struct inode *ip;
4665{
4666	struct vnode *dvp;
4667
4668	dvp = ITOV(dp);
4669	ACQUIRE_LOCK(&lk);
4670	(void) inodedep_lookup_ip(ip);
4671	(void) inodedep_lookup_ip(dp);
4672	softdep_prelink(dvp, ITOV(ip));
4673	FREE_LOCK(&lk);
4674}
4675
4676/*
4677 * Called to release the journal structures created by a failed non-directory
4678 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4679 */
4680void
4681softdep_revert_create(dp, ip)
4682	struct inode *dp;
4683	struct inode *ip;
4684{
4685	struct inodedep *inodedep;
4686	struct jaddref *jaddref;
4687	struct vnode *dvp;
4688
4689	dvp = ITOV(dp);
4690	ACQUIRE_LOCK(&lk);
4691	inodedep = inodedep_lookup_ip(ip);
4692	if (DOINGSUJ(dvp)) {
4693		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4694		    inoreflst);
4695		KASSERT(jaddref->ja_parent == dp->i_number,
4696		    ("softdep_revert_create: addref parent mismatch"));
4697		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4698	}
4699	FREE_LOCK(&lk);
4700}
4701
4702/*
4703 * Called to release the journal structures created by a failed dotdot link
4704 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4705 */
4706void
4707softdep_revert_dotdot_link(dp, ip)
4708	struct inode *dp;
4709	struct inode *ip;
4710{
4711	struct inodedep *inodedep;
4712	struct jaddref *jaddref;
4713	struct vnode *dvp;
4714
4715	dvp = ITOV(dp);
4716	ACQUIRE_LOCK(&lk);
4717	inodedep = inodedep_lookup_ip(dp);
4718	if (DOINGSUJ(dvp)) {
4719		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4720		    inoreflst);
4721		KASSERT(jaddref->ja_parent == ip->i_number,
4722		    ("softdep_revert_dotdot_link: addref parent mismatch"));
4723		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4724	}
4725	FREE_LOCK(&lk);
4726}
4727
4728/*
4729 * Called to release the journal structures created by a failed link
4730 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4731 */
4732void
4733softdep_revert_link(dp, ip)
4734	struct inode *dp;
4735	struct inode *ip;
4736{
4737	struct inodedep *inodedep;
4738	struct jaddref *jaddref;
4739	struct vnode *dvp;
4740
4741	dvp = ITOV(dp);
4742	ACQUIRE_LOCK(&lk);
4743	inodedep = inodedep_lookup_ip(ip);
4744	if (DOINGSUJ(dvp)) {
4745		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4746		    inoreflst);
4747		KASSERT(jaddref->ja_parent == dp->i_number,
4748		    ("softdep_revert_link: addref parent mismatch"));
4749		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4750	}
4751	FREE_LOCK(&lk);
4752}
4753
4754/*
4755 * Called to release the journal structures created by a failed mkdir
4756 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4757 */
4758void
4759softdep_revert_mkdir(dp, ip)
4760	struct inode *dp;
4761	struct inode *ip;
4762{
4763	struct inodedep *inodedep;
4764	struct jaddref *jaddref;
4765	struct jaddref *dotaddref;
4766	struct vnode *dvp;
4767
4768	dvp = ITOV(dp);
4769
4770	ACQUIRE_LOCK(&lk);
4771	inodedep = inodedep_lookup_ip(dp);
4772	if (DOINGSUJ(dvp)) {
4773		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4774		    inoreflst);
4775		KASSERT(jaddref->ja_parent == ip->i_number,
4776		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4777		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4778	}
4779	inodedep = inodedep_lookup_ip(ip);
4780	if (DOINGSUJ(dvp)) {
4781		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4782		    inoreflst);
4783		KASSERT(jaddref->ja_parent == dp->i_number,
4784		    ("softdep_revert_mkdir: addref parent mismatch"));
4785		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4786		    inoreflst, if_deps);
4787		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4788		KASSERT(dotaddref->ja_parent == ip->i_number,
4789		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4790		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4791	}
4792	FREE_LOCK(&lk);
4793}
4794
4795/*
4796 * Called to correct nlinkdelta after a failed rmdir.
4797 */
4798void
4799softdep_revert_rmdir(dp, ip)
4800	struct inode *dp;
4801	struct inode *ip;
4802{
4803
4804	ACQUIRE_LOCK(&lk);
4805	(void) inodedep_lookup_ip(ip);
4806	(void) inodedep_lookup_ip(dp);
4807	FREE_LOCK(&lk);
4808}
4809
4810/*
4811 * Protecting the freemaps (or bitmaps).
4812 *
4813 * To eliminate the need to execute fsck before mounting a filesystem
4814 * after a power failure, one must (conservatively) guarantee that the
4815 * on-disk copy of the bitmaps never indicate that a live inode or block is
4816 * free.  So, when a block or inode is allocated, the bitmap should be
4817 * updated (on disk) before any new pointers.  When a block or inode is
4818 * freed, the bitmap should not be updated until all pointers have been
4819 * reset.  The latter dependency is handled by the delayed de-allocation
4820 * approach described below for block and inode de-allocation.  The former
4821 * dependency is handled by calling the following procedure when a block or
4822 * inode is allocated. When an inode is allocated an "inodedep" is created
4823 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4824 * Each "inodedep" is also inserted into the hash indexing structure so
4825 * that any additional link additions can be made dependent on the inode
4826 * allocation.
4827 *
4828 * The ufs filesystem maintains a number of free block counts (e.g., per
4829 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4830 * in addition to the bitmaps.  These counts are used to improve efficiency
4831 * during allocation and therefore must be consistent with the bitmaps.
4832 * There is no convenient way to guarantee post-crash consistency of these
4833 * counts with simple update ordering, for two main reasons: (1) The counts
4834 * and bitmaps for a single cylinder group block are not in the same disk
4835 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4836 * be written and the other not.  (2) Some of the counts are located in the
4837 * superblock rather than the cylinder group block. So, we focus our soft
4838 * updates implementation on protecting the bitmaps. When mounting a
4839 * filesystem, we recompute the auxiliary counts from the bitmaps.
4840 */
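/*
 * For example, when a new inode is allocated the cylinder group buffer
 * holding its bitmap must reach the disk before any block that points to
 * the inode; softdep_setup_inomapdep() below records that ordering as a
 * dependency on the cylinder group buffer.
 */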
4841
4842/*
4843 * Called just after updating the cylinder group block to allocate an inode.
4844 */
4845void
4846softdep_setup_inomapdep(bp, ip, newinum, mode)
4847	struct buf *bp;		/* buffer for cylgroup block with inode map */
4848	struct inode *ip;	/* inode related to allocation */
4849	ino_t newinum;		/* new inode number being allocated */
4850	int mode;
4851{
4852	struct inodedep *inodedep;
4853	struct bmsafemap *bmsafemap;
4854	struct jaddref *jaddref;
4855	struct mount *mp;
4856	struct fs *fs;
4857
4858	mp = UFSTOVFS(ip->i_ump);
4859	fs = ip->i_ump->um_fs;
4860	jaddref = NULL;
4861
4862	/*
4863	 * Allocate the journal reference add structure so that the bitmap
4864	 * can be dependent on it.
4865	 */
4866	if (MOUNTEDSUJ(mp)) {
4867		jaddref = newjaddref(ip, newinum, 0, 0, mode);
4868		jaddref->ja_state |= NEWBLOCK;
4869	}
4870
4871	/*
4872	 * Create a dependency for the newly allocated inode.
4873	 * Panic if it already exists as something is seriously wrong.
4874	 * Otherwise add it to the dependency list for the buffer holding
4875	 * the cylinder group map from which it was allocated.
4876	 *
4877	 * We have to preallocate a bmsafemap entry in case it is needed
4878	 * in bmsafemap_lookup since once we allocate the inodedep, we
4879	 * have to finish initializing it before we can FREE_LOCK().
4880	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
4881	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
4882	 * creating the inodedep as it can be freed during the time
4883	 * that we FREE_LOCK() while allocating the inodedep. We must
4884	 * call workitem_alloc() before entering the locked section as
4885	 * it also acquires the lock and we must avoid trying to do so
4886	 * recursively.
4887	 */
4888	bmsafemap = malloc(sizeof(struct bmsafemap),
4889	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4890	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4891	ACQUIRE_LOCK(&lk);
4892	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
4893		panic("softdep_setup_inomapdep: dependency %p for new "
4894		    "inode already exists", inodedep);
4895	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
4896	if (jaddref) {
4897		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4898		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4899		    if_deps);
4900	} else {
4901		inodedep->id_state |= ONDEPLIST;
4902		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4903	}
4904	inodedep->id_bmsafemap = bmsafemap;
4905	inodedep->id_state &= ~DEPCOMPLETE;
4906	FREE_LOCK(&lk);
4907}
4908
4909/*
4910 * Called just after updating the cylinder group block to
4911 * allocate block or fragment.
4912 */
4913void
4914softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4915	struct buf *bp;		/* buffer for cylgroup block with block map */
4916	struct mount *mp;	/* filesystem doing allocation */
4917	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4918	int frags;		/* Number of fragments. */
4919	int oldfrags;		/* Previous number of fragments for extend. */
4920{
4921	struct newblk *newblk;
4922	struct bmsafemap *bmsafemap;
4923	struct jnewblk *jnewblk;
4924	struct fs *fs;
4925
4926	fs = VFSTOUFS(mp)->um_fs;
4927	jnewblk = NULL;
4928	/*
4929	 * Create a dependency for the newly allocated block.
4930	 * Add it to the dependency list for the buffer holding
4931	 * the cylinder group map from which it was allocated.
4932	 */
4933	if (MOUNTEDSUJ(mp)) {
4934		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4935		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4936		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4937		jnewblk->jn_state = ATTACHED;
4938		jnewblk->jn_blkno = newblkno;
4939		jnewblk->jn_frags = frags;
4940		jnewblk->jn_oldfrags = oldfrags;
4941#ifdef SUJ_DEBUG
4942		{
4943			struct cg *cgp;
4944			uint8_t *blksfree;
4945			long bno;
4946			int i;
4947
4948			cgp = (struct cg *)bp->b_data;
4949			blksfree = cg_blksfree(cgp);
4950			bno = dtogd(fs, jnewblk->jn_blkno);
4951			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4952			    i++) {
4953				if (isset(blksfree, bno + i))
4954					panic("softdep_setup_blkmapdep: "
4955					    "free fragment %d from %d-%d "
4956					    "state 0x%X dep %p", i,
4957					    jnewblk->jn_oldfrags,
4958					    jnewblk->jn_frags,
4959					    jnewblk->jn_state,
4960					    jnewblk->jn_dep);
4961			}
4962		}
4963#endif
4964	}
4965
4966	CTR3(KTR_SUJ,
4967	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
4968	    newblkno, frags, oldfrags);
4969	ACQUIRE_LOCK(&lk);
4970	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4971		panic("softdep_setup_blkmapdep: found block");
4972	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4973	    dtog(fs, newblkno), NULL);
4974	if (jnewblk) {
4975		jnewblk->jn_dep = (struct worklist *)newblk;
4976		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4977	} else {
4978		newblk->nb_state |= ONDEPLIST;
4979		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4980	}
4981	newblk->nb_bmsafemap = bmsafemap;
4982	newblk->nb_jnewblk = jnewblk;
4983	FREE_LOCK(&lk);
4984}
4985
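/*
 * Hash a cylinder group number into the bmsafemap hash table, mixing in
 * the fs pointer so that cylinder groups from different filesystems do
 * not collide.
 */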
4986#define	BMSAFEMAP_HASH(fs, cg) \
4987      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4988
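/*
 * Search a bmsafemap hash chain for the entry matching the given mount
 * point and cylinder group.  Returns 1 and sets *bmsafemapp when found,
 * otherwise returns 0.
 */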
4989static int
4990bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4991	struct bmsafemap_hashhead *bmsafemaphd;
4992	struct mount *mp;
4993	int cg;
4994	struct bmsafemap **bmsafemapp;
4995{
4996	struct bmsafemap *bmsafemap;
4997
4998	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4999		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
5000			break;
5001	if (bmsafemap) {
5002		*bmsafemapp = bmsafemap;
5003		return (1);
5004	}
5005	*bmsafemapp = NULL;
5006
5007	return (0);
5008}
5009
5010/*
5011 * Find the bmsafemap associated with a cylinder group buffer.
5012 * If none exists, create one. The buffer must be locked when
5013 * this routine is called and this routine must be called with
5014 * the softdep lock held. To avoid giving up the lock while
5015 * allocating a new bmsafemap, a preallocated bmsafemap may be
5016 * provided. If it is provided but not needed, it is freed.
5017 */
5018static struct bmsafemap *
5019bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5020	struct mount *mp;
5021	struct buf *bp;
5022	int cg;
5023	struct bmsafemap *newbmsafemap;
5024{
5025	struct bmsafemap_hashhead *bmsafemaphd;
5026	struct bmsafemap *bmsafemap, *collision;
5027	struct worklist *wk;
5028	struct fs *fs;
5029
5030	mtx_assert(&lk, MA_OWNED);
5031	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5032	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5033		if (wk->wk_type == D_BMSAFEMAP) {
5034			if (newbmsafemap)
5035				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5036			return (WK_BMSAFEMAP(wk));
5037		}
5038	}
5039	fs = VFSTOUFS(mp)->um_fs;
5040	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
5041	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) {
5042		if (newbmsafemap)
5043			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5044		return (bmsafemap);
5045	}
5046	if (newbmsafemap) {
5047		bmsafemap = newbmsafemap;
5048	} else {
5049		FREE_LOCK(&lk);
5050		bmsafemap = malloc(sizeof(struct bmsafemap),
5051			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5052		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5053		ACQUIRE_LOCK(&lk);
5054	}
5055	bmsafemap->sm_buf = bp;
5056	LIST_INIT(&bmsafemap->sm_inodedephd);
5057	LIST_INIT(&bmsafemap->sm_inodedepwr);
5058	LIST_INIT(&bmsafemap->sm_newblkhd);
5059	LIST_INIT(&bmsafemap->sm_newblkwr);
5060	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5061	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5062	LIST_INIT(&bmsafemap->sm_freehd);
5063	LIST_INIT(&bmsafemap->sm_freewr);
5064	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
5065		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5066		return (collision);
5067	}
5068	bmsafemap->sm_cg = cg;
5069	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5070	LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next);
5071	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5072	return (bmsafemap);
5073}
5074
5075/*
5076 * Direct block allocation dependencies.
5077 *
5078 * When a new block is allocated, the corresponding disk locations must be
5079 * initialized (with zeros or new data) before the on-disk inode points to
5080 * them.  Also, the freemap from which the block was allocated must be
5081 * updated (on disk) before the inode's pointer. These two dependencies are
5082 * independent of each other and are needed for all file blocks and indirect
5083 * blocks that are pointed to directly by the inode.  Just before the
5084 * "in-core" version of the inode is updated with a newly allocated block
5085 * number, a procedure (below) is called to setup allocation dependency
5086 * structures.  These structures are removed when the corresponding
5087 * dependencies are satisfied or when the block allocation becomes obsolete
5088 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5089 * fragment that gets upgraded).  All of these cases are handled in
5090 * procedures described later.
5091 *
5092 * When a file extension causes a fragment to be upgraded, either to a larger
5093 * fragment or to a full block, the on-disk location may change (if the
5094 * previous fragment could not simply be extended). In this case, the old
5095 * fragment must be de-allocated, but not until after the inode's pointer has
5096 * been updated. In most cases, this is handled by later procedures, which
5097 * will construct a "freefrag" structure to be added to the workitem queue
5098 * when the inode update is complete (or obsolete).  The main exception to
5099 * this is when an allocation occurs while a pending allocation dependency
5100 * (for the same block pointer) remains.  This case is handled in the main
5101 * allocation dependency setup procedure by immediately freeing the
5102 * unreferenced fragments.
5103 */
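/*
 * For example, extending a file into a larger fragment typically creates
 * an allocdirect for the new pointer and a freefrag for the old fragment;
 * the freefrag is not processed until the inode update that stops
 * referencing the old fragment is complete.
 */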
5104void
5105softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5106	struct inode *ip;	/* inode to which block is being added */
5107	ufs_lbn_t off;		/* block pointer within inode */
5108	ufs2_daddr_t newblkno;	/* disk block number being added */
5109	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5110	long newsize;		/* size of new block */
5111	long oldsize;		/* size of old block */
5112	struct buf *bp;		/* bp for allocated block */
5113{
5114	struct allocdirect *adp, *oldadp;
5115	struct allocdirectlst *adphead;
5116	struct freefrag *freefrag;
5117	struct inodedep *inodedep;
5118	struct pagedep *pagedep;
5119	struct jnewblk *jnewblk;
5120	struct newblk *newblk;
5121	struct mount *mp;
5122	ufs_lbn_t lbn;
5123
5124	lbn = bp->b_lblkno;
5125	mp = UFSTOVFS(ip->i_ump);
5126	if (oldblkno && oldblkno != newblkno)
5127		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5128	else
5129		freefrag = NULL;
5130
5131	CTR6(KTR_SUJ,
5132	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5133	    "off %jd newsize %ld oldsize %d",
5134	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5135	ACQUIRE_LOCK(&lk);
5136	if (off >= NDADDR) {
5137		if (lbn > 0)
5138			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5139			    lbn, off);
5140		/* allocating an indirect block */
5141		if (oldblkno != 0)
5142			panic("softdep_setup_allocdirect: non-zero indir");
5143	} else {
5144		if (off != lbn)
5145			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5146			    lbn, off);
5147		/*
5148		 * Allocating a direct block.
5149		 *
5150		 * If we are allocating a directory block, then we must
5151		 * allocate an associated pagedep to track additions and
5152		 * deletions.
5153		 */
5154		if ((ip->i_mode & IFMT) == IFDIR)
5155			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5156			    &pagedep);
5157	}
5158	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5159		panic("softdep_setup_allocdirect: lost block");
5160	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5161	    ("softdep_setup_allocdirect: newblk already initialized"));
5162	/*
5163	 * Convert the newblk to an allocdirect.
5164	 */
5165	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5166	adp = (struct allocdirect *)newblk;
5167	newblk->nb_freefrag = freefrag;
5168	adp->ad_offset = off;
5169	adp->ad_oldblkno = oldblkno;
5170	adp->ad_newsize = newsize;
5171	adp->ad_oldsize = oldsize;
5172
5173	/*
5174	 * Finish initializing the journal.
5175	 */
5176	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5177		jnewblk->jn_ino = ip->i_number;
5178		jnewblk->jn_lbn = lbn;
5179		add_to_journal(&jnewblk->jn_list);
5180	}
5181	if (freefrag && freefrag->ff_jdep != NULL &&
5182	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5183		add_to_journal(freefrag->ff_jdep);
5184	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5185	adp->ad_inodedep = inodedep;
5186
5187	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5188	/*
5189	 * The list of allocdirects must be kept sorted in ascending
5190	 * order so that the rollback routines can quickly determine the
5191	 * first uncommitted block (the size of the file stored on disk
5192	 * ends at the end of the lowest committed fragment, or if there
5193	 * are no fragments, at the end of the highest committed block).
5194	 * Since files generally grow, the typical case is that the new
5195	 * block is to be added at the end of the list. We speed this
5196	 * special case by checking against the last allocdirect in the
5197	 * list before laboriously traversing the list looking for the
5198	 * insertion point.
5199	 */
5200	adphead = &inodedep->id_newinoupdt;
5201	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5202	if (oldadp == NULL || oldadp->ad_offset <= off) {
5203		/* insert at end of list */
5204		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5205		if (oldadp != NULL && oldadp->ad_offset == off)
5206			allocdirect_merge(adphead, adp, oldadp);
5207		FREE_LOCK(&lk);
5208		return;
5209	}
5210	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5211		if (oldadp->ad_offset >= off)
5212			break;
5213	}
5214	if (oldadp == NULL)
5215		panic("softdep_setup_allocdirect: lost entry");
5216	/* insert in middle of list */
5217	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5218	if (oldadp->ad_offset == off)
5219		allocdirect_merge(adphead, adp, oldadp);
5220
5221	FREE_LOCK(&lk);
5222}
5223
5224/*
5225 * Merge a newer and older journal record to be stored either in a
5226 * newblock or freefrag.  This handles aggregating journal records for
5227 * fragment allocation into a second record as well as replacing a
5228 * journal free with an aborted journal allocation.  A segment for the
5229 * oldest record will be placed on wkhd if it has been written.  If not,
5230 * the segment for the newer record will suffice.
5231 */
5232static struct worklist *
5233jnewblk_merge(new, old, wkhd)
5234	struct worklist *new;
5235	struct worklist *old;
5236	struct workhead *wkhd;
5237{
5238	struct jnewblk *njnewblk;
5239	struct jnewblk *jnewblk;
5240
5241	/* Handle NULLs to simplify callers. */
5242	if (new == NULL)
5243		return (old);
5244	if (old == NULL)
5245		return (new);
5246	/* Replace a jfreefrag with a jnewblk. */
5247	if (new->wk_type == D_JFREEFRAG) {
5248		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5249			panic("jnewblk_merge: blkno mismatch: %p, %p",
5250			    old, new);
5251		cancel_jfreefrag(WK_JFREEFRAG(new));
5252		return (old);
5253	}
5254	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5255		panic("jnewblk_merge: Bad type: old %d new %d\n",
5256		    old->wk_type, new->wk_type);
5257	/*
5258	 * Handle merging of two jnewblk records that describe
5259	 * different sets of fragments in the same block.
5260	 */
5261	jnewblk = WK_JNEWBLK(old);
5262	njnewblk = WK_JNEWBLK(new);
5263	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5264		panic("jnewblk_merge: Merging disparate blocks.");
5265	/*
5266	 * The record may be rolled back in the cg.
5267	 */
5268	if (jnewblk->jn_state & UNDONE) {
5269		jnewblk->jn_state &= ~UNDONE;
5270		njnewblk->jn_state |= UNDONE;
5271		njnewblk->jn_state &= ~ATTACHED;
5272	}
5273	/*
5274	 * We modify the newer addref and free the older so that if neither
5275	 * has been written the most up-to-date copy will be on disk.  If
5276	 * both have been written but rolled back we only temporarily need
5277	 * one of them to fix the bits when the cg write completes.
5278	 */
5279	jnewblk->jn_state |= ATTACHED | COMPLETE;
5280	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5281	cancel_jnewblk(jnewblk, wkhd);
5282	WORKLIST_REMOVE(&jnewblk->jn_list);
5283	free_jnewblk(jnewblk);
5284	return (new);
5285}
5286
5287/*
5288 * Replace an old allocdirect dependency with a newer one.
5289 * This routine must be called with splbio interrupts blocked.
5290 */
5291static void
5292allocdirect_merge(adphead, newadp, oldadp)
5293	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5294	struct allocdirect *newadp;	/* allocdirect being added */
5295	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5296{
5297	struct worklist *wk;
5298	struct freefrag *freefrag;
5299
5300	freefrag = NULL;
5301	mtx_assert(&lk, MA_OWNED);
5302	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5303	    newadp->ad_oldsize != oldadp->ad_newsize ||
5304	    newadp->ad_offset >= NDADDR)
5305		panic("%s %jd != new %jd || old size %ld != new %ld",
5306		    "allocdirect_merge: old blkno",
5307		    (intmax_t)newadp->ad_oldblkno,
5308		    (intmax_t)oldadp->ad_newblkno,
5309		    newadp->ad_oldsize, oldadp->ad_newsize);
5310	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5311	newadp->ad_oldsize = oldadp->ad_oldsize;
5312	/*
5313	 * If the old dependency had a fragment to free or had never
5314	 * previously had a block allocated, then the new dependency
5315	 * can immediately post its freefrag and adopt the old freefrag.
5316	 * This action is done by swapping the freefrag dependencies.
5317	 * The new dependency gains the old one's freefrag, and the
5318	 * old one gets the new one and then immediately puts it on
5319	 * the worklist when it is freed by free_newblk. It is
5320	 * not possible to do this swap when the old dependency had a
5321	 * non-zero size but no previous fragment to free. This condition
5322	 * arises when the new block is an extension of the old block.
5323	 * Here, the first part of the fragment allocated to the new
5324	 * dependency is part of the block currently claimed on disk by
5325	 * the old dependency, so cannot legitimately be freed until the
5326	 * conditions for the new dependency are fulfilled.
5327	 */
5328	freefrag = newadp->ad_freefrag;
5329	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5330		newadp->ad_freefrag = oldadp->ad_freefrag;
5331		oldadp->ad_freefrag = freefrag;
5332	}
5333	/*
5334	 * If we are tracking a new directory-block allocation,
5335	 * move it from the old allocdirect to the new allocdirect.
5336	 */
5337	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5338		WORKLIST_REMOVE(wk);
5339		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5340			panic("allocdirect_merge: extra newdirblk");
5341		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5342	}
5343	TAILQ_REMOVE(adphead, oldadp, ad_next);
5344	/*
5345	 * We need to move any journal dependencies over to the freefrag
5346	 * that releases this block if it exists.  Otherwise we are
5347	 * extending an existing block and we'll wait until that is
5348	 * complete to release the journal space and extend the
5349	 * new journal to cover this old space as well.
5350	 */
5351	if (freefrag == NULL) {
5352		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5353			panic("allocdirect_merge: %jd != %jd",
5354			    oldadp->ad_newblkno, newadp->ad_newblkno);
5355		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5356		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5357		    &oldadp->ad_block.nb_jnewblk->jn_list,
5358		    &newadp->ad_block.nb_jwork);
5359		oldadp->ad_block.nb_jnewblk = NULL;
5360		cancel_newblk(&oldadp->ad_block, NULL,
5361		    &newadp->ad_block.nb_jwork);
5362	} else {
5363		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5364		    &freefrag->ff_list, &freefrag->ff_jwork);
5365		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5366		    &freefrag->ff_jwork);
5367	}
5368	free_newblk(&oldadp->ad_block);
5369}
5370
5371/*
5372 * Allocate a jfreefrag structure to journal a single block free.
5373 */
5374static struct jfreefrag *
5375newjfreefrag(freefrag, ip, blkno, size, lbn)
5376	struct freefrag *freefrag;
5377	struct inode *ip;
5378	ufs2_daddr_t blkno;
5379	long size;
5380	ufs_lbn_t lbn;
5381{
5382	struct jfreefrag *jfreefrag;
5383	struct fs *fs;
5384
5385	fs = ip->i_fs;
5386	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5387	    M_SOFTDEP_FLAGS);
5388	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5389	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5390	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5391	jfreefrag->fr_ino = ip->i_number;
5392	jfreefrag->fr_lbn = lbn;
5393	jfreefrag->fr_blkno = blkno;
5394	jfreefrag->fr_frags = numfrags(fs, size);
5395	jfreefrag->fr_freefrag = freefrag;
5396
5397	return (jfreefrag);
5398}
5399
5400/*
5401 * Allocate a new freefrag structure.
5402 */
5403static struct freefrag *
5404newfreefrag(ip, blkno, size, lbn)
5405	struct inode *ip;
5406	ufs2_daddr_t blkno;
5407	long size;
5408	ufs_lbn_t lbn;
5409{
5410	struct freefrag *freefrag;
5411	struct fs *fs;
5412
5413	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5414	    ip->i_number, blkno, size, lbn);
5415	fs = ip->i_fs;
5416	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5417		panic("newfreefrag: frag size");
5418	freefrag = malloc(sizeof(struct freefrag),
5419	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5420	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5421	freefrag->ff_state = ATTACHED;
5422	LIST_INIT(&freefrag->ff_jwork);
5423	freefrag->ff_inum = ip->i_number;
5424	freefrag->ff_vtype = ITOV(ip)->v_type;
5425	freefrag->ff_blkno = blkno;
5426	freefrag->ff_fragsize = size;
5427
5428	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5429		freefrag->ff_jdep = (struct worklist *)
5430		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5431	} else {
5432		freefrag->ff_state |= DEPCOMPLETE;
5433		freefrag->ff_jdep = NULL;
5434	}
5435
5436	return (freefrag);
5437}
5438
5439/*
5440 * This workitem de-allocates fragments that were replaced during
5441 * file block allocation.
5442 */
5443static void
5444handle_workitem_freefrag(freefrag)
5445	struct freefrag *freefrag;
5446{
5447	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5448	struct workhead wkhd;
5449
5450	CTR3(KTR_SUJ,
5451	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5452	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5453	/*
5454	 * It would be illegal to add new completion items to the
5455	 * freefrag after it was scheduled to be done, so it must be
5456	 * safe to modify the list head here.
5457	 */
5458	LIST_INIT(&wkhd);
5459	ACQUIRE_LOCK(&lk);
5460	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5461	/*
5462	 * If the journal has not been written we must cancel it here.
5463	 */
5464	if (freefrag->ff_jdep) {
5465		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5466			panic("handle_workitem_freefrag: Unexpected type %d\n",
5467			    freefrag->ff_jdep->wk_type);
5468		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5469	}
5470	FREE_LOCK(&lk);
5471	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5472	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5473	ACQUIRE_LOCK(&lk);
5474	WORKITEM_FREE(freefrag, D_FREEFRAG);
5475	FREE_LOCK(&lk);
5476}
5477
5478/*
5479 * Set up a dependency structure for an external attributes data block.
5480 * This routine follows much of the structure of softdep_setup_allocdirect.
5481 * See the description of softdep_setup_allocdirect above for details.
5482 */
5483void
5484softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5485	struct inode *ip;
5486	ufs_lbn_t off;
5487	ufs2_daddr_t newblkno;
5488	ufs2_daddr_t oldblkno;
5489	long newsize;
5490	long oldsize;
5491	struct buf *bp;
5492{
5493	struct allocdirect *adp, *oldadp;
5494	struct allocdirectlst *adphead;
5495	struct freefrag *freefrag;
5496	struct inodedep *inodedep;
5497	struct jnewblk *jnewblk;
5498	struct newblk *newblk;
5499	struct mount *mp;
5500	ufs_lbn_t lbn;
5501
5502	if (off >= NXADDR)
5503		panic("softdep_setup_allocext: lbn %lld >= NXADDR",
5504		    (long long)off);
5505
5506	lbn = bp->b_lblkno;
5507	mp = UFSTOVFS(ip->i_ump);
5508	if (oldblkno && oldblkno != newblkno)
5509		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5510	else
5511		freefrag = NULL;
5512
5513	ACQUIRE_LOCK(&lk);
5514	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5515		panic("softdep_setup_allocext: lost block");
5516	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5517	    ("softdep_setup_allocext: newblk already initialized"));
5518	/*
5519	 * Convert the newblk to an allocdirect.
5520	 */
5521	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5522	adp = (struct allocdirect *)newblk;
5523	newblk->nb_freefrag = freefrag;
5524	adp->ad_offset = off;
5525	adp->ad_oldblkno = oldblkno;
5526	adp->ad_newsize = newsize;
5527	adp->ad_oldsize = oldsize;
5528	adp->ad_state |= EXTDATA;
5529
5530	/*
5531	 * Finish initializing the journal.
5532	 */
5533	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5534		jnewblk->jn_ino = ip->i_number;
5535		jnewblk->jn_lbn = lbn;
5536		add_to_journal(&jnewblk->jn_list);
5537	}
5538	if (freefrag && freefrag->ff_jdep != NULL &&
5539	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5540		add_to_journal(freefrag->ff_jdep);
5541	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5542	adp->ad_inodedep = inodedep;
5543
5544	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5545	/*
5546	 * The list of allocdirects must be kept in sorted, ascending
5547	 * order so that the rollback routines can quickly determine the
5548	 * first uncommitted block (the size of the file stored on disk
5549	 * ends at the end of the lowest committed fragment, or if there
5550	 * are no fragments, at the end of the highest committed block).
5551	 * Since files generally grow, the typical case is that the new
5552	 * block is to be added at the end of the list. We speed this
5553	 * special case by checking against the last allocdirect in the
5554	 * list before laboriously traversing the list looking for the
5555	 * insertion point.
5556	 */
5557	adphead = &inodedep->id_newextupdt;
5558	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5559	if (oldadp == NULL || oldadp->ad_offset <= off) {
5560		/* insert at end of list */
5561		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5562		if (oldadp != NULL && oldadp->ad_offset == off)
5563			allocdirect_merge(adphead, adp, oldadp);
5564		FREE_LOCK(&lk);
5565		return;
5566	}
5567	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5568		if (oldadp->ad_offset >= off)
5569			break;
5570	}
5571	if (oldadp == NULL)
5572		panic("softdep_setup_allocext: lost entry");
5573	/* insert in middle of list */
5574	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5575	if (oldadp->ad_offset == off)
5576		allocdirect_merge(adphead, adp, oldadp);
5577	FREE_LOCK(&lk);
5578}
5579
5580/*
5581 * Indirect block allocation dependencies.
5582 *
5583 * The same dependencies that exist for a direct block also exist when
5584 * a new block is allocated and pointed to by an entry in a block of
5585 * indirect pointers. The undo/redo states described above are also
5586 * used here. Because an indirect block contains many pointers that
5587 * may have dependencies, a second copy of the entire in-memory indirect
5588 * block is kept. The buffer cache copy is always completely up-to-date.
5589 * The second copy, which is used only as a source for disk writes,
5590 * contains only the safe pointers (i.e., those that have no remaining
5591 * update dependencies). The second copy is freed when all pointers
5592 * are safe. The cache is not allowed to replace indirect blocks with
5593 * pending update dependencies. If a buffer containing an indirect
5594 * block with dependencies is written, these routines will mark it
5595 * dirty again. It can only be successfully written once all the
5596 * dependencies are removed. The ffs_fsync routine works in conjunction
5597 * with softdep_sync_metadata to get all the dependencies
5598 * removed so that a file can be successfully written to disk. Three
5599 * procedures are used when setting up indirect block pointer
5600 * dependencies. The division is necessary because of the organization
5601 * of the "balloc" routine and because of the distinction between file
5602 * pages and file metadata blocks.
5603 */
5604
5605/*
5606 * Allocate a new allocindir structure.
5607 */
5608static struct allocindir *
5609newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5610	struct inode *ip;	/* inode for file being extended */
5611	int ptrno;		/* offset of pointer in indirect block */
5612	ufs2_daddr_t newblkno;	/* disk block number being added */
5613	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5614	ufs_lbn_t lbn;
5615{
5616	struct newblk *newblk;
5617	struct allocindir *aip;
5618	struct freefrag *freefrag;
5619	struct jnewblk *jnewblk;
5620
5621	if (oldblkno)
5622		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5623	else
5624		freefrag = NULL;
5625	ACQUIRE_LOCK(&lk);
5626	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5627		panic("newallocindir: lost block");
5628	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5629	    ("newallocindir: newblk already initialized"));
5630	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5631	newblk->nb_freefrag = freefrag;
5632	aip = (struct allocindir *)newblk;
5633	aip->ai_offset = ptrno;
5634	aip->ai_oldblkno = oldblkno;
5635	aip->ai_lbn = lbn;
5636	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5637		jnewblk->jn_ino = ip->i_number;
5638		jnewblk->jn_lbn = lbn;
5639		add_to_journal(&jnewblk->jn_list);
5640	}
5641	if (freefrag && freefrag->ff_jdep != NULL &&
5642	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5643		add_to_journal(freefrag->ff_jdep);
5644	return (aip);
5645}
5646
5647/*
5648 * Called just before setting an indirect block pointer
5649 * to a newly allocated file page.
5650 */
5651void
5652softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5653	struct inode *ip;	/* inode for file being extended */
5654	ufs_lbn_t lbn;		/* allocated block number within file */
5655	struct buf *bp;		/* buffer with indirect blk referencing page */
5656	int ptrno;		/* offset of pointer in indirect block */
5657	ufs2_daddr_t newblkno;	/* disk block number being added */
5658	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5659	struct buf *nbp;	/* buffer holding allocated page */
5660{
5661	struct inodedep *inodedep;
5662	struct freefrag *freefrag;
5663	struct allocindir *aip;
5664	struct pagedep *pagedep;
5665	struct mount *mp;
5666	int dflags;
5667
5668	if (lbn != nbp->b_lblkno)
5669		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5670		    lbn, nbp->b_lblkno);
5671	CTR4(KTR_SUJ,
5672	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5673	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5674	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5675	mp = UFSTOVFS(ip->i_ump);
5676	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5677	dflags = DEPALLOC;
5678	if (IS_SNAPSHOT(ip))
5679		dflags |= NODELAY;
5680	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5681	/*
5682	 * If we are allocating a directory page, then we must
5683	 * allocate an associated pagedep to track additions and
5684	 * deletions.
5685	 */
5686	if ((ip->i_mode & IFMT) == IFDIR)
5687		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5688	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5689	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5690	FREE_LOCK(&lk);
5691	if (freefrag)
5692		handle_workitem_freefrag(freefrag);
5693}
5694
5695/*
5696 * Called just before setting an indirect block pointer to a
5697 * newly allocated indirect block.
5698 */
5699void
5700softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5701	struct buf *nbp;	/* newly allocated indirect block */
5702	struct inode *ip;	/* inode for file being extended */
5703	struct buf *bp;		/* indirect block referencing allocated block */
5704	int ptrno;		/* offset of pointer in indirect block */
5705	ufs2_daddr_t newblkno;	/* disk block number being added */
5706{
5707	struct inodedep *inodedep;
5708	struct allocindir *aip;
5709	ufs_lbn_t lbn;
5710	int dflags;
5711
5712	CTR3(KTR_SUJ,
5713	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5714	    ip->i_number, newblkno, ptrno);
5715	lbn = nbp->b_lblkno;
5716	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5717	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5718	dflags = DEPALLOC;
5719	if (IS_SNAPSHOT(ip))
5720		dflags |= NODELAY;
5721	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5722	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5723	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5724		panic("softdep_setup_allocindir_meta: Block already existed");
5725	FREE_LOCK(&lk);
5726}
5727
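/*
 * Detach an indirdep from the newblk it was waiting on and mark it
 * DEPCOMPLETE.  Any allocindirs that were only awaiting this event are
 * released, and the indirdep itself is freed if it is no longer
 * attached to a buffer.
 */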
5728static void
5729indirdep_complete(indirdep)
5730	struct indirdep *indirdep;
5731{
5732	struct allocindir *aip;
5733
5734	LIST_REMOVE(indirdep, ir_next);
5735	indirdep->ir_state |= DEPCOMPLETE;
5736
5737	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5738		LIST_REMOVE(aip, ai_next);
5739		free_newblk(&aip->ai_block);
5740	}
5741	/*
5742	 * If this indirdep is not attached to a buf it was simply waiting
5743	 * on completion to clear completehd.  free_indirdep() asserts
5744	 * that nothing is dangling.
5745	 */
5746	if ((indirdep->ir_state & ONWORKLIST) == 0)
5747		free_indirdep(indirdep);
5748}
5749
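/*
 * Look up the indirdep for an indirect block buffer, allocating and
 * initializing a new one, including the saved copy of the block used
 * as the source for disk writes, if none is found on the buffer's
 * dependency list.
 */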
5750static struct indirdep *
5751indirdep_lookup(mp, ip, bp)
5752	struct mount *mp;
5753	struct inode *ip;
5754	struct buf *bp;
5755{
5756	struct indirdep *indirdep, *newindirdep;
5757	struct newblk *newblk;
5758	struct worklist *wk;
5759	struct fs *fs;
5760	ufs2_daddr_t blkno;
5761
5762	mtx_assert(&lk, MA_OWNED);
5763	indirdep = NULL;
5764	newindirdep = NULL;
5765	fs = ip->i_fs;
5766	for (;;) {
5767		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5768			if (wk->wk_type != D_INDIRDEP)
5769				continue;
5770			indirdep = WK_INDIRDEP(wk);
5771			break;
5772		}
5773		/* Found on the buffer worklist, no new structure to free. */
5774		if (indirdep != NULL && newindirdep == NULL)
5775			return (indirdep);
5776		if (indirdep != NULL && newindirdep != NULL)
5777			panic("indirdep_lookup: simultaneous create");
5778		/* None found on the buffer and a new structure is ready. */
5779		if (indirdep == NULL && newindirdep != NULL)
5780			break;
5781		/* None found and no new structure available. */
5782		FREE_LOCK(&lk);
5783		newindirdep = malloc(sizeof(struct indirdep),
5784		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5785		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5786		newindirdep->ir_state = ATTACHED;
5787		if (ip->i_ump->um_fstype == UFS1)
5788			newindirdep->ir_state |= UFS1FMT;
5789		TAILQ_INIT(&newindirdep->ir_trunc);
5790		newindirdep->ir_saveddata = NULL;
5791		LIST_INIT(&newindirdep->ir_deplisthd);
5792		LIST_INIT(&newindirdep->ir_donehd);
5793		LIST_INIT(&newindirdep->ir_writehd);
5794		LIST_INIT(&newindirdep->ir_completehd);
5795		if (bp->b_blkno == bp->b_lblkno) {
5796			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5797			    NULL, NULL);
5798			bp->b_blkno = blkno;
5799		}
5800		newindirdep->ir_freeblks = NULL;
5801		newindirdep->ir_savebp =
5802		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5803		newindirdep->ir_bp = bp;
5804		BUF_KERNPROC(newindirdep->ir_savebp);
5805		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5806		ACQUIRE_LOCK(&lk);
5807	}
5808	indirdep = newindirdep;
5809	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5810	/*
5811	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5812	 * that we don't free dependencies until the pointers are valid.
5813	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5814	 * than using the hash.
5815	 */
5816	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5817		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5818	else
5819		indirdep->ir_state |= DEPCOMPLETE;
5820	return (indirdep);
5821}
5822
5823/*
5824 * Called to finish the allocation of the "aip" allocated
5825 * by one of the two routines above.
5826 */
5827static struct freefrag *
5828setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5829	struct buf *bp;		/* in-memory copy of the indirect block */
5830	struct inode *ip;	/* inode for file being extended */
5831	struct inodedep *inodedep; /* Inodedep for ip */
5832	struct allocindir *aip;	/* allocindir allocated by the above routines */
5833	ufs_lbn_t lbn;		/* Logical block number for this block. */
5834{
5835	struct fs *fs;
5836	struct indirdep *indirdep;
5837	struct allocindir *oldaip;
5838	struct freefrag *freefrag;
5839	struct mount *mp;
5840
5841	mtx_assert(&lk, MA_OWNED);
5842	mp = UFSTOVFS(ip->i_ump);
5843	fs = ip->i_fs;
5844	if (bp->b_lblkno >= 0)
5845		panic("setup_allocindir_phase2: not indir blk");
5846	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
5847	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
5848	indirdep = indirdep_lookup(mp, ip, bp);
5849	KASSERT(indirdep->ir_savebp != NULL,
5850	    ("setup_allocindir_phase2 NULL ir_savebp"));
5851	aip->ai_indirdep = indirdep;
5852	/*
5853	 * Check for an unwritten dependency for this indirect offset.  If
5854	 * there is, merge the old dependency into the new one.  This happens
5855	 * as a result of reallocblk only.
5856	 */
5857	freefrag = NULL;
5858	if (aip->ai_oldblkno != 0) {
5859		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
5860			if (oldaip->ai_offset == aip->ai_offset) {
5861				freefrag = allocindir_merge(aip, oldaip);
5862				goto done;
5863			}
5864		}
5865		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
5866			if (oldaip->ai_offset == aip->ai_offset) {
5867				freefrag = allocindir_merge(aip, oldaip);
5868				goto done;
5869			}
5870		}
5871	}
5872done:
5873	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5874	return (freefrag);
5875}
5876
5877/*
5878 * Merge two allocindirs which refer to the same block.  Move newblock
5879 * dependencies and setup the freefrags appropriately.
5880 */
5881static struct freefrag *
5882allocindir_merge(aip, oldaip)
5883	struct allocindir *aip;
5884	struct allocindir *oldaip;
5885{
5886	struct freefrag *freefrag;
5887	struct worklist *wk;
5888
5889	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5890		panic("allocindir_merge: blkno");
5891	aip->ai_oldblkno = oldaip->ai_oldblkno;
5892	freefrag = aip->ai_freefrag;
5893	aip->ai_freefrag = oldaip->ai_freefrag;
5894	oldaip->ai_freefrag = NULL;
5895	KASSERT(freefrag != NULL, ("allocindir_merge: No freefrag"));
5896	/*
5897	 * If we are tracking a new directory-block allocation,
5898	 * move it from the old allocindir to the new allocindir.
5899	 */
5900	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5901		WORKLIST_REMOVE(wk);
5902		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5903			panic("allocindir_merge: extra newdirblk");
5904		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
5905	}
5906	/*
5907	 * We can skip journaling for this freefrag and just complete
5908	 * any pending journal work for the allocindir that is being
5909	 * removed after the freefrag completes.
5910	 */
5911	if (freefrag->ff_jdep)
5912		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
5913	LIST_REMOVE(oldaip, ai_next);
5914	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
5915	    &freefrag->ff_list, &freefrag->ff_jwork);
5916	free_newblk(&oldaip->ai_block);
5917
5918	return (freefrag);
5919}
5920
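/*
 * Clear a direct block pointer in the inode and set up a freework to
 * release the block it referenced.
 */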
5921static inline void
5922setup_freedirect(freeblks, ip, i, needj)
5923	struct freeblks *freeblks;
5924	struct inode *ip;
5925	int i;
5926	int needj;
5927{
5928	ufs2_daddr_t blkno;
5929	int frags;
5930
5931	blkno = DIP(ip, i_db[i]);
5932	if (blkno == 0)
5933		return;
5934	DIP_SET(ip, i_db[i], 0);
5935	frags = sblksize(ip->i_fs, ip->i_size, i);
5936	frags = numfrags(ip->i_fs, frags);
5937	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
5938}
5939
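/*
 * Clear an external attribute block pointer and set up a freework to
 * release the block it referenced.
 */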
5940static inline void
5941setup_freeext(freeblks, ip, i, needj)
5942	struct freeblks *freeblks;
5943	struct inode *ip;
5944	int i;
5945	int needj;
5946{
5947	ufs2_daddr_t blkno;
5948	int frags;
5949
5950	blkno = ip->i_din2->di_extb[i];
5951	if (blkno == 0)
5952		return;
5953	ip->i_din2->di_extb[i] = 0;
5954	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
5955	frags = numfrags(ip->i_fs, frags);
5956	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
5957}
5958
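/*
 * Clear an indirect block pointer in the inode and set up a freework
 * to release the whole indirect tree it referenced.
 */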
5959static inline void
5960setup_freeindir(freeblks, ip, i, lbn, needj)
5961	struct freeblks *freeblks;
5962	struct inode *ip;
5963	int i;
5964	ufs_lbn_t lbn;
5965	int needj;
5966{
5967	ufs2_daddr_t blkno;
5968
5969	blkno = DIP(ip, i_ib[i]);
5970	if (blkno == 0)
5971		return;
5972	DIP_SET(ip, i_ib[i], 0);
5973	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
5974	    0, needj);
5975}
5976
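/*
 * Allocate and initialize a freeblks structure that will track the
 * release of this inode's blocks.
 */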
5977static inline struct freeblks *
5978newfreeblks(mp, ip)
5979	struct mount *mp;
5980	struct inode *ip;
5981{
5982	struct freeblks *freeblks;
5983
5984	freeblks = malloc(sizeof(struct freeblks),
5985		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5986	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5987	LIST_INIT(&freeblks->fb_jblkdephd);
5988	LIST_INIT(&freeblks->fb_jwork);
5989	freeblks->fb_ref = 0;
5990	freeblks->fb_cgwait = 0;
5991	freeblks->fb_state = ATTACHED;
5992	freeblks->fb_uid = ip->i_uid;
5993	freeblks->fb_inum = ip->i_number;
5994	freeblks->fb_vtype = ITOV(ip)->v_type;
5995	freeblks->fb_modrev = DIP(ip, i_modrev);
5996	freeblks->fb_devvp = ip->i_devvp;
5997	freeblks->fb_chkcnt = 0;
5998	freeblks->fb_len = 0;
5999
6000	return (freeblks);
6001}
6002
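/*
 * Cancel any allocindirs in an indirdep whose pointers lie beyond the
 * truncation point 'off' so that they never make it to disk.
 */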
6003static void
6004trunc_indirdep(indirdep, freeblks, bp, off)
6005	struct indirdep *indirdep;
6006	struct freeblks *freeblks;
6007	struct buf *bp;
6008	int off;
6009{
6010	struct allocindir *aip, *aipn;
6011
6012	/*
6013	 * The first set of allocindirs won't be in savedbp.
6014	 */
6015	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6016		if (aip->ai_offset > off)
6017			cancel_allocindir(aip, bp, freeblks, 1);
6018	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6019		if (aip->ai_offset > off)
6020			cancel_allocindir(aip, bp, freeblks, 1);
6021	/*
6022	 * These will exist in savedbp.
6023	 */
6024	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6025		if (aip->ai_offset > off)
6026			cancel_allocindir(aip, NULL, freeblks, 0);
6027	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6028		if (aip->ai_offset > off)
6029			cancel_allocindir(aip, NULL, freeblks, 0);
6030}
6031
6032/*
6033 * Follow the chain of indirects down to lastlbn creating a freework
6034 * structure for each.  This will be used to start indir_trunc() at
6035 * the right offset and create the journal records for the partial
6036 * truncation.  A second step will handle the truncated dependencies.
6037 */
6038static int
6039setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6040	struct freeblks *freeblks;
6041	struct inode *ip;
6042	ufs_lbn_t lbn;
6043	ufs_lbn_t lastlbn;
6044	ufs2_daddr_t blkno;
6045{
6046	struct indirdep *indirdep;
6047	struct indirdep *indirn;
6048	struct freework *freework;
6049	struct newblk *newblk;
6050	struct mount *mp;
6051	struct buf *bp;
6052	uint8_t *start;
6053	uint8_t *end;
6054	ufs_lbn_t lbnadd;
6055	int level;
6056	int error;
6057	int off;
6058
6060	freework = NULL;
6061	if (blkno == 0)
6062		return (0);
6063	mp = freeblks->fb_list.wk_mp;
6064	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6065	if ((bp->b_flags & B_CACHE) == 0) {
6066		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6067		bp->b_iocmd = BIO_READ;
6068		bp->b_flags &= ~B_INVAL;
6069		bp->b_ioflags &= ~BIO_ERROR;
6070		vfs_busy_pages(bp, 0);
6071		bp->b_iooffset = dbtob(bp->b_blkno);
6072		bstrategy(bp);
6073		curthread->td_ru.ru_inblock++;
6074		error = bufwait(bp);
6075		if (error) {
6076			brelse(bp);
6077			return (error);
6078		}
6079	}
6080	level = lbn_level(lbn);
6081	lbnadd = lbn_offset(ip->i_fs, level);
6082	/*
6083	 * Compute the offset of the last block we want to keep.  Store
6084	 * in the freework the first block we want to completely free.
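	 * Here -(lbn + level) is the first data lbn mapped by this indirect
	 * and lbnadd is the number of data blocks spanned by each pointer at
	 * this level, so off indexes the pointer whose subtree still
	 * contains lastlbn.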
6085	 */
6086	off = (lastlbn - -(lbn + level)) / lbnadd;
6087	if (off + 1 == NINDIR(ip->i_fs))
6088		goto nowork;
6089	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
6090	    0);
6091	/*
6092	 * Link the freework into the indirdep.  This will prevent any new
6093	 * allocations from proceeding until we are finished with the
6094	 * truncate and the block is written.
6095	 */
6096	ACQUIRE_LOCK(&lk);
6097	indirdep = indirdep_lookup(mp, ip, bp);
6098	if (indirdep->ir_freeblks)
6099		panic("setup_trunc_indir: indirdep already truncated.");
6100	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6101	freework->fw_indir = indirdep;
6102	/*
6103	 * Cancel any allocindirs that will not make it to disk.
6104	 * We have to do this for all copies of the indirdep that
6105	 * live on this newblk.
6106	 */
6107	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6108		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
6109		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6110			trunc_indirdep(indirn, freeblks, bp, off);
6111	} else
6112		trunc_indirdep(indirdep, freeblks, bp, off);
6113	FREE_LOCK(&lk);
6114	/*
6115	 * Creation is protected by the buf lock. The saveddata is only
6116	 * needed if a full truncation follows a partial truncation but it
6117	 * is difficult to allocate in that case so we fetch it anyway.
6118	 */
6119	if (indirdep->ir_saveddata == NULL)
6120		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6121		    M_SOFTDEP_FLAGS);
6122nowork:
6123	/* Fetch the blkno of the child and the zero start offset. */
6124	if (ip->i_ump->um_fstype == UFS1) {
6125		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6126		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6127	} else {
6128		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6129		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6130	}
6131	if (freework) {
6132		/* Zero the truncated pointers. */
6133		end = bp->b_data + bp->b_bcount;
6134		bzero(start, end - start);
6135		bdwrite(bp);
6136	} else
6137		bqrelse(bp);
6138	if (level == 0)
6139		return (0);
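	/*
	 * Descend to the child indirect at index 'off', the one that still
	 * maps lastlbn, and continue the traversal there.
	 */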
6140	lbn++; /* adjust level */
6141	lbn -= (off * lbnadd);
6142	return (setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno));
6143}
6144
6145/*
6146 * Complete the partial truncation of an indirect block setup by
6147 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6148 * copy and writes them to disk before the freeblks is allowed to complete.
6149 */
6150static void
6151complete_trunc_indir(freework)
6152	struct freework *freework;
6153{
6154	struct freework *fwn;
6155	struct indirdep *indirdep;
6156	struct buf *bp;
6157	uintptr_t start;
6158	int count;
6159
6160	indirdep = freework->fw_indir;
6161	for (;;) {
6162		bp = indirdep->ir_bp;
6163		/* See if the block was discarded. */
6164		if (bp == NULL)
6165			break;
6166		/* Inline part of getdirtybuf().  We don't want bremfree. */
6167		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6168			break;
6169		if (BUF_LOCK(bp,
6170		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0)
6171			BUF_UNLOCK(bp);
6172		ACQUIRE_LOCK(&lk);
6173	}
6174	mtx_assert(&lk, MA_OWNED);
6175	freework->fw_state |= DEPCOMPLETE;
6176	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6177	/*
6178	 * Zero the pointers in the saved copy.
6179	 */
6180	if (indirdep->ir_state & UFS1FMT)
6181		start = sizeof(ufs1_daddr_t);
6182	else
6183		start = sizeof(ufs2_daddr_t);
6184	start *= freework->fw_start;
6185	count = indirdep->ir_savebp->b_bcount - start;
6186	start += (uintptr_t)indirdep->ir_savebp->b_data;
6187	bzero((char *)start, count);
6188	/*
6189	 * We need to start the next truncation in the list if it has not
6190	 * been started yet.
6191	 */
6192	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6193	if (fwn != NULL) {
6194		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6195			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6196		if ((fwn->fw_state & ONWORKLIST) == 0)
6197			freework_enqueue(fwn);
6198	}
6199	/*
6200	 * If bp is NULL the block was fully truncated, so restore
6201	 * the saved block list; otherwise free it, since it is no
6202	 * longer needed.
6203	 */
6204	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6205		if (bp == NULL)
6206			bcopy(indirdep->ir_saveddata,
6207			    indirdep->ir_savebp->b_data,
6208			    indirdep->ir_savebp->b_bcount);
6209		free(indirdep->ir_saveddata, M_INDIRDEP);
6210		indirdep->ir_saveddata = NULL;
6211	}
6212	/*
6213	 * When bp is NULL there is a full truncation pending.  We
6214	 * must wait for this full truncation to be journaled before
6215	 * we can release this freework because the disk pointers will
6216	 * never be written as zero.
6217	 */
6218	if (bp == NULL)  {
6219		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6220			handle_written_freework(freework);
6221		else
6222			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6223			   &freework->fw_list);
6224	} else {
6225		/* Complete when the real copy is written. */
6226		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6227		BUF_UNLOCK(bp);
6228	}
6229}
6230
6231/*
6232 * Calculate the number of blocks we are going to release where datablocks
6233 * is the current total and length is the new file size.
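 * For example, assuming a UFS2 filesystem with 32K blocks and 4K
 * fragments (NINDIR(fs) == 4096, 8 fragments per block), a length
 * spanning 100 blocks requires 800 fragments of file data plus 8
 * fragments for the single indirect block, giving totblks == 808
 * before the final fsbtodb() conversion.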
6234 */
6235ufs2_daddr_t
6236blkcount(fs, datablocks, length)
6237	struct fs *fs;
6238	ufs2_daddr_t datablocks;
6239	off_t length;
6240{
6241	off_t totblks, numblks;
6242
6243	totblks = 0;
6244	numblks = howmany(length, fs->fs_bsize);
6245	if (numblks <= NDADDR) {
6246		totblks = howmany(length, fs->fs_fsize);
6247		goto out;
6248	}
6249	totblks = blkstofrags(fs, numblks);
6250	numblks -= NDADDR;
6251	/*
6252	 * Count all single, then double, then triple indirects required.
6253	 * Subtracting one indirect's worth of blocks for each pass
6254	 * acknowledges one of each pointed to by the inode.
6255	 */
6256	for (;;) {
6257		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6258		numblks -= NINDIR(fs);
6259		if (numblks <= 0)
6260			break;
6261		numblks = howmany(numblks, NINDIR(fs));
6262	}
6263out:
6264	totblks = fsbtodb(fs, totblks);
6265	/*
6266	 * Handle sparse files.  We can't reclaim more blocks than the inode
6267	 * references.  We will correct it later in handle_complete_freeblks()
6268	 * when we know the real count.
6269	 */
6270	if (totblks > datablocks)
6271		return (0);
6272	return (datablocks - totblks);
6273}
6274
6275/*
6276 * Handle freeblocks for journaled softupdate filesystems.
6277 *
6278 * Contrary to normal softupdates, we must preserve the block pointers in
6279 * indirects until their subordinates are free.  This is to avoid journaling
6280 * every block that is freed which may consume more space than the journal
6281 * itself.  The recovery program will see the free block journals at the
6282 * base of the truncated area and traverse them to reclaim space.  The
6283 * pointers in the inode may be cleared immediately after the journal
6284 * records are written because each direct and indirect pointer in the
6285 * inode is recorded in a journal.  This permits full truncation to proceed
6286 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6287 *
6288 * The algorithm is as follows:
6289 * 1) Traverse the in-memory state and create journal entries to release
6290 *    the relevant blocks and full indirect trees.
6291 * 2) Traverse the indirect block chain adding partial truncation freework
6292 *    records to indirects in the path to lastlbn.  The freework will
6293 *    prevent new allocation dependencies from being satisfied in this
6294 *    indirect until the truncation completes.
6295 * 3) Read and lock the inode block, performing an update with the new size
6296 *    and pointers.  This prevents truncated data from becoming valid on
6297 *    disk through step 4.
6298 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6299 *    and eliminate journal work for those records that do not require it.
6300 * 5) Schedule the journal records to be written followed by the inode block.
6301 * 6) Allocate any necessary frags for the end of file.
6302 * 7) Zero any partially truncated blocks.
6303 *
6304 * From this point truncation proceeds asynchronously using the freework and
6305 * indir_trunc machinery.  The file will not be extended again into a
6306 * partially truncated indirect block until all work is completed but
6307 * the normal dependency mechanism ensures that it is rolled back/forward
6308 * as appropriate.  Further truncation may occur without delay and is
6309 * serialized in indir_trunc().
6310 */
6311void
6312softdep_journal_freeblocks(ip, cred, length, flags)
6313	struct inode *ip;	/* The inode whose length is to be reduced */
6314	struct ucred *cred;
6315	off_t length;		/* The new length for the file */
6316	int flags;		/* IO_EXT and/or IO_NORMAL */
6317{
6318	struct freeblks *freeblks, *fbn;
6319	struct worklist *wk, *wkn;
6320	struct inodedep *inodedep;
6321	struct jblkdep *jblkdep;
6322	struct allocdirect *adp, *adpn;
6323	struct fs *fs;
6324	struct buf *bp;
6325	struct vnode *vp;
6326	struct mount *mp;
6327	ufs2_daddr_t extblocks, datablocks;
6328	ufs_lbn_t tmpval, lbn, lastlbn;
6329	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6330
6331	fs = ip->i_fs;
6332	mp = UFSTOVFS(ip->i_ump);
6333	vp = ITOV(ip);
6334	needj = 1;
6335	iboff = -1;
6336	allocblock = 0;
6337	extblocks = 0;
6338	datablocks = 0;
6339	frags = 0;
6340	freeblks = newfreeblks(mp, ip);
6341	ACQUIRE_LOCK(&lk);
6342	/*
6343	 * If we're truncating a removed file that will never be written
6344	 * we don't need to journal the block frees.  The canceled journals
6345	 * for the allocations will suffice.
6346	 */
6347	dflags = DEPALLOC;
6348	if (IS_SNAPSHOT(ip))
6349		dflags |= NODELAY;
6350	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6351	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6352	    length == 0)
6353		needj = 0;
6354	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6355	    ip->i_number, length, needj);
6356	FREE_LOCK(&lk);
6357	/*
6358	 * Calculate the lbn that we are truncating to.  This results in -1
6359	 * if we're truncating to 0 bytes.  So it is the last lbn we want
6360	 * to keep, not the first lbn we want to truncate.
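	 * For example, on a filesystem with 16K blocks, truncating to a
	 * length of 20000 leaves lastlbn == 1, while truncating to 0 leaves
	 * lastlbn == -1.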
6361	 */
6362	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6363	lastoff = blkoff(fs, length);
6364	/*
6365	 * Compute frags we are keeping in lastlbn.  0 means all.
6366	 */
6367	if (lastlbn >= 0 && lastlbn < NDADDR) {
6368		frags = fragroundup(fs, lastoff);
6369		/* adp offset of last valid allocdirect. */
6370		iboff = lastlbn;
6371	} else if (lastlbn > 0)
6372		iboff = NDADDR;
6373	if (fs->fs_magic == FS_UFS2_MAGIC)
6374		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6375	/*
6376	 * Handle normal data blocks and indirects.  This section saves
6377	 * values used after the inode update to complete frag and indirect
6378	 * truncation.
6379	 */
6380	if ((flags & IO_NORMAL) != 0) {
6381		/*
6382		 * Handle truncation of whole direct and indirect blocks.
6383		 */
6384		for (i = iboff + 1; i < NDADDR; i++)
6385			setup_freedirect(freeblks, ip, i, needj);
6386		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6387		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6388			/* Release a whole indirect tree. */
6389			if (lbn > lastlbn) {
6390				setup_freeindir(freeblks, ip, i, -lbn - i,
6391				    needj);
6392				continue;
6393			}
6394			iboff = i + NDADDR;
6395			/*
6396			 * Traverse partially truncated indirect tree.
6397			 */
6398			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6399				setup_trunc_indir(freeblks, ip, -lbn - i,
6400				    lastlbn, DIP(ip, i_ib[i]));
6401		}
6402		/*
6403		 * Handle partial truncation to a frag boundary.
6404		 */
6405		if (frags) {
6406			ufs2_daddr_t blkno;
6407			long oldfrags;
6408
6409			oldfrags = blksize(fs, ip, lastlbn);
6410			blkno = DIP(ip, i_db[lastlbn]);
6411			if (blkno && oldfrags != frags) {
6412				oldfrags -= frags;
6413				oldfrags = numfrags(ip->i_fs, oldfrags);
6414				blkno += numfrags(ip->i_fs, frags);
6415				newfreework(ip->i_ump, freeblks, NULL, lastlbn,
6416				    blkno, oldfrags, 0, needj);
6417			} else if (blkno == 0)
6418				allocblock = 1;
6419		}
6420		/*
6421		 * Add a journal record for partial truncate if we are
6422		 * handling indirect blocks.  Non-indirects need no extra
6423		 * journaling.
6424		 */
6425		if (length != 0 && lastlbn >= NDADDR) {
6426			ip->i_flag |= IN_TRUNCATED;
6427			newjtrunc(freeblks, length, 0);
6428		}
6429		ip->i_size = length;
6430		DIP_SET(ip, i_size, ip->i_size);
6431		datablocks = DIP(ip, i_blocks) - extblocks;
6432		if (length != 0)
6433			datablocks = blkcount(ip->i_fs, datablocks, length);
6434		freeblks->fb_len = length;
6435	}
6436	if ((flags & IO_EXT) != 0) {
6437		for (i = 0; i < NXADDR; i++)
6438			setup_freeext(freeblks, ip, i, needj);
6439		ip->i_din2->di_extsize = 0;
6440		datablocks += extblocks;
6441	}
6442#ifdef QUOTA
6443	/* Reference the quotas in case the block count is wrong in the end. */
6444	quotaref(vp, freeblks->fb_quota);
6445	(void) chkdq(ip, -datablocks, NOCRED, 0);
6446#endif
6447	freeblks->fb_chkcnt = -datablocks;
6448	UFS_LOCK(ip->i_ump);
6449	fs->fs_pendingblocks += datablocks;
6450	UFS_UNLOCK(ip->i_ump);
6451	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6452	/*
6453	 * Handle truncation of incomplete alloc direct dependencies.  We
6454	 * hold the inode block locked to prevent incomplete dependencies
6455	 * from reaching the disk while we are eliminating those that
6456	 * have been truncated.  This is a partially inlined ffs_update().
6457	 */
6458	ufs_itimes(vp);
6459	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6460	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6461	    (int)fs->fs_bsize, cred, &bp);
6462	if (error) {
6463		brelse(bp);
6464		softdep_error("softdep_journal_freeblocks", error);
6465		return;
6466	}
6467	if (bp->b_bufsize == fs->fs_bsize)
6468		bp->b_flags |= B_CLUSTEROK;
6469	softdep_update_inodeblock(ip, bp, 0);
6470	if (ip->i_ump->um_fstype == UFS1)
6471		*((struct ufs1_dinode *)bp->b_data +
6472		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6473	else
6474		*((struct ufs2_dinode *)bp->b_data +
6475		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6476	ACQUIRE_LOCK(&lk);
6477	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6478	if ((inodedep->id_state & IOSTARTED) != 0)
6479		panic("softdep_journal_freeblocks: inode busy");
6480	/*
6481	 * Add the freeblks structure to the list of operations that
6482	 * must await the zero'ed inode being written to disk. If we
6483	 * still have a bitmap dependency (needj), then the inode
6484	 * has never been written to disk, so we can process the
6485	 * freeblks below once we have deleted the dependencies.
6486	 */
6487	if (needj)
6488		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6489	else
6490		freeblks->fb_state |= COMPLETE;
6491	if ((flags & IO_NORMAL) != 0) {
6492		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6493			if (adp->ad_offset > iboff)
6494				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6495				    freeblks);
6496			/*
6497			 * Truncate the allocdirect.  We could eliminate
6498			 * or modify journal records as well.
6499			 */
6500			else if (adp->ad_offset == iboff && frags)
6501				adp->ad_newsize = frags;
6502		}
6503	}
6504	if ((flags & IO_EXT) != 0)
6505		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6506			cancel_allocdirect(&inodedep->id_extupdt, adp,
6507			    freeblks);
6508	/*
6509	 * Scan the bufwait list for newblock dependencies that will never
6510	 * make it to disk.
6511	 */
6512	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6513		if (wk->wk_type != D_ALLOCDIRECT)
6514			continue;
6515		adp = WK_ALLOCDIRECT(wk);
6516		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6517		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6518			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6519			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6520			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6521		}
6522	}
6523	/*
6524	 * Add journal work.
6525	 */
6526	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6527		add_to_journal(&jblkdep->jb_list);
6528	FREE_LOCK(&lk);
6529	bdwrite(bp);
6530	/*
6531	 * Truncate dependency structures beyond length.
6532	 */
6533	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6534	/*
6535	 * This is only set when we need to allocate a fragment because
6536	 * none existed at the end of a frag-sized file.  It handles only
6537	 * allocating a new, zero filled block.
6538	 */
6539	if (allocblock) {
6540		ip->i_size = length - lastoff;
6541		DIP_SET(ip, i_size, ip->i_size);
6542		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6543		if (error != 0) {
6544			softdep_error("softdep_journal_freeblocks", error);
6545			return;
6546		}
6547		ip->i_size = length;
6548		DIP_SET(ip, i_size, length);
6549		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6550		allocbuf(bp, frags);
6551		ffs_update(vp, 0);
6552		bawrite(bp);
6553	} else if (lastoff != 0 && vp->v_type != VDIR) {
6554		int size;
6555
6556		/*
6557		 * Zero the end of a truncated frag or block.
6558		 */
6559		size = sblksize(fs, length, lastlbn);
6560		error = bread(vp, lastlbn, size, cred, &bp);
6561		if (error) {
6562			softdep_error("softdep_journal_freeblocks", error);
6563			return;
6564		}
6565		bzero((char *)bp->b_data + lastoff, size - lastoff);
6566		bawrite(bp);
6567
6568	}
6569	ACQUIRE_LOCK(&lk);
6570	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6571	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6572	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6573	/*
6574	 * We zero earlier truncations so they don't erroneously
6575	 * update i_blocks.
6576	 */
6577	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6578		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6579			fbn->fb_len = 0;
6580	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6581	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6582		freeblks->fb_state |= INPROGRESS;
6583	else
6584		freeblks = NULL;
6585	FREE_LOCK(&lk);
6586	if (freeblks)
6587		handle_workitem_freeblocks(freeblks, 0);
6588	trunc_pages(ip, length, extblocks, flags);
6590}
6591
6592/*
6593 * Flush a JOP_SYNC to the journal.
6594 */
6595void
6596softdep_journal_fsync(ip)
6597	struct inode *ip;
6598{
6599	struct jfsync *jfsync;
6600
6601	if ((ip->i_flag & IN_TRUNCATED) == 0)
6602		return;
6603	ip->i_flag &= ~IN_TRUNCATED;
6604	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6605	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6606	jfsync->jfs_size = ip->i_size;
6607	jfsync->jfs_ino = ip->i_number;
6608	ACQUIRE_LOCK(&lk);
6609	add_to_journal(&jfsync->jfs_list);
6610	jwait(&jfsync->jfs_list, MNT_WAIT);
6611	FREE_LOCK(&lk);
6612}
6613
6614/*
6615 * Block de-allocation dependencies.
6616 *
6617 * When blocks are de-allocated, the on-disk pointers must be nullified before
6618 * the blocks are made available for use by other files.  (The true
6619 * requirement is that old pointers must be nullified before new on-disk
6620 * pointers are set.  We chose this slightly more stringent requirement to
6621 * reduce complexity.) Our implementation handles this dependency by updating
6622 * the inode (or indirect block) appropriately but delaying the actual block
6623 * de-allocation (i.e., freemap and free space count manipulation) until
6624 * after the updated versions reach stable storage.  After the disk is
6625 * updated, the blocks can be safely de-allocated whenever it is convenient.
6626 * This implementation handles only the common case of reducing a file's
6627 * length to zero. Other cases are handled by the conventional synchronous
6628 * write approach.
6629 *
6630 * The ffs implementation with which we worked double-checks
6631 * the state of the block pointers and file size as it reduces
6632 * a file's length.  Some of this code is replicated here in our
6633 * soft updates implementation.  The freeblks->fb_chkcnt field is
6634 * used to transfer a part of this information to the procedure
6635 * that eventually de-allocates the blocks.
6636 *
6637 * This routine should be called from the routine that shortens
6638 * a file's length, before the inode's size or block pointers
6639 * are modified. It will save the block pointer information for
6640 * later release and zero the inode so that the calling routine
6641 * can release it.
6642 */
6643void
6644softdep_setup_freeblocks(ip, length, flags)
6645	struct inode *ip;	/* The inode whose length is to be reduced */
6646	off_t length;		/* The new length for the file */
6647	int flags;		/* IO_EXT and/or IO_NORMAL */
6648{
6649	struct ufs1_dinode *dp1;
6650	struct ufs2_dinode *dp2;
6651	struct freeblks *freeblks;
6652	struct inodedep *inodedep;
6653	struct allocdirect *adp;
6654	struct buf *bp;
6655	struct fs *fs;
6656	ufs2_daddr_t extblocks, datablocks;
6657	struct mount *mp;
6658	int i, delay, error, dflags;
6659	ufs_lbn_t tmpval;
6660	ufs_lbn_t lbn;
6661
6662	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6663	    ip->i_number, length);
6664	fs = ip->i_fs;
6665	mp = UFSTOVFS(ip->i_ump);
6666	if (length != 0)
6667		panic("softdep_setup_freeblocks: non-zero length");
6668	freeblks = newfreeblks(mp, ip);
6669	extblocks = 0;
6670	datablocks = 0;
6671	if (fs->fs_magic == FS_UFS2_MAGIC)
6672		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6673	if ((flags & IO_NORMAL) != 0) {
6674		for (i = 0; i < NDADDR; i++)
6675			setup_freedirect(freeblks, ip, i, 0);
6676		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6677		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6678			setup_freeindir(freeblks, ip, i, -lbn - i, 0);
6679		ip->i_size = 0;
6680		DIP_SET(ip, i_size, 0);
6681		datablocks = DIP(ip, i_blocks) - extblocks;
6682	}
6683	if ((flags & IO_EXT) != 0) {
6684		for (i = 0; i < NXADDR; i++)
6685			setup_freeext(freeblks, ip, i, 0);
6686		ip->i_din2->di_extsize = 0;
6687		datablocks += extblocks;
6688	}
6689#ifdef QUOTA
6690	/* Reference the quotas in case the block count is wrong in the end. */
6691	quotaref(ITOV(ip), freeblks->fb_quota);
6692	(void) chkdq(ip, -datablocks, NOCRED, 0);
6693#endif
6694	freeblks->fb_chkcnt = -datablocks;
6695	UFS_LOCK(ip->i_ump);
6696	fs->fs_pendingblocks += datablocks;
6697	UFS_UNLOCK(ip->i_ump);
6698	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6699	/*
6700	 * Push the zero'ed inode to its disk buffer so that we are free
6701	 * to delete its dependencies below. Once the dependencies are gone
6702	 * the buffer can be safely released.
6703	 */
6704	if ((error = bread(ip->i_devvp,
6705	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6706	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6707		brelse(bp);
6708		softdep_error("softdep_setup_freeblocks", error);
6709	}
6710	if (ip->i_ump->um_fstype == UFS1) {
6711		dp1 = ((struct ufs1_dinode *)bp->b_data +
6712		    ino_to_fsbo(fs, ip->i_number));
6713		ip->i_din1->di_freelink = dp1->di_freelink;
6714		*dp1 = *ip->i_din1;
6715	} else {
6716		dp2 = ((struct ufs2_dinode *)bp->b_data +
6717		    ino_to_fsbo(fs, ip->i_number));
6718		ip->i_din2->di_freelink = dp2->di_freelink;
6719		*dp2 = *ip->i_din2;
6720	}
6721	/*
6722	 * Find and eliminate any inode dependencies.
6723	 */
6724	ACQUIRE_LOCK(&lk);
6725	dflags = DEPALLOC;
6726	if (IS_SNAPSHOT(ip))
6727		dflags |= NODELAY;
6728	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6729	if ((inodedep->id_state & IOSTARTED) != 0)
6730		panic("softdep_setup_freeblocks: inode busy");
6731	/*
6732	 * Add the freeblks structure to the list of operations that
6733	 * must await the zero'ed inode being written to disk. If we
6734	 * still have a bitmap dependency (delay == 0), then the inode
6735	 * has never been written to disk, so we can process the
6736	 * freeblks below once we have deleted the dependencies.
6737	 */
6738	delay = (inodedep->id_state & DEPCOMPLETE);
6739	if (delay)
6740		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6741	else
6742		freeblks->fb_state |= COMPLETE;
6743	/*
6744	 * Because the file length has been truncated to zero, any
6745	 * pending block allocation dependency structures associated
6746	 * with this inode are obsolete and can simply be de-allocated.
6747	 * We must first merge the two dependency lists to get rid of
6748	 * any duplicate freefrag structures, then purge the merged list.
6749	 * If we still have a bitmap dependency, then the inode has never
6750	 * been written to disk, so we can free any fragments without delay.
6751	 */
6752	if (flags & IO_NORMAL) {
6753		merge_inode_lists(&inodedep->id_newinoupdt,
6754		    &inodedep->id_inoupdt);
6755		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6756			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6757			    freeblks);
6758	}
6759	if (flags & IO_EXT) {
6760		merge_inode_lists(&inodedep->id_newextupdt,
6761		    &inodedep->id_extupdt);
6762		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6763			cancel_allocdirect(&inodedep->id_extupdt, adp,
6764			    freeblks);
6765	}
6766	FREE_LOCK(&lk);
6767	bdwrite(bp);
6768	trunc_dependencies(ip, freeblks, -1, 0, flags);
6769	ACQUIRE_LOCK(&lk);
6770	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6771		(void) free_inodedep(inodedep);
6772	freeblks->fb_state |= DEPCOMPLETE;
6773	/*
6774	 * If the inode with zeroed block pointers is now on disk
6775	 * we can start freeing blocks.
6776	 */
6777	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6778		freeblks->fb_state |= INPROGRESS;
6779	else
6780		freeblks = NULL;
6781	FREE_LOCK(&lk);
6782	if (freeblks)
6783		handle_workitem_freeblocks(freeblks, 0);
6784	trunc_pages(ip, length, extblocks, flags);
6785}
6786
6787/*
6788 * Eliminate pages from the page cache that back parts of this inode and
6789 * adjust the vnode pager's idea of our size.  This prevents stale data
6790 * from hanging around in the page cache.
6791 */
6792static void
6793trunc_pages(ip, length, extblocks, flags)
6794	struct inode *ip;
6795	off_t length;
6796	ufs2_daddr_t extblocks;
6797	int flags;
6798{
6799	struct vnode *vp;
6800	struct fs *fs;
6801	ufs_lbn_t lbn;
6802	off_t end, extend;
6803
6804	vp = ITOV(ip);
6805	fs = ip->i_fs;
6806	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6807	if ((flags & IO_EXT) != 0)
6808		vn_pages_remove(vp, extend, 0);
6809	if ((flags & IO_NORMAL) == 0)
6810		return;
6811	BO_LOCK(&vp->v_bufobj);
6812	drain_output(vp);
6813	BO_UNLOCK(&vp->v_bufobj);
6814	/*
6815	 * The vnode pager eliminates file pages; we eliminate indirects
6816	 * below.
6817	 */
6818	vnode_pager_setsize(vp, length);
6819	/*
6820	 * Calculate the end based on the last indirect we want to keep.  If
6821	 * the block extends into indirects we can just use the negative of
6822	 * its lbn.  Doubles and triples exist at lower numbers so we must
6823	 * be careful not to remove those, if they exist.  double and triple
6824	 * be careful not to remove those, if they exist.  Double and triple
6825	 * to verify how many levels are required.
6826	 */
6827	lbn = lblkno(fs, length);
6828	if (lbn >= NDADDR) {
6829		/* Calculate the virtual lbn of the triple indirect. */
6830		lbn = -lbn - (NIADDR - 1);
6831		end = OFF_TO_IDX(lblktosize(fs, lbn));
6832	} else
6833		end = extend;
6834	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
6835}
6836
6837/*
6838 * See if the buf bp is in the range eliminated by truncation.
6839 */
6840static int
6841trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
6842	struct buf *bp;
6843	int *blkoffp;
6844	ufs_lbn_t lastlbn;
6845	int lastoff;
6846	int flags;
6847{
6848	ufs_lbn_t lbn;
6849
6850	*blkoffp = 0;
6851	/* Only match ext/normal blocks as appropriate. */
6852	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
6853	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
6854		return (0);
6855	/* ALTDATA is always a full truncation. */
6856	if ((bp->b_xflags & BX_ALTDATA) != 0)
6857		return (1);
6858	/* -1 is full truncation. */
6859	if (lastlbn == -1)
6860		return (1);
6861	/*
6862	 * If this is a partial truncate we only want those
6863	 * blocks and indirect blocks that cover the range
6864	 * we're after.
6865	 */
6866	lbn = bp->b_lblkno;
6867	if (lbn < 0)
6868		lbn = -(lbn + lbn_level(lbn));
6869	if (lbn < lastlbn)
6870		return (0);
6871	/* Here we only truncate lblkno if it's partial. */
6872	if (lbn == lastlbn) {
6873		if (lastoff == 0)
6874			return (0);
6875		*blkoffp = lastoff;
6876	}
6877	return (1);
6878}
6879
6880/*
6881 * Eliminate any dependencies that exist in memory beyond lblkno:off
6882 */
6883static void
6884trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
6885	struct inode *ip;
6886	struct freeblks *freeblks;
6887	ufs_lbn_t lastlbn;
6888	int lastoff;
6889	int flags;
6890{
6891	struct bufobj *bo;
6892	struct vnode *vp;
6893	struct buf *bp;
6894	struct fs *fs;
6895	int blkoff;
6896
6897	/*
6898	 * We must wait for any I/O in progress to finish so that
6899	 * all potential buffers on the dirty list will be visible.
6900	 * Once they are all there, walk the list and get rid of
6901	 * any dependencies.
6902	 */
6903	fs = ip->i_fs;
6904	vp = ITOV(ip);
6905	bo = &vp->v_bufobj;
6906	BO_LOCK(bo);
6907	drain_output(vp);
6908	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
6909		bp->b_vflags &= ~BV_SCANNED;
6910restart:
6911	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
6912		if (bp->b_vflags & BV_SCANNED)
6913			continue;
6914		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6915			bp->b_vflags |= BV_SCANNED;
6916			continue;
6917		}
6918		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
6919			goto restart;
6920		BO_UNLOCK(bo);
6921		if (deallocate_dependencies(bp, freeblks, blkoff))
6922			bqrelse(bp);
6923		else
6924			brelse(bp);
6925		BO_LOCK(bo);
6926		goto restart;
6927	}
6928	/*
6929	 * Now do the work of vtruncbuf while also matching indirect blocks.
6930	 */
6931	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
6932		bp->b_vflags &= ~BV_SCANNED;
6933cleanrestart:
6934	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
6935		if (bp->b_vflags & BV_SCANNED)
6936			continue;
6937		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6938			bp->b_vflags |= BV_SCANNED;
6939			continue;
6940		}
6941		if (BUF_LOCK(bp,
6942		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6943		    BO_MTX(bo)) == ENOLCK) {
6944			BO_LOCK(bo);
6945			goto cleanrestart;
6946		}
6947		bp->b_vflags |= BV_SCANNED;
6948		BO_LOCK(bo);
6949		bremfree(bp);
6950		BO_UNLOCK(bo);
6951		if (blkoff != 0) {
6952			allocbuf(bp, blkoff);
6953			bqrelse(bp);
6954		} else {
6955			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
6956			brelse(bp);
6957		}
6958		BO_LOCK(bo);
6959		goto cleanrestart;
6960	}
6961	drain_output(vp);
6962	BO_UNLOCK(bo);
6963}
6964
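/*
 * Cancel a pagedep found on a buffer being truncated.  Directory
 * removal dependencies are moved to the freeblks work list; if a
 * journal write must complete first, we wait for it and return
 * ERESTART so the caller can rescan the buffer's dependencies.
 */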
6965static int
6966cancel_pagedep(pagedep, freeblks, blkoff)
6967	struct pagedep *pagedep;
6968	struct freeblks *freeblks;
6969	int blkoff;
6970{
6971	struct jremref *jremref;
6972	struct jmvref *jmvref;
6973	struct dirrem *dirrem, *tmp;
6974	int i;
6975
6976	/*
6977	 * Copy any directory remove dependencies to the list
6978	 * to be processed after the freeblks proceeds.  If
6979	 * to be processed after the freeblks proceeds.  If the
6980	 * directory entries never made it to disk they
6981	 */
6982	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
6983		/* Skip this directory removal if it is intended to remain. */
6984		if (dirrem->dm_offset < blkoff)
6985			continue;
6986		/*
6987		 * If there are any dirrems we wait for the journal write
6988		 * to complete and then restart the buf scan as the lock
6989		 * has been dropped.
6990		 */
6991		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
6992			jwait(&jremref->jr_list, MNT_WAIT);
6993			return (ERESTART);
6994		}
6995		LIST_REMOVE(dirrem, dm_next);
6996		dirrem->dm_dirinum = pagedep->pd_ino;
6997		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
6998	}
6999	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7000		jwait(&jmvref->jm_list, MNT_WAIT);
7001		return (ERESTART);
7002	}
7003	/*
7004	 * When we're partially truncating a pagedep we just want to flush
7005	 * journal entries and return.  There can not be any adds in the
7006	 * truncated portion of the directory and newblk must remain if
7007	 * part of the block remains.
7008	 */
7009	if (blkoff != 0) {
7010		struct diradd *dap;
7011
7012		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7013			if (dap->da_offset > blkoff)
7014				panic("cancel_pagedep: diradd %p off %d > %d",
7015				    dap, dap->da_offset, blkoff);
7016		for (i = 0; i < DAHASHSZ; i++)
7017			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7018				if (dap->da_offset > blkoff)
7019					panic("cancel_pagedep: diradd %p off %d > %d",
7020					    dap, dap->da_offset, blkoff);
7021		return (0);
7022	}
7023	/*
7024	 * There should be no directory add dependencies present
7025	 * as the directory could not be truncated until all
7026	 * children were removed.
7027	 */
7028	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7029	    ("cancel_pagedep: pendinghd != NULL"));
7030	for (i = 0; i < DAHASHSZ; i++)
7031		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7032		    ("cancel_pagedep: diraddhd != NULL"));
7033	if ((pagedep->pd_state & NEWBLOCK) != 0)
7034		free_newdirblk(pagedep->pd_newdirblk);
7035	if (free_pagedep(pagedep) == 0)
7036		panic("Failed to free pagedep %p", pagedep);
7037	return (0);
7038}
7039
7040/*
7041 * Reclaim any dependency structures from a buffer that is about to
7042 * be reallocated to a new vnode. The buffer must be locked, thus,
7043 * no I/O completion operations can occur while we are manipulating
7044 * its associated dependencies. The mutex is held so that other I/O's
7045 * associated with related dependencies do not occur.
7046 */
7047static int
7048deallocate_dependencies(bp, freeblks, off)
7049	struct buf *bp;
7050	struct freeblks *freeblks;
7051	int off;
7052{
7053	struct indirdep *indirdep;
7054	struct pagedep *pagedep;
7055	struct allocdirect *adp;
7056	struct worklist *wk, *wkn;
7057
7058	ACQUIRE_LOCK(&lk);
7059	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7060		switch (wk->wk_type) {
7061		case D_INDIRDEP:
7062			indirdep = WK_INDIRDEP(wk);
7063			if (bp->b_lblkno >= 0 ||
7064			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7065				panic("deallocate_dependencies: not indir");
7066			cancel_indirdep(indirdep, bp, freeblks);
7067			continue;
7068
7069		case D_PAGEDEP:
7070			pagedep = WK_PAGEDEP(wk);
7071			if (cancel_pagedep(pagedep, freeblks, off)) {
7072				FREE_LOCK(&lk);
7073				return (ERESTART);
7074			}
7075			continue;
7076
7077		case D_ALLOCINDIR:
7078			/*
7079			 * Simply remove the allocindir, we'll find it via
7080			 * the indirdep where we can clear pointers if
7081			 * needed.
7082			 */
7083			WORKLIST_REMOVE(wk);
7084			continue;
7085
7086		case D_FREEWORK:
7087			/*
7088			 * A truncation is waiting for the zero'd pointers
7089			 * to be written.  It can be freed when the freeblks
7090			 * is journaled.
7091			 */
7092			WORKLIST_REMOVE(wk);
7093			wk->wk_state |= ONDEPLIST;
7094			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7095			break;
7096
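		/*
		 * Allocdirects may legitimately remain on a partially
		 * truncated buffer.  A complete truncation should have
		 * none left, so fall through to the panic below.
		 */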
7097		case D_ALLOCDIRECT:
7098			adp = WK_ALLOCDIRECT(wk);
7099			if (off != 0)
7100				continue;
7101			/* FALLTHROUGH */
7102		default:
7103			panic("deallocate_dependencies: Unexpected type %s",
7104			    TYPENAME(wk->wk_type));
7105			/* NOTREACHED */
7106		}
7107	}
7108	FREE_LOCK(&lk);
7109	/*
7110	 * Don't throw away this buf; we were partially truncating and
7111	 * some deps may legitimately remain.
7112	 */
7113	if (off) {
7114		allocbuf(bp, off);
7115		bp->b_vflags |= BV_SCANNED;
7116		return (EBUSY);
7117	}
7118	bp->b_flags |= B_INVAL | B_NOCACHE;
7119
7120	return (0);
7121}
7122
7123/*
7124 * An allocdirect is being canceled due to a truncate.  We must make sure
7125 * the journal entry is released in concert with the blkfree that releases
7126 * the storage.  Completed journal entries must not be released until the
7127 * space is no longer pointed to by the inode or in the bitmap.
7128 */
7129static void
7130cancel_allocdirect(adphead, adp, freeblks)
7131	struct allocdirectlst *adphead;
7132	struct allocdirect *adp;
7133	struct freeblks *freeblks;
7134{
7135	struct freework *freework;
7136	struct newblk *newblk;
7137	struct worklist *wk;
7138
7139	TAILQ_REMOVE(adphead, adp, ad_next);
7140	newblk = (struct newblk *)adp;
7141	freework = NULL;
7142	/*
7143	 * Find the correct freework structure.
7144	 */
7145	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7146		if (wk->wk_type != D_FREEWORK)
7147			continue;
7148		freework = WK_FREEWORK(wk);
7149		if (freework->fw_blkno == newblk->nb_newblkno)
7150			break;
7151	}
7152	if (freework == NULL)
7153		panic("cancel_allocdirect: Freework not found");
7154	/*
7155	 * If a newblk exists at all we still have the journal entry that
7156	 * initiated the allocation so we do not need to journal the free.
7157	 */
7158	cancel_jfreeblk(freeblks, freework->fw_blkno);
7159	/*
7160	 * If the journal hasn't been written the jnewblk must be passed
7161	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7162	 * this by linking the journal dependency into the freework to be
7163	 * freed when freework_freeblock() is called.  If the journal has
7164	 * been written we can simply reclaim the journal space when the
7165	 * freeblks work is complete.
7166	 */
7167	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7168	    &freeblks->fb_jwork);
7169	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7170}
7171
7172
7173/*
7174 * Cancel a new block allocation.  May be an indirect or direct block.  We
7175 * remove it from various lists and return any journal record that needs to
7176 * be resolved by the caller.
7177 *
7178 * A special consideration is made for indirects which were never pointed
7179 * at on disk and will never be found once this block is released.
7180 */
7181static struct jnewblk *
7182cancel_newblk(newblk, wk, wkhd)
7183	struct newblk *newblk;
7184	struct worklist *wk;
7185	struct workhead *wkhd;
7186{
7187	struct jnewblk *jnewblk;
7188
7189	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7190
7191	newblk->nb_state |= GOINGAWAY;
7192	/*
7193	 * Previously we traversed the completedhd on each indirdep
7194	 * attached to this newblk to cancel them and gather journal
7195	 * work.  Since we need only the oldest journal segment and
7196	 * the lowest point on the tree will always have the oldest
7197	 * journal segment we are free to release the segments
7198	 * of any subordinates and may leave the indirdep list to
7199	 * indirdep_complete() when this newblk is freed.
7200	 */
7201	if (newblk->nb_state & ONDEPLIST) {
7202		newblk->nb_state &= ~ONDEPLIST;
7203		LIST_REMOVE(newblk, nb_deps);
7204	}
7205	if (newblk->nb_state & ONWORKLIST)
7206		WORKLIST_REMOVE(&newblk->nb_list);
7207	/*
7208	 * If the journal entry hasn't been written we save a pointer to
7209	 * the dependency that frees it until it is written or the
7210	 * superseding operation completes.
7211	 */
7212	jnewblk = newblk->nb_jnewblk;
7213	if (jnewblk != NULL && wk != NULL) {
7214		newblk->nb_jnewblk = NULL;
7215		jnewblk->jn_dep = wk;
7216	}
7217	if (!LIST_EMPTY(&newblk->nb_jwork))
7218		jwork_move(wkhd, &newblk->nb_jwork);
7219	/*
7220	 * When truncating we must free the newdirblk early to remove
7221	 * the pagedep from the hash before returning.
7222	 */
7223	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7224		free_newdirblk(WK_NEWDIRBLK(wk));
7225	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7226		panic("cancel_newblk: extra newdirblk");
7227
7228	return (jnewblk);
7229}
7230
7231/*
7232 * Schedule the freefrag associated with a newblk to be released once
7233 * the pointers are written and the previous block is no longer needed.
7234 */
7235static void
7236newblk_freefrag(newblk)
7237	struct newblk *newblk;
7238{
7239	struct freefrag *freefrag;
7240
7241	if (newblk->nb_freefrag == NULL)
7242		return;
7243	freefrag = newblk->nb_freefrag;
7244	newblk->nb_freefrag = NULL;
7245	freefrag->ff_state |= COMPLETE;
7246	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7247		add_to_worklist(&freefrag->ff_list, 0);
7248}
7249
7250/*
7251 * Free a newblk. Generate a new freefrag work request if appropriate.
7252 * This must be called after the inode pointer and any direct block pointers
7253 * are valid or fully removed via truncate or frag extension.
7254 */
7255static void
7256free_newblk(newblk)
7257	struct newblk *newblk;
7258{
7259	struct indirdep *indirdep;
7260	struct worklist *wk;
7261
7262	KASSERT(newblk->nb_jnewblk == NULL,
7263	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7264	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7265	    ("free_newblk: unclaimed newblk"));
7266	mtx_assert(&lk, MA_OWNED);
7267	newblk_freefrag(newblk);
7268	if (newblk->nb_state & ONDEPLIST)
7269		LIST_REMOVE(newblk, nb_deps);
7270	if (newblk->nb_state & ONWORKLIST)
7271		WORKLIST_REMOVE(&newblk->nb_list);
7272	LIST_REMOVE(newblk, nb_hash);
7273	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7274		free_newdirblk(WK_NEWDIRBLK(wk));
7275	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7276		panic("free_newblk: extra newdirblk");
7277	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7278		indirdep_complete(indirdep);
7279	handle_jwork(&newblk->nb_jwork);
7280	WORKITEM_FREE(newblk, D_NEWBLK);
7281}
7282
7283/*
7284 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7285 * This routine must be called with splbio interrupts blocked.
7286 */
7287static void
7288free_newdirblk(newdirblk)
7289	struct newdirblk *newdirblk;
7290{
7291	struct pagedep *pagedep;
7292	struct diradd *dap;
7293	struct worklist *wk;
7294
7295	mtx_assert(&lk, MA_OWNED);
7296	WORKLIST_REMOVE(&newdirblk->db_list);
7297	/*
7298	 * If the pagedep is still linked onto the directory buffer
7299	 * dependency chain, then some of the entries on the
7300	 * pd_pendinghd list may not be committed to disk yet. In
7301	 * this case, we will simply clear the NEWBLOCK flag and
7302	 * let the pd_pendinghd list be processed when the pagedep
7303	 * is next written. If the pagedep is no longer on the buffer
7304	 * dependency chain, then all the entries on the pd_pending
7305	 * list are committed to disk and we can free them here.
7306	 */
7307	pagedep = newdirblk->db_pagedep;
7308	pagedep->pd_state &= ~NEWBLOCK;
7309	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7310		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7311			free_diradd(dap, NULL);
7312		/*
7313		 * If no dependencies remain, the pagedep will be freed.
7314		 */
7315		free_pagedep(pagedep);
7316	}
7317	/* Should only ever be one item in the list. */
7318	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7319		WORKLIST_REMOVE(wk);
7320		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7321	}
7322	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7323}
7324
7325/*
7326 * Prepare an inode to be freed. The actual free operation is not
7327 * done until the zero'ed inode has been written to disk.
7328 */
7329void
7330softdep_freefile(pvp, ino, mode)
7331	struct vnode *pvp;
7332	ino_t ino;
7333	int mode;
7334{
7335	struct inode *ip = VTOI(pvp);
7336	struct inodedep *inodedep;
7337	struct freefile *freefile;
7338	struct freeblks *freeblks;
7339
7340	/*
7341	 * This sets up the inode de-allocation dependency.
7342	 */
7343	freefile = malloc(sizeof(struct freefile),
7344		M_FREEFILE, M_SOFTDEP_FLAGS);
7345	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7346	freefile->fx_mode = mode;
7347	freefile->fx_oldinum = ino;
7348	freefile->fx_devvp = ip->i_devvp;
7349	LIST_INIT(&freefile->fx_jwork);
7350	UFS_LOCK(ip->i_ump);
7351	ip->i_fs->fs_pendinginodes += 1;
7352	UFS_UNLOCK(ip->i_ump);
7353
7354	/*
7355	 * If the inodedep does not exist, then the zero'ed inode has
7356	 * been written to disk. If the allocated inode has never been
7357	 * written to disk, then the on-disk inode is zero'ed. In either
7358	 * case we can free the file immediately.  If the journal was
7359	 * canceled before being written the inode will never make it to
7360	 * disk and we must send the canceled journal entries to
7361	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7362	 * Any blocks waiting for the inode to be written can be safely
7363	 * freed here as it will never be written.
7364	 */
7365	ACQUIRE_LOCK(&lk);
7366	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7367	if (inodedep) {
7368		/*
7369		 * Clear out freeblks that no longer need to reference
7370		 * this inode.
7371		 */
7372		while ((freeblks =
7373		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7374			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7375			    fb_next);
7376			freeblks->fb_state &= ~ONDEPLIST;
7377		}
7378		/*
7379		 * Remove this inode from the unlinked list.
7380		 */
7381		if (inodedep->id_state & UNLINKED) {
7382			/*
7383			 * Save the journal work to be freed with the bitmap
7384			 * before we clear UNLINKED.  Otherwise it can be lost
7385			 * if the inode block is written.
7386			 */
7387			handle_bufwait(inodedep, &freefile->fx_jwork);
7388			clear_unlinked_inodedep(inodedep);
7389			/* Re-acquire inodedep as we've dropped lk. */
7390			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7391		}
7392	}
7393	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7394		FREE_LOCK(&lk);
7395		handle_workitem_freefile(freefile);
7396		return;
7397	}
7398	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7399		inodedep->id_state |= GOINGAWAY;
7400	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7401	FREE_LOCK(&lk);
7402	if (ip->i_number == ino)
7403		ip->i_flag |= IN_MODIFIED;
7404}
7405
7406/*
7407 * Check to see if an inode has never been written to disk. If
7408 * so free the inodedep and return success, otherwise return failure.
7409 * This routine must be called with splbio interrupts blocked.
7410 *
7411 * If we still have a bitmap dependency, then the inode has never
7412 * been written to disk. Drop the dependency as it is no longer
7413 * necessary since the inode is being deallocated. We set the
7414 * ALLCOMPLETE flags since the bitmap now properly shows that the
7415 * inode is not allocated. Even if the inode is actively being
7416 * written, it has been rolled back to its zero'ed state, so we
7417 * are assured that a zero inode is what is on the disk. For short
7418 * lived files, this change will usually result in removing all the
7419 * dependencies from the inode so that it can be freed immediately.
7420 */
7421static int
7422check_inode_unwritten(inodedep)
7423	struct inodedep *inodedep;
7424{
7425
7426	mtx_assert(&lk, MA_OWNED);
7427
7428	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7429	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7430	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7431	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7432	    !LIST_EMPTY(&inodedep->id_inowait) ||
7433	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7434	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7435	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7436	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7437	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7438	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7439	    inodedep->id_mkdiradd != NULL ||
7440	    inodedep->id_nlinkdelta != 0)
7441		return (0);
7442	/*
7443	 * Another process might be in initiate_write_inodeblock_ufs[12]
7444	 * trying to allocate memory without holding "Softdep Lock".
7445	 */
7446	if ((inodedep->id_state & IOSTARTED) != 0 &&
7447	    inodedep->id_savedino1 == NULL)
7448		return (0);
7449
7450	if (inodedep->id_state & ONDEPLIST)
7451		LIST_REMOVE(inodedep, id_deps);
7452	inodedep->id_state &= ~ONDEPLIST;
7453	inodedep->id_state |= ALLCOMPLETE;
7454	inodedep->id_bmsafemap = NULL;
7455	if (inodedep->id_state & ONWORKLIST)
7456		WORKLIST_REMOVE(&inodedep->id_list);
7457	if (inodedep->id_savedino1 != NULL) {
7458		free(inodedep->id_savedino1, M_SAVEDINO);
7459		inodedep->id_savedino1 = NULL;
7460	}
7461	if (free_inodedep(inodedep) == 0)
7462		panic("check_inode_unwritten: busy inode");
7463	return (1);
7464}
7465
7466/*
7467 * Try to free an inodedep structure. Return 1 if it could be freed.
7468 */
7469static int
7470free_inodedep(inodedep)
7471	struct inodedep *inodedep;
7472{
7473
7474	mtx_assert(&lk, MA_OWNED);
7475	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7476	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7477	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7478	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7479	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7480	    !LIST_EMPTY(&inodedep->id_inowait) ||
7481	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7482	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7483	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7484	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7485	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7486	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7487	    inodedep->id_mkdiradd != NULL ||
7488	    inodedep->id_nlinkdelta != 0 ||
7489	    inodedep->id_savedino1 != NULL)
7490		return (0);
7491	if (inodedep->id_state & ONDEPLIST)
7492		LIST_REMOVE(inodedep, id_deps);
7493	LIST_REMOVE(inodedep, id_hash);
7494	WORKITEM_FREE(inodedep, D_INODEDEP);
7495	return (1);
7496}
7497
7498/*
7499 * Free the block referenced by a freework structure.  The parent freeblks
7500 * structure is released and completed when the final cg bitmap reaches
7501 * the disk.  This routine may be freeing a jnewblk which never made it to
7502 * disk in which case we do not have to wait as the operation is undone
7503 * in memory immediately.
7504 */
7505static void
7506freework_freeblock(freework)
7507	struct freework *freework;
7508{
7509	struct freeblks *freeblks;
7510	struct jnewblk *jnewblk;
7511	struct ufsmount *ump;
7512	struct workhead wkhd;
7513	struct fs *fs;
7514	int bsize;
7515	int needj;
7516
7517	mtx_assert(&lk, MA_OWNED);
7518	/*
7519	 * Handle partial truncate separately.
7520	 */
7521	if (freework->fw_indir) {
7522		complete_trunc_indir(freework);
7523		return;
7524	}
7525	freeblks = freework->fw_freeblks;
7526	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7527	fs = ump->um_fs;
7528	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7529	bsize = lfragtosize(fs, freework->fw_frags);
7530	LIST_INIT(&wkhd);
7531	/*
7532	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7533	 * on the indirblk hashtable and prevents premature freeing.
7534	 */
7535	freework->fw_state |= DEPCOMPLETE;
7536	/*
7537	 * SUJ needs to wait for the segment referencing freed indirect
7538	 * blocks to expire so that we know the checker will not confuse
7539	 * a re-allocated indirect block with its old contents.
7540	 */
7541	if (needj && freework->fw_lbn <= -NDADDR)
7542		indirblk_insert(freework);
7543	/*
7544	 * If we are canceling an existing jnewblk pass it to the free
7545	 * routine, otherwise pass the freeblk which will ultimately
7546	 * release the freeblks.  If we're not journaling, we can just
7547	 * free the freeblks immediately.
7548	 */
7549	jnewblk = freework->fw_jnewblk;
7550	if (jnewblk != NULL) {
7551		cancel_jnewblk(jnewblk, &wkhd);
7552		needj = 0;
7553	} else if (needj) {
7554		freework->fw_state |= DELAYEDFREE;
7555		freeblks->fb_cgwait++;
7556		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7557	}
7558	FREE_LOCK(&lk);
7559	freeblks_free(ump, freeblks, btodb(bsize));
7560	CTR4(KTR_SUJ,
7561	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
7562	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7563	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7564	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7565	ACQUIRE_LOCK(&lk);
7566	/*
7567	 * The jnewblk will be discarded and the bits in the map never
7568	 * made it to disk.  We can immediately free the freework.
7569	 */
7570	if (needj == 0)
7571		handle_written_freework(freework);
7572}
7573
7574/*
7575 * We enqueue freework items that need processing back on the freeblks and
7576 * add the freeblks to the worklist.  This makes it easier to find all work
7577 * required to flush a truncation in process_truncates().
7578 */
7579static void
7580freework_enqueue(freework)
7581	struct freework *freework;
7582{
7583	struct freeblks *freeblks;
7584
7585	freeblks = freework->fw_freeblks;
7586	if ((freework->fw_state & INPROGRESS) == 0)
7587		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7588	if ((freeblks->fb_state &
7589	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7590	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7591		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7592}
7593
7594/*
7595 * Start, continue, or finish the process of freeing an indirect block tree.
7596 * The free operation may be paused at any point with fw_off containing the
7597 * offset to restart from.  This enables us to implement some flow control
7598 * for large truncates which may fan out and generate a huge number of
7599 * dependencies.
7600 */
7601static void
7602handle_workitem_indirblk(freework)
7603	struct freework *freework;
7604{
7605	struct freeblks *freeblks;
7606	struct ufsmount *ump;
7607	struct fs *fs;
7608
7609	freeblks = freework->fw_freeblks;
7610	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7611	fs = ump->um_fs;
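	/*
	 * A freework that is already DEPCOMPLETE only needs its
	 * completion processed.  One whose offset has reached the end
	 * of the indirect may free the block itself; otherwise resume
	 * truncating from fw_off.
	 */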
7612	if (freework->fw_state & DEPCOMPLETE) {
7613		handle_written_freework(freework);
7614		return;
7615	}
7616	if (freework->fw_off == NINDIR(fs)) {
7617		freework_freeblock(freework);
7618		return;
7619	}
7620	freework->fw_state |= INPROGRESS;
7621	FREE_LOCK(&lk);
7622	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7623	    freework->fw_lbn);
7624	ACQUIRE_LOCK(&lk);
7625}
7626
7627/*
7628 * Called when a freework structure attached to a cg buf is written.  The
7629 * ref on either the parent or the freeblks structure is released and
7630 * the freeblks is added back to the worklist if there is more work to do.
7631 */
7632static void
7633handle_written_freework(freework)
7634	struct freework *freework;
7635{
7636	struct freeblks *freeblks;
7637	struct freework *parent;
7638
7639	freeblks = freework->fw_freeblks;
7640	parent = freework->fw_parent;
7641	if (freework->fw_state & DELAYEDFREE)
7642		freeblks->fb_cgwait--;
7643	freework->fw_state |= COMPLETE;
7644	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7645		WORKITEM_FREE(freework, D_FREEWORK);
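	/*
	 * Drop our reference on the parent indirect, or on the
	 * freeblks itself when this freework has no parent.
	 */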
7646	if (parent) {
7647		if (--parent->fw_ref == 0)
7648			freework_enqueue(parent);
7649		return;
7650	}
7651	if (--freeblks->fb_ref != 0)
7652		return;
7653	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7654	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7655		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7656}
7657
7658/*
7659 * This workitem routine performs the block de-allocation.
7660 * The workitem is added to the pending list after the updated
7661 * inode block has been written to disk.  As mentioned above,
7662 * checks regarding the number of blocks de-allocated (compared
7663 * to the number of blocks allocated for the file) are also
7664 * performed in this function.
7665 */
7666static int
7667handle_workitem_freeblocks(freeblks, flags)
7668	struct freeblks *freeblks;
7669	int flags;
7670{
7671	struct freework *freework;
7672	struct newblk *newblk;
7673	struct allocindir *aip;
7674	struct ufsmount *ump;
7675	struct worklist *wk;
7676
7677	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7678	    ("handle_workitem_freeblocks: Journal entries not written."));
7679	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7680	ACQUIRE_LOCK(&lk);
7681	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7682		WORKLIST_REMOVE(wk);
7683		switch (wk->wk_type) {
7684		case D_DIRREM:
7685			wk->wk_state |= COMPLETE;
7686			add_to_worklist(wk, 0);
7687			continue;
7688
7689		case D_ALLOCDIRECT:
7690			free_newblk(WK_NEWBLK(wk));
7691			continue;
7692
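		/*
		 * An allocindir marked DELAYEDFREE had its on-disk
		 * pointer cleared during truncation; create a freework
		 * to release its block and carry any pending journal
		 * dependency along with it.
		 */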
7693		case D_ALLOCINDIR:
7694			aip = WK_ALLOCINDIR(wk);
7695			freework = NULL;
7696			if (aip->ai_state & DELAYEDFREE) {
7697				FREE_LOCK(&lk);
7698				freework = newfreework(ump, freeblks, NULL,
7699				    aip->ai_lbn, aip->ai_newblkno,
7700				    ump->um_fs->fs_frag, 0, 0);
7701				ACQUIRE_LOCK(&lk);
7702			}
7703			newblk = WK_NEWBLK(wk);
7704			if (newblk->nb_jnewblk) {
7705				freework->fw_jnewblk = newblk->nb_jnewblk;
7706				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7707				newblk->nb_jnewblk = NULL;
7708			}
7709			free_newblk(newblk);
7710			continue;
7711
7712		case D_FREEWORK:
7713			freework = WK_FREEWORK(wk);
7714			if (freework->fw_lbn <= -NDADDR)
7715				handle_workitem_indirblk(freework);
7716			else
7717				freework_freeblock(freework);
7718			continue;
7719		default:
7720			panic("handle_workitem_freeblocks: Unknown type %s",
7721			    TYPENAME(wk->wk_type));
7722		}
7723	}
7724	if (freeblks->fb_ref != 0) {
7725		freeblks->fb_state &= ~INPROGRESS;
7726		wake_worklist(&freeblks->fb_list);
7727		freeblks = NULL;
7728	}
7729	FREE_LOCK(&lk);
7730	if (freeblks)
7731		return handle_complete_freeblocks(freeblks, flags);
7732	return (0);
7733}
7734
7735/*
7736 * Handle completion of block free via truncate.  This allows fs_pendingblocks
7737 * to track the actual free block count more closely than if we only updated
7738 * it at the end.  We must be careful to handle cases where the block count
7739 * on free was incorrect.
7740 */
7741static void
7742freeblks_free(ump, freeblks, blocks)
7743	struct ufsmount *ump;
7744	struct freeblks *freeblks;
7745	int blocks;
7746{
7747	struct fs *fs;
7748	ufs2_daddr_t remain;
7749
7750	UFS_LOCK(ump);
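	/*
	 * remain is the number of blocks still expected to be freed;
	 * charge fs_pendingblocks only for blocks that were actually
	 * pending.
	 */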
7751	remain = -freeblks->fb_chkcnt;
7752	freeblks->fb_chkcnt += blocks;
7753	if (remain > 0) {
7754		if (remain < blocks)
7755			blocks = remain;
7756		fs = ump->um_fs;
7757		fs->fs_pendingblocks -= blocks;
7758	}
7759	UFS_UNLOCK(ump);
7760}
7761
7762/*
7763 * Once all of the freework workitems are complete we can retire the
7764 * freeblocks dependency and any journal work awaiting completion.  This
7765 * can not be called until all other dependencies are stable on disk.
7766 */
7767static int
7768handle_complete_freeblocks(freeblks, flags)
7769	struct freeblks *freeblks;
7770	int flags;
7771{
7772	struct inodedep *inodedep;
7773	struct inode *ip;
7774	struct vnode *vp;
7775	struct fs *fs;
7776	struct ufsmount *ump;
7777	ufs2_daddr_t spare;
7778
7779	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7780	fs = ump->um_fs;
7781	flags = LK_EXCLUSIVE | flags;
7782	spare = freeblks->fb_chkcnt;
7783
7784	/*
7785	 * If we did not release the expected number of blocks we may have
7786	 * to adjust the inode block count here.  Only do so if it wasn't
7787	 * a truncation to zero and the modrev still matches.
7788	 */
7789	if (spare && freeblks->fb_len != 0) {
7790		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7791		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7792			return (EBUSY);
7793		ip = VTOI(vp);
7794		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7795			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7796			ip->i_flag |= IN_CHANGE;
7797			/*
7798			 * We must wait so this happens before the
7799			 * journal is reclaimed.
7800			 */
7801			ffs_update(vp, 1);
7802		}
7803		vput(vp);
7804	}
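	/*
	 * A negative spare means fewer blocks were released than were
	 * charged as pending; remove the difference from
	 * fs_pendingblocks.
	 */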
7805	if (spare < 0) {
7806		UFS_LOCK(ump);
7807		fs->fs_pendingblocks += spare;
7808		UFS_UNLOCK(ump);
7809	}
7810#ifdef QUOTA
7811	/* Handle spare. */
7812	if (spare)
7813		quotaadj(freeblks->fb_quota, ump, -spare);
7814	quotarele(freeblks->fb_quota);
7815#endif
7816	ACQUIRE_LOCK(&lk);
7817	if (freeblks->fb_state & ONDEPLIST) {
7818		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7819		    0, &inodedep);
7820		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
7821		freeblks->fb_state &= ~ONDEPLIST;
7822		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
7823			free_inodedep(inodedep);
7824	}
7825	/*
7826	 * All of the freeblock deps must be complete prior to this call
7827	 * so it's now safe to complete earlier outstanding journal entries.
7828	 */
7829	handle_jwork(&freeblks->fb_jwork);
7830	WORKITEM_FREE(freeblks, D_FREEBLKS);
7831	FREE_LOCK(&lk);
7832	return (0);
7833}
7834
7835/*
7836 * Release blocks associated with the freeblks and stored in the indirect
7837 * block dbn. If level is greater than SINGLE, the block is an indirect block
7838 * and recursive calls to indir_trunc must be used to cleanse other indirect
7839 * blocks.
7840 *
7841 * This handles partial and complete truncation of blocks.  Partial is noted
7842 * with goingaway == 0.  In this case the freework is completed after the
7843 * zero'd indirects are written to disk.  For full truncation the freework
7844 * is completed after the block is freed.
7845 */
7846static void
7847indir_trunc(freework, dbn, lbn)
7848	struct freework *freework;
7849	ufs2_daddr_t dbn;
7850	ufs_lbn_t lbn;
7851{
7852	struct freework *nfreework;
7853	struct workhead wkhd;
7854	struct freeblks *freeblks;
7855	struct buf *bp;
7856	struct fs *fs;
7857	struct indirdep *indirdep;
7858	struct ufsmount *ump;
7859	ufs1_daddr_t *bap1 = 0;
7860	ufs2_daddr_t nb, nnb, *bap2 = 0;
7861	ufs_lbn_t lbnadd, nlbn;
7862	int i, nblocks, ufs1fmt;
7863	int freedblocks;
7864	int goingaway;
7865	int freedeps;
7866	int needj;
7867	int level;
7868	int cnt;
7869
7870	freeblks = freework->fw_freeblks;
7871	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7872	fs = ump->um_fs;
7873	/*
7874	 * Get buffer of block pointers to be freed.  There are three cases:
7875	 *
7876	 * 1) Partial truncate caches the indirdep pointer in the freework
7877	 *    which gives us a reference back to the saved bp which holds the
7878	 *    pointers we want to clear.  When this completes the zero
7879	 *    pointers are written to the real copy.
7880	 * 2) The indirect is being completely truncated, cancel_indirdep()
7881	 *    eliminated the real copy and placed the indirdep on the saved
7882	 *    copy.  The indirdep and buf are discarded when this completes.
7883	 * 3) The indirect was not in memory, we read a copy off of the disk
7884	 *    using the devvp and drop and invalidate the buffer when we're
7885	 *    done.
7886	 */
7887	goingaway = 1;
7888	indirdep = NULL;
7889	if (freework->fw_indir != NULL) {
7890		goingaway = 0;
7891		indirdep = freework->fw_indir;
7892		bp = indirdep->ir_savebp;
7893		if (bp == NULL || bp->b_blkno != dbn)
7894			panic("indir_trunc: Bad saved buf %p blkno %jd",
7895			    bp, (intmax_t)dbn);
7896	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
7897		/*
7898		 * The lock prevents the buf dep list from changing and
7899		 * indirects on devvp should only ever have one dependency.
7900		 */
7901		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
7902		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
7903			panic("indir_trunc: Bad indirdep %p from buf %p",
7904			    indirdep, bp);
7905	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
7906	    NOCRED, &bp) != 0) {
7907		brelse(bp);
7908		return;
7909	}
7910	ACQUIRE_LOCK(&lk);
7911	/* Protects against a race with complete_trunc_indir(). */
7912	freework->fw_state &= ~INPROGRESS;
7913	/*
7914	 * If we have an indirdep we need to enforce the truncation order
7915	 * and discard it when it is complete.
7916	 */
7917	if (indirdep) {
7918		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
7919		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
7920			/*
7921			 * Add the complete truncate to the list on the
7922			 * indirdep to enforce in-order processing.
7923			 */
7924			if (freework->fw_indir == NULL)
7925				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
7926				    freework, fw_next);
7927			FREE_LOCK(&lk);
7928			return;
7929		}
7930		/*
7931		 * If we're goingaway, free the indirdep.  Otherwise it will
7932		 * linger until the write completes.
7933		 */
7934		if (goingaway) {
7935			free_indirdep(indirdep);
7936			ump->um_numindirdeps -= 1;
7937		}
7938	}
7939	FREE_LOCK(&lk);
7940	/* Initialize pointers depending on block size. */
7941	if (ump->um_fstype == UFS1) {
7942		bap1 = (ufs1_daddr_t *)bp->b_data;
7943		nb = bap1[freework->fw_off];
7944		ufs1fmt = 1;
7945	} else {
7946		bap2 = (ufs2_daddr_t *)bp->b_data;
7947		nb = bap2[freework->fw_off];
7948		ufs1fmt = 0;
7949	}
7950	level = lbn_level(lbn);
7951	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
7952	lbnadd = lbn_offset(fs, level);
7953	nblocks = btodb(fs->fs_bsize);
7954	nfreework = freework;
7955	freedeps = 0;
7956	cnt = 0;
7957	/*
7958	 * Reclaim blocks.  Traverses into nested indirect levels and
7959	 * arranges for the current level to be freed when subordinates
7960	 * are free when journaling.
7961	 */
7962	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
7963		if (i != NINDIR(fs) - 1) {
7964			if (ufs1fmt)
7965				nnb = bap1[i+1];
7966			else
7967				nnb = bap2[i+1];
7968		} else
7969			nnb = 0;
7970		if (nb == 0)
7971			continue;
7972		cnt++;
7973		if (level != 0) {
7974			nlbn = (lbn + 1) - (i * lbnadd);
7975			if (needj != 0) {
7976				nfreework = newfreework(ump, freeblks, freework,
7977				    nlbn, nb, fs->fs_frag, 0, 0);
7978				freedeps++;
7979			}
7980			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
7981		} else {
7982			struct freedep *freedep;
7983
7984			/*
7985			 * Attempt to aggregate freedep dependencies for
7986			 * all blocks being released to the same CG.
7987			 */
7988			LIST_INIT(&wkhd);
7989			if (needj != 0 &&
7990			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
7991				freedep = newfreedep(freework);
7992				WORKLIST_INSERT_UNLOCKED(&wkhd,
7993				    &freedep->fd_list);
7994				freedeps++;
7995			}
7996			CTR3(KTR_SUJ,
7997			    "indir_trunc: ino %d blkno %jd size %ld",
7998			    freeblks->fb_inum, nb, fs->fs_bsize);
7999			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8000			    fs->fs_bsize, freeblks->fb_inum,
8001			    freeblks->fb_vtype, &wkhd);
8002		}
8003	}
8004	if (goingaway) {
8005		bp->b_flags |= B_INVAL | B_NOCACHE;
8006		brelse(bp);
8007	}
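	/*
	 * Account for the data blocks released in the loop above and,
	 * when not journaling, for this indirect block which is freed
	 * below.
	 */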
8008	freedblocks = 0;
8009	if (level == 0)
8010		freedblocks = (nblocks * cnt);
8011	if (needj == 0)
8012		freedblocks += nblocks;
8013	freeblks_free(ump, freeblks, freedblocks);
8014	/*
8015	 * If we are journaling set up the ref counts and offset so this
8016	 * indirect can be completed when its children are free.
8017	 */
8018	if (needj) {
8019		ACQUIRE_LOCK(&lk);
8020		freework->fw_off = i;
8021		freework->fw_ref += freedeps;
8022		freework->fw_ref -= NINDIR(fs) + 1;
8023		if (level == 0)
8024			freeblks->fb_cgwait += freedeps;
8025		if (freework->fw_ref == 0)
8026			freework_freeblock(freework);
8027		FREE_LOCK(&lk);
8028		return;
8029	}
8030	/*
8031	 * If we're not journaling we can free the indirect now.
8032	 */
8033	dbn = dbtofsb(fs, dbn);
8034	CTR3(KTR_SUJ,
8035	    "indir_trunc 2: ino %d blkno %jd size %ld",
8036	    freeblks->fb_inum, dbn, fs->fs_bsize);
8037	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8038	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
8039	/* Non SUJ softdep does single-threaded truncations. */
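	/*
	 * When not journaling the same freework is reused for nested
	 * levels, so complete it only when we have just freed its own
	 * block.
	 */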
8040	if (freework->fw_blkno == dbn) {
8041		freework->fw_state |= ALLCOMPLETE;
8042		ACQUIRE_LOCK(&lk);
8043		handle_written_freework(freework);
8044		FREE_LOCK(&lk);
8045	}
8046	return;
8047}
8048
8049/*
8050 * Cancel an allocindir when it is removed via truncation.  When bp is not
8051 * NULL the indirect never appeared on disk and is scheduled to be freed
8052 * independently of the indir so we can more easily track journal work.
8053 */
8054static void
8055cancel_allocindir(aip, bp, freeblks, trunc)
8056	struct allocindir *aip;
8057	struct buf *bp;
8058	struct freeblks *freeblks;
8059	int trunc;
8060{
8061	struct indirdep *indirdep;
8062	struct freefrag *freefrag;
8063	struct newblk *newblk;
8064
8065	newblk = (struct newblk *)aip;
8066	LIST_REMOVE(aip, ai_next);
8067	/*
8068	 * We must eliminate the pointer in bp if it must be freed on its
8069	 * own due to partial truncate or pending journal work.
8070	 */
8071	if (bp && (trunc || newblk->nb_jnewblk)) {
8072		/*
8073		 * Clear the pointer and mark the aip to be freed
8074		 * directly if it never existed on disk.
8075		 */
8076		aip->ai_state |= DELAYEDFREE;
8077		indirdep = aip->ai_indirdep;
8078		if (indirdep->ir_state & UFS1FMT)
8079			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8080		else
8081			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8082	}
8083	/*
8084	 * When truncating, the previous pointer will be freed via the
8085	 * saved bp.  Eliminate the freefrag, which would otherwise free it twice.
8086	 */
8087	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8088		newblk->nb_freefrag = NULL;
8089		if (freefrag->ff_jdep)
8090			cancel_jfreefrag(
8091			    WK_JFREEFRAG(freefrag->ff_jdep));
8092		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8093		WORKITEM_FREE(freefrag, D_FREEFRAG);
8094	}
8095	/*
8096	 * If the journal hasn't been written the jnewblk must be passed
8097	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8098	 * this by leaving the journal dependency on the newblk to be freed
8099	 * when a freework is created in handle_workitem_freeblocks().
8100	 */
8101	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8102	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8103}
8104
8105/*
8106 * Create the mkdir dependencies for . and .. in a new directory.  Link them
8107 * in to a newdirblk so any subsequent additions are tracked properly.  The
8108 * caller is responsible for adding the mkdir1 dependency to the journal
8109 * and updating id_mkdiradd.  This function returns with lk held.
8110 */
8111static struct mkdir *
8112setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8113	struct diradd *dap;
8114	ino_t newinum;
8115	ino_t dinum;
8116	struct buf *newdirbp;
8117	struct mkdir **mkdirp;
8118{
8119	struct newblk *newblk;
8120	struct pagedep *pagedep;
8121	struct inodedep *inodedep;
8122	struct newdirblk *newdirblk = 0;
8123	struct mkdir *mkdir1, *mkdir2;
8124	struct worklist *wk;
8125	struct jaddref *jaddref;
8126	struct mount *mp;
8127
8128	mp = dap->da_list.wk_mp;
8129	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8130	    M_SOFTDEP_FLAGS);
8131	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8132	LIST_INIT(&newdirblk->db_mkdir);
8133	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8134	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8135	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8136	mkdir1->md_diradd = dap;
8137	mkdir1->md_jaddref = NULL;
8138	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8139	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8140	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8141	mkdir2->md_diradd = dap;
8142	mkdir2->md_jaddref = NULL;
8143	if (MOUNTEDSUJ(mp) == 0) {
8144		mkdir1->md_state |= DEPCOMPLETE;
8145		mkdir2->md_state |= DEPCOMPLETE;
8146	}
8147	/*
8148	 * Dependency on "." and ".." being written to disk.
8149	 */
8150	mkdir1->md_buf = newdirbp;
8151	ACQUIRE_LOCK(&lk);
8152	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
8153	/*
8154	 * We must link the pagedep, allocdirect, and newdirblk for
8155	 * the initial file page so the pointer to the new directory
8156	 * is not written until the directory contents are live and
8157	 * any subsequent additions are not marked live until the
8158	 * block is reachable via the inode.
8159	 */
8160	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8161		panic("setup_newdir: lost pagedep");
8162	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8163		if (wk->wk_type == D_ALLOCDIRECT)
8164			break;
8165	if (wk == NULL)
8166		panic("setup_newdir: lost allocdirect");
8167	if (pagedep->pd_state & NEWBLOCK)
8168		panic("setup_newdir: NEWBLOCK already set");
8169	newblk = WK_NEWBLK(wk);
8170	pagedep->pd_state |= NEWBLOCK;
8171	pagedep->pd_newdirblk = newdirblk;
8172	newdirblk->db_pagedep = pagedep;
8173	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8174	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8175	/*
8176	 * Look up the inodedep for the parent directory so that we
8177	 * can link mkdir2 into the pending dotdot jaddref or
8178	 * the inode write if there is none.  If the inode is
8179	 * ALLCOMPLETE and no jaddref is present all dependencies have
8180	 * been satisfied and mkdir2 can be freed.
8181	 */
8182	inodedep_lookup(mp, dinum, 0, &inodedep);
8183	if (MOUNTEDSUJ(mp)) {
8184		if (inodedep == NULL)
8185			panic("setup_newdir: Lost parent.");
8186		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8187		    inoreflst);
8188		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8189		    (jaddref->ja_state & MKDIR_PARENT),
8190		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8191		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
8192		mkdir2->md_jaddref = jaddref;
8193		jaddref->ja_mkdir = mkdir2;
8194	} else if (inodedep == NULL ||
8195	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8196		dap->da_state &= ~MKDIR_PARENT;
8197		WORKITEM_FREE(mkdir2, D_MKDIR);
8198		mkdir2 = NULL;
8199	} else {
8200		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
8201		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8202	}
8203	*mkdirp = mkdir2;
8204
8205	return (mkdir1);
8206}
8207
8208/*
8209 * Directory entry addition dependencies.
8210 *
8211 * When adding a new directory entry, the inode (with its incremented link
8212 * count) must be written to disk before the directory entry's pointer to it.
8213 * Also, if the inode is newly allocated, the corresponding freemap must be
8214 * updated (on disk) before the directory entry's pointer. These requirements
8215 * are met via undo/redo on the directory entry's pointer, which consists
8216 * simply of the inode number.
8217 *
8218 * As directory entries are added and deleted, the free space within a
8219 * directory block can become fragmented.  The ufs filesystem will compact
8220 * a fragmented directory block to make space for a new entry. When this
8221 * occurs, the offsets of previously added entries change. Any "diradd"
8222 * dependency structures corresponding to these entries must be updated with
8223 * the new offsets.
8224 */
8225
8226/*
8227 * This routine is called after the in-memory inode's link
8228 * count has been incremented, but before the directory entry's
8229 * pointer to the inode has been set.
8230 */
8231int
8232softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8233	struct buf *bp;		/* buffer containing directory block */
8234	struct inode *dp;	/* inode for directory */
8235	off_t diroffset;	/* offset of new entry in directory */
8236	ino_t newinum;		/* inode referenced by new directory entry */
8237	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8238	int isnewblk;		/* entry is in a newly allocated block */
8239{
8240	int offset;		/* offset of new entry within directory block */
8241	ufs_lbn_t lbn;		/* block in directory containing new entry */
8242	struct fs *fs;
8243	struct diradd *dap;
8244	struct newblk *newblk;
8245	struct pagedep *pagedep;
8246	struct inodedep *inodedep;
8247	struct newdirblk *newdirblk = 0;
8248	struct mkdir *mkdir1, *mkdir2;
8249	struct jaddref *jaddref;
8250	struct mount *mp;
8251	int isindir;
8252
8253	/*
8254	 * Whiteouts have no dependencies.
8255	 */
8256	if (newinum == WINO) {
8257		if (newdirbp != NULL)
8258			bdwrite(newdirbp);
8259		return (0);
8260	}
8261	jaddref = NULL;
8262	mkdir1 = mkdir2 = NULL;
8263	mp = UFSTOVFS(dp->i_ump);
8264	fs = dp->i_fs;
8265	lbn = lblkno(fs, diroffset);
8266	offset = blkoff(fs, diroffset);
8267	dap = malloc(sizeof(struct diradd), M_DIRADD,
8268		M_SOFTDEP_FLAGS|M_ZERO);
8269	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8270	dap->da_offset = offset;
8271	dap->da_newinum = newinum;
8272	dap->da_state = ATTACHED;
8273	LIST_INIT(&dap->da_jwork);
8274	isindir = bp->b_lblkno >= NDADDR;
8275	if (isnewblk &&
8276	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8277		newdirblk = malloc(sizeof(struct newdirblk),
8278		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8279		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8280		LIST_INIT(&newdirblk->db_mkdir);
8281	}
8282	/*
8283	 * If we're creating a new directory setup the dependencies and set
8284	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8285	 * we can move on.
8286	 */
8287	if (newdirbp == NULL) {
8288		dap->da_state |= DEPCOMPLETE;
8289		ACQUIRE_LOCK(&lk);
8290	} else {
8291		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8292		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8293		    &mkdir2);
8294	}
8295	/*
8296	 * Link into parent directory pagedep to await its being written.
8297	 */
8298	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8299#ifdef DEBUG
8300	if (diradd_lookup(pagedep, offset) != NULL)
8301		panic("softdep_setup_directory_add: %p already at off %d\n",
8302		    diradd_lookup(pagedep, offset), offset);
8303#endif
8304	dap->da_pagedep = pagedep;
8305	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8306	    da_pdlist);
8307	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8308	/*
8309	 * If we're journaling, link the diradd into the jaddref so it
8310	 * may be completed after the journal entry is written.  Otherwise,
8311	 * link the diradd into its inodedep.  If the inode is not yet
8312	 * written place it on the bufwait list, otherwise do the post-inode
8313	 * write processing to put it on the id_pendinghd list.
8314	 */
8315	if (MOUNTEDSUJ(mp)) {
8316		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8317		    inoreflst);
8318		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8319		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8320		jaddref->ja_diroff = diroffset;
8321		jaddref->ja_diradd = dap;
8322		add_to_journal(&jaddref->ja_list);
8323	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8324		diradd_inode_written(dap, inodedep);
8325	else
8326		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8327	/*
8328	 * Add the journal entries for . and .. links now that the primary
8329	 * link is written.
8330	 */
8331	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8332		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8333		    inoreflst, if_deps);
8334		KASSERT(jaddref != NULL &&
8335		    jaddref->ja_ino == jaddref->ja_parent &&
8336		    (jaddref->ja_state & MKDIR_BODY),
8337		    ("softdep_setup_directory_add: bad dot jaddref %p",
8338		    jaddref));
8339		mkdir1->md_jaddref = jaddref;
8340		jaddref->ja_mkdir = mkdir1;
8341		/*
8342		 * It is important that the dotdot journal entry
8343		 * is added prior to the dot entry since dot writes
8344		 * both the dot and dotdot links.  These both must
8345		 * be added after the primary link for the journal
8346		 * to remain consistent.
8347		 */
8348		add_to_journal(&mkdir2->md_jaddref->ja_list);
8349		add_to_journal(&jaddref->ja_list);
8350	}
8351	/*
8352	 * If we are adding a new directory remember this diradd so that if
8353	 * we rename it we can keep the dot and dotdot dependencies.  If
8354	 * we are adding a new name for an inode that has a mkdiradd we
8355	 * must be in rename and we have to move the dot and dotdot
8356	 * dependencies to this new name.  The old name is being orphaned
8357	 * soon.
8358	 */
8359	if (mkdir1 != NULL) {
8360		if (inodedep->id_mkdiradd != NULL)
8361			panic("softdep_setup_directory_add: Existing mkdir");
8362		inodedep->id_mkdiradd = dap;
8363	} else if (inodedep->id_mkdiradd)
8364		merge_diradd(inodedep, dap);
8365	if (newdirblk) {
8366		/*
8367		 * There is nothing to do if we are already tracking
8368		 * this block.
8369		 */
8370		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8371			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8372			FREE_LOCK(&lk);
8373			return (0);
8374		}
8375		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8376		    == 0)
8377			panic("softdep_setup_directory_add: lost entry");
8378		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8379		pagedep->pd_state |= NEWBLOCK;
8380		pagedep->pd_newdirblk = newdirblk;
8381		newdirblk->db_pagedep = pagedep;
8382		FREE_LOCK(&lk);
8383		/*
8384		 * If we extended into an indirect, signal direnter to sync.
8385		 */
8386		if (isindir)
8387			return (1);
8388		return (0);
8389	}
8390	FREE_LOCK(&lk);
8391	return (0);
8392}
8393
8394/*
8395 * This procedure is called to change the offset of a directory
8396 * entry when compacting a directory block which must be owned
8397 * exclusively by the caller. Note that the actual entry movement
8398 * must be done in this procedure to ensure that no I/O completions
8399 * occur while the move is in progress.
8400 */
8401void
8402softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8403	struct buf *bp;		/* Buffer holding directory block. */
8404	struct inode *dp;	/* inode for directory */
8405	caddr_t base;		/* address of dp->i_offset */
8406	caddr_t oldloc;		/* address of old directory location */
8407	caddr_t newloc;		/* address of new directory location */
8408	int entrysize;		/* size of directory entry */
8409{
8410	int offset, oldoffset, newoffset;
8411	struct pagedep *pagedep;
8412	struct jmvref *jmvref;
8413	struct diradd *dap;
8414	struct direct *de;
8415	struct mount *mp;
8416	ufs_lbn_t lbn;
8417	int flags;
8418
8419	mp = UFSTOVFS(dp->i_ump);
8420	de = (struct direct *)oldloc;
8421	jmvref = NULL;
8422	flags = 0;
8423	/*
8424	 * Moves are always journaled as it would be too complex to
8425	 * determine if any affected adds or removes are present in the
8426	 * journal.
8427	 */
8428	if (MOUNTEDSUJ(mp)) {
8429		flags = DEPALLOC;
8430		jmvref = newjmvref(dp, de->d_ino,
8431		    dp->i_offset + (oldloc - base),
8432		    dp->i_offset + (newloc - base));
8433	}
8434	lbn = lblkno(dp->i_fs, dp->i_offset);
8435	offset = blkoff(dp->i_fs, dp->i_offset);
8436	oldoffset = offset + (oldloc - base);
8437	newoffset = offset + (newloc - base);
8438	ACQUIRE_LOCK(&lk);
8439	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8440		goto done;
8441	dap = diradd_lookup(pagedep, oldoffset);
8442	if (dap) {
8443		dap->da_offset = newoffset;
8444		newoffset = DIRADDHASH(newoffset);
8445		oldoffset = DIRADDHASH(oldoffset);
8446		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8447		    newoffset != oldoffset) {
8448			LIST_REMOVE(dap, da_pdlist);
8449			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8450			    dap, da_pdlist);
8451		}
8452	}
8453done:
8454	if (jmvref) {
8455		jmvref->jm_pagedep = pagedep;
8456		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8457		add_to_journal(&jmvref->jm_list);
8458	}
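	/*
	 * Move the entry while still holding the softdep lock so that
	 * dependency processing cannot observe the old location.
	 */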
8459	bcopy(oldloc, newloc, entrysize);
8460	FREE_LOCK(&lk);
8461}
8462
8463/*
8464 * Move the mkdir dependencies and journal work from one diradd to another
8465 * when renaming a directory.  The new name must depend on the mkdir deps
8466 * completing as the old name did.  Directories can only have one valid link
8467 * at a time so one must be canonical.
8468 */
8469static void
8470merge_diradd(inodedep, newdap)
8471	struct inodedep *inodedep;
8472	struct diradd *newdap;
8473{
8474	struct diradd *olddap;
8475	struct mkdir *mkdir, *nextmd;
8476	short state;
8477
8478	olddap = inodedep->id_mkdiradd;
8479	inodedep->id_mkdiradd = newdap;
8480	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8481		newdap->da_state &= ~DEPCOMPLETE;
8482		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8483			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8484			if (mkdir->md_diradd != olddap)
8485				continue;
8486			mkdir->md_diradd = newdap;
8487			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8488			newdap->da_state |= state;
8489			olddap->da_state &= ~state;
8490			if ((olddap->da_state &
8491			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8492				break;
8493		}
8494		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8495			panic("merge_diradd: unfound ref");
8496	}
8497	/*
8498	 * Any mkdir related journal items are not safe to be freed until
8499	 * the new name is stable.
8500	 */
8501	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8502	olddap->da_state |= DEPCOMPLETE;
8503	complete_diradd(olddap);
8504}
8505
8506/*
8507 * Move the diradd to the pending list when all diradd dependencies are
8508 * complete.
8509 */
8510static void
8511complete_diradd(dap)
8512	struct diradd *dap;
8513{
8514	struct pagedep *pagedep;
8515
8516	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8517		if (dap->da_state & DIRCHG)
8518			pagedep = dap->da_previous->dm_pagedep;
8519		else
8520			pagedep = dap->da_pagedep;
8521		LIST_REMOVE(dap, da_pdlist);
8522		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8523	}
8524}
8525
8526/*
8527 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8528 * add entries and conditionally journal the remove.
8529 */
8530static void
8531cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8532	struct diradd *dap;
8533	struct dirrem *dirrem;
8534	struct jremref *jremref;
8535	struct jremref *dotremref;
8536	struct jremref *dotdotremref;
8537{
8538	struct inodedep *inodedep;
8539	struct jaddref *jaddref;
8540	struct inoref *inoref;
8541	struct mkdir *mkdir;
8542
8543	/*
8544	 * If no remove references were allocated we're on a non-journaled
8545	 * filesystem and can skip the cancel step.
8546	 */
8547	if (jremref == NULL) {
8548		free_diradd(dap, NULL);
8549		return;
8550	}
8551	/*
8552	 * Cancel the primary name and free it if it does not require
8553	 * journaling.
8554	 */
8555	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8556	    0, &inodedep) != 0) {
8557		/* Abort the addref that references this diradd. */
8558		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8559			if (inoref->if_list.wk_type != D_JADDREF)
8560				continue;
8561			jaddref = (struct jaddref *)inoref;
8562			if (jaddref->ja_diradd != dap)
8563				continue;
8564			if (cancel_jaddref(jaddref, inodedep,
8565			    &dirrem->dm_jwork) == 0) {
8566				free_jremref(jremref);
8567				jremref = NULL;
8568			}
8569			break;
8570		}
8571	}
8572	/*
8573	 * Cancel subordinate names and free them if they do not require
8574	 * journaling.
8575	 */
8576	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8577		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
8578			if (mkdir->md_diradd != dap)
8579				continue;
8580			if ((jaddref = mkdir->md_jaddref) == NULL)
8581				continue;
8582			mkdir->md_jaddref = NULL;
8583			if (mkdir->md_state & MKDIR_PARENT) {
8584				if (cancel_jaddref(jaddref, NULL,
8585				    &dirrem->dm_jwork) == 0) {
8586					free_jremref(dotdotremref);
8587					dotdotremref = NULL;
8588				}
8589			} else {
8590				if (cancel_jaddref(jaddref, inodedep,
8591				    &dirrem->dm_jwork) == 0) {
8592					free_jremref(dotremref);
8593					dotremref = NULL;
8594				}
8595			}
8596		}
8597	}
8598
8599	if (jremref)
8600		journal_jremref(dirrem, jremref, inodedep);
8601	if (dotremref)
8602		journal_jremref(dirrem, dotremref, inodedep);
8603	if (dotdotremref)
8604		journal_jremref(dirrem, dotdotremref, NULL);
8605	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8606	free_diradd(dap, &dirrem->dm_jwork);
8607}
8608
8609/*
8610 * Free a diradd dependency structure. This routine must be called
8611 * with splbio interrupts blocked.
8612 */
8613static void
8614free_diradd(dap, wkhd)
8615	struct diradd *dap;
8616	struct workhead *wkhd;
8617{
8618	struct dirrem *dirrem;
8619	struct pagedep *pagedep;
8620	struct inodedep *inodedep;
8621	struct mkdir *mkdir, *nextmd;
8622
8623	mtx_assert(&lk, MA_OWNED);
8624	LIST_REMOVE(dap, da_pdlist);
8625	if (dap->da_state & ONWORKLIST)
8626		WORKLIST_REMOVE(&dap->da_list);
8627	if ((dap->da_state & DIRCHG) == 0) {
8628		pagedep = dap->da_pagedep;
8629	} else {
8630		dirrem = dap->da_previous;
8631		pagedep = dirrem->dm_pagedep;
8632		dirrem->dm_dirinum = pagedep->pd_ino;
8633		dirrem->dm_state |= COMPLETE;
8634		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8635			add_to_worklist(&dirrem->dm_list, 0);
8636	}
8637	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8638	    0, &inodedep) != 0)
8639		if (inodedep->id_mkdiradd == dap)
8640			inodedep->id_mkdiradd = NULL;
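	/*
	 * Release any mkdir dependencies that still reference this
	 * diradd.
	 */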
8641	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8642		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8643			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8644			if (mkdir->md_diradd != dap)
8645				continue;
8646			dap->da_state &=
8647			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8648			LIST_REMOVE(mkdir, md_mkdirs);
8649			if (mkdir->md_state & ONWORKLIST)
8650				WORKLIST_REMOVE(&mkdir->md_list);
8651			if (mkdir->md_jaddref != NULL)
8652				panic("free_diradd: Unexpected jaddref");
8653			WORKITEM_FREE(mkdir, D_MKDIR);
8654			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8655				break;
8656		}
8657		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8658			panic("free_diradd: unfound ref");
8659	}
8660	if (inodedep)
8661		free_inodedep(inodedep);
8662	/*
8663	 * Free any journal segments waiting for the directory write.
8664	 */
8665	handle_jwork(&dap->da_jwork);
8666	WORKITEM_FREE(dap, D_DIRADD);
8667}
8668
8669/*
8670 * Directory entry removal dependencies.
8671 *
8672 * When removing a directory entry, the entry's inode pointer must be
8673 * zero'ed on disk before the corresponding inode's link count is decremented
8674 * (possibly freeing the inode for re-use). This dependency is handled by
8675 * updating the directory entry but delaying the inode count reduction until
8676 * after the directory block has been written to disk. After this point, the
8677 * inode count can be decremented whenever it is convenient.
8678 */
8679
8680/*
8681 * This routine should be called immediately after removing
8682 * a directory entry.  The inode's link count should not be
8683 * decremented by the calling procedure -- the soft updates
8684 * code will do this task when it is safe.
8685 */
8686void
8687softdep_setup_remove(bp, dp, ip, isrmdir)
8688	struct buf *bp;		/* buffer containing directory block */
8689	struct inode *dp;	/* inode for the directory being modified */
8690	struct inode *ip;	/* inode for directory entry being removed */
8691	int isrmdir;		/* indicates if doing RMDIR */
8692{
8693	struct dirrem *dirrem, *prevdirrem;
8694	struct inodedep *inodedep;
8695	int direct;
8696
8697	/*
8698	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8699	 * newdirrem() to set up the full directory remove, which requires
8700	 * isrmdir > 1.
8701	 */
8702	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8703	/*
8704	 * Add the dirrem to the inodedep's pending remove list for quick
8705	 * discovery later.
8706	 */
8707	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8708	    &inodedep) == 0)
8709		panic("softdep_setup_remove: Lost inodedep.");
8710	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8711	dirrem->dm_state |= ONDEPLIST;
8712	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8713
8714	/*
8715	 * If the COMPLETE flag is clear, then there were no active
8716	 * entries and we want to roll back to a zeroed entry until
8717	 * the new inode is committed to disk. If the COMPLETE flag is
8718	 * set then we have deleted an entry that never made it to
8719	 * disk. If the entry we deleted resulted from a name change,
8720	 * then the old name still resides on disk. We cannot delete
8721	 * its inode (returned to us in prevdirrem) until the zeroed
8722	 * directory entry gets to disk. The new inode has never been
8723	 * referenced on the disk, so can be deleted immediately.
8724	 */
8725	if ((dirrem->dm_state & COMPLETE) == 0) {
8726		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8727		    dm_next);
8728		FREE_LOCK(&lk);
8729	} else {
8730		if (prevdirrem != NULL)
8731			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8732			    prevdirrem, dm_next);
8733		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8734		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8735		FREE_LOCK(&lk);
8736		if (direct)
8737			handle_workitem_remove(dirrem, 0);
8738	}
8739}
8740
8741/*
8742 * Check for an entry matching 'offset' on both the pd_diraddhd list and the
8743 * pd_pendinghd list of a pagedep.
8744 */
8745static struct diradd *
8746diradd_lookup(pagedep, offset)
8747	struct pagedep *pagedep;
8748	int offset;
8749{
8750	struct diradd *dap;
8751
8752	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8753		if (dap->da_offset == offset)
8754			return (dap);
8755	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8756		if (dap->da_offset == offset)
8757			return (dap);
8758	return (NULL);
8759}
8760
8761/*
8762 * Search for a .. diradd dependency in a directory that is being removed.
8763 * If the directory was renamed to a new parent we have a diradd rather
8764 * than a mkdir for the .. entry.  We need to cancel it now before
8765 * it is found in truncate().
8766 */
8767static struct jremref *
8768cancel_diradd_dotdot(ip, dirrem, jremref)
8769	struct inode *ip;
8770	struct dirrem *dirrem;
8771	struct jremref *jremref;
8772{
8773	struct pagedep *pagedep;
8774	struct diradd *dap;
8775	struct worklist *wk;
8776
8777	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8778	    &pagedep) == 0)
8779		return (jremref);
8780	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8781	if (dap == NULL)
8782		return (jremref);
8783	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8784	/*
8785	 * Mark any journal work as belonging to the parent so it is freed
8786	 * with the .. reference.
8787	 */
8788	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8789		wk->wk_state |= MKDIR_PARENT;
8790	return (NULL);
8791}
8792
8793/*
8794 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
8795 * replace it with a dirrem/diradd pair as a result of re-parenting a
8796 * directory.  This ensures that we don't simultaneously have a mkdir and
8797 * a diradd for the same .. entry.
8798 */
8799static struct jremref *
8800cancel_mkdir_dotdot(ip, dirrem, jremref)
8801	struct inode *ip;
8802	struct dirrem *dirrem;
8803	struct jremref *jremref;
8804{
8805	struct inodedep *inodedep;
8806	struct jaddref *jaddref;
8807	struct mkdir *mkdir;
8808	struct diradd *dap;
8809
8810	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8811	    &inodedep) == 0)
8812		return (jremref);
8813	dap = inodedep->id_mkdiradd;
8814	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
8815		return (jremref);
8816	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
8817	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
8818		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
8819			break;
8820	if (mkdir == NULL)
8821		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
8822	if ((jaddref = mkdir->md_jaddref) != NULL) {
8823		mkdir->md_jaddref = NULL;
8824		jaddref->ja_state &= ~MKDIR_PARENT;
8825		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
8826		    &inodedep) == 0)
8827			panic("cancel_mkdir_dotdot: Lost parent inodedep");
8828		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
8829			journal_jremref(dirrem, jremref, inodedep);
8830			jremref = NULL;
8831		}
8832	}
8833	if (mkdir->md_state & ONWORKLIST)
8834		WORKLIST_REMOVE(&mkdir->md_list);
8835	mkdir->md_state |= ALLCOMPLETE;
8836	complete_mkdir(mkdir);
8837	return (jremref);
8838}
8839
8840static void
8841journal_jremref(dirrem, jremref, inodedep)
8842	struct dirrem *dirrem;
8843	struct jremref *jremref;
8844	struct inodedep *inodedep;
8845{
8846
8847	if (inodedep == NULL)
8848		if (inodedep_lookup(jremref->jr_list.wk_mp,
8849		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
8850			panic("journal_jremref: Lost inodedep");
8851	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
8852	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
8853	add_to_journal(&jremref->jr_list);
8854}
8855
8856static void
8857dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
8858	struct dirrem *dirrem;
8859	struct jremref *jremref;
8860	struct jremref *dotremref;
8861	struct jremref *dotdotremref;
8862{
8863	struct inodedep *inodedep;
8864
8866	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
8867	    &inodedep) == 0)
8868		panic("dirrem_journal: Lost inodedep");
8869	journal_jremref(dirrem, jremref, inodedep);
8870	if (dotremref)
8871		journal_jremref(dirrem, dotremref, inodedep);
8872	if (dotdotremref)
8873		journal_jremref(dirrem, dotdotremref, NULL);
8874}
8875
8876/*
8877 * Allocate a new dirrem if appropriate and return it along with
8878 * its associated pagedep. Called without a lock, returns with lock.
8879 */
8880static struct dirrem *
8881newdirrem(bp, dp, ip, isrmdir, prevdirremp)
8882	struct buf *bp;		/* buffer containing directory block */
8883	struct inode *dp;	/* inode for the directory being modified */
8884	struct inode *ip;	/* inode for directory entry being removed */
8885	int isrmdir;		/* indicates if doing RMDIR */
8886	struct dirrem **prevdirremp; /* previously referenced inode, if any */
8887{
8888	int offset;
8889	ufs_lbn_t lbn;
8890	struct diradd *dap;
8891	struct dirrem *dirrem;
8892	struct pagedep *pagedep;
8893	struct jremref *jremref;
8894	struct jremref *dotremref;
8895	struct jremref *dotdotremref;
8896	struct vnode *dvp;
8897
8898	/*
8899	 * Whiteouts have no deletion dependencies.
8900	 */
8901	if (ip == NULL)
8902		panic("newdirrem: whiteout");
8903	dvp = ITOV(dp);
8904	/*
8905	 * If we are over our limit, try to improve the situation.
8906	 * Limiting the number of dirrem structures will also limit
8907	 * the number of freefile and freeblks structures.
8908	 */
8909	ACQUIRE_LOCK(&lk);
8910	if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2)
8911		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
8912	FREE_LOCK(&lk);
8913	dirrem = malloc(sizeof(struct dirrem),
8914		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
8915	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
8916	LIST_INIT(&dirrem->dm_jremrefhd);
8917	LIST_INIT(&dirrem->dm_jwork);
8918	dirrem->dm_state = isrmdir ? RMDIR : 0;
8919	dirrem->dm_oldinum = ip->i_number;
8920	*prevdirremp = NULL;
8921	/*
8922	 * Allocate remove reference structures to track journal write
8923	 * dependencies.  We will always have one for the link and
8924	 * when doing directories we will always have one more for dot.
8925	 * When renaming a directory we skip the dotdot link change so
8926	 * this is not needed.
8927	 */
8928	jremref = dotremref = dotdotremref = NULL;
8929	if (DOINGSUJ(dvp)) {
8930		if (isrmdir) {
8931			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8932			    ip->i_effnlink + 2);
8933			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
8934			    ip->i_effnlink + 1);
8935			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
8936			    dp->i_effnlink + 1);
8937			dotdotremref->jr_state |= MKDIR_PARENT;
8938		} else
8939			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8940			    ip->i_effnlink + 1);
8941	}
8942	ACQUIRE_LOCK(&lk);
8943	lbn = lblkno(dp->i_fs, dp->i_offset);
8944	offset = blkoff(dp->i_fs, dp->i_offset);
8945	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
8946	    &pagedep);
8947	dirrem->dm_pagedep = pagedep;
8948	dirrem->dm_offset = offset;
8949	/*
8950	 * If we're renaming a .. link to a new directory, cancel any
8951	 * existing MKDIR_PARENT mkdir.  If it has already been canceled,
8952	 * the jremref is preserved for any potential diradd in this
8953	 * location.  This cannot coincide with a rmdir.
8954	 */
8955	if (dp->i_offset == DOTDOT_OFFSET) {
8956		if (isrmdir)
8957			panic("newdirrem: .. directory change during remove?");
8958		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
8959	}
8960	/*
8961	 * If we're removing a directory search for the .. dependency now and
8962	 * cancel it.  Any pending journal work will be added to the dirrem
8963	 * to be completed when the workitem remove completes.
8964	 */
8965	if (isrmdir)
8966		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
8967	/*
8968	 * Check for a diradd dependency for the same directory entry.
8969	 * If present, then both dependencies become obsolete and can
8970	 * be de-allocated.
8971	 */
8972	dap = diradd_lookup(pagedep, offset);
8973	if (dap == NULL) {
8974		/*
8975		 * Link the jremref structures into the dirrem so they are
8976		 * written prior to the pagedep.
8977		 */
8978		if (jremref)
8979			dirrem_journal(dirrem, jremref, dotremref,
8980			    dotdotremref);
8981		return (dirrem);
8982	}
8983	/*
8984	 * Must be ATTACHED at this point.
8985	 */
8986	if ((dap->da_state & ATTACHED) == 0)
8987		panic("newdirrem: not ATTACHED");
8988	if (dap->da_newinum != ip->i_number)
8989		panic("newdirrem: inum %d should be %d",
8990		    ip->i_number, dap->da_newinum);
8991	/*
8992	 * If we are deleting a changed name that never made it to disk,
8993	 * then return the dirrem describing the previous inode (which
8994	 * represents the inode currently referenced from this entry on disk).
8995	 */
8996	if ((dap->da_state & DIRCHG) != 0) {
8997		*prevdirremp = dap->da_previous;
8998		dap->da_state &= ~DIRCHG;
8999		dap->da_pagedep = pagedep;
9000	}
9001	/*
9002	 * We are deleting an entry that never made it to disk.
9003	 * Mark it COMPLETE so we can delete its inode immediately.
9004	 */
9005	dirrem->dm_state |= COMPLETE;
9006	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9007#ifdef SUJ_DEBUG
9008	if (isrmdir == 0) {
9009		struct worklist *wk;
9010
9011		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9012			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9013				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9014	}
9015#endif
9016
9017	return (dirrem);
9018}
9019
9020/*
9021 * Directory entry change dependencies.
9022 *
9023 * Changing an existing directory entry requires that an add operation
9024 * be completed first followed by a deletion. The semantics for the addition
9025 * are identical to the description of adding a new entry above except
9026 * that the rollback is to the old inode number rather than zero. Once
9027 * the addition dependency is completed, the removal is done as described
9028 * in the removal routine above.
9029 */
9030
9031/*
9032 * This routine should be called immediately after changing
9033 * a directory entry.  The inode's link count should not be
9034 * decremented by the calling procedure -- the soft updates
9035 * code will perform this task when it is safe.
9036 */
9037void
9038softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9039	struct buf *bp;		/* buffer containing directory block */
9040	struct inode *dp;	/* inode for the directory being modified */
9041	struct inode *ip;	/* inode for directory entry being removed */
9042	ino_t newinum;		/* new inode number for changed entry */
9043	int isrmdir;		/* indicates if doing RMDIR */
9044{
9045	int offset;
9046	struct diradd *dap = NULL;
9047	struct dirrem *dirrem, *prevdirrem;
9048	struct pagedep *pagedep;
9049	struct inodedep *inodedep;
9050	struct jaddref *jaddref;
9051	struct mount *mp;
9052
9053	offset = blkoff(dp->i_fs, dp->i_offset);
9054	mp = UFSTOVFS(dp->i_ump);
9055
9056	/*
9057	 * Whiteouts do not need diradd dependencies.
9058	 */
9059	if (newinum != WINO) {
9060		dap = malloc(sizeof(struct diradd),
9061		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9062		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9063		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9064		dap->da_offset = offset;
9065		dap->da_newinum = newinum;
9066		LIST_INIT(&dap->da_jwork);
9067	}
9068
9069	/*
9070	 * Allocate a new dirrem and ACQUIRE_LOCK.
9071	 */
9072	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9073	pagedep = dirrem->dm_pagedep;
9074	/*
9075	 * The possible values for isrmdir:
9076	 *	0 - non-directory file rename
9077	 *	1 - directory rename within same directory
9078	 *   inum - directory rename to new directory of given inode number
9079	 * When renaming to a new directory, we are both deleting and
9080	 * creating a new directory entry, so the link count on the new
9081	 * directory should not change. Thus we do not need the followup
9082	 * dirrem which is usually done in handle_workitem_remove. We set
9083	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9084	 * followup dirrem.
9085	 */
9086	if (isrmdir > 1)
9087		dirrem->dm_state |= DIRCHG;
9088
9089	/*
9090	 * Whiteouts have no additional dependencies,
9091	 * so just put the dirrem on the correct list.
9092	 */
9093	if (newinum == WINO) {
9094		if ((dirrem->dm_state & COMPLETE) == 0) {
9095			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9096			    dm_next);
9097		} else {
9098			dirrem->dm_dirinum = pagedep->pd_ino;
9099			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9100				add_to_worklist(&dirrem->dm_list, 0);
9101		}
9102		FREE_LOCK(&lk);
9103		return;
9104	}
9105	/*
9106	 * Add the dirrem to the inodedep's pending remove list for quick
9107	 * discovery later.  A valid nlinkdelta ensures that this lookup
9108	 * will not fail.
9109	 */
9110	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9111		panic("softdep_setup_directory_change: Lost inodedep.");
9112	dirrem->dm_state |= ONDEPLIST;
9113	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9114
9115	/*
9116	 * If the COMPLETE flag is clear, then there were no active
9117	 * entries and we want to roll back to the previous inode until
9118	 * the new inode is committed to disk. If the COMPLETE flag is
9119	 * set, then we have deleted an entry that never made it to disk.
9120	 * If the entry we deleted resulted from a name change, then the old
9121	 * inode reference still resides on disk. Any rollback that we do
9122	 * needs to be to that old inode (returned to us in prevdirrem). If
9123	 * the entry we deleted resulted from a create, then there is
9124	 * no entry on the disk, so we want to roll back to zero rather
9125	 * than the uncommitted inode. In either of the COMPLETE cases we
9126	 * want to immediately free the unwritten and unreferenced inode.
9127	 */
9128	if ((dirrem->dm_state & COMPLETE) == 0) {
9129		dap->da_previous = dirrem;
9130	} else {
9131		if (prevdirrem != NULL) {
9132			dap->da_previous = prevdirrem;
9133		} else {
9134			dap->da_state &= ~DIRCHG;
9135			dap->da_pagedep = pagedep;
9136		}
9137		dirrem->dm_dirinum = pagedep->pd_ino;
9138		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9139			add_to_worklist(&dirrem->dm_list, 0);
9140	}
9141	/*
9142	 * Lookup the jaddref for this journal entry.  We must finish
9143	 * initializing it and make the diradd write dependent on it.
9144	 * If we're not journaling, put it on the id_bufwait list if the
9145	 * inode is not yet written. If it is written, do the post-inode
9146	 * write processing to put it on the id_pendinghd list.
9147	 */
9148	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
9149	if (MOUNTEDSUJ(mp)) {
9150		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9151		    inoreflst);
9152		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9153		    ("softdep_setup_directory_change: bad jaddref %p",
9154		    jaddref));
9155		jaddref->ja_diroff = dp->i_offset;
9156		jaddref->ja_diradd = dap;
9157		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9158		    dap, da_pdlist);
9159		add_to_journal(&jaddref->ja_list);
9160	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9161		dap->da_state |= COMPLETE;
9162		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9163		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9164	} else {
9165		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9166		    dap, da_pdlist);
9167		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9168	}
9169	/*
9170	 * If we're making a new name for a directory that has not been
9171	 * committed, we need to move the dot and dotdot references to
9172	 * this new name.
9173	 */
9174	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9175		merge_diradd(inodedep, dap);
9176	FREE_LOCK(&lk);
9177}
9178
9179/*
9180 * Called whenever the link count on an inode is changed.
9181 * It creates an inode dependency so that the new reference(s)
9182 * to the inode cannot be committed to disk until the updated
9183 * inode has been written.
9184 */
9185void
9186softdep_change_linkcnt(ip)
9187	struct inode *ip;	/* the inode with the increased link count */
9188{
9189	struct inodedep *inodedep;
9190	int dflags;
9191
9192	ACQUIRE_LOCK(&lk);
9193	dflags = DEPALLOC;
9194	if (IS_SNAPSHOT(ip))
9195		dflags |= NODELAY;
9196	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
9197	if (ip->i_nlink < ip->i_effnlink)
9198		panic("softdep_change_linkcnt: bad delta");
9199	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9200	FREE_LOCK(&lk);
9201}
9202
9203/*
9204 * Attach a sbdep dependency to the superblock buf so that we can keep
9205 * track of the head of the linked list of referenced but unlinked inodes.
9206 */
9207void
9208softdep_setup_sbupdate(ump, fs, bp)
9209	struct ufsmount *ump;
9210	struct fs *fs;
9211	struct buf *bp;
9212{
9213	struct sbdep *sbdep;
9214	struct worklist *wk;
9215
9216	if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0)
9217		return;
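	/*
	 * Only one sbdep is needed per superblock buffer; if one is
	 * already hung on the buffer's dependency list there is nothing
	 * more to do.
	 */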
9218	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9219		if (wk->wk_type == D_SBDEP)
9220			break;
9221	if (wk != NULL)
9222		return;
9223	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9224	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9225	sbdep->sb_fs = fs;
9226	sbdep->sb_ump = ump;
9227	ACQUIRE_LOCK(&lk);
9228	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9229	FREE_LOCK(&lk);
9230}
9231
9232/*
9233 * Return the first unlinked inodedep which is ready to be the head of the
9234 * list.  The inodedep and all those after it must have valid next pointers.
9235 */
9236static struct inodedep *
9237first_unlinked_inodedep(ump)
9238	struct ufsmount *ump;
9239{
9240	struct inodedep *inodedep;
9241	struct inodedep *idp;
9242
9243	mtx_assert(&lk, MA_OWNED);
9244	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9245	    inodedep; inodedep = idp) {
9246		if ((inodedep->id_state & UNLINKNEXT) == 0)
9247			return (NULL);
9248		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9249		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9250			break;
9251		if ((inodedep->id_state & UNLINKPREV) == 0)
9252			break;
9253	}
9254	return (inodedep);
9255}
9256
9257/*
9258 * Set the sujfree unlinked head pointer prior to writing a superblock.
9259 */
9260static void
9261initiate_write_sbdep(sbdep)
9262	struct sbdep *sbdep;
9263{
9264	struct inodedep *inodedep;
9265	struct fs *bpfs;
9266	struct fs *fs;
9267
9268	bpfs = sbdep->sb_fs;
9269	fs = sbdep->sb_ump->um_fs;
9270	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9271	if (inodedep) {
9272		fs->fs_sujfree = inodedep->id_ino;
9273		inodedep->id_state |= UNLINKPREV;
9274	} else
9275		fs->fs_sujfree = 0;
9276	bpfs->fs_sujfree = fs->fs_sujfree;
9277}
9278
9279/*
9280 * After a superblock is written determine whether it must be written again
9281 * due to a changing unlinked list head.
9282 */
9283static int
9284handle_written_sbdep(sbdep, bp)
9285	struct sbdep *sbdep;
9286	struct buf *bp;
9287{
9288	struct inodedep *inodedep;
9289	struct mount *mp;
9290	struct fs *fs;
9291
9292	mtx_assert(&lk, MA_OWNED);
9293	fs = sbdep->sb_fs;
9294	mp = UFSTOVFS(sbdep->sb_ump);
9295	/*
9296	 * If the superblock doesn't match the in-memory list, start over.
9297	 */
9298	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9299	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9300	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9301		bdirty(bp);
9302		return (1);
9303	}
9304	WORKITEM_FREE(sbdep, D_SBDEP);
9305	if (fs->fs_sujfree == 0)
9306		return (0);
9307	/*
9308	 * Now that we have a record of this inode in stable store, allow it
9309	 * to be written to free up pending work.  Inodes may see a lot of
9310	 * write activity after they are unlinked, which we must not hold up.
9311	 */
9312	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9313		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9314			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9315			    inodedep, inodedep->id_state);
9316		if (inodedep->id_state & UNLINKONLIST)
9317			break;
9318		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9319	}
9320
9321	return (0);
9322}
9323
9324/*
9325 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9326 */
9327static void
9328unlinked_inodedep(mp, inodedep)
9329	struct mount *mp;
9330	struct inodedep *inodedep;
9331{
9332	struct ufsmount *ump;
9333
9334	mtx_assert(&lk, MA_OWNED);
9335	if (MOUNTEDSUJ(mp) == 0)
9336		return;
9337	ump = VFSTOUFS(mp);
9338	ump->um_fs->fs_fmod = 1;
9339	if (inodedep->id_state & UNLINKED)
9340		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9341	inodedep->id_state |= UNLINKED;
9342	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9343}
9344
9345/*
9346 * Remove an inodedep from the unlinked inodedep list.  This may require
9347 * disk writes if the inode has made it that far.
9348 */
9349static void
9350clear_unlinked_inodedep(inodedep)
9351	struct inodedep *inodedep;
9352{
9353	struct ufsmount *ump;
9354	struct inodedep *idp;
9355	struct inodedep *idn;
9356	struct fs *fs;
9357	struct buf *bp;
9358	ino_t ino;
9359	ino_t nino;
9360	ino_t pino;
9361	int error;
9362
9363	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9364	fs = ump->um_fs;
9365	ino = inodedep->id_ino;
9366	error = 0;
9367	for (;;) {
9368		mtx_assert(&lk, MA_OWNED);
9369		KASSERT((inodedep->id_state & UNLINKED) != 0,
9370		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9371		    inodedep));
9372		/*
9373		 * If nothing has yet been written, simply remove us from
9374		 * the in-memory list and return.  This is the most common
9375		 * case where handle_workitem_remove() loses the final
9376		 * reference.
9377		 */
9378		if ((inodedep->id_state & UNLINKLINKS) == 0)
9379			break;
9380		/*
9381		 * If we have a NEXT pointer and no PREV pointer we can simply
9382		 * clear NEXT's PREV and remove ourselves from the list.  Be
9383		 * careful not to clear PREV if the superblock points at
9384		 * next as well.
9385		 */
9386		idn = TAILQ_NEXT(inodedep, id_unlinked);
9387		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9388			if (idn && fs->fs_sujfree != idn->id_ino)
9389				idn->id_state &= ~UNLINKPREV;
9390			break;
9391		}
9392		/*
9393		 * Here we have an inodedep which is actually linked into
9394		 * the list.  We must remove it by forcing a write to the
9395		 * link before us, whether it be the superblock or an inode.
9396		 * Unfortunately the list may change while we're waiting
9397		 * on the buf lock for either resource so we must loop until
9398		 * we lock the right one.  If both the superblock and an
9399		 * inode point to this inode we must clear the inode first
9400		 * followed by the superblock.
9401		 */
9402		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9403		pino = 0;
9404		if (idp && (idp->id_state & UNLINKNEXT))
9405			pino = idp->id_ino;
9406		FREE_LOCK(&lk);
9407		if (pino == 0) {
9408			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9409			    (int)fs->fs_sbsize, 0, 0, 0);
9410		} else {
9411			error = bread(ump->um_devvp,
9412			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9413			    (int)fs->fs_bsize, NOCRED, &bp);
9414			if (error)
9415				brelse(bp);
9416		}
9417		ACQUIRE_LOCK(&lk);
9418		if (error)
9419			break;
9420		/* If the list has changed restart the loop. */
9421		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9422		nino = 0;
9423		if (idp && (idp->id_state & UNLINKNEXT))
9424			nino = idp->id_ino;
9425		if (nino != pino ||
9426		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9427			FREE_LOCK(&lk);
9428			brelse(bp);
9429			ACQUIRE_LOCK(&lk);
9430			continue;
9431		}
9432		nino = 0;
9433		idn = TAILQ_NEXT(inodedep, id_unlinked);
9434		if (idn)
9435			nino = idn->id_ino;
9436		/*
9437		 * Remove us from the in-memory list.  After this we cannot
9438		 * access the inodedep.
9439		 */
9440		KASSERT((inodedep->id_state & UNLINKED) != 0,
9441		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9442		    inodedep));
9443		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9444		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9445		FREE_LOCK(&lk);
9446		/*
9447		 * The predecessor's next pointer is manually updated here
9448		 * so that the NEXT flag is never cleared for an element
9449		 * that is in the list.
9450		 */
9451		if (pino == 0) {
9452			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9453			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9454			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9455			    bp);
9456		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9457			((struct ufs1_dinode *)bp->b_data +
9458			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9459		else
9460			((struct ufs2_dinode *)bp->b_data +
9461			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9462		/*
9463		 * If the bwrite fails, we have no recourse to recover.  The
9464		 * filesystem is corrupted already.
9465		 */
9466		bwrite(bp);
9467		ACQUIRE_LOCK(&lk);
9468		/*
9469		 * If the superblock pointer still needs to be cleared force
9470		 * a write here.
9471		 */
9472		if (fs->fs_sujfree == ino) {
9473			FREE_LOCK(&lk);
9474			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9475			    (int)fs->fs_sbsize, 0, 0, 0);
9476			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9477			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9478			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9479			    bp);
9480			bwrite(bp);
9481			ACQUIRE_LOCK(&lk);
9482		}
9483
9484		if (fs->fs_sujfree != ino)
9485			return;
9486		panic("clear_unlinked_inodedep: Failed to clear free head");
9487	}
9488	if (inodedep->id_ino == fs->fs_sujfree)
9489		panic("clear_unlinked_inodedep: Freeing head of free list");
9490	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9491	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9492	return;
9493}
9494
9495/*
9496 * This workitem decrements the inode's link count.
9497 * If the link count reaches zero, the file is removed.
9498 */
9499static int
9500handle_workitem_remove(dirrem, flags)
9501	struct dirrem *dirrem;
9502	int flags;
9503{
9504	struct inodedep *inodedep;
9505	struct workhead dotdotwk;
9506	struct worklist *wk;
9507	struct ufsmount *ump;
9508	struct mount *mp;
9509	struct vnode *vp;
9510	struct inode *ip;
9511	ino_t oldinum;
9512
9513	if (dirrem->dm_state & ONWORKLIST)
9514		panic("handle_workitem_remove: dirrem %p still on worklist",
9515		    dirrem);
9516	oldinum = dirrem->dm_oldinum;
9517	mp = dirrem->dm_list.wk_mp;
9518	ump = VFSTOUFS(mp);
9519	flags |= LK_EXCLUSIVE;
9520	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9521		return (EBUSY);
9522	ip = VTOI(vp);
9523	ACQUIRE_LOCK(&lk);
9524	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9525		panic("handle_workitem_remove: lost inodedep");
9526	if (dirrem->dm_state & ONDEPLIST)
9527		LIST_REMOVE(dirrem, dm_inonext);
9528	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9529	    ("handle_workitem_remove:  Journal entries not written."));
9530
9531	/*
9532	 * Move all dependencies waiting on the remove to complete
9533	 * from the dirrem to the inode inowait list to be completed
9534	 * after the inode has been updated and written to disk.  Any
9535	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9536	 * is removed.
9537	 */
9538	LIST_INIT(&dotdotwk);
9539	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9540		WORKLIST_REMOVE(wk);
9541		if (wk->wk_state & MKDIR_PARENT) {
9542			wk->wk_state &= ~MKDIR_PARENT;
9543			WORKLIST_INSERT(&dotdotwk, wk);
9544			continue;
9545		}
9546		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9547	}
9548	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9549	/*
9550	 * Normal file deletion.
9551	 */
9552	if ((dirrem->dm_state & RMDIR) == 0) {
9553		ip->i_nlink--;
9554		DIP_SET(ip, i_nlink, ip->i_nlink);
9555		ip->i_flag |= IN_CHANGE;
9556		if (ip->i_nlink < ip->i_effnlink)
9557			panic("handle_workitem_remove: bad file delta");
9558		if (ip->i_nlink == 0)
9559			unlinked_inodedep(mp, inodedep);
9560		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9561		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9562		    ("handle_workitem_remove: worklist not empty. %s",
9563		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9564		WORKITEM_FREE(dirrem, D_DIRREM);
9565		FREE_LOCK(&lk);
9566		goto out;
9567	}
9568	/*
9569	 * Directory deletion. Decrement reference count for both the
9570	 * just deleted parent directory entry and the reference for ".".
9571	 * Arrange to have the reference count on the parent decremented
9572	 * to account for the loss of "..".
9573	 */
9574	ip->i_nlink -= 2;
9575	DIP_SET(ip, i_nlink, ip->i_nlink);
9576	ip->i_flag |= IN_CHANGE;
9577	if (ip->i_nlink < ip->i_effnlink)
9578		panic("handle_workitem_remove: bad dir delta");
9579	if (ip->i_nlink == 0)
9580		unlinked_inodedep(mp, inodedep);
9581	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9582	/*
9583	 * Rename a directory to a new parent. Since we are both deleting
9584	 * and creating a new directory entry, the link count on the new
9585	 * directory should not change. Thus we skip the followup dirrem.
9586	 */
9587	if (dirrem->dm_state & DIRCHG) {
9588		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9589		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9590		WORKITEM_FREE(dirrem, D_DIRREM);
9591		FREE_LOCK(&lk);
9592		goto out;
9593	}
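	/*
	 * Convert the dirrem into a request to remove the parent's
	 * reference for "..".  With RMDIR no longer set, the next pass
	 * through handle_workitem_remove() (run once the removed
	 * directory's inode has been written) takes the normal-file path
	 * above and decrements the parent's link count.
	 */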
9594	dirrem->dm_state = ONDEPLIST;
9595	dirrem->dm_oldinum = dirrem->dm_dirinum;
9596	/*
9597	 * Place the dirrem on the parent's dirremhd list.
9598	 */
9599	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9600		panic("handle_workitem_remove: lost dir inodedep");
9601	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9602	/*
9603	 * If the allocated inode has never been written to disk, then
9604	 * the on-disk inode is zero'ed and we can remove the file
9605	 * immediately.  When journaling, if the inode has been marked
9606	 * unlinked and not DEPCOMPLETE, we know it can never be written.
9607	 */
9608	inodedep_lookup(mp, oldinum, 0, &inodedep);
9609	if (inodedep == NULL ||
9610	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9611	    check_inode_unwritten(inodedep)) {
9612		FREE_LOCK(&lk);
9613		vput(vp);
9614		return handle_workitem_remove(dirrem, flags);
9615	}
9616	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9617	FREE_LOCK(&lk);
9618	ip->i_flag |= IN_CHANGE;
9619out:
9620	ffs_update(vp, 0);
9621	vput(vp);
9622	return (0);
9623}
9624
9625/*
9626 * Inode de-allocation dependencies.
9627 *
9628 * When an inode's link count is reduced to zero, it can be de-allocated. We
9629 * found it convenient to postpone de-allocation until after the inode is
9630 * written to disk with its new link count (zero).  At this point, all of the
9631 * on-disk inode's block pointers are nullified and, with careful dependency
9632 * list ordering, all dependencies related to the inode will be satisfied and
9633 * the corresponding dependency structures de-allocated.  So, if/when the
9634 * inode is reused, there will be no mixing of old dependencies with new
9635 * ones.  This artificial dependency is set up by the block de-allocation
9636 * procedure above (softdep_setup_freeblocks) and completed by the
9637 * following procedure.
9638 */
9639static void
9640handle_workitem_freefile(freefile)
9641	struct freefile *freefile;
9642{
9643	struct workhead wkhd;
9644	struct fs *fs;
9645	struct inodedep *idp;
9646	struct ufsmount *ump;
9647	int error;
9648
9649	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9650	fs = ump->um_fs;
9651#ifdef DEBUG
9652	ACQUIRE_LOCK(&lk);
9653	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9654	FREE_LOCK(&lk);
9655	if (error)
9656		panic("handle_workitem_freefile: inodedep %p survived", idp);
9657#endif
9658	UFS_LOCK(ump);
9659	fs->fs_pendinginodes -= 1;
9660	UFS_UNLOCK(ump);
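	/*
	 * Hand any pending journal work to ffs_freefile() on a local
	 * list so it can be attached to the cylinder group buffer that
	 * records the inode as free.
	 */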
9661	LIST_INIT(&wkhd);
9662	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9663	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9664	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9665		softdep_error("handle_workitem_freefile", error);
9666	ACQUIRE_LOCK(&lk);
9667	WORKITEM_FREE(freefile, D_FREEFILE);
9668	FREE_LOCK(&lk);
9669}
9670
9671
9672/*
9673 * Helper function which unlinks marker element from work list and returns
9674 * the next element on the list.
9675 */
9676static __inline struct worklist *
9677markernext(struct worklist *marker)
9678{
9679	struct worklist *next;
9680
9681	next = LIST_NEXT(marker, wk_list);
9682	LIST_REMOVE(marker, wk_list);
9683	return next;
9684}
9685
9686/*
9687 * Disk writes.
9688 *
9689 * The dependency structures constructed above are most actively used when file
9690 * system blocks are written to disk.  No constraints are placed on when a
9691 * block can be written, but unsatisfied update dependencies are made safe by
9692 * modifying (or replacing) the source memory for the duration of the disk
9693 * write.  When the disk write completes, the memory block is again brought
9694 * up-to-date.
9695 *
9696 * In-core inode structure reclamation.
9697 *
9698 * Because there are a finite number of "in-core" inode structures, they are
9699 * reused regularly.  By transferring all inode-related dependencies to the
9700 * in-memory inode block and indexing them separately (via "inodedep"s), we
9701 * can allow "in-core" inode structures to be reused at any time and avoid
9702 * any increase in contention.
9703 *
9704 * Called just before entering the device driver to initiate a new disk I/O.
9705 * The buffer must be locked, thus, no I/O completion operations can occur
9706 * while we are manipulating its associated dependencies.
9707 */
9708static void
9709softdep_disk_io_initiation(bp)
9710	struct buf *bp;		/* structure describing disk write to occur */
9711{
9712	struct worklist *wk;
9713	struct worklist marker;
9714	struct inodedep *inodedep;
9715	struct freeblks *freeblks;
9716	struct jblkdep *jblkdep;
9717	struct newblk *newblk;
9718
9719	/*
9720	 * We only care about write operations. There should never
9721	 * be dependencies for reads.
9722	 */
9723	if (bp->b_iocmd != BIO_WRITE)
9724		panic("softdep_disk_io_initiation: not write");
9725
9726	if (bp->b_vflags & BV_BKGRDINPROG)
9727		panic("softdep_disk_io_initiation: Writing buffer with "
9728		    "background write in progress: %p", bp);
9729
9730	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9731	PHOLD(curproc);			/* Don't swap out kernel stack */
9732
9733	ACQUIRE_LOCK(&lk);
9734	/*
9735	 * Do any necessary pre-I/O processing.
9736	 */
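	/*
	 * The marker is linked in after each item we process so that the
	 * traversal can be resumed safely even if a jwait() below drops
	 * lk and the dependency list changes underneath us.
	 */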
9737	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9738	     wk = markernext(&marker)) {
9739		LIST_INSERT_AFTER(wk, &marker, wk_list);
9740		switch (wk->wk_type) {
9741
9742		case D_PAGEDEP:
9743			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9744			continue;
9745
9746		case D_INODEDEP:
9747			inodedep = WK_INODEDEP(wk);
9748			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9749				initiate_write_inodeblock_ufs1(inodedep, bp);
9750			else
9751				initiate_write_inodeblock_ufs2(inodedep, bp);
9752			continue;
9753
9754		case D_INDIRDEP:
9755			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9756			continue;
9757
9758		case D_BMSAFEMAP:
9759			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9760			continue;
9761
9762		case D_JSEG:
9763			WK_JSEG(wk)->js_buf = NULL;
9764			continue;
9765
9766		case D_FREEBLKS:
9767			freeblks = WK_FREEBLKS(wk);
9768			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9769			/*
9770			 * We have to wait for the freeblks to be journaled
9771			 * before we can write an inodeblock with updated
9772			 * pointers.  Be careful to arrange the marker so
9773			 * we revisit the freeblks if it's not removed by
9774			 * the first jwait().
9775			 */
9776			if (jblkdep != NULL) {
9777				LIST_REMOVE(&marker, wk_list);
9778				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9779				jwait(&jblkdep->jb_list, MNT_WAIT);
9780			}
9781			continue;
9782		case D_ALLOCDIRECT:
9783		case D_ALLOCINDIR:
9784			/*
9785			 * We have to wait for the jnewblk to be journaled
9786			 * before we can write to a block if the contents
9787			 * may be confused with an earlier file's indirect
9788			 * at recovery time.  Handle the marker as described
9789			 * above.
9790			 */
9791			newblk = WK_NEWBLK(wk);
9792			if (newblk->nb_jnewblk != NULL &&
9793			    indirblk_lookup(newblk->nb_list.wk_mp,
9794			    newblk->nb_newblkno)) {
9795				LIST_REMOVE(&marker, wk_list);
9796				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9797				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
9798			}
9799			continue;
9800
9801		case D_SBDEP:
9802			initiate_write_sbdep(WK_SBDEP(wk));
9803			continue;
9804
9805		case D_MKDIR:
9806		case D_FREEWORK:
9807		case D_FREEDEP:
9808		case D_JSEGDEP:
9809			continue;
9810
9811		default:
9812			panic("handle_disk_io_initiation: Unexpected type %s",
9813			    TYPENAME(wk->wk_type));
9814			/* NOTREACHED */
9815		}
9816	}
9817	FREE_LOCK(&lk);
9818	PRELE(curproc);			/* Allow swapout of kernel stack */
9819}
9820
9821/*
9822 * Called from within the procedure above to deal with unsatisfied
9823 * allocation dependencies in a directory. The buffer must be locked,
9824 * thus, no I/O completion operations can occur while we are
9825 * manipulating its associated dependencies.
9826 */
9827static void
9828initiate_write_filepage(pagedep, bp)
9829	struct pagedep *pagedep;
9830	struct buf *bp;
9831{
9832	struct jremref *jremref;
9833	struct jmvref *jmvref;
9834	struct dirrem *dirrem;
9835	struct diradd *dap;
9836	struct direct *ep;
9837	int i;
9838
9839	if (pagedep->pd_state & IOSTARTED) {
9840		/*
9841		 * This can only happen if there is a driver that does not
9842		 * understand chaining. Here biodone will reissue the call
9843		 * to strategy for the incomplete buffers.
9844		 */
9845		printf("initiate_write_filepage: already started\n");
9846		return;
9847	}
9848	pagedep->pd_state |= IOSTARTED;
9849	/*
9850	 * Wait for all journal remove dependencies to hit the disk.
9851	 * We cannot allow any potentially conflicting directory adds
9852	 * to be visible before removes, as rollback is too difficult.
9853	 * lk may be dropped and re-acquired, however we hold the buf
9854	 * locked so the dependency can not go away.
9855	 */
9856	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
9857		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
9858			jwait(&jremref->jr_list, MNT_WAIT);
9859	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
9860		jwait(&jmvref->jm_list, MNT_WAIT);
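	/*
	 * Roll back directory additions whose inodes are not yet safely
	 * on disk: entries created by a name change revert to the
	 * previous inode number, ordinary additions revert to an empty
	 * (zero inode) entry until the dependency is satisfied.
	 */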
9861	for (i = 0; i < DAHASHSZ; i++) {
9862		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
9863			ep = (struct direct *)
9864			    ((char *)bp->b_data + dap->da_offset);
9865			if (ep->d_ino != dap->da_newinum)
9866				panic("%s: dir inum %d != new %d",
9867				    "initiate_write_filepage",
9868				    ep->d_ino, dap->da_newinum);
9869			if (dap->da_state & DIRCHG)
9870				ep->d_ino = dap->da_previous->dm_oldinum;
9871			else
9872				ep->d_ino = 0;
9873			dap->da_state &= ~ATTACHED;
9874			dap->da_state |= UNDONE;
9875		}
9876	}
9877}
9878
9879/*
9880 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
9881 * Note that any bug fixes made to this routine must be done in the
9882 * version found below.
9883 *
9884 * Called from within the procedure above to deal with unsatisfied
9885 * allocation dependencies in an inodeblock. The buffer must be
9886 * locked, thus, no I/O completion operations can occur while we
9887 * are manipulating its associated dependencies.
9888 */
9889static void
9890initiate_write_inodeblock_ufs1(inodedep, bp)
9891	struct inodedep *inodedep;
9892	struct buf *bp;			/* The inode block */
9893{
9894	struct allocdirect *adp, *lastadp;
9895	struct ufs1_dinode *dp;
9896	struct ufs1_dinode *sip;
9897	struct inoref *inoref;
9898	struct fs *fs;
9899	ufs_lbn_t i;
9900#ifdef INVARIANTS
9901	ufs_lbn_t prevlbn = 0;
9902#endif
9903	int deplist;
9904
9905	if (inodedep->id_state & IOSTARTED)
9906		panic("initiate_write_inodeblock_ufs1: already started");
9907	inodedep->id_state |= IOSTARTED;
9908	fs = inodedep->id_fs;
9909	dp = (struct ufs1_dinode *)bp->b_data +
9910	    ino_to_fsbo(fs, inodedep->id_ino);
9911
9912	/*
9913	 * If we're on the unlinked list but have not yet written our
9914	 * next pointer, initialize it here.
9915	 */
9916	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9917		struct inodedep *inon;
9918
9919		inon = TAILQ_NEXT(inodedep, id_unlinked);
9920		dp->di_freelink = inon ? inon->id_ino : 0;
9921	}
9922	/*
9923	 * If the bitmap is not yet written, then the allocated
9924	 * inode cannot be written to disk.
9925	 */
9926	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
9927		if (inodedep->id_savedino1 != NULL)
9928			panic("initiate_write_inodeblock_ufs1: I/O underway");
9929		FREE_LOCK(&lk);
9930		sip = malloc(sizeof(struct ufs1_dinode),
9931		    M_SAVEDINO, M_SOFTDEP_FLAGS);
9932		ACQUIRE_LOCK(&lk);
9933		inodedep->id_savedino1 = sip;
9934		*inodedep->id_savedino1 = *dp;
9935		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
9936		dp->di_gen = inodedep->id_savedino1->di_gen;
9937		dp->di_freelink = inodedep->id_savedino1->di_freelink;
9938		return;
9939	}
9940	/*
9941	 * If no dependencies, then there is nothing to roll back.
9942	 */
9943	inodedep->id_savedsize = dp->di_size;
9944	inodedep->id_savedextsize = 0;
9945	inodedep->id_savednlink = dp->di_nlink;
9946	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
9947	    TAILQ_EMPTY(&inodedep->id_inoreflst))
9948		return;
9949	/*
9950	 * Revert the link count to that of the first unwritten journal entry.
9951	 */
9952	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
9953	if (inoref)
9954		dp->di_nlink = inoref->if_nlink;
9955	/*
9956	 * Set the dependencies to busy.
9957	 */
9958	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9959	     adp = TAILQ_NEXT(adp, ad_next)) {
9960#ifdef INVARIANTS
9961		if (deplist != 0 && prevlbn >= adp->ad_offset)
9962			panic("softdep_write_inodeblock: lbn order");
9963		prevlbn = adp->ad_offset;
9964		if (adp->ad_offset < NDADDR &&
9965		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
9966			panic("%s: direct pointer #%jd mismatch %d != %jd",
9967			    "softdep_write_inodeblock",
9968			    (intmax_t)adp->ad_offset,
9969			    dp->di_db[adp->ad_offset],
9970			    (intmax_t)adp->ad_newblkno);
9971		if (adp->ad_offset >= NDADDR &&
9972		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
9973			panic("%s: indirect pointer #%jd mismatch %d != %jd",
9974			    "softdep_write_inodeblock",
9975			    (intmax_t)adp->ad_offset - NDADDR,
9976			    dp->di_ib[adp->ad_offset - NDADDR],
9977			    (intmax_t)adp->ad_newblkno);
9978		deplist |= 1 << adp->ad_offset;
9979		if ((adp->ad_state & ATTACHED) == 0)
9980			panic("softdep_write_inodeblock: Unknown state 0x%x",
9981			    adp->ad_state);
9982#endif /* INVARIANTS */
9983		adp->ad_state &= ~ATTACHED;
9984		adp->ad_state |= UNDONE;
9985	}
9986	/*
9987	 * The on-disk inode cannot claim to be any larger than the last
9988	 * fragment that has been written. Otherwise, the on-disk inode
9989	 * might have fragments that were not the last block in the file,
9990	 * which would corrupt the filesystem.
9991	 */
9992	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9993	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9994		if (adp->ad_offset >= NDADDR)
9995			break;
9996		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
9997		/* keep going until hitting a rollback to a frag */
9998		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
9999			continue;
10000		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10001		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10002#ifdef INVARIANTS
10003			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10004				panic("softdep_write_inodeblock: lost dep1");
10005#endif /* INVARIANTS */
10006			dp->di_db[i] = 0;
10007		}
10008		for (i = 0; i < NIADDR; i++) {
10009#ifdef INVARIANTS
10010			if (dp->di_ib[i] != 0 &&
10011			    (deplist & ((1 << NDADDR) << i)) == 0)
10012				panic("softdep_write_inodeblock: lost dep2");
10013#endif /* INVARIANTS */
10014			dp->di_ib[i] = 0;
10015		}
10016		return;
10017	}
10018	/*
10019	 * If we have zero'ed out the last allocated block of the file,
10020	 * roll back the size to the last currently allocated block.
10021	 * We know that this last allocated block is full-sized, as
10022	 * we already checked for fragments in the loop above.
10023	 */
10024	if (lastadp != NULL &&
10025	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10026		for (i = lastadp->ad_offset; i >= 0; i--)
10027			if (dp->di_db[i] != 0)
10028				break;
10029		dp->di_size = (i + 1) * fs->fs_bsize;
10030	}
10031	/*
10032	 * The only dependencies are for indirect blocks.
10033	 *
10034	 * The file size for indirect block additions is not guaranteed.
10035	 * Such a guarantee would be non-trivial to achieve. The conventional
10036	 * synchronous write implementation also does not make this guarantee.
10037	 * Fsck should catch and fix discrepancies. Arguably, the file size
10038	 * can be over-estimated without destroying integrity when the file
10039	 * moves into the indirect blocks (i.e., is large). If we want to
10040	 * postpone fsck, we are stuck with this argument.
10041	 */
10042	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10043		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10044}
10045
10046/*
10047 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10048 * Note that any bug fixes made to this routine must be done in the
10049 * version found above.
10050 *
10051 * Called from within the procedure above to deal with unsatisfied
10052 * allocation dependencies in an inodeblock. The buffer must be
10053 * locked, thus, no I/O completion operations can occur while we
10054 * are manipulating its associated dependencies.
10055 */
10056static void
10057initiate_write_inodeblock_ufs2(inodedep, bp)
10058	struct inodedep *inodedep;
10059	struct buf *bp;			/* The inode block */
10060{
10061	struct allocdirect *adp, *lastadp;
10062	struct ufs2_dinode *dp;
10063	struct ufs2_dinode *sip;
10064	struct inoref *inoref;
10065	struct fs *fs;
10066	ufs_lbn_t i;
10067#ifdef INVARIANTS
10068	ufs_lbn_t prevlbn = 0;
10069#endif
10070	int deplist;
10071
10072	if (inodedep->id_state & IOSTARTED)
10073		panic("initiate_write_inodeblock_ufs2: already started");
10074	inodedep->id_state |= IOSTARTED;
10075	fs = inodedep->id_fs;
10076	dp = (struct ufs2_dinode *)bp->b_data +
10077	    ino_to_fsbo(fs, inodedep->id_ino);
10078
10079	/*
10080	 * If we're on the unlinked list but have not yet written our
10081	 * next pointer, initialize it here.
10082	 */
10083	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10084		struct inodedep *inon;
10085
10086		inon = TAILQ_NEXT(inodedep, id_unlinked);
10087		dp->di_freelink = inon ? inon->id_ino : 0;
10088	}
10089	/*
10090	 * If the bitmap is not yet written, then the allocated
10091	 * inode cannot be written to disk.
10092	 */
10093	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10094		if (inodedep->id_savedino2 != NULL)
10095			panic("initiate_write_inodeblock_ufs2: I/O underway");
10096		FREE_LOCK(&lk);
10097		sip = malloc(sizeof(struct ufs2_dinode),
10098		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10099		ACQUIRE_LOCK(&lk);
10100		inodedep->id_savedino2 = sip;
10101		*inodedep->id_savedino2 = *dp;
10102		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10103		dp->di_gen = inodedep->id_savedino2->di_gen;
10104		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10105		return;
10106	}
10107	/*
10108	 * If no dependencies, then there is nothing to roll back.
10109	 */
10110	inodedep->id_savedsize = dp->di_size;
10111	inodedep->id_savedextsize = dp->di_extsize;
10112	inodedep->id_savednlink = dp->di_nlink;
10113	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10114	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10115	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10116		return;
10117	/*
10118	 * Revert the link count to that of the first unwritten journal entry.
10119	 */
10120	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10121	if (inoref)
10122		dp->di_nlink = inoref->if_nlink;
10123
10124	/*
10125	 * Set the ext data dependencies to busy.
10126	 */
10127	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10128	     adp = TAILQ_NEXT(adp, ad_next)) {
10129#ifdef INVARIANTS
10130		if (deplist != 0 && prevlbn >= adp->ad_offset)
10131			panic("softdep_write_inodeblock: lbn order");
10132		prevlbn = adp->ad_offset;
10133		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10134			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10135			    "softdep_write_inodeblock",
10136			    (intmax_t)adp->ad_offset,
10137			    (intmax_t)dp->di_extb[adp->ad_offset],
10138			    (intmax_t)adp->ad_newblkno);
10139		deplist |= 1 << adp->ad_offset;
10140		if ((adp->ad_state & ATTACHED) == 0)
10141			panic("softdep_write_inodeblock: Unknown state 0x%x",
10142			    adp->ad_state);
10143#endif /* INVARIANTS */
10144		adp->ad_state &= ~ATTACHED;
10145		adp->ad_state |= UNDONE;
10146	}
10147	/*
10148	 * The on-disk inode cannot claim to be any larger than the last
10149	 * fragment that has been written. Otherwise, the on-disk inode
10150	 * might have fragments that were not the last block in the ext
10151	 * data, which would corrupt the filesystem.
10152	 */
10153	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10154	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10155		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10156		/* keep going until hitting a rollback to a frag */
10157		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10158			continue;
10159		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10160		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10161#ifdef INVARIANTS
10162			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10163				panic("softdep_write_inodeblock: lost dep1");
10164#endif /* INVARIANTS */
10165			dp->di_extb[i] = 0;
10166		}
10167		lastadp = NULL;
10168		break;
10169	}
10170	/*
10171	 * If we have zero'ed out the last allocated block of the ext
10172	 * data, roll back the size to the last currently allocated block.
10173	 * We know that this last allocated block is full-sized, as
10174	 * we already checked for fragments in the loop above.
10175	 */
10176	if (lastadp != NULL &&
10177	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10178		for (i = lastadp->ad_offset; i >= 0; i--)
10179			if (dp->di_extb[i] != 0)
10180				break;
10181		dp->di_extsize = (i + 1) * fs->fs_bsize;
10182	}
10183	/*
10184	 * Set the file data dependencies to busy.
10185	 */
10186	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10187	     adp = TAILQ_NEXT(adp, ad_next)) {
10188#ifdef INVARIANTS
10189		if (deplist != 0 && prevlbn >= adp->ad_offset)
10190			panic("softdep_write_inodeblock: lbn order");
10191		if ((adp->ad_state & ATTACHED) == 0)
10192			panic("inodedep %p and adp %p not attached", inodedep, adp);
10193		prevlbn = adp->ad_offset;
10194		if (adp->ad_offset < NDADDR &&
10195		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10196			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10197			    "softdep_write_inodeblock",
10198			    (intmax_t)adp->ad_offset,
10199			    (intmax_t)dp->di_db[adp->ad_offset],
10200			    (intmax_t)adp->ad_newblkno);
10201		if (adp->ad_offset >= NDADDR &&
10202		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10203			panic("%s indirect pointer #%jd mismatch %jd != %jd",
10204			    "softdep_write_inodeblock:",
10205			    (intmax_t)adp->ad_offset - NDADDR,
10206			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10207			    (intmax_t)adp->ad_newblkno);
10208		deplist |= 1 << adp->ad_offset;
10209		if ((adp->ad_state & ATTACHED) == 0)
10210			panic("softdep_write_inodeblock: Unknown state 0x%x",
10211			    adp->ad_state);
10212#endif /* INVARIANTS */
10213		adp->ad_state &= ~ATTACHED;
10214		adp->ad_state |= UNDONE;
10215	}
10216	/*
10217	 * The on-disk inode cannot claim to be any larger than the last
10218	 * fragment that has been written. Otherwise, the on-disk inode
10219	 * might have fragments that were not the last block in the file,
10220	 * which would corrupt the filesystem.
10221	 */
10222	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10223	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10224		if (adp->ad_offset >= NDADDR)
10225			break;
10226		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10227		/* keep going until hitting a rollback to a frag */
10228		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10229			continue;
10230		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10231		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10232#ifdef INVARIANTS
10233			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10234				panic("softdep_write_inodeblock: lost dep2");
10235#endif /* INVARIANTS */
10236			dp->di_db[i] = 0;
10237		}
10238		for (i = 0; i < NIADDR; i++) {
10239#ifdef INVARIANTS
10240			if (dp->di_ib[i] != 0 &&
10241			    (deplist & ((1 << NDADDR) << i)) == 0)
10242				panic("softdep_write_inodeblock: lost dep3");
10243#endif /* INVARIANTS */
10244			dp->di_ib[i] = 0;
10245		}
10246		return;
10247	}
10248	/*
10249	 * If we have zero'ed out the last allocated block of the file,
10250	 * roll back the size to the last currently allocated block.
10251	 * We know that this last allocated block is full-sized, as
10252	 * we already checked for fragments in the loop above.
10253	 */
10254	if (lastadp != NULL &&
10255	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10256		for (i = lastadp->ad_offset; i >= 0; i--)
10257			if (dp->di_db[i] != 0)
10258				break;
10259		dp->di_size = (i + 1) * fs->fs_bsize;
10260	}
10261	/*
10262	 * The only dependencies are for indirect blocks.
10263	 *
10264	 * The file size for indirect block additions is not guaranteed.
10265	 * Such a guarantee would be non-trivial to achieve. The conventional
10266	 * synchronous write implementation also does not make this guarantee.
10267	 * Fsck should catch and fix discrepancies. Arguably, the file size
10268	 * can be over-estimated without destroying integrity when the file
10269	 * moves into the indirect blocks (i.e., is large). If we want to
10270	 * postpone fsck, we are stuck with this argument.
10271	 */
10272	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10273		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10274}
10275
10276/*
10277 * Cancel an indirdep as a result of truncation.  Release all of the
10278 * children allocindirs and place their journal work on the appropriate
10279 * list.
10280 */
10281static void
10282cancel_indirdep(indirdep, bp, freeblks)
10283	struct indirdep *indirdep;
10284	struct buf *bp;
10285	struct freeblks *freeblks;
10286{
10287	struct allocindir *aip;
10288
10289	/*
10290	 * None of the indirect pointers will ever be visible,
10291	 * so they can simply be tossed. GOINGAWAY ensures
10292	 * that allocated pointers will be saved in the buffer
10293	 * cache until they are freed. Note that they will
10294	 * only be able to be found by their physical address
10295	 * since the inode mapping the logical address will
10296	 * be gone. The save buffer used for the safe copy
10297	 * was allocated in setup_allocindir_phase2 using
10298	 * the physical address so it could be used for this
10299	 * purpose. Hence we swap the safe copy with the real
10300	 * copy, allowing the safe copy to be freed and holding
10301	 * on to the real copy for later use in indir_trunc.
10302	 */
10303	if (indirdep->ir_state & GOINGAWAY)
10304		panic("cancel_indirdep: already gone");
10305	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10306		indirdep->ir_state |= DEPCOMPLETE;
10307		LIST_REMOVE(indirdep, ir_next);
10308	}
10309	indirdep->ir_state |= GOINGAWAY;
10310	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
10311	/*
	 * Pass in bp for blocks that still have journal writes
	 * pending so we can cancel them on their own.
10314	 */
10315	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10316		cancel_allocindir(aip, bp, freeblks, 0);
10317	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10318		cancel_allocindir(aip, NULL, freeblks, 0);
10319	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10320		cancel_allocindir(aip, NULL, freeblks, 0);
10321	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10322		cancel_allocindir(aip, NULL, freeblks, 0);
10323	/*
10324	 * If there are pending partial truncations we need to keep the
10325	 * old block copy around until they complete.  This is because
10326	 * the current b_data is not a perfect superset of the available
10327	 * blocks.
10328	 */
10329	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10330		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10331	else
10332		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
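	/*
	 * Hand the indirdep off to the saved buffer and record the
	 * freeblks so that indir_trunc can later find the preserved
	 * pointers.
	 */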
10333	WORKLIST_REMOVE(&indirdep->ir_list);
10334	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10335	indirdep->ir_bp = NULL;
10336	indirdep->ir_freeblks = freeblks;
10337}
10338
10339/*
10340 * Free an indirdep once it no longer has new pointers to track.
10341 */
10342static void
10343free_indirdep(indirdep)
10344	struct indirdep *indirdep;
10345{
10346
10347	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10348	    ("free_indirdep: Indir trunc list not empty."));
10349	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10350	    ("free_indirdep: Complete head not empty."));
10351	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10352	    ("free_indirdep: write head not empty."));
10353	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10354	    ("free_indirdep: done head not empty."));
10355	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10356	    ("free_indirdep: deplist head not empty."));
10357	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10358	    ("free_indirdep: %p still on newblk list.", indirdep));
10359	KASSERT(indirdep->ir_saveddata == NULL,
10360	    ("free_indirdep: %p still has saved data.", indirdep));
10361	if (indirdep->ir_state & ONWORKLIST)
10362		WORKLIST_REMOVE(&indirdep->ir_list);
10363	WORKITEM_FREE(indirdep, D_INDIRDEP);
10364}
10365
10366/*
10367 * Called before a write to an indirdep.  This routine is responsible for
10368 * rolling back pointers to a safe state which includes only those
10369 * allocindirs which have been completed.
10370 */
10371static void
10372initiate_write_indirdep(indirdep, bp)
10373	struct indirdep *indirdep;
10374	struct buf *bp;
10375{
10376
10377	indirdep->ir_state |= IOSTARTED;
10378	if (indirdep->ir_state & GOINGAWAY)
10379		panic("disk_io_initiation: indirdep gone");
10380	/*
10381	 * If there are no remaining dependencies, this will be writing
10382	 * the real pointers.
10383	 */
10384	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10385	    TAILQ_EMPTY(&indirdep->ir_trunc))
10386		return;
10387	/*
10388	 * Replace up-to-date version with safe version.
10389	 */
10390	if (indirdep->ir_saveddata == NULL) {
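		/*
		 * The allocation may sleep, so drop the softdep lock
		 * across it and retake it before touching the indirdep.
		 */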
10391		FREE_LOCK(&lk);
10392		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10393		    M_SOFTDEP_FLAGS);
10394		ACQUIRE_LOCK(&lk);
10395	}
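	/*
	 * Mark the indirdep rolled back, stash the up-to-date contents
	 * in ir_saveddata, and write the safe copy from ir_savebp in
	 * its place.
	 */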
10396	indirdep->ir_state &= ~ATTACHED;
10397	indirdep->ir_state |= UNDONE;
10398	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10399	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10400	    bp->b_bcount);
10401}
10402
10403/*
10404 * Called when an inode has been cleared in a cg bitmap.  This finally
 * eliminates any canceled jaddrefs.
10406 */
10407void
10408softdep_setup_inofree(mp, bp, ino, wkhd)
10409	struct mount *mp;
10410	struct buf *bp;
10411	ino_t ino;
10412	struct workhead *wkhd;
10413{
10414	struct worklist *wk, *wkn;
10415	struct inodedep *inodedep;
10416	uint8_t *inosused;
10417	struct cg *cgp;
10418	struct fs *fs;
10419
10420	ACQUIRE_LOCK(&lk);
10421	fs = VFSTOUFS(mp)->um_fs;
10422	cgp = (struct cg *)bp->b_data;
10423	inosused = cg_inosused(cgp);
10424	if (isset(inosused, ino % fs->fs_ipg))
10425		panic("softdep_setup_inofree: inode %d not freed.", ino);
10426	if (inodedep_lookup(mp, ino, 0, &inodedep))
10427		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
10428		    ino, inodedep);
10429	if (wkhd) {
10430		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10431			if (wk->wk_type != D_JADDREF)
10432				continue;
10433			WORKLIST_REMOVE(wk);
10434			/*
10435			 * We can free immediately even if the jaddref
			 * isn't attached in a background write, as the
			 * bitmaps are now reconciled.
10438			 */
10439			wk->wk_state |= COMPLETE | ATTACHED;
10440			free_jaddref(WK_JADDREF(wk));
10441		}
10442		jwork_move(&bp->b_dep, wkhd);
10443	}
10444	FREE_LOCK(&lk);
10445}
10446
10448/*
10449 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10450 * map.  Any dependencies waiting for the write to clear are added to the
10451 * buf's list and any jnewblks that are being canceled are discarded
10452 * immediately.
10453 */
10454void
10455softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10456	struct mount *mp;
10457	struct buf *bp;
10458	ufs2_daddr_t blkno;
10459	int frags;
10460	struct workhead *wkhd;
10461{
10462	struct bmsafemap *bmsafemap;
10463	struct jnewblk *jnewblk;
10464	struct worklist *wk;
10465	struct fs *fs;
10466#ifdef SUJ_DEBUG
10467	uint8_t *blksfree;
10468	struct cg *cgp;
10469	ufs2_daddr_t jstart;
10470	ufs2_daddr_t jend;
10471	ufs2_daddr_t end;
10472	long bno;
10473	int i;
10474#endif
10475
10476	CTR3(KTR_SUJ,
10477	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10478	    blkno, frags, wkhd);
10479
10480	ACQUIRE_LOCK(&lk);
10481	/* Lookup the bmsafemap so we track when it is dirty. */
10482	fs = VFSTOUFS(mp)->um_fs;
10483	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10484	/*
10485	 * Detach any jnewblks which have been canceled.  They must linger
10486	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10487	 * an unjournaled allocation from hitting the disk.
10488	 */
10489	if (wkhd) {
10490		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10491			CTR2(KTR_SUJ,
10492			    "softdep_setup_blkfree: blkno %jd wk type %d",
10493			    blkno, wk->wk_type);
10494			WORKLIST_REMOVE(wk);
10495			if (wk->wk_type != D_JNEWBLK) {
10496				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10497				continue;
10498			}
10499			jnewblk = WK_JNEWBLK(wk);
10500			KASSERT(jnewblk->jn_state & GOINGAWAY,
10501			    ("softdep_setup_blkfree: jnewblk not canceled."));
10502#ifdef SUJ_DEBUG
10503			/*
10504			 * Assert that this block is free in the bitmap
10505			 * before we discard the jnewblk.
10506			 */
10507			cgp = (struct cg *)bp->b_data;
10508			blksfree = cg_blksfree(cgp);
10509			bno = dtogd(fs, jnewblk->jn_blkno);
10510			for (i = jnewblk->jn_oldfrags;
10511			    i < jnewblk->jn_frags; i++) {
10512				if (isset(blksfree, bno + i))
10513					continue;
10514				panic("softdep_setup_blkfree: not free");
10515			}
10516#endif
10517			/*
10518			 * Even if it's not attached we can free immediately
10519			 * as the new bitmap is correct.
10520			 */
10521			wk->wk_state |= COMPLETE | ATTACHED;
10522			free_jnewblk(jnewblk);
10523		}
10524	}
10525
10526#ifdef SUJ_DEBUG
10527	/*
10528	 * Assert that we are not freeing a block which has an outstanding
10529	 * allocation dependency.
10530	 */
10531	fs = VFSTOUFS(mp)->um_fs;
10532	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10533	end = blkno + frags;
10534	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10535		/*
10536		 * Don't match against blocks that will be freed when the
10537		 * background write is done.
10538		 */
10539		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10540		    (COMPLETE | DEPCOMPLETE))
10541			continue;
10542		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10543		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10544		if ((blkno >= jstart && blkno < jend) ||
10545		    (end > jstart && end <= jend)) {
10546			printf("state 0x%X %jd - %d %d dep %p\n",
10547			    jnewblk->jn_state, jnewblk->jn_blkno,
10548			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10549			    jnewblk->jn_dep);
10550			panic("softdep_setup_blkfree: "
10551			    "%jd-%jd(%d) overlaps with %jd-%jd",
10552			    blkno, end, frags, jstart, jend);
10553		}
10554	}
10555#endif
10556	FREE_LOCK(&lk);
10557}
10558
10559/*
10560 * Revert a block allocation when the journal record that describes it
10561 * is not yet written.
10562 */
10563int
10564jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10565	struct jnewblk *jnewblk;
10566	struct fs *fs;
10567	struct cg *cgp;
10568	uint8_t *blksfree;
10569{
10570	ufs1_daddr_t fragno;
10571	long cgbno, bbase;
10572	int frags, blk;
10573	int i;
10574
10575	frags = 0;
10576	cgbno = dtogd(fs, jnewblk->jn_blkno);
10577	/*
10578	 * We have to test which frags need to be rolled back.  We may
10579	 * be operating on a stale copy when doing background writes.
10580	 */
10581	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10582		if (isclr(blksfree, cgbno + i))
10583			frags++;
10584	if (frags == 0)
10585		return (0);
10586	/*
10587	 * This is mostly ffs_blkfree() sans some validation and
10588	 * superblock updates.
10589	 */
10590	if (frags == fs->fs_frag) {
10591		fragno = fragstoblks(fs, cgbno);
10592		ffs_setblock(fs, blksfree, fragno);
10593		ffs_clusteracct(fs, cgp, fragno, 1);
10594		cgp->cg_cs.cs_nbfree++;
10595	} else {
10596		cgbno += jnewblk->jn_oldfrags;
10597		bbase = cgbno - fragnum(fs, cgbno);
10598		/* Decrement the old frags.  */
10599		blk = blkmap(fs, blksfree, bbase);
10600		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10601		/* Deallocate the fragment */
10602		for (i = 0; i < frags; i++)
10603			setbit(blksfree, cgbno + i);
10604		cgp->cg_cs.cs_nffree += frags;
10605		/* Add back in counts associated with the new frags */
10606		blk = blkmap(fs, blksfree, bbase);
10607		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10608		/* If a complete block has been reassembled, account for it. */
10609		fragno = fragstoblks(fs, bbase);
10610		if (ffs_isblock(fs, blksfree, fragno)) {
10611			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10612			ffs_clusteracct(fs, cgp, fragno, 1);
10613			cgp->cg_cs.cs_nbfree++;
10614		}
10615	}
10616	stat_jnewblk++;
10617	jnewblk->jn_state &= ~ATTACHED;
10618	jnewblk->jn_state |= UNDONE;
10619
10620	return (frags);
10621}
10622
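/*
 * Called just before a cylinder group bitmap block is written.  Any
 * inode or block allocations whose journal records have not yet been
 * written are rolled back in the cg copy so that an unjournaled
 * allocation never reaches the disk.  The allocation lists are moved
 * to their "written" counterparts so they can be resolved when the
 * write completes.
 */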
10623static void
10624initiate_write_bmsafemap(bmsafemap, bp)
10625	struct bmsafemap *bmsafemap;
10626	struct buf *bp;			/* The cg block. */
10627{
10628	struct jaddref *jaddref;
10629	struct jnewblk *jnewblk;
10630	uint8_t *inosused;
10631	uint8_t *blksfree;
10632	struct cg *cgp;
10633	struct fs *fs;
10634	ino_t ino;
10635
10636	if (bmsafemap->sm_state & IOSTARTED)
10637		return;
10638	bmsafemap->sm_state |= IOSTARTED;
10639	/*
10640	 * Clear any inode allocations which are pending journal writes.
10641	 */
10642	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10643		cgp = (struct cg *)bp->b_data;
10644		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10645		inosused = cg_inosused(cgp);
10646		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10647			ino = jaddref->ja_ino % fs->fs_ipg;
10648			if (isset(inosused, ino)) {
10649				if ((jaddref->ja_mode & IFMT) == IFDIR)
10650					cgp->cg_cs.cs_ndir--;
10651				cgp->cg_cs.cs_nifree++;
10652				clrbit(inosused, ino);
10653				jaddref->ja_state &= ~ATTACHED;
10654				jaddref->ja_state |= UNDONE;
10655				stat_jaddref++;
10656			} else
10657				panic("initiate_write_bmsafemap: inode %d "
10658				    "marked free", jaddref->ja_ino);
10659		}
10660	}
10661	/*
10662	 * Clear any block allocations which are pending journal writes.
10663	 */
10664	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10665		cgp = (struct cg *)bp->b_data;
10666		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10667		blksfree = cg_blksfree(cgp);
10668		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10669			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10670				continue;
10671			panic("initiate_write_bmsafemap: block %jd "
10672			    "marked free", jnewblk->jn_blkno);
10673		}
10674	}
10675	/*
10676	 * Move allocation lists to the written lists so they can be
10677	 * cleared once the block write is complete.
10678	 */
10679	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10680	    inodedep, id_deps);
10681	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10682	    newblk, nb_deps);
10683	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10684	    wk_list);
10685}
10686
10687/*
10688 * This routine is called during the completion interrupt
10689 * service routine for a disk write (from the procedure called
10690 * by the device driver to inform the filesystem caches of
10691 * a request completion).  It should be called early in this
10692 * procedure, before the block is made available to other
10693 * processes or other routines are called.
10695 */
10696static void
10697softdep_disk_write_complete(bp)
10698	struct buf *bp;		/* describes the completed disk write */
10699{
10700	struct worklist *wk;
10701	struct worklist *owk;
10702	struct workhead reattach;
10703	struct freeblks *freeblks;
10704	struct buf *sbp;
10705
10706	/*
10707	 * If an error occurred while doing the write, then the data
10708	 * has not hit the disk and the dependencies cannot be unrolled.
10709	 */
10710	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10711		return;
10712	LIST_INIT(&reattach);
10713	/*
10714	 * This lock must not be released anywhere in this code segment.
10715	 */
10716	sbp = NULL;
10717	owk = NULL;
10718	ACQUIRE_LOCK(&lk);
10719	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10720		WORKLIST_REMOVE(wk);
10721		dep_write[wk->wk_type]++;
10722		if (wk == owk)
10723			panic("duplicate worklist: %p\n", wk);
10724		owk = wk;
10725		switch (wk->wk_type) {
10726
10727		case D_PAGEDEP:
10728			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10729				WORKLIST_INSERT(&reattach, wk);
10730			continue;
10731
10732		case D_INODEDEP:
10733			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10734				WORKLIST_INSERT(&reattach, wk);
10735			continue;
10736
10737		case D_BMSAFEMAP:
10738			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10739				WORKLIST_INSERT(&reattach, wk);
10740			continue;
10741
10742		case D_MKDIR:
10743			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10744			continue;
10745
10746		case D_ALLOCDIRECT:
10747			wk->wk_state |= COMPLETE;
10748			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10749			continue;
10750
10751		case D_ALLOCINDIR:
10752			wk->wk_state |= COMPLETE;
10753			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
10754			continue;
10755
10756		case D_INDIRDEP:
10757			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
10758				WORKLIST_INSERT(&reattach, wk);
10759			continue;
10760
10761		case D_FREEBLKS:
10762			wk->wk_state |= COMPLETE;
10763			freeblks = WK_FREEBLKS(wk);
10764			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
10765			    LIST_EMPTY(&freeblks->fb_jblkdephd))
10766				add_to_worklist(wk, WK_NODELAY);
10767			continue;
10768
10769		case D_FREEWORK:
10770			handle_written_freework(WK_FREEWORK(wk));
10771			break;
10772
10773		case D_JSEGDEP:
10774			free_jsegdep(WK_JSEGDEP(wk));
10775			continue;
10776
10777		case D_JSEG:
10778			handle_written_jseg(WK_JSEG(wk), bp);
10779			continue;
10780
10781		case D_SBDEP:
10782			if (handle_written_sbdep(WK_SBDEP(wk), bp))
10783				WORKLIST_INSERT(&reattach, wk);
10784			continue;
10785
10786		case D_FREEDEP:
10787			free_freedep(WK_FREEDEP(wk));
10788			continue;
10789
10790		default:
10791			panic("handle_disk_write_complete: Unknown type %s",
10792			    TYPENAME(wk->wk_type));
10793			/* NOTREACHED */
10794		}
10795	}
10796	/*
10797	 * Reattach any requests that must be redone.
10798	 */
10799	while ((wk = LIST_FIRST(&reattach)) != NULL) {
10800		WORKLIST_REMOVE(wk);
10801		WORKLIST_INSERT(&bp->b_dep, wk);
10802	}
10803	FREE_LOCK(&lk);
10804	if (sbp)
10805		brelse(sbp);
10806}
10807
10808/*
10809 * Called from within softdep_disk_write_complete above. Note that
10810 * this routine is always called from interrupt level with further
10811 * splbio interrupts blocked.
10812 */
10813static void
10814handle_allocdirect_partdone(adp, wkhd)
10815	struct allocdirect *adp;	/* the completed allocdirect */
	struct workhead *wkhd;		/* Work to do when inode is written. */
10817{
10818	struct allocdirectlst *listhead;
10819	struct allocdirect *listadp;
10820	struct inodedep *inodedep;
10821	long bsize;
10822
10823	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10824		return;
10825	/*
10826	 * The on-disk inode cannot claim to be any larger than the last
10827	 * fragment that has been written. Otherwise, the on-disk inode
10828	 * might have fragments that were not the last block in the file
10829	 * which would corrupt the filesystem. Thus, we cannot free any
10830	 * allocdirects after one whose ad_oldblkno claims a fragment as
10831	 * these blocks must be rolled back to zero before writing the inode.
10832	 * We check the currently active set of allocdirects in id_inoupdt
10833	 * or id_extupdt as appropriate.
10834	 */
10835	inodedep = adp->ad_inodedep;
10836	bsize = inodedep->id_fs->fs_bsize;
10837	if (adp->ad_state & EXTDATA)
10838		listhead = &inodedep->id_extupdt;
10839	else
10840		listhead = &inodedep->id_inoupdt;
10841	TAILQ_FOREACH(listadp, listhead, ad_next) {
10842		/* found our block */
10843		if (listadp == adp)
10844			break;
10845		/* continue if ad_oldlbn is not a fragment */
10846		if (listadp->ad_oldsize == 0 ||
10847		    listadp->ad_oldsize == bsize)
10848			continue;
10849		/* hit a fragment */
10850		return;
10851	}
10852	/*
10853	 * If we have reached the end of the current list without
10854	 * finding the just finished dependency, then it must be
10855	 * on the future dependency list. Future dependencies cannot
10856	 * be freed until they are moved to the current list.
10857	 */
10858	if (listadp == NULL) {
10859#ifdef DEBUG
10860		if (adp->ad_state & EXTDATA)
10861			listhead = &inodedep->id_newextupdt;
10862		else
10863			listhead = &inodedep->id_newinoupdt;
10864		TAILQ_FOREACH(listadp, listhead, ad_next)
10865			/* found our block */
10866			if (listadp == adp)
10867				break;
10868		if (listadp == NULL)
10869			panic("handle_allocdirect_partdone: lost dep");
10870#endif /* DEBUG */
10871		return;
10872	}
10873	/*
10874	 * If we have found the just finished dependency, then queue
10875	 * it along with anything that follows it that is complete.
10876	 * Since the pointer has not yet been written in the inode
10877	 * as the dependency prevents it, place the allocdirect on the
10878	 * bufwait list where it will be freed once the pointer is
10879	 * valid.
10880	 */
10881	if (wkhd == NULL)
10882		wkhd = &inodedep->id_bufwait;
10883	for (; adp; adp = listadp) {
10884		listadp = TAILQ_NEXT(adp, ad_next);
10885		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10886			return;
10887		TAILQ_REMOVE(listhead, adp, ad_next);
10888		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
10889	}
10890}
10891
10892/*
10893 * Called from within softdep_disk_write_complete above.  This routine
10894 * completes successfully written allocindirs.
10895 */
10896static void
10897handle_allocindir_partdone(aip)
10898	struct allocindir *aip;		/* the completed allocindir */
10899{
10900	struct indirdep *indirdep;
10901
10902	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
10903		return;
10904	indirdep = aip->ai_indirdep;
10905	LIST_REMOVE(aip, ai_next);
10906	/*
10907	 * Don't set a pointer while the buffer is undergoing IO or while
10908	 * we have active truncations.
10909	 */
10910	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
10911		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
10912		return;
10913	}
10914	if (indirdep->ir_state & UFS1FMT)
10915		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10916		    aip->ai_newblkno;
10917	else
10918		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10919		    aip->ai_newblkno;
10920	/*
10921	 * Await the pointer write before freeing the allocindir.
10922	 */
10923	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
10924}
10925
10926/*
10927 * Release segments held on a jwork list.
10928 */
10929static void
10930handle_jwork(wkhd)
10931	struct workhead *wkhd;
10932{
10933	struct worklist *wk;
10934
10935	while ((wk = LIST_FIRST(wkhd)) != NULL) {
10936		WORKLIST_REMOVE(wk);
10937		switch (wk->wk_type) {
10938		case D_JSEGDEP:
10939			free_jsegdep(WK_JSEGDEP(wk));
10940			continue;
10941		case D_FREEDEP:
10942			free_freedep(WK_FREEDEP(wk));
10943			continue;
10944		case D_FREEFRAG:
10945			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
10946			WORKITEM_FREE(wk, D_FREEFRAG);
10947			continue;
10948		case D_FREEWORK:
10949			handle_written_freework(WK_FREEWORK(wk));
10950			continue;
10951		default:
10952			panic("handle_jwork: Unknown type %s\n",
10953			    TYPENAME(wk->wk_type));
10954		}
10955	}
10956}
10957
10958/*
10959 * Handle the bufwait list on an inode when it is safe to release items
10960 * held there.  This normally happens after an inode block is written but
10961 * may be delayed and handled later if there are pending journal items that
10962 * are not yet safe to be released.
10963 */
10964static struct freefile *
10965handle_bufwait(inodedep, refhd)
10966	struct inodedep *inodedep;
10967	struct workhead *refhd;
10968{
10969	struct jaddref *jaddref;
10970	struct freefile *freefile;
10971	struct worklist *wk;
10972
10973	freefile = NULL;
10974	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
10975		WORKLIST_REMOVE(wk);
10976		switch (wk->wk_type) {
10977		case D_FREEFILE:
10978			/*
10979			 * We defer adding freefile to the worklist
10980			 * until all other additions have been made to
10981			 * ensure that it will be done after all the
10982			 * old blocks have been freed.
10983			 */
10984			if (freefile != NULL)
10985				panic("handle_bufwait: freefile");
10986			freefile = WK_FREEFILE(wk);
10987			continue;
10988
10989		case D_MKDIR:
10990			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
10991			continue;
10992
10993		case D_DIRADD:
10994			diradd_inode_written(WK_DIRADD(wk), inodedep);
10995			continue;
10996
10997		case D_FREEFRAG:
10998			wk->wk_state |= COMPLETE;
10999			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11000				add_to_worklist(wk, 0);
11001			continue;
11002
11003		case D_DIRREM:
11004			wk->wk_state |= COMPLETE;
11005			add_to_worklist(wk, 0);
11006			continue;
11007
11008		case D_ALLOCDIRECT:
11009		case D_ALLOCINDIR:
11010			free_newblk(WK_NEWBLK(wk));
11011			continue;
11012
11013		case D_JNEWBLK:
11014			wk->wk_state |= COMPLETE;
11015			free_jnewblk(WK_JNEWBLK(wk));
11016			continue;
11017
11018		/*
11019		 * Save freed journal segments and add references on
11020		 * the supplied list which will delay their release
11021		 * until the cg bitmap is cleared on disk.
11022		 */
11023		case D_JSEGDEP:
11024			if (refhd == NULL)
11025				free_jsegdep(WK_JSEGDEP(wk));
11026			else
11027				WORKLIST_INSERT(refhd, wk);
11028			continue;
11029
11030		case D_JADDREF:
11031			jaddref = WK_JADDREF(wk);
11032			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11033			    if_deps);
11034			/*
11035			 * Transfer any jaddrefs to the list to be freed with
11036			 * the bitmap if we're handling a removed file.
11037			 */
11038			if (refhd == NULL) {
11039				wk->wk_state |= COMPLETE;
11040				free_jaddref(jaddref);
11041			} else
11042				WORKLIST_INSERT(refhd, wk);
11043			continue;
11044
11045		default:
11046			panic("handle_bufwait: Unknown type %p(%s)",
11047			    wk, TYPENAME(wk->wk_type));
11048			/* NOTREACHED */
11049		}
11050	}
11051	return (freefile);
11052}

/*
11054 * Called from within softdep_disk_write_complete above to restore
11055 * in-memory inode block contents to their most up-to-date state. Note
11056 * that this routine is always called from interrupt level with further
11057 * splbio interrupts blocked.
11058 */
11059static int
11060handle_written_inodeblock(inodedep, bp)
11061	struct inodedep *inodedep;
11062	struct buf *bp;		/* buffer containing the inode block */
11063{
11064	struct freefile *freefile;
11065	struct allocdirect *adp, *nextadp;
11066	struct ufs1_dinode *dp1 = NULL;
11067	struct ufs2_dinode *dp2 = NULL;
11068	struct workhead wkhd;
11069	int hadchanges, fstype;
11070	ino_t freelink;
11071
11072	LIST_INIT(&wkhd);
11073	hadchanges = 0;
11074	freefile = NULL;
11075	if ((inodedep->id_state & IOSTARTED) == 0)
11076		panic("handle_written_inodeblock: not started");
11077	inodedep->id_state &= ~IOSTARTED;
11078	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11079		fstype = UFS1;
11080		dp1 = (struct ufs1_dinode *)bp->b_data +
11081		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11082		freelink = dp1->di_freelink;
11083	} else {
11084		fstype = UFS2;
11085		dp2 = (struct ufs2_dinode *)bp->b_data +
11086		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11087		freelink = dp2->di_freelink;
11088	}
11089	/*
11090	 * Leave this inodeblock dirty until it's in the list.
11091	 */
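	/*
	 * If the on-disk freelink now points at the next inode on the
	 * unlinked list (or is zero when this inode is the last entry),
	 * the unlinked-list pointers written with this block are stable,
	 * so note that with UNLINKNEXT/UNLINKPREV.
	 */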
11092	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
11093		struct inodedep *inon;
11094
11095		inon = TAILQ_NEXT(inodedep, id_unlinked);
11096		if ((inon == NULL && freelink == 0) ||
11097		    (inon && inon->id_ino == freelink)) {
11098			if (inon)
11099				inon->id_state |= UNLINKPREV;
11100			inodedep->id_state |= UNLINKNEXT;
11101		}
11102		hadchanges = 1;
11103	}
11104	/*
11105	 * If we had to rollback the inode allocation because of
11106	 * bitmaps being incomplete, then simply restore it.
11107	 * Keep the block dirty so that it will not be reclaimed until
11108	 * all associated dependencies have been cleared and the
11109	 * corresponding updates written to disk.
11110	 */
11111	if (inodedep->id_savedino1 != NULL) {
11112		hadchanges = 1;
11113		if (fstype == UFS1)
11114			*dp1 = *inodedep->id_savedino1;
11115		else
11116			*dp2 = *inodedep->id_savedino2;
11117		free(inodedep->id_savedino1, M_SAVEDINO);
11118		inodedep->id_savedino1 = NULL;
11119		if ((bp->b_flags & B_DELWRI) == 0)
11120			stat_inode_bitmap++;
11121		bdirty(bp);
11122		/*
11123		 * If the inode is clear here and GOINGAWAY it will never
11124		 * be written.  Process the bufwait and clear any pending
11125		 * work which may include the freefile.
11126		 */
11127		if (inodedep->id_state & GOINGAWAY)
11128			goto bufwait;
11129		return (1);
11130	}
11131	inodedep->id_state |= COMPLETE;
11132	/*
11133	 * Roll forward anything that had to be rolled back before
11134	 * the inode could be updated.
11135	 */
11136	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11137		nextadp = TAILQ_NEXT(adp, ad_next);
11138		if (adp->ad_state & ATTACHED)
11139			panic("handle_written_inodeblock: new entry");
11140		if (fstype == UFS1) {
11141			if (adp->ad_offset < NDADDR) {
11142				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
					panic("%s: %s #%jd mismatch %d != %jd",
					    "handle_written_inodeblock",
11145					    "direct pointer",
11146					    (intmax_t)adp->ad_offset,
11147					    dp1->di_db[adp->ad_offset],
11148					    (intmax_t)adp->ad_oldblkno);
11149				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11150			} else {
11151				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11152					panic("%s: %s #%jd allocated as %d",
11153					    "handle_written_inodeblock",
11154					    "indirect pointer",
11155					    (intmax_t)adp->ad_offset - NDADDR,
11156					    dp1->di_ib[adp->ad_offset - NDADDR]);
11157				dp1->di_ib[adp->ad_offset - NDADDR] =
11158				    adp->ad_newblkno;
11159			}
11160		} else {
11161			if (adp->ad_offset < NDADDR) {
11162				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11163					panic("%s: %s #%jd %s %jd != %jd",
11164					    "handle_written_inodeblock",
11165					    "direct pointer",
11166					    (intmax_t)adp->ad_offset, "mismatch",
11167					    (intmax_t)dp2->di_db[adp->ad_offset],
11168					    (intmax_t)adp->ad_oldblkno);
11169				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11170			} else {
11171				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11172					panic("%s: %s #%jd allocated as %jd",
11173					    "handle_written_inodeblock",
11174					    "indirect pointer",
11175					    (intmax_t)adp->ad_offset - NDADDR,
11176					    (intmax_t)
11177					    dp2->di_ib[adp->ad_offset - NDADDR]);
11178				dp2->di_ib[adp->ad_offset - NDADDR] =
11179				    adp->ad_newblkno;
11180			}
11181		}
11182		adp->ad_state &= ~UNDONE;
11183		adp->ad_state |= ATTACHED;
11184		hadchanges = 1;
11185	}
11186	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11187		nextadp = TAILQ_NEXT(adp, ad_next);
11188		if (adp->ad_state & ATTACHED)
11189			panic("handle_written_inodeblock: new entry");
11190		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11191			panic("%s: direct pointers #%jd %s %jd != %jd",
11192			    "handle_written_inodeblock",
11193			    (intmax_t)adp->ad_offset, "mismatch",
11194			    (intmax_t)dp2->di_extb[adp->ad_offset],
11195			    (intmax_t)adp->ad_oldblkno);
11196		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11197		adp->ad_state &= ~UNDONE;
11198		adp->ad_state |= ATTACHED;
11199		hadchanges = 1;
11200	}
11201	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11202		stat_direct_blk_ptrs++;
11203	/*
11204	 * Reset the file size to its most up-to-date value.
11205	 */
11206	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11207		panic("handle_written_inodeblock: bad size");
11208	if (inodedep->id_savednlink > LINK_MAX)
11209		panic("handle_written_inodeblock: Invalid link count "
11210		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11211	if (fstype == UFS1) {
11212		if (dp1->di_nlink != inodedep->id_savednlink) {
11213			dp1->di_nlink = inodedep->id_savednlink;
11214			hadchanges = 1;
11215		}
11216		if (dp1->di_size != inodedep->id_savedsize) {
11217			dp1->di_size = inodedep->id_savedsize;
11218			hadchanges = 1;
11219		}
11220	} else {
11221		if (dp2->di_nlink != inodedep->id_savednlink) {
11222			dp2->di_nlink = inodedep->id_savednlink;
11223			hadchanges = 1;
11224		}
11225		if (dp2->di_size != inodedep->id_savedsize) {
11226			dp2->di_size = inodedep->id_savedsize;
11227			hadchanges = 1;
11228		}
11229		if (dp2->di_extsize != inodedep->id_savedextsize) {
11230			dp2->di_extsize = inodedep->id_savedextsize;
11231			hadchanges = 1;
11232		}
11233	}
11234	inodedep->id_savedsize = -1;
11235	inodedep->id_savedextsize = -1;
11236	inodedep->id_savednlink = -1;
11237	/*
11238	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
11240	 * its correct form.
11241	 */
11242	if (hadchanges)
11243		bdirty(bp);
11244bufwait:
11245	/*
11246	 * Process any allocdirects that completed during the update.
11247	 */
11248	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11249		handle_allocdirect_partdone(adp, &wkhd);
11250	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11251		handle_allocdirect_partdone(adp, &wkhd);
11252	/*
11253	 * Process deallocations that were held pending until the
11254	 * inode had been written to disk. Freeing of the inode
11255	 * is delayed until after all blocks have been freed to
11256	 * avoid creation of new <vfsid, inum, lbn> triples
11257	 * before the old ones have been deleted.  Completely
11258	 * unlinked inodes are not processed until the unlinked
11259	 * inode list is written or the last reference is removed.
11260	 */
11261	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11262		freefile = handle_bufwait(inodedep, NULL);
11263		if (freefile && !LIST_EMPTY(&wkhd)) {
11264			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11265			freefile = NULL;
11266		}
11267	}
11268	/*
11269	 * Move rolled forward dependency completions to the bufwait list
11270	 * now that those that were already written have been processed.
11271	 */
11272	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11273		panic("handle_written_inodeblock: bufwait but no changes");
11274	jwork_move(&inodedep->id_bufwait, &wkhd);
11275
11276	if (freefile != NULL) {
11277		/*
11278		 * If the inode is goingaway it was never written.  Fake up
11279		 * the state here so free_inodedep() can succeed.
11280		 */
11281		if (inodedep->id_state & GOINGAWAY)
11282			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11283		if (free_inodedep(inodedep) == 0)
11284			panic("handle_written_inodeblock: live inodedep %p",
11285			    inodedep);
11286		add_to_worklist(&freefile->fx_list, 0);
11287		return (0);
11288	}
11289
11290	/*
11291	 * If no outstanding dependencies, free it.
11292	 */
11293	if (free_inodedep(inodedep) ||
11294	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11295	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11296	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11297	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11298		return (0);
11299	return (hadchanges);
11300}
11301
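/*
 * Called from softdep_disk_write_complete when the write of an indirect
 * block finishes.  Rolled-back pointers are restored, completed
 * allocindirs are advanced, and the return value tells the caller
 * whether the buffer must be redirtied and the dependency reattached.
 */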
11302static int
11303handle_written_indirdep(indirdep, bp, bpp)
11304	struct indirdep *indirdep;
11305	struct buf *bp;
11306	struct buf **bpp;
11307{
11308	struct allocindir *aip;
11309	struct buf *sbp;
11310	int chgs;
11311
11312	if (indirdep->ir_state & GOINGAWAY)
11313		panic("handle_written_indirdep: indirdep gone");
11314	if ((indirdep->ir_state & IOSTARTED) == 0)
11315		panic("handle_written_indirdep: IO not started");
11316	chgs = 0;
11317	/*
11318	 * If there were rollbacks revert them here.
11319	 */
11320	if (indirdep->ir_saveddata) {
11321		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11322		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11323			free(indirdep->ir_saveddata, M_INDIRDEP);
11324			indirdep->ir_saveddata = NULL;
11325		}
11326		chgs = 1;
11327	}
11328	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11329	indirdep->ir_state |= ATTACHED;
11330	/*
11331	 * Move allocindirs with written pointers to the completehd if
11332	 * the indirdep's pointer is not yet written.  Otherwise
11333	 * free them here.
11334	 */
11335	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11336		LIST_REMOVE(aip, ai_next);
11337		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11338			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11339			    ai_next);
11340			newblk_freefrag(&aip->ai_block);
11341			continue;
11342		}
11343		free_newblk(&aip->ai_block);
11344	}
11345	/*
11346	 * Move allocindirs that have finished dependency processing from
11347	 * the done list to the write list after updating the pointers.
11348	 */
11349	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11350		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11351			handle_allocindir_partdone(aip);
11352			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11353				panic("disk_write_complete: not gone");
11354			chgs = 1;
11355		}
11356	}
11357	/*
11358	 * Preserve the indirdep if there were any changes or if it is not
11359	 * yet valid on disk.
11360	 */
11361	if (chgs) {
11362		stat_indir_blk_ptrs++;
11363		bdirty(bp);
11364		return (1);
11365	}
11366	/*
11367	 * If there were no changes we can discard the savedbp and detach
11368	 * ourselves from the buf.  We are only carrying completed pointers
11369	 * in this case.
11370	 */
11371	sbp = indirdep->ir_savebp;
11372	sbp->b_flags |= B_INVAL | B_NOCACHE;
11373	indirdep->ir_savebp = NULL;
11374	indirdep->ir_bp = NULL;
11375	if (*bpp != NULL)
11376		panic("handle_written_indirdep: bp already exists.");
11377	*bpp = sbp;
11378	/*
11379	 * The indirdep may not be freed until its parent points at it.
11380	 */
11381	if (indirdep->ir_state & DEPCOMPLETE)
11382		free_indirdep(indirdep);
11383
11384	return (0);
11385}
11386
11387/*
11388 * Process a diradd entry after its dependent inode has been written.
11389 * This routine must be called with splbio interrupts blocked.
11390 */
11391static void
11392diradd_inode_written(dap, inodedep)
11393	struct diradd *dap;
11394	struct inodedep *inodedep;
11395{
11396
11397	dap->da_state |= COMPLETE;
11398	complete_diradd(dap);
11399	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11400}
11401
11402/*
11403 * Returns true if the bmsafemap will have rollbacks when written.  Must
11404 * only be called with lk and the buf lock on the cg held.
11405 */
11406static int
11407bmsafemap_backgroundwrite(bmsafemap, bp)
11408	struct bmsafemap *bmsafemap;
11409	struct buf *bp;
11410{
11411	int dirty;
11412
11413	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11414	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11415	/*
11416	 * If we're initiating a background write we need to process the
11417	 * rollbacks as they exist now, not as they exist when IO starts.
11418	 * No other consumers will look at the contents of the shadowed
11419	 * buf so this is safe to do here.
11420	 */
11421	if (bp->b_xflags & BX_BKGRDMARKER)
11422		initiate_write_bmsafemap(bmsafemap, bp);
11423
11424	return (dirty);
11425}
11426
11427/*
11428 * Re-apply an allocation when a cg write is complete.
11429 */
11430static int
11431jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11432	struct jnewblk *jnewblk;
11433	struct fs *fs;
11434	struct cg *cgp;
11435	uint8_t *blksfree;
11436{
11437	ufs1_daddr_t fragno;
11438	ufs2_daddr_t blkno;
11439	long cgbno, bbase;
11440	int frags, blk;
11441	int i;
11442
11443	frags = 0;
11444	cgbno = dtogd(fs, jnewblk->jn_blkno);
11445	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11446		if (isclr(blksfree, cgbno + i))
11447			panic("jnewblk_rollforward: re-allocated fragment");
11448		frags++;
11449	}
11450	if (frags == fs->fs_frag) {
11451		blkno = fragstoblks(fs, cgbno);
11452		ffs_clrblock(fs, blksfree, (long)blkno);
11453		ffs_clusteracct(fs, cgp, blkno, -1);
11454		cgp->cg_cs.cs_nbfree--;
11455	} else {
11456		bbase = cgbno - fragnum(fs, cgbno);
11457		cgbno += jnewblk->jn_oldfrags;
		/* If a complete block had been reassembled, account for it. */
11459		fragno = fragstoblks(fs, bbase);
11460		if (ffs_isblock(fs, blksfree, fragno)) {
11461			cgp->cg_cs.cs_nffree += fs->fs_frag;
11462			ffs_clusteracct(fs, cgp, fragno, -1);
11463			cgp->cg_cs.cs_nbfree--;
11464		}
11465		/* Decrement the old frags.  */
11466		blk = blkmap(fs, blksfree, bbase);
11467		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11468		/* Allocate the fragment */
11469		for (i = 0; i < frags; i++)
11470			clrbit(blksfree, cgbno + i);
11471		cgp->cg_cs.cs_nffree -= frags;
11472		/* Add back in counts associated with the new frags */
11473		blk = blkmap(fs, blksfree, bbase);
11474		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11475	}
11476	return (frags);
11477}
11478
11479/*
11480 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11481 * changes if it's not a background write.  Set all written dependencies
11482 * to DEPCOMPLETE and free the structure if possible.
11483 */
11484static int
11485handle_written_bmsafemap(bmsafemap, bp)
11486	struct bmsafemap *bmsafemap;
11487	struct buf *bp;
11488{
11489	struct newblk *newblk;
11490	struct inodedep *inodedep;
11491	struct jaddref *jaddref, *jatmp;
11492	struct jnewblk *jnewblk, *jntmp;
11493	struct ufsmount *ump;
11494	uint8_t *inosused;
11495	uint8_t *blksfree;
11496	struct cg *cgp;
11497	struct fs *fs;
11498	ino_t ino;
11499	int chgs;
11500
11501	if ((bmsafemap->sm_state & IOSTARTED) == 0)
		panic("handle_written_bmsafemap: Not started\n");
11503	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11504	chgs = 0;
11505	bmsafemap->sm_state &= ~IOSTARTED;
11506	/*
11507	 * Release journal work that was waiting on the write.
11508	 */
11509	handle_jwork(&bmsafemap->sm_freewr);
11510
11511	/*
11512	 * Restore unwritten inode allocation pending jaddref writes.
11513	 */
11514	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11515		cgp = (struct cg *)bp->b_data;
11516		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11517		inosused = cg_inosused(cgp);
11518		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11519		    ja_bmdeps, jatmp) {
11520			if ((jaddref->ja_state & UNDONE) == 0)
11521				continue;
11522			ino = jaddref->ja_ino % fs->fs_ipg;
11523			if (isset(inosused, ino))
11524				panic("handle_written_bmsafemap: "
11525				    "re-allocated inode");
11526			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
11527				if ((jaddref->ja_mode & IFMT) == IFDIR)
11528					cgp->cg_cs.cs_ndir++;
11529				cgp->cg_cs.cs_nifree--;
11530				setbit(inosused, ino);
11531				chgs = 1;
11532			}
11533			jaddref->ja_state &= ~UNDONE;
11534			jaddref->ja_state |= ATTACHED;
11535			free_jaddref(jaddref);
11536		}
11537	}
11538	/*
11539	 * Restore any block allocations which are pending journal writes.
11540	 */
11541	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11542		cgp = (struct cg *)bp->b_data;
11543		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11544		blksfree = cg_blksfree(cgp);
11545		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11546		    jntmp) {
11547			if ((jnewblk->jn_state & UNDONE) == 0)
11548				continue;
11549			if ((bp->b_xflags & BX_BKGRDMARKER) == 0 &&
11550			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11551				chgs = 1;
11552			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11553			jnewblk->jn_state |= ATTACHED;
11554			free_jnewblk(jnewblk);
11555		}
11556	}
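	/*
	 * The bitmap covering these allocations is now on disk, so every
	 * newblk and inodedep it tracked has its bitmap dependency
	 * satisfied.
	 */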
11557	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11558		newblk->nb_state |= DEPCOMPLETE;
11559		newblk->nb_state &= ~ONDEPLIST;
11560		newblk->nb_bmsafemap = NULL;
11561		LIST_REMOVE(newblk, nb_deps);
11562		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11563			handle_allocdirect_partdone(
11564			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11565		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11566			handle_allocindir_partdone(
11567			    WK_ALLOCINDIR(&newblk->nb_list));
11568		else if (newblk->nb_list.wk_type != D_NEWBLK)
11569			panic("handle_written_bmsafemap: Unexpected type: %s",
11570			    TYPENAME(newblk->nb_list.wk_type));
11571	}
11572	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11573		inodedep->id_state |= DEPCOMPLETE;
11574		inodedep->id_state &= ~ONDEPLIST;
11575		LIST_REMOVE(inodedep, id_deps);
11576		inodedep->id_bmsafemap = NULL;
11577	}
11578	LIST_REMOVE(bmsafemap, sm_next);
11579	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11580	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11581	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11582	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11583	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11584		LIST_REMOVE(bmsafemap, sm_hash);
11585		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11586		return (0);
11587	}
11588	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11589	bdirty(bp);
11590	return (1);
11591}
11592
11593/*
11594 * Try to free a mkdir dependency.
11595 */
11596static void
11597complete_mkdir(mkdir)
11598	struct mkdir *mkdir;
11599{
11600	struct diradd *dap;
11601
11602	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11603		return;
11604	LIST_REMOVE(mkdir, md_mkdirs);
11605	dap = mkdir->md_diradd;
11606	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11607	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11608		dap->da_state |= DEPCOMPLETE;
11609		complete_diradd(dap);
11610	}
11611	WORKITEM_FREE(mkdir, D_MKDIR);
11612}
11613
11614/*
11615 * Handle the completion of a mkdir dependency.
11616 */
11617static void
11618handle_written_mkdir(mkdir, type)
11619	struct mkdir *mkdir;
11620	int type;
11621{
11622
11623	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11624		panic("handle_written_mkdir: bad type");
11625	mkdir->md_state |= COMPLETE;
11626	complete_mkdir(mkdir);
11627}
11628
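/*
 * Attempt to free a pagedep that no longer tracks any dependencies.
 * Returns non-zero if the pagedep was freed.
 */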
11629static int
11630free_pagedep(pagedep)
11631	struct pagedep *pagedep;
11632{
11633	int i;
11634
11635	if (pagedep->pd_state & NEWBLOCK)
11636		return (0);
11637	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11638		return (0);
11639	for (i = 0; i < DAHASHSZ; i++)
11640		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11641			return (0);
11642	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11643		return (0);
11644	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11645		return (0);
11646	if (pagedep->pd_state & ONWORKLIST)
11647		WORKLIST_REMOVE(&pagedep->pd_list);
11648	LIST_REMOVE(pagedep, pd_hash);
11649	WORKITEM_FREE(pagedep, D_PAGEDEP);
11650
11651	return (1);
11652}
11653
11654/*
11655 * Called from within softdep_disk_write_complete above.
11656 * A write operation was just completed. Removed inodes can
11657 * now be freed and associated block pointers may be committed.
11658 * Note that this routine is always called from interrupt level
11659 * with further splbio interrupts blocked.
11660 */
11661static int
11662handle_written_filepage(pagedep, bp)
11663	struct pagedep *pagedep;
11664	struct buf *bp;		/* buffer containing the written page */
11665{
11666	struct dirrem *dirrem;
11667	struct diradd *dap, *nextdap;
11668	struct direct *ep;
11669	int i, chgs;
11670
11671	if ((pagedep->pd_state & IOSTARTED) == 0)
11672		panic("handle_written_filepage: not started");
11673	pagedep->pd_state &= ~IOSTARTED;
11674	/*
11675	 * Process any directory removals that have been committed.
11676	 */
11677	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11678		LIST_REMOVE(dirrem, dm_next);
11679		dirrem->dm_state |= COMPLETE;
11680		dirrem->dm_dirinum = pagedep->pd_ino;
11681		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11682		    ("handle_written_filepage: Journal entries not written."));
11683		add_to_worklist(&dirrem->dm_list, 0);
11684	}
11685	/*
11686	 * Free any directory additions that have been committed.
11687	 * If it is a newly allocated block, we have to wait until
11688	 * the on-disk directory inode claims the new block.
11689	 */
11690	if ((pagedep->pd_state & NEWBLOCK) == 0)
11691		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11692			free_diradd(dap, NULL);
11693	/*
11694	 * Uncommitted directory entries must be restored.
11695	 */
11696	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11697		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11698		     dap = nextdap) {
11699			nextdap = LIST_NEXT(dap, da_pdlist);
11700			if (dap->da_state & ATTACHED)
11701				panic("handle_written_filepage: attached");
11702			ep = (struct direct *)
11703			    ((char *)bp->b_data + dap->da_offset);
11704			ep->d_ino = dap->da_newinum;
11705			dap->da_state &= ~UNDONE;
11706			dap->da_state |= ATTACHED;
11707			chgs = 1;
11708			/*
11709			 * If the inode referenced by the directory has
11710			 * been written out, then the dependency can be
11711			 * moved to the pending list.
11712			 */
11713			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11714				LIST_REMOVE(dap, da_pdlist);
11715				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11716				    da_pdlist);
11717			}
11718		}
11719	}
11720	/*
11721	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
11723	 * its correct form.
11724	 */
11725	if (chgs) {
11726		if ((bp->b_flags & B_DELWRI) == 0)
11727			stat_dir_entry++;
11728		bdirty(bp);
11729		return (1);
11730	}
11731	/*
11732	 * If we are not waiting for a new directory block to be
11733	 * claimed by its inode, then the pagedep will be freed.
11734	 * Otherwise it will remain to track any new entries on
11735	 * the page in case they are fsync'ed.
11736	 */
11737	free_pagedep(pagedep);
11738	return (0);
11739}
11740
11741/*
11742 * Writing back in-core inode structures.
11743 *
11744 * The filesystem only accesses an inode's contents when it occupies an
11745 * "in-core" inode structure.  These "in-core" structures are separate from
11746 * the page frames used to cache inode blocks.  Only the latter are
11747 * transferred to/from the disk.  So, when the updated contents of the
11748 * "in-core" inode structure are copied to the corresponding in-memory inode
11749 * block, the dependencies are also transferred.  The following procedure is
11750 * called when copying a dirty "in-core" inode to a cached inode block.
11751 */
11752
11753/*
11754 * Called when an inode is loaded from disk. If the effective link count
11755 * differed from the actual link count when it was last flushed, then we
11756 * need to ensure that the correct effective link count is put back.
11757 */
11758void
11759softdep_load_inodeblock(ip)
11760	struct inode *ip;	/* the "in_core" copy of the inode */
11761{
11762	struct inodedep *inodedep;
11763
11764	/*
11765	 * Check for alternate nlink count.
11766	 */
11767	ip->i_effnlink = ip->i_nlink;
11768	ACQUIRE_LOCK(&lk);
11769	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
11770	    &inodedep) == 0) {
11771		FREE_LOCK(&lk);
11772		return;
11773	}
11774	ip->i_effnlink -= inodedep->id_nlinkdelta;
11775	FREE_LOCK(&lk);
11776}
11777
11778/*
11779 * This routine is called just before the "in-core" inode
11780 * information is to be copied to the in-memory inode block.
11781 * Recall that an inode block contains several inodes. If
11782 * the force flag is set, then the dependencies will be
11783 * cleared so that the update can always be made. Note that
11784 * the buffer is locked when this routine is called, so we
11785 * will never be in the middle of writing the inode block
11786 * to disk.
11787 */
11788void
11789softdep_update_inodeblock(ip, bp, waitfor)
11790	struct inode *ip;	/* the "in_core" copy of the inode */
11791	struct buf *bp;		/* the buffer containing the inode block */
11792	int waitfor;		/* nonzero => update must be allowed */
11793{
11794	struct inodedep *inodedep;
11795	struct inoref *inoref;
11796	struct worklist *wk;
11797	struct mount *mp;
11798	struct buf *ibp;
11799	struct fs *fs;
11800	int error;
11801
11802	mp = UFSTOVFS(ip->i_ump);
11803	fs = ip->i_fs;
11804	/*
11805	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
11806	 * does not have access to the in-core ip so must write directly into
11807	 * the inode block buffer when setting freelink.
11808	 */
11809	if (fs->fs_magic == FS_UFS1_MAGIC)
11810		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
11811		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11812	else
11813		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
11814		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11815	/*
11816	 * If the effective link count is not equal to the actual link
11817	 * count, then we must track the difference in an inodedep while
11818	 * the inode is (potentially) tossed out of the cache. Otherwise,
11819	 * if there is no existing inodedep, then there are no dependencies
11820	 * to track.
11821	 */
11822	ACQUIRE_LOCK(&lk);
11823again:
11824	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11825		FREE_LOCK(&lk);
11826		if (ip->i_effnlink != ip->i_nlink)
11827			panic("softdep_update_inodeblock: bad link count");
11828		return;
11829	}
11830	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
11831		panic("softdep_update_inodeblock: bad delta");
11832	/*
11833	 * If we're flushing all dependencies we must also move any waiting
11834	 * for journal writes onto the bufwait list prior to I/O.
11835	 */
11836	if (waitfor) {
11837		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11838			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11839			    == DEPCOMPLETE) {
11840				jwait(&inoref->if_list, MNT_WAIT);
11841				goto again;
11842			}
11843		}
11844	}
11845	/*
11846	 * Changes have been initiated. Anything depending on these
11847	 * changes cannot occur until this inode has been written.
11848	 */
11849	inodedep->id_state &= ~COMPLETE;
11850	if ((inodedep->id_state & ONWORKLIST) == 0)
11851		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
11852	/*
11853	 * Any new dependencies associated with the incore inode must
11854	 * now be moved to the list associated with the buffer holding
11855	 * the in-memory copy of the inode. Once merged process any
11856	 * allocdirects that are completed by the merger.
11857	 */
11858	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
11859	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
11860		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
11861		    NULL);
11862	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
11863	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
11864		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
11865		    NULL);
11866	/*
11867	 * Now that the inode has been pushed into the buffer, the
11868	 * operations dependent on the inode being written to disk
11869	 * can be moved to the id_bufwait so that they will be
11870	 * processed when the buffer I/O completes.
11871	 */
11872	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
11873		WORKLIST_REMOVE(wk);
11874		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
11875	}
11876	/*
11877	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them has been written (indicated by
11879	 * DEPCOMPLETE being set in id_state). If we are doing a
11880	 * forced sync (e.g., an fsync on a file), we force the bitmap
11881	 * to be written so that the update can be done.
11882	 */
11883	if (waitfor == 0) {
11884		FREE_LOCK(&lk);
11885		return;
11886	}
11887retry:
11888	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
11889		FREE_LOCK(&lk);
11890		return;
11891	}
11892	ibp = inodedep->id_bmsafemap->sm_buf;
11893	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
11894	if (ibp == NULL) {
11895		/*
11896		 * If ibp came back as NULL, the dependency could have been
11897		 * freed while we slept.  Look it up again, and check to see
11898		 * that it has completed.
11899		 */
11900		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
11901			goto retry;
11902		FREE_LOCK(&lk);
11903		return;
11904	}
11905	FREE_LOCK(&lk);
11906	if ((error = bwrite(ibp)) != 0)
11907		softdep_error("softdep_update_inodeblock: bwrite", error);
11908}
11909
11910/*
 * Merge a new inode dependency list (such as id_newinoupdt) into an
11912 * old inode dependency list (such as id_inoupdt). This routine must be
11913 * called with splbio interrupts blocked.
11914 */
11915static void
11916merge_inode_lists(newlisthead, oldlisthead)
11917	struct allocdirectlst *newlisthead;
11918	struct allocdirectlst *oldlisthead;
11919{
11920	struct allocdirect *listadp, *newadp;
11921
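	/*
	 * Both lists are kept sorted by ad_offset.  Walk the old list and
	 * splice each new entry in front of its successor, merging entries
	 * that land on the same offset; anything left on the new list is
	 * appended at the tail.
	 */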
11922	newadp = TAILQ_FIRST(newlisthead);
11923	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
11924		if (listadp->ad_offset < newadp->ad_offset) {
11925			listadp = TAILQ_NEXT(listadp, ad_next);
11926			continue;
11927		}
11928		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11929		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
11930		if (listadp->ad_offset == newadp->ad_offset) {
11931			allocdirect_merge(oldlisthead, newadp,
11932			    listadp);
11933			listadp = newadp;
11934		}
11935		newadp = TAILQ_FIRST(newlisthead);
11936	}
11937	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
11938		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11939		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
11940	}
11941}
11942
11943/*
11944 * If we are doing an fsync, then we must ensure that any directory
11945 * entries for the inode have been written after the inode gets to disk.
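 *
 * The loop below walks the inode's id_pendinghd list of diradd
 * dependencies: for each one it locks the parent directory, clears any
 * MKDIR_PARENT/NEWBLOCK dependency with ffs_update() (or a full
 * ffs_syncvnode() when the entry sits in an indirect block), and then
 * writes the directory page containing the new name.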
11946 */
11947int
11948softdep_fsync(vp)
11949	struct vnode *vp;	/* the "in_core" copy of the inode */
11950{
11951	struct inodedep *inodedep;
11952	struct pagedep *pagedep;
11953	struct inoref *inoref;
11954	struct worklist *wk;
11955	struct diradd *dap;
11956	struct mount *mp;
11957	struct vnode *pvp;
11958	struct inode *ip;
11959	struct buf *bp;
11960	struct fs *fs;
11961	struct thread *td = curthread;
11962	int error, flushparent, pagedep_new_block;
11963	ino_t parentino;
11964	ufs_lbn_t lbn;
11965
11966	ip = VTOI(vp);
11967	fs = ip->i_fs;
11968	mp = vp->v_mount;
11969	ACQUIRE_LOCK(&lk);
11970restart:
11971	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11972		FREE_LOCK(&lk);
11973		return (0);
11974	}
11975	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11976		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11977		    == DEPCOMPLETE) {
11978			jwait(&inoref->if_list, MNT_WAIT);
11979			goto restart;
11980		}
11981	}
11982	if (!LIST_EMPTY(&inodedep->id_inowait) ||
11983	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
11984	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
11985	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
11986	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
11987		panic("softdep_fsync: pending ops %p", inodedep);
11988	for (error = 0, flushparent = 0; ; ) {
11989		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
11990			break;
11991		if (wk->wk_type != D_DIRADD)
11992			panic("softdep_fsync: Unexpected type %s",
11993			    TYPENAME(wk->wk_type));
11994		dap = WK_DIRADD(wk);
11995		/*
11996		 * Flush our parent if this directory entry has a MKDIR_PARENT
11997		 * dependency or is contained in a newly allocated block.
11998		 */
11999		if (dap->da_state & DIRCHG)
12000			pagedep = dap->da_previous->dm_pagedep;
12001		else
12002			pagedep = dap->da_pagedep;
12003		parentino = pagedep->pd_ino;
12004		lbn = pagedep->pd_lbn;
12005		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12006			panic("softdep_fsync: dirty");
12007		if ((dap->da_state & MKDIR_PARENT) ||
12008		    (pagedep->pd_state & NEWBLOCK))
12009			flushparent = 1;
12010		else
12011			flushparent = 0;
12012		/*
12013		 * If we are being fsync'ed as part of vgone'ing this vnode,
12014		 * then we will not be able to release and recover the
12015		 * vnode below, so we just have to give up on writing its
12016		 * directory entry out. It will eventually be written, just
12017		 * not now, but then the user was not asking to have it
12018		 * written, so we are not breaking any promises.
12019		 */
12020		if (vp->v_iflag & VI_DOOMED)
12021			break;
12022		/*
12023		 * We prevent deadlock by always fetching inodes from the
12024		 * root, moving down the directory tree. Thus, when fetching
12025		 * our parent directory, we first try to get the lock. If
12026		 * that fails, we must unlock ourselves before requesting
12027		 * the lock on our parent. See the comment in ufs_lookup
12028		 * for details on possible races.
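		 *
		 * Concretely: try a non-blocking lock on the parent; if that
		 * fails, busy the mount so it cannot be unmounted, unlock
		 * this vnode, take the parent's lock, then relock this vnode
		 * and recheck VI_DOOMED before continuing.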
12029		 */
12030		FREE_LOCK(&lk);
12031		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12032		    FFSV_FORCEINSMQ)) {
12033			error = vfs_busy(mp, MBF_NOWAIT);
12034			if (error != 0) {
12035				vfs_ref(mp);
12036				VOP_UNLOCK(vp, 0);
12037				error = vfs_busy(mp, 0);
12038				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12039				vfs_rel(mp);
12040				if (error != 0)
12041					return (ENOENT);
12042				if (vp->v_iflag & VI_DOOMED) {
12043					vfs_unbusy(mp);
12044					return (ENOENT);
12045				}
12046			}
12047			VOP_UNLOCK(vp, 0);
12048			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12049			    &pvp, FFSV_FORCEINSMQ);
12050			vfs_unbusy(mp);
12051			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12052			if (vp->v_iflag & VI_DOOMED) {
12053				if (error == 0)
12054					vput(pvp);
12055				error = ENOENT;
12056			}
12057			if (error != 0)
12058				return (error);
12059		}
12060		/*
12061		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12062		 * that are contained in direct blocks will be resolved by
12063		 * doing a ffs_update. Pagedeps contained in indirect blocks
12064		 * may require a complete sync'ing of the directory. So, we
12065		 * try the cheap and fast ffs_update first, and if that fails,
12066		 * then we do the slower ffs_syncvnode of the directory.
12067		 */
12068		if (flushparent) {
12069			int locked;
12070
12071			if ((error = ffs_update(pvp, 1)) != 0) {
12072				vput(pvp);
12073				return (error);
12074			}
12075			ACQUIRE_LOCK(&lk);
12076			locked = 1;
12077			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12078				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12079					if (wk->wk_type != D_DIRADD)
12080						panic("softdep_fsync: Unexpected type %s",
12081						      TYPENAME(wk->wk_type));
12082					dap = WK_DIRADD(wk);
12083					if (dap->da_state & DIRCHG)
12084						pagedep = dap->da_previous->dm_pagedep;
12085					else
12086						pagedep = dap->da_pagedep;
12087					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12088					FREE_LOCK(&lk);
12089					locked = 0;
12090					if (pagedep_new_block && (error =
12091					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12092						vput(pvp);
12093						return (error);
12094					}
12095				}
12096			}
12097			if (locked)
12098				FREE_LOCK(&lk);
12099		}
12100		/*
12101		 * Flush directory page containing the inode's name.
12102		 */
12103		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12104		    &bp);
12105		if (error == 0)
12106			error = bwrite(bp);
12107		else
12108			brelse(bp);
12109		vput(pvp);
12110		if (error != 0)
12111			return (error);
12112		ACQUIRE_LOCK(&lk);
12113		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12114			break;
12115	}
12116	FREE_LOCK(&lk);
12117	return (0);
12118}
12119
12120/*
12121 * Flush all the dirty bitmaps associated with the block device
12122 * before flushing the rest of the dirty blocks so as to reduce
12123 * the number of dependencies that will have to be rolled back.
12124 *
12125 * XXX Unused?
12126 */
12127void
12128softdep_fsync_mountdev(vp)
12129	struct vnode *vp;
12130{
12131	struct buf *bp, *nbp;
12132	struct worklist *wk;
12133	struct bufobj *bo;
12134
12135	if (!vn_isdisk(vp, NULL))
12136		panic("softdep_fsync_mountdev: vnode not a disk");
12137	bo = &vp->v_bufobj;
12138restart:
12139	BO_LOCK(bo);
12140	ACQUIRE_LOCK(&lk);
12141	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12142		/*
12143		 * If it is already scheduled, skip to the next buffer.
12144		 */
12145		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12146			continue;
12147
12148		if ((bp->b_flags & B_DELWRI) == 0)
12149			panic("softdep_fsync_mountdev: not dirty");
12150		/*
12151		 * We are only interested in bitmaps with outstanding
12152		 * dependencies.
12153		 */
12154		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12155		    wk->wk_type != D_BMSAFEMAP ||
12156		    (bp->b_vflags & BV_BKGRDINPROG)) {
12157			BUF_UNLOCK(bp);
12158			continue;
12159		}
12160		FREE_LOCK(&lk);
12161		BO_UNLOCK(bo);
12162		bremfree(bp);
12163		(void) bawrite(bp);
12164		goto restart;
12165	}
12166	FREE_LOCK(&lk);
12167	drain_output(vp);
12168	BO_UNLOCK(bo);
12169}
12170
12171/*
12172 * Sync all cylinder groups that were dirty at the time this function is
12173 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12174 * is used to flush freedep activity that may be holding up writes to an
12175 * indirect block.
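 *
 * A sentinel bmsafemap (sm_cg == -1) is kept on the softdep_dirtycg list
 * and advanced past each entry as it is processed, so cylinder groups
 * dirtied after the call started are not visited.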
12176 */
12177static int
12178sync_cgs(mp, waitfor)
12179	struct mount *mp;
12180	int waitfor;
12181{
12182	struct bmsafemap *bmsafemap;
12183	struct bmsafemap *sentinel;
12184	struct ufsmount *ump;
12185	struct buf *bp;
12186	int error;
12187
12188	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12189	sentinel->sm_cg = -1;
12190	ump = VFSTOUFS(mp);
12191	error = 0;
12192	ACQUIRE_LOCK(&lk);
12193	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12194	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12195	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12196		/* Skip sentinels and cgs with no work to release. */
12197		if (bmsafemap->sm_cg == -1 ||
12198		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12199		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12200			LIST_REMOVE(sentinel, sm_next);
12201			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12202			continue;
12203		}
12204		/*
12205		 * If we don't get the lock and we're waiting, try again.  If
12206		 * not, move on to the next buf and try to sync it.
12207		 */
12208		bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor);
12209		if (bp == NULL && waitfor == MNT_WAIT)
12210			continue;
12211		LIST_REMOVE(sentinel, sm_next);
12212		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12213		if (bp == NULL)
12214			continue;
12215		FREE_LOCK(&lk);
12216		if (waitfor == MNT_NOWAIT)
12217			bawrite(bp);
12218		else
12219			error = bwrite(bp);
12220		ACQUIRE_LOCK(&lk);
12221		if (error)
12222			break;
12223	}
12224	LIST_REMOVE(sentinel, sm_next);
12225	FREE_LOCK(&lk);
12226	free(sentinel, M_BMSAFEMAP);
12227	return (error);
12228}
12229
12230/*
12231 * This routine is called when we are trying to synchronously flush a
12232 * file. This routine must eliminate any filesystem metadata dependencies
12233 * so that the syncing routine can succeed.
12234 */
12235int
12236softdep_sync_metadata(struct vnode *vp)
12237{
12238	int error;
12239
12240	/*
12241	 * Ensure that any direct block dependencies have been cleared,
12242	 * truncations are started, and inode references are journaled.
12243	 */
12244	ACQUIRE_LOCK(&lk);
12245	/*
12246	 * Write all journal records to prevent rollbacks on devvp.
12247	 */
12248	if (vp->v_type == VCHR)
12249		softdep_flushjournal(vp->v_mount);
12250	error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number);
12251	/*
12252	 * Ensure that all truncates are written so we won't find deps on
12253	 * indirect blocks.
12254	 */
12255	process_truncates(vp);
12256	FREE_LOCK(&lk);
12257
12258	return (error);
12259}
12260
12261/*
12262 * This routine is called when we are attempting to sync a buf with
12263 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12264 * other IO it can but returns EBUSY if the buffer is not yet able to
12265 * be written.  If none of the dependencies would cause a rollback, the
12266 * routine always returns 0.
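 *
 * The dependency types handled below are D_ALLOCDIRECT and D_ALLOCINDIR
 * (wait for the journal record, then write the cg bitmap), D_INDIRDEP
 * (do the same for each allocindir still attached to the indirect block),
 * and D_PAGEDEP (flush directory adds; synchronous passes only).
 * D_FREEWORK, D_FREEDEP, D_JSEGDEP and D_JNEWBLK never cause rollbacks
 * and are ignored.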
12267 */
12268int
12269softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12270{
12271	struct indirdep *indirdep;
12272	struct pagedep *pagedep;
12273	struct allocindir *aip;
12274	struct newblk *newblk;
12275	struct buf *nbp;
12276	struct worklist *wk;
12277	int i, error;
12278
12279	/*
12280	 * For VCHR we just don't want to force flush any dependencies that
12281	 * will cause rollbacks.
12282	 */
12283	if (vp->v_type == VCHR) {
12284		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12285			return (EBUSY);
12286		return (0);
12287	}
12288	ACQUIRE_LOCK(&lk);
12289	/*
12290	 * As we hold the buffer locked, none of its dependencies
12291	 * will disappear.
12292	 */
12293	error = 0;
12294top:
12295	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12296		switch (wk->wk_type) {
12297
12298		case D_ALLOCDIRECT:
12299		case D_ALLOCINDIR:
12300			newblk = WK_NEWBLK(wk);
12301			if (newblk->nb_jnewblk != NULL) {
12302				if (waitfor == MNT_NOWAIT) {
12303					error = EBUSY;
12304					goto out_unlock;
12305				}
12306				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12307				goto top;
12308			}
12309			if (newblk->nb_state & DEPCOMPLETE ||
12310			    waitfor == MNT_NOWAIT)
12311				continue;
12312			nbp = newblk->nb_bmsafemap->sm_buf;
12313			nbp = getdirtybuf(nbp, &lk, waitfor);
12314			if (nbp == NULL)
12315				goto top;
12316			FREE_LOCK(&lk);
12317			if ((error = bwrite(nbp)) != 0)
12318				goto out;
12319			ACQUIRE_LOCK(&lk);
12320			continue;
12321
12322		case D_INDIRDEP:
12323			indirdep = WK_INDIRDEP(wk);
12324			if (waitfor == MNT_NOWAIT) {
12325				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12326				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12327					error = EBUSY;
12328					goto out_unlock;
12329				}
12330			}
12331			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12332				panic("softdep_sync_buf: truncation pending.");
12333		restart:
12334			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12335				newblk = (struct newblk *)aip;
12336				if (newblk->nb_jnewblk != NULL) {
12337					jwait(&newblk->nb_jnewblk->jn_list,
12338					    waitfor);
12339					goto restart;
12340				}
12341				if (newblk->nb_state & DEPCOMPLETE)
12342					continue;
12343				nbp = newblk->nb_bmsafemap->sm_buf;
12344				nbp = getdirtybuf(nbp, &lk, waitfor);
12345				if (nbp == NULL)
12346					goto restart;
12347				FREE_LOCK(&lk);
12348				if ((error = bwrite(nbp)) != 0)
12349					goto out;
12350				ACQUIRE_LOCK(&lk);
12351				goto restart;
12352			}
12353			continue;
12354
12355		case D_PAGEDEP:
12356			/*
12357			 * Only flush directory entries in synchronous passes.
12358			 */
12359			if (waitfor != MNT_WAIT) {
12360				error = EBUSY;
12361				goto out_unlock;
12362			}
12363			/*
12364			 * While syncing snapshots, we must allow recursive
12365			 * lookups.
12366			 */
12367			BUF_AREC(bp);
12368			/*
12369			 * We are trying to sync a directory that may
12370			 * have dependencies on both its own metadata
12371			 * and/or dependencies on the inodes of any
12372			 * recently allocated files. We walk its diradd
12373			 * lists pushing out the associated inode.
12374			 */
12375			pagedep = WK_PAGEDEP(wk);
12376			for (i = 0; i < DAHASHSZ; i++) {
12377				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12378					continue;
12379				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12380				    &pagedep->pd_diraddhd[i]))) {
12381					BUF_NOREC(bp);
12382					goto out_unlock;
12383				}
12384			}
12385			BUF_NOREC(bp);
12386			continue;
12387
12388		case D_FREEWORK:
12389		case D_FREEDEP:
12390		case D_JSEGDEP:
12391		case D_JNEWBLK:
12392			continue;
12393
12394		default:
12395			panic("softdep_sync_buf: Unknown type %s",
12396			    TYPENAME(wk->wk_type));
12397			/* NOTREACHED */
12398		}
12399	}
12400out_unlock:
12401	FREE_LOCK(&lk);
12402out:
12403	return (error);
12404}
12405
12406/*
12407 * Flush the dependencies associated with an inodedep.
12408 * Called with splbio blocked.
12409 */
12410static int
12411flush_inodedep_deps(vp, mp, ino)
12412	struct vnode *vp;
12413	struct mount *mp;
12414	ino_t ino;
12415{
12416	struct inodedep *inodedep;
12417	struct inoref *inoref;
12418	int error, waitfor;
12419
12420	/*
12421	 * This work is done in two passes. The first pass grabs most
12422	 * of the buffers and begins asynchronously writing them. The
12423	 * only way to wait for these asynchronous writes is to sleep
12424	 * on the filesystem vnode which may stay busy for a long time
12425	 * if the filesystem is active. So, instead, we make a second
12426	 * pass over the dependencies blocking on each write. In the
12427	 * usual case we will be blocking against a write that we
12428	 * initiated, so when it is done the dependency will have been
12429	 * resolved. Thus the second pass is expected to end quickly.
12430	 * We give a brief window at the top of the loop to allow
12431	 * any pending I/O to complete.
12432	 */
12433	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12434		if (error)
12435			return (error);
12436		FREE_LOCK(&lk);
12437		ACQUIRE_LOCK(&lk);
12438restart:
12439		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12440			return (0);
12441		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12442			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12443			    == DEPCOMPLETE) {
12444				jwait(&inoref->if_list, MNT_WAIT);
12445				goto restart;
12446			}
12447		}
12448		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12449		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12450		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12451		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12452			continue;
12453		/*
12454		 * If this was the second pass, we are done; otherwise, do pass 2.
12455		 */
12456		if (waitfor == MNT_WAIT)
12457			break;
12458		waitfor = MNT_WAIT;
12459	}
12460	/*
12461	 * Try freeing inodedep in case all dependencies have been removed.
12462	 */
12463	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12464		(void) free_inodedep(inodedep);
12465	return (0);
12466}
12467
12468/*
12469 * Flush an inode dependency list.
12470 * Called with splbio blocked.
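 *
 * Returns non-zero after waiting on a journal entry or writing a bitmap
 * buffer; the softdep lock may have been dropped, so the caller must
 * rescan the list.  Returns zero when no entry needed (or could be given)
 * any work.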
12471 */
12472static int
12473flush_deplist(listhead, waitfor, errorp)
12474	struct allocdirectlst *listhead;
12475	int waitfor;
12476	int *errorp;
12477{
12478	struct allocdirect *adp;
12479	struct newblk *newblk;
12480	struct buf *bp;
12481
12482	mtx_assert(&lk, MA_OWNED);
12483	TAILQ_FOREACH(adp, listhead, ad_next) {
12484		newblk = (struct newblk *)adp;
12485		if (newblk->nb_jnewblk != NULL) {
12486			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12487			return (1);
12488		}
12489		if (newblk->nb_state & DEPCOMPLETE)
12490			continue;
12491		bp = newblk->nb_bmsafemap->sm_buf;
12492		bp = getdirtybuf(bp, &lk, waitfor);
12493		if (bp == NULL) {
12494			if (waitfor == MNT_NOWAIT)
12495				continue;
12496			return (1);
12497		}
12498		FREE_LOCK(&lk);
12499		if (waitfor == MNT_NOWAIT)
12500			bawrite(bp);
12501		else
12502			*errorp = bwrite(bp);
12503		ACQUIRE_LOCK(&lk);
12504		return (1);
12505	}
12506	return (0);
12507}
12508
12509/*
12510 * Flush dependencies associated with an allocdirect block.
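 *
 * The loop below retires the dependencies in order: the journal record
 * for the allocation, the cylinder group bitmap, the data buffer itself,
 * and finally an ffs_update() to push the direct block pointer to disk.
 * Each sleep can invalidate the newblk, so it is looked up again on
 * every iteration.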
12511 */
12512static int
12513flush_newblk_dep(vp, mp, lbn)
12514	struct vnode *vp;
12515	struct mount *mp;
12516	ufs_lbn_t lbn;
12517{
12518	struct newblk *newblk;
12519	struct bufobj *bo;
12520	struct inode *ip;
12521	struct buf *bp;
12522	ufs2_daddr_t blkno;
12523	int error;
12524
12525	error = 0;
12526	bo = &vp->v_bufobj;
12527	ip = VTOI(vp);
12528	blkno = DIP(ip, i_db[lbn]);
12529	if (blkno == 0)
12530		panic("flush_newblk_dep: Missing block");
12531	ACQUIRE_LOCK(&lk);
12532	/*
12533	 * Loop until all dependencies related to this block are satisfied.
12534	 * We must be careful to restart after each sleep in case a write
12535	 * completes some part of this process for us.
12536	 */
12537	for (;;) {
12538		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12539			FREE_LOCK(&lk);
12540			break;
12541		}
12542		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12543			panic("flush_newblk_deps: Bad newblk %p", newblk);
12544		/*
12545		 * Flush the journal.
12546		 */
12547		if (newblk->nb_jnewblk != NULL) {
12548			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12549			continue;
12550		}
12551		/*
12552		 * Write the bitmap dependency.
12553		 */
12554		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12555			bp = newblk->nb_bmsafemap->sm_buf;
12556			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12557			if (bp == NULL)
12558				continue;
12559			FREE_LOCK(&lk);
12560			error = bwrite(bp);
12561			if (error)
12562				break;
12563			ACQUIRE_LOCK(&lk);
12564			continue;
12565		}
12566		/*
12567		 * Write the buffer.
12568		 */
12569		FREE_LOCK(&lk);
12570		BO_LOCK(bo);
12571		bp = gbincore(bo, lbn);
12572		if (bp != NULL) {
12573			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12574			    LK_INTERLOCK, BO_MTX(bo));
12575			if (error == ENOLCK) {
12576				ACQUIRE_LOCK(&lk);
12577				continue; /* Slept, retry */
12578			}
12579			if (error != 0)
12580				break;	/* Failed */
12581			if (bp->b_flags & B_DELWRI) {
12582				bremfree(bp);
12583				error = bwrite(bp);
12584				if (error)
12585					break;
12586			} else
12587				BUF_UNLOCK(bp);
12588		} else
12589			BO_UNLOCK(bo);
12590		/*
12591		 * We have to wait for the direct pointers to
12592		 * point at the newdirblk before the dependency
12593		 * will go away.
12594		 */
12595		error = ffs_update(vp, 1);
12596		if (error)
12597			break;
12598		ACQUIRE_LOCK(&lk);
12599	}
12600	return (error);
12601}
12602
12603/*
12604 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12605 * Called with splbio blocked.
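 *
 * For each diradd: a MKDIR_PARENT dependency is cleared with ffs_update()
 * on the parent directory; a MKDIR_BODY dependency is cleared by flushing
 * the new directory's first block; then the new inode's bitmap and the
 * inode itself are pushed to disk.  Diradds that need a full directory
 * sync are deferred on a local list and handed back for the caller to
 * resolve.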
12606 */
12607static int
12608flush_pagedep_deps(pvp, mp, diraddhdp)
12609	struct vnode *pvp;
12610	struct mount *mp;
12611	struct diraddhd *diraddhdp;
12612{
12613	struct inodedep *inodedep;
12614	struct inoref *inoref;
12615	struct ufsmount *ump;
12616	struct diradd *dap;
12617	struct vnode *vp;
12618	int error = 0;
12619	struct buf *bp;
12620	ino_t inum;
12621	struct diraddhd unfinished;
12622
12623	LIST_INIT(&unfinished);
12624	ump = VFSTOUFS(mp);
12625restart:
12626	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12627		/*
12628		 * Flush ourselves if this directory entry
12629		 * has a MKDIR_PARENT dependency.
12630		 */
12631		if (dap->da_state & MKDIR_PARENT) {
12632			FREE_LOCK(&lk);
12633			if ((error = ffs_update(pvp, 1)) != 0)
12634				break;
12635			ACQUIRE_LOCK(&lk);
12636			/*
12637			 * If that cleared dependencies, go on to next.
12638			 */
12639			if (dap != LIST_FIRST(diraddhdp))
12640				continue;
12641			/*
12642			 * All MKDIR_PARENT dependencies and all the
12643			 * NEWBLOCK pagedeps that are contained in direct
12644			 * blocks were resolved by the above ffs_update.
12645			 * Pagedeps contained in indirect blocks may
12646			 * require a complete sync'ing of the directory.
12647			 * We are in the midst of doing a complete sync,
12648			 * so if they are not resolved in this pass we
12649			 * defer them for now as they will be sync'ed by
12650			 * our caller shortly.
12651			 */
12652			LIST_REMOVE(dap, da_pdlist);
12653			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
12654			continue;
12655		}
12656		/*
12657		 * A newly allocated directory must have its "." and
12658		 * ".." entries written out before its name can be
12659		 * committed in its parent.
12660		 */
12661		inum = dap->da_newinum;
12662		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12663			panic("flush_pagedep_deps: lost inode1");
12664		/*
12665		 * Wait for any pending journal adds to complete so we don't
12666		 * cause rollbacks while syncing.
12667		 */
12668		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12669			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12670			    == DEPCOMPLETE) {
12671				jwait(&inoref->if_list, MNT_WAIT);
12672				goto restart;
12673			}
12674		}
12675		if (dap->da_state & MKDIR_BODY) {
12676			FREE_LOCK(&lk);
12677			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12678			    FFSV_FORCEINSMQ)))
12679				break;
12680			error = flush_newblk_dep(vp, mp, 0);
12681			/*
12682			 * If we still have the dependency we might need to
12683			 * update the vnode to sync the new link count to
12684			 * disk.
12685			 */
12686			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12687				error = ffs_update(vp, 1);
12688			vput(vp);
12689			if (error != 0)
12690				break;
12691			ACQUIRE_LOCK(&lk);
12692			/*
12693			 * If that cleared dependencies, go on to next.
12694			 */
12695			if (dap != LIST_FIRST(diraddhdp))
12696				continue;
12697			if (dap->da_state & MKDIR_BODY) {
12698				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12699				    &inodedep);
12700				panic("flush_pagedep_deps: MKDIR_BODY "
12701				    "inodedep %p dap %p vp %p",
12702				    inodedep, dap, vp);
12703			}
12704		}
12705		/*
12706		 * Flush the inode on which the directory entry depends.
12707		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12708		 * the only remaining dependency is that the updated inode
12709		 * count must get pushed to disk. The inode has already
12710		 * been pushed into its inode buffer (via VOP_UPDATE) at
12711		 * the time of the reference count change. So we need only
12712		 * locate that buffer, ensure that there will be no rollback
12713		 * caused by a bitmap dependency, then write the inode buffer.
12714		 */
12715retry:
12716		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12717			panic("flush_pagedep_deps: lost inode");
12718		/*
12719		 * If the inode still has bitmap dependencies,
12720		 * push them to disk.
12721		 */
12722		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
12723			bp = inodedep->id_bmsafemap->sm_buf;
12724			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12725			if (bp == NULL)
12726				goto retry;
12727			FREE_LOCK(&lk);
12728			if ((error = bwrite(bp)) != 0)
12729				break;
12730			ACQUIRE_LOCK(&lk);
12731			if (dap != LIST_FIRST(diraddhdp))
12732				continue;
12733		}
12734		/*
12735		 * If the inode is still sitting in a buffer waiting
12736		 * to be written or waiting for the link count to be
12737		 * adjusted, update it here to flush it to disk.
12738		 */
12739		if (dap == LIST_FIRST(diraddhdp)) {
12740			FREE_LOCK(&lk);
12741			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12742			    FFSV_FORCEINSMQ)))
12743				break;
12744			error = ffs_update(vp, 1);
12745			vput(vp);
12746			if (error)
12747				break;
12748			ACQUIRE_LOCK(&lk);
12749		}
12750		/*
12751		 * If we have failed to get rid of all the dependencies
12752		 * then something is seriously wrong.
12753		 */
12754		if (dap == LIST_FIRST(diraddhdp)) {
12755			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
12756			panic("flush_pagedep_deps: failed to flush "
12757			    "inodedep %p ino %d dap %p", inodedep, inum, dap);
12758		}
12759	}
12760	if (error)
12761		ACQUIRE_LOCK(&lk);
12762	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
12763		LIST_REMOVE(dap, da_pdlist);
12764		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
12765	}
12766	return (error);
12767}
12768
12769/*
12770 * A large burst of file addition or deletion activity can drive the
12771 * memory load excessively high. First attempt to slow things down
12772 * using the techniques below. If that fails, this routine requests
12773 * the offending operations to fall back to running synchronously
12774 * until the memory load returns to a reasonable level.
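 *
 * Returns 1 when the caller should fall back to synchronous operation and
 * 0 when it is safe to continue asynchronously.  SU+J mounts never force
 * the fallback; when journal space is short they only ask the syncer to
 * speed up.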
12775 */
12776int
12777softdep_slowdown(vp)
12778	struct vnode *vp;
12779{
12780	struct ufsmount *ump;
12781	int jlow;
12782	int max_softdeps_hard;
12783
12784	ACQUIRE_LOCK(&lk);
12785	jlow = 0;
12786	/*
12787	 * Check for journal space if needed.
12788	 */
12789	if (DOINGSUJ(vp)) {
12790		ump = VFSTOUFS(vp->v_mount);
12791		if (journal_space(ump, 0) == 0)
12792			jlow = 1;
12793	}
12794	max_softdeps_hard = max_softdeps * 11 / 10;
12795	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
12796	    dep_current[D_INODEDEP] < max_softdeps_hard &&
12797	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
12798	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
12799		FREE_LOCK(&lk);
12800		return (0);
12801	}
12802	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
12803		softdep_speedup();
12804	stat_sync_limit_hit += 1;
12805	FREE_LOCK(&lk);
12806	if (DOINGSUJ(vp))
12807		return (0);
12808	return (1);
12809}
12810
12811/*
12812 * Called by the allocation routines when they are about to fail
12813 * in the hope that we can free up the requested resource (inodes
12814 * or disk space).
12815 *
12816 * First check to see if the work list has anything on it. If it has,
12817 * clean up entries until we successfully free the requested resource.
12818 * Because this process holds inodes locked, we cannot handle any remove
12819 * requests that might block on a locked inode as that could lead to
12820 * deadlock. If the worklist yields none of the requested resource,
12821 * start syncing out vnodes to free up the needed space.
12822 */
12823int
12824softdep_request_cleanup(fs, vp, cred, resource)
12825	struct fs *fs;
12826	struct vnode *vp;
12827	struct ucred *cred;
12828	int resource;
12829{
12830	struct ufsmount *ump;
12831	struct mount *mp;
12832	struct vnode *lvp, *mvp;
12833	long starttime;
12834	ufs2_daddr_t needed;
12835	int error;
12836
12837	/*
12838	 * If we are being called because of a process doing a
12839	 * copy-on-write, then it is not safe to process any
12840	 * worklist items as we will recurse into the copyonwrite
12841	 * routine.  This will result in an incoherent snapshot.
12842	 * If the vnode that we hold is a snapshot, we must avoid
12843	 * handling other resources that could cause deadlock.
12844	 */
12845	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
12846		return (0);
12847
12848	if (resource == FLUSH_BLOCKS_WAIT)
12849		stat_cleanup_blkrequests += 1;
12850	else
12851		stat_cleanup_inorequests += 1;
12852
12853	mp = vp->v_mount;
12854	ump = VFSTOUFS(mp);
12855	mtx_assert(UFS_MTX(ump), MA_OWNED);
12856	UFS_UNLOCK(ump);
12857	error = ffs_update(vp, 1);
12858	if (error != 0) {
12859		UFS_LOCK(ump);
12860		return (0);
12861	}
12862	/*
12863	 * If we are in need of resources, consider pausing for
12864	 * tickdelay to give ourselves some breathing room.
12865	 */
12866	ACQUIRE_LOCK(&lk);
12867	process_removes(vp);
12868	process_truncates(vp);
12869	request_cleanup(UFSTOVFS(ump), resource);
12870	FREE_LOCK(&lk);
12871	/*
12872	 * Now clean up at least as many resources as we will need.
12873	 *
12874	 * When requested to clean up inodes, the number that are needed
12875	 * is set by the number of simultaneous writers (mnt_writeopcount)
12876	 * plus a bit of slop (2) in case some more writers show up while
12877	 * we are cleaning.
12878	 *
12879	 * When requested to free up space, the amount of space that
12880	 * we need is enough blocks to allocate a full-sized segment
12881	 * (fs_contigsumsize). The number of such segments that will
12882	 * be needed is set by the number of simultaneous writers
12883	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
12884	 * writers show up while we are cleaning.
12885	 *
12886	 * Additionally, if we are unprivileged and allocating space,
12887	 * we need to ensure that we clean up enough blocks to get the
12888	 * needed number of blocks over the threshold of the minimum
12889	 * number of blocks required to be kept free by the filesystem
12890	 * (fs_minfree).
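	 *
	 * For example (hypothetical numbers): with 10 concurrent writers and
	 * fs_contigsumsize of 16, the block target is (10 + 2) * 16 = 192
	 * free blocks, plus the fs_minfree reserve for unprivileged callers.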
12891	 */
12892	if (resource == FLUSH_INODES_WAIT) {
12893		needed = vp->v_mount->mnt_writeopcount + 2;
12894	} else if (resource == FLUSH_BLOCKS_WAIT) {
12895		needed = (vp->v_mount->mnt_writeopcount + 2) *
12896		    fs->fs_contigsumsize;
12897		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
12898			needed += fragstoblks(fs,
12899			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
12900			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
12901	} else {
12902		UFS_LOCK(ump);
12903		printf("softdep_request_cleanup: Unknown resource type %d\n",
12904		    resource);
12905		return (0);
12906	}
12907	starttime = time_second;
12908retry:
12909	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
12910	    fs->fs_cstotal.cs_nbfree <= needed) ||
12911	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12912	    fs->fs_cstotal.cs_nifree <= needed)) {
12913		ACQUIRE_LOCK(&lk);
12914		if (ump->softdep_on_worklist > 0 &&
12915		    process_worklist_item(UFSTOVFS(ump),
12916		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
12917			stat_worklist_push += 1;
12918		FREE_LOCK(&lk);
12919	}
12920	/*
12921	 * If we still need resources and there are no more worklist
12922	 * entries to process to obtain them, we have to start flushing
12923	 * the dirty vnodes to force the release of additional requests
12924	 * to the worklist that we can then process to reap additional
12925	 * resources. We walk the vnodes associated with the mount point
12926	 * until we get the needed worklist requests that we can reap.
12927	 */
12928	if ((resource == FLUSH_BLOCKS_WAIT &&
12929	     fs->fs_cstotal.cs_nbfree <= needed) ||
12930	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12931	     fs->fs_cstotal.cs_nifree <= needed)) {
12932		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
12933			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
12934				VI_UNLOCK(lvp);
12935				continue;
12936			}
12937			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
12938			    curthread))
12939				continue;
12940			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
12941				vput(lvp);
12942				continue;
12943			}
12944			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
12945			vput(lvp);
12946		}
12947		lvp = ump->um_devvp;
12948		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
12949			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
12950			VOP_UNLOCK(lvp, 0);
12951		}
12952		if (ump->softdep_on_worklist > 0) {
12953			stat_cleanup_retries += 1;
12954			goto retry;
12955		}
12956		stat_cleanup_failures += 1;
12957	}
12958	if (time_second - starttime > stat_cleanup_high_delay)
12959		stat_cleanup_high_delay = time_second - starttime;
12960	UFS_LOCK(ump);
12961	return (1);
12962}
12963
12964/*
12965 * If memory utilization has gotten too high, deliberately slow things
12966 * down and speed up the I/O processing.
12967 */
12968extern struct thread *syncertd;
12969static int
12970request_cleanup(mp, resource)
12971	struct mount *mp;
12972	int resource;
12973{
12974	struct thread *td = curthread;
12975	struct ufsmount *ump;
12976
12977	mtx_assert(&lk, MA_OWNED);
12978	/*
12979	 * We never hold up the filesystem syncer or buf daemon.
12980	 */
12981	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
12982		return (0);
12983	ump = VFSTOUFS(mp);
12984	/*
12985	 * First check to see if the work list has gotten backlogged.
12986	 * If it has, co-opt this process to help clean up two entries.
12987	 * Because this process may hold inodes locked, we cannot
12988	 * handle any remove requests that might block on a locked
12989	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
12990	 * to avoid recursively processing the worklist.
12991	 */
12992	if (ump->softdep_on_worklist > max_softdeps / 10) {
12993		td->td_pflags |= TDP_SOFTDEP;
12994		process_worklist_item(mp, 2, LK_NOWAIT);
12995		td->td_pflags &= ~TDP_SOFTDEP;
12996		stat_worklist_push += 2;
12997		return(1);
12998	}
12999	/*
13000	 * Next, we attempt to speed up the syncer process. If that
13001	 * is successful, then we allow the process to continue.
13002	 */
13003	if (softdep_speedup() &&
13004	    resource != FLUSH_BLOCKS_WAIT &&
13005	    resource != FLUSH_INODES_WAIT)
13006		return(0);
13007	/*
13008	 * If we are resource constrained on inode dependencies, try
13009	 * flushing some dirty inodes. Otherwise, we are constrained
13010	 * by file deletions, so try accelerating flushes of directories
13011	 * with removal dependencies. We would like to do the cleanup
13012	 * here, but we probably hold an inode locked at this point and
13013	 * that might deadlock against one that we try to clean. So,
13014	 * the best that we can do is request the syncer daemon to do
13015	 * the cleanup for us.
13016	 */
13017	switch (resource) {
13018
13019	case FLUSH_INODES:
13020	case FLUSH_INODES_WAIT:
13021		stat_ino_limit_push += 1;
13022		req_clear_inodedeps += 1;
13023		stat_countp = &stat_ino_limit_hit;
13024		break;
13025
13026	case FLUSH_BLOCKS:
13027	case FLUSH_BLOCKS_WAIT:
13028		stat_blk_limit_push += 1;
13029		req_clear_remove += 1;
13030		stat_countp = &stat_blk_limit_hit;
13031		break;
13032
13033	default:
13034		panic("request_cleanup: unknown type");
13035	}
13036	/*
13037	 * Hopefully the syncer daemon will catch up and awaken us.
13038	 * We wait at most tickdelay before proceeding in any case.
13039	 */
13040	proc_waiting += 1;
13041	if (callout_pending(&softdep_callout) == FALSE)
13042		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13043		    pause_timer, 0);
13044
13045	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13046	proc_waiting -= 1;
13047	return (1);
13048}
13049
13050/*
13051 * Awaken processes pausing in request_cleanup and clear proc_waiting
13052 * to indicate that there is no longer a timer running.
13053 */
13054static void
13055pause_timer(arg)
13056	void *arg;
13057{
13058
13059	/*
13060	 * The callout(9) API has acquired mtx and will hold it around this
13061	 * function call.
13062	 */
13063	*stat_countp += 1;
13064	wakeup_one(&proc_waiting);
13065	if (proc_waiting > 0)
13066		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13067		    pause_timer, 0);
13068}
13069
13070/*
13071 * Flush out a directory with at least one removal dependency in an effort to
13072 * reduce the number of dirrem, freefile, and freeblks dependency structures.
13073 */
13074static void
13075clear_remove(td)
13076	struct thread *td;
13077{
13078	struct pagedep_hashhead *pagedephd;
13079	struct pagedep *pagedep;
13080	static int next = 0;
13081	struct mount *mp;
13082	struct vnode *vp;
13083	struct bufobj *bo;
13084	int error, cnt;
13085	ino_t ino;
13086
13087	mtx_assert(&lk, MA_OWNED);
13088
13089	for (cnt = 0; cnt <= pagedep_hash; cnt++) {
13090		pagedephd = &pagedep_hashtbl[next++];
13091		if (next > pagedep_hash)
13092			next = 0;
13093		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13094			if (LIST_EMPTY(&pagedep->pd_dirremhd))
13095				continue;
13096			mp = pagedep->pd_list.wk_mp;
13097			ino = pagedep->pd_ino;
13098			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13099				continue;
13100			FREE_LOCK(&lk);
13101
13102			/*
13103			 * Let unmount clear deps
13104			 */
13105			error = vfs_busy(mp, MBF_NOWAIT);
13106			if (error != 0)
13107				goto finish_write;
13108			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13109			     FFSV_FORCEINSMQ);
13110			vfs_unbusy(mp);
13111			if (error != 0) {
13112				softdep_error("clear_remove: vget", error);
13113				goto finish_write;
13114			}
13115			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13116				softdep_error("clear_remove: fsync", error);
13117			bo = &vp->v_bufobj;
13118			BO_LOCK(bo);
13119			drain_output(vp);
13120			BO_UNLOCK(bo);
13121			vput(vp);
13122		finish_write:
13123			vn_finished_write(mp);
13124			ACQUIRE_LOCK(&lk);
13125			return;
13126		}
13127	}
13128}
13129
13130/*
13131 * Clear out a block of dirty inodes in an effort to reduce
13132 * the number of inodedep dependency structures.
13133 */
13134static void
13135clear_inodedeps(td)
13136	struct thread *td;
13137{
13138	struct inodedep_hashhead *inodedephd;
13139	struct inodedep *inodedep;
13140	static int next = 0;
13141	struct mount *mp;
13142	struct vnode *vp;
13143	struct fs *fs;
13144	int error, cnt;
13145	ino_t firstino, lastino, ino;
13146
13147	mtx_assert(&lk, MA_OWNED);
13148	/*
13149	 * Pick an inode dependency to be cleared, cycling through the hash buckets.
13150	 * We will then gather up all the inodes in its block
13151	 * that have dependencies and flush them out.
13152	 */
13153	for (cnt = 0; cnt <= inodedep_hash; cnt++) {
13154		inodedephd = &inodedep_hashtbl[next++];
13155		if (next > inodedep_hash)
13156			next = 0;
13157		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13158			break;
13159	}
13160	if (inodedep == NULL)
13161		return;
13162	fs = inodedep->id_fs;
13163	mp = inodedep->id_list.wk_mp;
13164	/*
13165	 * Find the last inode in the block with dependencies.
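	 *
	 * firstino rounds id_ino down to the first inode of its inode block;
	 * e.g. with INOPB(fs) == 64, inode 1000 gives firstino 960 and
	 * lastino at most 1023 (INOPB() is a power of two for FFS).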
13166	 */
13167	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13168	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13169		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13170			break;
13171	/*
13172	 * Asynchronously push all but the last inode with dependencies.
13173	 * Synchronously push the last inode with dependencies to ensure
13174	 * that the inode block gets written to free up the inodedeps.
13175	 */
13176	for (ino = firstino; ino <= lastino; ino++) {
13177		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13178			continue;
13179		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13180			continue;
13181		FREE_LOCK(&lk);
13182		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13183		if (error != 0) {
13184			vn_finished_write(mp);
13185			ACQUIRE_LOCK(&lk);
13186			return;
13187		}
13188		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13189		    FFSV_FORCEINSMQ)) != 0) {
13190			softdep_error("clear_inodedeps: vget", error);
13191			vfs_unbusy(mp);
13192			vn_finished_write(mp);
13193			ACQUIRE_LOCK(&lk);
13194			return;
13195		}
13196		vfs_unbusy(mp);
13197		if (ino == lastino) {
13198			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13199				softdep_error("clear_inodedeps: fsync1", error);
13200		} else {
13201			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13202				softdep_error("clear_inodedeps: fsync2", error);
13203			BO_LOCK(&vp->v_bufobj);
13204			drain_output(vp);
13205			BO_UNLOCK(&vp->v_bufobj);
13206		}
13207		vput(vp);
13208		vn_finished_write(mp);
13209		ACQUIRE_LOCK(&lk);
13210	}
13211}
13212
13213void
13214softdep_buf_append(bp, wkhd)
13215	struct buf *bp;
13216	struct workhead *wkhd;
13217{
13218	struct worklist *wk;
13219
13220	ACQUIRE_LOCK(&lk);
13221	while ((wk = LIST_FIRST(wkhd)) != NULL) {
13222		WORKLIST_REMOVE(wk);
13223		WORKLIST_INSERT(&bp->b_dep, wk);
13224	}
13225	FREE_LOCK(&lk);
13227}
13228
13229void
13230softdep_inode_append(ip, cred, wkhd)
13231	struct inode *ip;
13232	struct ucred *cred;
13233	struct workhead *wkhd;
13234{
13235	struct buf *bp;
13236	struct fs *fs;
13237	int error;
13238
13239	fs = ip->i_fs;
13240	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13241	    (int)fs->fs_bsize, cred, &bp);
13242	if (error) {
13243		bqrelse(bp);
13244		softdep_freework(wkhd);
13245		return;
13246	}
13247	softdep_buf_append(bp, wkhd);
13248	bqrelse(bp);
13249}
13250
13251void
13252softdep_freework(wkhd)
13253	struct workhead *wkhd;
13254{
13255
13256	ACQUIRE_LOCK(&lk);
13257	handle_jwork(wkhd);
13258	FREE_LOCK(&lk);
13259}
13260
13261/*
13262 * Function to determine if the buffer has outstanding dependencies
13263 * that will cause a roll-back if the buffer is written. If wantcount
13264 * is set, return number of dependencies, otherwise just yes or no.
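 *
 * Only dependencies that would force the I/O completion handler to roll
 * part of the buffer back are counted: unwritten bitmaps, block pointers
 * and inode references that are not yet journaled, pending truncations,
 * and pending directory adds and removes.  Fully journaled work items
 * (D_FREEWORK, D_FREEDEP, D_JSEGDEP, D_JSEG, D_SBDEP) never roll back
 * and are skipped.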
13265 */
13266static int
13267softdep_count_dependencies(bp, wantcount)
13268	struct buf *bp;
13269	int wantcount;
13270{
13271	struct worklist *wk;
13272	struct bmsafemap *bmsafemap;
13273	struct freework *freework;
13274	struct inodedep *inodedep;
13275	struct indirdep *indirdep;
13276	struct freeblks *freeblks;
13277	struct allocindir *aip;
13278	struct pagedep *pagedep;
13279	struct dirrem *dirrem;
13280	struct newblk *newblk;
13281	struct mkdir *mkdir;
13282	struct diradd *dap;
13283	int i, retval;
13284
13285	retval = 0;
13286	ACQUIRE_LOCK(&lk);
13287	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13288		switch (wk->wk_type) {
13289
13290		case D_INODEDEP:
13291			inodedep = WK_INODEDEP(wk);
13292			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13293				/* bitmap allocation dependency */
13294				retval += 1;
13295				if (!wantcount)
13296					goto out;
13297			}
13298			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13299				/* direct block pointer dependency */
13300				retval += 1;
13301				if (!wantcount)
13302					goto out;
13303			}
13304			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13305				/* direct block pointer dependency */
13306				retval += 1;
13307				if (!wantcount)
13308					goto out;
13309			}
13310			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13311				/* Add reference dependency. */
13312				retval += 1;
13313				if (!wantcount)
13314					goto out;
13315			}
13316			continue;
13317
13318		case D_INDIRDEP:
13319			indirdep = WK_INDIRDEP(wk);
13320
13321			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13322				/* indirect truncation dependency */
13323				retval += 1;
13324				if (!wantcount)
13325					goto out;
13326			}
13327
13328			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13329				/* indirect block pointer dependency */
13330				retval += 1;
13331				if (!wantcount)
13332					goto out;
13333			}
13334			continue;
13335
13336		case D_PAGEDEP:
13337			pagedep = WK_PAGEDEP(wk);
13338			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13339				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13340					/* Journal remove ref dependency. */
13341					retval += 1;
13342					if (!wantcount)
13343						goto out;
13344				}
13345			}
13346			for (i = 0; i < DAHASHSZ; i++) {
13347
13348				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13349					/* directory entry dependency */
13350					retval += 1;
13351					if (!wantcount)
13352						goto out;
13353				}
13354			}
13355			continue;
13356
13357		case D_BMSAFEMAP:
13358			bmsafemap = WK_BMSAFEMAP(wk);
13359			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13360				/* Add reference dependency. */
13361				retval += 1;
13362				if (!wantcount)
13363					goto out;
13364			}
13365			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13366				/* Allocate block dependency. */
13367				retval += 1;
13368				if (!wantcount)
13369					goto out;
13370			}
13371			continue;
13372
13373		case D_FREEBLKS:
13374			freeblks = WK_FREEBLKS(wk);
13375			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13376				/* Freeblk journal dependency. */
13377				retval += 1;
13378				if (!wantcount)
13379					goto out;
13380			}
13381			continue;
13382
13383		case D_ALLOCDIRECT:
13384		case D_ALLOCINDIR:
13385			newblk = WK_NEWBLK(wk);
13386			if (newblk->nb_jnewblk) {
13387				/* Journal allocate dependency. */
13388				retval += 1;
13389				if (!wantcount)
13390					goto out;
13391			}
13392			continue;
13393
13394		case D_MKDIR:
13395			mkdir = WK_MKDIR(wk);
13396			if (mkdir->md_jaddref) {
13397				/* Journal reference dependency. */
13398				retval += 1;
13399				if (!wantcount)
13400					goto out;
13401			}
13402			continue;
13403
13404		case D_FREEWORK:
13405		case D_FREEDEP:
13406		case D_JSEGDEP:
13407		case D_JSEG:
13408		case D_SBDEP:
13409			/* never a dependency on these blocks */
13410			continue;
13411
13412		default:
13413			panic("softdep_count_dependencies: Unexpected type %s",
13414			    TYPENAME(wk->wk_type));
13415			/* NOTREACHED */
13416		}
13417	}
13418out:
13419	FREE_LOCK(&lk);
13420	return (retval);
13421}
13422
13423/*
13424 * Acquire exclusive access to a buffer.
13425 * Must be called with a locked mtx parameter.
13426 * Return acquired buffer or NULL on failure.
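 *
 * A NULL return may mean that mtx was dropped and reacquired while
 * sleeping, so the caller must revalidate anything it derived under the
 * lock (typically by restarting its lookup).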
13427 */
13428static struct buf *
13429getdirtybuf(bp, mtx, waitfor)
13430	struct buf *bp;
13431	struct mtx *mtx;
13432	int waitfor;
13433{
13434	int error;
13435
13436	mtx_assert(mtx, MA_OWNED);
13437	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13438		if (waitfor != MNT_WAIT)
13439			return (NULL);
13440		error = BUF_LOCK(bp,
13441		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
13442		/*
13443		 * Even if we successfully acquire bp here, we have dropped
13444		 * mtx, which may violate our guarantee.
13445		 */
13446		if (error == 0)
13447			BUF_UNLOCK(bp);
13448		else if (error != ENOLCK)
13449			panic("getdirtybuf: inconsistent lock: %d", error);
13450		mtx_lock(mtx);
13451		return (NULL);
13452	}
13453	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13454		if (mtx == &lk && waitfor == MNT_WAIT) {
13455			mtx_unlock(mtx);
13456			BO_LOCK(bp->b_bufobj);
13457			BUF_UNLOCK(bp);
13458			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13459				bp->b_vflags |= BV_BKGRDWAIT;
13460				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
13461				       PRIBIO | PDROP, "getbuf", 0);
13462			} else
13463				BO_UNLOCK(bp->b_bufobj);
13464			mtx_lock(mtx);
13465			return (NULL);
13466		}
13467		BUF_UNLOCK(bp);
13468		if (waitfor != MNT_WAIT)
13469			return (NULL);
13470		/*
13471		 * The mtx argument must be bp->b_vp's mutex in
13472		 * this case.
13473		 */
13474#ifdef	DEBUG_VFS_LOCKS
13475		if (bp->b_vp->v_type != VCHR)
13476			ASSERT_BO_LOCKED(bp->b_bufobj);
13477#endif
13478		bp->b_vflags |= BV_BKGRDWAIT;
13479		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
13480		return (NULL);
13481	}
13482	if ((bp->b_flags & B_DELWRI) == 0) {
13483		BUF_UNLOCK(bp);
13484		return (NULL);
13485	}
13486	bremfree(bp);
13487	return (bp);
13488}
13489
13490
13491/*
13492 * Check if it is safe to suspend the file system now.  On entry,
13493 * the vnode interlock for devvp should be held.  Return 0 with
13494 * the mount interlock held if the file system can be suspended now,
13495 * otherwise return EAGAIN with the mount interlock held.
13496 */
13497int
13498softdep_check_suspend(struct mount *mp,
13499		      struct vnode *devvp,
13500		      int softdep_deps,
13501		      int softdep_accdeps,
13502		      int secondary_writes,
13503		      int secondary_accwrites)
13504{
13505	struct bufobj *bo;
13506	struct ufsmount *ump;
13507	int error;
13508
13509	ump = VFSTOUFS(mp);
13510	bo = &devvp->v_bufobj;
13511	ASSERT_BO_LOCKED(bo);
13512
13513	for (;;) {
13514		if (!TRY_ACQUIRE_LOCK(&lk)) {
13515			BO_UNLOCK(bo);
13516			ACQUIRE_LOCK(&lk);
13517			FREE_LOCK(&lk);
13518			BO_LOCK(bo);
13519			continue;
13520		}
13521		MNT_ILOCK(mp);
13522		if (mp->mnt_secondary_writes != 0) {
13523			FREE_LOCK(&lk);
13524			BO_UNLOCK(bo);
13525			msleep(&mp->mnt_secondary_writes,
13526			       MNT_MTX(mp),
13527			       (PUSER - 1) | PDROP, "secwr", 0);
13528			BO_LOCK(bo);
13529			continue;
13530		}
13531		break;
13532	}
13533
13534	/*
13535	 * Reasons for needing more work before suspend:
13536	 * - Dirty buffers on devvp.
13537	 * - Softdep activity occurred after start of vnode sync loop
13538	 * - Secondary writes occurred after start of vnode sync loop
13539	 */
13540	error = 0;
13541	if (bo->bo_numoutput > 0 ||
13542	    bo->bo_dirty.bv_cnt > 0 ||
13543	    softdep_deps != 0 ||
13544	    ump->softdep_deps != 0 ||
13545	    softdep_accdeps != ump->softdep_accdeps ||
13546	    secondary_writes != 0 ||
13547	    mp->mnt_secondary_writes != 0 ||
13548	    secondary_accwrites != mp->mnt_secondary_accwrites)
13549		error = EAGAIN;
13550	FREE_LOCK(&lk);
13551	BO_UNLOCK(bo);
13552	return (error);
13553}
13554
13555
13556/*
13557 * Get the number of dependency structures for the file system, both
13558 * the current number and the total number allocated.  These will
13559 * later be used to detect that softdep processing has occurred.
13560 */
13561void
13562softdep_get_depcounts(struct mount *mp,
13563		      int *softdep_depsp,
13564		      int *softdep_accdepsp)
13565{
13566	struct ufsmount *ump;
13567
13568	ump = VFSTOUFS(mp);
13569	ACQUIRE_LOCK(&lk);
13570	*softdep_depsp = ump->softdep_deps;
13571	*softdep_accdepsp = ump->softdep_accdeps;
13572	FREE_LOCK(&lk);
13573}
13574
13575/*
13576 * Wait for pending output on a vnode to complete.
13577 * Must be called with vnode lock and interlock locked.
13578 *
13579 * XXX: Should just be a call to bufobj_wwait().
13580 */
13581static void
13582drain_output(vp)
13583	struct vnode *vp;
13584{
13585	struct bufobj *bo;
13586
13587	bo = &vp->v_bufobj;
13588	ASSERT_VOP_LOCKED(vp, "drain_output");
13589	ASSERT_BO_LOCKED(bo);
13590
13591	while (bo->bo_numoutput) {
13592		bo->bo_flag |= BO_WWAIT;
13593		msleep((caddr_t)&bo->bo_numoutput,
13594		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
13595	}
13596}
13597
13598/*
13599 * Called whenever a buffer that is being invalidated or reallocated
13600 * contains dependencies. This should only happen if an I/O error has
13601 * occurred. The routine is called with the buffer locked.
13602 */
13603static void
13604softdep_deallocate_dependencies(bp)
13605	struct buf *bp;
13606{
13607
13608	if ((bp->b_ioflags & BIO_ERROR) == 0)
13609		panic("softdep_deallocate_dependencies: dangling deps");
13610	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
13611	panic("softdep_deallocate_dependencies: unrecovered I/O error");
13612}
13613
13614/*
13615 * Function to handle asynchronous write errors in the filesystem.
13616 */
13617static void
13618softdep_error(func, error)
13619	char *func;
13620	int error;
13621{
13622
13623	/* XXX should do something better! */
13624	printf("%s: got error %d while accessing filesystem\n", func, error);
13625}
13626
13627#ifdef DDB
13628
13629static void
13630inodedep_print(struct inodedep *inodedep, int verbose)
13631{
13632	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
13633	    " saveino %p\n",
13634	    inodedep, inodedep->id_fs, inodedep->id_state,
13635	    (intmax_t)inodedep->id_ino,
13636	    (intmax_t)fsbtodb(inodedep->id_fs,
13637	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
13638	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
13639	    inodedep->id_savedino1);
13640
13641	if (verbose == 0)
13642		return;
13643
13644	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
13645	    "mkdiradd %p\n",
13646	    LIST_FIRST(&inodedep->id_pendinghd),
13647	    LIST_FIRST(&inodedep->id_bufwait),
13648	    LIST_FIRST(&inodedep->id_inowait),
13649	    TAILQ_FIRST(&inodedep->id_inoreflst),
13650	    inodedep->id_mkdiradd);
13651	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
13652	    TAILQ_FIRST(&inodedep->id_inoupdt),
13653	    TAILQ_FIRST(&inodedep->id_newinoupdt),
13654	    TAILQ_FIRST(&inodedep->id_extupdt),
13655	    TAILQ_FIRST(&inodedep->id_newextupdt));
13656}
13657
13658DB_SHOW_COMMAND(inodedep, db_show_inodedep)
13659{
13660
13661	if (have_addr == 0) {
13662		db_printf("Address required\n");
13663		return;
13664	}
13665	inodedep_print((struct inodedep*)addr, 1);
13666}
13667
13668DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
13669{
13670	struct inodedep_hashhead *inodedephd;
13671	struct inodedep *inodedep;
13672	struct fs *fs;
13673	int cnt;
13674
13675	fs = have_addr ? (struct fs *)addr : NULL;
13676	for (cnt = 0; cnt < inodedep_hash; cnt++) {
13677		inodedephd = &inodedep_hashtbl[cnt];
13678		LIST_FOREACH(inodedep, inodedephd, id_hash) {
13679			if (fs != NULL && fs != inodedep->id_fs)
13680				continue;
13681			inodedep_print(inodedep, 0);
13682		}
13683	}
13684}
13685
13686DB_SHOW_COMMAND(worklist, db_show_worklist)
13687{
13688	struct worklist *wk;
13689
13690	if (have_addr == 0) {
13691		db_printf("Address required\n");
13692		return;
13693	}
13694	wk = (struct worklist *)addr;
13695	printf("worklist: %p type %s state 0x%X\n",
13696	    wk, TYPENAME(wk->wk_type), wk->wk_state);
13697}
13698
13699DB_SHOW_COMMAND(workhead, db_show_workhead)
13700{
13701	struct workhead *wkhd;
13702	struct worklist *wk;
13703	int i;
13704
13705	if (have_addr == 0) {
13706		db_printf("Address required\n");
13707		return;
13708	}
13709	wkhd = (struct workhead *)addr;
13710	wk = LIST_FIRST(wkhd);
13711	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
13712		db_printf("worklist: %p type %s state 0x%X",
13713		    wk, TYPENAME(wk->wk_type), wk->wk_state);
13714	if (i == 100)
13715		db_printf("workhead overflow");
13716	printf("\n");
13717}
13718
13719
13720DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
13721{
13722	struct jaddref *jaddref;
13723	struct diradd *diradd;
13724	struct mkdir *mkdir;
13725
13726	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
13727		diradd = mkdir->md_diradd;
13728		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
13729		    mkdir, mkdir->md_state, diradd, diradd->da_state);
13730		if ((jaddref = mkdir->md_jaddref) != NULL)
13731			db_printf(" jaddref %p jaddref state 0x%X",
13732			    jaddref, jaddref->ja_state);
13733		db_printf("\n");
13734	}
13735}
13736
13737#endif /* DDB */
13738
13739#endif /* SOFTUPDATES */
13740