ffs_softdep.c revision 249218
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 249218 2013-04-06 22:21:23Z jeff $");
44
45#include "opt_ffs.h"
46#include "opt_quota.h"
47#include "opt_ddb.h"
48
49/*
50 * For now we want the safety net that the DEBUG flag provides.
51 */
52#ifndef DEBUG
53#define DEBUG
54#endif
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/ktr.h>
64#include <sys/limits.h>
65#include <sys/lock.h>
66#include <sys/malloc.h>
67#include <sys/mount.h>
68#include <sys/mutex.h>
69#include <sys/namei.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/stat.h>
73#include <sys/sysctl.h>
74#include <sys/syslog.h>
75#include <sys/vnode.h>
76#include <sys/conf.h>
77
78#include <ufs/ufs/dir.h>
79#include <ufs/ufs/extattr.h>
80#include <ufs/ufs/quota.h>
81#include <ufs/ufs/inode.h>
82#include <ufs/ufs/ufsmount.h>
83#include <ufs/ffs/fs.h>
84#include <ufs/ffs/softdep.h>
85#include <ufs/ffs/ffs_extern.h>
86#include <ufs/ufs/ufs_extern.h>
87
88#include <vm/vm.h>
89#include <vm/vm_extern.h>
90#include <vm/vm_object.h>
91
92#include <geom/geom.h>
93
94#include <ddb/ddb.h>
95
96#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
97
98#ifndef SOFTUPDATES
99
100int
101softdep_flushfiles(oldmnt, flags, td)
102	struct mount *oldmnt;
103	int flags;
104	struct thread *td;
105{
106
107	panic("softdep_flushfiles called");
108}
109
110int
111softdep_mount(devvp, mp, fs, cred)
112	struct vnode *devvp;
113	struct mount *mp;
114	struct fs *fs;
115	struct ucred *cred;
116{
117
118	return (0);
119}
120
121void
122softdep_initialize()
123{
124
125	return;
126}
127
128void
129softdep_uninitialize()
130{
131
132	return;
133}
134
135void
136softdep_unmount(mp)
137	struct mount *mp;
138{
139
140}
141
142void
143softdep_setup_sbupdate(ump, fs, bp)
144	struct ufsmount *ump;
145	struct fs *fs;
146	struct buf *bp;
147{
148}
149
150void
151softdep_setup_inomapdep(bp, ip, newinum, mode)
152	struct buf *bp;
153	struct inode *ip;
154	ino_t newinum;
155	int mode;
156{
157
158	panic("softdep_setup_inomapdep called");
159}
160
161void
162softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
163	struct buf *bp;
164	struct mount *mp;
165	ufs2_daddr_t newblkno;
166	int frags;
167	int oldfrags;
168{
169
170	panic("softdep_setup_blkmapdep called");
171}
172
173void
174softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
175	struct inode *ip;
176	ufs_lbn_t lbn;
177	ufs2_daddr_t newblkno;
178	ufs2_daddr_t oldblkno;
179	long newsize;
180	long oldsize;
181	struct buf *bp;
182{
183
184	panic("softdep_setup_allocdirect called");
185}
186
187void
188softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
189	struct inode *ip;
190	ufs_lbn_t lbn;
191	ufs2_daddr_t newblkno;
192	ufs2_daddr_t oldblkno;
193	long newsize;
194	long oldsize;
195	struct buf *bp;
196{
197
198	panic("softdep_setup_allocext called");
199}
200
201void
202softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
203	struct inode *ip;
204	ufs_lbn_t lbn;
205	struct buf *bp;
206	int ptrno;
207	ufs2_daddr_t newblkno;
208	ufs2_daddr_t oldblkno;
209	struct buf *nbp;
210{
211
212	panic("softdep_setup_allocindir_page called");
213}
214
215void
216softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
217	struct buf *nbp;
218	struct inode *ip;
219	struct buf *bp;
220	int ptrno;
221	ufs2_daddr_t newblkno;
222{
223
224	panic("softdep_setup_allocindir_meta called");
225}
226
227void
228softdep_journal_freeblocks(ip, cred, length, flags)
229	struct inode *ip;
230	struct ucred *cred;
231	off_t length;
232	int flags;
233{
234
235	panic("softdep_journal_freeblocks called");
236}
237
238void
239softdep_journal_fsync(ip)
240	struct inode *ip;
241{
242
243	panic("softdep_journal_fsync called");
244}
245
246void
247softdep_setup_freeblocks(ip, length, flags)
248	struct inode *ip;
249	off_t length;
250	int flags;
251{
252
253	panic("softdep_setup_freeblocks called");
254}
255
256void
257softdep_freefile(pvp, ino, mode)
258		struct vnode *pvp;
259		ino_t ino;
260		int mode;
261{
262
263	panic("softdep_freefile called");
264}
265
266int
267softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
268	struct buf *bp;
269	struct inode *dp;
270	off_t diroffset;
271	ino_t newinum;
272	struct buf *newdirbp;
273	int isnewblk;
274{
275
276	panic("softdep_setup_directory_add called");
277}
278
279void
280softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
281	struct buf *bp;
282	struct inode *dp;
283	caddr_t base;
284	caddr_t oldloc;
285	caddr_t newloc;
286	int entrysize;
287{
288
289	panic("softdep_change_directoryentry_offset called");
290}
291
292void
293softdep_setup_remove(bp, dp, ip, isrmdir)
294	struct buf *bp;
295	struct inode *dp;
296	struct inode *ip;
297	int isrmdir;
298{
299
300	panic("softdep_setup_remove called");
301}
302
303void
304softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
305	struct buf *bp;
306	struct inode *dp;
307	struct inode *ip;
308	ino_t newinum;
309	int isrmdir;
310{
311
312	panic("softdep_setup_directory_change called");
313}
314
315void
316softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
317	struct mount *mp;
318	struct buf *bp;
319	ufs2_daddr_t blkno;
320	int frags;
321	struct workhead *wkhd;
322{
323
324	panic("%s called", __FUNCTION__);
325}
326
327void
328softdep_setup_inofree(mp, bp, ino, wkhd)
329	struct mount *mp;
330	struct buf *bp;
331	ino_t ino;
332	struct workhead *wkhd;
333{
334
335	panic("%s called", __FUNCTION__);
336}
337
338void
339softdep_setup_unlink(dp, ip)
340	struct inode *dp;
341	struct inode *ip;
342{
343
344	panic("%s called", __FUNCTION__);
345}
346
347void
348softdep_setup_link(dp, ip)
349	struct inode *dp;
350	struct inode *ip;
351{
352
353	panic("%s called", __FUNCTION__);
354}
355
356void
357softdep_revert_link(dp, ip)
358	struct inode *dp;
359	struct inode *ip;
360{
361
362	panic("%s called", __FUNCTION__);
363}
364
365void
366softdep_setup_rmdir(dp, ip)
367	struct inode *dp;
368	struct inode *ip;
369{
370
371	panic("%s called", __FUNCTION__);
372}
373
374void
375softdep_revert_rmdir(dp, ip)
376	struct inode *dp;
377	struct inode *ip;
378{
379
380	panic("%s called", __FUNCTION__);
381}
382
383void
384softdep_setup_create(dp, ip)
385	struct inode *dp;
386	struct inode *ip;
387{
388
389	panic("%s called", __FUNCTION__);
390}
391
392void
393softdep_revert_create(dp, ip)
394	struct inode *dp;
395	struct inode *ip;
396{
397
398	panic("%s called", __FUNCTION__);
399}
400
401void
402softdep_setup_mkdir(dp, ip)
403	struct inode *dp;
404	struct inode *ip;
405{
406
407	panic("%s called", __FUNCTION__);
408}
409
410void
411softdep_revert_mkdir(dp, ip)
412	struct inode *dp;
413	struct inode *ip;
414{
415
416	panic("%s called", __FUNCTION__);
417}
418
419void
420softdep_setup_dotdot_link(dp, ip)
421	struct inode *dp;
422	struct inode *ip;
423{
424
425	panic("%s called", __FUNCTION__);
426}
427
428int
429softdep_prealloc(vp, waitok)
430	struct vnode *vp;
431	int waitok;
432{
433
434	panic("%s called", __FUNCTION__);
435
436	return (0);
437}
438
439int
440softdep_journal_lookup(mp, vpp)
441	struct mount *mp;
442	struct vnode **vpp;
443{
444
445	return (ENOENT);
446}
447
448void
449softdep_change_linkcnt(ip)
450	struct inode *ip;
451{
452
453	panic("softdep_change_linkcnt called");
454}
455
456void
457softdep_load_inodeblock(ip)
458	struct inode *ip;
459{
460
461	panic("softdep_load_inodeblock called");
462}
463
464void
465softdep_update_inodeblock(ip, bp, waitfor)
466	struct inode *ip;
467	struct buf *bp;
468	int waitfor;
469{
470
471	panic("softdep_update_inodeblock called");
472}
473
474int
475softdep_fsync(vp)
476	struct vnode *vp;	/* the "in_core" copy of the inode */
477{
478
479	return (0);
480}
481
482void
483softdep_fsync_mountdev(vp)
484	struct vnode *vp;
485{
486
487	return;
488}
489
490int
491softdep_flushworklist(oldmnt, countp, td)
492	struct mount *oldmnt;
493	int *countp;
494	struct thread *td;
495{
496
497	*countp = 0;
498	return (0);
499}
500
501int
502softdep_sync_metadata(struct vnode *vp)
503{
504
505	return (0);
506}
507
508int
509softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
510{
511
512	return (0);
513}
514
515int
516softdep_slowdown(vp)
517	struct vnode *vp;
518{
519
520	panic("softdep_slowdown called");
521}
522
523void
524softdep_releasefile(ip)
525	struct inode *ip;	/* inode with the zero effective link count */
526{
527
528	panic("softdep_releasefile called");
529}
530
531int
532softdep_request_cleanup(fs, vp, cred, resource)
533	struct fs *fs;
534	struct vnode *vp;
535	struct ucred *cred;
536	int resource;
537{
538
539	return (0);
540}
541
542int
543softdep_check_suspend(struct mount *mp,
544		      struct vnode *devvp,
545		      int softdep_deps,
546		      int softdep_accdeps,
547		      int secondary_writes,
548		      int secondary_accwrites)
549{
550	struct bufobj *bo;
551	int error;
552
553	(void) softdep_deps,
554	(void) softdep_accdeps;
555
556	bo = &devvp->v_bufobj;
557	ASSERT_BO_LOCKED(bo);
558
559	MNT_ILOCK(mp);
560	while (mp->mnt_secondary_writes != 0) {
561		BO_UNLOCK(bo);
562		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
563		    (PUSER - 1) | PDROP, "secwr", 0);
564		BO_LOCK(bo);
565		MNT_ILOCK(mp);
566	}
567
568	/*
569	 * Reasons for needing more work before suspend:
570	 * - Dirty buffers on devvp.
571	 * - Secondary writes occurred after start of vnode sync loop
572	 */
573	error = 0;
574	if (bo->bo_numoutput > 0 ||
575	    bo->bo_dirty.bv_cnt > 0 ||
576	    secondary_writes != 0 ||
577	    mp->mnt_secondary_writes != 0 ||
578	    secondary_accwrites != mp->mnt_secondary_accwrites)
579		error = EAGAIN;
580	BO_UNLOCK(bo);
581	return (error);
582}
583
584void
585softdep_get_depcounts(struct mount *mp,
586		      int *softdepactivep,
587		      int *softdepactiveaccp)
588{
589	(void) mp;
590	*softdepactivep = 0;
591	*softdepactiveaccp = 0;
592}
593
594void
595softdep_buf_append(bp, wkhd)
596	struct buf *bp;
597	struct workhead *wkhd;
598{
599
600	panic("softdep_buf_appendwork called");
601}
602
603void
604softdep_inode_append(ip, cred, wkhd)
605	struct inode *ip;
606	struct ucred *cred;
607	struct workhead *wkhd;
608{
609
610	panic("softdep_inode_appendwork called");
611}
612
613void
614softdep_freework(wkhd)
615	struct workhead *wkhd;
616{
617
618	panic("softdep_freework called");
619}
620
621#else
622
623FEATURE(softupdates, "FFS soft-updates support");
624
625/*
626 * These definitions need to be adapted to the system to which
627 * this file is being ported.
628 */
629
630#define M_SOFTDEP_FLAGS	(M_WAITOK)
631
632#define	D_PAGEDEP	0
633#define	D_INODEDEP	1
634#define	D_BMSAFEMAP	2
635#define	D_NEWBLK	3
636#define	D_ALLOCDIRECT	4
637#define	D_INDIRDEP	5
638#define	D_ALLOCINDIR	6
639#define	D_FREEFRAG	7
640#define	D_FREEBLKS	8
641#define	D_FREEFILE	9
642#define	D_DIRADD	10
643#define	D_MKDIR		11
644#define	D_DIRREM	12
645#define	D_NEWDIRBLK	13
646#define	D_FREEWORK	14
647#define	D_FREEDEP	15
648#define	D_JADDREF	16
649#define	D_JREMREF	17
650#define	D_JMVREF	18
651#define	D_JNEWBLK	19
652#define	D_JFREEBLK	20
653#define	D_JFREEFRAG	21
654#define	D_JSEG		22
655#define	D_JSEGDEP	23
656#define	D_SBDEP		24
657#define	D_JTRUNC	25
658#define	D_JFSYNC	26
659#define	D_SENTINAL	27
660#define	D_LAST		D_SENTINAL
661
662unsigned long dep_current[D_LAST + 1];
663unsigned long dep_total[D_LAST + 1];
664unsigned long dep_write[D_LAST + 1];
665
666
667static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
668    "soft updates stats");
669static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
670    "total dependencies allocated");
671static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
672    "current dependencies allocated");
673static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
674    "current dependencies written");
675
676#define	SOFTDEP_TYPE(type, str, long)					\
677    static MALLOC_DEFINE(M_ ## type, #str, long);			\
678    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
679	&dep_total[D_ ## type], 0, "");					\
680    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
681	&dep_current[D_ ## type], 0, "");				\
682    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
683	&dep_write[D_ ## type], 0, "");
684
685SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
686SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
687SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
688    "Block or frag allocated from cyl group map");
689SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
690SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
691SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
692SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
693SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
694SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
695SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
696SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
697SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
698SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
699SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
700SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
701SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
702SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
703SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
704SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
705SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
706SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
707SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
708SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
709SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
710SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
711SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
712SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
713
714static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
715static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
716
717/*
718 * translate from workitem type to memory type
719 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
720 */
721static struct malloc_type *memtype[] = {
722	M_PAGEDEP,
723	M_INODEDEP,
724	M_BMSAFEMAP,
725	M_NEWBLK,
726	M_ALLOCDIRECT,
727	M_INDIRDEP,
728	M_ALLOCINDIR,
729	M_FREEFRAG,
730	M_FREEBLKS,
731	M_FREEFILE,
732	M_DIRADD,
733	M_MKDIR,
734	M_DIRREM,
735	M_NEWDIRBLK,
736	M_FREEWORK,
737	M_FREEDEP,
738	M_JADDREF,
739	M_JREMREF,
740	M_JMVREF,
741	M_JNEWBLK,
742	M_JFREEBLK,
743	M_JFREEFRAG,
744	M_JSEG,
745	M_JSEGDEP,
746	M_SBDEP,
747	M_JTRUNC,
748	M_JFSYNC
749};
750
751static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
752
753#define DtoM(type) (memtype[type])
754
755/*
756 * Names of malloc types.
757 */
758#define TYPENAME(type)  \
759	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
760/*
761 * End system adaptation definitions.
762 */
763
764#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
765#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
766
767/*
768 * Forward declarations.
769 */
770struct inodedep_hashhead;
771struct newblk_hashhead;
772struct pagedep_hashhead;
773struct bmsafemap_hashhead;
774
775/*
776 * Private journaling structures.
777 */
778struct jblocks {
779	struct jseglst	jb_segs;	/* TAILQ of current segments. */
780	struct jseg	*jb_writeseg;	/* Next write to complete. */
781	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
782	struct jextent	*jb_extent;	/* Extent array. */
783	uint64_t	jb_nextseq;	/* Next sequence number. */
784	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
785	uint8_t		jb_needseg;	/* Need a forced segment. */
786	uint8_t		jb_suspended;	/* Did journal suspend writes? */
787	int		jb_avail;	/* Available extents. */
788	int		jb_used;	/* Last used extent. */
789	int		jb_head;	/* Allocator head. */
790	int		jb_off;		/* Allocator extent offset. */
791	int		jb_blocks;	/* Total disk blocks covered. */
792	int		jb_free;	/* Total disk blocks free. */
793	int		jb_min;		/* Minimum free space. */
794	int		jb_low;		/* Low on space. */
795	int		jb_age;		/* Insertion time of oldest rec. */
796};
797
798struct jextent {
799	ufs2_daddr_t	je_daddr;	/* Disk block address. */
800	int		je_blocks;	/* Disk block count. */
801};
802
803/*
804 * Internal function prototypes.
805 */
806static	void softdep_error(char *, int);
807static	void drain_output(struct vnode *);
808static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
809static	void clear_remove(void);
810static	void clear_inodedeps(void);
811static	void unlinked_inodedep(struct mount *, struct inodedep *);
812static	void clear_unlinked_inodedep(struct inodedep *);
813static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
814static	int flush_pagedep_deps(struct vnode *, struct mount *,
815	    struct diraddhd *);
816static	int free_pagedep(struct pagedep *);
817static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
818static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
819static	int flush_deplist(struct allocdirectlst *, int, int *);
820static	int sync_cgs(struct mount *, int);
821static	int handle_written_filepage(struct pagedep *, struct buf *);
822static	int handle_written_sbdep(struct sbdep *, struct buf *);
823static	void initiate_write_sbdep(struct sbdep *);
824static  void diradd_inode_written(struct diradd *, struct inodedep *);
825static	int handle_written_indirdep(struct indirdep *, struct buf *,
826	    struct buf**);
827static	int handle_written_inodeblock(struct inodedep *, struct buf *);
828static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
829	    uint8_t *);
830static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
831static	void handle_written_jaddref(struct jaddref *);
832static	void handle_written_jremref(struct jremref *);
833static	void handle_written_jseg(struct jseg *, struct buf *);
834static	void handle_written_jnewblk(struct jnewblk *);
835static	void handle_written_jblkdep(struct jblkdep *);
836static	void handle_written_jfreefrag(struct jfreefrag *);
837static	void complete_jseg(struct jseg *);
838static	void complete_jsegs(struct jseg *);
839static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
840static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
841static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
842static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
843static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
844static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
845static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
846static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
847static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
848static	inline void inoref_write(struct inoref *, struct jseg *,
849	    struct jrefrec *);
850static	void handle_allocdirect_partdone(struct allocdirect *,
851	    struct workhead *);
852static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
853	    struct workhead *);
854static	void indirdep_complete(struct indirdep *);
855static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
856static	void indirblk_insert(struct freework *);
857static	void indirblk_remove(struct freework *);
858static	void handle_allocindir_partdone(struct allocindir *);
859static	void initiate_write_filepage(struct pagedep *, struct buf *);
860static	void initiate_write_indirdep(struct indirdep*, struct buf *);
861static	void handle_written_mkdir(struct mkdir *, int);
862static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
863	    uint8_t *);
864static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
865static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
866static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
867static	void handle_workitem_freefile(struct freefile *);
868static	int handle_workitem_remove(struct dirrem *, int);
869static	struct dirrem *newdirrem(struct buf *, struct inode *,
870	    struct inode *, int, struct dirrem **);
871static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
872	    struct buf *);
873static	void cancel_indirdep(struct indirdep *, struct buf *,
874	    struct freeblks *);
875static	void free_indirdep(struct indirdep *);
876static	void free_diradd(struct diradd *, struct workhead *);
877static	void merge_diradd(struct inodedep *, struct diradd *);
878static	void complete_diradd(struct diradd *);
879static	struct diradd *diradd_lookup(struct pagedep *, int);
880static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
881	    struct jremref *);
882static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
883	    struct jremref *);
884static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
885	    struct jremref *, struct jremref *);
886static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
887	    struct jremref *);
888static	void cancel_allocindir(struct allocindir *, struct buf *bp,
889	    struct freeblks *, int);
890static	int setup_trunc_indir(struct freeblks *, struct inode *,
891	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
892static	void complete_trunc_indir(struct freework *);
893static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
894	    int);
895static	void complete_mkdir(struct mkdir *);
896static	void free_newdirblk(struct newdirblk *);
897static	void free_jremref(struct jremref *);
898static	void free_jaddref(struct jaddref *);
899static	void free_jsegdep(struct jsegdep *);
900static	void free_jsegs(struct jblocks *);
901static	void rele_jseg(struct jseg *);
902static	void free_jseg(struct jseg *, struct jblocks *);
903static	void free_jnewblk(struct jnewblk *);
904static	void free_jblkdep(struct jblkdep *);
905static	void free_jfreefrag(struct jfreefrag *);
906static	void free_freedep(struct freedep *);
907static	void journal_jremref(struct dirrem *, struct jremref *,
908	    struct inodedep *);
909static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
910static	int cancel_jaddref(struct jaddref *, struct inodedep *,
911	    struct workhead *);
912static	void cancel_jfreefrag(struct jfreefrag *);
913static	inline void setup_freedirect(struct freeblks *, struct inode *,
914	    int, int);
915static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
916static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
917	    ufs_lbn_t, int);
918static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
919static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
920static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
921ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
922static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
923static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
924	    int, int);
925static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
926static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
927static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
928static	void newblk_freefrag(struct newblk*);
929static	void free_newblk(struct newblk *);
930static	void cancel_allocdirect(struct allocdirectlst *,
931	    struct allocdirect *, struct freeblks *);
932static	int check_inode_unwritten(struct inodedep *);
933static	int free_inodedep(struct inodedep *);
934static	void freework_freeblock(struct freework *);
935static	void freework_enqueue(struct freework *);
936static	int handle_workitem_freeblocks(struct freeblks *, int);
937static	int handle_complete_freeblocks(struct freeblks *, int);
938static	void handle_workitem_indirblk(struct freework *);
939static	void handle_written_freework(struct freework *);
940static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
941static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
942	    struct workhead *);
943static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
944	    struct inodedep *, struct allocindir *, ufs_lbn_t);
945static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
946	    ufs2_daddr_t, ufs_lbn_t);
947static	void handle_workitem_freefrag(struct freefrag *);
948static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
949	    ufs_lbn_t);
950static	void allocdirect_merge(struct allocdirectlst *,
951	    struct allocdirect *, struct allocdirect *);
952static	struct freefrag *allocindir_merge(struct allocindir *,
953	    struct allocindir *);
954static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
955	    struct bmsafemap **);
956static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
957	    int cg, struct bmsafemap *);
958static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
959	    int, struct newblk **);
960static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
961static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
962	    struct inodedep **);
963static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
964static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
965	    int, struct pagedep **);
966static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
967	    struct mount *mp, int, struct pagedep **);
968static	void pause_timer(void *);
969static	int request_cleanup(struct mount *, int);
970static	int process_worklist_item(struct mount *, int, int);
971static	void process_removes(struct vnode *);
972static	void process_truncates(struct vnode *);
973static	void jwork_move(struct workhead *, struct workhead *);
974static	void jwork_insert(struct workhead *, struct jsegdep *);
975static	void add_to_worklist(struct worklist *, int);
976static	void wake_worklist(struct worklist *);
977static	void wait_worklist(struct worklist *, char *);
978static	void remove_from_worklist(struct worklist *);
979static	void softdep_flush(void);
980static	void softdep_flushjournal(struct mount *);
981static	int softdep_speedup(void);
982static	void worklist_speedup(void);
983static	int journal_mount(struct mount *, struct fs *, struct ucred *);
984static	void journal_unmount(struct mount *);
985static	int journal_space(struct ufsmount *, int);
986static	void journal_suspend(struct ufsmount *);
987static	int journal_unsuspend(struct ufsmount *ump);
988static	void softdep_prelink(struct vnode *, struct vnode *);
989static	void add_to_journal(struct worklist *);
990static	void remove_from_journal(struct worklist *);
991static	void softdep_process_journal(struct mount *, struct worklist *, int);
992static	struct jremref *newjremref(struct dirrem *, struct inode *,
993	    struct inode *ip, off_t, nlink_t);
994static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
995	    uint16_t);
996static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
997	    uint16_t);
998static	inline struct jsegdep *inoref_jseg(struct inoref *);
999static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
1000static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
1001	    ufs2_daddr_t, int);
1002static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
1003static	void move_newblock_dep(struct jaddref *, struct inodedep *);
1004static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
1005static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
1006	    ufs2_daddr_t, long, ufs_lbn_t);
1007static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
1008	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
1009static	int jwait(struct worklist *, int);
1010static	struct inodedep *inodedep_lookup_ip(struct inode *);
1011static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
1012static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
1013static	void handle_jwork(struct workhead *);
1014static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
1015	    struct mkdir **);
1016static	struct jblocks *jblocks_create(void);
1017static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
1018static	void jblocks_free(struct jblocks *, struct mount *, int);
1019static	void jblocks_destroy(struct jblocks *);
1020static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
1021
1022/*
1023 * Exported softdep operations.
1024 */
1025static	void softdep_disk_io_initiation(struct buf *);
1026static	void softdep_disk_write_complete(struct buf *);
1027static	void softdep_deallocate_dependencies(struct buf *);
1028static	int softdep_count_dependencies(struct buf *bp, int);
1029
1030static struct mtx lk;
1031MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
1032
1033#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
1034#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
1035#define FREE_LOCK(lk)			mtx_unlock(lk)
1036
1037#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
1038#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)
1039
1040/*
1041 * Worklist queue management.
1042 * These routines require that the lock be held.
1043 */
1044#ifndef /* NOT */ DEBUG
1045#define WORKLIST_INSERT(head, item) do {	\
1046	(item)->wk_state |= ONWORKLIST;		\
1047	LIST_INSERT_HEAD(head, item, wk_list);	\
1048} while (0)
1049#define WORKLIST_REMOVE(item) do {		\
1050	(item)->wk_state &= ~ONWORKLIST;	\
1051	LIST_REMOVE(item, wk_list);		\
1052} while (0)
1053#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1054#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1055
1056#else /* DEBUG */
1057static	void worklist_insert(struct workhead *, struct worklist *, int);
1058static	void worklist_remove(struct worklist *, int);
1059
1060#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1061#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1062#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1063#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1064
1065static void
1066worklist_insert(head, item, locked)
1067	struct workhead *head;
1068	struct worklist *item;
1069	int locked;
1070{
1071
1072	if (locked)
1073		mtx_assert(&lk, MA_OWNED);
1074	if (item->wk_state & ONWORKLIST)
1075		panic("worklist_insert: %p %s(0x%X) already on list",
1076		    item, TYPENAME(item->wk_type), item->wk_state);
1077	item->wk_state |= ONWORKLIST;
1078	LIST_INSERT_HEAD(head, item, wk_list);
1079}
1080
1081static void
1082worklist_remove(item, locked)
1083	struct worklist *item;
1084	int locked;
1085{
1086
1087	if (locked)
1088		mtx_assert(&lk, MA_OWNED);
1089	if ((item->wk_state & ONWORKLIST) == 0)
1090		panic("worklist_remove: %p %s(0x%X) not on list",
1091		    item, TYPENAME(item->wk_type), item->wk_state);
1092	item->wk_state &= ~ONWORKLIST;
1093	LIST_REMOVE(item, wk_list);
1094}
1095#endif /* DEBUG */
1096
1097/*
1098 * Merge two jsegdeps keeping only the oldest one as newer references
1099 * can't be discarded until after older references.
1100 */
1101static inline struct jsegdep *
1102jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1103{
1104	struct jsegdep *swp;
1105
1106	if (two == NULL)
1107		return (one);
1108
1109	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1110		swp = one;
1111		one = two;
1112		two = swp;
1113	}
1114	WORKLIST_REMOVE(&two->jd_list);
1115	free_jsegdep(two);
1116
1117	return (one);
1118}
1119
1120/*
1121 * If two freedeps are compatible free one to reduce list size.
1122 */
1123static inline struct freedep *
1124freedep_merge(struct freedep *one, struct freedep *two)
1125{
1126	if (two == NULL)
1127		return (one);
1128
1129	if (one->fd_freework == two->fd_freework) {
1130		WORKLIST_REMOVE(&two->fd_list);
1131		free_freedep(two);
1132	}
1133	return (one);
1134}
1135
1136/*
1137 * Move journal work from one list to another.  Duplicate freedeps and
1138 * jsegdeps are coalesced to keep the lists as small as possible.
1139 */
1140static void
1141jwork_move(dst, src)
1142	struct workhead *dst;
1143	struct workhead *src;
1144{
1145	struct freedep *freedep;
1146	struct jsegdep *jsegdep;
1147	struct worklist *wkn;
1148	struct worklist *wk;
1149
1150	KASSERT(dst != src,
1151	    ("jwork_move: dst == src"));
1152	freedep = NULL;
1153	jsegdep = NULL;
1154	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1155		if (wk->wk_type == D_JSEGDEP)
1156			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1157		if (wk->wk_type == D_FREEDEP)
1158			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1159	}
1160
1161	mtx_assert(&lk, MA_OWNED);
1162	while ((wk = LIST_FIRST(src)) != NULL) {
1163		WORKLIST_REMOVE(wk);
1164		WORKLIST_INSERT(dst, wk);
1165		if (wk->wk_type == D_JSEGDEP) {
1166			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1167			continue;
1168		}
1169		if (wk->wk_type == D_FREEDEP)
1170			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1171	}
1172}
1173
1174static void
1175jwork_insert(dst, jsegdep)
1176	struct workhead *dst;
1177	struct jsegdep *jsegdep;
1178{
1179	struct jsegdep *jsegdepn;
1180	struct worklist *wk;
1181
1182	LIST_FOREACH(wk, dst, wk_list)
1183		if (wk->wk_type == D_JSEGDEP)
1184			break;
1185	if (wk == NULL) {
1186		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1187		return;
1188	}
1189	jsegdepn = WK_JSEGDEP(wk);
1190	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1191		WORKLIST_REMOVE(wk);
1192		free_jsegdep(jsegdepn);
1193		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1194	} else
1195		free_jsegdep(jsegdep);
1196}
1197
1198/*
1199 * Routines for tracking and managing workitems.
1200 */
1201static	void workitem_free(struct worklist *, int);
1202static	void workitem_alloc(struct worklist *, int, struct mount *);
1203
1204#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
1205
1206static void
1207workitem_free(item, type)
1208	struct worklist *item;
1209	int type;
1210{
1211	struct ufsmount *ump;
1212	mtx_assert(&lk, MA_OWNED);
1213
1214#ifdef DEBUG
1215	if (item->wk_state & ONWORKLIST)
1216		panic("workitem_free: %s(0x%X) still on list",
1217		    TYPENAME(item->wk_type), item->wk_state);
1218	if (item->wk_type != type)
1219		panic("workitem_free: type mismatch %s != %s",
1220		    TYPENAME(item->wk_type), TYPENAME(type));
1221#endif
1222	if (item->wk_state & IOWAITING)
1223		wakeup(item);
1224	ump = VFSTOUFS(item->wk_mp);
1225	if (--ump->softdep_deps == 0 && ump->softdep_req)
1226		wakeup(&ump->softdep_deps);
1227	dep_current[type]--;
1228	free(item, DtoM(type));
1229}
1230
1231static void
1232workitem_alloc(item, type, mp)
1233	struct worklist *item;
1234	int type;
1235	struct mount *mp;
1236{
1237	struct ufsmount *ump;
1238
1239	item->wk_type = type;
1240	item->wk_mp = mp;
1241	item->wk_state = 0;
1242
1243	ump = VFSTOUFS(mp);
1244	ACQUIRE_LOCK(&lk);
1245	dep_current[type]++;
1246	dep_total[type]++;
1247	ump->softdep_deps++;
1248	ump->softdep_accdeps++;
1249	FREE_LOCK(&lk);
1250}
1251
1252/*
1253 * Workitem queue management
1254 */
1255static int max_softdeps;	/* maximum number of structs before slowdown */
1256static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1257static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1258static int proc_waiting;	/* tracks whether we have a timeout posted */
1259static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1260static struct callout softdep_callout;
1261static int req_pending;
1262static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1263static int req_clear_remove;	/* syncer process flush some freeblks */
1264static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1265
1266/*
1267 * runtime statistics
1268 */
1269static int stat_worklist_push;	/* number of worklist cleanups */
1270static int stat_blk_limit_push;	/* number of times block limit neared */
1271static int stat_ino_limit_push;	/* number of times inode limit neared */
1272static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1273static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1274static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1275static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1276static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1277static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1278static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1279static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1280static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1281static int stat_journal_min;	/* Times hit journal min threshold */
1282static int stat_journal_low;	/* Times hit journal low threshold */
1283static int stat_journal_wait;	/* Times blocked in jwait(). */
1284static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1285static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1286static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1287static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1288static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1289static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1290static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1291static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1292static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1293
1294SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1295    &max_softdeps, 0, "");
1296SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1297    &tickdelay, 0, "");
1298SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1299    &maxindirdeps, 0, "");
1300SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1301    &stat_worklist_push, 0,"");
1302SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1303    &stat_blk_limit_push, 0,"");
1304SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1305    &stat_ino_limit_push, 0,"");
1306SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1307    &stat_blk_limit_hit, 0, "");
1308SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1309    &stat_ino_limit_hit, 0, "");
1310SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1311    &stat_sync_limit_hit, 0, "");
1312SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1313    &stat_indir_blk_ptrs, 0, "");
1314SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1315    &stat_inode_bitmap, 0, "");
1316SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1317    &stat_direct_blk_ptrs, 0, "");
1318SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1319    &stat_dir_entry, 0, "");
1320SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1321    &stat_jaddref, 0, "");
1322SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1323    &stat_jnewblk, 0, "");
1324SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1325    &stat_journal_low, 0, "");
1326SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1327    &stat_journal_min, 0, "");
1328SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1329    &stat_journal_wait, 0, "");
1330SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1331    &stat_jwait_filepage, 0, "");
1332SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1333    &stat_jwait_freeblks, 0, "");
1334SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1335    &stat_jwait_inode, 0, "");
1336SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1337    &stat_jwait_newblk, 0, "");
1338SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1339    &stat_cleanup_blkrequests, 0, "");
1340SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1341    &stat_cleanup_inorequests, 0, "");
1342SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1343    &stat_cleanup_high_delay, 0, "");
1344SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1345    &stat_cleanup_retries, 0, "");
1346SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1347    &stat_cleanup_failures, 0, "");
1348SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1349    &softdep_flushcache, 0, "");
1350
1351SYSCTL_DECL(_vfs_ffs);
1352
1353LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1354static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1355
1356static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1357SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1358	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1359
1360static struct proc *softdepproc;
1361static struct kproc_desc softdep_kp = {
1362	"softdepflush",
1363	softdep_flush,
1364	&softdepproc
1365};
1366SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1367    &softdep_kp);
1368
1369static void
1370softdep_flush(void)
1371{
1372	struct mount *nmp;
1373	struct mount *mp;
1374	struct ufsmount *ump;
1375	struct thread *td;
1376	int remaining;
1377	int progress;
1378
1379	td = curthread;
1380	td->td_pflags |= TDP_NORUNNINGBUF;
1381
1382	for (;;) {
1383		kproc_suspend_check(softdepproc);
1384		ACQUIRE_LOCK(&lk);
1385		/*
1386		 * If requested, try removing inode or removal dependencies.
1387		 */
1388		if (req_clear_inodedeps) {
1389			clear_inodedeps();
1390			req_clear_inodedeps -= 1;
1391			wakeup_one(&proc_waiting);
1392		}
1393		if (req_clear_remove) {
1394			clear_remove();
1395			req_clear_remove -= 1;
1396			wakeup_one(&proc_waiting);
1397		}
1398		FREE_LOCK(&lk);
1399		remaining = progress = 0;
1400		mtx_lock(&mountlist_mtx);
1401		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1402			nmp = TAILQ_NEXT(mp, mnt_list);
1403			if (MOUNTEDSOFTDEP(mp) == 0)
1404				continue;
1405			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1406				continue;
1407			progress += softdep_process_worklist(mp, 0);
1408			ump = VFSTOUFS(mp);
1409			remaining += ump->softdep_on_worklist;
1410			mtx_lock(&mountlist_mtx);
1411			nmp = TAILQ_NEXT(mp, mnt_list);
1412			vfs_unbusy(mp);
1413		}
1414		mtx_unlock(&mountlist_mtx);
1415		if (remaining && progress)
1416			continue;
1417		ACQUIRE_LOCK(&lk);
1418		if (!req_pending)
1419			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1420		req_pending = 0;
1421		FREE_LOCK(&lk);
1422	}
1423}
1424
1425static void
1426worklist_speedup(void)
1427{
1428	mtx_assert(&lk, MA_OWNED);
1429	if (req_pending == 0) {
1430		req_pending = 1;
1431		wakeup(&req_pending);
1432	}
1433}
1434
1435static int
1436softdep_speedup(void)
1437{
1438
1439	worklist_speedup();
1440	bd_speedup();
1441	return speedup_syncer();
1442}
1443
1444/*
1445 * Add an item to the end of the work queue.
1446 * This routine requires that the lock be held.
1447 * This is the only routine that adds items to the list.
1448 * The following routine is the only one that removes items
1449 * and does so in order from first to last.
1450 */
1451
1452#define	WK_HEAD		0x0001	/* Add to HEAD. */
1453#define	WK_NODELAY	0x0002	/* Process immediately. */
1454
1455static void
1456add_to_worklist(wk, flags)
1457	struct worklist *wk;
1458	int flags;
1459{
1460	struct ufsmount *ump;
1461
1462	mtx_assert(&lk, MA_OWNED);
1463	ump = VFSTOUFS(wk->wk_mp);
1464	if (wk->wk_state & ONWORKLIST)
1465		panic("add_to_worklist: %s(0x%X) already on list",
1466		    TYPENAME(wk->wk_type), wk->wk_state);
1467	wk->wk_state |= ONWORKLIST;
1468	if (ump->softdep_on_worklist == 0) {
1469		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1470		ump->softdep_worklist_tail = wk;
1471	} else if (flags & WK_HEAD) {
1472		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1473	} else {
1474		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1475		ump->softdep_worklist_tail = wk;
1476	}
1477	ump->softdep_on_worklist += 1;
1478	if (flags & WK_NODELAY)
1479		worklist_speedup();
1480}
1481
1482/*
1483 * Remove the item to be processed. If we are removing the last
1484 * item on the list, we need to recalculate the tail pointer.
1485 */
1486static void
1487remove_from_worklist(wk)
1488	struct worklist *wk;
1489{
1490	struct ufsmount *ump;
1491
1492	ump = VFSTOUFS(wk->wk_mp);
1493	WORKLIST_REMOVE(wk);
1494	if (ump->softdep_worklist_tail == wk)
1495		ump->softdep_worklist_tail =
1496		    (struct worklist *)wk->wk_list.le_prev;
1497	ump->softdep_on_worklist -= 1;
1498}
1499
1500static void
1501wake_worklist(wk)
1502	struct worklist *wk;
1503{
1504	if (wk->wk_state & IOWAITING) {
1505		wk->wk_state &= ~IOWAITING;
1506		wakeup(wk);
1507	}
1508}
1509
1510static void
1511wait_worklist(wk, wmesg)
1512	struct worklist *wk;
1513	char *wmesg;
1514{
1515
1516	wk->wk_state |= IOWAITING;
1517	msleep(wk, &lk, PVM, wmesg, 0);
1518}
1519
1520/*
1521 * Process that runs once per second to handle items in the background queue.
1522 *
1523 * Note that we ensure that everything is done in the order in which they
1524 * appear in the queue. The code below depends on this property to ensure
1525 * that blocks of a file are freed before the inode itself is freed. This
1526 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1527 * until all the old ones have been purged from the dependency lists.
1528 */
1529int
1530softdep_process_worklist(mp, full)
1531	struct mount *mp;
1532	int full;
1533{
1534	int cnt, matchcnt;
1535	struct ufsmount *ump;
1536	long starttime;
1537
1538	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1539	/*
1540	 * Record the process identifier of our caller so that we can give
1541	 * this process preferential treatment in request_cleanup below.
1542	 */
1543	matchcnt = 0;
1544	ump = VFSTOUFS(mp);
1545	ACQUIRE_LOCK(&lk);
1546	starttime = time_second;
1547	softdep_process_journal(mp, NULL, full?MNT_WAIT:0);
1548	while (ump->softdep_on_worklist > 0) {
1549		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1550			break;
1551		else
1552			matchcnt += cnt;
1553		/*
1554		 * If requested, try removing inode or removal dependencies.
1555		 */
1556		if (req_clear_inodedeps) {
1557			clear_inodedeps();
1558			req_clear_inodedeps -= 1;
1559			wakeup_one(&proc_waiting);
1560		}
1561		if (req_clear_remove) {
1562			clear_remove();
1563			req_clear_remove -= 1;
1564			wakeup_one(&proc_waiting);
1565		}
1566		/*
1567		 * We do not generally want to stop for buffer space, but if
1568		 * we are really being a buffer hog, we will stop and wait.
1569		 */
1570		if (should_yield()) {
1571			FREE_LOCK(&lk);
1572			kern_yield(PRI_USER);
1573			bwillwrite();
1574			ACQUIRE_LOCK(&lk);
1575		}
1576		/*
1577		 * Never allow processing to run for more than one
1578		 * second. Otherwise the other mountpoints may get
1579		 * excessively backlogged.
1580		 */
1581		if (!full && starttime != time_second)
1582			break;
1583	}
1584	if (full == 0)
1585		journal_unsuspend(ump);
1586	FREE_LOCK(&lk);
1587	return (matchcnt);
1588}
1589
1590/*
1591 * Process all removes associated with a vnode if we are running out of
1592 * journal space.  Any other process which attempts to flush these will
1593 * be unable as we have the vnodes locked.
1594 */
1595static void
1596process_removes(vp)
1597	struct vnode *vp;
1598{
1599	struct inodedep *inodedep;
1600	struct dirrem *dirrem;
1601	struct mount *mp;
1602	ino_t inum;
1603
1604	mtx_assert(&lk, MA_OWNED);
1605
1606	mp = vp->v_mount;
1607	inum = VTOI(vp)->i_number;
1608	for (;;) {
1609top:
1610		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1611			return;
1612		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1613			/*
1614			 * If another thread is trying to lock this vnode
1615			 * it will fail but we must wait for it to do so
1616			 * before we can proceed.
1617			 */
1618			if (dirrem->dm_state & INPROGRESS) {
1619				wait_worklist(&dirrem->dm_list, "pwrwait");
1620				goto top;
1621			}
1622			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1623			    (COMPLETE | ONWORKLIST))
1624				break;
1625		}
1626		if (dirrem == NULL)
1627			return;
1628		remove_from_worklist(&dirrem->dm_list);
1629		FREE_LOCK(&lk);
1630		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1631			panic("process_removes: suspended filesystem");
1632		handle_workitem_remove(dirrem, 0);
1633		vn_finished_secondary_write(mp);
1634		ACQUIRE_LOCK(&lk);
1635	}
1636}
1637
1638/*
1639 * Process all truncations associated with a vnode if we are running out
1640 * of journal space.  This is called when the vnode lock is already held
1641 * and no other process can clear the truncation.  This function returns
1642 * a value greater than zero if it did any work.
1643 */
1644static void
1645process_truncates(vp)
1646	struct vnode *vp;
1647{
1648	struct inodedep *inodedep;
1649	struct freeblks *freeblks;
1650	struct mount *mp;
1651	ino_t inum;
1652	int cgwait;
1653
1654	mtx_assert(&lk, MA_OWNED);
1655
1656	mp = vp->v_mount;
1657	inum = VTOI(vp)->i_number;
1658	for (;;) {
1659		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1660			return;
1661		cgwait = 0;
1662		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1663			/* Journal entries not yet written.  */
1664			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1665				jwait(&LIST_FIRST(
1666				    &freeblks->fb_jblkdephd)->jb_list,
1667				    MNT_WAIT);
1668				break;
1669			}
1670			/* Another thread is executing this item. */
1671			if (freeblks->fb_state & INPROGRESS) {
1672				wait_worklist(&freeblks->fb_list, "ptrwait");
1673				break;
1674			}
1675			/* Freeblks is waiting on a inode write. */
1676			if ((freeblks->fb_state & COMPLETE) == 0) {
1677				FREE_LOCK(&lk);
1678				ffs_update(vp, 1);
1679				ACQUIRE_LOCK(&lk);
1680				break;
1681			}
1682			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1683			    (ALLCOMPLETE | ONWORKLIST)) {
1684				remove_from_worklist(&freeblks->fb_list);
1685				freeblks->fb_state |= INPROGRESS;
1686				FREE_LOCK(&lk);
1687				if (vn_start_secondary_write(NULL, &mp,
1688				    V_NOWAIT))
1689					panic("process_truncates: "
1690					    "suspended filesystem");
1691				handle_workitem_freeblocks(freeblks, 0);
1692				vn_finished_secondary_write(mp);
1693				ACQUIRE_LOCK(&lk);
1694				break;
1695			}
1696			if (freeblks->fb_cgwait)
1697				cgwait++;
1698		}
1699		if (cgwait) {
1700			FREE_LOCK(&lk);
1701			sync_cgs(mp, MNT_WAIT);
1702			ffs_sync_snap(mp, MNT_WAIT);
1703			ACQUIRE_LOCK(&lk);
1704			continue;
1705		}
1706		if (freeblks == NULL)
1707			break;
1708	}
1709	return;
1710}
1711
1712/*
1713 * Process one item on the worklist.
1714 */
1715static int
1716process_worklist_item(mp, target, flags)
1717	struct mount *mp;
1718	int target;
1719	int flags;
1720{
1721	struct worklist sintenel;
1722	struct worklist *wk;
1723	struct ufsmount *ump;
1724	int matchcnt;
1725	int error;
1726
1727	mtx_assert(&lk, MA_OWNED);
1728	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1729	/*
1730	 * If we are being called because of a process doing a
1731	 * copy-on-write, then it is not safe to write as we may
1732	 * recurse into the copy-on-write routine.
1733	 */
1734	if (curthread->td_pflags & TDP_COWINPROGRESS)
1735		return (-1);
1736	PHOLD(curproc);	/* Don't let the stack go away. */
1737	ump = VFSTOUFS(mp);
1738	matchcnt = 0;
1739	sintenel.wk_mp = NULL;
1740	sintenel.wk_type = D_SENTINAL;
1741	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sintenel, wk_list);
1742	for (wk = LIST_NEXT(&sintenel, wk_list); wk != NULL;
1743	    wk = LIST_NEXT(&sintenel, wk_list)) {
1744		if (wk->wk_type == D_SENTINAL) {
1745			LIST_REMOVE(&sintenel, wk_list);
1746			LIST_INSERT_AFTER(wk, &sintenel, wk_list);
1747			continue;
1748		}
1749		if (wk->wk_state & INPROGRESS)
1750			panic("process_worklist_item: %p already in progress.",
1751			    wk);
1752		wk->wk_state |= INPROGRESS;
1753		remove_from_worklist(wk);
1754		FREE_LOCK(&lk);
1755		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1756			panic("process_worklist_item: suspended filesystem");
1757		switch (wk->wk_type) {
1758		case D_DIRREM:
1759			/* removal of a directory entry */
1760			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1761			break;
1762
1763		case D_FREEBLKS:
1764			/* releasing blocks and/or fragments from a file */
1765			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1766			    flags);
1767			break;
1768
1769		case D_FREEFRAG:
1770			/* releasing a fragment when replaced as a file grows */
1771			handle_workitem_freefrag(WK_FREEFRAG(wk));
1772			error = 0;
1773			break;
1774
1775		case D_FREEFILE:
1776			/* releasing an inode when its link count drops to 0 */
1777			handle_workitem_freefile(WK_FREEFILE(wk));
1778			error = 0;
1779			break;
1780
1781		default:
1782			panic("%s_process_worklist: Unknown type %s",
1783			    "softdep", TYPENAME(wk->wk_type));
1784			/* NOTREACHED */
1785		}
1786		vn_finished_secondary_write(mp);
1787		ACQUIRE_LOCK(&lk);
1788		if (error == 0) {
1789			if (++matchcnt == target)
1790				break;
1791			continue;
1792		}
1793		/*
1794		 * We have to retry the worklist item later.  Wake up any
1795		 * waiters who may be able to complete it immediately and
1796		 * add the item back to the head so we don't try to execute
1797		 * it again.
1798		 */
1799		wk->wk_state &= ~INPROGRESS;
1800		wake_worklist(wk);
1801		add_to_worklist(wk, WK_HEAD);
1802	}
1803	LIST_REMOVE(&sintenel, wk_list);
1804	/* Sentinal could've become the tail from remove_from_worklist. */
1805	if (ump->softdep_worklist_tail == &sintenel)
1806		ump->softdep_worklist_tail =
1807		    (struct worklist *)sintenel.wk_list.le_prev;
1808	PRELE(curproc);
1809	return (matchcnt);
1810}
1811
1812/*
1813 * Move dependencies from one buffer to another.
1814 */
1815int
1816softdep_move_dependencies(oldbp, newbp)
1817	struct buf *oldbp;
1818	struct buf *newbp;
1819{
1820	struct worklist *wk, *wktail;
1821	int dirty;
1822
1823	dirty = 0;
1824	wktail = NULL;
1825	ACQUIRE_LOCK(&lk);
1826	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1827		LIST_REMOVE(wk, wk_list);
1828		if (wk->wk_type == D_BMSAFEMAP &&
1829		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1830			dirty = 1;
1831		if (wktail == 0)
1832			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1833		else
1834			LIST_INSERT_AFTER(wktail, wk, wk_list);
1835		wktail = wk;
1836	}
1837	FREE_LOCK(&lk);
1838
1839	return (dirty);
1840}
1841
1842/*
1843 * Purge the work list of all items associated with a particular mount point.
1844 */
1845int
1846softdep_flushworklist(oldmnt, countp, td)
1847	struct mount *oldmnt;
1848	int *countp;
1849	struct thread *td;
1850{
1851	struct vnode *devvp;
1852	int count, error = 0;
1853	struct ufsmount *ump;
1854
1855	/*
1856	 * Alternately flush the block device associated with the mount
1857	 * point and process any dependencies that the flushing
1858	 * creates. We continue until no more worklist dependencies
1859	 * are found.
1860	 */
1861	*countp = 0;
1862	ump = VFSTOUFS(oldmnt);
1863	devvp = ump->um_devvp;
1864	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1865		*countp += count;
1866		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1867		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1868		VOP_UNLOCK(devvp, 0);
1869		if (error)
1870			break;
1871	}
1872	return (error);
1873}
1874
1875int
1876softdep_waitidle(struct mount *mp)
1877{
1878	struct ufsmount *ump;
1879	int error;
1880	int i;
1881
1882	ump = VFSTOUFS(mp);
1883	ACQUIRE_LOCK(&lk);
1884	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1885		ump->softdep_req = 1;
1886		if (ump->softdep_on_worklist)
1887			panic("softdep_waitidle: work added after flush.");
1888		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1889	}
1890	ump->softdep_req = 0;
1891	FREE_LOCK(&lk);
1892	error = 0;
1893	if (i == 10) {
1894		error = EBUSY;
1895		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1896		    mp);
1897	}
1898
1899	return (error);
1900}
1901
1902/*
1903 * Flush all vnodes and worklist items associated with a specified mount point.
1904 */
1905int
1906softdep_flushfiles(oldmnt, flags, td)
1907	struct mount *oldmnt;
1908	int flags;
1909	struct thread *td;
1910{
1911#ifdef QUOTA
1912	struct ufsmount *ump;
1913	int i;
1914#endif
1915	int error, early, depcount, loopcnt, retry_flush_count, retry;
1916	int morework;
1917
1918	loopcnt = 10;
1919	retry_flush_count = 3;
1920retry_flush:
1921	error = 0;
1922
1923	/*
1924	 * Alternately flush the vnodes associated with the mount
1925	 * point and process any dependencies that the flushing
1926	 * creates. In theory, this loop can happen at most twice,
1927	 * but we give it a few extra just to be sure.
1928	 */
1929	for (; loopcnt > 0; loopcnt--) {
1930		/*
1931		 * Do another flush in case any vnodes were brought in
1932		 * as part of the cleanup operations.
1933		 */
1934		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1935		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1936		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1937			break;
1938		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1939		    depcount == 0)
1940			break;
1941	}
1942	/*
1943	 * If we are unmounting then it is an error to fail. If we
1944	 * are simply trying to downgrade to read-only, then filesystem
1945	 * activity can keep us busy forever, so we just fail with EBUSY.
1946	 */
1947	if (loopcnt == 0) {
1948		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1949			panic("softdep_flushfiles: looping");
1950		error = EBUSY;
1951	}
1952	if (!error)
1953		error = softdep_waitidle(oldmnt);
1954	if (!error) {
1955		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1956			retry = 0;
1957			MNT_ILOCK(oldmnt);
1958			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1959			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1960			morework = oldmnt->mnt_nvnodelistsize > 0;
1961#ifdef QUOTA
1962			ump = VFSTOUFS(oldmnt);
1963			UFS_LOCK(ump);
1964			for (i = 0; i < MAXQUOTAS; i++) {
1965				if (ump->um_quotas[i] != NULLVP)
1966					morework = 1;
1967			}
1968			UFS_UNLOCK(ump);
1969#endif
1970			if (morework) {
1971				if (--retry_flush_count > 0) {
1972					retry = 1;
1973					loopcnt = 3;
1974				} else
1975					error = EBUSY;
1976			}
1977			MNT_IUNLOCK(oldmnt);
1978			if (retry)
1979				goto retry_flush;
1980		}
1981	}
1982	return (error);
1983}
1984
1985/*
1986 * Structure hashing.
1987 *
1988 * There are three types of structures that can be looked up:
1989 *	1) pagedep structures identified by mount point, inode number,
1990 *	   and logical block.
1991 *	2) inodedep structures identified by mount point and inode number.
1992 *	3) newblk structures identified by mount point and
1993 *	   physical block number.
1994 *
1995 * The "pagedep" and "inodedep" dependency structures are hashed
1996 * separately from the file blocks and inodes to which they correspond.
1997 * This separation helps when the in-memory copy of an inode or
1998 * file block must be replaced. It also obviates the need to access
1999 * an inode or file page when simply updating (or de-allocating)
2000 * dependency structures. Lookup of newblk structures is needed to
2001 * find newly allocated blocks when trying to associate them with
2002 * their allocdirect or allocindir structure.
2003 *
2004 * The lookup routines optionally create and hash a new instance when
2005 * an existing entry is not found.
2006 */
2007#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2008#define NODELAY		0x0002	/* cannot do background work */
2009
2010/*
2011 * Structures and routines associated with pagedep caching.
2012 */
2013LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
2014u_long	pagedep_hash;		/* size of hash table - 1 */
2015#define	PAGEDEP_HASH(mp, inum, lbn) \
2016	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
2017	    pagedep_hash])
2018
2019static int
2020pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
2021	struct pagedep_hashhead *pagedephd;
2022	ino_t ino;
2023	ufs_lbn_t lbn;
2024	struct mount *mp;
2025	int flags;
2026	struct pagedep **pagedeppp;
2027{
2028	struct pagedep *pagedep;
2029
2030	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2031		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn &&
2032		    mp == pagedep->pd_list.wk_mp) {
2033			*pagedeppp = pagedep;
2034			return (1);
2035		}
2036	}
2037	*pagedeppp = NULL;
2038	return (0);
2039}
2040/*
2041 * Look up a pagedep. Return 1 if found, 0 otherwise.
2042 * If not found, allocate if DEPALLOC flag is passed.
2043 * Found or allocated entry is returned in pagedeppp.
2044 * This routine must be called with splbio interrupts blocked.
2045 */
2046static int
2047pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2048	struct mount *mp;
2049	struct buf *bp;
2050	ino_t ino;
2051	ufs_lbn_t lbn;
2052	int flags;
2053	struct pagedep **pagedeppp;
2054{
2055	struct pagedep *pagedep;
2056	struct pagedep_hashhead *pagedephd;
2057	struct worklist *wk;
2058	int ret;
2059	int i;
2060
2061	mtx_assert(&lk, MA_OWNED);
2062	if (bp) {
2063		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2064			if (wk->wk_type == D_PAGEDEP) {
2065				*pagedeppp = WK_PAGEDEP(wk);
2066				return (1);
2067			}
2068		}
2069	}
2070	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
2071	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2072	if (ret) {
2073		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2074			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2075		return (1);
2076	}
2077	if ((flags & DEPALLOC) == 0)
2078		return (0);
2079	FREE_LOCK(&lk);
2080	pagedep = malloc(sizeof(struct pagedep),
2081	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2082	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2083	ACQUIRE_LOCK(&lk);
2084	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
2085	if (*pagedeppp) {
2086		/*
2087		 * This should never happen since we only create pagedeps
2088		 * with the vnode lock held.  Could be an assert.
2089		 */
2090		WORKITEM_FREE(pagedep, D_PAGEDEP);
2091		return (ret);
2092	}
2093	pagedep->pd_ino = ino;
2094	pagedep->pd_lbn = lbn;
2095	LIST_INIT(&pagedep->pd_dirremhd);
2096	LIST_INIT(&pagedep->pd_pendinghd);
2097	for (i = 0; i < DAHASHSZ; i++)
2098		LIST_INIT(&pagedep->pd_diraddhd[i]);
2099	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2100	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2101	*pagedeppp = pagedep;
2102	return (0);
2103}
2104
2105/*
2106 * Structures and routines associated with inodedep caching.
2107 */
2108LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
2109static u_long	inodedep_hash;	/* size of hash table - 1 */
2110#define	INODEDEP_HASH(fs, inum) \
2111      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
2112
2113static int
2114inodedep_find(inodedephd, fs, inum, inodedeppp)
2115	struct inodedep_hashhead *inodedephd;
2116	struct fs *fs;
2117	ino_t inum;
2118	struct inodedep **inodedeppp;
2119{
2120	struct inodedep *inodedep;
2121
2122	LIST_FOREACH(inodedep, inodedephd, id_hash)
2123		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
2124			break;
2125	if (inodedep) {
2126		*inodedeppp = inodedep;
2127		return (1);
2128	}
2129	*inodedeppp = NULL;
2130
2131	return (0);
2132}
2133/*
2134 * Look up an inodedep. Return 1 if found, 0 if not found.
2135 * If not found, allocate if DEPALLOC flag is passed.
2136 * Found or allocated entry is returned in inodedeppp.
2137 * This routine must be called with splbio interrupts blocked.
2138 */
2139static int
2140inodedep_lookup(mp, inum, flags, inodedeppp)
2141	struct mount *mp;
2142	ino_t inum;
2143	int flags;
2144	struct inodedep **inodedeppp;
2145{
2146	struct inodedep *inodedep;
2147	struct inodedep_hashhead *inodedephd;
2148	struct fs *fs;
2149
2150	mtx_assert(&lk, MA_OWNED);
2151	fs = VFSTOUFS(mp)->um_fs;
2152	inodedephd = INODEDEP_HASH(fs, inum);
2153
2154	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
2155		return (1);
2156	if ((flags & DEPALLOC) == 0)
2157		return (0);
2158	/*
2159	 * If we are over our limit, try to improve the situation.
2160	 */
2161	if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
2162		request_cleanup(mp, FLUSH_INODES);
2163	FREE_LOCK(&lk);
2164	inodedep = malloc(sizeof(struct inodedep),
2165		M_INODEDEP, M_SOFTDEP_FLAGS);
2166	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2167	ACQUIRE_LOCK(&lk);
2168	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
2169		WORKITEM_FREE(inodedep, D_INODEDEP);
2170		return (1);
2171	}
2172	inodedep->id_fs = fs;
2173	inodedep->id_ino = inum;
2174	inodedep->id_state = ALLCOMPLETE;
2175	inodedep->id_nlinkdelta = 0;
2176	inodedep->id_savedino1 = NULL;
2177	inodedep->id_savedsize = -1;
2178	inodedep->id_savedextsize = -1;
2179	inodedep->id_savednlink = -1;
2180	inodedep->id_bmsafemap = NULL;
2181	inodedep->id_mkdiradd = NULL;
2182	LIST_INIT(&inodedep->id_dirremhd);
2183	LIST_INIT(&inodedep->id_pendinghd);
2184	LIST_INIT(&inodedep->id_inowait);
2185	LIST_INIT(&inodedep->id_bufwait);
2186	TAILQ_INIT(&inodedep->id_inoreflst);
2187	TAILQ_INIT(&inodedep->id_inoupdt);
2188	TAILQ_INIT(&inodedep->id_newinoupdt);
2189	TAILQ_INIT(&inodedep->id_extupdt);
2190	TAILQ_INIT(&inodedep->id_newextupdt);
2191	TAILQ_INIT(&inodedep->id_freeblklst);
2192	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2193	*inodedeppp = inodedep;
2194	return (0);
2195}
2196
2197/*
2198 * Structures and routines associated with newblk caching.
2199 */
2200LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
2201u_long	newblk_hash;		/* size of hash table - 1 */
2202#define	NEWBLK_HASH(fs, inum) \
2203	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
2204
2205static int
2206newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
2207	struct newblk_hashhead *newblkhd;
2208	struct mount *mp;
2209	ufs2_daddr_t newblkno;
2210	int flags;
2211	struct newblk **newblkpp;
2212{
2213	struct newblk *newblk;
2214
2215	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2216		if (newblkno != newblk->nb_newblkno)
2217			continue;
2218		if (mp != newblk->nb_list.wk_mp)
2219			continue;
2220		/*
2221		 * If we're creating a new dependency don't match those that
2222		 * have already been converted to allocdirects.  This is for
2223		 * a frag extend.
2224		 */
2225		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2226			continue;
2227		break;
2228	}
2229	if (newblk) {
2230		*newblkpp = newblk;
2231		return (1);
2232	}
2233	*newblkpp = NULL;
2234	return (0);
2235}
2236
2237/*
2238 * Look up a newblk. Return 1 if found, 0 if not found.
2239 * If not found, allocate if DEPALLOC flag is passed.
2240 * Found or allocated entry is returned in newblkpp.
2241 */
2242static int
2243newblk_lookup(mp, newblkno, flags, newblkpp)
2244	struct mount *mp;
2245	ufs2_daddr_t newblkno;
2246	int flags;
2247	struct newblk **newblkpp;
2248{
2249	struct newblk *newblk;
2250	struct newblk_hashhead *newblkhd;
2251
2252	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
2253	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
2254		return (1);
2255	if ((flags & DEPALLOC) == 0)
2256		return (0);
2257	FREE_LOCK(&lk);
2258	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2259	    M_SOFTDEP_FLAGS | M_ZERO);
2260	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2261	ACQUIRE_LOCK(&lk);
2262	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
2263		WORKITEM_FREE(newblk, D_NEWBLK);
2264		return (1);
2265	}
2266	newblk->nb_freefrag = NULL;
2267	LIST_INIT(&newblk->nb_indirdeps);
2268	LIST_INIT(&newblk->nb_newdirblk);
2269	LIST_INIT(&newblk->nb_jwork);
2270	newblk->nb_state = ATTACHED;
2271	newblk->nb_newblkno = newblkno;
2272	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2273	*newblkpp = newblk;
2274	return (0);
2275}
2276
2277/*
2278 * Structures and routines associated with freed indirect block caching.
2279 */
2280struct freeworklst *indir_hashtbl;
2281u_long	indir_hash;		/* size of hash table - 1 */
2282#define	INDIR_HASH(mp, blkno) \
2283	(&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])
2284
2285/*
2286 * Lookup an indirect block in the indir hash table.  The freework is
2287 * removed and potentially freed.  The caller must do a blocking journal
2288 * write before writing to the blkno.
2289 */
2290static int
2291indirblk_lookup(mp, blkno)
2292	struct mount *mp;
2293	ufs2_daddr_t blkno;
2294{
2295	struct freework *freework;
2296	struct freeworklst *wkhd;
2297
2298	wkhd = INDIR_HASH(mp, blkno);
2299	TAILQ_FOREACH(freework, wkhd, fw_next) {
2300		if (freework->fw_blkno != blkno)
2301			continue;
2302		if (freework->fw_list.wk_mp != mp)
2303			continue;
2304		indirblk_remove(freework);
2305		return (1);
2306	}
2307	return (0);
2308}
2309
2310/*
2311 * Insert an indirect block represented by freework into the indirblk
2312 * hash table so that it may prevent the block from being re-used prior
2313 * to the journal being written.
2314 */
2315static void
2316indirblk_insert(freework)
2317	struct freework *freework;
2318{
2319	struct jblocks *jblocks;
2320	struct jseg *jseg;
2321
2322	jblocks = VFSTOUFS(freework->fw_list.wk_mp)->softdep_jblocks;
2323	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2324	if (jseg == NULL)
2325		return;
2326
2327	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2328	TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp,
2329	    freework->fw_blkno), freework, fw_next);
2330	freework->fw_state &= ~DEPCOMPLETE;
2331}
2332
2333static void
2334indirblk_remove(freework)
2335	struct freework *freework;
2336{
2337
2338	LIST_REMOVE(freework, fw_segs);
2339	TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp,
2340	    freework->fw_blkno), freework, fw_next);
2341	freework->fw_state |= DEPCOMPLETE;
2342	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2343		WORKITEM_FREE(freework, D_FREEWORK);
2344}
2345
2346/*
2347 * Executed during filesystem system initialization before
2348 * mounting any filesystems.
2349 */
2350void
2351softdep_initialize()
2352{
2353	int i;
2354
2355	LIST_INIT(&mkdirlisthd);
2356	max_softdeps = desiredvnodes * 4;
2357	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
2358	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
2359	newblk_hashtbl = hashinit(desiredvnodes / 5,  M_NEWBLK, &newblk_hash);
2360	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
2361	i = 1 << (ffs(desiredvnodes / 10) - 1);
2362	indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK,
2363	    M_WAITOK);
2364	indir_hash = i - 1;
2365	for (i = 0; i <= indir_hash; i++)
2366		TAILQ_INIT(&indir_hashtbl[i]);
2367
2368	/* initialise bioops hack */
2369	bioops.io_start = softdep_disk_io_initiation;
2370	bioops.io_complete = softdep_disk_write_complete;
2371	bioops.io_deallocate = softdep_deallocate_dependencies;
2372	bioops.io_countdeps = softdep_count_dependencies;
2373
2374	/* Initialize the callout with an mtx. */
2375	callout_init_mtx(&softdep_callout, &lk, 0);
2376}
2377
2378/*
2379 * Executed after all filesystems have been unmounted during
2380 * filesystem module unload.
2381 */
2382void
2383softdep_uninitialize()
2384{
2385
2386	callout_drain(&softdep_callout);
2387	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2388	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2389	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2390	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2391	free(indir_hashtbl, M_FREEWORK);
2392}
2393
2394/*
2395 * Called at mount time to notify the dependency code that a
2396 * filesystem wishes to use it.
2397 */
2398int
2399softdep_mount(devvp, mp, fs, cred)
2400	struct vnode *devvp;
2401	struct mount *mp;
2402	struct fs *fs;
2403	struct ucred *cred;
2404{
2405	struct csum_total cstotal;
2406	struct ufsmount *ump;
2407	struct cg *cgp;
2408	struct buf *bp;
2409	int error, cyl;
2410
2411	MNT_ILOCK(mp);
2412	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2413	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2414		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2415			MNTK_SOFTDEP | MNTK_NOASYNC;
2416	}
2417	MNT_IUNLOCK(mp);
2418	ump = VFSTOUFS(mp);
2419	LIST_INIT(&ump->softdep_workitem_pending);
2420	LIST_INIT(&ump->softdep_journal_pending);
2421	TAILQ_INIT(&ump->softdep_unlinked);
2422	LIST_INIT(&ump->softdep_dirtycg);
2423	ump->softdep_worklist_tail = NULL;
2424	ump->softdep_on_worklist = 0;
2425	ump->softdep_deps = 0;
2426	if ((fs->fs_flags & FS_SUJ) &&
2427	    (error = journal_mount(mp, fs, cred)) != 0) {
2428		printf("Failed to start journal: %d\n", error);
2429		return (error);
2430	}
2431	/*
2432	 * When doing soft updates, the counters in the
2433	 * superblock may have gotten out of sync. Recomputation
2434	 * can take a long time and can be deferred for background
2435	 * fsck.  However, the old behavior of scanning the cylinder
2436	 * groups and recalculating them at mount time is available
2437	 * by setting vfs.ffs.compute_summary_at_mount to one.
2438	 */
2439	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2440		return (0);
2441	bzero(&cstotal, sizeof cstotal);
2442	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2443		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2444		    fs->fs_cgsize, cred, &bp)) != 0) {
2445			brelse(bp);
2446			return (error);
2447		}
2448		cgp = (struct cg *)bp->b_data;
2449		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2450		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2451		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2452		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2453		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2454		brelse(bp);
2455	}
2456#ifdef DEBUG
2457	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2458		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2459#endif
2460	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2461	return (0);
2462}
2463
2464void
2465softdep_unmount(mp)
2466	struct mount *mp;
2467{
2468
2469	MNT_ILOCK(mp);
2470	mp->mnt_flag &= ~MNT_SOFTDEP;
2471	if (MOUNTEDSUJ(mp) == 0) {
2472		MNT_IUNLOCK(mp);
2473		return;
2474	}
2475	mp->mnt_flag &= ~MNT_SUJ;
2476	MNT_IUNLOCK(mp);
2477	journal_unmount(mp);
2478}
2479
2480static struct jblocks *
2481jblocks_create(void)
2482{
2483	struct jblocks *jblocks;
2484
2485	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2486	TAILQ_INIT(&jblocks->jb_segs);
2487	jblocks->jb_avail = 10;
2488	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2489	    M_JBLOCKS, M_WAITOK | M_ZERO);
2490
2491	return (jblocks);
2492}
2493
2494static ufs2_daddr_t
2495jblocks_alloc(jblocks, bytes, actual)
2496	struct jblocks *jblocks;
2497	int bytes;
2498	int *actual;
2499{
2500	ufs2_daddr_t daddr;
2501	struct jextent *jext;
2502	int freecnt;
2503	int blocks;
2504
2505	blocks = bytes / DEV_BSIZE;
2506	jext = &jblocks->jb_extent[jblocks->jb_head];
2507	freecnt = jext->je_blocks - jblocks->jb_off;
2508	if (freecnt == 0) {
2509		jblocks->jb_off = 0;
2510		if (++jblocks->jb_head > jblocks->jb_used)
2511			jblocks->jb_head = 0;
2512		jext = &jblocks->jb_extent[jblocks->jb_head];
2513		freecnt = jext->je_blocks;
2514	}
2515	if (freecnt > blocks)
2516		freecnt = blocks;
2517	*actual = freecnt * DEV_BSIZE;
2518	daddr = jext->je_daddr + jblocks->jb_off;
2519	jblocks->jb_off += freecnt;
2520	jblocks->jb_free -= freecnt;
2521
2522	return (daddr);
2523}
2524
2525static void
2526jblocks_free(jblocks, mp, bytes)
2527	struct jblocks *jblocks;
2528	struct mount *mp;
2529	int bytes;
2530{
2531
2532	jblocks->jb_free += bytes / DEV_BSIZE;
2533	if (jblocks->jb_suspended)
2534		worklist_speedup();
2535	wakeup(jblocks);
2536}
2537
2538static void
2539jblocks_destroy(jblocks)
2540	struct jblocks *jblocks;
2541{
2542
2543	if (jblocks->jb_extent)
2544		free(jblocks->jb_extent, M_JBLOCKS);
2545	free(jblocks, M_JBLOCKS);
2546}
2547
2548static void
2549jblocks_add(jblocks, daddr, blocks)
2550	struct jblocks *jblocks;
2551	ufs2_daddr_t daddr;
2552	int blocks;
2553{
2554	struct jextent *jext;
2555
2556	jblocks->jb_blocks += blocks;
2557	jblocks->jb_free += blocks;
2558	jext = &jblocks->jb_extent[jblocks->jb_used];
2559	/* Adding the first block. */
2560	if (jext->je_daddr == 0) {
2561		jext->je_daddr = daddr;
2562		jext->je_blocks = blocks;
2563		return;
2564	}
2565	/* Extending the last extent. */
2566	if (jext->je_daddr + jext->je_blocks == daddr) {
2567		jext->je_blocks += blocks;
2568		return;
2569	}
2570	/* Adding a new extent. */
2571	if (++jblocks->jb_used == jblocks->jb_avail) {
2572		jblocks->jb_avail *= 2;
2573		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2574		    M_JBLOCKS, M_WAITOK | M_ZERO);
2575		memcpy(jext, jblocks->jb_extent,
2576		    sizeof(struct jextent) * jblocks->jb_used);
2577		free(jblocks->jb_extent, M_JBLOCKS);
2578		jblocks->jb_extent = jext;
2579	}
2580	jext = &jblocks->jb_extent[jblocks->jb_used];
2581	jext->je_daddr = daddr;
2582	jext->je_blocks = blocks;
2583	return;
2584}
2585
2586int
2587softdep_journal_lookup(mp, vpp)
2588	struct mount *mp;
2589	struct vnode **vpp;
2590{
2591	struct componentname cnp;
2592	struct vnode *dvp;
2593	ino_t sujournal;
2594	int error;
2595
2596	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2597	if (error)
2598		return (error);
2599	bzero(&cnp, sizeof(cnp));
2600	cnp.cn_nameiop = LOOKUP;
2601	cnp.cn_flags = ISLASTCN;
2602	cnp.cn_thread = curthread;
2603	cnp.cn_cred = curthread->td_ucred;
2604	cnp.cn_pnbuf = SUJ_FILE;
2605	cnp.cn_nameptr = SUJ_FILE;
2606	cnp.cn_namelen = strlen(SUJ_FILE);
2607	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2608	vput(dvp);
2609	if (error != 0)
2610		return (error);
2611	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2612	return (error);
2613}
2614
2615/*
2616 * Open and verify the journal file.
2617 */
2618static int
2619journal_mount(mp, fs, cred)
2620	struct mount *mp;
2621	struct fs *fs;
2622	struct ucred *cred;
2623{
2624	struct jblocks *jblocks;
2625	struct vnode *vp;
2626	struct inode *ip;
2627	ufs2_daddr_t blkno;
2628	int bcount;
2629	int error;
2630	int i;
2631
2632	error = softdep_journal_lookup(mp, &vp);
2633	if (error != 0) {
2634		printf("Failed to find journal.  Use tunefs to create one\n");
2635		return (error);
2636	}
2637	ip = VTOI(vp);
2638	if (ip->i_size < SUJ_MIN) {
2639		error = ENOSPC;
2640		goto out;
2641	}
2642	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2643	jblocks = jblocks_create();
2644	for (i = 0; i < bcount; i++) {
2645		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2646		if (error)
2647			break;
2648		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2649	}
2650	if (error) {
2651		jblocks_destroy(jblocks);
2652		goto out;
2653	}
2654	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2655	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2656	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2657out:
2658	if (error == 0) {
2659		MNT_ILOCK(mp);
2660		mp->mnt_flag |= MNT_SUJ;
2661		mp->mnt_flag &= ~MNT_SOFTDEP;
2662		MNT_IUNLOCK(mp);
2663		/*
2664		 * Only validate the journal contents if the
2665		 * filesystem is clean, otherwise we write the logs
2666		 * but they'll never be used.  If the filesystem was
2667		 * still dirty when we mounted it the journal is
2668		 * invalid and a new journal can only be valid if it
2669		 * starts from a clean mount.
2670		 */
2671		if (fs->fs_clean) {
2672			DIP_SET(ip, i_modrev, fs->fs_mtime);
2673			ip->i_flags |= IN_MODIFIED;
2674			ffs_update(vp, 1);
2675		}
2676	}
2677	vput(vp);
2678	return (error);
2679}
2680
2681static void
2682journal_unmount(mp)
2683	struct mount *mp;
2684{
2685	struct ufsmount *ump;
2686
2687	ump = VFSTOUFS(mp);
2688	if (ump->softdep_jblocks)
2689		jblocks_destroy(ump->softdep_jblocks);
2690	ump->softdep_jblocks = NULL;
2691}
2692
2693/*
2694 * Called when a journal record is ready to be written.  Space is allocated
2695 * and the journal entry is created when the journal is flushed to stable
2696 * store.
2697 */
2698static void
2699add_to_journal(wk)
2700	struct worklist *wk;
2701{
2702	struct ufsmount *ump;
2703
2704	mtx_assert(&lk, MA_OWNED);
2705	ump = VFSTOUFS(wk->wk_mp);
2706	if (wk->wk_state & ONWORKLIST)
2707		panic("add_to_journal: %s(0x%X) already on list",
2708		    TYPENAME(wk->wk_type), wk->wk_state);
2709	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2710	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2711		ump->softdep_jblocks->jb_age = ticks;
2712		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2713	} else
2714		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2715	ump->softdep_journal_tail = wk;
2716	ump->softdep_on_journal += 1;
2717}
2718
2719/*
2720 * Remove an arbitrary item for the journal worklist maintain the tail
2721 * pointer.  This happens when a new operation obviates the need to
2722 * journal an old operation.
2723 */
2724static void
2725remove_from_journal(wk)
2726	struct worklist *wk;
2727{
2728	struct ufsmount *ump;
2729
2730	mtx_assert(&lk, MA_OWNED);
2731	ump = VFSTOUFS(wk->wk_mp);
2732#ifdef SUJ_DEBUG
2733	{
2734		struct worklist *wkn;
2735
2736		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2737			if (wkn == wk)
2738				break;
2739		if (wkn == NULL)
2740			panic("remove_from_journal: %p is not in journal", wk);
2741	}
2742#endif
2743	/*
2744	 * We emulate a TAILQ to save space in most structures which do not
2745	 * require TAILQ semantics.  Here we must update the tail position
2746	 * when removing the tail which is not the final entry. This works
2747	 * only if the worklist linkage are at the beginning of the structure.
2748	 */
2749	if (ump->softdep_journal_tail == wk)
2750		ump->softdep_journal_tail =
2751		    (struct worklist *)wk->wk_list.le_prev;
2752
2753	WORKLIST_REMOVE(wk);
2754	ump->softdep_on_journal -= 1;
2755}
2756
2757/*
2758 * Check for journal space as well as dependency limits so the prelink
2759 * code can throttle both journaled and non-journaled filesystems.
2760 * Threshold is 0 for low and 1 for min.
2761 */
2762static int
2763journal_space(ump, thresh)
2764	struct ufsmount *ump;
2765	int thresh;
2766{
2767	struct jblocks *jblocks;
2768	int avail;
2769
2770	jblocks = ump->softdep_jblocks;
2771	if (jblocks == NULL)
2772		return (1);
2773	/*
2774	 * We use a tighter restriction here to prevent request_cleanup()
2775	 * running in threads from running into locks we currently hold.
2776	 */
2777	if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9)
2778		return (0);
2779	if (thresh)
2780		thresh = jblocks->jb_min;
2781	else
2782		thresh = jblocks->jb_low;
2783	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2784	avail = jblocks->jb_free - avail;
2785
2786	return (avail > thresh);
2787}
2788
2789static void
2790journal_suspend(ump)
2791	struct ufsmount *ump;
2792{
2793	struct jblocks *jblocks;
2794	struct mount *mp;
2795
2796	mp = UFSTOVFS(ump);
2797	jblocks = ump->softdep_jblocks;
2798	MNT_ILOCK(mp);
2799	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2800		stat_journal_min++;
2801		mp->mnt_kern_flag |= MNTK_SUSPEND;
2802		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2803	}
2804	jblocks->jb_suspended = 1;
2805	MNT_IUNLOCK(mp);
2806}
2807
2808static int
2809journal_unsuspend(struct ufsmount *ump)
2810{
2811	struct jblocks *jblocks;
2812	struct mount *mp;
2813
2814	mp = UFSTOVFS(ump);
2815	jblocks = ump->softdep_jblocks;
2816
2817	if (jblocks != NULL && jblocks->jb_suspended &&
2818	    journal_space(ump, jblocks->jb_min)) {
2819		jblocks->jb_suspended = 0;
2820		FREE_LOCK(&lk);
2821		mp->mnt_susp_owner = curthread;
2822		vfs_write_resume(mp, 0);
2823		ACQUIRE_LOCK(&lk);
2824		return (1);
2825	}
2826	return (0);
2827}
2828
2829/*
2830 * Called before any allocation function to be certain that there is
2831 * sufficient space in the journal prior to creating any new records.
2832 * Since in the case of block allocation we may have multiple locked
2833 * buffers at the time of the actual allocation we can not block
2834 * when the journal records are created.  Doing so would create a deadlock
2835 * if any of these buffers needed to be flushed to reclaim space.  Instead
2836 * we require a sufficiently large amount of available space such that
2837 * each thread in the system could have passed this allocation check and
2838 * still have sufficient free space.  With 20% of a minimum journal size
2839 * of 1MB we have 6553 records available.
2840 */
2841int
2842softdep_prealloc(vp, waitok)
2843	struct vnode *vp;
2844	int waitok;
2845{
2846	struct ufsmount *ump;
2847
2848	/*
2849	 * Nothing to do if we are not running journaled soft updates.
2850	 * If we currently hold the snapshot lock, we must avoid handling
2851	 * other resources that could cause deadlock.
2852	 */
2853	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2854		return (0);
2855	ump = VFSTOUFS(vp->v_mount);
2856	ACQUIRE_LOCK(&lk);
2857	if (journal_space(ump, 0)) {
2858		FREE_LOCK(&lk);
2859		return (0);
2860	}
2861	stat_journal_low++;
2862	FREE_LOCK(&lk);
2863	if (waitok == MNT_NOWAIT)
2864		return (ENOSPC);
2865	/*
2866	 * Attempt to sync this vnode once to flush any journal
2867	 * work attached to it.
2868	 */
2869	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2870		ffs_syncvnode(vp, waitok, 0);
2871	ACQUIRE_LOCK(&lk);
2872	process_removes(vp);
2873	process_truncates(vp);
2874	if (journal_space(ump, 0) == 0) {
2875		softdep_speedup();
2876		if (journal_space(ump, 1) == 0)
2877			journal_suspend(ump);
2878	}
2879	FREE_LOCK(&lk);
2880
2881	return (0);
2882}
2883
2884/*
2885 * Before adjusting a link count on a vnode verify that we have sufficient
2886 * journal space.  If not, process operations that depend on the currently
2887 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2888 * and softdep flush threads can not acquire these locks to reclaim space.
2889 */
2890static void
2891softdep_prelink(dvp, vp)
2892	struct vnode *dvp;
2893	struct vnode *vp;
2894{
2895	struct ufsmount *ump;
2896
2897	ump = VFSTOUFS(dvp->v_mount);
2898	mtx_assert(&lk, MA_OWNED);
2899	/*
2900	 * Nothing to do if we have sufficient journal space.
2901	 * If we currently hold the snapshot lock, we must avoid
2902	 * handling other resources that could cause deadlock.
2903	 */
2904	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
2905		return;
2906	stat_journal_low++;
2907	FREE_LOCK(&lk);
2908	if (vp)
2909		ffs_syncvnode(vp, MNT_NOWAIT, 0);
2910	ffs_syncvnode(dvp, MNT_WAIT, 0);
2911	ACQUIRE_LOCK(&lk);
2912	/* Process vp before dvp as it may create .. removes. */
2913	if (vp) {
2914		process_removes(vp);
2915		process_truncates(vp);
2916	}
2917	process_removes(dvp);
2918	process_truncates(dvp);
2919	softdep_speedup();
2920	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
2921	if (journal_space(ump, 0) == 0) {
2922		softdep_speedup();
2923		if (journal_space(ump, 1) == 0)
2924			journal_suspend(ump);
2925	}
2926}
2927
2928static void
2929jseg_write(ump, jseg, data)
2930	struct ufsmount *ump;
2931	struct jseg *jseg;
2932	uint8_t *data;
2933{
2934	struct jsegrec *rec;
2935
2936	rec = (struct jsegrec *)data;
2937	rec->jsr_seq = jseg->js_seq;
2938	rec->jsr_oldest = jseg->js_oldseq;
2939	rec->jsr_cnt = jseg->js_cnt;
2940	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2941	rec->jsr_crc = 0;
2942	rec->jsr_time = ump->um_fs->fs_mtime;
2943}
2944
2945static inline void
2946inoref_write(inoref, jseg, rec)
2947	struct inoref *inoref;
2948	struct jseg *jseg;
2949	struct jrefrec *rec;
2950{
2951
2952	inoref->if_jsegdep->jd_seg = jseg;
2953	rec->jr_ino = inoref->if_ino;
2954	rec->jr_parent = inoref->if_parent;
2955	rec->jr_nlink = inoref->if_nlink;
2956	rec->jr_mode = inoref->if_mode;
2957	rec->jr_diroff = inoref->if_diroff;
2958}
2959
2960static void
2961jaddref_write(jaddref, jseg, data)
2962	struct jaddref *jaddref;
2963	struct jseg *jseg;
2964	uint8_t *data;
2965{
2966	struct jrefrec *rec;
2967
2968	rec = (struct jrefrec *)data;
2969	rec->jr_op = JOP_ADDREF;
2970	inoref_write(&jaddref->ja_ref, jseg, rec);
2971}
2972
2973static void
2974jremref_write(jremref, jseg, data)
2975	struct jremref *jremref;
2976	struct jseg *jseg;
2977	uint8_t *data;
2978{
2979	struct jrefrec *rec;
2980
2981	rec = (struct jrefrec *)data;
2982	rec->jr_op = JOP_REMREF;
2983	inoref_write(&jremref->jr_ref, jseg, rec);
2984}
2985
2986static void
2987jmvref_write(jmvref, jseg, data)
2988	struct jmvref *jmvref;
2989	struct jseg *jseg;
2990	uint8_t *data;
2991{
2992	struct jmvrec *rec;
2993
2994	rec = (struct jmvrec *)data;
2995	rec->jm_op = JOP_MVREF;
2996	rec->jm_ino = jmvref->jm_ino;
2997	rec->jm_parent = jmvref->jm_parent;
2998	rec->jm_oldoff = jmvref->jm_oldoff;
2999	rec->jm_newoff = jmvref->jm_newoff;
3000}
3001
3002static void
3003jnewblk_write(jnewblk, jseg, data)
3004	struct jnewblk *jnewblk;
3005	struct jseg *jseg;
3006	uint8_t *data;
3007{
3008	struct jblkrec *rec;
3009
3010	jnewblk->jn_jsegdep->jd_seg = jseg;
3011	rec = (struct jblkrec *)data;
3012	rec->jb_op = JOP_NEWBLK;
3013	rec->jb_ino = jnewblk->jn_ino;
3014	rec->jb_blkno = jnewblk->jn_blkno;
3015	rec->jb_lbn = jnewblk->jn_lbn;
3016	rec->jb_frags = jnewblk->jn_frags;
3017	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3018}
3019
3020static void
3021jfreeblk_write(jfreeblk, jseg, data)
3022	struct jfreeblk *jfreeblk;
3023	struct jseg *jseg;
3024	uint8_t *data;
3025{
3026	struct jblkrec *rec;
3027
3028	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3029	rec = (struct jblkrec *)data;
3030	rec->jb_op = JOP_FREEBLK;
3031	rec->jb_ino = jfreeblk->jf_ino;
3032	rec->jb_blkno = jfreeblk->jf_blkno;
3033	rec->jb_lbn = jfreeblk->jf_lbn;
3034	rec->jb_frags = jfreeblk->jf_frags;
3035	rec->jb_oldfrags = 0;
3036}
3037
3038static void
3039jfreefrag_write(jfreefrag, jseg, data)
3040	struct jfreefrag *jfreefrag;
3041	struct jseg *jseg;
3042	uint8_t *data;
3043{
3044	struct jblkrec *rec;
3045
3046	jfreefrag->fr_jsegdep->jd_seg = jseg;
3047	rec = (struct jblkrec *)data;
3048	rec->jb_op = JOP_FREEBLK;
3049	rec->jb_ino = jfreefrag->fr_ino;
3050	rec->jb_blkno = jfreefrag->fr_blkno;
3051	rec->jb_lbn = jfreefrag->fr_lbn;
3052	rec->jb_frags = jfreefrag->fr_frags;
3053	rec->jb_oldfrags = 0;
3054}
3055
3056static void
3057jtrunc_write(jtrunc, jseg, data)
3058	struct jtrunc *jtrunc;
3059	struct jseg *jseg;
3060	uint8_t *data;
3061{
3062	struct jtrncrec *rec;
3063
3064	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3065	rec = (struct jtrncrec *)data;
3066	rec->jt_op = JOP_TRUNC;
3067	rec->jt_ino = jtrunc->jt_ino;
3068	rec->jt_size = jtrunc->jt_size;
3069	rec->jt_extsize = jtrunc->jt_extsize;
3070}
3071
3072static void
3073jfsync_write(jfsync, jseg, data)
3074	struct jfsync *jfsync;
3075	struct jseg *jseg;
3076	uint8_t *data;
3077{
3078	struct jtrncrec *rec;
3079
3080	rec = (struct jtrncrec *)data;
3081	rec->jt_op = JOP_SYNC;
3082	rec->jt_ino = jfsync->jfs_ino;
3083	rec->jt_size = jfsync->jfs_size;
3084	rec->jt_extsize = jfsync->jfs_extsize;
3085}
3086
3087static void
3088softdep_flushjournal(mp)
3089	struct mount *mp;
3090{
3091	struct jblocks *jblocks;
3092	struct ufsmount *ump;
3093
3094	if (MOUNTEDSUJ(mp) == 0)
3095		return;
3096	ump = VFSTOUFS(mp);
3097	jblocks = ump->softdep_jblocks;
3098	ACQUIRE_LOCK(&lk);
3099	while (ump->softdep_on_journal) {
3100		jblocks->jb_needseg = 1;
3101		softdep_process_journal(mp, NULL, MNT_WAIT);
3102	}
3103	FREE_LOCK(&lk);
3104}
3105
3106static void softdep_synchronize_completed(struct bio *);
3107static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3108
3109static void
3110softdep_synchronize_completed(bp)
3111        struct bio *bp;
3112{
3113	struct jseg *oldest;
3114	struct jseg *jseg;
3115
3116	/*
3117	 * caller1 marks the last segment written before we issued the
3118	 * synchronize cache.
3119	 */
3120	jseg = bp->bio_caller1;
3121	oldest = NULL;
3122	ACQUIRE_LOCK(&lk);
3123	/*
3124	 * Mark all the journal entries waiting on the synchronize cache
3125	 * as completed so they may continue on.
3126	 */
3127	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3128		jseg->js_state |= COMPLETE;
3129		oldest = jseg;
3130		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3131	}
3132	/*
3133	 * Restart deferred journal entry processing from the oldest
3134	 * completed jseg.
3135	 */
3136	if (oldest)
3137		complete_jsegs(oldest);
3138
3139	FREE_LOCK(&lk);
3140	g_destroy_bio(bp);
3141}
3142
3143/*
3144 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3145 * barriers.  The journal must be written prior to any blocks that depend
3146 * on it and the journal can not be released until the blocks have be
3147 * written.  This code handles both barriers simultaneously.
3148 */
3149static void
3150softdep_synchronize(bp, ump, caller1)
3151	struct bio *bp;
3152	struct ufsmount *ump;
3153	void *caller1;
3154{
3155
3156	bp->bio_cmd = BIO_FLUSH;
3157	bp->bio_flags |= BIO_ORDERED;
3158	bp->bio_data = NULL;
3159	bp->bio_offset = ump->um_cp->provider->mediasize;
3160	bp->bio_length = 0;
3161	bp->bio_done = softdep_synchronize_completed;
3162	bp->bio_caller1 = caller1;
3163	g_io_request(bp,
3164	    (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3165}
3166
3167/*
3168 * Flush some journal records to disk.
3169 */
3170static void
3171softdep_process_journal(mp, needwk, flags)
3172	struct mount *mp;
3173	struct worklist *needwk;
3174	int flags;
3175{
3176	struct jblocks *jblocks;
3177	struct ufsmount *ump;
3178	struct worklist *wk;
3179	struct jseg *jseg;
3180	struct buf *bp;
3181	struct bio *bio;
3182	uint8_t *data;
3183	struct fs *fs;
3184	int shouldflush;
3185	int segwritten;
3186	int jrecmin;	/* Minimum records per block. */
3187	int jrecmax;	/* Maximum records per block. */
3188	int size;
3189	int cnt;
3190	int off;
3191	int devbsize;
3192
3193	if (MOUNTEDSUJ(mp) == 0)
3194		return;
3195	shouldflush = softdep_flushcache;
3196	bio = NULL;
3197	jseg = NULL;
3198	ump = VFSTOUFS(mp);
3199	fs = ump->um_fs;
3200	jblocks = ump->softdep_jblocks;
3201	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3202	/*
3203	 * We write anywhere between a disk block and fs block.  The upper
3204	 * bound is picked to prevent buffer cache fragmentation and limit
3205	 * processing time per I/O.
3206	 */
3207	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3208	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3209	segwritten = 0;
3210	for (;;) {
3211		cnt = ump->softdep_on_journal;
3212		/*
3213		 * Criteria for writing a segment:
3214		 * 1) We have a full block.
3215		 * 2) We're called from jwait() and haven't found the
3216		 *    journal item yet.
3217		 * 3) Always write if needseg is set.
3218		 * 4) If we are called from process_worklist and have
3219		 *    not yet written anything we write a partial block
3220		 *    to enforce a 1 second maximum latency on journal
3221		 *    entries.
3222		 */
3223		if (cnt < (jrecmax - 1) && needwk == NULL &&
3224		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3225			break;
3226		cnt++;
3227		/*
3228		 * Verify some free journal space.  softdep_prealloc() should
3229	 	 * guarantee that we don't run out so this is indicative of
3230		 * a problem with the flow control.  Try to recover
3231		 * gracefully in any event.
3232		 */
3233		while (jblocks->jb_free == 0) {
3234			if (flags != MNT_WAIT)
3235				break;
3236			printf("softdep: Out of journal space!\n");
3237			softdep_speedup();
3238			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
3239		}
3240		FREE_LOCK(&lk);
3241		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3242		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3243		LIST_INIT(&jseg->js_entries);
3244		LIST_INIT(&jseg->js_indirs);
3245		jseg->js_state = ATTACHED;
3246		if (shouldflush == 0)
3247			jseg->js_state |= COMPLETE;
3248		else if (bio == NULL)
3249			bio = g_alloc_bio();
3250		jseg->js_jblocks = jblocks;
3251		bp = geteblk(fs->fs_bsize, 0);
3252		ACQUIRE_LOCK(&lk);
3253		/*
3254		 * If there was a race while we were allocating the block
3255		 * and jseg the entry we care about was likely written.
3256		 * We bail out in both the WAIT and NOWAIT case and assume
3257		 * the caller will loop if the entry it cares about is
3258		 * not written.
3259		 */
3260		cnt = ump->softdep_on_journal;
3261		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3262			bp->b_flags |= B_INVAL | B_NOCACHE;
3263			WORKITEM_FREE(jseg, D_JSEG);
3264			FREE_LOCK(&lk);
3265			brelse(bp);
3266			ACQUIRE_LOCK(&lk);
3267			break;
3268		}
3269		/*
3270		 * Calculate the disk block size required for the available
3271		 * records rounded to the min size.
3272		 */
3273		if (cnt == 0)
3274			size = devbsize;
3275		else if (cnt < jrecmax)
3276			size = howmany(cnt, jrecmin) * devbsize;
3277		else
3278			size = fs->fs_bsize;
3279		/*
3280		 * Allocate a disk block for this journal data and account
3281		 * for truncation of the requested size if enough contiguous
3282		 * space was not available.
3283		 */
3284		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3285		bp->b_lblkno = bp->b_blkno;
3286		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3287		bp->b_bcount = size;
3288		bp->b_flags &= ~B_INVAL;
3289		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3290		/*
3291		 * Initialize our jseg with cnt records.  Assign the next
3292		 * sequence number to it and link it in-order.
3293		 */
3294		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3295		jseg->js_buf = bp;
3296		jseg->js_cnt = cnt;
3297		jseg->js_refs = cnt + 1;	/* Self ref. */
3298		jseg->js_size = size;
3299		jseg->js_seq = jblocks->jb_nextseq++;
3300		if (jblocks->jb_oldestseg == NULL)
3301			jblocks->jb_oldestseg = jseg;
3302		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3303		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3304		if (jblocks->jb_writeseg == NULL)
3305			jblocks->jb_writeseg = jseg;
3306		/*
3307		 * Start filling in records from the pending list.
3308		 */
3309		data = bp->b_data;
3310		off = 0;
3311		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3312		    != NULL) {
3313			if (cnt == 0)
3314				break;
3315			/* Place a segment header on every device block. */
3316			if ((off % devbsize) == 0) {
3317				jseg_write(ump, jseg, data);
3318				off += JREC_SIZE;
3319				data = bp->b_data + off;
3320			}
3321			if (wk == needwk)
3322				needwk = NULL;
3323			remove_from_journal(wk);
3324			wk->wk_state |= INPROGRESS;
3325			WORKLIST_INSERT(&jseg->js_entries, wk);
3326			switch (wk->wk_type) {
3327			case D_JADDREF:
3328				jaddref_write(WK_JADDREF(wk), jseg, data);
3329				break;
3330			case D_JREMREF:
3331				jremref_write(WK_JREMREF(wk), jseg, data);
3332				break;
3333			case D_JMVREF:
3334				jmvref_write(WK_JMVREF(wk), jseg, data);
3335				break;
3336			case D_JNEWBLK:
3337				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3338				break;
3339			case D_JFREEBLK:
3340				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3341				break;
3342			case D_JFREEFRAG:
3343				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3344				break;
3345			case D_JTRUNC:
3346				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3347				break;
3348			case D_JFSYNC:
3349				jfsync_write(WK_JFSYNC(wk), jseg, data);
3350				break;
3351			default:
3352				panic("process_journal: Unknown type %s",
3353				    TYPENAME(wk->wk_type));
3354				/* NOTREACHED */
3355			}
3356			off += JREC_SIZE;
3357			data = bp->b_data + off;
3358			cnt--;
3359		}
3360		/*
3361		 * Write this one buffer and continue.
3362		 */
3363		segwritten = 1;
3364		jblocks->jb_needseg = 0;
3365		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3366		FREE_LOCK(&lk);
3367		pbgetvp(ump->um_devvp, bp);
3368		/*
3369		 * We only do the blocking wait once we find the journal
3370		 * entry we're looking for.
3371		 */
3372		if (needwk == NULL && flags == MNT_WAIT)
3373			bwrite(bp);
3374		else
3375			bawrite(bp);
3376		ACQUIRE_LOCK(&lk);
3377	}
3378	/*
3379	 * If we wrote a segment issue a synchronize cache so the journal
3380	 * is reflected on disk before the data is written.  Since reclaiming
3381	 * journal space also requires writing a journal record this
3382	 * process also enforces a barrier before reclamation.
3383	 */
3384	if (segwritten && shouldflush) {
3385		softdep_synchronize(bio, ump,
3386		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3387	} else if (bio)
3388		g_destroy_bio(bio);
3389	/*
3390	 * If we've suspended the filesystem because we ran out of journal
3391	 * space either try to sync it here to make some progress or
3392	 * unsuspend it if we already have.
3393	 */
3394	if (flags == 0 && jblocks->jb_suspended) {
3395		if (journal_unsuspend(ump))
3396			return;
3397		FREE_LOCK(&lk);
3398		VFS_SYNC(mp, MNT_NOWAIT);
3399		ffs_sbupdate(ump, MNT_WAIT, 0);
3400		ACQUIRE_LOCK(&lk);
3401	}
3402}
3403
3404/*
3405 * Complete a jseg, allowing all dependencies awaiting journal writes
3406 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3407 * structures so that the journal segment can be freed to reclaim space.
3408 */
3409static void
3410complete_jseg(jseg)
3411	struct jseg *jseg;
3412{
3413	struct worklist *wk;
3414	struct jmvref *jmvref;
3415	int waiting;
3416#ifdef INVARIANTS
3417	int i = 0;
3418#endif
3419
3420	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3421		WORKLIST_REMOVE(wk);
3422		waiting = wk->wk_state & IOWAITING;
3423		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3424		wk->wk_state |= COMPLETE;
3425		KASSERT(i++ < jseg->js_cnt,
3426		    ("handle_written_jseg: overflow %d >= %d",
3427		    i - 1, jseg->js_cnt));
3428		switch (wk->wk_type) {
3429		case D_JADDREF:
3430			handle_written_jaddref(WK_JADDREF(wk));
3431			break;
3432		case D_JREMREF:
3433			handle_written_jremref(WK_JREMREF(wk));
3434			break;
3435		case D_JMVREF:
3436			rele_jseg(jseg);	/* No jsegdep. */
3437			jmvref = WK_JMVREF(wk);
3438			LIST_REMOVE(jmvref, jm_deps);
3439			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3440				free_pagedep(jmvref->jm_pagedep);
3441			WORKITEM_FREE(jmvref, D_JMVREF);
3442			break;
3443		case D_JNEWBLK:
3444			handle_written_jnewblk(WK_JNEWBLK(wk));
3445			break;
3446		case D_JFREEBLK:
3447			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3448			break;
3449		case D_JTRUNC:
3450			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3451			break;
3452		case D_JFSYNC:
3453			rele_jseg(jseg);	/* No jsegdep. */
3454			WORKITEM_FREE(wk, D_JFSYNC);
3455			break;
3456		case D_JFREEFRAG:
3457			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3458			break;
3459		default:
3460			panic("handle_written_jseg: Unknown type %s",
3461			    TYPENAME(wk->wk_type));
3462			/* NOTREACHED */
3463		}
3464		if (waiting)
3465			wakeup(wk);
3466	}
3467	/* Release the self reference so the structure may be freed. */
3468	rele_jseg(jseg);
3469}
3470
3471/*
3472 * Determine which jsegs are ready for completion processing.  Waits for
3473 * synchronize cache to complete as well as forcing in-order completion
3474 * of journal entries.
3475 */
3476static void
3477complete_jsegs(jseg)
3478	struct jseg *jseg;
3479{
3480	struct jblocks *jblocks;
3481	struct jseg *jsegn;
3482
3483	jblocks = jseg->js_jblocks;
3484	/*
3485	 * Don't allow out of order completions.  If this isn't the first
3486	 * block wait for it to write before we're done.
3487	 */
3488	if (jseg != jblocks->jb_writeseg)
3489		return;
3490	/* Iterate through available jsegs processing their entries. */
3491	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3492		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3493		jsegn = TAILQ_NEXT(jseg, js_next);
3494		complete_jseg(jseg);
3495		jseg = jsegn;
3496	}
3497	jblocks->jb_writeseg = jseg;
3498	/*
3499	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3500	 */
3501	free_jsegs(jblocks);
3502}
3503
3504/*
3505 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3506 * the final completions.
3507 */
3508static void
3509handle_written_jseg(jseg, bp)
3510	struct jseg *jseg;
3511	struct buf *bp;
3512{
3513
3514	if (jseg->js_refs == 0)
3515		panic("handle_written_jseg: No self-reference on %p", jseg);
3516	jseg->js_state |= DEPCOMPLETE;
3517	/*
3518	 * We'll never need this buffer again, set flags so it will be
3519	 * discarded.
3520	 */
3521	bp->b_flags |= B_INVAL | B_NOCACHE;
3522	pbrelvp(bp);
3523	complete_jsegs(jseg);
3524}
3525
3526static inline struct jsegdep *
3527inoref_jseg(inoref)
3528	struct inoref *inoref;
3529{
3530	struct jsegdep *jsegdep;
3531
3532	jsegdep = inoref->if_jsegdep;
3533	inoref->if_jsegdep = NULL;
3534
3535	return (jsegdep);
3536}
3537
3538/*
3539 * Called once a jremref has made it to stable store.  The jremref is marked
3540 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3541 * for the jremref to complete will be awoken by free_jremref.
3542 */
3543static void
3544handle_written_jremref(jremref)
3545	struct jremref *jremref;
3546{
3547	struct inodedep *inodedep;
3548	struct jsegdep *jsegdep;
3549	struct dirrem *dirrem;
3550
3551	/* Grab the jsegdep. */
3552	jsegdep = inoref_jseg(&jremref->jr_ref);
3553	/*
3554	 * Remove us from the inoref list.
3555	 */
3556	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3557	    0, &inodedep) == 0)
3558		panic("handle_written_jremref: Lost inodedep");
3559	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3560	/*
3561	 * Complete the dirrem.
3562	 */
3563	dirrem = jremref->jr_dirrem;
3564	jremref->jr_dirrem = NULL;
3565	LIST_REMOVE(jremref, jr_deps);
3566	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3567	jwork_insert(&dirrem->dm_jwork, jsegdep);
3568	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3569	    (dirrem->dm_state & COMPLETE) != 0)
3570		add_to_worklist(&dirrem->dm_list, 0);
3571	free_jremref(jremref);
3572}
3573
3574/*
3575 * Called once a jaddref has made it to stable store.  The dependency is
3576 * marked complete and any dependent structures are added to the inode
3577 * bufwait list to be completed as soon as it is written.  If a bitmap write
3578 * depends on this entry we move the inode into the inodedephd of the
3579 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3580 */
3581static void
3582handle_written_jaddref(jaddref)
3583	struct jaddref *jaddref;
3584{
3585	struct jsegdep *jsegdep;
3586	struct inodedep *inodedep;
3587	struct diradd *diradd;
3588	struct mkdir *mkdir;
3589
3590	/* Grab the jsegdep. */
3591	jsegdep = inoref_jseg(&jaddref->ja_ref);
3592	mkdir = NULL;
3593	diradd = NULL;
3594	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3595	    0, &inodedep) == 0)
3596		panic("handle_written_jaddref: Lost inodedep.");
3597	if (jaddref->ja_diradd == NULL)
3598		panic("handle_written_jaddref: No dependency");
3599	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3600		diradd = jaddref->ja_diradd;
3601		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3602	} else if (jaddref->ja_state & MKDIR_PARENT) {
3603		mkdir = jaddref->ja_mkdir;
3604		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3605	} else if (jaddref->ja_state & MKDIR_BODY)
3606		mkdir = jaddref->ja_mkdir;
3607	else
3608		panic("handle_written_jaddref: Unknown dependency %p",
3609		    jaddref->ja_diradd);
3610	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3611	/*
3612	 * Remove us from the inode list.
3613	 */
3614	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3615	/*
3616	 * The mkdir may be waiting on the jaddref to clear before freeing.
3617	 */
3618	if (mkdir) {
3619		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3620		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3621		    TYPENAME(mkdir->md_list.wk_type)));
3622		mkdir->md_jaddref = NULL;
3623		diradd = mkdir->md_diradd;
3624		mkdir->md_state |= DEPCOMPLETE;
3625		complete_mkdir(mkdir);
3626	}
3627	jwork_insert(&diradd->da_jwork, jsegdep);
3628	if (jaddref->ja_state & NEWBLOCK) {
3629		inodedep->id_state |= ONDEPLIST;
3630		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3631		    inodedep, id_deps);
3632	}
3633	free_jaddref(jaddref);
3634}
3635
3636/*
3637 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3638 * is placed in the bmsafemap to await notification of a written bitmap.  If
3639 * the operation was canceled we add the segdep to the appropriate
3640 * dependency to free the journal space once the canceling operation
3641 * completes.
3642 */
3643static void
3644handle_written_jnewblk(jnewblk)
3645	struct jnewblk *jnewblk;
3646{
3647	struct bmsafemap *bmsafemap;
3648	struct freefrag *freefrag;
3649	struct freework *freework;
3650	struct jsegdep *jsegdep;
3651	struct newblk *newblk;
3652
3653	/* Grab the jsegdep. */
3654	jsegdep = jnewblk->jn_jsegdep;
3655	jnewblk->jn_jsegdep = NULL;
3656	if (jnewblk->jn_dep == NULL)
3657		panic("handle_written_jnewblk: No dependency for the segdep.");
3658	switch (jnewblk->jn_dep->wk_type) {
3659	case D_NEWBLK:
3660	case D_ALLOCDIRECT:
3661	case D_ALLOCINDIR:
3662		/*
3663		 * Add the written block to the bmsafemap so it can
3664		 * be notified when the bitmap is on disk.
3665		 */
3666		newblk = WK_NEWBLK(jnewblk->jn_dep);
3667		newblk->nb_jnewblk = NULL;
3668		if ((newblk->nb_state & GOINGAWAY) == 0) {
3669			bmsafemap = newblk->nb_bmsafemap;
3670			newblk->nb_state |= ONDEPLIST;
3671			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3672			    nb_deps);
3673		}
3674		jwork_insert(&newblk->nb_jwork, jsegdep);
3675		break;
3676	case D_FREEFRAG:
3677		/*
3678		 * A newblock being removed by a freefrag when replaced by
3679		 * frag extension.
3680		 */
3681		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3682		freefrag->ff_jdep = NULL;
3683		jwork_insert(&freefrag->ff_jwork, jsegdep);
3684		break;
3685	case D_FREEWORK:
3686		/*
3687		 * A direct block was removed by truncate.
3688		 */
3689		freework = WK_FREEWORK(jnewblk->jn_dep);
3690		freework->fw_jnewblk = NULL;
3691		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3692		break;
3693	default:
3694		panic("handle_written_jnewblk: Unknown type %d.",
3695		    jnewblk->jn_dep->wk_type);
3696	}
3697	jnewblk->jn_dep = NULL;
3698	free_jnewblk(jnewblk);
3699}
3700
3701/*
3702 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3703 * an in-flight allocation that has not yet been committed.  Divorce us
3704 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3705 * to the worklist.
3706 */
3707static void
3708cancel_jfreefrag(jfreefrag)
3709	struct jfreefrag *jfreefrag;
3710{
3711	struct freefrag *freefrag;
3712
3713	if (jfreefrag->fr_jsegdep) {
3714		free_jsegdep(jfreefrag->fr_jsegdep);
3715		jfreefrag->fr_jsegdep = NULL;
3716	}
3717	freefrag = jfreefrag->fr_freefrag;
3718	jfreefrag->fr_freefrag = NULL;
3719	free_jfreefrag(jfreefrag);
3720	freefrag->ff_state |= DEPCOMPLETE;
3721	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3722}
3723
3724/*
3725 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3726 */
3727static void
3728free_jfreefrag(jfreefrag)
3729	struct jfreefrag *jfreefrag;
3730{
3731
3732	if (jfreefrag->fr_state & INPROGRESS)
3733		WORKLIST_REMOVE(&jfreefrag->fr_list);
3734	else if (jfreefrag->fr_state & ONWORKLIST)
3735		remove_from_journal(&jfreefrag->fr_list);
3736	if (jfreefrag->fr_freefrag != NULL)
3737		panic("free_jfreefrag:  Still attached to a freefrag.");
3738	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3739}
3740
3741/*
3742 * Called when the journal write for a jfreefrag completes.  The parent
3743 * freefrag is added to the worklist if this completes its dependencies.
3744 */
3745static void
3746handle_written_jfreefrag(jfreefrag)
3747	struct jfreefrag *jfreefrag;
3748{
3749	struct jsegdep *jsegdep;
3750	struct freefrag *freefrag;
3751
3752	/* Grab the jsegdep. */
3753	jsegdep = jfreefrag->fr_jsegdep;
3754	jfreefrag->fr_jsegdep = NULL;
3755	freefrag = jfreefrag->fr_freefrag;
3756	if (freefrag == NULL)
3757		panic("handle_written_jfreefrag: No freefrag.");
3758	freefrag->ff_state |= DEPCOMPLETE;
3759	freefrag->ff_jdep = NULL;
3760	jwork_insert(&freefrag->ff_jwork, jsegdep);
3761	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3762		add_to_worklist(&freefrag->ff_list, 0);
3763	jfreefrag->fr_freefrag = NULL;
3764	free_jfreefrag(jfreefrag);
3765}
3766
3767/*
3768 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3769 * is removed from the freeblks list of pending journal writes and the
3770 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3771 * have been reclaimed.
3772 */
3773static void
3774handle_written_jblkdep(jblkdep)
3775	struct jblkdep *jblkdep;
3776{
3777	struct freeblks *freeblks;
3778	struct jsegdep *jsegdep;
3779
3780	/* Grab the jsegdep. */
3781	jsegdep = jblkdep->jb_jsegdep;
3782	jblkdep->jb_jsegdep = NULL;
3783	freeblks = jblkdep->jb_freeblks;
3784	LIST_REMOVE(jblkdep, jb_deps);
3785	jwork_insert(&freeblks->fb_jwork, jsegdep);
3786	/*
3787	 * If the freeblks is all journaled, we can add it to the worklist.
3788	 */
3789	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3790	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3791		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3792
3793	free_jblkdep(jblkdep);
3794}
3795
3796static struct jsegdep *
3797newjsegdep(struct worklist *wk)
3798{
3799	struct jsegdep *jsegdep;
3800
3801	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3802	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3803	jsegdep->jd_seg = NULL;
3804
3805	return (jsegdep);
3806}
3807
3808static struct jmvref *
3809newjmvref(dp, ino, oldoff, newoff)
3810	struct inode *dp;
3811	ino_t ino;
3812	off_t oldoff;
3813	off_t newoff;
3814{
3815	struct jmvref *jmvref;
3816
3817	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3818	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3819	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3820	jmvref->jm_parent = dp->i_number;
3821	jmvref->jm_ino = ino;
3822	jmvref->jm_oldoff = oldoff;
3823	jmvref->jm_newoff = newoff;
3824
3825	return (jmvref);
3826}
3827
3828/*
3829 * Allocate a new jremref that tracks the removal of ip from dp with the
3830 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3831 * DEPCOMPLETE as we have all the information required for the journal write
3832 * and the directory has already been removed from the buffer.  The caller
3833 * is responsible for linking the jremref into the pagedep and adding it
3834 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3835 * a DOTDOT addition so handle_workitem_remove() can properly assign
3836 * the jsegdep when we're done.
3837 */
3838static struct jremref *
3839newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3840    off_t diroff, nlink_t nlink)
3841{
3842	struct jremref *jremref;
3843
3844	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3845	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3846	jremref->jr_state = ATTACHED;
3847	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3848	   nlink, ip->i_mode);
3849	jremref->jr_dirrem = dirrem;
3850
3851	return (jremref);
3852}
3853
3854static inline void
3855newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3856    nlink_t nlink, uint16_t mode)
3857{
3858
3859	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3860	inoref->if_diroff = diroff;
3861	inoref->if_ino = ino;
3862	inoref->if_parent = parent;
3863	inoref->if_nlink = nlink;
3864	inoref->if_mode = mode;
3865}
3866
3867/*
3868 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3869 * directory offset may not be known until later.  The caller is responsible
3870 * adding the entry to the journal when this information is available.  nlink
3871 * should be the link count prior to the addition and mode is only required
3872 * to have the correct FMT.
3873 */
3874static struct jaddref *
3875newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3876    uint16_t mode)
3877{
3878	struct jaddref *jaddref;
3879
3880	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3881	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3882	jaddref->ja_state = ATTACHED;
3883	jaddref->ja_mkdir = NULL;
3884	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3885
3886	return (jaddref);
3887}
3888
3889/*
3890 * Create a new free dependency for a freework.  The caller is responsible
3891 * for adjusting the reference count when it has the lock held.  The freedep
3892 * will track an outstanding bitmap write that will ultimately clear the
3893 * freework to continue.
3894 */
3895static struct freedep *
3896newfreedep(struct freework *freework)
3897{
3898	struct freedep *freedep;
3899
3900	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3901	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3902	freedep->fd_freework = freework;
3903
3904	return (freedep);
3905}
3906
3907/*
3908 * Free a freedep structure once the buffer it is linked to is written.  If
3909 * this is the last reference to the freework schedule it for completion.
3910 */
3911static void
3912free_freedep(freedep)
3913	struct freedep *freedep;
3914{
3915	struct freework *freework;
3916
3917	freework = freedep->fd_freework;
3918	freework->fw_freeblks->fb_cgwait--;
3919	if (--freework->fw_ref == 0)
3920		freework_enqueue(freework);
3921	WORKITEM_FREE(freedep, D_FREEDEP);
3922}
3923
3924/*
3925 * Allocate a new freework structure that may be a level in an indirect
3926 * when parent is not NULL or a top level block when it is.  The top level
3927 * freework structures are allocated without lk held and before the freeblks
3928 * is visible outside of softdep_setup_freeblocks().
3929 */
3930static struct freework *
3931newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
3932	struct ufsmount *ump;
3933	struct freeblks *freeblks;
3934	struct freework *parent;
3935	ufs_lbn_t lbn;
3936	ufs2_daddr_t nb;
3937	int frags;
3938	int off;
3939	int journal;
3940{
3941	struct freework *freework;
3942
3943	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3944	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3945	freework->fw_state = ATTACHED;
3946	freework->fw_jnewblk = NULL;
3947	freework->fw_freeblks = freeblks;
3948	freework->fw_parent = parent;
3949	freework->fw_lbn = lbn;
3950	freework->fw_blkno = nb;
3951	freework->fw_frags = frags;
3952	freework->fw_indir = NULL;
3953	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
3954		? 0 : NINDIR(ump->um_fs) + 1;
3955	freework->fw_start = freework->fw_off = off;
3956	if (journal)
3957		newjfreeblk(freeblks, lbn, nb, frags);
3958	if (parent == NULL) {
3959		ACQUIRE_LOCK(&lk);
3960		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
3961		freeblks->fb_ref++;
3962		FREE_LOCK(&lk);
3963	}
3964
3965	return (freework);
3966}
3967
3968/*
3969 * Eliminate a jfreeblk for a block that does not need journaling.
3970 */
3971static void
3972cancel_jfreeblk(freeblks, blkno)
3973	struct freeblks *freeblks;
3974	ufs2_daddr_t blkno;
3975{
3976	struct jfreeblk *jfreeblk;
3977	struct jblkdep *jblkdep;
3978
3979	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
3980		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
3981			continue;
3982		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
3983		if (jfreeblk->jf_blkno == blkno)
3984			break;
3985	}
3986	if (jblkdep == NULL)
3987		return;
3988	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
3989	free_jsegdep(jblkdep->jb_jsegdep);
3990	LIST_REMOVE(jblkdep, jb_deps);
3991	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
3992}
3993
3994/*
3995 * Allocate a new jfreeblk to journal top level block pointer when truncating
3996 * a file.  The caller must add this to the worklist when lk is held.
3997 */
3998static struct jfreeblk *
3999newjfreeblk(freeblks, lbn, blkno, frags)
4000	struct freeblks *freeblks;
4001	ufs_lbn_t lbn;
4002	ufs2_daddr_t blkno;
4003	int frags;
4004{
4005	struct jfreeblk *jfreeblk;
4006
4007	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4008	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4009	    freeblks->fb_list.wk_mp);
4010	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4011	jfreeblk->jf_dep.jb_freeblks = freeblks;
4012	jfreeblk->jf_ino = freeblks->fb_inum;
4013	jfreeblk->jf_lbn = lbn;
4014	jfreeblk->jf_blkno = blkno;
4015	jfreeblk->jf_frags = frags;
4016	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4017
4018	return (jfreeblk);
4019}
4020
4021/*
4022 * Allocate a new jtrunc to track a partial truncation.
4023 */
4024static struct jtrunc *
4025newjtrunc(freeblks, size, extsize)
4026	struct freeblks *freeblks;
4027	off_t size;
4028	int extsize;
4029{
4030	struct jtrunc *jtrunc;
4031
4032	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4033	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4034	    freeblks->fb_list.wk_mp);
4035	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4036	jtrunc->jt_dep.jb_freeblks = freeblks;
4037	jtrunc->jt_ino = freeblks->fb_inum;
4038	jtrunc->jt_size = size;
4039	jtrunc->jt_extsize = extsize;
4040	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4041
4042	return (jtrunc);
4043}
4044
4045/*
4046 * If we're canceling a new bitmap we have to search for another ref
4047 * to move into the bmsafemap dep.  This might be better expressed
4048 * with another structure.
4049 */
4050static void
4051move_newblock_dep(jaddref, inodedep)
4052	struct jaddref *jaddref;
4053	struct inodedep *inodedep;
4054{
4055	struct inoref *inoref;
4056	struct jaddref *jaddrefn;
4057
4058	jaddrefn = NULL;
4059	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4060	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4061		if ((jaddref->ja_state & NEWBLOCK) &&
4062		    inoref->if_list.wk_type == D_JADDREF) {
4063			jaddrefn = (struct jaddref *)inoref;
4064			break;
4065		}
4066	}
4067	if (jaddrefn == NULL)
4068		return;
4069	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4070	jaddrefn->ja_state |= jaddref->ja_state &
4071	    (ATTACHED | UNDONE | NEWBLOCK);
4072	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4073	jaddref->ja_state |= ATTACHED;
4074	LIST_REMOVE(jaddref, ja_bmdeps);
4075	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4076	    ja_bmdeps);
4077}
4078
4079/*
4080 * Cancel a jaddref either before it has been written or while it is being
4081 * written.  This happens when a link is removed before the add reaches
4082 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4083 * and inode to prevent the link count or bitmap from reaching the disk
4084 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4085 * required.
4086 *
4087 * Returns 1 if the canceled addref requires journaling of the remove and
4088 * 0 otherwise.
4089 */
4090static int
4091cancel_jaddref(jaddref, inodedep, wkhd)
4092	struct jaddref *jaddref;
4093	struct inodedep *inodedep;
4094	struct workhead *wkhd;
4095{
4096	struct inoref *inoref;
4097	struct jsegdep *jsegdep;
4098	int needsj;
4099
4100	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4101	    ("cancel_jaddref: Canceling complete jaddref"));
4102	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4103		needsj = 1;
4104	else
4105		needsj = 0;
4106	if (inodedep == NULL)
4107		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4108		    0, &inodedep) == 0)
4109			panic("cancel_jaddref: Lost inodedep");
4110	/*
4111	 * We must adjust the nlink of any reference operation that follows
4112	 * us so that it is consistent with the in-memory reference.  This
4113	 * ensures that inode nlink rollbacks always have the correct link.
4114	 */
4115	if (needsj == 0) {
4116		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4117		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4118			if (inoref->if_state & GOINGAWAY)
4119				break;
4120			inoref->if_nlink--;
4121		}
4122	}
4123	jsegdep = inoref_jseg(&jaddref->ja_ref);
4124	if (jaddref->ja_state & NEWBLOCK)
4125		move_newblock_dep(jaddref, inodedep);
4126	wake_worklist(&jaddref->ja_list);
4127	jaddref->ja_mkdir = NULL;
4128	if (jaddref->ja_state & INPROGRESS) {
4129		jaddref->ja_state &= ~INPROGRESS;
4130		WORKLIST_REMOVE(&jaddref->ja_list);
4131		jwork_insert(wkhd, jsegdep);
4132	} else {
4133		free_jsegdep(jsegdep);
4134		if (jaddref->ja_state & DEPCOMPLETE)
4135			remove_from_journal(&jaddref->ja_list);
4136	}
4137	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4138	/*
4139	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4140	 * can arrange for them to be freed with the bitmap.  Otherwise we
4141	 * no longer need this addref attached to the inoreflst and it
4142	 * will incorrectly adjust nlink if we leave it.
4143	 */
4144	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4145		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4146		    if_deps);
4147		jaddref->ja_state |= COMPLETE;
4148		free_jaddref(jaddref);
4149		return (needsj);
4150	}
4151	/*
4152	 * Leave the head of the list for jsegdeps for fast merging.
4153	 */
4154	if (LIST_FIRST(wkhd) != NULL) {
4155		jaddref->ja_state |= ONWORKLIST;
4156		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4157	} else
4158		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4159
4160	return (needsj);
4161}
4162
4163/*
4164 * Attempt to free a jaddref structure when some work completes.  This
4165 * should only succeed once the entry is written and all dependencies have
4166 * been notified.
4167 */
4168static void
4169free_jaddref(jaddref)
4170	struct jaddref *jaddref;
4171{
4172
4173	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4174		return;
4175	if (jaddref->ja_ref.if_jsegdep)
4176		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4177		    jaddref, jaddref->ja_state);
4178	if (jaddref->ja_state & NEWBLOCK)
4179		LIST_REMOVE(jaddref, ja_bmdeps);
4180	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4181		panic("free_jaddref: Bad state %p(0x%X)",
4182		    jaddref, jaddref->ja_state);
4183	if (jaddref->ja_mkdir != NULL)
4184		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4185	WORKITEM_FREE(jaddref, D_JADDREF);
4186}
4187
4188/*
4189 * Free a jremref structure once it has been written or discarded.
4190 */
4191static void
4192free_jremref(jremref)
4193	struct jremref *jremref;
4194{
4195
4196	if (jremref->jr_ref.if_jsegdep)
4197		free_jsegdep(jremref->jr_ref.if_jsegdep);
4198	if (jremref->jr_state & INPROGRESS)
4199		panic("free_jremref: IO still pending");
4200	WORKITEM_FREE(jremref, D_JREMREF);
4201}
4202
4203/*
4204 * Free a jnewblk structure.
4205 */
4206static void
4207free_jnewblk(jnewblk)
4208	struct jnewblk *jnewblk;
4209{
4210
4211	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4212		return;
4213	LIST_REMOVE(jnewblk, jn_deps);
4214	if (jnewblk->jn_dep != NULL)
4215		panic("free_jnewblk: Dependency still attached.");
4216	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4217}
4218
4219/*
4220 * Cancel a jnewblk which has been been made redundant by frag extension.
4221 */
4222static void
4223cancel_jnewblk(jnewblk, wkhd)
4224	struct jnewblk *jnewblk;
4225	struct workhead *wkhd;
4226{
4227	struct jsegdep *jsegdep;
4228
4229	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4230	jsegdep = jnewblk->jn_jsegdep;
4231	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4232		panic("cancel_jnewblk: Invalid state");
4233	jnewblk->jn_jsegdep  = NULL;
4234	jnewblk->jn_dep = NULL;
4235	jnewblk->jn_state |= GOINGAWAY;
4236	if (jnewblk->jn_state & INPROGRESS) {
4237		jnewblk->jn_state &= ~INPROGRESS;
4238		WORKLIST_REMOVE(&jnewblk->jn_list);
4239		jwork_insert(wkhd, jsegdep);
4240	} else {
4241		free_jsegdep(jsegdep);
4242		remove_from_journal(&jnewblk->jn_list);
4243	}
4244	wake_worklist(&jnewblk->jn_list);
4245	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4246}
4247
4248static void
4249free_jblkdep(jblkdep)
4250	struct jblkdep *jblkdep;
4251{
4252
4253	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4254		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4255	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4256		WORKITEM_FREE(jblkdep, D_JTRUNC);
4257	else
4258		panic("free_jblkdep: Unexpected type %s",
4259		    TYPENAME(jblkdep->jb_list.wk_type));
4260}
4261
4262/*
4263 * Free a single jseg once it is no longer referenced in memory or on
4264 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4265 * to disappear.
4266 */
4267static void
4268free_jseg(jseg, jblocks)
4269	struct jseg *jseg;
4270	struct jblocks *jblocks;
4271{
4272	struct freework *freework;
4273
4274	/*
4275	 * Free freework structures that were lingering to indicate freed
4276	 * indirect blocks that forced journal write ordering on reallocate.
4277	 */
4278	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4279		indirblk_remove(freework);
4280	if (jblocks->jb_oldestseg == jseg)
4281		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4282	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4283	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4284	KASSERT(LIST_EMPTY(&jseg->js_entries),
4285	    ("free_jseg: Freed jseg has valid entries."));
4286	WORKITEM_FREE(jseg, D_JSEG);
4287}
4288
4289/*
4290 * Free all jsegs that meet the criteria for being reclaimed and update
4291 * oldestseg.
4292 */
4293static void
4294free_jsegs(jblocks)
4295	struct jblocks *jblocks;
4296{
4297	struct jseg *jseg;
4298
4299	/*
4300	 * Free only those jsegs which have none allocated before them to
4301	 * preserve the journal space ordering.
4302	 */
4303	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4304		/*
4305		 * Only reclaim space when nothing depends on this journal
4306		 * set and another set has written that it is no longer
4307		 * valid.
4308		 */
4309		if (jseg->js_refs != 0) {
4310			jblocks->jb_oldestseg = jseg;
4311			return;
4312		}
4313		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4314			break;
4315		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4316			break;
4317		/*
4318		 * We can free jsegs that didn't write entries when
4319		 * oldestwrseq == js_seq.
4320		 */
4321		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4322		    jseg->js_cnt != 0)
4323			break;
4324		free_jseg(jseg, jblocks);
4325	}
4326	/*
4327	 * If we exited the loop above we still must discover the
4328	 * oldest valid segment.
4329	 */
4330	if (jseg)
4331		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4332		     jseg = TAILQ_NEXT(jseg, js_next))
4333			if (jseg->js_refs != 0)
4334				break;
4335	jblocks->jb_oldestseg = jseg;
4336	/*
4337	 * The journal has no valid records but some jsegs may still be
4338	 * waiting on oldestwrseq to advance.  We force a small record
4339	 * out to permit these lingering records to be reclaimed.
4340	 */
4341	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4342		jblocks->jb_needseg = 1;
4343}
4344
4345/*
4346 * Release one reference to a jseg and free it if the count reaches 0.  This
4347 * should eventually reclaim journal space as well.
4348 */
4349static void
4350rele_jseg(jseg)
4351	struct jseg *jseg;
4352{
4353
4354	KASSERT(jseg->js_refs > 0,
4355	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4356	if (--jseg->js_refs != 0)
4357		return;
4358	free_jsegs(jseg->js_jblocks);
4359}
4360
4361/*
4362 * Release a jsegdep and decrement the jseg count.
4363 */
4364static void
4365free_jsegdep(jsegdep)
4366	struct jsegdep *jsegdep;
4367{
4368
4369	if (jsegdep->jd_seg)
4370		rele_jseg(jsegdep->jd_seg);
4371	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4372}
4373
4374/*
4375 * Wait for a journal item to make it to disk.  Initiate journal processing
4376 * if required.
4377 */
4378static int
4379jwait(wk, waitfor)
4380	struct worklist *wk;
4381	int waitfor;
4382{
4383
4384	/*
4385	 * Blocking journal waits cause slow synchronous behavior.  Record
4386	 * stats on the frequency of these blocking operations.
4387	 */
4388	if (waitfor == MNT_WAIT) {
4389		stat_journal_wait++;
4390		switch (wk->wk_type) {
4391		case D_JREMREF:
4392		case D_JMVREF:
4393			stat_jwait_filepage++;
4394			break;
4395		case D_JTRUNC:
4396		case D_JFREEBLK:
4397			stat_jwait_freeblks++;
4398			break;
4399		case D_JNEWBLK:
4400			stat_jwait_newblk++;
4401			break;
4402		case D_JADDREF:
4403			stat_jwait_inode++;
4404			break;
4405		default:
4406			break;
4407		}
4408	}
4409	/*
4410	 * If IO has not started we process the journal.  We can't mark the
4411	 * worklist item as IOWAITING because we drop the lock while
4412	 * processing the journal and the worklist entry may be freed after
4413	 * this point.  The caller may call back in and re-issue the request.
4414	 */
4415	if ((wk->wk_state & INPROGRESS) == 0) {
4416		softdep_process_journal(wk->wk_mp, wk, waitfor);
4417		if (waitfor != MNT_WAIT)
4418			return (EBUSY);
4419		return (0);
4420	}
4421	if (waitfor != MNT_WAIT)
4422		return (EBUSY);
4423	wait_worklist(wk, "jwait");
4424	return (0);
4425}
4426
4427/*
4428 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4429 * appropriate.  This is a convenience function to reduce duplicate code
4430 * for the setup and revert functions below.
4431 */
4432static struct inodedep *
4433inodedep_lookup_ip(ip)
4434	struct inode *ip;
4435{
4436	struct inodedep *inodedep;
4437	int dflags;
4438
4439	KASSERT(ip->i_nlink >= ip->i_effnlink,
4440	    ("inodedep_lookup_ip: bad delta"));
4441	dflags = DEPALLOC;
4442	if (IS_SNAPSHOT(ip))
4443		dflags |= NODELAY;
4444	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4445	    &inodedep);
4446	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4447	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4448
4449	return (inodedep);
4450}
4451
4452/*
4453 * Called prior to creating a new inode and linking it to a directory.  The
4454 * jaddref structure must already be allocated by softdep_setup_inomapdep
4455 * and it is discovered here so we can initialize the mode and update
4456 * nlinkdelta.
4457 */
4458void
4459softdep_setup_create(dp, ip)
4460	struct inode *dp;
4461	struct inode *ip;
4462{
4463	struct inodedep *inodedep;
4464	struct jaddref *jaddref;
4465	struct vnode *dvp;
4466
4467	KASSERT(ip->i_nlink == 1,
4468	    ("softdep_setup_create: Invalid link count."));
4469	dvp = ITOV(dp);
4470	ACQUIRE_LOCK(&lk);
4471	inodedep = inodedep_lookup_ip(ip);
4472	if (DOINGSUJ(dvp)) {
4473		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4474		    inoreflst);
4475		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4476		    ("softdep_setup_create: No addref structure present."));
4477	}
4478	softdep_prelink(dvp, NULL);
4479	FREE_LOCK(&lk);
4480}
4481
4482/*
4483 * Create a jaddref structure to track the addition of a DOTDOT link when
4484 * we are reparenting an inode as part of a rename.  This jaddref will be
4485 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4486 * non-journaling softdep.
4487 */
4488void
4489softdep_setup_dotdot_link(dp, ip)
4490	struct inode *dp;
4491	struct inode *ip;
4492{
4493	struct inodedep *inodedep;
4494	struct jaddref *jaddref;
4495	struct vnode *dvp;
4496	struct vnode *vp;
4497
4498	dvp = ITOV(dp);
4499	vp = ITOV(ip);
4500	jaddref = NULL;
4501	/*
4502	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4503	 * is used as a normal link would be.
4504	 */
4505	if (DOINGSUJ(dvp))
4506		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4507		    dp->i_effnlink - 1, dp->i_mode);
4508	ACQUIRE_LOCK(&lk);
4509	inodedep = inodedep_lookup_ip(dp);
4510	if (jaddref)
4511		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4512		    if_deps);
4513	softdep_prelink(dvp, ITOV(ip));
4514	FREE_LOCK(&lk);
4515}
4516
4517/*
4518 * Create a jaddref structure to track a new link to an inode.  The directory
4519 * offset is not known until softdep_setup_directory_add or
4520 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4521 * softdep.
4522 */
4523void
4524softdep_setup_link(dp, ip)
4525	struct inode *dp;
4526	struct inode *ip;
4527{
4528	struct inodedep *inodedep;
4529	struct jaddref *jaddref;
4530	struct vnode *dvp;
4531
4532	dvp = ITOV(dp);
4533	jaddref = NULL;
4534	if (DOINGSUJ(dvp))
4535		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4536		    ip->i_mode);
4537	ACQUIRE_LOCK(&lk);
4538	inodedep = inodedep_lookup_ip(ip);
4539	if (jaddref)
4540		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4541		    if_deps);
4542	softdep_prelink(dvp, ITOV(ip));
4543	FREE_LOCK(&lk);
4544}
4545
4546/*
4547 * Called to create the jaddref structures to track . and .. references as
4548 * well as lookup and further initialize the incomplete jaddref created
4549 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4550 * nlinkdelta for non-journaling softdep.
4551 */
4552void
4553softdep_setup_mkdir(dp, ip)
4554	struct inode *dp;
4555	struct inode *ip;
4556{
4557	struct inodedep *inodedep;
4558	struct jaddref *dotdotaddref;
4559	struct jaddref *dotaddref;
4560	struct jaddref *jaddref;
4561	struct vnode *dvp;
4562
4563	dvp = ITOV(dp);
4564	dotaddref = dotdotaddref = NULL;
4565	if (DOINGSUJ(dvp)) {
4566		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4567		    ip->i_mode);
4568		dotaddref->ja_state |= MKDIR_BODY;
4569		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4570		    dp->i_effnlink - 1, dp->i_mode);
4571		dotdotaddref->ja_state |= MKDIR_PARENT;
4572	}
4573	ACQUIRE_LOCK(&lk);
4574	inodedep = inodedep_lookup_ip(ip);
4575	if (DOINGSUJ(dvp)) {
4576		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4577		    inoreflst);
4578		KASSERT(jaddref != NULL,
4579		    ("softdep_setup_mkdir: No addref structure present."));
4580		KASSERT(jaddref->ja_parent == dp->i_number,
4581		    ("softdep_setup_mkdir: bad parent %ju",
4582		    (uintmax_t)jaddref->ja_parent));
4583		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4584		    if_deps);
4585	}
4586	inodedep = inodedep_lookup_ip(dp);
4587	if (DOINGSUJ(dvp))
4588		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4589		    &dotdotaddref->ja_ref, if_deps);
4590	softdep_prelink(ITOV(dp), NULL);
4591	FREE_LOCK(&lk);
4592}
4593
4594/*
4595 * Called to track nlinkdelta of the inode and parent directories prior to
4596 * unlinking a directory.
4597 */
4598void
4599softdep_setup_rmdir(dp, ip)
4600	struct inode *dp;
4601	struct inode *ip;
4602{
4603	struct vnode *dvp;
4604
4605	dvp = ITOV(dp);
4606	ACQUIRE_LOCK(&lk);
4607	(void) inodedep_lookup_ip(ip);
4608	(void) inodedep_lookup_ip(dp);
4609	softdep_prelink(dvp, ITOV(ip));
4610	FREE_LOCK(&lk);
4611}
4612
4613/*
4614 * Called to track nlinkdelta of the inode and parent directories prior to
4615 * unlink.
4616 */
4617void
4618softdep_setup_unlink(dp, ip)
4619	struct inode *dp;
4620	struct inode *ip;
4621{
4622	struct vnode *dvp;
4623
4624	dvp = ITOV(dp);
4625	ACQUIRE_LOCK(&lk);
4626	(void) inodedep_lookup_ip(ip);
4627	(void) inodedep_lookup_ip(dp);
4628	softdep_prelink(dvp, ITOV(ip));
4629	FREE_LOCK(&lk);
4630}
4631
4632/*
4633 * Called to release the journal structures created by a failed non-directory
4634 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4635 */
4636void
4637softdep_revert_create(dp, ip)
4638	struct inode *dp;
4639	struct inode *ip;
4640{
4641	struct inodedep *inodedep;
4642	struct jaddref *jaddref;
4643	struct vnode *dvp;
4644
4645	dvp = ITOV(dp);
4646	ACQUIRE_LOCK(&lk);
4647	inodedep = inodedep_lookup_ip(ip);
4648	if (DOINGSUJ(dvp)) {
4649		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4650		    inoreflst);
4651		KASSERT(jaddref->ja_parent == dp->i_number,
4652		    ("softdep_revert_create: addref parent mismatch"));
4653		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4654	}
4655	FREE_LOCK(&lk);
4656}
4657
4658/*
4659 * Called to release the journal structures created by a failed dotdot link
4660 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4661 */
4662void
4663softdep_revert_dotdot_link(dp, ip)
4664	struct inode *dp;
4665	struct inode *ip;
4666{
4667	struct inodedep *inodedep;
4668	struct jaddref *jaddref;
4669	struct vnode *dvp;
4670
4671	dvp = ITOV(dp);
4672	ACQUIRE_LOCK(&lk);
4673	inodedep = inodedep_lookup_ip(dp);
4674	if (DOINGSUJ(dvp)) {
4675		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4676		    inoreflst);
4677		KASSERT(jaddref->ja_parent == ip->i_number,
4678		    ("softdep_revert_dotdot_link: addref parent mismatch"));
4679		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4680	}
4681	FREE_LOCK(&lk);
4682}
4683
4684/*
4685 * Called to release the journal structures created by a failed link
4686 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4687 */
4688void
4689softdep_revert_link(dp, ip)
4690	struct inode *dp;
4691	struct inode *ip;
4692{
4693	struct inodedep *inodedep;
4694	struct jaddref *jaddref;
4695	struct vnode *dvp;
4696
4697	dvp = ITOV(dp);
4698	ACQUIRE_LOCK(&lk);
4699	inodedep = inodedep_lookup_ip(ip);
4700	if (DOINGSUJ(dvp)) {
4701		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4702		    inoreflst);
4703		KASSERT(jaddref->ja_parent == dp->i_number,
4704		    ("softdep_revert_link: addref parent mismatch"));
4705		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4706	}
4707	FREE_LOCK(&lk);
4708}
4709
4710/*
4711 * Called to release the journal structures created by a failed mkdir
4712 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4713 */
4714void
4715softdep_revert_mkdir(dp, ip)
4716	struct inode *dp;
4717	struct inode *ip;
4718{
4719	struct inodedep *inodedep;
4720	struct jaddref *jaddref;
4721	struct jaddref *dotaddref;
4722	struct vnode *dvp;
4723
4724	dvp = ITOV(dp);
4725
4726	ACQUIRE_LOCK(&lk);
4727	inodedep = inodedep_lookup_ip(dp);
4728	if (DOINGSUJ(dvp)) {
4729		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4730		    inoreflst);
4731		KASSERT(jaddref->ja_parent == ip->i_number,
4732		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4733		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4734	}
4735	inodedep = inodedep_lookup_ip(ip);
4736	if (DOINGSUJ(dvp)) {
4737		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4738		    inoreflst);
4739		KASSERT(jaddref->ja_parent == dp->i_number,
4740		    ("softdep_revert_mkdir: addref parent mismatch"));
4741		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4742		    inoreflst, if_deps);
4743		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4744		KASSERT(dotaddref->ja_parent == ip->i_number,
4745		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4746		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4747	}
4748	FREE_LOCK(&lk);
4749}
4750
4751/*
4752 * Called to correct nlinkdelta after a failed rmdir.
4753 */
4754void
4755softdep_revert_rmdir(dp, ip)
4756	struct inode *dp;
4757	struct inode *ip;
4758{
4759
4760	ACQUIRE_LOCK(&lk);
4761	(void) inodedep_lookup_ip(ip);
4762	(void) inodedep_lookup_ip(dp);
4763	FREE_LOCK(&lk);
4764}
4765
4766/*
4767 * Protecting the freemaps (or bitmaps).
4768 *
4769 * To eliminate the need to execute fsck before mounting a filesystem
4770 * after a power failure, one must (conservatively) guarantee that the
4771 * on-disk copy of the bitmaps never indicate that a live inode or block is
4772 * free.  So, when a block or inode is allocated, the bitmap should be
4773 * updated (on disk) before any new pointers.  When a block or inode is
4774 * freed, the bitmap should not be updated until all pointers have been
4775 * reset.  The latter dependency is handled by the delayed de-allocation
4776 * approach described below for block and inode de-allocation.  The former
4777 * dependency is handled by calling the following procedure when a block or
4778 * inode is allocated. When an inode is allocated an "inodedep" is created
4779 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4780 * Each "inodedep" is also inserted into the hash indexing structure so
4781 * that any additional link additions can be made dependent on the inode
4782 * allocation.
4783 *
4784 * The ufs filesystem maintains a number of free block counts (e.g., per
4785 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4786 * in addition to the bitmaps.  These counts are used to improve efficiency
4787 * during allocation and therefore must be consistent with the bitmaps.
4788 * There is no convenient way to guarantee post-crash consistency of these
4789 * counts with simple update ordering, for two main reasons: (1) The counts
4790 * and bitmaps for a single cylinder group block are not in the same disk
4791 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4792 * be written and the other not.  (2) Some of the counts are located in the
4793 * superblock rather than the cylinder group block. So, we focus our soft
4794 * updates implementation on protecting the bitmaps. When mounting a
4795 * filesystem, we recompute the auxiliary counts from the bitmaps.
4796 */
4797
4798/*
4799 * Called just after updating the cylinder group block to allocate an inode.
4800 */
4801void
4802softdep_setup_inomapdep(bp, ip, newinum, mode)
4803	struct buf *bp;		/* buffer for cylgroup block with inode map */
4804	struct inode *ip;	/* inode related to allocation */
4805	ino_t newinum;		/* new inode number being allocated */
4806	int mode;
4807{
4808	struct inodedep *inodedep;
4809	struct bmsafemap *bmsafemap;
4810	struct jaddref *jaddref;
4811	struct mount *mp;
4812	struct fs *fs;
4813
4814	mp = UFSTOVFS(ip->i_ump);
4815	fs = ip->i_ump->um_fs;
4816	jaddref = NULL;
4817
4818	/*
4819	 * Allocate the journal reference add structure so that the bitmap
4820	 * can be dependent on it.
4821	 */
4822	if (MOUNTEDSUJ(mp)) {
4823		jaddref = newjaddref(ip, newinum, 0, 0, mode);
4824		jaddref->ja_state |= NEWBLOCK;
4825	}
4826
4827	/*
4828	 * Create a dependency for the newly allocated inode.
4829	 * Panic if it already exists as something is seriously wrong.
4830	 * Otherwise add it to the dependency list for the buffer holding
4831	 * the cylinder group map from which it was allocated.
4832	 *
4833	 * We have to preallocate a bmsafemap entry in case it is needed
4834	 * in bmsafemap_lookup since once we allocate the inodedep, we
4835	 * have to finish initializing it before we can FREE_LOCK().
4836	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
4837	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
4838	 * creating the inodedep as it can be freed during the time
4839	 * that we FREE_LOCK() while allocating the inodedep. We must
4840	 * call workitem_alloc() before entering the locked section as
4841	 * it also acquires the lock and we must avoid trying doing so
4842	 * recursively.
4843	 */
4844	bmsafemap = malloc(sizeof(struct bmsafemap),
4845	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4846	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4847	ACQUIRE_LOCK(&lk);
4848	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
4849		panic("softdep_setup_inomapdep: dependency %p for new"
4850		    "inode already exists", inodedep);
4851	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
4852	if (jaddref) {
4853		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4854		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4855		    if_deps);
4856	} else {
4857		inodedep->id_state |= ONDEPLIST;
4858		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4859	}
4860	inodedep->id_bmsafemap = bmsafemap;
4861	inodedep->id_state &= ~DEPCOMPLETE;
4862	FREE_LOCK(&lk);
4863}
4864
4865/*
4866 * Called just after updating the cylinder group block to
4867 * allocate block or fragment.
4868 */
4869void
4870softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4871	struct buf *bp;		/* buffer for cylgroup block with block map */
4872	struct mount *mp;	/* filesystem doing allocation */
4873	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4874	int frags;		/* Number of fragments. */
4875	int oldfrags;		/* Previous number of fragments for extend. */
4876{
4877	struct newblk *newblk;
4878	struct bmsafemap *bmsafemap;
4879	struct jnewblk *jnewblk;
4880	struct fs *fs;
4881
4882	fs = VFSTOUFS(mp)->um_fs;
4883	jnewblk = NULL;
4884	/*
4885	 * Create a dependency for the newly allocated block.
4886	 * Add it to the dependency list for the buffer holding
4887	 * the cylinder group map from which it was allocated.
4888	 */
4889	if (MOUNTEDSUJ(mp)) {
4890		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4891		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4892		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4893		jnewblk->jn_state = ATTACHED;
4894		jnewblk->jn_blkno = newblkno;
4895		jnewblk->jn_frags = frags;
4896		jnewblk->jn_oldfrags = oldfrags;
4897#ifdef SUJ_DEBUG
4898		{
4899			struct cg *cgp;
4900			uint8_t *blksfree;
4901			long bno;
4902			int i;
4903
4904			cgp = (struct cg *)bp->b_data;
4905			blksfree = cg_blksfree(cgp);
4906			bno = dtogd(fs, jnewblk->jn_blkno);
4907			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4908			    i++) {
4909				if (isset(blksfree, bno + i))
4910					panic("softdep_setup_blkmapdep: "
4911					    "free fragment %d from %d-%d "
4912					    "state 0x%X dep %p", i,
4913					    jnewblk->jn_oldfrags,
4914					    jnewblk->jn_frags,
4915					    jnewblk->jn_state,
4916					    jnewblk->jn_dep);
4917			}
4918		}
4919#endif
4920	}
4921
4922	CTR3(KTR_SUJ,
4923	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
4924	    newblkno, frags, oldfrags);
4925	ACQUIRE_LOCK(&lk);
4926	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4927		panic("softdep_setup_blkmapdep: found block");
4928	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4929	    dtog(fs, newblkno), NULL);
4930	if (jnewblk) {
4931		jnewblk->jn_dep = (struct worklist *)newblk;
4932		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4933	} else {
4934		newblk->nb_state |= ONDEPLIST;
4935		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4936	}
4937	newblk->nb_bmsafemap = bmsafemap;
4938	newblk->nb_jnewblk = jnewblk;
4939	FREE_LOCK(&lk);
4940}
4941
4942#define	BMSAFEMAP_HASH(fs, cg) \
4943      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4944
4945static int
4946bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4947	struct bmsafemap_hashhead *bmsafemaphd;
4948	struct mount *mp;
4949	int cg;
4950	struct bmsafemap **bmsafemapp;
4951{
4952	struct bmsafemap *bmsafemap;
4953
4954	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4955		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4956			break;
4957	if (bmsafemap) {
4958		*bmsafemapp = bmsafemap;
4959		return (1);
4960	}
4961	*bmsafemapp = NULL;
4962
4963	return (0);
4964}
4965
4966/*
4967 * Find the bmsafemap associated with a cylinder group buffer.
4968 * If none exists, create one. The buffer must be locked when
4969 * this routine is called and this routine must be called with
4970 * the softdep lock held. To avoid giving up the lock while
4971 * allocating a new bmsafemap, a preallocated bmsafemap may be
4972 * provided. If it is provided but not needed, it is freed.
4973 */
4974static struct bmsafemap *
4975bmsafemap_lookup(mp, bp, cg, newbmsafemap)
4976	struct mount *mp;
4977	struct buf *bp;
4978	int cg;
4979	struct bmsafemap *newbmsafemap;
4980{
4981	struct bmsafemap_hashhead *bmsafemaphd;
4982	struct bmsafemap *bmsafemap, *collision;
4983	struct worklist *wk;
4984	struct fs *fs;
4985
4986	mtx_assert(&lk, MA_OWNED);
4987	if (bp)
4988		LIST_FOREACH(wk, &bp->b_dep, wk_list)
4989			if (wk->wk_type == D_BMSAFEMAP) {
4990				if (newbmsafemap)
4991					WORKITEM_FREE(newbmsafemap,D_BMSAFEMAP);
4992				return (WK_BMSAFEMAP(wk));
4993			}
4994	fs = VFSTOUFS(mp)->um_fs;
4995	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
4996	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) {
4997		if (newbmsafemap)
4998			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
4999		return (bmsafemap);
5000	}
5001	if (newbmsafemap) {
5002		bmsafemap = newbmsafemap;
5003	} else {
5004		FREE_LOCK(&lk);
5005		bmsafemap = malloc(sizeof(struct bmsafemap),
5006			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5007		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5008		ACQUIRE_LOCK(&lk);
5009	}
5010	bmsafemap->sm_buf = bp;
5011	LIST_INIT(&bmsafemap->sm_inodedephd);
5012	LIST_INIT(&bmsafemap->sm_inodedepwr);
5013	LIST_INIT(&bmsafemap->sm_newblkhd);
5014	LIST_INIT(&bmsafemap->sm_newblkwr);
5015	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5016	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5017	LIST_INIT(&bmsafemap->sm_freehd);
5018	LIST_INIT(&bmsafemap->sm_freewr);
5019	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
5020		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5021		return (collision);
5022	}
5023	bmsafemap->sm_cg = cg;
5024	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5025	LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next);
5026	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5027	return (bmsafemap);
5028}
5029
5030/*
5031 * Direct block allocation dependencies.
5032 *
5033 * When a new block is allocated, the corresponding disk locations must be
5034 * initialized (with zeros or new data) before the on-disk inode points to
5035 * them.  Also, the freemap from which the block was allocated must be
5036 * updated (on disk) before the inode's pointer. These two dependencies are
5037 * independent of each other and are needed for all file blocks and indirect
5038 * blocks that are pointed to directly by the inode.  Just before the
5039 * "in-core" version of the inode is updated with a newly allocated block
5040 * number, a procedure (below) is called to setup allocation dependency
5041 * structures.  These structures are removed when the corresponding
5042 * dependencies are satisfied or when the block allocation becomes obsolete
5043 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5044 * fragment that gets upgraded).  All of these cases are handled in
5045 * procedures described later.
5046 *
5047 * When a file extension causes a fragment to be upgraded, either to a larger
5048 * fragment or to a full block, the on-disk location may change (if the
5049 * previous fragment could not simply be extended). In this case, the old
5050 * fragment must be de-allocated, but not until after the inode's pointer has
5051 * been updated. In most cases, this is handled by later procedures, which
5052 * will construct a "freefrag" structure to be added to the workitem queue
5053 * when the inode update is complete (or obsolete).  The main exception to
5054 * this is when an allocation occurs while a pending allocation dependency
5055 * (for the same block pointer) remains.  This case is handled in the main
5056 * allocation dependency setup procedure by immediately freeing the
5057 * unreferenced fragments.
5058 */
5059void
5060softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5061	struct inode *ip;	/* inode to which block is being added */
5062	ufs_lbn_t off;		/* block pointer within inode */
5063	ufs2_daddr_t newblkno;	/* disk block number being added */
5064	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5065	long newsize;		/* size of new block */
5066	long oldsize;		/* size of new block */
5067	struct buf *bp;		/* bp for allocated block */
5068{
5069	struct allocdirect *adp, *oldadp;
5070	struct allocdirectlst *adphead;
5071	struct freefrag *freefrag;
5072	struct inodedep *inodedep;
5073	struct pagedep *pagedep;
5074	struct jnewblk *jnewblk;
5075	struct newblk *newblk;
5076	struct mount *mp;
5077	ufs_lbn_t lbn;
5078
5079	lbn = bp->b_lblkno;
5080	mp = UFSTOVFS(ip->i_ump);
5081	if (oldblkno && oldblkno != newblkno)
5082		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5083	else
5084		freefrag = NULL;
5085
5086	CTR6(KTR_SUJ,
5087	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5088	    "off %jd newsize %ld oldsize %d",
5089	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5090	ACQUIRE_LOCK(&lk);
5091	if (off >= NDADDR) {
5092		if (lbn > 0)
5093			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5094			    lbn, off);
5095		/* allocating an indirect block */
5096		if (oldblkno != 0)
5097			panic("softdep_setup_allocdirect: non-zero indir");
5098	} else {
5099		if (off != lbn)
5100			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5101			    lbn, off);
5102		/*
5103		 * Allocating a direct block.
5104		 *
5105		 * If we are allocating a directory block, then we must
5106		 * allocate an associated pagedep to track additions and
5107		 * deletions.
5108		 */
5109		if ((ip->i_mode & IFMT) == IFDIR)
5110			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5111			    &pagedep);
5112	}
5113	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5114		panic("softdep_setup_allocdirect: lost block");
5115	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5116	    ("softdep_setup_allocdirect: newblk already initialized"));
5117	/*
5118	 * Convert the newblk to an allocdirect.
5119	 */
5120	newblk->nb_list.wk_type = D_ALLOCDIRECT;
5121	adp = (struct allocdirect *)newblk;
5122	newblk->nb_freefrag = freefrag;
5123	adp->ad_offset = off;
5124	adp->ad_oldblkno = oldblkno;
5125	adp->ad_newsize = newsize;
5126	adp->ad_oldsize = oldsize;
5127
5128	/*
5129	 * Finish initializing the journal.
5130	 */
5131	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5132		jnewblk->jn_ino = ip->i_number;
5133		jnewblk->jn_lbn = lbn;
5134		add_to_journal(&jnewblk->jn_list);
5135	}
5136	if (freefrag && freefrag->ff_jdep != NULL &&
5137	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5138		add_to_journal(freefrag->ff_jdep);
5139	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5140	adp->ad_inodedep = inodedep;
5141
5142	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5143	/*
5144	 * The list of allocdirects must be kept in sorted and ascending
5145	 * order so that the rollback routines can quickly determine the
5146	 * first uncommitted block (the size of the file stored on disk
5147	 * ends at the end of the lowest committed fragment, or if there
5148	 * are no fragments, at the end of the highest committed block).
5149	 * Since files generally grow, the typical case is that the new
5150	 * block is to be added at the end of the list. We speed this
5151	 * special case by checking against the last allocdirect in the
5152	 * list before laboriously traversing the list looking for the
5153	 * insertion point.
5154	 */
5155	adphead = &inodedep->id_newinoupdt;
5156	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5157	if (oldadp == NULL || oldadp->ad_offset <= off) {
5158		/* insert at end of list */
5159		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5160		if (oldadp != NULL && oldadp->ad_offset == off)
5161			allocdirect_merge(adphead, adp, oldadp);
5162		FREE_LOCK(&lk);
5163		return;
5164	}
5165	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5166		if (oldadp->ad_offset >= off)
5167			break;
5168	}
5169	if (oldadp == NULL)
5170		panic("softdep_setup_allocdirect: lost entry");
5171	/* insert in middle of list */
5172	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5173	if (oldadp->ad_offset == off)
5174		allocdirect_merge(adphead, adp, oldadp);
5175
5176	FREE_LOCK(&lk);
5177}
5178
5179/*
5180 * Merge a newer and older journal record to be stored either in a
5181 * newblock or freefrag.  This handles aggregating journal records for
5182 * fragment allocation into a second record as well as replacing a
5183 * journal free with an aborted journal allocation.  A segment for the
5184 * oldest record will be placed on wkhd if it has been written.  If not
5185 * the segment for the newer record will suffice.
5186 */
5187static struct worklist *
5188jnewblk_merge(new, old, wkhd)
5189	struct worklist *new;
5190	struct worklist *old;
5191	struct workhead *wkhd;
5192{
5193	struct jnewblk *njnewblk;
5194	struct jnewblk *jnewblk;
5195
5196	/* Handle NULLs to simplify callers. */
5197	if (new == NULL)
5198		return (old);
5199	if (old == NULL)
5200		return (new);
5201	/* Replace a jfreefrag with a jnewblk. */
5202	if (new->wk_type == D_JFREEFRAG) {
5203		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5204			panic("jnewblk_merge: blkno mismatch: %p, %p",
5205			    old, new);
5206		cancel_jfreefrag(WK_JFREEFRAG(new));
5207		return (old);
5208	}
5209	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5210		panic("jnewblk_merge: Bad type: old %d new %d\n",
5211		    old->wk_type, new->wk_type);
5212	/*
5213	 * Handle merging of two jnewblk records that describe
5214	 * different sets of fragments in the same block.
5215	 */
5216	jnewblk = WK_JNEWBLK(old);
5217	njnewblk = WK_JNEWBLK(new);
5218	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5219		panic("jnewblk_merge: Merging disparate blocks.");
5220	/*
5221	 * The record may be rolled back in the cg.
5222	 */
5223	if (jnewblk->jn_state & UNDONE) {
5224		jnewblk->jn_state &= ~UNDONE;
5225		njnewblk->jn_state |= UNDONE;
5226		njnewblk->jn_state &= ~ATTACHED;
5227	}
5228	/*
5229	 * We modify the newer addref and free the older so that if neither
5230	 * has been written the most up-to-date copy will be on disk.  If
5231	 * both have been written but rolled back we only temporarily need
5232	 * one of them to fix the bits when the cg write completes.
5233	 */
5234	jnewblk->jn_state |= ATTACHED | COMPLETE;
5235	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5236	cancel_jnewblk(jnewblk, wkhd);
5237	WORKLIST_REMOVE(&jnewblk->jn_list);
5238	free_jnewblk(jnewblk);
5239	return (new);
5240}
5241
5242/*
5243 * Replace an old allocdirect dependency with a newer one.
5244 * This routine must be called with splbio interrupts blocked.
5245 */
5246static void
5247allocdirect_merge(adphead, newadp, oldadp)
5248	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5249	struct allocdirect *newadp;	/* allocdirect being added */
5250	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5251{
5252	struct worklist *wk;
5253	struct freefrag *freefrag;
5254
5255	freefrag = NULL;
5256	mtx_assert(&lk, MA_OWNED);
5257	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5258	    newadp->ad_oldsize != oldadp->ad_newsize ||
5259	    newadp->ad_offset >= NDADDR)
5260		panic("%s %jd != new %jd || old size %ld != new %ld",
5261		    "allocdirect_merge: old blkno",
5262		    (intmax_t)newadp->ad_oldblkno,
5263		    (intmax_t)oldadp->ad_newblkno,
5264		    newadp->ad_oldsize, oldadp->ad_newsize);
5265	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5266	newadp->ad_oldsize = oldadp->ad_oldsize;
5267	/*
5268	 * If the old dependency had a fragment to free or had never
5269	 * previously had a block allocated, then the new dependency
5270	 * can immediately post its freefrag and adopt the old freefrag.
5271	 * This action is done by swapping the freefrag dependencies.
5272	 * The new dependency gains the old one's freefrag, and the
5273	 * old one gets the new one and then immediately puts it on
5274	 * the worklist when it is freed by free_newblk. It is
5275	 * not possible to do this swap when the old dependency had a
5276	 * non-zero size but no previous fragment to free. This condition
5277	 * arises when the new block is an extension of the old block.
5278	 * Here, the first part of the fragment allocated to the new
5279	 * dependency is part of the block currently claimed on disk by
5280	 * the old dependency, so cannot legitimately be freed until the
5281	 * conditions for the new dependency are fulfilled.
5282	 */
5283	freefrag = newadp->ad_freefrag;
5284	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5285		newadp->ad_freefrag = oldadp->ad_freefrag;
5286		oldadp->ad_freefrag = freefrag;
5287	}
5288	/*
5289	 * If we are tracking a new directory-block allocation,
5290	 * move it from the old allocdirect to the new allocdirect.
5291	 */
5292	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5293		WORKLIST_REMOVE(wk);
5294		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5295			panic("allocdirect_merge: extra newdirblk");
5296		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5297	}
5298	TAILQ_REMOVE(adphead, oldadp, ad_next);
5299	/*
5300	 * We need to move any journal dependencies over to the freefrag
5301	 * that releases this block if it exists.  Otherwise we are
5302	 * extending an existing block and we'll wait until that is
5303	 * complete to release the journal space and extend the
5304	 * new journal to cover this old space as well.
5305	 */
5306	if (freefrag == NULL) {
5307		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5308			panic("allocdirect_merge: %jd != %jd",
5309			    oldadp->ad_newblkno, newadp->ad_newblkno);
5310		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5311		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5312		    &oldadp->ad_block.nb_jnewblk->jn_list,
5313		    &newadp->ad_block.nb_jwork);
5314		oldadp->ad_block.nb_jnewblk = NULL;
5315		cancel_newblk(&oldadp->ad_block, NULL,
5316		    &newadp->ad_block.nb_jwork);
5317	} else {
5318		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5319		    &freefrag->ff_list, &freefrag->ff_jwork);
5320		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5321		    &freefrag->ff_jwork);
5322	}
5323	free_newblk(&oldadp->ad_block);
5324}
5325
5326/*
5327 * Allocate a jfreefrag structure to journal a single block free.
5328 */
5329static struct jfreefrag *
5330newjfreefrag(freefrag, ip, blkno, size, lbn)
5331	struct freefrag *freefrag;
5332	struct inode *ip;
5333	ufs2_daddr_t blkno;
5334	long size;
5335	ufs_lbn_t lbn;
5336{
5337	struct jfreefrag *jfreefrag;
5338	struct fs *fs;
5339
5340	fs = ip->i_fs;
5341	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5342	    M_SOFTDEP_FLAGS);
5343	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5344	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5345	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5346	jfreefrag->fr_ino = ip->i_number;
5347	jfreefrag->fr_lbn = lbn;
5348	jfreefrag->fr_blkno = blkno;
5349	jfreefrag->fr_frags = numfrags(fs, size);
5350	jfreefrag->fr_freefrag = freefrag;
5351
5352	return (jfreefrag);
5353}
5354
5355/*
5356 * Allocate a new freefrag structure.
5357 */
5358static struct freefrag *
5359newfreefrag(ip, blkno, size, lbn)
5360	struct inode *ip;
5361	ufs2_daddr_t blkno;
5362	long size;
5363	ufs_lbn_t lbn;
5364{
5365	struct freefrag *freefrag;
5366	struct fs *fs;
5367
5368	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5369	    ip->i_number, blkno, size, lbn);
5370	fs = ip->i_fs;
5371	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5372		panic("newfreefrag: frag size");
5373	freefrag = malloc(sizeof(struct freefrag),
5374	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5375	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5376	freefrag->ff_state = ATTACHED;
5377	LIST_INIT(&freefrag->ff_jwork);
5378	freefrag->ff_inum = ip->i_number;
5379	freefrag->ff_vtype = ITOV(ip)->v_type;
5380	freefrag->ff_blkno = blkno;
5381	freefrag->ff_fragsize = size;
5382
5383	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5384		freefrag->ff_jdep = (struct worklist *)
5385		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5386	} else {
5387		freefrag->ff_state |= DEPCOMPLETE;
5388		freefrag->ff_jdep = NULL;
5389	}
5390
5391	return (freefrag);
5392}
5393
5394/*
5395 * This workitem de-allocates fragments that were replaced during
5396 * file block allocation.
5397 */
5398static void
5399handle_workitem_freefrag(freefrag)
5400	struct freefrag *freefrag;
5401{
5402	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5403	struct workhead wkhd;
5404
5405	CTR3(KTR_SUJ,
5406	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5407	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5408	/*
5409	 * It would be illegal to add new completion items to the
5410	 * freefrag after it was schedule to be done so it must be
5411	 * safe to modify the list head here.
5412	 */
5413	LIST_INIT(&wkhd);
5414	ACQUIRE_LOCK(&lk);
5415	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5416	/*
5417	 * If the journal has not been written we must cancel it here.
5418	 */
5419	if (freefrag->ff_jdep) {
5420		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5421			panic("handle_workitem_freefrag: Unexpected type %d\n",
5422			    freefrag->ff_jdep->wk_type);
5423		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5424	}
5425	FREE_LOCK(&lk);
5426	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5427	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5428	ACQUIRE_LOCK(&lk);
5429	WORKITEM_FREE(freefrag, D_FREEFRAG);
5430	FREE_LOCK(&lk);
5431}
5432
5433/*
5434 * Set up a dependency structure for an external attributes data block.
5435 * This routine follows much of the structure of softdep_setup_allocdirect.
5436 * See the description of softdep_setup_allocdirect above for details.
5437 */
5438void
5439softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5440	struct inode *ip;
5441	ufs_lbn_t off;
5442	ufs2_daddr_t newblkno;
5443	ufs2_daddr_t oldblkno;
5444	long newsize;
5445	long oldsize;
5446	struct buf *bp;
5447{
5448	struct allocdirect *adp, *oldadp;
5449	struct allocdirectlst *adphead;
5450	struct freefrag *freefrag;
5451	struct inodedep *inodedep;
5452	struct jnewblk *jnewblk;
5453	struct newblk *newblk;
5454	struct mount *mp;
5455	ufs_lbn_t lbn;
5456
5457	if (off >= NXADDR)
5458		panic("softdep_setup_allocext: lbn %lld > NXADDR",
5459		    (long long)off);
5460
5461	lbn = bp->b_lblkno;
5462	mp = UFSTOVFS(ip->i_ump);
5463	if (oldblkno && oldblkno != newblkno)
5464		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5465	else
5466		freefrag = NULL;
5467
5468	ACQUIRE_LOCK(&lk);
5469	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5470		panic("softdep_setup_allocext: lost block");
5471	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5472	    ("softdep_setup_allocext: newblk already initialized"));
5473	/*
5474	 * Convert the newblk to an allocdirect.
5475	 */
5476	newblk->nb_list.wk_type = D_ALLOCDIRECT;
5477	adp = (struct allocdirect *)newblk;
5478	newblk->nb_freefrag = freefrag;
5479	adp->ad_offset = off;
5480	adp->ad_oldblkno = oldblkno;
5481	adp->ad_newsize = newsize;
5482	adp->ad_oldsize = oldsize;
5483	adp->ad_state |=  EXTDATA;
5484
5485	/*
5486	 * Finish initializing the journal.
5487	 */
5488	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5489		jnewblk->jn_ino = ip->i_number;
5490		jnewblk->jn_lbn = lbn;
5491		add_to_journal(&jnewblk->jn_list);
5492	}
5493	if (freefrag && freefrag->ff_jdep != NULL &&
5494	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5495		add_to_journal(freefrag->ff_jdep);
5496	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5497	adp->ad_inodedep = inodedep;
5498
5499	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5500	/*
5501	 * The list of allocdirects must be kept in sorted and ascending
5502	 * order so that the rollback routines can quickly determine the
5503	 * first uncommitted block (the size of the file stored on disk
5504	 * ends at the end of the lowest committed fragment, or if there
5505	 * are no fragments, at the end of the highest committed block).
5506	 * Since files generally grow, the typical case is that the new
5507	 * block is to be added at the end of the list. We speed this
5508	 * special case by checking against the last allocdirect in the
5509	 * list before laboriously traversing the list looking for the
5510	 * insertion point.
5511	 */
5512	adphead = &inodedep->id_newextupdt;
5513	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5514	if (oldadp == NULL || oldadp->ad_offset <= off) {
5515		/* insert at end of list */
5516		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5517		if (oldadp != NULL && oldadp->ad_offset == off)
5518			allocdirect_merge(adphead, adp, oldadp);
5519		FREE_LOCK(&lk);
5520		return;
5521	}
5522	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5523		if (oldadp->ad_offset >= off)
5524			break;
5525	}
5526	if (oldadp == NULL)
5527		panic("softdep_setup_allocext: lost entry");
5528	/* insert in middle of list */
5529	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5530	if (oldadp->ad_offset == off)
5531		allocdirect_merge(adphead, adp, oldadp);
5532	FREE_LOCK(&lk);
5533}
5534
5535/*
5536 * Indirect block allocation dependencies.
5537 *
5538 * The same dependencies that exist for a direct block also exist when
5539 * a new block is allocated and pointed to by an entry in a block of
5540 * indirect pointers. The undo/redo states described above are also
5541 * used here. Because an indirect block contains many pointers that
5542 * may have dependencies, a second copy of the entire in-memory indirect
5543 * block is kept. The buffer cache copy is always completely up-to-date.
5544 * The second copy, which is used only as a source for disk writes,
5545 * contains only the safe pointers (i.e., those that have no remaining
5546 * update dependencies). The second copy is freed when all pointers
5547 * are safe. The cache is not allowed to replace indirect blocks with
5548 * pending update dependencies. If a buffer containing an indirect
5549 * block with dependencies is written, these routines will mark it
5550 * dirty again. It can only be successfully written once all the
5551 * dependencies are removed. The ffs_fsync routine in conjunction with
5552 * softdep_sync_metadata work together to get all the dependencies
5553 * removed so that a file can be successfully written to disk. Three
5554 * procedures are used when setting up indirect block pointer
5555 * dependencies. The division is necessary because of the organization
5556 * of the "balloc" routine and because of the distinction between file
5557 * pages and file metadata blocks.
5558 */
5559
5560/*
5561 * Allocate a new allocindir structure.
5562 */
5563static struct allocindir *
5564newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5565	struct inode *ip;	/* inode for file being extended */
5566	int ptrno;		/* offset of pointer in indirect block */
5567	ufs2_daddr_t newblkno;	/* disk block number being added */
5568	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5569	ufs_lbn_t lbn;
5570{
5571	struct newblk *newblk;
5572	struct allocindir *aip;
5573	struct freefrag *freefrag;
5574	struct jnewblk *jnewblk;
5575
5576	if (oldblkno)
5577		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5578	else
5579		freefrag = NULL;
5580	ACQUIRE_LOCK(&lk);
5581	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5582		panic("new_allocindir: lost block");
5583	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5584	    ("newallocindir: newblk already initialized"));
5585	newblk->nb_list.wk_type = D_ALLOCINDIR;
5586	newblk->nb_freefrag = freefrag;
5587	aip = (struct allocindir *)newblk;
5588	aip->ai_offset = ptrno;
5589	aip->ai_oldblkno = oldblkno;
5590	aip->ai_lbn = lbn;
5591	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5592		jnewblk->jn_ino = ip->i_number;
5593		jnewblk->jn_lbn = lbn;
5594		add_to_journal(&jnewblk->jn_list);
5595	}
5596	if (freefrag && freefrag->ff_jdep != NULL &&
5597	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5598		add_to_journal(freefrag->ff_jdep);
5599	return (aip);
5600}
5601
5602/*
5603 * Called just before setting an indirect block pointer
5604 * to a newly allocated file page.
5605 */
5606void
5607softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5608	struct inode *ip;	/* inode for file being extended */
5609	ufs_lbn_t lbn;		/* allocated block number within file */
5610	struct buf *bp;		/* buffer with indirect blk referencing page */
5611	int ptrno;		/* offset of pointer in indirect block */
5612	ufs2_daddr_t newblkno;	/* disk block number being added */
5613	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5614	struct buf *nbp;	/* buffer holding allocated page */
5615{
5616	struct inodedep *inodedep;
5617	struct freefrag *freefrag;
5618	struct allocindir *aip;
5619	struct pagedep *pagedep;
5620	struct mount *mp;
5621	int dflags;
5622
5623	if (lbn != nbp->b_lblkno)
5624		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5625		    lbn, bp->b_lblkno);
5626	CTR4(KTR_SUJ,
5627	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5628	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5629	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5630	mp = UFSTOVFS(ip->i_ump);
5631	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5632	dflags = DEPALLOC;
5633	if (IS_SNAPSHOT(ip))
5634		dflags |= NODELAY;
5635	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5636	/*
5637	 * If we are allocating a directory page, then we must
5638	 * allocate an associated pagedep to track additions and
5639	 * deletions.
5640	 */
5641	if ((ip->i_mode & IFMT) == IFDIR)
5642		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5643	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5644	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5645	FREE_LOCK(&lk);
5646	if (freefrag)
5647		handle_workitem_freefrag(freefrag);
5648}
5649
5650/*
5651 * Called just before setting an indirect block pointer to a
5652 * newly allocated indirect block.
5653 */
5654void
5655softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5656	struct buf *nbp;	/* newly allocated indirect block */
5657	struct inode *ip;	/* inode for file being extended */
5658	struct buf *bp;		/* indirect block referencing allocated block */
5659	int ptrno;		/* offset of pointer in indirect block */
5660	ufs2_daddr_t newblkno;	/* disk block number being added */
5661{
5662	struct inodedep *inodedep;
5663	struct allocindir *aip;
5664	ufs_lbn_t lbn;
5665	int dflags;
5666
5667	CTR3(KTR_SUJ,
5668	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5669	    ip->i_number, newblkno, ptrno);
5670	lbn = nbp->b_lblkno;
5671	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5672	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5673	dflags = DEPALLOC;
5674	if (IS_SNAPSHOT(ip))
5675		dflags |= NODELAY;
5676	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5677	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5678	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5679		panic("softdep_setup_allocindir_meta: Block already existed");
5680	FREE_LOCK(&lk);
5681}
5682
5683static void
5684indirdep_complete(indirdep)
5685	struct indirdep *indirdep;
5686{
5687	struct allocindir *aip;
5688
5689	LIST_REMOVE(indirdep, ir_next);
5690	indirdep->ir_state |= DEPCOMPLETE;
5691
5692	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5693		LIST_REMOVE(aip, ai_next);
5694		free_newblk(&aip->ai_block);
5695	}
5696	/*
5697	 * If this indirdep is not attached to a buf it was simply waiting
5698	 * on completion to clear completehd.  free_indirdep() asserts
5699	 * that nothing is dangling.
5700	 */
5701	if ((indirdep->ir_state & ONWORKLIST) == 0)
5702		free_indirdep(indirdep);
5703}
5704
5705static struct indirdep *
5706indirdep_lookup(mp, ip, bp)
5707	struct mount *mp;
5708	struct inode *ip;
5709	struct buf *bp;
5710{
5711	struct indirdep *indirdep, *newindirdep;
5712	struct newblk *newblk;
5713	struct worklist *wk;
5714	struct fs *fs;
5715	ufs2_daddr_t blkno;
5716
5717	mtx_assert(&lk, MA_OWNED);
5718	indirdep = NULL;
5719	newindirdep = NULL;
5720	fs = ip->i_fs;
5721	for (;;) {
5722		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5723			if (wk->wk_type != D_INDIRDEP)
5724				continue;
5725			indirdep = WK_INDIRDEP(wk);
5726			break;
5727		}
5728		/* Found on the buffer worklist, no new structure to free. */
5729		if (indirdep != NULL && newindirdep == NULL)
5730			return (indirdep);
5731		if (indirdep != NULL && newindirdep != NULL)
5732			panic("indirdep_lookup: simultaneous create");
5733		/* None found on the buffer and a new structure is ready. */
5734		if (indirdep == NULL && newindirdep != NULL)
5735			break;
5736		/* None found and no new structure available. */
5737		FREE_LOCK(&lk);
5738		newindirdep = malloc(sizeof(struct indirdep),
5739		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5740		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5741		newindirdep->ir_state = ATTACHED;
5742		if (ip->i_ump->um_fstype == UFS1)
5743			newindirdep->ir_state |= UFS1FMT;
5744		TAILQ_INIT(&newindirdep->ir_trunc);
5745		newindirdep->ir_saveddata = NULL;
5746		LIST_INIT(&newindirdep->ir_deplisthd);
5747		LIST_INIT(&newindirdep->ir_donehd);
5748		LIST_INIT(&newindirdep->ir_writehd);
5749		LIST_INIT(&newindirdep->ir_completehd);
5750		if (bp->b_blkno == bp->b_lblkno) {
5751			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5752			    NULL, NULL);
5753			bp->b_blkno = blkno;
5754		}
5755		newindirdep->ir_freeblks = NULL;
5756		newindirdep->ir_savebp =
5757		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5758		newindirdep->ir_bp = bp;
5759		BUF_KERNPROC(newindirdep->ir_savebp);
5760		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5761		ACQUIRE_LOCK(&lk);
5762	}
5763	indirdep = newindirdep;
5764	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5765	/*
5766	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5767	 * that we don't free dependencies until the pointers are valid.
5768	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5769	 * than using the hash.
5770	 */
5771	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5772		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5773	else
5774		indirdep->ir_state |= DEPCOMPLETE;
5775	return (indirdep);
5776}
5777
5778/*
5779 * Called to finish the allocation of the "aip" allocated
5780 * by one of the two routines above.
5781 */
5782static struct freefrag *
5783setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5784	struct buf *bp;		/* in-memory copy of the indirect block */
5785	struct inode *ip;	/* inode for file being extended */
5786	struct inodedep *inodedep; /* Inodedep for ip */
5787	struct allocindir *aip;	/* allocindir allocated by the above routines */
5788	ufs_lbn_t lbn;		/* Logical block number for this block. */
5789{
5790	struct fs *fs;
5791	struct indirdep *indirdep;
5792	struct allocindir *oldaip;
5793	struct freefrag *freefrag;
5794	struct mount *mp;
5795
5796	mtx_assert(&lk, MA_OWNED);
5797	mp = UFSTOVFS(ip->i_ump);
5798	fs = ip->i_fs;
5799	if (bp->b_lblkno >= 0)
5800		panic("setup_allocindir_phase2: not indir blk");
5801	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
5802	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
5803	indirdep = indirdep_lookup(mp, ip, bp);
5804	KASSERT(indirdep->ir_savebp != NULL,
5805	    ("setup_allocindir_phase2 NULL ir_savebp"));
5806	aip->ai_indirdep = indirdep;
5807	/*
5808	 * Check for an unwritten dependency for this indirect offset.  If
5809	 * there is, merge the old dependency into the new one.  This happens
5810	 * as a result of reallocblk only.
5811	 */
5812	freefrag = NULL;
5813	if (aip->ai_oldblkno != 0) {
5814		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
5815			if (oldaip->ai_offset == aip->ai_offset) {
5816				freefrag = allocindir_merge(aip, oldaip);
5817				goto done;
5818			}
5819		}
5820		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
5821			if (oldaip->ai_offset == aip->ai_offset) {
5822				freefrag = allocindir_merge(aip, oldaip);
5823				goto done;
5824			}
5825		}
5826	}
5827done:
5828	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5829	return (freefrag);
5830}
5831
5832/*
5833 * Merge two allocindirs which refer to the same block.  Move newblock
5834 * dependencies and setup the freefrags appropriately.
5835 */
5836static struct freefrag *
5837allocindir_merge(aip, oldaip)
5838	struct allocindir *aip;
5839	struct allocindir *oldaip;
5840{
5841	struct freefrag *freefrag;
5842	struct worklist *wk;
5843
5844	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5845		panic("allocindir_merge: blkno");
5846	aip->ai_oldblkno = oldaip->ai_oldblkno;
5847	freefrag = aip->ai_freefrag;
5848	aip->ai_freefrag = oldaip->ai_freefrag;
5849	oldaip->ai_freefrag = NULL;
5850	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
5851	/*
5852	 * If we are tracking a new directory-block allocation,
5853	 * move it from the old allocindir to the new allocindir.
5854	 */
5855	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5856		WORKLIST_REMOVE(wk);
5857		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5858			panic("allocindir_merge: extra newdirblk");
5859		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
5860	}
5861	/*
5862	 * We can skip journaling for this freefrag and just complete
5863	 * any pending journal work for the allocindir that is being
5864	 * removed after the freefrag completes.
5865	 */
5866	if (freefrag->ff_jdep)
5867		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
5868	LIST_REMOVE(oldaip, ai_next);
5869	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
5870	    &freefrag->ff_list, &freefrag->ff_jwork);
5871	free_newblk(&oldaip->ai_block);
5872
5873	return (freefrag);
5874}
5875
5876static inline void
5877setup_freedirect(freeblks, ip, i, needj)
5878	struct freeblks *freeblks;
5879	struct inode *ip;
5880	int i;
5881	int needj;
5882{
5883	ufs2_daddr_t blkno;
5884	int frags;
5885
5886	blkno = DIP(ip, i_db[i]);
5887	if (blkno == 0)
5888		return;
5889	DIP_SET(ip, i_db[i], 0);
5890	frags = sblksize(ip->i_fs, ip->i_size, i);
5891	frags = numfrags(ip->i_fs, frags);
5892	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
5893}
5894
5895static inline void
5896setup_freeext(freeblks, ip, i, needj)
5897	struct freeblks *freeblks;
5898	struct inode *ip;
5899	int i;
5900	int needj;
5901{
5902	ufs2_daddr_t blkno;
5903	int frags;
5904
5905	blkno = ip->i_din2->di_extb[i];
5906	if (blkno == 0)
5907		return;
5908	ip->i_din2->di_extb[i] = 0;
5909	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
5910	frags = numfrags(ip->i_fs, frags);
5911	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
5912}
5913
5914static inline void
5915setup_freeindir(freeblks, ip, i, lbn, needj)
5916	struct freeblks *freeblks;
5917	struct inode *ip;
5918	int i;
5919	ufs_lbn_t lbn;
5920	int needj;
5921{
5922	ufs2_daddr_t blkno;
5923
5924	blkno = DIP(ip, i_ib[i]);
5925	if (blkno == 0)
5926		return;
5927	DIP_SET(ip, i_ib[i], 0);
5928	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
5929	    0, needj);
5930}
5931
5932static inline struct freeblks *
5933newfreeblks(mp, ip)
5934	struct mount *mp;
5935	struct inode *ip;
5936{
5937	struct freeblks *freeblks;
5938
5939	freeblks = malloc(sizeof(struct freeblks),
5940		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5941	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5942	LIST_INIT(&freeblks->fb_jblkdephd);
5943	LIST_INIT(&freeblks->fb_jwork);
5944	freeblks->fb_ref = 0;
5945	freeblks->fb_cgwait = 0;
5946	freeblks->fb_state = ATTACHED;
5947	freeblks->fb_uid = ip->i_uid;
5948	freeblks->fb_inum = ip->i_number;
5949	freeblks->fb_vtype = ITOV(ip)->v_type;
5950	freeblks->fb_modrev = DIP(ip, i_modrev);
5951	freeblks->fb_devvp = ip->i_devvp;
5952	freeblks->fb_chkcnt = 0;
5953	freeblks->fb_len = 0;
5954
5955	return (freeblks);
5956}
5957
5958static void
5959trunc_indirdep(indirdep, freeblks, bp, off)
5960	struct indirdep *indirdep;
5961	struct freeblks *freeblks;
5962	struct buf *bp;
5963	int off;
5964{
5965	struct allocindir *aip, *aipn;
5966
5967	/*
5968	 * The first set of allocindirs won't be in savedbp.
5969	 */
5970	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
5971		if (aip->ai_offset > off)
5972			cancel_allocindir(aip, bp, freeblks, 1);
5973	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
5974		if (aip->ai_offset > off)
5975			cancel_allocindir(aip, bp, freeblks, 1);
5976	/*
5977	 * These will exist in savedbp.
5978	 */
5979	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
5980		if (aip->ai_offset > off)
5981			cancel_allocindir(aip, NULL, freeblks, 0);
5982	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
5983		if (aip->ai_offset > off)
5984			cancel_allocindir(aip, NULL, freeblks, 0);
5985}
5986
5987/*
5988 * Follow the chain of indirects down to lastlbn creating a freework
5989 * structure for each.  This will be used to start indir_trunc() at
5990 * the right offset and create the journal records for the parrtial
5991 * truncation.  A second step will handle the truncated dependencies.
5992 */
5993static int
5994setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
5995	struct freeblks *freeblks;
5996	struct inode *ip;
5997	ufs_lbn_t lbn;
5998	ufs_lbn_t lastlbn;
5999	ufs2_daddr_t blkno;
6000{
6001	struct indirdep *indirdep;
6002	struct indirdep *indirn;
6003	struct freework *freework;
6004	struct newblk *newblk;
6005	struct mount *mp;
6006	struct buf *bp;
6007	uint8_t *start;
6008	uint8_t *end;
6009	ufs_lbn_t lbnadd;
6010	int level;
6011	int error;
6012	int off;
6013
6014
6015	freework = NULL;
6016	if (blkno == 0)
6017		return (0);
6018	mp = freeblks->fb_list.wk_mp;
6019	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6020	if ((bp->b_flags & B_CACHE) == 0) {
6021		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6022		bp->b_iocmd = BIO_READ;
6023		bp->b_flags &= ~B_INVAL;
6024		bp->b_ioflags &= ~BIO_ERROR;
6025		vfs_busy_pages(bp, 0);
6026		bp->b_iooffset = dbtob(bp->b_blkno);
6027		bstrategy(bp);
6028		curthread->td_ru.ru_inblock++;
6029		error = bufwait(bp);
6030		if (error) {
6031			brelse(bp);
6032			return (error);
6033		}
6034	}
6035	level = lbn_level(lbn);
6036	lbnadd = lbn_offset(ip->i_fs, level);
6037	/*
6038	 * Compute the offset of the last block we want to keep.  Store
6039	 * in the freework the first block we want to completely free.
6040	 */
6041	off = (lastlbn - -(lbn + level)) / lbnadd;
6042	if (off + 1 == NINDIR(ip->i_fs))
6043		goto nowork;
6044	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
6045	    0);
6046	/*
6047	 * Link the freework into the indirdep.  This will prevent any new
6048	 * allocations from proceeding until we are finished with the
6049	 * truncate and the block is written.
6050	 */
6051	ACQUIRE_LOCK(&lk);
6052	indirdep = indirdep_lookup(mp, ip, bp);
6053	if (indirdep->ir_freeblks)
6054		panic("setup_trunc_indir: indirdep already truncated.");
6055	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6056	freework->fw_indir = indirdep;
6057	/*
6058	 * Cancel any allocindirs that will not make it to disk.
6059	 * We have to do this for all copies of the indirdep that
6060	 * live on this newblk.
6061	 */
6062	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6063		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
6064		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6065			trunc_indirdep(indirn, freeblks, bp, off);
6066	} else
6067		trunc_indirdep(indirdep, freeblks, bp, off);
6068	FREE_LOCK(&lk);
6069	/*
6070	 * Creation is protected by the buf lock. The saveddata is only
6071	 * needed if a full truncation follows a partial truncation but it
6072	 * is difficult to allocate in that case so we fetch it anyway.
6073	 */
6074	if (indirdep->ir_saveddata == NULL)
6075		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6076		    M_SOFTDEP_FLAGS);
6077nowork:
6078	/* Fetch the blkno of the child and the zero start offset. */
6079	if (ip->i_ump->um_fstype == UFS1) {
6080		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6081		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6082	} else {
6083		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6084		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6085	}
6086	if (freework) {
6087		/* Zero the truncated pointers. */
6088		end = bp->b_data + bp->b_bcount;
6089		bzero(start, end - start);
6090		bdwrite(bp);
6091	} else
6092		bqrelse(bp);
6093	if (level == 0)
6094		return (0);
6095	lbn++; /* adjust level */
6096	lbn -= (off * lbnadd);
6097	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6098}
6099
6100/*
6101 * Complete the partial truncation of an indirect block setup by
6102 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6103 * copy and writes them to disk before the freeblks is allowed to complete.
6104 */
6105static void
6106complete_trunc_indir(freework)
6107	struct freework *freework;
6108{
6109	struct freework *fwn;
6110	struct indirdep *indirdep;
6111	struct buf *bp;
6112	uintptr_t start;
6113	int count;
6114
6115	indirdep = freework->fw_indir;
6116	for (;;) {
6117		bp = indirdep->ir_bp;
6118		/* See if the block was discarded. */
6119		if (bp == NULL)
6120			break;
6121		/* Inline part of getdirtybuf().  We dont want bremfree. */
6122		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6123			break;
6124		if (BUF_LOCK(bp,
6125		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0)
6126			BUF_UNLOCK(bp);
6127		ACQUIRE_LOCK(&lk);
6128	}
6129	mtx_assert(&lk, MA_OWNED);
6130	freework->fw_state |= DEPCOMPLETE;
6131	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6132	/*
6133	 * Zero the pointers in the saved copy.
6134	 */
6135	if (indirdep->ir_state & UFS1FMT)
6136		start = sizeof(ufs1_daddr_t);
6137	else
6138		start = sizeof(ufs2_daddr_t);
6139	start *= freework->fw_start;
6140	count = indirdep->ir_savebp->b_bcount - start;
6141	start += (uintptr_t)indirdep->ir_savebp->b_data;
6142	bzero((char *)start, count);
6143	/*
6144	 * We need to start the next truncation in the list if it has not
6145	 * been started yet.
6146	 */
6147	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6148	if (fwn != NULL) {
6149		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6150			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6151		if ((fwn->fw_state & ONWORKLIST) == 0)
6152			freework_enqueue(fwn);
6153	}
6154	/*
6155	 * If bp is NULL the block was fully truncated, restore
6156	 * the saved block list otherwise free it if it is no
6157	 * longer needed.
6158	 */
6159	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6160		if (bp == NULL)
6161			bcopy(indirdep->ir_saveddata,
6162			    indirdep->ir_savebp->b_data,
6163			    indirdep->ir_savebp->b_bcount);
6164		free(indirdep->ir_saveddata, M_INDIRDEP);
6165		indirdep->ir_saveddata = NULL;
6166	}
6167	/*
6168	 * When bp is NULL there is a full truncation pending.  We
6169	 * must wait for this full truncation to be journaled before
6170	 * we can release this freework because the disk pointers will
6171	 * never be written as zero.
6172	 */
6173	if (bp == NULL)  {
6174		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6175			handle_written_freework(freework);
6176		else
6177			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6178			   &freework->fw_list);
6179	} else {
6180		/* Complete when the real copy is written. */
6181		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6182		BUF_UNLOCK(bp);
6183	}
6184}
6185
6186/*
6187 * Calculate the number of blocks we are going to release where datablocks
6188 * is the current total and length is the new file size.
6189 */
6190ufs2_daddr_t
6191blkcount(fs, datablocks, length)
6192	struct fs *fs;
6193	ufs2_daddr_t datablocks;
6194	off_t length;
6195{
6196	off_t totblks, numblks;
6197
6198	totblks = 0;
6199	numblks = howmany(length, fs->fs_bsize);
6200	if (numblks <= NDADDR) {
6201		totblks = howmany(length, fs->fs_fsize);
6202		goto out;
6203	}
6204        totblks = blkstofrags(fs, numblks);
6205	numblks -= NDADDR;
6206	/*
6207	 * Count all single, then double, then triple indirects required.
6208	 * Subtracting one indirects worth of blocks for each pass
6209	 * acknowledges one of each pointed to by the inode.
6210	 */
6211	for (;;) {
6212		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6213		numblks -= NINDIR(fs);
6214		if (numblks <= 0)
6215			break;
6216		numblks = howmany(numblks, NINDIR(fs));
6217	}
6218out:
6219	totblks = fsbtodb(fs, totblks);
6220	/*
6221	 * Handle sparse files.  We can't reclaim more blocks than the inode
6222	 * references.  We will correct it later in handle_complete_freeblks()
6223	 * when we know the real count.
6224	 */
6225	if (totblks > datablocks)
6226		return (0);
6227	return (datablocks - totblks);
6228}
6229
6230/*
6231 * Handle freeblocks for journaled softupdate filesystems.
6232 *
6233 * Contrary to normal softupdates, we must preserve the block pointers in
6234 * indirects until their subordinates are free.  This is to avoid journaling
6235 * every block that is freed which may consume more space than the journal
6236 * itself.  The recovery program will see the free block journals at the
6237 * base of the truncated area and traverse them to reclaim space.  The
6238 * pointers in the inode may be cleared immediately after the journal
6239 * records are written because each direct and indirect pointer in the
6240 * inode is recorded in a journal.  This permits full truncation to proceed
6241 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6242 *
6243 * The algorithm is as follows:
6244 * 1) Traverse the in-memory state and create journal entries to release
6245 *    the relevant blocks and full indirect trees.
6246 * 2) Traverse the indirect block chain adding partial truncation freework
6247 *    records to indirects in the path to lastlbn.  The freework will
6248 *    prevent new allocation dependencies from being satisfied in this
6249 *    indirect until the truncation completes.
6250 * 3) Read and lock the inode block, performing an update with the new size
6251 *    and pointers.  This prevents truncated data from becoming valid on
6252 *    disk through step 4.
6253 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6254 *    eliminate journal work for those records that do not require it.
6255 * 5) Schedule the journal records to be written followed by the inode block.
6256 * 6) Allocate any necessary frags for the end of file.
6257 * 7) Zero any partially truncated blocks.
6258 *
6259 * From this truncation proceeds asynchronously using the freework and
6260 * indir_trunc machinery.  The file will not be extended again into a
6261 * partially truncated indirect block until all work is completed but
6262 * the normal dependency mechanism ensures that it is rolled back/forward
6263 * as appropriate.  Further truncation may occur without delay and is
6264 * serialized in indir_trunc().
6265 */
6266void
6267softdep_journal_freeblocks(ip, cred, length, flags)
6268	struct inode *ip;	/* The inode whose length is to be reduced */
6269	struct ucred *cred;
6270	off_t length;		/* The new length for the file */
6271	int flags;		/* IO_EXT and/or IO_NORMAL */
6272{
6273	struct freeblks *freeblks, *fbn;
6274	struct worklist *wk, *wkn;
6275	struct inodedep *inodedep;
6276	struct jblkdep *jblkdep;
6277	struct allocdirect *adp, *adpn;
6278	struct fs *fs;
6279	struct buf *bp;
6280	struct vnode *vp;
6281	struct mount *mp;
6282	ufs2_daddr_t extblocks, datablocks;
6283	ufs_lbn_t tmpval, lbn, lastlbn;
6284	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6285
6286	fs = ip->i_fs;
6287	mp = UFSTOVFS(ip->i_ump);
6288	vp = ITOV(ip);
6289	needj = 1;
6290	iboff = -1;
6291	allocblock = 0;
6292	extblocks = 0;
6293	datablocks = 0;
6294	frags = 0;
6295	freeblks = newfreeblks(mp, ip);
6296	ACQUIRE_LOCK(&lk);
6297	/*
6298	 * If we're truncating a removed file that will never be written
6299	 * we don't need to journal the block frees.  The canceled journals
6300	 * for the allocations will suffice.
6301	 */
6302	dflags = DEPALLOC;
6303	if (IS_SNAPSHOT(ip))
6304		dflags |= NODELAY;
6305	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6306	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6307	    length == 0)
6308		needj = 0;
6309	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6310	    ip->i_number, length, needj);
6311	FREE_LOCK(&lk);
6312	/*
6313	 * Calculate the lbn that we are truncating to.  This results in -1
6314	 * if we're truncating the 0 bytes.  So it is the last lbn we want
6315	 * to keep, not the first lbn we want to truncate.
6316	 */
6317	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6318	lastoff = blkoff(fs, length);
6319	/*
6320	 * Compute frags we are keeping in lastlbn.  0 means all.
6321	 */
6322	if (lastlbn >= 0 && lastlbn < NDADDR) {
6323		frags = fragroundup(fs, lastoff);
6324		/* adp offset of last valid allocdirect. */
6325		iboff = lastlbn;
6326	} else if (lastlbn > 0)
6327		iboff = NDADDR;
6328	if (fs->fs_magic == FS_UFS2_MAGIC)
6329		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6330	/*
6331	 * Handle normal data blocks and indirects.  This section saves
6332	 * values used after the inode update to complete frag and indirect
6333	 * truncation.
6334	 */
6335	if ((flags & IO_NORMAL) != 0) {
6336		/*
6337		 * Handle truncation of whole direct and indirect blocks.
6338		 */
6339		for (i = iboff + 1; i < NDADDR; i++)
6340			setup_freedirect(freeblks, ip, i, needj);
6341		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6342		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6343			/* Release a whole indirect tree. */
6344			if (lbn > lastlbn) {
6345				setup_freeindir(freeblks, ip, i, -lbn -i,
6346				    needj);
6347				continue;
6348			}
6349			iboff = i + NDADDR;
6350			/*
6351			 * Traverse partially truncated indirect tree.
6352			 */
6353			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6354				setup_trunc_indir(freeblks, ip, -lbn - i,
6355				    lastlbn, DIP(ip, i_ib[i]));
6356		}
6357		/*
6358		 * Handle partial truncation to a frag boundary.
6359		 */
6360		if (frags) {
6361			ufs2_daddr_t blkno;
6362			long oldfrags;
6363
6364			oldfrags = blksize(fs, ip, lastlbn);
6365			blkno = DIP(ip, i_db[lastlbn]);
6366			if (blkno && oldfrags != frags) {
6367				oldfrags -= frags;
6368				oldfrags = numfrags(ip->i_fs, oldfrags);
6369				blkno += numfrags(ip->i_fs, frags);
6370				newfreework(ip->i_ump, freeblks, NULL, lastlbn,
6371				    blkno, oldfrags, 0, needj);
6372			} else if (blkno == 0)
6373				allocblock = 1;
6374		}
6375		/*
6376		 * Add a journal record for partial truncate if we are
6377		 * handling indirect blocks.  Non-indirects need no extra
6378		 * journaling.
6379		 */
6380		if (length != 0 && lastlbn >= NDADDR) {
6381			ip->i_flag |= IN_TRUNCATED;
6382			newjtrunc(freeblks, length, 0);
6383		}
6384		ip->i_size = length;
6385		DIP_SET(ip, i_size, ip->i_size);
6386		datablocks = DIP(ip, i_blocks) - extblocks;
6387		if (length != 0)
6388			datablocks = blkcount(ip->i_fs, datablocks, length);
6389		freeblks->fb_len = length;
6390	}
6391	if ((flags & IO_EXT) != 0) {
6392		for (i = 0; i < NXADDR; i++)
6393			setup_freeext(freeblks, ip, i, needj);
6394		ip->i_din2->di_extsize = 0;
6395		datablocks += extblocks;
6396	}
6397#ifdef QUOTA
6398	/* Reference the quotas in case the block count is wrong in the end. */
6399	quotaref(vp, freeblks->fb_quota);
6400	(void) chkdq(ip, -datablocks, NOCRED, 0);
6401#endif
6402	freeblks->fb_chkcnt = -datablocks;
6403	UFS_LOCK(ip->i_ump);
6404	fs->fs_pendingblocks += datablocks;
6405	UFS_UNLOCK(ip->i_ump);
6406	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6407	/*
6408	 * Handle truncation of incomplete alloc direct dependencies.  We
6409	 * hold the inode block locked to prevent incomplete dependencies
6410	 * from reaching the disk while we are eliminating those that
6411	 * have been truncated.  This is a partially inlined ffs_update().
6412	 */
6413	ufs_itimes(vp);
6414	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6415	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6416	    (int)fs->fs_bsize, cred, &bp);
6417	if (error) {
6418		brelse(bp);
6419		softdep_error("softdep_journal_freeblocks", error);
6420		return;
6421	}
6422	if (bp->b_bufsize == fs->fs_bsize)
6423		bp->b_flags |= B_CLUSTEROK;
6424	softdep_update_inodeblock(ip, bp, 0);
6425	if (ip->i_ump->um_fstype == UFS1)
6426		*((struct ufs1_dinode *)bp->b_data +
6427		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6428	else
6429		*((struct ufs2_dinode *)bp->b_data +
6430		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6431	ACQUIRE_LOCK(&lk);
6432	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6433	if ((inodedep->id_state & IOSTARTED) != 0)
6434		panic("softdep_setup_freeblocks: inode busy");
6435	/*
6436	 * Add the freeblks structure to the list of operations that
6437	 * must await the zero'ed inode being written to disk. If we
6438	 * still have a bitmap dependency (needj), then the inode
6439	 * has never been written to disk, so we can process the
6440	 * freeblks below once we have deleted the dependencies.
6441	 */
6442	if (needj)
6443		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6444	else
6445		freeblks->fb_state |= COMPLETE;
6446	if ((flags & IO_NORMAL) != 0) {
6447		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6448			if (adp->ad_offset > iboff)
6449				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6450				    freeblks);
6451			/*
6452			 * Truncate the allocdirect.  We could eliminate
6453			 * or modify journal records as well.
6454			 */
6455			else if (adp->ad_offset == iboff && frags)
6456				adp->ad_newsize = frags;
6457		}
6458	}
6459	if ((flags & IO_EXT) != 0)
6460		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6461			cancel_allocdirect(&inodedep->id_extupdt, adp,
6462			    freeblks);
6463	/*
6464	 * Scan the bufwait list for newblock dependencies that will never
6465	 * make it to disk.
6466	 */
6467	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6468		if (wk->wk_type != D_ALLOCDIRECT)
6469			continue;
6470		adp = WK_ALLOCDIRECT(wk);
6471		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6472		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6473			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6474			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6475			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6476		}
6477	}
6478	/*
6479	 * Add journal work.
6480	 */
6481	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6482		add_to_journal(&jblkdep->jb_list);
6483	FREE_LOCK(&lk);
6484	bdwrite(bp);
6485	/*
6486	 * Truncate dependency structures beyond length.
6487	 */
6488	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6489	/*
6490	 * This is only set when we need to allocate a fragment because
6491	 * none existed at the end of a frag-sized file.  It handles only
6492	 * allocating a new, zero filled block.
6493	 */
6494	if (allocblock) {
6495		ip->i_size = length - lastoff;
6496		DIP_SET(ip, i_size, ip->i_size);
6497		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6498		if (error != 0) {
6499			softdep_error("softdep_journal_freeblks", error);
6500			return;
6501		}
6502		ip->i_size = length;
6503		DIP_SET(ip, i_size, length);
6504		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6505		allocbuf(bp, frags);
6506		ffs_update(vp, 0);
6507		bawrite(bp);
6508	} else if (lastoff != 0 && vp->v_type != VDIR) {
6509		int size;
6510
6511		/*
6512		 * Zero the end of a truncated frag or block.
6513		 */
6514		size = sblksize(fs, length, lastlbn);
6515		error = bread(vp, lastlbn, size, cred, &bp);
6516		if (error) {
6517			softdep_error("softdep_journal_freeblks", error);
6518			return;
6519		}
6520		bzero((char *)bp->b_data + lastoff, size - lastoff);
6521		bawrite(bp);
6522
6523	}
6524	ACQUIRE_LOCK(&lk);
6525	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6526	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6527	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6528	/*
6529	 * We zero earlier truncations so they don't erroneously
6530	 * update i_blocks.
6531	 */
6532	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6533		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6534			fbn->fb_len = 0;
6535	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6536	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6537		freeblks->fb_state |= INPROGRESS;
6538	else
6539		freeblks = NULL;
6540	FREE_LOCK(&lk);
6541	if (freeblks)
6542		handle_workitem_freeblocks(freeblks, 0);
6543	trunc_pages(ip, length, extblocks, flags);
6544
6545}
6546
6547/*
6548 * Flush a JOP_SYNC to the journal.
6549 */
6550void
6551softdep_journal_fsync(ip)
6552	struct inode *ip;
6553{
6554	struct jfsync *jfsync;
6555
6556	if ((ip->i_flag & IN_TRUNCATED) == 0)
6557		return;
6558	ip->i_flag &= ~IN_TRUNCATED;
6559	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6560	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6561	jfsync->jfs_size = ip->i_size;
6562	jfsync->jfs_ino = ip->i_number;
6563	ACQUIRE_LOCK(&lk);
6564	add_to_journal(&jfsync->jfs_list);
6565	jwait(&jfsync->jfs_list, MNT_WAIT);
6566	FREE_LOCK(&lk);
6567}
6568
6569/*
6570 * Block de-allocation dependencies.
6571 *
6572 * When blocks are de-allocated, the on-disk pointers must be nullified before
6573 * the blocks are made available for use by other files.  (The true
6574 * requirement is that old pointers must be nullified before new on-disk
6575 * pointers are set.  We chose this slightly more stringent requirement to
6576 * reduce complexity.) Our implementation handles this dependency by updating
6577 * the inode (or indirect block) appropriately but delaying the actual block
6578 * de-allocation (i.e., freemap and free space count manipulation) until
6579 * after the updated versions reach stable storage.  After the disk is
6580 * updated, the blocks can be safely de-allocated whenever it is convenient.
6581 * This implementation handles only the common case of reducing a file's
6582 * length to zero. Other cases are handled by the conventional synchronous
6583 * write approach.
6584 *
6585 * The ffs implementation with which we worked double-checks
6586 * the state of the block pointers and file size as it reduces
6587 * a file's length.  Some of this code is replicated here in our
6588 * soft updates implementation.  The freeblks->fb_chkcnt field is
6589 * used to transfer a part of this information to the procedure
6590 * that eventually de-allocates the blocks.
6591 *
6592 * This routine should be called from the routine that shortens
6593 * a file's length, before the inode's size or block pointers
6594 * are modified. It will save the block pointer information for
6595 * later release and zero the inode so that the calling routine
6596 * can release it.
6597 */
6598void
6599softdep_setup_freeblocks(ip, length, flags)
6600	struct inode *ip;	/* The inode whose length is to be reduced */
6601	off_t length;		/* The new length for the file */
6602	int flags;		/* IO_EXT and/or IO_NORMAL */
6603{
6604	struct ufs1_dinode *dp1;
6605	struct ufs2_dinode *dp2;
6606	struct freeblks *freeblks;
6607	struct inodedep *inodedep;
6608	struct allocdirect *adp;
6609	struct buf *bp;
6610	struct fs *fs;
6611	ufs2_daddr_t extblocks, datablocks;
6612	struct mount *mp;
6613	int i, delay, error, dflags;
6614	ufs_lbn_t tmpval;
6615	ufs_lbn_t lbn;
6616
6617	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6618	    ip->i_number, length);
6619	fs = ip->i_fs;
6620	mp = UFSTOVFS(ip->i_ump);
6621	if (length != 0)
6622		panic("softdep_setup_freeblocks: non-zero length");
6623	freeblks = newfreeblks(mp, ip);
6624	extblocks = 0;
6625	datablocks = 0;
6626	if (fs->fs_magic == FS_UFS2_MAGIC)
6627		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6628	if ((flags & IO_NORMAL) != 0) {
6629		for (i = 0; i < NDADDR; i++)
6630			setup_freedirect(freeblks, ip, i, 0);
6631		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6632		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6633			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6634		ip->i_size = 0;
6635		DIP_SET(ip, i_size, 0);
6636		datablocks = DIP(ip, i_blocks) - extblocks;
6637	}
6638	if ((flags & IO_EXT) != 0) {
6639		for (i = 0; i < NXADDR; i++)
6640			setup_freeext(freeblks, ip, i, 0);
6641		ip->i_din2->di_extsize = 0;
6642		datablocks += extblocks;
6643	}
6644#ifdef QUOTA
6645	/* Reference the quotas in case the block count is wrong in the end. */
6646	quotaref(ITOV(ip), freeblks->fb_quota);
6647	(void) chkdq(ip, -datablocks, NOCRED, 0);
6648#endif
6649	freeblks->fb_chkcnt = -datablocks;
6650	UFS_LOCK(ip->i_ump);
6651	fs->fs_pendingblocks += datablocks;
6652	UFS_UNLOCK(ip->i_ump);
6653	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6654	/*
6655	 * Push the zero'ed inode to to its disk buffer so that we are free
6656	 * to delete its dependencies below. Once the dependencies are gone
6657	 * the buffer can be safely released.
6658	 */
6659	if ((error = bread(ip->i_devvp,
6660	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6661	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6662		brelse(bp);
6663		softdep_error("softdep_setup_freeblocks", error);
6664	}
6665	if (ip->i_ump->um_fstype == UFS1) {
6666		dp1 = ((struct ufs1_dinode *)bp->b_data +
6667		    ino_to_fsbo(fs, ip->i_number));
6668		ip->i_din1->di_freelink = dp1->di_freelink;
6669		*dp1 = *ip->i_din1;
6670	} else {
6671		dp2 = ((struct ufs2_dinode *)bp->b_data +
6672		    ino_to_fsbo(fs, ip->i_number));
6673		ip->i_din2->di_freelink = dp2->di_freelink;
6674		*dp2 = *ip->i_din2;
6675	}
6676	/*
6677	 * Find and eliminate any inode dependencies.
6678	 */
6679	ACQUIRE_LOCK(&lk);
6680	dflags = DEPALLOC;
6681	if (IS_SNAPSHOT(ip))
6682		dflags |= NODELAY;
6683	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6684	if ((inodedep->id_state & IOSTARTED) != 0)
6685		panic("softdep_setup_freeblocks: inode busy");
6686	/*
6687	 * Add the freeblks structure to the list of operations that
6688	 * must await the zero'ed inode being written to disk. If we
6689	 * still have a bitmap dependency (delay == 0), then the inode
6690	 * has never been written to disk, so we can process the
6691	 * freeblks below once we have deleted the dependencies.
6692	 */
6693	delay = (inodedep->id_state & DEPCOMPLETE);
6694	if (delay)
6695		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6696	else
6697		freeblks->fb_state |= COMPLETE;
6698	/*
6699	 * Because the file length has been truncated to zero, any
6700	 * pending block allocation dependency structures associated
6701	 * with this inode are obsolete and can simply be de-allocated.
6702	 * We must first merge the two dependency lists to get rid of
6703	 * any duplicate freefrag structures, then purge the merged list.
6704	 * If we still have a bitmap dependency, then the inode has never
6705	 * been written to disk, so we can free any fragments without delay.
6706	 */
6707	if (flags & IO_NORMAL) {
6708		merge_inode_lists(&inodedep->id_newinoupdt,
6709		    &inodedep->id_inoupdt);
6710		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6711			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6712			    freeblks);
6713	}
6714	if (flags & IO_EXT) {
6715		merge_inode_lists(&inodedep->id_newextupdt,
6716		    &inodedep->id_extupdt);
6717		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6718			cancel_allocdirect(&inodedep->id_extupdt, adp,
6719			    freeblks);
6720	}
6721	FREE_LOCK(&lk);
6722	bdwrite(bp);
6723	trunc_dependencies(ip, freeblks, -1, 0, flags);
6724	ACQUIRE_LOCK(&lk);
6725	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6726		(void) free_inodedep(inodedep);
6727	freeblks->fb_state |= DEPCOMPLETE;
6728	/*
6729	 * If the inode with zeroed block pointers is now on disk
6730	 * we can start freeing blocks.
6731	 */
6732	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6733		freeblks->fb_state |= INPROGRESS;
6734	else
6735		freeblks = NULL;
6736	FREE_LOCK(&lk);
6737	if (freeblks)
6738		handle_workitem_freeblocks(freeblks, 0);
6739	trunc_pages(ip, length, extblocks, flags);
6740}
6741
6742/*
6743 * Eliminate pages from the page cache that back parts of this inode and
6744 * adjust the vnode pager's idea of our size.  This prevents stale data
6745 * from hanging around in the page cache.
6746 */
6747static void
6748trunc_pages(ip, length, extblocks, flags)
6749	struct inode *ip;
6750	off_t length;
6751	ufs2_daddr_t extblocks;
6752	int flags;
6753{
6754	struct vnode *vp;
6755	struct fs *fs;
6756	ufs_lbn_t lbn;
6757	off_t end, extend;
6758
6759	vp = ITOV(ip);
6760	fs = ip->i_fs;
6761	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6762	if ((flags & IO_EXT) != 0)
6763		vn_pages_remove(vp, extend, 0);
6764	if ((flags & IO_NORMAL) == 0)
6765		return;
6766	BO_LOCK(&vp->v_bufobj);
6767	drain_output(vp);
6768	BO_UNLOCK(&vp->v_bufobj);
6769	/*
6770	 * The vnode pager eliminates file pages we eliminate indirects
6771	 * below.
6772	 */
6773	vnode_pager_setsize(vp, length);
6774	/*
6775	 * Calculate the end based on the last indirect we want to keep.  If
6776	 * the block extends into indirects we can just use the negative of
6777	 * its lbn.  Doubles and triples exist at lower numbers so we must
6778	 * be careful not to remove those, if they exist.  double and triple
6779	 * indirect lbns do not overlap with others so it is not important
6780	 * to verify how many levels are required.
6781	 */
6782	lbn = lblkno(fs, length);
6783	if (lbn >= NDADDR) {
6784		/* Calculate the virtual lbn of the triple indirect. */
6785		lbn = -lbn - (NIADDR - 1);
6786		end = OFF_TO_IDX(lblktosize(fs, lbn));
6787	} else
6788		end = extend;
6789	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
6790}
6791
6792/*
6793 * See if the buf bp is in the range eliminated by truncation.
6794 */
6795static int
6796trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
6797	struct buf *bp;
6798	int *blkoffp;
6799	ufs_lbn_t lastlbn;
6800	int lastoff;
6801	int flags;
6802{
6803	ufs_lbn_t lbn;
6804
6805	*blkoffp = 0;
6806	/* Only match ext/normal blocks as appropriate. */
6807	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
6808	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
6809		return (0);
6810	/* ALTDATA is always a full truncation. */
6811	if ((bp->b_xflags & BX_ALTDATA) != 0)
6812		return (1);
6813	/* -1 is full truncation. */
6814	if (lastlbn == -1)
6815		return (1);
6816	/*
6817	 * If this is a partial truncate we only want those
6818	 * blocks and indirect blocks that cover the range
6819	 * we're after.
6820	 */
6821	lbn = bp->b_lblkno;
6822	if (lbn < 0)
6823		lbn = -(lbn + lbn_level(lbn));
6824	if (lbn < lastlbn)
6825		return (0);
6826	/* Here we only truncate lblkno if it's partial. */
6827	if (lbn == lastlbn) {
6828		if (lastoff == 0)
6829			return (0);
6830		*blkoffp = lastoff;
6831	}
6832	return (1);
6833}
6834
6835/*
6836 * Eliminate any dependencies that exist in memory beyond lblkno:off
6837 */
6838static void
6839trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
6840	struct inode *ip;
6841	struct freeblks *freeblks;
6842	ufs_lbn_t lastlbn;
6843	int lastoff;
6844	int flags;
6845{
6846	struct bufobj *bo;
6847	struct vnode *vp;
6848	struct buf *bp;
6849	struct fs *fs;
6850	int blkoff;
6851
6852	/*
6853	 * We must wait for any I/O in progress to finish so that
6854	 * all potential buffers on the dirty list will be visible.
6855	 * Once they are all there, walk the list and get rid of
6856	 * any dependencies.
6857	 */
6858	fs = ip->i_fs;
6859	vp = ITOV(ip);
6860	bo = &vp->v_bufobj;
6861	BO_LOCK(bo);
6862	drain_output(vp);
6863	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
6864		bp->b_vflags &= ~BV_SCANNED;
6865restart:
6866	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
6867		if (bp->b_vflags & BV_SCANNED)
6868			continue;
6869		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6870			bp->b_vflags |= BV_SCANNED;
6871			continue;
6872		}
6873		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
6874			goto restart;
6875		BO_UNLOCK(bo);
6876		if (deallocate_dependencies(bp, freeblks, blkoff))
6877			bqrelse(bp);
6878		else
6879			brelse(bp);
6880		BO_LOCK(bo);
6881		goto restart;
6882	}
6883	/*
6884	 * Now do the work of vtruncbuf while also matching indirect blocks.
6885	 */
6886	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
6887		bp->b_vflags &= ~BV_SCANNED;
6888cleanrestart:
6889	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
6890		if (bp->b_vflags & BV_SCANNED)
6891			continue;
6892		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6893			bp->b_vflags |= BV_SCANNED;
6894			continue;
6895		}
6896		if (BUF_LOCK(bp,
6897		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6898		    BO_MTX(bo)) == ENOLCK) {
6899			BO_LOCK(bo);
6900			goto cleanrestart;
6901		}
6902		bp->b_vflags |= BV_SCANNED;
6903		BO_LOCK(bo);
6904		bremfree(bp);
6905		BO_UNLOCK(bo);
6906		if (blkoff != 0) {
6907			allocbuf(bp, blkoff);
6908			bqrelse(bp);
6909		} else {
6910			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
6911			brelse(bp);
6912		}
6913		BO_LOCK(bo);
6914		goto cleanrestart;
6915	}
6916	drain_output(vp);
6917	BO_UNLOCK(bo);
6918}
6919
6920static int
6921cancel_pagedep(pagedep, freeblks, blkoff)
6922	struct pagedep *pagedep;
6923	struct freeblks *freeblks;
6924	int blkoff;
6925{
6926	struct jremref *jremref;
6927	struct jmvref *jmvref;
6928	struct dirrem *dirrem, *tmp;
6929	int i;
6930
6931	/*
6932	 * Copy any directory remove dependencies to the list
6933	 * to be processed after the freeblks proceeds.  If
6934	 * directory entry never made it to disk they
6935	 * can be dumped directly onto the work list.
6936	 */
6937	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
6938		/* Skip this directory removal if it is intended to remain. */
6939		if (dirrem->dm_offset < blkoff)
6940			continue;
6941		/*
6942		 * If there are any dirrems we wait for the journal write
6943		 * to complete and then restart the buf scan as the lock
6944		 * has been dropped.
6945		 */
6946		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
6947			jwait(&jremref->jr_list, MNT_WAIT);
6948			return (ERESTART);
6949		}
6950		LIST_REMOVE(dirrem, dm_next);
6951		dirrem->dm_dirinum = pagedep->pd_ino;
6952		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
6953	}
6954	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
6955		jwait(&jmvref->jm_list, MNT_WAIT);
6956		return (ERESTART);
6957	}
6958	/*
6959	 * When we're partially truncating a pagedep we just want to flush
6960	 * journal entries and return.  There can not be any adds in the
6961	 * truncated portion of the directory and newblk must remain if
6962	 * part of the block remains.
6963	 */
6964	if (blkoff != 0) {
6965		struct diradd *dap;
6966
6967		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
6968			if (dap->da_offset > blkoff)
6969				panic("cancel_pagedep: diradd %p off %d > %d",
6970				    dap, dap->da_offset, blkoff);
6971		for (i = 0; i < DAHASHSZ; i++)
6972			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
6973				if (dap->da_offset > blkoff)
6974					panic("cancel_pagedep: diradd %p off %d > %d",
6975					    dap, dap->da_offset, blkoff);
6976		return (0);
6977	}
6978	/*
6979	 * There should be no directory add dependencies present
6980	 * as the directory could not be truncated until all
6981	 * children were removed.
6982	 */
6983	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
6984	    ("deallocate_dependencies: pendinghd != NULL"));
6985	for (i = 0; i < DAHASHSZ; i++)
6986		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
6987		    ("deallocate_dependencies: diraddhd != NULL"));
6988	if ((pagedep->pd_state & NEWBLOCK) != 0)
6989		free_newdirblk(pagedep->pd_newdirblk);
6990	if (free_pagedep(pagedep) == 0)
6991		panic("Failed to free pagedep %p", pagedep);
6992	return (0);
6993}
6994
6995/*
6996 * Reclaim any dependency structures from a buffer that is about to
6997 * be reallocated to a new vnode. The buffer must be locked, thus,
6998 * no I/O completion operations can occur while we are manipulating
6999 * its associated dependencies. The mutex is held so that other I/O's
7000 * associated with related dependencies do not occur.
7001 */
7002static int
7003deallocate_dependencies(bp, freeblks, off)
7004	struct buf *bp;
7005	struct freeblks *freeblks;
7006	int off;
7007{
7008	struct indirdep *indirdep;
7009	struct pagedep *pagedep;
7010	struct allocdirect *adp;
7011	struct worklist *wk, *wkn;
7012
7013	ACQUIRE_LOCK(&lk);
7014	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7015		switch (wk->wk_type) {
7016		case D_INDIRDEP:
7017			indirdep = WK_INDIRDEP(wk);
7018			if (bp->b_lblkno >= 0 ||
7019			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7020				panic("deallocate_dependencies: not indir");
7021			cancel_indirdep(indirdep, bp, freeblks);
7022			continue;
7023
7024		case D_PAGEDEP:
7025			pagedep = WK_PAGEDEP(wk);
7026			if (cancel_pagedep(pagedep, freeblks, off)) {
7027				FREE_LOCK(&lk);
7028				return (ERESTART);
7029			}
7030			continue;
7031
7032		case D_ALLOCINDIR:
7033			/*
7034			 * Simply remove the allocindir, we'll find it via
7035			 * the indirdep where we can clear pointers if
7036			 * needed.
7037			 */
7038			WORKLIST_REMOVE(wk);
7039			continue;
7040
7041		case D_FREEWORK:
7042			/*
7043			 * A truncation is waiting for the zero'd pointers
7044			 * to be written.  It can be freed when the freeblks
7045			 * is journaled.
7046			 */
7047			WORKLIST_REMOVE(wk);
7048			wk->wk_state |= ONDEPLIST;
7049			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7050			break;
7051
7052		case D_ALLOCDIRECT:
7053			adp = WK_ALLOCDIRECT(wk);
7054			if (off != 0)
7055				continue;
7056			/* FALLTHROUGH */
7057		default:
7058			panic("deallocate_dependencies: Unexpected type %s",
7059			    TYPENAME(wk->wk_type));
7060			/* NOTREACHED */
7061		}
7062	}
7063	FREE_LOCK(&lk);
7064	/*
7065	 * Don't throw away this buf, we were partially truncating and
7066	 * some deps may always remain.
7067	 */
7068	if (off) {
7069		allocbuf(bp, off);
7070		bp->b_vflags |= BV_SCANNED;
7071		return (EBUSY);
7072	}
7073	bp->b_flags |= B_INVAL | B_NOCACHE;
7074
7075	return (0);
7076}
7077
7078/*
7079 * An allocdirect is being canceled due to a truncate.  We must make sure
7080 * the journal entry is released in concert with the blkfree that releases
7081 * the storage.  Completed journal entries must not be released until the
7082 * space is no longer pointed to by the inode or in the bitmap.
7083 */
7084static void
7085cancel_allocdirect(adphead, adp, freeblks)
7086	struct allocdirectlst *adphead;
7087	struct allocdirect *adp;
7088	struct freeblks *freeblks;
7089{
7090	struct freework *freework;
7091	struct newblk *newblk;
7092	struct worklist *wk;
7093
7094	TAILQ_REMOVE(adphead, adp, ad_next);
7095	newblk = (struct newblk *)adp;
7096	freework = NULL;
7097	/*
7098	 * Find the correct freework structure.
7099	 */
7100	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7101		if (wk->wk_type != D_FREEWORK)
7102			continue;
7103		freework = WK_FREEWORK(wk);
7104		if (freework->fw_blkno == newblk->nb_newblkno)
7105			break;
7106	}
7107	if (freework == NULL)
7108		panic("cancel_allocdirect: Freework not found");
7109	/*
7110	 * If a newblk exists at all we still have the journal entry that
7111	 * initiated the allocation so we do not need to journal the free.
7112	 */
7113	cancel_jfreeblk(freeblks, freework->fw_blkno);
7114	/*
7115	 * If the journal hasn't been written the jnewblk must be passed
7116	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7117	 * this by linking the journal dependency into the freework to be
7118	 * freed when freework_freeblock() is called.  If the journal has
7119	 * been written we can simply reclaim the journal space when the
7120	 * freeblks work is complete.
7121	 */
7122	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7123	    &freeblks->fb_jwork);
7124	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7125}
7126
7127
7128/*
7129 * Cancel a new block allocation.  May be an indirect or direct block.  We
7130 * remove it from various lists and return any journal record that needs to
7131 * be resolved by the caller.
7132 *
7133 * A special consideration is made for indirects which were never pointed
7134 * at on disk and will never be found once this block is released.
7135 */
7136static struct jnewblk *
7137cancel_newblk(newblk, wk, wkhd)
7138	struct newblk *newblk;
7139	struct worklist *wk;
7140	struct workhead *wkhd;
7141{
7142	struct jnewblk *jnewblk;
7143
7144	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7145
7146	newblk->nb_state |= GOINGAWAY;
7147	/*
7148	 * Previously we traversed the completedhd on each indirdep
7149	 * attached to this newblk to cancel them and gather journal
7150	 * work.  Since we need only the oldest journal segment and
7151	 * the lowest point on the tree will always have the oldest
7152	 * journal segment we are free to release the segments
7153	 * of any subordinates and may leave the indirdep list to
7154	 * indirdep_complete() when this newblk is freed.
7155	 */
7156	if (newblk->nb_state & ONDEPLIST) {
7157		newblk->nb_state &= ~ONDEPLIST;
7158		LIST_REMOVE(newblk, nb_deps);
7159	}
7160	if (newblk->nb_state & ONWORKLIST)
7161		WORKLIST_REMOVE(&newblk->nb_list);
7162	/*
7163	 * If the journal entry hasn't been written we save a pointer to
7164	 * the dependency that frees it until it is written or the
7165	 * superseding operation completes.
7166	 */
7167	jnewblk = newblk->nb_jnewblk;
7168	if (jnewblk != NULL && wk != NULL) {
7169		newblk->nb_jnewblk = NULL;
7170		jnewblk->jn_dep = wk;
7171	}
7172	if (!LIST_EMPTY(&newblk->nb_jwork))
7173		jwork_move(wkhd, &newblk->nb_jwork);
7174	/*
7175	 * When truncating we must free the newdirblk early to remove
7176	 * the pagedep from the hash before returning.
7177	 */
7178	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7179		free_newdirblk(WK_NEWDIRBLK(wk));
7180	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7181		panic("cancel_newblk: extra newdirblk");
7182
7183	return (jnewblk);
7184}
7185
7186/*
7187 * Schedule the freefrag associated with a newblk to be released once
7188 * the pointers are written and the previous block is no longer needed.
7189 */
7190static void
7191newblk_freefrag(newblk)
7192	struct newblk *newblk;
7193{
7194	struct freefrag *freefrag;
7195
7196	if (newblk->nb_freefrag == NULL)
7197		return;
7198	freefrag = newblk->nb_freefrag;
7199	newblk->nb_freefrag = NULL;
7200	freefrag->ff_state |= COMPLETE;
7201	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7202		add_to_worklist(&freefrag->ff_list, 0);
7203}
7204
7205/*
7206 * Free a newblk. Generate a new freefrag work request if appropriate.
7207 * This must be called after the inode pointer and any direct block pointers
7208 * are valid or fully removed via truncate or frag extension.
7209 */
7210static void
7211free_newblk(newblk)
7212	struct newblk *newblk;
7213{
7214	struct indirdep *indirdep;
7215	struct worklist *wk;
7216
7217	KASSERT(newblk->nb_jnewblk == NULL,
7218	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
7219	mtx_assert(&lk, MA_OWNED);
7220	newblk_freefrag(newblk);
7221	if (newblk->nb_state & ONDEPLIST)
7222		LIST_REMOVE(newblk, nb_deps);
7223	if (newblk->nb_state & ONWORKLIST)
7224		WORKLIST_REMOVE(&newblk->nb_list);
7225	LIST_REMOVE(newblk, nb_hash);
7226	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7227		free_newdirblk(WK_NEWDIRBLK(wk));
7228	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7229		panic("free_newblk: extra newdirblk");
7230	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7231		indirdep_complete(indirdep);
7232	handle_jwork(&newblk->nb_jwork);
7233	newblk->nb_list.wk_type = D_NEWBLK;
7234	WORKITEM_FREE(newblk, D_NEWBLK);
7235}
7236
7237/*
7238 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7239 * This routine must be called with splbio interrupts blocked.
7240 */
7241static void
7242free_newdirblk(newdirblk)
7243	struct newdirblk *newdirblk;
7244{
7245	struct pagedep *pagedep;
7246	struct diradd *dap;
7247	struct worklist *wk;
7248
7249	mtx_assert(&lk, MA_OWNED);
7250	WORKLIST_REMOVE(&newdirblk->db_list);
7251	/*
7252	 * If the pagedep is still linked onto the directory buffer
7253	 * dependency chain, then some of the entries on the
7254	 * pd_pendinghd list may not be committed to disk yet. In
7255	 * this case, we will simply clear the NEWBLOCK flag and
7256	 * let the pd_pendinghd list be processed when the pagedep
7257	 * is next written. If the pagedep is no longer on the buffer
7258	 * dependency chain, then all the entries on the pd_pending
7259	 * list are committed to disk and we can free them here.
7260	 */
7261	pagedep = newdirblk->db_pagedep;
7262	pagedep->pd_state &= ~NEWBLOCK;
7263	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7264		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7265			free_diradd(dap, NULL);
7266		/*
7267		 * If no dependencies remain, the pagedep will be freed.
7268		 */
7269		free_pagedep(pagedep);
7270	}
7271	/* Should only ever be one item in the list. */
7272	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7273		WORKLIST_REMOVE(wk);
7274		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7275	}
7276	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7277}
7278
7279/*
7280 * Prepare an inode to be freed. The actual free operation is not
7281 * done until the zero'ed inode has been written to disk.
7282 */
7283void
7284softdep_freefile(pvp, ino, mode)
7285	struct vnode *pvp;
7286	ino_t ino;
7287	int mode;
7288{
7289	struct inode *ip = VTOI(pvp);
7290	struct inodedep *inodedep;
7291	struct freefile *freefile;
7292	struct freeblks *freeblks;
7293
7294	/*
7295	 * This sets up the inode de-allocation dependency.
7296	 */
7297	freefile = malloc(sizeof(struct freefile),
7298		M_FREEFILE, M_SOFTDEP_FLAGS);
7299	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7300	freefile->fx_mode = mode;
7301	freefile->fx_oldinum = ino;
7302	freefile->fx_devvp = ip->i_devvp;
7303	LIST_INIT(&freefile->fx_jwork);
7304	UFS_LOCK(ip->i_ump);
7305	ip->i_fs->fs_pendinginodes += 1;
7306	UFS_UNLOCK(ip->i_ump);
7307
7308	/*
7309	 * If the inodedep does not exist, then the zero'ed inode has
7310	 * been written to disk. If the allocated inode has never been
7311	 * written to disk, then the on-disk inode is zero'ed. In either
7312	 * case we can free the file immediately.  If the journal was
7313	 * canceled before being written the inode will never make it to
7314	 * disk and we must send the canceled journal entrys to
7315	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7316	 * Any blocks waiting on the inode to write can be safely freed
7317	 * here as it will never been written.
7318	 */
7319	ACQUIRE_LOCK(&lk);
7320	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7321	if (inodedep) {
7322		/*
7323		 * Clear out freeblks that no longer need to reference
7324		 * this inode.
7325		 */
7326		while ((freeblks =
7327		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7328			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7329			    fb_next);
7330			freeblks->fb_state &= ~ONDEPLIST;
7331		}
7332		/*
7333		 * Remove this inode from the unlinked list.
7334		 */
7335		if (inodedep->id_state & UNLINKED) {
7336			/*
7337			 * Save the journal work to be freed with the bitmap
7338			 * before we clear UNLINKED.  Otherwise it can be lost
7339			 * if the inode block is written.
7340			 */
7341			handle_bufwait(inodedep, &freefile->fx_jwork);
7342			clear_unlinked_inodedep(inodedep);
7343			/* Re-acquire inodedep as we've dropped lk. */
7344			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7345		}
7346	}
7347	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7348		FREE_LOCK(&lk);
7349		handle_workitem_freefile(freefile);
7350		return;
7351	}
7352	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7353		inodedep->id_state |= GOINGAWAY;
7354	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7355	FREE_LOCK(&lk);
7356	if (ip->i_number == ino)
7357		ip->i_flag |= IN_MODIFIED;
7358}
7359
7360/*
7361 * Check to see if an inode has never been written to disk. If
7362 * so free the inodedep and return success, otherwise return failure.
7363 * This routine must be called with splbio interrupts blocked.
7364 *
7365 * If we still have a bitmap dependency, then the inode has never
7366 * been written to disk. Drop the dependency as it is no longer
7367 * necessary since the inode is being deallocated. We set the
7368 * ALLCOMPLETE flags since the bitmap now properly shows that the
7369 * inode is not allocated. Even if the inode is actively being
7370 * written, it has been rolled back to its zero'ed state, so we
7371 * are ensured that a zero inode is what is on the disk. For short
7372 * lived files, this change will usually result in removing all the
7373 * dependencies from the inode so that it can be freed immediately.
7374 */
7375static int
7376check_inode_unwritten(inodedep)
7377	struct inodedep *inodedep;
7378{
7379
7380	mtx_assert(&lk, MA_OWNED);
7381
7382	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7383	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7384	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7385	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7386	    !LIST_EMPTY(&inodedep->id_inowait) ||
7387	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7388	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7389	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7390	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7391	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7392	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7393	    inodedep->id_mkdiradd != NULL ||
7394	    inodedep->id_nlinkdelta != 0)
7395		return (0);
7396	/*
7397	 * Another process might be in initiate_write_inodeblock_ufs[12]
7398	 * trying to allocate memory without holding "Softdep Lock".
7399	 */
7400	if ((inodedep->id_state & IOSTARTED) != 0 &&
7401	    inodedep->id_savedino1 == NULL)
7402		return (0);
7403
7404	if (inodedep->id_state & ONDEPLIST)
7405		LIST_REMOVE(inodedep, id_deps);
7406	inodedep->id_state &= ~ONDEPLIST;
7407	inodedep->id_state |= ALLCOMPLETE;
7408	inodedep->id_bmsafemap = NULL;
7409	if (inodedep->id_state & ONWORKLIST)
7410		WORKLIST_REMOVE(&inodedep->id_list);
7411	if (inodedep->id_savedino1 != NULL) {
7412		free(inodedep->id_savedino1, M_SAVEDINO);
7413		inodedep->id_savedino1 = NULL;
7414	}
7415	if (free_inodedep(inodedep) == 0)
7416		panic("check_inode_unwritten: busy inode");
7417	return (1);
7418}
7419
7420/*
7421 * Try to free an inodedep structure. Return 1 if it could be freed.
7422 */
7423static int
7424free_inodedep(inodedep)
7425	struct inodedep *inodedep;
7426{
7427
7428	mtx_assert(&lk, MA_OWNED);
7429	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7430	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7431	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7432	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7433	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7434	    !LIST_EMPTY(&inodedep->id_inowait) ||
7435	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7436	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7437	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7438	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7439	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7440	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7441	    inodedep->id_mkdiradd != NULL ||
7442	    inodedep->id_nlinkdelta != 0 ||
7443	    inodedep->id_savedino1 != NULL)
7444		return (0);
7445	if (inodedep->id_state & ONDEPLIST)
7446		LIST_REMOVE(inodedep, id_deps);
7447	LIST_REMOVE(inodedep, id_hash);
7448	WORKITEM_FREE(inodedep, D_INODEDEP);
7449	return (1);
7450}
7451
7452/*
7453 * Free the block referenced by a freework structure.  The parent freeblks
7454 * structure is released and completed when the final cg bitmap reaches
7455 * the disk.  This routine may be freeing a jnewblk which never made it to
7456 * disk in which case we do not have to wait as the operation is undone
7457 * in memory immediately.
7458 */
7459static void
7460freework_freeblock(freework)
7461	struct freework *freework;
7462{
7463	struct freeblks *freeblks;
7464	struct jnewblk *jnewblk;
7465	struct ufsmount *ump;
7466	struct workhead wkhd;
7467	struct fs *fs;
7468	int bsize;
7469	int needj;
7470
7471	mtx_assert(&lk, MA_OWNED);
7472	/*
7473	 * Handle partial truncate separately.
7474	 */
7475	if (freework->fw_indir) {
7476		complete_trunc_indir(freework);
7477		return;
7478	}
7479	freeblks = freework->fw_freeblks;
7480	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7481	fs = ump->um_fs;
7482	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7483	bsize = lfragtosize(fs, freework->fw_frags);
7484	LIST_INIT(&wkhd);
7485	/*
7486	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7487	 * on the indirblk hashtable and prevents premature freeing.
7488	 */
7489	freework->fw_state |= DEPCOMPLETE;
7490	/*
7491	 * SUJ needs to wait for the segment referencing freed indirect
7492	 * blocks to expire so that we know the checker will not confuse
7493	 * a re-allocated indirect block with its old contents.
7494	 */
7495	if (needj && freework->fw_lbn <= -NDADDR)
7496		indirblk_insert(freework);
7497	/*
7498	 * If we are canceling an existing jnewblk pass it to the free
7499	 * routine, otherwise pass the freeblk which will ultimately
7500	 * release the freeblks.  If we're not journaling, we can just
7501	 * free the freeblks immediately.
7502	 */
7503	jnewblk = freework->fw_jnewblk;
7504	if (jnewblk != NULL) {
7505		cancel_jnewblk(jnewblk, &wkhd);
7506		needj = 0;
7507	} else if (needj) {
7508		freework->fw_state |= DELAYEDFREE;
7509		freeblks->fb_cgwait++;
7510		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7511	}
7512	FREE_LOCK(&lk);
7513	freeblks_free(ump, freeblks, btodb(bsize));
7514	CTR4(KTR_SUJ,
7515	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
7516	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7517	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7518	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7519	ACQUIRE_LOCK(&lk);
7520	/*
7521	 * The jnewblk will be discarded and the bits in the map never
7522	 * made it to disk.  We can immediately free the freeblk.
7523	 */
7524	if (needj == 0)
7525		handle_written_freework(freework);
7526}
7527
7528/*
7529 * We enqueue freework items that need processing back on the freeblks and
7530 * add the freeblks to the worklist.  This makes it easier to find all work
7531 * required to flush a truncation in process_truncates().
7532 */
7533static void
7534freework_enqueue(freework)
7535	struct freework *freework;
7536{
7537	struct freeblks *freeblks;
7538
7539	freeblks = freework->fw_freeblks;
7540	if ((freework->fw_state & INPROGRESS) == 0)
7541		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7542	if ((freeblks->fb_state &
7543	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7544	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7545		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7546}
7547
7548/*
7549 * Start, continue, or finish the process of freeing an indirect block tree.
7550 * The free operation may be paused at any point with fw_off containing the
7551 * offset to restart from.  This enables us to implement some flow control
7552 * for large truncates which may fan out and generate a huge number of
7553 * dependencies.
7554 */
7555static void
7556handle_workitem_indirblk(freework)
7557	struct freework *freework;
7558{
7559	struct freeblks *freeblks;
7560	struct ufsmount *ump;
7561	struct fs *fs;
7562
7563	freeblks = freework->fw_freeblks;
7564	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7565	fs = ump->um_fs;
7566	if (freework->fw_state & DEPCOMPLETE) {
7567		handle_written_freework(freework);
7568		return;
7569	}
7570	if (freework->fw_off == NINDIR(fs)) {
7571		freework_freeblock(freework);
7572		return;
7573	}
7574	freework->fw_state |= INPROGRESS;
7575	FREE_LOCK(&lk);
7576	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7577	    freework->fw_lbn);
7578	ACQUIRE_LOCK(&lk);
7579}
7580
7581/*
7582 * Called when a freework structure attached to a cg buf is written.  The
7583 * ref on either the parent or the freeblks structure is released and
7584 * the freeblks is added back to the worklist if there is more work to do.
7585 */
7586static void
7587handle_written_freework(freework)
7588	struct freework *freework;
7589{
7590	struct freeblks *freeblks;
7591	struct freework *parent;
7592
7593	freeblks = freework->fw_freeblks;
7594	parent = freework->fw_parent;
7595	if (freework->fw_state & DELAYEDFREE)
7596		freeblks->fb_cgwait--;
7597	freework->fw_state |= COMPLETE;
7598	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7599		WORKITEM_FREE(freework, D_FREEWORK);
7600	if (parent) {
7601		if (--parent->fw_ref == 0)
7602			freework_enqueue(parent);
7603		return;
7604	}
7605	if (--freeblks->fb_ref != 0)
7606		return;
7607	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7608	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7609		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7610}
7611
7612/*
7613 * This workitem routine performs the block de-allocation.
7614 * The workitem is added to the pending list after the updated
7615 * inode block has been written to disk.  As mentioned above,
7616 * checks regarding the number of blocks de-allocated (compared
7617 * to the number of blocks allocated for the file) are also
7618 * performed in this function.
7619 */
7620static int
7621handle_workitem_freeblocks(freeblks, flags)
7622	struct freeblks *freeblks;
7623	int flags;
7624{
7625	struct freework *freework;
7626	struct newblk *newblk;
7627	struct allocindir *aip;
7628	struct ufsmount *ump;
7629	struct worklist *wk;
7630
7631	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7632	    ("handle_workitem_freeblocks: Journal entries not written."));
7633	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7634	ACQUIRE_LOCK(&lk);
7635	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7636		WORKLIST_REMOVE(wk);
7637		switch (wk->wk_type) {
7638		case D_DIRREM:
7639			wk->wk_state |= COMPLETE;
7640			add_to_worklist(wk, 0);
7641			continue;
7642
7643		case D_ALLOCDIRECT:
7644			free_newblk(WK_NEWBLK(wk));
7645			continue;
7646
7647		case D_ALLOCINDIR:
7648			aip = WK_ALLOCINDIR(wk);
7649			freework = NULL;
7650			if (aip->ai_state & DELAYEDFREE) {
7651				FREE_LOCK(&lk);
7652				freework = newfreework(ump, freeblks, NULL,
7653				    aip->ai_lbn, aip->ai_newblkno,
7654				    ump->um_fs->fs_frag, 0, 0);
7655				ACQUIRE_LOCK(&lk);
7656			}
7657			newblk = WK_NEWBLK(wk);
7658			if (newblk->nb_jnewblk) {
7659				freework->fw_jnewblk = newblk->nb_jnewblk;
7660				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7661				newblk->nb_jnewblk = NULL;
7662			}
7663			free_newblk(newblk);
7664			continue;
7665
7666		case D_FREEWORK:
7667			freework = WK_FREEWORK(wk);
7668			if (freework->fw_lbn <= -NDADDR)
7669				handle_workitem_indirblk(freework);
7670			else
7671				freework_freeblock(freework);
7672			continue;
7673		default:
7674			panic("handle_workitem_freeblocks: Unknown type %s",
7675			    TYPENAME(wk->wk_type));
7676		}
7677	}
7678	if (freeblks->fb_ref != 0) {
7679		freeblks->fb_state &= ~INPROGRESS;
7680		wake_worklist(&freeblks->fb_list);
7681		freeblks = NULL;
7682	}
7683	FREE_LOCK(&lk);
7684	if (freeblks)
7685		return handle_complete_freeblocks(freeblks, flags);
7686	return (0);
7687}
7688
7689/*
7690 * Handle completion of block free via truncate.  This allows fs_pending
7691 * to track the actual free block count more closely than if we only updated
7692 * it at the end.  We must be careful to handle cases where the block count
7693 * on free was incorrect.
7694 */
7695static void
7696freeblks_free(ump, freeblks, blocks)
7697	struct ufsmount *ump;
7698	struct freeblks *freeblks;
7699	int blocks;
7700{
7701	struct fs *fs;
7702	ufs2_daddr_t remain;
7703
7704	UFS_LOCK(ump);
7705	remain = -freeblks->fb_chkcnt;
7706	freeblks->fb_chkcnt += blocks;
7707	if (remain > 0) {
7708		if (remain < blocks)
7709			blocks = remain;
7710		fs = ump->um_fs;
7711		fs->fs_pendingblocks -= blocks;
7712	}
7713	UFS_UNLOCK(ump);
7714}
7715
7716/*
7717 * Once all of the freework workitems are complete we can retire the
7718 * freeblocks dependency and any journal work awaiting completion.  This
7719 * can not be called until all other dependencies are stable on disk.
7720 */
7721static int
7722handle_complete_freeblocks(freeblks, flags)
7723	struct freeblks *freeblks;
7724	int flags;
7725{
7726	struct inodedep *inodedep;
7727	struct inode *ip;
7728	struct vnode *vp;
7729	struct fs *fs;
7730	struct ufsmount *ump;
7731	ufs2_daddr_t spare;
7732
7733	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7734	fs = ump->um_fs;
7735	flags = LK_EXCLUSIVE | flags;
7736	spare = freeblks->fb_chkcnt;
7737
7738	/*
7739	 * If we did not release the expected number of blocks we may have
7740	 * to adjust the inode block count here.  Only do so if it wasn't
7741	 * a truncation to zero and the modrev still matches.
7742	 */
7743	if (spare && freeblks->fb_len != 0) {
7744		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7745		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7746			return (EBUSY);
7747		ip = VTOI(vp);
7748		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7749			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7750			ip->i_flag |= IN_CHANGE;
7751			/*
7752			 * We must wait so this happens before the
7753			 * journal is reclaimed.
7754			 */
7755			ffs_update(vp, 1);
7756		}
7757		vput(vp);
7758	}
7759	if (spare < 0) {
7760		UFS_LOCK(ump);
7761		fs->fs_pendingblocks += spare;
7762		UFS_UNLOCK(ump);
7763	}
7764#ifdef QUOTA
7765	/* Handle spare. */
7766	if (spare)
7767		quotaadj(freeblks->fb_quota, ump, -spare);
7768	quotarele(freeblks->fb_quota);
7769#endif
7770	ACQUIRE_LOCK(&lk);
7771	if (freeblks->fb_state & ONDEPLIST) {
7772		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7773		    0, &inodedep);
7774		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
7775		freeblks->fb_state &= ~ONDEPLIST;
7776		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
7777			free_inodedep(inodedep);
7778	}
7779	/*
7780	 * All of the freeblock deps must be complete prior to this call
7781	 * so it's now safe to complete earlier outstanding journal entries.
7782	 */
7783	handle_jwork(&freeblks->fb_jwork);
7784	WORKITEM_FREE(freeblks, D_FREEBLKS);
7785	FREE_LOCK(&lk);
7786	return (0);
7787}
7788
7789/*
7790 * Release blocks associated with the freeblks and stored in the indirect
7791 * block dbn. If level is greater than SINGLE, the block is an indirect block
7792 * and recursive calls to indirtrunc must be used to cleanse other indirect
7793 * blocks.
7794 *
7795 * This handles partial and complete truncation of blocks.  Partial is noted
7796 * with goingaway == 0.  In this case the freework is completed after the
7797 * zero'd indirects are written to disk.  For full truncation the freework
7798 * is completed after the block is freed.
7799 */
7800static void
7801indir_trunc(freework, dbn, lbn)
7802	struct freework *freework;
7803	ufs2_daddr_t dbn;
7804	ufs_lbn_t lbn;
7805{
7806	struct freework *nfreework;
7807	struct workhead wkhd;
7808	struct freeblks *freeblks;
7809	struct buf *bp;
7810	struct fs *fs;
7811	struct indirdep *indirdep;
7812	struct ufsmount *ump;
7813	ufs1_daddr_t *bap1 = 0;
7814	ufs2_daddr_t nb, nnb, *bap2 = 0;
7815	ufs_lbn_t lbnadd, nlbn;
7816	int i, nblocks, ufs1fmt;
7817	int freedblocks;
7818	int goingaway;
7819	int freedeps;
7820	int needj;
7821	int level;
7822	int cnt;
7823
7824	freeblks = freework->fw_freeblks;
7825	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7826	fs = ump->um_fs;
7827	/*
7828	 * Get buffer of block pointers to be freed.  There are three cases:
7829	 *
7830	 * 1) Partial truncate caches the indirdep pointer in the freework
7831	 *    which provides us a back copy to the save bp which holds the
7832	 *    pointers we want to clear.  When this completes the zero
7833	 *    pointers are written to the real copy.
7834	 * 2) The indirect is being completely truncated, cancel_indirdep()
7835	 *    eliminated the real copy and placed the indirdep on the saved
7836	 *    copy.  The indirdep and buf are discarded when this completes.
7837	 * 3) The indirect was not in memory, we read a copy off of the disk
7838	 *    using the devvp and drop and invalidate the buffer when we're
7839	 *    done.
7840	 */
7841	goingaway = 1;
7842	indirdep = NULL;
7843	if (freework->fw_indir != NULL) {
7844		goingaway = 0;
7845		indirdep = freework->fw_indir;
7846		bp = indirdep->ir_savebp;
7847		if (bp == NULL || bp->b_blkno != dbn)
7848			panic("indir_trunc: Bad saved buf %p blkno %jd",
7849			    bp, (intmax_t)dbn);
7850	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
7851		/*
7852		 * The lock prevents the buf dep list from changing and
7853	 	 * indirects on devvp should only ever have one dependency.
7854		 */
7855		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
7856		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
7857			panic("indir_trunc: Bad indirdep %p from buf %p",
7858			    indirdep, bp);
7859	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
7860	    NOCRED, &bp) != 0) {
7861		brelse(bp);
7862		return;
7863	}
7864	ACQUIRE_LOCK(&lk);
7865	/* Protects against a race with complete_trunc_indir(). */
7866	freework->fw_state &= ~INPROGRESS;
7867	/*
7868	 * If we have an indirdep we need to enforce the truncation order
7869	 * and discard it when it is complete.
7870	 */
7871	if (indirdep) {
7872		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
7873		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
7874			/*
7875			 * Add the complete truncate to the list on the
7876			 * indirdep to enforce in-order processing.
7877			 */
7878			if (freework->fw_indir == NULL)
7879				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
7880				    freework, fw_next);
7881			FREE_LOCK(&lk);
7882			return;
7883		}
7884		/*
7885		 * If we're goingaway, free the indirdep.  Otherwise it will
7886		 * linger until the write completes.
7887		 */
7888		if (goingaway) {
7889			free_indirdep(indirdep);
7890			ump->um_numindirdeps -= 1;
7891		}
7892	}
7893	FREE_LOCK(&lk);
7894	/* Initialize pointers depending on block size. */
7895	if (ump->um_fstype == UFS1) {
7896		bap1 = (ufs1_daddr_t *)bp->b_data;
7897		nb = bap1[freework->fw_off];
7898		ufs1fmt = 1;
7899	} else {
7900		bap2 = (ufs2_daddr_t *)bp->b_data;
7901		nb = bap2[freework->fw_off];
7902		ufs1fmt = 0;
7903	}
7904	level = lbn_level(lbn);
7905	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
7906	lbnadd = lbn_offset(fs, level);
7907	nblocks = btodb(fs->fs_bsize);
7908	nfreework = freework;
7909	freedeps = 0;
7910	cnt = 0;
7911	/*
7912	 * Reclaim blocks.  Traverses into nested indirect levels and
7913	 * arranges for the current level to be freed when subordinates
7914	 * are free when journaling.
7915	 */
7916	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
7917		if (i != NINDIR(fs) - 1) {
7918			if (ufs1fmt)
7919				nnb = bap1[i+1];
7920			else
7921				nnb = bap2[i+1];
7922		} else
7923			nnb = 0;
7924		if (nb == 0)
7925			continue;
7926		cnt++;
7927		if (level != 0) {
7928			nlbn = (lbn + 1) - (i * lbnadd);
7929			if (needj != 0) {
7930				nfreework = newfreework(ump, freeblks, freework,
7931				    nlbn, nb, fs->fs_frag, 0, 0);
7932				freedeps++;
7933			}
7934			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
7935		} else {
7936			struct freedep *freedep;
7937
7938			/*
7939			 * Attempt to aggregate freedep dependencies for
7940			 * all blocks being released to the same CG.
7941			 */
7942			LIST_INIT(&wkhd);
7943			if (needj != 0 &&
7944			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
7945				freedep = newfreedep(freework);
7946				WORKLIST_INSERT_UNLOCKED(&wkhd,
7947				    &freedep->fd_list);
7948				freedeps++;
7949			}
7950			CTR3(KTR_SUJ,
7951			    "indir_trunc: ino %d blkno %jd size %ld",
7952			    freeblks->fb_inum, nb, fs->fs_bsize);
7953			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
7954			    fs->fs_bsize, freeblks->fb_inum,
7955			    freeblks->fb_vtype, &wkhd);
7956		}
7957	}
7958	if (goingaway) {
7959		bp->b_flags |= B_INVAL | B_NOCACHE;
7960		brelse(bp);
7961	}
7962	freedblocks = 0;
7963	if (level == 0)
7964		freedblocks = (nblocks * cnt);
7965	if (needj == 0)
7966		freedblocks += nblocks;
7967	freeblks_free(ump, freeblks, freedblocks);
7968	/*
7969	 * If we are journaling set up the ref counts and offset so this
7970	 * indirect can be completed when its children are free.
7971	 */
7972	if (needj) {
7973		ACQUIRE_LOCK(&lk);
7974		freework->fw_off = i;
7975		freework->fw_ref += freedeps;
7976		freework->fw_ref -= NINDIR(fs) + 1;
7977		if (level == 0)
7978			freeblks->fb_cgwait += freedeps;
7979		if (freework->fw_ref == 0)
7980			freework_freeblock(freework);
7981		FREE_LOCK(&lk);
7982		return;
7983	}
7984	/*
7985	 * If we're not journaling we can free the indirect now.
7986	 */
7987	dbn = dbtofsb(fs, dbn);
7988	CTR3(KTR_SUJ,
7989	    "indir_trunc 2: ino %d blkno %jd size %ld",
7990	    freeblks->fb_inum, dbn, fs->fs_bsize);
7991	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
7992	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
7993	/* Non SUJ softdep does single-threaded truncations. */
7994	if (freework->fw_blkno == dbn) {
7995		freework->fw_state |= ALLCOMPLETE;
7996		ACQUIRE_LOCK(&lk);
7997		handle_written_freework(freework);
7998		FREE_LOCK(&lk);
7999	}
8000	return;
8001}
8002
8003/*
8004 * Cancel an allocindir when it is removed via truncation.  When bp is not
8005 * NULL the indirect never appeared on disk and is scheduled to be freed
8006 * independently of the indir so we can more easily track journal work.
8007 */
8008static void
8009cancel_allocindir(aip, bp, freeblks, trunc)
8010	struct allocindir *aip;
8011	struct buf *bp;
8012	struct freeblks *freeblks;
8013	int trunc;
8014{
8015	struct indirdep *indirdep;
8016	struct freefrag *freefrag;
8017	struct newblk *newblk;
8018
8019	newblk = (struct newblk *)aip;
8020	LIST_REMOVE(aip, ai_next);
8021	/*
8022	 * We must eliminate the pointer in bp if it must be freed on its
8023	 * own due to partial truncate or pending journal work.
8024	 */
8025	if (bp && (trunc || newblk->nb_jnewblk)) {
8026		/*
8027		 * Clear the pointer and mark the aip to be freed
8028		 * directly if it never existed on disk.
8029		 */
8030		aip->ai_state |= DELAYEDFREE;
8031		indirdep = aip->ai_indirdep;
8032		if (indirdep->ir_state & UFS1FMT)
8033			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8034		else
8035			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8036	}
8037	/*
8038	 * When truncating the previous pointer will be freed via
8039	 * savedbp.  Eliminate the freefrag which would dup free.
8040	 */
8041	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8042		newblk->nb_freefrag = NULL;
8043		if (freefrag->ff_jdep)
8044			cancel_jfreefrag(
8045			    WK_JFREEFRAG(freefrag->ff_jdep));
8046		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8047		WORKITEM_FREE(freefrag, D_FREEFRAG);
8048	}
8049	/*
8050	 * If the journal hasn't been written the jnewblk must be passed
8051	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8052	 * this by leaving the journal dependency on the newblk to be freed
8053	 * when a freework is created in handle_workitem_freeblocks().
8054	 */
8055	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8056	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8057}
8058
8059/*
8060 * Create the mkdir dependencies for . and .. in a new directory.  Link them
8061 * in to a newdirblk so any subsequent additions are tracked properly.  The
8062 * caller is responsible for adding the mkdir1 dependency to the journal
8063 * and updating id_mkdiradd.  This function returns with lk held.
8064 */
8065static struct mkdir *
8066setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8067	struct diradd *dap;
8068	ino_t newinum;
8069	ino_t dinum;
8070	struct buf *newdirbp;
8071	struct mkdir **mkdirp;
8072{
8073	struct newblk *newblk;
8074	struct pagedep *pagedep;
8075	struct inodedep *inodedep;
8076	struct newdirblk *newdirblk = 0;
8077	struct mkdir *mkdir1, *mkdir2;
8078	struct worklist *wk;
8079	struct jaddref *jaddref;
8080	struct mount *mp;
8081
8082	mp = dap->da_list.wk_mp;
8083	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8084	    M_SOFTDEP_FLAGS);
8085	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8086	LIST_INIT(&newdirblk->db_mkdir);
8087	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8088	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8089	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8090	mkdir1->md_diradd = dap;
8091	mkdir1->md_jaddref = NULL;
8092	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8093	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8094	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8095	mkdir2->md_diradd = dap;
8096	mkdir2->md_jaddref = NULL;
8097	if (MOUNTEDSUJ(mp) == 0) {
8098		mkdir1->md_state |= DEPCOMPLETE;
8099		mkdir2->md_state |= DEPCOMPLETE;
8100	}
8101	/*
8102	 * Dependency on "." and ".." being written to disk.
8103	 */
8104	mkdir1->md_buf = newdirbp;
8105	ACQUIRE_LOCK(&lk);
8106	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
8107	/*
8108	 * We must link the pagedep, allocdirect, and newdirblk for
8109	 * the initial file page so the pointer to the new directory
8110	 * is not written until the directory contents are live and
8111	 * any subsequent additions are not marked live until the
8112	 * block is reachable via the inode.
8113	 */
8114	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8115		panic("setup_newdir: lost pagedep");
8116	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8117		if (wk->wk_type == D_ALLOCDIRECT)
8118			break;
8119	if (wk == NULL)
8120		panic("setup_newdir: lost allocdirect");
8121	if (pagedep->pd_state & NEWBLOCK)
8122		panic("setup_newdir: NEWBLOCK already set");
8123	newblk = WK_NEWBLK(wk);
8124	pagedep->pd_state |= NEWBLOCK;
8125	pagedep->pd_newdirblk = newdirblk;
8126	newdirblk->db_pagedep = pagedep;
8127	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8128	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8129	/*
8130	 * Look up the inodedep for the parent directory so that we
8131	 * can link mkdir2 into the pending dotdot jaddref or
8132	 * the inode write if there is none.  If the inode is
8133	 * ALLCOMPLETE and no jaddref is present all dependencies have
8134	 * been satisfied and mkdir2 can be freed.
8135	 */
8136	inodedep_lookup(mp, dinum, 0, &inodedep);
8137	if (MOUNTEDSUJ(mp)) {
8138		if (inodedep == NULL)
8139			panic("setup_newdir: Lost parent.");
8140		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8141		    inoreflst);
8142		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8143		    (jaddref->ja_state & MKDIR_PARENT),
8144		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8145		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
8146		mkdir2->md_jaddref = jaddref;
8147		jaddref->ja_mkdir = mkdir2;
8148	} else if (inodedep == NULL ||
8149	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8150		dap->da_state &= ~MKDIR_PARENT;
8151		WORKITEM_FREE(mkdir2, D_MKDIR);
8152	} else {
8153		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
8154		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8155	}
8156	*mkdirp = mkdir2;
8157
8158	return (mkdir1);
8159}
8160
8161/*
8162 * Directory entry addition dependencies.
8163 *
8164 * When adding a new directory entry, the inode (with its incremented link
8165 * count) must be written to disk before the directory entry's pointer to it.
8166 * Also, if the inode is newly allocated, the corresponding freemap must be
8167 * updated (on disk) before the directory entry's pointer. These requirements
8168 * are met via undo/redo on the directory entry's pointer, which consists
8169 * simply of the inode number.
8170 *
8171 * As directory entries are added and deleted, the free space within a
8172 * directory block can become fragmented.  The ufs filesystem will compact
8173 * a fragmented directory block to make space for a new entry. When this
8174 * occurs, the offsets of previously added entries change. Any "diradd"
8175 * dependency structures corresponding to these entries must be updated with
8176 * the new offsets.
8177 */
8178
8179/*
8180 * This routine is called after the in-memory inode's link
8181 * count has been incremented, but before the directory entry's
8182 * pointer to the inode has been set.
8183 */
8184int
8185softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8186	struct buf *bp;		/* buffer containing directory block */
8187	struct inode *dp;	/* inode for directory */
8188	off_t diroffset;	/* offset of new entry in directory */
8189	ino_t newinum;		/* inode referenced by new directory entry */
8190	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8191	int isnewblk;		/* entry is in a newly allocated block */
8192{
8193	int offset;		/* offset of new entry within directory block */
8194	ufs_lbn_t lbn;		/* block in directory containing new entry */
8195	struct fs *fs;
8196	struct diradd *dap;
8197	struct newblk *newblk;
8198	struct pagedep *pagedep;
8199	struct inodedep *inodedep;
8200	struct newdirblk *newdirblk = 0;
8201	struct mkdir *mkdir1, *mkdir2;
8202	struct jaddref *jaddref;
8203	struct mount *mp;
8204	int isindir;
8205
8206	/*
8207	 * Whiteouts have no dependencies.
8208	 */
8209	if (newinum == WINO) {
8210		if (newdirbp != NULL)
8211			bdwrite(newdirbp);
8212		return (0);
8213	}
8214	jaddref = NULL;
8215	mkdir1 = mkdir2 = NULL;
8216	mp = UFSTOVFS(dp->i_ump);
8217	fs = dp->i_fs;
8218	lbn = lblkno(fs, diroffset);
8219	offset = blkoff(fs, diroffset);
8220	dap = malloc(sizeof(struct diradd), M_DIRADD,
8221		M_SOFTDEP_FLAGS|M_ZERO);
8222	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8223	dap->da_offset = offset;
8224	dap->da_newinum = newinum;
8225	dap->da_state = ATTACHED;
8226	LIST_INIT(&dap->da_jwork);
8227	isindir = bp->b_lblkno >= NDADDR;
8228	if (isnewblk &&
8229	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8230		newdirblk = malloc(sizeof(struct newdirblk),
8231		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8232		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8233		LIST_INIT(&newdirblk->db_mkdir);
8234	}
8235	/*
8236	 * If we're creating a new directory setup the dependencies and set
8237	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8238	 * we can move on.
8239	 */
8240	if (newdirbp == NULL) {
8241		dap->da_state |= DEPCOMPLETE;
8242		ACQUIRE_LOCK(&lk);
8243	} else {
8244		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8245		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8246		    &mkdir2);
8247	}
8248	/*
8249	 * Link into parent directory pagedep to await its being written.
8250	 */
8251	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8252#ifdef DEBUG
8253	if (diradd_lookup(pagedep, offset) != NULL)
8254		panic("softdep_setup_directory_add: %p already at off %d\n",
8255		    diradd_lookup(pagedep, offset), offset);
8256#endif
8257	dap->da_pagedep = pagedep;
8258	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8259	    da_pdlist);
8260	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8261	/*
8262	 * If we're journaling, link the diradd into the jaddref so it
8263	 * may be completed after the journal entry is written.  Otherwise,
8264	 * link the diradd into its inodedep.  If the inode is not yet
8265	 * written place it on the bufwait list, otherwise do the post-inode
8266	 * write processing to put it on the id_pendinghd list.
8267	 */
8268	if (MOUNTEDSUJ(mp)) {
8269		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8270		    inoreflst);
8271		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8272		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8273		jaddref->ja_diroff = diroffset;
8274		jaddref->ja_diradd = dap;
8275		add_to_journal(&jaddref->ja_list);
8276	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8277		diradd_inode_written(dap, inodedep);
8278	else
8279		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8280	/*
8281	 * Add the journal entries for . and .. links now that the primary
8282	 * link is written.
8283	 */
8284	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8285		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8286		    inoreflst, if_deps);
8287		KASSERT(jaddref != NULL &&
8288		    jaddref->ja_ino == jaddref->ja_parent &&
8289		    (jaddref->ja_state & MKDIR_BODY),
8290		    ("softdep_setup_directory_add: bad dot jaddref %p",
8291		    jaddref));
8292		mkdir1->md_jaddref = jaddref;
8293		jaddref->ja_mkdir = mkdir1;
8294		/*
8295		 * It is important that the dotdot journal entry
8296		 * is added prior to the dot entry since dot writes
8297		 * both the dot and dotdot links.  These both must
8298		 * be added after the primary link for the journal
8299		 * to remain consistent.
8300		 */
8301		add_to_journal(&mkdir2->md_jaddref->ja_list);
8302		add_to_journal(&jaddref->ja_list);
8303	}
8304	/*
8305	 * If we are adding a new directory remember this diradd so that if
8306	 * we rename it we can keep the dot and dotdot dependencies.  If
8307	 * we are adding a new name for an inode that has a mkdiradd we
8308	 * must be in rename and we have to move the dot and dotdot
8309	 * dependencies to this new name.  The old name is being orphaned
8310	 * soon.
8311	 */
8312	if (mkdir1 != NULL) {
8313		if (inodedep->id_mkdiradd != NULL)
8314			panic("softdep_setup_directory_add: Existing mkdir");
8315		inodedep->id_mkdiradd = dap;
8316	} else if (inodedep->id_mkdiradd)
8317		merge_diradd(inodedep, dap);
8318	if (newdirblk) {
8319		/*
8320		 * There is nothing to do if we are already tracking
8321		 * this block.
8322		 */
8323		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8324			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8325			FREE_LOCK(&lk);
8326			return (0);
8327		}
8328		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8329		    == 0)
8330			panic("softdep_setup_directory_add: lost entry");
8331		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8332		pagedep->pd_state |= NEWBLOCK;
8333		pagedep->pd_newdirblk = newdirblk;
8334		newdirblk->db_pagedep = pagedep;
8335		FREE_LOCK(&lk);
8336		/*
8337		 * If we extended into an indirect signal direnter to sync.
8338		 */
8339		if (isindir)
8340			return (1);
8341		return (0);
8342	}
8343	FREE_LOCK(&lk);
8344	return (0);
8345}
8346
8347/*
8348 * This procedure is called to change the offset of a directory
8349 * entry when compacting a directory block which must be owned
8350 * exclusively by the caller. Note that the actual entry movement
8351 * must be done in this procedure to ensure that no I/O completions
8352 * occur while the move is in progress.
8353 */
8354void
8355softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8356	struct buf *bp;		/* Buffer holding directory block. */
8357	struct inode *dp;	/* inode for directory */
8358	caddr_t base;		/* address of dp->i_offset */
8359	caddr_t oldloc;		/* address of old directory location */
8360	caddr_t newloc;		/* address of new directory location */
8361	int entrysize;		/* size of directory entry */
8362{
8363	int offset, oldoffset, newoffset;
8364	struct pagedep *pagedep;
8365	struct jmvref *jmvref;
8366	struct diradd *dap;
8367	struct direct *de;
8368	struct mount *mp;
8369	ufs_lbn_t lbn;
8370	int flags;
8371
8372	mp = UFSTOVFS(dp->i_ump);
8373	de = (struct direct *)oldloc;
8374	jmvref = NULL;
8375	flags = 0;
8376	/*
8377	 * Moves are always journaled as it would be too complex to
8378	 * determine if any affected adds or removes are present in the
8379	 * journal.
8380	 */
8381	if (MOUNTEDSUJ(mp)) {
8382		flags = DEPALLOC;
8383		jmvref = newjmvref(dp, de->d_ino,
8384		    dp->i_offset + (oldloc - base),
8385		    dp->i_offset + (newloc - base));
8386	}
8387	lbn = lblkno(dp->i_fs, dp->i_offset);
8388	offset = blkoff(dp->i_fs, dp->i_offset);
8389	oldoffset = offset + (oldloc - base);
8390	newoffset = offset + (newloc - base);
8391	ACQUIRE_LOCK(&lk);
8392	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8393		goto done;
8394	dap = diradd_lookup(pagedep, oldoffset);
8395	if (dap) {
8396		dap->da_offset = newoffset;
8397		newoffset = DIRADDHASH(newoffset);
8398		oldoffset = DIRADDHASH(oldoffset);
8399		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8400		    newoffset != oldoffset) {
8401			LIST_REMOVE(dap, da_pdlist);
8402			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8403			    dap, da_pdlist);
8404		}
8405	}
8406done:
8407	if (jmvref) {
8408		jmvref->jm_pagedep = pagedep;
8409		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8410		add_to_journal(&jmvref->jm_list);
8411	}
8412	bcopy(oldloc, newloc, entrysize);
8413	FREE_LOCK(&lk);
8414}
8415
8416/*
8417 * Move the mkdir dependencies and journal work from one diradd to another
8418 * when renaming a directory.  The new name must depend on the mkdir deps
8419 * completing as the old name did.  Directories can only have one valid link
8420 * at a time so one must be canonical.
8421 */
8422static void
8423merge_diradd(inodedep, newdap)
8424	struct inodedep *inodedep;
8425	struct diradd *newdap;
8426{
8427	struct diradd *olddap;
8428	struct mkdir *mkdir, *nextmd;
8429	short state;
8430
8431	olddap = inodedep->id_mkdiradd;
8432	inodedep->id_mkdiradd = newdap;
8433	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8434		newdap->da_state &= ~DEPCOMPLETE;
8435		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8436			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8437			if (mkdir->md_diradd != olddap)
8438				continue;
8439			mkdir->md_diradd = newdap;
8440			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8441			newdap->da_state |= state;
8442			olddap->da_state &= ~state;
8443			if ((olddap->da_state &
8444			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8445				break;
8446		}
8447		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8448			panic("merge_diradd: unfound ref");
8449	}
8450	/*
8451	 * Any mkdir related journal items are not safe to be freed until
8452	 * the new name is stable.
8453	 */
8454	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8455	olddap->da_state |= DEPCOMPLETE;
8456	complete_diradd(olddap);
8457}
8458
8459/*
8460 * Move the diradd to the pending list when all diradd dependencies are
8461 * complete.
8462 */
8463static void
8464complete_diradd(dap)
8465	struct diradd *dap;
8466{
8467	struct pagedep *pagedep;
8468
8469	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8470		if (dap->da_state & DIRCHG)
8471			pagedep = dap->da_previous->dm_pagedep;
8472		else
8473			pagedep = dap->da_pagedep;
8474		LIST_REMOVE(dap, da_pdlist);
8475		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8476	}
8477}
8478
8479/*
8480 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8481 * add entries and conditonally journal the remove.
8482 */
8483static void
8484cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8485	struct diradd *dap;
8486	struct dirrem *dirrem;
8487	struct jremref *jremref;
8488	struct jremref *dotremref;
8489	struct jremref *dotdotremref;
8490{
8491	struct inodedep *inodedep;
8492	struct jaddref *jaddref;
8493	struct inoref *inoref;
8494	struct mkdir *mkdir;
8495
8496	/*
8497	 * If no remove references were allocated we're on a non-journaled
8498	 * filesystem and can skip the cancel step.
8499	 */
8500	if (jremref == NULL) {
8501		free_diradd(dap, NULL);
8502		return;
8503	}
8504	/*
8505	 * Cancel the primary name an free it if it does not require
8506	 * journaling.
8507	 */
8508	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8509	    0, &inodedep) != 0) {
8510		/* Abort the addref that reference this diradd.  */
8511		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8512			if (inoref->if_list.wk_type != D_JADDREF)
8513				continue;
8514			jaddref = (struct jaddref *)inoref;
8515			if (jaddref->ja_diradd != dap)
8516				continue;
8517			if (cancel_jaddref(jaddref, inodedep,
8518			    &dirrem->dm_jwork) == 0) {
8519				free_jremref(jremref);
8520				jremref = NULL;
8521			}
8522			break;
8523		}
8524	}
8525	/*
8526	 * Cancel subordinate names and free them if they do not require
8527	 * journaling.
8528	 */
8529	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8530		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
8531			if (mkdir->md_diradd != dap)
8532				continue;
8533			if ((jaddref = mkdir->md_jaddref) == NULL)
8534				continue;
8535			mkdir->md_jaddref = NULL;
8536			if (mkdir->md_state & MKDIR_PARENT) {
8537				if (cancel_jaddref(jaddref, NULL,
8538				    &dirrem->dm_jwork) == 0) {
8539					free_jremref(dotdotremref);
8540					dotdotremref = NULL;
8541				}
8542			} else {
8543				if (cancel_jaddref(jaddref, inodedep,
8544				    &dirrem->dm_jwork) == 0) {
8545					free_jremref(dotremref);
8546					dotremref = NULL;
8547				}
8548			}
8549		}
8550	}
8551
8552	if (jremref)
8553		journal_jremref(dirrem, jremref, inodedep);
8554	if (dotremref)
8555		journal_jremref(dirrem, dotremref, inodedep);
8556	if (dotdotremref)
8557		journal_jremref(dirrem, dotdotremref, NULL);
8558	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8559	free_diradd(dap, &dirrem->dm_jwork);
8560}
8561
8562/*
8563 * Free a diradd dependency structure. This routine must be called
8564 * with splbio interrupts blocked.
8565 */
8566static void
8567free_diradd(dap, wkhd)
8568	struct diradd *dap;
8569	struct workhead *wkhd;
8570{
8571	struct dirrem *dirrem;
8572	struct pagedep *pagedep;
8573	struct inodedep *inodedep;
8574	struct mkdir *mkdir, *nextmd;
8575
8576	mtx_assert(&lk, MA_OWNED);
8577	LIST_REMOVE(dap, da_pdlist);
8578	if (dap->da_state & ONWORKLIST)
8579		WORKLIST_REMOVE(&dap->da_list);
8580	if ((dap->da_state & DIRCHG) == 0) {
8581		pagedep = dap->da_pagedep;
8582	} else {
8583		dirrem = dap->da_previous;
8584		pagedep = dirrem->dm_pagedep;
8585		dirrem->dm_dirinum = pagedep->pd_ino;
8586		dirrem->dm_state |= COMPLETE;
8587		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8588			add_to_worklist(&dirrem->dm_list, 0);
8589	}
8590	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8591	    0, &inodedep) != 0)
8592		if (inodedep->id_mkdiradd == dap)
8593			inodedep->id_mkdiradd = NULL;
8594	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8595		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
8596			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8597			if (mkdir->md_diradd != dap)
8598				continue;
8599			dap->da_state &=
8600			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8601			LIST_REMOVE(mkdir, md_mkdirs);
8602			if (mkdir->md_state & ONWORKLIST)
8603				WORKLIST_REMOVE(&mkdir->md_list);
8604			if (mkdir->md_jaddref != NULL)
8605				panic("free_diradd: Unexpected jaddref");
8606			WORKITEM_FREE(mkdir, D_MKDIR);
8607			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8608				break;
8609		}
8610		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8611			panic("free_diradd: unfound ref");
8612	}
8613	if (inodedep)
8614		free_inodedep(inodedep);
8615	/*
8616	 * Free any journal segments waiting for the directory write.
8617	 */
8618	handle_jwork(&dap->da_jwork);
8619	WORKITEM_FREE(dap, D_DIRADD);
8620}
8621
8622/*
8623 * Directory entry removal dependencies.
8624 *
8625 * When removing a directory entry, the entry's inode pointer must be
8626 * zero'ed on disk before the corresponding inode's link count is decremented
8627 * (possibly freeing the inode for re-use). This dependency is handled by
8628 * updating the directory entry but delaying the inode count reduction until
8629 * after the directory block has been written to disk. After this point, the
8630 * inode count can be decremented whenever it is convenient.
8631 */
8632
8633/*
8634 * This routine should be called immediately after removing
8635 * a directory entry.  The inode's link count should not be
8636 * decremented by the calling procedure -- the soft updates
8637 * code will do this task when it is safe.
8638 */
8639void
8640softdep_setup_remove(bp, dp, ip, isrmdir)
8641	struct buf *bp;		/* buffer containing directory block */
8642	struct inode *dp;	/* inode for the directory being modified */
8643	struct inode *ip;	/* inode for directory entry being removed */
8644	int isrmdir;		/* indicates if doing RMDIR */
8645{
8646	struct dirrem *dirrem, *prevdirrem;
8647	struct inodedep *inodedep;
8648	int direct;
8649
8650	/*
8651	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8652	 * newdirrem() to setup the full directory remove which requires
8653	 * isrmdir > 1.
8654	 */
8655	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8656	/*
8657	 * Add the dirrem to the inodedep's pending remove list for quick
8658	 * discovery later.
8659	 */
8660	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8661	    &inodedep) == 0)
8662		panic("softdep_setup_remove: Lost inodedep.");
8663	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8664	dirrem->dm_state |= ONDEPLIST;
8665	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8666
8667	/*
8668	 * If the COMPLETE flag is clear, then there were no active
8669	 * entries and we want to roll back to a zeroed entry until
8670	 * the new inode is committed to disk. If the COMPLETE flag is
8671	 * set then we have deleted an entry that never made it to
8672	 * disk. If the entry we deleted resulted from a name change,
8673	 * then the old name still resides on disk. We cannot delete
8674	 * its inode (returned to us in prevdirrem) until the zeroed
8675	 * directory entry gets to disk. The new inode has never been
8676	 * referenced on the disk, so can be deleted immediately.
8677	 */
8678	if ((dirrem->dm_state & COMPLETE) == 0) {
8679		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8680		    dm_next);
8681		FREE_LOCK(&lk);
8682	} else {
8683		if (prevdirrem != NULL)
8684			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8685			    prevdirrem, dm_next);
8686		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8687		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8688		FREE_LOCK(&lk);
8689		if (direct)
8690			handle_workitem_remove(dirrem, 0);
8691	}
8692}
8693
8694/*
8695 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
8696 * pd_pendinghd list of a pagedep.
8697 */
8698static struct diradd *
8699diradd_lookup(pagedep, offset)
8700	struct pagedep *pagedep;
8701	int offset;
8702{
8703	struct diradd *dap;
8704
8705	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8706		if (dap->da_offset == offset)
8707			return (dap);
8708	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8709		if (dap->da_offset == offset)
8710			return (dap);
8711	return (NULL);
8712}
8713
8714/*
8715 * Search for a .. diradd dependency in a directory that is being removed.
8716 * If the directory was renamed to a new parent we have a diradd rather
8717 * than a mkdir for the .. entry.  We need to cancel it now before
8718 * it is found in truncate().
8719 */
8720static struct jremref *
8721cancel_diradd_dotdot(ip, dirrem, jremref)
8722	struct inode *ip;
8723	struct dirrem *dirrem;
8724	struct jremref *jremref;
8725{
8726	struct pagedep *pagedep;
8727	struct diradd *dap;
8728	struct worklist *wk;
8729
8730	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8731	    &pagedep) == 0)
8732		return (jremref);
8733	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8734	if (dap == NULL)
8735		return (jremref);
8736	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8737	/*
8738	 * Mark any journal work as belonging to the parent so it is freed
8739	 * with the .. reference.
8740	 */
8741	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8742		wk->wk_state |= MKDIR_PARENT;
8743	return (NULL);
8744}
8745
8746/*
8747 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
8748 * replace it with a dirrem/diradd pair as a result of re-parenting a
8749 * directory.  This ensures that we don't simultaneously have a mkdir and
8750 * a diradd for the same .. entry.
8751 */
8752static struct jremref *
8753cancel_mkdir_dotdot(ip, dirrem, jremref)
8754	struct inode *ip;
8755	struct dirrem *dirrem;
8756	struct jremref *jremref;
8757{
8758	struct inodedep *inodedep;
8759	struct jaddref *jaddref;
8760	struct mkdir *mkdir;
8761	struct diradd *dap;
8762
8763	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8764	    &inodedep) == 0)
8765		return (jremref);
8766	dap = inodedep->id_mkdiradd;
8767	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
8768		return (jremref);
8769	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
8770	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
8771		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
8772			break;
8773	if (mkdir == NULL)
8774		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
8775	if ((jaddref = mkdir->md_jaddref) != NULL) {
8776		mkdir->md_jaddref = NULL;
8777		jaddref->ja_state &= ~MKDIR_PARENT;
8778		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
8779		    &inodedep) == 0)
8780			panic("cancel_mkdir_dotdot: Lost parent inodedep");
8781		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
8782			journal_jremref(dirrem, jremref, inodedep);
8783			jremref = NULL;
8784		}
8785	}
8786	if (mkdir->md_state & ONWORKLIST)
8787		WORKLIST_REMOVE(&mkdir->md_list);
8788	mkdir->md_state |= ALLCOMPLETE;
8789	complete_mkdir(mkdir);
8790	return (jremref);
8791}
8792
8793static void
8794journal_jremref(dirrem, jremref, inodedep)
8795	struct dirrem *dirrem;
8796	struct jremref *jremref;
8797	struct inodedep *inodedep;
8798{
8799
8800	if (inodedep == NULL)
8801		if (inodedep_lookup(jremref->jr_list.wk_mp,
8802		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
8803			panic("journal_jremref: Lost inodedep");
8804	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
8805	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
8806	add_to_journal(&jremref->jr_list);
8807}
8808
8809static void
8810dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
8811	struct dirrem *dirrem;
8812	struct jremref *jremref;
8813	struct jremref *dotremref;
8814	struct jremref *dotdotremref;
8815{
8816	struct inodedep *inodedep;
8817
8818
8819	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
8820	    &inodedep) == 0)
8821		panic("dirrem_journal: Lost inodedep");
8822	journal_jremref(dirrem, jremref, inodedep);
8823	if (dotremref)
8824		journal_jremref(dirrem, dotremref, inodedep);
8825	if (dotdotremref)
8826		journal_jremref(dirrem, dotdotremref, NULL);
8827}
8828
8829/*
8830 * Allocate a new dirrem if appropriate and return it along with
8831 * its associated pagedep. Called without a lock, returns with lock.
8832 */
8833static struct dirrem *
8834newdirrem(bp, dp, ip, isrmdir, prevdirremp)
8835	struct buf *bp;		/* buffer containing directory block */
8836	struct inode *dp;	/* inode for the directory being modified */
8837	struct inode *ip;	/* inode for directory entry being removed */
8838	int isrmdir;		/* indicates if doing RMDIR */
8839	struct dirrem **prevdirremp; /* previously referenced inode, if any */
8840{
8841	int offset;
8842	ufs_lbn_t lbn;
8843	struct diradd *dap;
8844	struct dirrem *dirrem;
8845	struct pagedep *pagedep;
8846	struct jremref *jremref;
8847	struct jremref *dotremref;
8848	struct jremref *dotdotremref;
8849	struct vnode *dvp;
8850
8851	/*
8852	 * Whiteouts have no deletion dependencies.
8853	 */
8854	if (ip == NULL)
8855		panic("newdirrem: whiteout");
8856	dvp = ITOV(dp);
8857	/*
8858	 * If we are over our limit, try to improve the situation.
8859	 * Limiting the number of dirrem structures will also limit
8860	 * the number of freefile and freeblks structures.
8861	 */
8862	ACQUIRE_LOCK(&lk);
8863	if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2)
8864		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
8865	FREE_LOCK(&lk);
8866	dirrem = malloc(sizeof(struct dirrem),
8867		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
8868	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
8869	LIST_INIT(&dirrem->dm_jremrefhd);
8870	LIST_INIT(&dirrem->dm_jwork);
8871	dirrem->dm_state = isrmdir ? RMDIR : 0;
8872	dirrem->dm_oldinum = ip->i_number;
8873	*prevdirremp = NULL;
8874	/*
8875	 * Allocate remove reference structures to track journal write
8876	 * dependencies.  We will always have one for the link and
8877	 * when doing directories we will always have one more for dot.
8878	 * When renaming a directory we skip the dotdot link change so
8879	 * this is not needed.
8880	 */
8881	jremref = dotremref = dotdotremref = NULL;
8882	if (DOINGSUJ(dvp)) {
8883		if (isrmdir) {
8884			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8885			    ip->i_effnlink + 2);
8886			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
8887			    ip->i_effnlink + 1);
8888			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
8889			    dp->i_effnlink + 1);
8890			dotdotremref->jr_state |= MKDIR_PARENT;
8891		} else
8892			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8893			    ip->i_effnlink + 1);
8894	}
8895	ACQUIRE_LOCK(&lk);
8896	lbn = lblkno(dp->i_fs, dp->i_offset);
8897	offset = blkoff(dp->i_fs, dp->i_offset);
8898	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
8899	    &pagedep);
8900	dirrem->dm_pagedep = pagedep;
8901	dirrem->dm_offset = offset;
8902	/*
8903	 * If we're renaming a .. link to a new directory, cancel any
8904	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
8905	 * the jremref is preserved for any potential diradd in this
8906	 * location.  This can not coincide with a rmdir.
8907	 */
8908	if (dp->i_offset == DOTDOT_OFFSET) {
8909		if (isrmdir)
8910			panic("newdirrem: .. directory change during remove?");
8911		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
8912	}
8913	/*
8914	 * If we're removing a directory search for the .. dependency now and
8915	 * cancel it.  Any pending journal work will be added to the dirrem
8916	 * to be completed when the workitem remove completes.
8917	 */
8918	if (isrmdir)
8919		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
8920	/*
8921	 * Check for a diradd dependency for the same directory entry.
8922	 * If present, then both dependencies become obsolete and can
8923	 * be de-allocated.
8924	 */
8925	dap = diradd_lookup(pagedep, offset);
8926	if (dap == NULL) {
8927		/*
8928		 * Link the jremref structures into the dirrem so they are
8929		 * written prior to the pagedep.
8930		 */
8931		if (jremref)
8932			dirrem_journal(dirrem, jremref, dotremref,
8933			    dotdotremref);
8934		return (dirrem);
8935	}
8936	/*
8937	 * Must be ATTACHED at this point.
8938	 */
8939	if ((dap->da_state & ATTACHED) == 0)
8940		panic("newdirrem: not ATTACHED");
8941	if (dap->da_newinum != ip->i_number)
8942		panic("newdirrem: inum %ju should be %ju",
8943		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
8944	/*
8945	 * If we are deleting a changed name that never made it to disk,
8946	 * then return the dirrem describing the previous inode (which
8947	 * represents the inode currently referenced from this entry on disk).
8948	 */
8949	if ((dap->da_state & DIRCHG) != 0) {
8950		*prevdirremp = dap->da_previous;
8951		dap->da_state &= ~DIRCHG;
8952		dap->da_pagedep = pagedep;
8953	}
8954	/*
8955	 * We are deleting an entry that never made it to disk.
8956	 * Mark it COMPLETE so we can delete its inode immediately.
8957	 */
8958	dirrem->dm_state |= COMPLETE;
8959	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
8960#ifdef SUJ_DEBUG
8961	if (isrmdir == 0) {
8962		struct worklist *wk;
8963
8964		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8965			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
8966				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
8967	}
8968#endif
8969
8970	return (dirrem);
8971}
8972
8973/*
8974 * Directory entry change dependencies.
8975 *
8976 * Changing an existing directory entry requires that an add operation
8977 * be completed first followed by a deletion. The semantics for the addition
8978 * are identical to the description of adding a new entry above except
8979 * that the rollback is to the old inode number rather than zero. Once
8980 * the addition dependency is completed, the removal is done as described
8981 * in the removal routine above.
8982 */
8983
8984/*
8985 * This routine should be called immediately after changing
8986 * a directory entry.  The inode's link count should not be
8987 * decremented by the calling procedure -- the soft updates
8988 * code will perform this task when it is safe.
8989 */
8990void
8991softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
8992	struct buf *bp;		/* buffer containing directory block */
8993	struct inode *dp;	/* inode for the directory being modified */
8994	struct inode *ip;	/* inode for directory entry being removed */
8995	ino_t newinum;		/* new inode number for changed entry */
8996	int isrmdir;		/* indicates if doing RMDIR */
8997{
8998	int offset;
8999	struct diradd *dap = NULL;
9000	struct dirrem *dirrem, *prevdirrem;
9001	struct pagedep *pagedep;
9002	struct inodedep *inodedep;
9003	struct jaddref *jaddref;
9004	struct mount *mp;
9005
9006	offset = blkoff(dp->i_fs, dp->i_offset);
9007	mp = UFSTOVFS(dp->i_ump);
9008
9009	/*
9010	 * Whiteouts do not need diradd dependencies.
9011	 */
9012	if (newinum != WINO) {
9013		dap = malloc(sizeof(struct diradd),
9014		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9015		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9016		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9017		dap->da_offset = offset;
9018		dap->da_newinum = newinum;
9019		LIST_INIT(&dap->da_jwork);
9020	}
9021
9022	/*
9023	 * Allocate a new dirrem and ACQUIRE_LOCK.
9024	 */
9025	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9026	pagedep = dirrem->dm_pagedep;
9027	/*
9028	 * The possible values for isrmdir:
9029	 *	0 - non-directory file rename
9030	 *	1 - directory rename within same directory
9031	 *   inum - directory rename to new directory of given inode number
9032	 * When renaming to a new directory, we are both deleting and
9033	 * creating a new directory entry, so the link count on the new
9034	 * directory should not change. Thus we do not need the followup
9035	 * dirrem which is usually done in handle_workitem_remove. We set
9036	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9037	 * followup dirrem.
9038	 */
9039	if (isrmdir > 1)
9040		dirrem->dm_state |= DIRCHG;
9041
9042	/*
9043	 * Whiteouts have no additional dependencies,
9044	 * so just put the dirrem on the correct list.
9045	 */
9046	if (newinum == WINO) {
9047		if ((dirrem->dm_state & COMPLETE) == 0) {
9048			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9049			    dm_next);
9050		} else {
9051			dirrem->dm_dirinum = pagedep->pd_ino;
9052			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9053				add_to_worklist(&dirrem->dm_list, 0);
9054		}
9055		FREE_LOCK(&lk);
9056		return;
9057	}
9058	/*
9059	 * Add the dirrem to the inodedep's pending remove list for quick
9060	 * discovery later.  A valid nlinkdelta ensures that this lookup
9061	 * will not fail.
9062	 */
9063	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9064		panic("softdep_setup_directory_change: Lost inodedep.");
9065	dirrem->dm_state |= ONDEPLIST;
9066	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9067
9068	/*
9069	 * If the COMPLETE flag is clear, then there were no active
9070	 * entries and we want to roll back to the previous inode until
9071	 * the new inode is committed to disk. If the COMPLETE flag is
9072	 * set, then we have deleted an entry that never made it to disk.
9073	 * If the entry we deleted resulted from a name change, then the old
9074	 * inode reference still resides on disk. Any rollback that we do
9075	 * needs to be to that old inode (returned to us in prevdirrem). If
9076	 * the entry we deleted resulted from a create, then there is
9077	 * no entry on the disk, so we want to roll back to zero rather
9078	 * than the uncommitted inode. In either of the COMPLETE cases we
9079	 * want to immediately free the unwritten and unreferenced inode.
9080	 */
9081	if ((dirrem->dm_state & COMPLETE) == 0) {
9082		dap->da_previous = dirrem;
9083	} else {
9084		if (prevdirrem != NULL) {
9085			dap->da_previous = prevdirrem;
9086		} else {
9087			dap->da_state &= ~DIRCHG;
9088			dap->da_pagedep = pagedep;
9089		}
9090		dirrem->dm_dirinum = pagedep->pd_ino;
9091		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9092			add_to_worklist(&dirrem->dm_list, 0);
9093	}
9094	/*
9095	 * Lookup the jaddref for this journal entry.  We must finish
9096	 * initializing it and make the diradd write dependent on it.
9097	 * If we're not journaling, put it on the id_bufwait list if the
9098	 * inode is not yet written. If it is written, do the post-inode
9099	 * write processing to put it on the id_pendinghd list.
9100	 */
9101	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
9102	if (MOUNTEDSUJ(mp)) {
9103		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9104		    inoreflst);
9105		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9106		    ("softdep_setup_directory_change: bad jaddref %p",
9107		    jaddref));
9108		jaddref->ja_diroff = dp->i_offset;
9109		jaddref->ja_diradd = dap;
9110		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9111		    dap, da_pdlist);
9112		add_to_journal(&jaddref->ja_list);
9113	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9114		dap->da_state |= COMPLETE;
9115		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9116		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9117	} else {
9118		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9119		    dap, da_pdlist);
9120		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9121	}
9122	/*
9123	 * If we're making a new name for a directory that has not been
9124	 * committed when need to move the dot and dotdot references to
9125	 * this new name.
9126	 */
9127	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9128		merge_diradd(inodedep, dap);
9129	FREE_LOCK(&lk);
9130}
9131
9132/*
9133 * Called whenever the link count on an inode is changed.
9134 * It creates an inode dependency so that the new reference(s)
9135 * to the inode cannot be committed to disk until the updated
9136 * inode has been written.
9137 */
9138void
9139softdep_change_linkcnt(ip)
9140	struct inode *ip;	/* the inode with the increased link count */
9141{
9142	struct inodedep *inodedep;
9143	int dflags;
9144
9145	ACQUIRE_LOCK(&lk);
9146	dflags = DEPALLOC;
9147	if (IS_SNAPSHOT(ip))
9148		dflags |= NODELAY;
9149	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
9150	if (ip->i_nlink < ip->i_effnlink)
9151		panic("softdep_change_linkcnt: bad delta");
9152	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9153	FREE_LOCK(&lk);
9154}
9155
9156/*
9157 * Attach a sbdep dependency to the superblock buf so that we can keep
9158 * track of the head of the linked list of referenced but unlinked inodes.
9159 */
9160void
9161softdep_setup_sbupdate(ump, fs, bp)
9162	struct ufsmount *ump;
9163	struct fs *fs;
9164	struct buf *bp;
9165{
9166	struct sbdep *sbdep;
9167	struct worklist *wk;
9168
9169	if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0)
9170		return;
9171	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9172		if (wk->wk_type == D_SBDEP)
9173			break;
9174	if (wk != NULL)
9175		return;
9176	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9177	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9178	sbdep->sb_fs = fs;
9179	sbdep->sb_ump = ump;
9180	ACQUIRE_LOCK(&lk);
9181	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9182	FREE_LOCK(&lk);
9183}
9184
9185/*
9186 * Return the first unlinked inodedep which is ready to be the head of the
9187 * list.  The inodedep and all those after it must have valid next pointers.
9188 */
9189static struct inodedep *
9190first_unlinked_inodedep(ump)
9191	struct ufsmount *ump;
9192{
9193	struct inodedep *inodedep;
9194	struct inodedep *idp;
9195
9196	mtx_assert(&lk, MA_OWNED);
9197	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9198	    inodedep; inodedep = idp) {
9199		if ((inodedep->id_state & UNLINKNEXT) == 0)
9200			return (NULL);
9201		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9202		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9203			break;
9204		if ((inodedep->id_state & UNLINKPREV) == 0)
9205			break;
9206	}
9207	return (inodedep);
9208}
9209
9210/*
9211 * Set the sujfree unlinked head pointer prior to writing a superblock.
9212 */
9213static void
9214initiate_write_sbdep(sbdep)
9215	struct sbdep *sbdep;
9216{
9217	struct inodedep *inodedep;
9218	struct fs *bpfs;
9219	struct fs *fs;
9220
9221	bpfs = sbdep->sb_fs;
9222	fs = sbdep->sb_ump->um_fs;
9223	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9224	if (inodedep) {
9225		fs->fs_sujfree = inodedep->id_ino;
9226		inodedep->id_state |= UNLINKPREV;
9227	} else
9228		fs->fs_sujfree = 0;
9229	bpfs->fs_sujfree = fs->fs_sujfree;
9230}
9231
9232/*
9233 * After a superblock is written determine whether it must be written again
9234 * due to a changing unlinked list head.
9235 */
9236static int
9237handle_written_sbdep(sbdep, bp)
9238	struct sbdep *sbdep;
9239	struct buf *bp;
9240{
9241	struct inodedep *inodedep;
9242	struct mount *mp;
9243	struct fs *fs;
9244
9245	mtx_assert(&lk, MA_OWNED);
9246	fs = sbdep->sb_fs;
9247	mp = UFSTOVFS(sbdep->sb_ump);
9248	/*
9249	 * If the superblock doesn't match the in-memory list start over.
9250	 */
9251	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9252	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9253	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9254		bdirty(bp);
9255		return (1);
9256	}
9257	WORKITEM_FREE(sbdep, D_SBDEP);
9258	if (fs->fs_sujfree == 0)
9259		return (0);
9260	/*
9261	 * Now that we have a record of this inode in stable store allow it
9262	 * to be written to free up pending work.  Inodes may see a lot of
9263	 * write activity after they are unlinked which we must not hold up.
9264	 */
9265	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9266		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9267			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9268			    inodedep, inodedep->id_state);
9269		if (inodedep->id_state & UNLINKONLIST)
9270			break;
9271		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9272	}
9273
9274	return (0);
9275}
9276
9277/*
9278 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9279 */
9280static void
9281unlinked_inodedep(mp, inodedep)
9282	struct mount *mp;
9283	struct inodedep *inodedep;
9284{
9285	struct ufsmount *ump;
9286
9287	mtx_assert(&lk, MA_OWNED);
9288	if (MOUNTEDSUJ(mp) == 0)
9289		return;
9290	ump = VFSTOUFS(mp);
9291	ump->um_fs->fs_fmod = 1;
9292	if (inodedep->id_state & UNLINKED)
9293		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9294	inodedep->id_state |= UNLINKED;
9295	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9296}
9297
9298/*
9299 * Remove an inodedep from the unlinked inodedep list.  This may require
9300 * disk writes if the inode has made it that far.
9301 */
9302static void
9303clear_unlinked_inodedep(inodedep)
9304	struct inodedep *inodedep;
9305{
9306	struct ufsmount *ump;
9307	struct inodedep *idp;
9308	struct inodedep *idn;
9309	struct fs *fs;
9310	struct buf *bp;
9311	ino_t ino;
9312	ino_t nino;
9313	ino_t pino;
9314	int error;
9315
9316	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9317	fs = ump->um_fs;
9318	ino = inodedep->id_ino;
9319	error = 0;
9320	for (;;) {
9321		mtx_assert(&lk, MA_OWNED);
9322		KASSERT((inodedep->id_state & UNLINKED) != 0,
9323		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9324		    inodedep));
9325		/*
9326		 * If nothing has yet been written simply remove us from
9327		 * the in memory list and return.  This is the most common
9328		 * case where handle_workitem_remove() loses the final
9329		 * reference.
9330		 */
9331		if ((inodedep->id_state & UNLINKLINKS) == 0)
9332			break;
9333		/*
9334		 * If we have a NEXT pointer and no PREV pointer we can simply
9335		 * clear NEXT's PREV and remove ourselves from the list.  Be
9336		 * careful not to clear PREV if the superblock points at
9337		 * next as well.
9338		 */
9339		idn = TAILQ_NEXT(inodedep, id_unlinked);
9340		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9341			if (idn && fs->fs_sujfree != idn->id_ino)
9342				idn->id_state &= ~UNLINKPREV;
9343			break;
9344		}
9345		/*
9346		 * Here we have an inodedep which is actually linked into
9347		 * the list.  We must remove it by forcing a write to the
9348		 * link before us, whether it be the superblock or an inode.
9349		 * Unfortunately the list may change while we're waiting
9350		 * on the buf lock for either resource so we must loop until
9351		 * we lock the right one.  If both the superblock and an
9352		 * inode point to this inode we must clear the inode first
9353		 * followed by the superblock.
9354		 */
9355		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9356		pino = 0;
9357		if (idp && (idp->id_state & UNLINKNEXT))
9358			pino = idp->id_ino;
9359		FREE_LOCK(&lk);
9360		if (pino == 0)
9361			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9362			    (int)fs->fs_sbsize, 0, 0, 0);
9363		else
9364			error = bread(ump->um_devvp,
9365			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9366			    (int)fs->fs_bsize, NOCRED, &bp);
9367		ACQUIRE_LOCK(&lk);
9368		if (error)
9369			break;
9370		/* If the list has changed restart the loop. */
9371		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9372		nino = 0;
9373		if (idp && (idp->id_state & UNLINKNEXT))
9374			nino = idp->id_ino;
9375		if (nino != pino ||
9376		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9377			FREE_LOCK(&lk);
9378			brelse(bp);
9379			ACQUIRE_LOCK(&lk);
9380			continue;
9381		}
9382		nino = 0;
9383		idn = TAILQ_NEXT(inodedep, id_unlinked);
9384		if (idn)
9385			nino = idn->id_ino;
9386		/*
9387		 * Remove us from the in memory list.  After this we cannot
9388		 * access the inodedep.
9389		 */
9390		KASSERT((inodedep->id_state & UNLINKED) != 0,
9391		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9392		    inodedep));
9393		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9394		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9395		FREE_LOCK(&lk);
9396		/*
9397		 * The predecessor's next pointer is manually updated here
9398		 * so that the NEXT flag is never cleared for an element
9399		 * that is in the list.
9400		 */
9401		if (pino == 0) {
9402			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9403			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9404			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9405			    bp);
9406		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9407			((struct ufs1_dinode *)bp->b_data +
9408			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9409		else
9410			((struct ufs2_dinode *)bp->b_data +
9411			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9412		/*
9413		 * If the bwrite fails we have no recourse to recover.  The
9414		 * filesystem is corrupted already.
9415		 */
9416		bwrite(bp);
9417		ACQUIRE_LOCK(&lk);
9418		/*
9419		 * If the superblock pointer still needs to be cleared force
9420		 * a write here.
9421		 */
9422		if (fs->fs_sujfree == ino) {
9423			FREE_LOCK(&lk);
9424			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9425			    (int)fs->fs_sbsize, 0, 0, 0);
9426			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9427			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9428			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9429			    bp);
9430			bwrite(bp);
9431			ACQUIRE_LOCK(&lk);
9432		}
9433
9434		if (fs->fs_sujfree != ino)
9435			return;
9436		panic("clear_unlinked_inodedep: Failed to clear free head");
9437	}
9438	if (inodedep->id_ino == fs->fs_sujfree)
9439		panic("clear_unlinked_inodedep: Freeing head of free list");
9440	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9441	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9442	return;
9443}
9444
9445/*
9446 * This workitem decrements the inode's link count.
9447 * If the link count reaches zero, the file is removed.
9448 */
9449static int
9450handle_workitem_remove(dirrem, flags)
9451	struct dirrem *dirrem;
9452	int flags;
9453{
9454	struct inodedep *inodedep;
9455	struct workhead dotdotwk;
9456	struct worklist *wk;
9457	struct ufsmount *ump;
9458	struct mount *mp;
9459	struct vnode *vp;
9460	struct inode *ip;
9461	ino_t oldinum;
9462
9463	if (dirrem->dm_state & ONWORKLIST)
9464		panic("handle_workitem_remove: dirrem %p still on worklist",
9465		    dirrem);
9466	oldinum = dirrem->dm_oldinum;
9467	mp = dirrem->dm_list.wk_mp;
9468	ump = VFSTOUFS(mp);
9469	flags |= LK_EXCLUSIVE;
9470	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9471		return (EBUSY);
9472	ip = VTOI(vp);
9473	ACQUIRE_LOCK(&lk);
9474	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9475		panic("handle_workitem_remove: lost inodedep");
9476	if (dirrem->dm_state & ONDEPLIST)
9477		LIST_REMOVE(dirrem, dm_inonext);
9478	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9479	    ("handle_workitem_remove:  Journal entries not written."));
9480
9481	/*
9482	 * Move all dependencies waiting on the remove to complete
9483	 * from the dirrem to the inode inowait list to be completed
9484	 * after the inode has been updated and written to disk.  Any
9485	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9486	 * is removed.
9487	 */
9488	LIST_INIT(&dotdotwk);
9489	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9490		WORKLIST_REMOVE(wk);
9491		if (wk->wk_state & MKDIR_PARENT) {
9492			wk->wk_state &= ~MKDIR_PARENT;
9493			WORKLIST_INSERT(&dotdotwk, wk);
9494			continue;
9495		}
9496		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9497	}
9498	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9499	/*
9500	 * Normal file deletion.
9501	 */
9502	if ((dirrem->dm_state & RMDIR) == 0) {
9503		ip->i_nlink--;
9504		DIP_SET(ip, i_nlink, ip->i_nlink);
9505		ip->i_flag |= IN_CHANGE;
9506		if (ip->i_nlink < ip->i_effnlink)
9507			panic("handle_workitem_remove: bad file delta");
9508		if (ip->i_nlink == 0)
9509			unlinked_inodedep(mp, inodedep);
9510		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9511		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9512		    ("handle_workitem_remove: worklist not empty. %s",
9513		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9514		WORKITEM_FREE(dirrem, D_DIRREM);
9515		FREE_LOCK(&lk);
9516		goto out;
9517	}
9518	/*
9519	 * Directory deletion. Decrement reference count for both the
9520	 * just deleted parent directory entry and the reference for ".".
9521	 * Arrange to have the reference count on the parent decremented
9522	 * to account for the loss of "..".
9523	 */
9524	ip->i_nlink -= 2;
9525	DIP_SET(ip, i_nlink, ip->i_nlink);
9526	ip->i_flag |= IN_CHANGE;
9527	if (ip->i_nlink < ip->i_effnlink)
9528		panic("handle_workitem_remove: bad dir delta");
9529	if (ip->i_nlink == 0)
9530		unlinked_inodedep(mp, inodedep);
9531	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9532	/*
9533	 * Rename a directory to a new parent. Since, we are both deleting
9534	 * and creating a new directory entry, the link count on the new
9535	 * directory should not change. Thus we skip the followup dirrem.
9536	 */
9537	if (dirrem->dm_state & DIRCHG) {
9538		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9539		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9540		WORKITEM_FREE(dirrem, D_DIRREM);
9541		FREE_LOCK(&lk);
9542		goto out;
9543	}
9544	dirrem->dm_state = ONDEPLIST;
9545	dirrem->dm_oldinum = dirrem->dm_dirinum;
9546	/*
9547	 * Place the dirrem on the parent's diremhd list.
9548	 */
9549	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9550		panic("handle_workitem_remove: lost dir inodedep");
9551	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9552	/*
9553	 * If the allocated inode has never been written to disk, then
9554	 * the on-disk inode is zero'ed and we can remove the file
9555	 * immediately.  When journaling if the inode has been marked
9556	 * unlinked and not DEPCOMPLETE we know it can never be written.
9557	 */
9558	inodedep_lookup(mp, oldinum, 0, &inodedep);
9559	if (inodedep == NULL ||
9560	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9561	    check_inode_unwritten(inodedep)) {
9562		FREE_LOCK(&lk);
9563		vput(vp);
9564		return handle_workitem_remove(dirrem, flags);
9565	}
9566	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9567	FREE_LOCK(&lk);
9568	ip->i_flag |= IN_CHANGE;
9569out:
9570	ffs_update(vp, 0);
9571	vput(vp);
9572	return (0);
9573}
9574
9575/*
9576 * Inode de-allocation dependencies.
9577 *
9578 * When an inode's link count is reduced to zero, it can be de-allocated. We
9579 * found it convenient to postpone de-allocation until after the inode is
9580 * written to disk with its new link count (zero).  At this point, all of the
9581 * on-disk inode's block pointers are nullified and, with careful dependency
9582 * list ordering, all dependencies related to the inode will be satisfied and
9583 * the corresponding dependency structures de-allocated.  So, if/when the
9584 * inode is reused, there will be no mixing of old dependencies with new
9585 * ones.  This artificial dependency is set up by the block de-allocation
9586 * procedure above (softdep_setup_freeblocks) and completed by the
9587 * following procedure.
9588 */
9589static void
9590handle_workitem_freefile(freefile)
9591	struct freefile *freefile;
9592{
9593	struct workhead wkhd;
9594	struct fs *fs;
9595	struct inodedep *idp;
9596	struct ufsmount *ump;
9597	int error;
9598
9599	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9600	fs = ump->um_fs;
9601#ifdef DEBUG
9602	ACQUIRE_LOCK(&lk);
9603	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9604	FREE_LOCK(&lk);
9605	if (error)
9606		panic("handle_workitem_freefile: inodedep %p survived", idp);
9607#endif
9608	UFS_LOCK(ump);
9609	fs->fs_pendinginodes -= 1;
9610	UFS_UNLOCK(ump);
9611	LIST_INIT(&wkhd);
9612	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9613	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9614	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9615		softdep_error("handle_workitem_freefile", error);
9616	ACQUIRE_LOCK(&lk);
9617	WORKITEM_FREE(freefile, D_FREEFILE);
9618	FREE_LOCK(&lk);
9619}
9620
9621
9622/*
9623 * Helper function which unlinks marker element from work list and returns
9624 * the next element on the list.
9625 */
9626static __inline struct worklist *
9627markernext(struct worklist *marker)
9628{
9629	struct worklist *next;
9630
9631	next = LIST_NEXT(marker, wk_list);
9632	LIST_REMOVE(marker, wk_list);
9633	return next;
9634}
9635
9636/*
9637 * Disk writes.
9638 *
9639 * The dependency structures constructed above are most actively used when file
9640 * system blocks are written to disk.  No constraints are placed on when a
9641 * block can be written, but unsatisfied update dependencies are made safe by
9642 * modifying (or replacing) the source memory for the duration of the disk
9643 * write.  When the disk write completes, the memory block is again brought
9644 * up-to-date.
9645 *
9646 * In-core inode structure reclamation.
9647 *
9648 * Because there are a finite number of "in-core" inode structures, they are
9649 * reused regularly.  By transferring all inode-related dependencies to the
9650 * in-memory inode block and indexing them separately (via "inodedep"s), we
9651 * can allow "in-core" inode structures to be reused at any time and avoid
9652 * any increase in contention.
9653 *
9654 * Called just before entering the device driver to initiate a new disk I/O.
9655 * The buffer must be locked, thus, no I/O completion operations can occur
9656 * while we are manipulating its associated dependencies.
9657 */
9658static void
9659softdep_disk_io_initiation(bp)
9660	struct buf *bp;		/* structure describing disk write to occur */
9661{
9662	struct worklist *wk;
9663	struct worklist marker;
9664	struct inodedep *inodedep;
9665	struct freeblks *freeblks;
9666	struct jblkdep *jblkdep;
9667	struct newblk *newblk;
9668
9669	/*
9670	 * We only care about write operations. There should never
9671	 * be dependencies for reads.
9672	 */
9673	if (bp->b_iocmd != BIO_WRITE)
9674		panic("softdep_disk_io_initiation: not write");
9675
9676	if (bp->b_vflags & BV_BKGRDINPROG)
9677		panic("softdep_disk_io_initiation: Writing buffer with "
9678		    "background write in progress: %p", bp);
9679
9680	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9681	PHOLD(curproc);			/* Don't swap out kernel stack */
9682
9683	ACQUIRE_LOCK(&lk);
9684	/*
9685	 * Do any necessary pre-I/O processing.
9686	 */
9687	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9688	     wk = markernext(&marker)) {
9689		LIST_INSERT_AFTER(wk, &marker, wk_list);
9690		switch (wk->wk_type) {
9691
9692		case D_PAGEDEP:
9693			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9694			continue;
9695
9696		case D_INODEDEP:
9697			inodedep = WK_INODEDEP(wk);
9698			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9699				initiate_write_inodeblock_ufs1(inodedep, bp);
9700			else
9701				initiate_write_inodeblock_ufs2(inodedep, bp);
9702			continue;
9703
9704		case D_INDIRDEP:
9705			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9706			continue;
9707
9708		case D_BMSAFEMAP:
9709			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9710			continue;
9711
9712		case D_JSEG:
9713			WK_JSEG(wk)->js_buf = NULL;
9714			continue;
9715
9716		case D_FREEBLKS:
9717			freeblks = WK_FREEBLKS(wk);
9718			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9719			/*
9720			 * We have to wait for the freeblks to be journaled
9721			 * before we can write an inodeblock with updated
9722			 * pointers.  Be careful to arrange the marker so
9723			 * we revisit the freeblks if it's not removed by
9724			 * the first jwait().
9725			 */
9726			if (jblkdep != NULL) {
9727				LIST_REMOVE(&marker, wk_list);
9728				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9729				jwait(&jblkdep->jb_list, MNT_WAIT);
9730			}
9731			continue;
9732		case D_ALLOCDIRECT:
9733		case D_ALLOCINDIR:
9734			/*
9735			 * We have to wait for the jnewblk to be journaled
9736			 * before we can write to a block if the contents
9737			 * may be confused with an earlier file's indirect
9738			 * at recovery time.  Handle the marker as described
9739			 * above.
9740			 */
9741			newblk = WK_NEWBLK(wk);
9742			if (newblk->nb_jnewblk != NULL &&
9743			    indirblk_lookup(newblk->nb_list.wk_mp,
9744			    newblk->nb_newblkno)) {
9745				LIST_REMOVE(&marker, wk_list);
9746				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9747				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
9748			}
9749			continue;
9750
9751		case D_SBDEP:
9752			initiate_write_sbdep(WK_SBDEP(wk));
9753			continue;
9754
9755		case D_MKDIR:
9756		case D_FREEWORK:
9757		case D_FREEDEP:
9758		case D_JSEGDEP:
9759			continue;
9760
9761		default:
9762			panic("handle_disk_io_initiation: Unexpected type %s",
9763			    TYPENAME(wk->wk_type));
9764			/* NOTREACHED */
9765		}
9766	}
9767	FREE_LOCK(&lk);
9768	PRELE(curproc);			/* Allow swapout of kernel stack */
9769}
9770
9771/*
9772 * Called from within the procedure above to deal with unsatisfied
9773 * allocation dependencies in a directory. The buffer must be locked,
9774 * thus, no I/O completion operations can occur while we are
9775 * manipulating its associated dependencies.
9776 */
9777static void
9778initiate_write_filepage(pagedep, bp)
9779	struct pagedep *pagedep;
9780	struct buf *bp;
9781{
9782	struct jremref *jremref;
9783	struct jmvref *jmvref;
9784	struct dirrem *dirrem;
9785	struct diradd *dap;
9786	struct direct *ep;
9787	int i;
9788
9789	if (pagedep->pd_state & IOSTARTED) {
9790		/*
9791		 * This can only happen if there is a driver that does not
9792		 * understand chaining. Here biodone will reissue the call
9793		 * to strategy for the incomplete buffers.
9794		 */
9795		printf("initiate_write_filepage: already started\n");
9796		return;
9797	}
9798	pagedep->pd_state |= IOSTARTED;
9799	/*
9800	 * Wait for all journal remove dependencies to hit the disk.
9801	 * We can not allow any potentially conflicting directory adds
9802	 * to be visible before removes and rollback is too difficult.
9803	 * lk may be dropped and re-acquired, however we hold the buf
9804	 * locked so the dependency can not go away.
9805	 */
9806	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
9807		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
9808			jwait(&jremref->jr_list, MNT_WAIT);
9809	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
9810		jwait(&jmvref->jm_list, MNT_WAIT);
9811	for (i = 0; i < DAHASHSZ; i++) {
9812		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
9813			ep = (struct direct *)
9814			    ((char *)bp->b_data + dap->da_offset);
9815			if (ep->d_ino != dap->da_newinum)
9816				panic("%s: dir inum %ju != new %ju",
9817				    "initiate_write_filepage",
9818				    (uintmax_t)ep->d_ino,
9819				    (uintmax_t)dap->da_newinum);
9820			if (dap->da_state & DIRCHG)
9821				ep->d_ino = dap->da_previous->dm_oldinum;
9822			else
9823				ep->d_ino = 0;
9824			dap->da_state &= ~ATTACHED;
9825			dap->da_state |= UNDONE;
9826		}
9827	}
9828}
9829
9830/*
9831 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
9832 * Note that any bug fixes made to this routine must be done in the
9833 * version found below.
9834 *
9835 * Called from within the procedure above to deal with unsatisfied
9836 * allocation dependencies in an inodeblock. The buffer must be
9837 * locked, thus, no I/O completion operations can occur while we
9838 * are manipulating its associated dependencies.
9839 */
9840static void
9841initiate_write_inodeblock_ufs1(inodedep, bp)
9842	struct inodedep *inodedep;
9843	struct buf *bp;			/* The inode block */
9844{
9845	struct allocdirect *adp, *lastadp;
9846	struct ufs1_dinode *dp;
9847	struct ufs1_dinode *sip;
9848	struct inoref *inoref;
9849	struct fs *fs;
9850	ufs_lbn_t i;
9851#ifdef INVARIANTS
9852	ufs_lbn_t prevlbn = 0;
9853#endif
9854	int deplist;
9855
9856	if (inodedep->id_state & IOSTARTED)
9857		panic("initiate_write_inodeblock_ufs1: already started");
9858	inodedep->id_state |= IOSTARTED;
9859	fs = inodedep->id_fs;
9860	dp = (struct ufs1_dinode *)bp->b_data +
9861	    ino_to_fsbo(fs, inodedep->id_ino);
9862
9863	/*
9864	 * If we're on the unlinked list but have not yet written our
9865	 * next pointer initialize it here.
9866	 */
9867	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9868		struct inodedep *inon;
9869
9870		inon = TAILQ_NEXT(inodedep, id_unlinked);
9871		dp->di_freelink = inon ? inon->id_ino : 0;
9872	}
9873	/*
9874	 * If the bitmap is not yet written, then the allocated
9875	 * inode cannot be written to disk.
9876	 */
9877	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
9878		if (inodedep->id_savedino1 != NULL)
9879			panic("initiate_write_inodeblock_ufs1: I/O underway");
9880		FREE_LOCK(&lk);
9881		sip = malloc(sizeof(struct ufs1_dinode),
9882		    M_SAVEDINO, M_SOFTDEP_FLAGS);
9883		ACQUIRE_LOCK(&lk);
9884		inodedep->id_savedino1 = sip;
9885		*inodedep->id_savedino1 = *dp;
9886		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
9887		dp->di_gen = inodedep->id_savedino1->di_gen;
9888		dp->di_freelink = inodedep->id_savedino1->di_freelink;
9889		return;
9890	}
9891	/*
9892	 * If no dependencies, then there is nothing to roll back.
9893	 */
9894	inodedep->id_savedsize = dp->di_size;
9895	inodedep->id_savedextsize = 0;
9896	inodedep->id_savednlink = dp->di_nlink;
9897	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
9898	    TAILQ_EMPTY(&inodedep->id_inoreflst))
9899		return;
9900	/*
9901	 * Revert the link count to that of the first unwritten journal entry.
9902	 */
9903	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
9904	if (inoref)
9905		dp->di_nlink = inoref->if_nlink;
9906	/*
9907	 * Set the dependencies to busy.
9908	 */
9909	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9910	     adp = TAILQ_NEXT(adp, ad_next)) {
9911#ifdef INVARIANTS
9912		if (deplist != 0 && prevlbn >= adp->ad_offset)
9913			panic("softdep_write_inodeblock: lbn order");
9914		prevlbn = adp->ad_offset;
9915		if (adp->ad_offset < NDADDR &&
9916		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
9917			panic("%s: direct pointer #%jd mismatch %d != %jd",
9918			    "softdep_write_inodeblock",
9919			    (intmax_t)adp->ad_offset,
9920			    dp->di_db[adp->ad_offset],
9921			    (intmax_t)adp->ad_newblkno);
9922		if (adp->ad_offset >= NDADDR &&
9923		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
9924			panic("%s: indirect pointer #%jd mismatch %d != %jd",
9925			    "softdep_write_inodeblock",
9926			    (intmax_t)adp->ad_offset - NDADDR,
9927			    dp->di_ib[adp->ad_offset - NDADDR],
9928			    (intmax_t)adp->ad_newblkno);
9929		deplist |= 1 << adp->ad_offset;
9930		if ((adp->ad_state & ATTACHED) == 0)
9931			panic("softdep_write_inodeblock: Unknown state 0x%x",
9932			    adp->ad_state);
9933#endif /* INVARIANTS */
9934		adp->ad_state &= ~ATTACHED;
9935		adp->ad_state |= UNDONE;
9936	}
9937	/*
9938	 * The on-disk inode cannot claim to be any larger than the last
9939	 * fragment that has been written. Otherwise, the on-disk inode
9940	 * might have fragments that were not the last block in the file
9941	 * which would corrupt the filesystem.
9942	 */
9943	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
9944	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
9945		if (adp->ad_offset >= NDADDR)
9946			break;
9947		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
9948		/* keep going until hitting a rollback to a frag */
9949		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
9950			continue;
9951		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
9952		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
9953#ifdef INVARIANTS
9954			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
9955				panic("softdep_write_inodeblock: lost dep1");
9956#endif /* INVARIANTS */
9957			dp->di_db[i] = 0;
9958		}
9959		for (i = 0; i < NIADDR; i++) {
9960#ifdef INVARIANTS
9961			if (dp->di_ib[i] != 0 &&
9962			    (deplist & ((1 << NDADDR) << i)) == 0)
9963				panic("softdep_write_inodeblock: lost dep2");
9964#endif /* INVARIANTS */
9965			dp->di_ib[i] = 0;
9966		}
9967		return;
9968	}
9969	/*
9970	 * If we have zero'ed out the last allocated block of the file,
9971	 * roll back the size to the last currently allocated block.
9972	 * We know that this last allocated block is a full-sized as
9973	 * we already checked for fragments in the loop above.
9974	 */
9975	if (lastadp != NULL &&
9976	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
9977		for (i = lastadp->ad_offset; i >= 0; i--)
9978			if (dp->di_db[i] != 0)
9979				break;
9980		dp->di_size = (i + 1) * fs->fs_bsize;
9981	}
9982	/*
9983	 * The only dependencies are for indirect blocks.
9984	 *
9985	 * The file size for indirect block additions is not guaranteed.
9986	 * Such a guarantee would be non-trivial to achieve. The conventional
9987	 * synchronous write implementation also does not make this guarantee.
9988	 * Fsck should catch and fix discrepancies. Arguably, the file size
9989	 * can be over-estimated without destroying integrity when the file
9990	 * moves into the indirect blocks (i.e., is large). If we want to
9991	 * postpone fsck, we are stuck with this argument.
9992	 */
9993	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
9994		dp->di_ib[adp->ad_offset - NDADDR] = 0;
9995}
9996
9997/*
9998 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
9999 * Note that any bug fixes made to this routine must be done in the
10000 * version found above.
10001 *
10002 * Called from within the procedure above to deal with unsatisfied
10003 * allocation dependencies in an inodeblock. The buffer must be
10004 * locked, thus, no I/O completion operations can occur while we
10005 * are manipulating its associated dependencies.
10006 */
10007static void
10008initiate_write_inodeblock_ufs2(inodedep, bp)
10009	struct inodedep *inodedep;
10010	struct buf *bp;			/* The inode block */
10011{
10012	struct allocdirect *adp, *lastadp;
10013	struct ufs2_dinode *dp;
10014	struct ufs2_dinode *sip;
10015	struct inoref *inoref;
10016	struct fs *fs;
10017	ufs_lbn_t i;
10018#ifdef INVARIANTS
10019	ufs_lbn_t prevlbn = 0;
10020#endif
10021	int deplist;
10022
10023	if (inodedep->id_state & IOSTARTED)
10024		panic("initiate_write_inodeblock_ufs2: already started");
10025	inodedep->id_state |= IOSTARTED;
10026	fs = inodedep->id_fs;
10027	dp = (struct ufs2_dinode *)bp->b_data +
10028	    ino_to_fsbo(fs, inodedep->id_ino);
10029
10030	/*
10031	 * If we're on the unlinked list but have not yet written our
10032	 * next pointer initialize it here.
10033	 */
10034	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10035		struct inodedep *inon;
10036
10037		inon = TAILQ_NEXT(inodedep, id_unlinked);
10038		dp->di_freelink = inon ? inon->id_ino : 0;
10039	}
10040	/*
10041	 * If the bitmap is not yet written, then the allocated
10042	 * inode cannot be written to disk.
10043	 */
10044	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10045		if (inodedep->id_savedino2 != NULL)
10046			panic("initiate_write_inodeblock_ufs2: I/O underway");
10047		FREE_LOCK(&lk);
10048		sip = malloc(sizeof(struct ufs2_dinode),
10049		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10050		ACQUIRE_LOCK(&lk);
10051		inodedep->id_savedino2 = sip;
10052		*inodedep->id_savedino2 = *dp;
10053		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10054		dp->di_gen = inodedep->id_savedino2->di_gen;
10055		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10056		return;
10057	}
10058	/*
10059	 * If no dependencies, then there is nothing to roll back.
10060	 */
10061	inodedep->id_savedsize = dp->di_size;
10062	inodedep->id_savedextsize = dp->di_extsize;
10063	inodedep->id_savednlink = dp->di_nlink;
10064	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10065	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10066	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10067		return;
10068	/*
10069	 * Revert the link count to that of the first unwritten journal entry.
10070	 */
10071	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10072	if (inoref)
10073		dp->di_nlink = inoref->if_nlink;
10074
10075	/*
10076	 * Set the ext data dependencies to busy.
10077	 */
10078	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10079	     adp = TAILQ_NEXT(adp, ad_next)) {
10080#ifdef INVARIANTS
10081		if (deplist != 0 && prevlbn >= adp->ad_offset)
10082			panic("softdep_write_inodeblock: lbn order");
10083		prevlbn = adp->ad_offset;
10084		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10085			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10086			    "softdep_write_inodeblock",
10087			    (intmax_t)adp->ad_offset,
10088			    (intmax_t)dp->di_extb[adp->ad_offset],
10089			    (intmax_t)adp->ad_newblkno);
10090		deplist |= 1 << adp->ad_offset;
10091		if ((adp->ad_state & ATTACHED) == 0)
10092			panic("softdep_write_inodeblock: Unknown state 0x%x",
10093			    adp->ad_state);
10094#endif /* INVARIANTS */
10095		adp->ad_state &= ~ATTACHED;
10096		adp->ad_state |= UNDONE;
10097	}
10098	/*
10099	 * The on-disk inode cannot claim to be any larger than the last
10100	 * fragment that has been written. Otherwise, the on-disk inode
10101	 * might have fragments that were not the last block in the ext
10102	 * data which would corrupt the filesystem.
10103	 */
10104	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10105	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10106		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10107		/* keep going until hitting a rollback to a frag */
10108		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10109			continue;
10110		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10111		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10112#ifdef INVARIANTS
10113			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10114				panic("softdep_write_inodeblock: lost dep1");
10115#endif /* INVARIANTS */
10116			dp->di_extb[i] = 0;
10117		}
10118		lastadp = NULL;
10119		break;
10120	}
10121	/*
10122	 * If we have zero'ed out the last allocated block of the ext
10123	 * data, roll back the size to the last currently allocated block.
10124	 * We know that this last allocated block is a full-sized as
10125	 * we already checked for fragments in the loop above.
10126	 */
10127	if (lastadp != NULL &&
10128	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10129		for (i = lastadp->ad_offset; i >= 0; i--)
10130			if (dp->di_extb[i] != 0)
10131				break;
10132		dp->di_extsize = (i + 1) * fs->fs_bsize;
10133	}
10134	/*
10135	 * Set the file data dependencies to busy.
10136	 */
10137	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10138	     adp = TAILQ_NEXT(adp, ad_next)) {
10139#ifdef INVARIANTS
10140		if (deplist != 0 && prevlbn >= adp->ad_offset)
10141			panic("softdep_write_inodeblock: lbn order");
10142		if ((adp->ad_state & ATTACHED) == 0)
10143			panic("inodedep %p and adp %p not attached", inodedep, adp);
10144		prevlbn = adp->ad_offset;
10145		if (adp->ad_offset < NDADDR &&
10146		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10147			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10148			    "softdep_write_inodeblock",
10149			    (intmax_t)adp->ad_offset,
10150			    (intmax_t)dp->di_db[adp->ad_offset],
10151			    (intmax_t)adp->ad_newblkno);
10152		if (adp->ad_offset >= NDADDR &&
10153		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10154			panic("%s indirect pointer #%jd mismatch %jd != %jd",
10155			    "softdep_write_inodeblock:",
10156			    (intmax_t)adp->ad_offset - NDADDR,
10157			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10158			    (intmax_t)adp->ad_newblkno);
10159		deplist |= 1 << adp->ad_offset;
10160		if ((adp->ad_state & ATTACHED) == 0)
10161			panic("softdep_write_inodeblock: Unknown state 0x%x",
10162			    adp->ad_state);
10163#endif /* INVARIANTS */
10164		adp->ad_state &= ~ATTACHED;
10165		adp->ad_state |= UNDONE;
10166	}
10167	/*
10168	 * The on-disk inode cannot claim to be any larger than the last
10169	 * fragment that has been written. Otherwise, the on-disk inode
10170	 * might have fragments that were not the last block in the file
10171	 * which would corrupt the filesystem.
10172	 */
10173	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10174	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10175		if (adp->ad_offset >= NDADDR)
10176			break;
10177		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10178		/* keep going until hitting a rollback to a frag */
10179		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10180			continue;
10181		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10182		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10183#ifdef INVARIANTS
10184			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10185				panic("softdep_write_inodeblock: lost dep2");
10186#endif /* INVARIANTS */
10187			dp->di_db[i] = 0;
10188		}
10189		for (i = 0; i < NIADDR; i++) {
10190#ifdef INVARIANTS
10191			if (dp->di_ib[i] != 0 &&
10192			    (deplist & ((1 << NDADDR) << i)) == 0)
10193				panic("softdep_write_inodeblock: lost dep3");
10194#endif /* INVARIANTS */
10195			dp->di_ib[i] = 0;
10196		}
10197		return;
10198	}
10199	/*
10200	 * If we have zero'ed out the last allocated block of the file,
10201	 * roll back the size to the last currently allocated block.
10202	 * We know that this last allocated block is a full-sized as
10203	 * we already checked for fragments in the loop above.
10204	 */
10205	if (lastadp != NULL &&
10206	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10207		for (i = lastadp->ad_offset; i >= 0; i--)
10208			if (dp->di_db[i] != 0)
10209				break;
10210		dp->di_size = (i + 1) * fs->fs_bsize;
10211	}
10212	/*
10213	 * The only dependencies are for indirect blocks.
10214	 *
10215	 * The file size for indirect block additions is not guaranteed.
10216	 * Such a guarantee would be non-trivial to achieve. The conventional
10217	 * synchronous write implementation also does not make this guarantee.
10218	 * Fsck should catch and fix discrepancies. Arguably, the file size
10219	 * can be over-estimated without destroying integrity when the file
10220	 * moves into the indirect blocks (i.e., is large). If we want to
10221	 * postpone fsck, we are stuck with this argument.
10222	 */
10223	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10224		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10225}
10226
10227/*
10228 * Cancel an indirdep as a result of truncation.  Release all of the
10229 * children allocindirs and place their journal work on the appropriate
10230 * list.
10231 */
10232static void
10233cancel_indirdep(indirdep, bp, freeblks)
10234	struct indirdep *indirdep;
10235	struct buf *bp;
10236	struct freeblks *freeblks;
10237{
10238	struct allocindir *aip;
10239
10240	/*
10241	 * None of the indirect pointers will ever be visible,
10242	 * so they can simply be tossed. GOINGAWAY ensures
10243	 * that allocated pointers will be saved in the buffer
10244	 * cache until they are freed. Note that they will
10245	 * only be able to be found by their physical address
10246	 * since the inode mapping the logical address will
10247	 * be gone. The save buffer used for the safe copy
10248	 * was allocated in setup_allocindir_phase2 using
10249	 * the physical address so it could be used for this
10250	 * purpose. Hence we swap the safe copy with the real
10251	 * copy, allowing the safe copy to be freed and holding
10252	 * on to the real copy for later use in indir_trunc.
10253	 */
10254	if (indirdep->ir_state & GOINGAWAY)
10255		panic("cancel_indirdep: already gone");
10256	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10257		indirdep->ir_state |= DEPCOMPLETE;
10258		LIST_REMOVE(indirdep, ir_next);
10259	}
10260	indirdep->ir_state |= GOINGAWAY;
10261	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
10262	/*
10263	 * Pass in bp for blocks still have journal writes
10264	 * pending so we can cancel them on their own.
10265	 */
10266	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10267		cancel_allocindir(aip, bp, freeblks, 0);
10268	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10269		cancel_allocindir(aip, NULL, freeblks, 0);
10270	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10271		cancel_allocindir(aip, NULL, freeblks, 0);
10272	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10273		cancel_allocindir(aip, NULL, freeblks, 0);
10274	/*
10275	 * If there are pending partial truncations we need to keep the
10276	 * old block copy around until they complete.  This is because
10277	 * the current b_data is not a perfect superset of the available
10278	 * blocks.
10279	 */
10280	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10281		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10282	else
10283		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10284	WORKLIST_REMOVE(&indirdep->ir_list);
10285	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10286	indirdep->ir_bp = NULL;
10287	indirdep->ir_freeblks = freeblks;
10288}
10289
10290/*
10291 * Free an indirdep once it no longer has new pointers to track.
10292 */
10293static void
10294free_indirdep(indirdep)
10295	struct indirdep *indirdep;
10296{
10297
10298	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10299	    ("free_indirdep: Indir trunc list not empty."));
10300	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10301	    ("free_indirdep: Complete head not empty."));
10302	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10303	    ("free_indirdep: write head not empty."));
10304	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10305	    ("free_indirdep: done head not empty."));
10306	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10307	    ("free_indirdep: deplist head not empty."));
10308	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10309	    ("free_indirdep: %p still on newblk list.", indirdep));
10310	KASSERT(indirdep->ir_saveddata == NULL,
10311	    ("free_indirdep: %p still has saved data.", indirdep));
10312	if (indirdep->ir_state & ONWORKLIST)
10313		WORKLIST_REMOVE(&indirdep->ir_list);
10314	WORKITEM_FREE(indirdep, D_INDIRDEP);
10315}
10316
10317/*
10318 * Called before a write to an indirdep.  This routine is responsible for
10319 * rolling back pointers to a safe state which includes only those
10320 * allocindirs which have been completed.
10321 */
10322static void
10323initiate_write_indirdep(indirdep, bp)
10324	struct indirdep *indirdep;
10325	struct buf *bp;
10326{
10327
10328	indirdep->ir_state |= IOSTARTED;
10329	if (indirdep->ir_state & GOINGAWAY)
10330		panic("disk_io_initiation: indirdep gone");
10331	/*
10332	 * If there are no remaining dependencies, this will be writing
10333	 * the real pointers.
10334	 */
10335	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10336	    TAILQ_EMPTY(&indirdep->ir_trunc))
10337		return;
10338	/*
10339	 * Replace up-to-date version with safe version.
10340	 */
10341	if (indirdep->ir_saveddata == NULL) {
10342		FREE_LOCK(&lk);
10343		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10344		    M_SOFTDEP_FLAGS);
10345		ACQUIRE_LOCK(&lk);
10346	}
10347	indirdep->ir_state &= ~ATTACHED;
10348	indirdep->ir_state |= UNDONE;
10349	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10350	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10351	    bp->b_bcount);
10352}
10353
10354/*
10355 * Called when an inode has been cleared in a cg bitmap.  This finally
10356 * eliminates any canceled jaddrefs
10357 */
10358void
10359softdep_setup_inofree(mp, bp, ino, wkhd)
10360	struct mount *mp;
10361	struct buf *bp;
10362	ino_t ino;
10363	struct workhead *wkhd;
10364{
10365	struct worklist *wk, *wkn;
10366	struct inodedep *inodedep;
10367	uint8_t *inosused;
10368	struct cg *cgp;
10369	struct fs *fs;
10370
10371	ACQUIRE_LOCK(&lk);
10372	fs = VFSTOUFS(mp)->um_fs;
10373	cgp = (struct cg *)bp->b_data;
10374	inosused = cg_inosused(cgp);
10375	if (isset(inosused, ino % fs->fs_ipg))
10376		panic("softdep_setup_inofree: inode %ju not freed.",
10377		    (uintmax_t)ino);
10378	if (inodedep_lookup(mp, ino, 0, &inodedep))
10379		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10380		    (uintmax_t)ino, inodedep);
10381	if (wkhd) {
10382		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10383			if (wk->wk_type != D_JADDREF)
10384				continue;
10385			WORKLIST_REMOVE(wk);
10386			/*
10387			 * We can free immediately even if the jaddref
10388			 * isn't attached in a background write as now
10389			 * the bitmaps are reconciled.
10390		 	 */
10391			wk->wk_state |= COMPLETE | ATTACHED;
10392			free_jaddref(WK_JADDREF(wk));
10393		}
10394		jwork_move(&bp->b_dep, wkhd);
10395	}
10396	FREE_LOCK(&lk);
10397}
10398
10399
10400/*
10401 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10402 * map.  Any dependencies waiting for the write to clear are added to the
10403 * buf's list and any jnewblks that are being canceled are discarded
10404 * immediately.
10405 */
10406void
10407softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10408	struct mount *mp;
10409	struct buf *bp;
10410	ufs2_daddr_t blkno;
10411	int frags;
10412	struct workhead *wkhd;
10413{
10414	struct bmsafemap *bmsafemap;
10415	struct jnewblk *jnewblk;
10416	struct worklist *wk;
10417	struct fs *fs;
10418#ifdef SUJ_DEBUG
10419	uint8_t *blksfree;
10420	struct cg *cgp;
10421	ufs2_daddr_t jstart;
10422	ufs2_daddr_t jend;
10423	ufs2_daddr_t end;
10424	long bno;
10425	int i;
10426#endif
10427
10428	CTR3(KTR_SUJ,
10429	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10430	    blkno, frags, wkhd);
10431
10432	ACQUIRE_LOCK(&lk);
10433	/* Lookup the bmsafemap so we track when it is dirty. */
10434	fs = VFSTOUFS(mp)->um_fs;
10435	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10436	/*
10437	 * Detach any jnewblks which have been canceled.  They must linger
10438	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10439	 * an unjournaled allocation from hitting the disk.
10440	 */
10441	if (wkhd) {
10442		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10443			CTR2(KTR_SUJ,
10444			    "softdep_setup_blkfree: blkno %jd wk type %d",
10445			    blkno, wk->wk_type);
10446			WORKLIST_REMOVE(wk);
10447			if (wk->wk_type != D_JNEWBLK) {
10448				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10449				continue;
10450			}
10451			jnewblk = WK_JNEWBLK(wk);
10452			KASSERT(jnewblk->jn_state & GOINGAWAY,
10453			    ("softdep_setup_blkfree: jnewblk not canceled."));
10454#ifdef SUJ_DEBUG
10455			/*
10456			 * Assert that this block is free in the bitmap
10457			 * before we discard the jnewblk.
10458			 */
10459			cgp = (struct cg *)bp->b_data;
10460			blksfree = cg_blksfree(cgp);
10461			bno = dtogd(fs, jnewblk->jn_blkno);
10462			for (i = jnewblk->jn_oldfrags;
10463			    i < jnewblk->jn_frags; i++) {
10464				if (isset(blksfree, bno + i))
10465					continue;
10466				panic("softdep_setup_blkfree: not free");
10467			}
10468#endif
10469			/*
10470			 * Even if it's not attached we can free immediately
10471			 * as the new bitmap is correct.
10472			 */
10473			wk->wk_state |= COMPLETE | ATTACHED;
10474			free_jnewblk(jnewblk);
10475		}
10476	}
10477
10478#ifdef SUJ_DEBUG
10479	/*
10480	 * Assert that we are not freeing a block which has an outstanding
10481	 * allocation dependency.
10482	 */
10483	fs = VFSTOUFS(mp)->um_fs;
10484	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10485	end = blkno + frags;
10486	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10487		/*
10488		 * Don't match against blocks that will be freed when the
10489		 * background write is done.
10490		 */
10491		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10492		    (COMPLETE | DEPCOMPLETE))
10493			continue;
10494		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10495		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10496		if ((blkno >= jstart && blkno < jend) ||
10497		    (end > jstart && end <= jend)) {
10498			printf("state 0x%X %jd - %d %d dep %p\n",
10499			    jnewblk->jn_state, jnewblk->jn_blkno,
10500			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10501			    jnewblk->jn_dep);
10502			panic("softdep_setup_blkfree: "
10503			    "%jd-%jd(%d) overlaps with %jd-%jd",
10504			    blkno, end, frags, jstart, jend);
10505		}
10506	}
10507#endif
10508	FREE_LOCK(&lk);
10509}
10510
10511/*
10512 * Revert a block allocation when the journal record that describes it
10513 * is not yet written.
10514 */
10515int
10516jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10517	struct jnewblk *jnewblk;
10518	struct fs *fs;
10519	struct cg *cgp;
10520	uint8_t *blksfree;
10521{
10522	ufs1_daddr_t fragno;
10523	long cgbno, bbase;
10524	int frags, blk;
10525	int i;
10526
10527	frags = 0;
10528	cgbno = dtogd(fs, jnewblk->jn_blkno);
10529	/*
10530	 * We have to test which frags need to be rolled back.  We may
10531	 * be operating on a stale copy when doing background writes.
10532	 */
10533	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10534		if (isclr(blksfree, cgbno + i))
10535			frags++;
10536	if (frags == 0)
10537		return (0);
10538	/*
10539	 * This is mostly ffs_blkfree() sans some validation and
10540	 * superblock updates.
10541	 */
10542	if (frags == fs->fs_frag) {
10543		fragno = fragstoblks(fs, cgbno);
10544		ffs_setblock(fs, blksfree, fragno);
10545		ffs_clusteracct(fs, cgp, fragno, 1);
10546		cgp->cg_cs.cs_nbfree++;
10547	} else {
10548		cgbno += jnewblk->jn_oldfrags;
10549		bbase = cgbno - fragnum(fs, cgbno);
10550		/* Decrement the old frags.  */
10551		blk = blkmap(fs, blksfree, bbase);
10552		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10553		/* Deallocate the fragment */
10554		for (i = 0; i < frags; i++)
10555			setbit(blksfree, cgbno + i);
10556		cgp->cg_cs.cs_nffree += frags;
10557		/* Add back in counts associated with the new frags */
10558		blk = blkmap(fs, blksfree, bbase);
10559		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10560                /* If a complete block has been reassembled, account for it. */
10561		fragno = fragstoblks(fs, bbase);
10562		if (ffs_isblock(fs, blksfree, fragno)) {
10563			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10564			ffs_clusteracct(fs, cgp, fragno, 1);
10565			cgp->cg_cs.cs_nbfree++;
10566		}
10567	}
10568	stat_jnewblk++;
10569	jnewblk->jn_state &= ~ATTACHED;
10570	jnewblk->jn_state |= UNDONE;
10571
10572	return (frags);
10573}
10574
10575static void
10576initiate_write_bmsafemap(bmsafemap, bp)
10577	struct bmsafemap *bmsafemap;
10578	struct buf *bp;			/* The cg block. */
10579{
10580	struct jaddref *jaddref;
10581	struct jnewblk *jnewblk;
10582	uint8_t *inosused;
10583	uint8_t *blksfree;
10584	struct cg *cgp;
10585	struct fs *fs;
10586	ino_t ino;
10587
10588	if (bmsafemap->sm_state & IOSTARTED)
10589		return;
10590	bmsafemap->sm_state |= IOSTARTED;
10591	/*
10592	 * Clear any inode allocations which are pending journal writes.
10593	 */
10594	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10595		cgp = (struct cg *)bp->b_data;
10596		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10597		inosused = cg_inosused(cgp);
10598		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10599			ino = jaddref->ja_ino % fs->fs_ipg;
10600			if (isset(inosused, ino)) {
10601				if ((jaddref->ja_mode & IFMT) == IFDIR)
10602					cgp->cg_cs.cs_ndir--;
10603				cgp->cg_cs.cs_nifree++;
10604				clrbit(inosused, ino);
10605				jaddref->ja_state &= ~ATTACHED;
10606				jaddref->ja_state |= UNDONE;
10607				stat_jaddref++;
10608			} else
10609				panic("initiate_write_bmsafemap: inode %ju "
10610				    "marked free", (uintmax_t)jaddref->ja_ino);
10611		}
10612	}
10613	/*
10614	 * Clear any block allocations which are pending journal writes.
10615	 */
10616	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10617		cgp = (struct cg *)bp->b_data;
10618		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10619		blksfree = cg_blksfree(cgp);
10620		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10621			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10622				continue;
10623			panic("initiate_write_bmsafemap: block %jd "
10624			    "marked free", jnewblk->jn_blkno);
10625		}
10626	}
10627	/*
10628	 * Move allocation lists to the written lists so they can be
10629	 * cleared once the block write is complete.
10630	 */
10631	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10632	    inodedep, id_deps);
10633	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10634	    newblk, nb_deps);
10635	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10636	    wk_list);
10637}
10638
10639/*
10640 * This routine is called during the completion interrupt
10641 * service routine for a disk write (from the procedure called
10642 * by the device driver to inform the filesystem caches of
10643 * a request completion).  It should be called early in this
10644 * procedure, before the block is made available to other
10645 * processes or other routines are called.
10646 *
10647 */
10648static void
10649softdep_disk_write_complete(bp)
10650	struct buf *bp;		/* describes the completed disk write */
10651{
10652	struct worklist *wk;
10653	struct worklist *owk;
10654	struct workhead reattach;
10655	struct freeblks *freeblks;
10656	struct buf *sbp;
10657
10658	/*
10659	 * If an error occurred while doing the write, then the data
10660	 * has not hit the disk and the dependencies cannot be unrolled.
10661	 */
10662	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10663		return;
10664	LIST_INIT(&reattach);
10665	/*
10666	 * This lock must not be released anywhere in this code segment.
10667	 */
10668	sbp = NULL;
10669	owk = NULL;
10670	ACQUIRE_LOCK(&lk);
10671	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10672		WORKLIST_REMOVE(wk);
10673		dep_write[wk->wk_type]++;
10674		if (wk == owk)
10675			panic("duplicate worklist: %p\n", wk);
10676		owk = wk;
10677		switch (wk->wk_type) {
10678
10679		case D_PAGEDEP:
10680			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10681				WORKLIST_INSERT(&reattach, wk);
10682			continue;
10683
10684		case D_INODEDEP:
10685			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10686				WORKLIST_INSERT(&reattach, wk);
10687			continue;
10688
10689		case D_BMSAFEMAP:
10690			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10691				WORKLIST_INSERT(&reattach, wk);
10692			continue;
10693
10694		case D_MKDIR:
10695			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10696			continue;
10697
10698		case D_ALLOCDIRECT:
10699			wk->wk_state |= COMPLETE;
10700			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10701			continue;
10702
10703		case D_ALLOCINDIR:
10704			wk->wk_state |= COMPLETE;
10705			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
10706			continue;
10707
10708		case D_INDIRDEP:
10709			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
10710				WORKLIST_INSERT(&reattach, wk);
10711			continue;
10712
10713		case D_FREEBLKS:
10714			wk->wk_state |= COMPLETE;
10715			freeblks = WK_FREEBLKS(wk);
10716			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
10717			    LIST_EMPTY(&freeblks->fb_jblkdephd))
10718				add_to_worklist(wk, WK_NODELAY);
10719			continue;
10720
10721		case D_FREEWORK:
10722			handle_written_freework(WK_FREEWORK(wk));
10723			break;
10724
10725		case D_JSEGDEP:
10726			free_jsegdep(WK_JSEGDEP(wk));
10727			continue;
10728
10729		case D_JSEG:
10730			handle_written_jseg(WK_JSEG(wk), bp);
10731			continue;
10732
10733		case D_SBDEP:
10734			if (handle_written_sbdep(WK_SBDEP(wk), bp))
10735				WORKLIST_INSERT(&reattach, wk);
10736			continue;
10737
10738		case D_FREEDEP:
10739			free_freedep(WK_FREEDEP(wk));
10740			continue;
10741
10742		default:
10743			panic("handle_disk_write_complete: Unknown type %s",
10744			    TYPENAME(wk->wk_type));
10745			/* NOTREACHED */
10746		}
10747	}
10748	/*
10749	 * Reattach any requests that must be redone.
10750	 */
10751	while ((wk = LIST_FIRST(&reattach)) != NULL) {
10752		WORKLIST_REMOVE(wk);
10753		WORKLIST_INSERT(&bp->b_dep, wk);
10754	}
10755	FREE_LOCK(&lk);
10756	if (sbp)
10757		brelse(sbp);
10758}
10759
10760/*
10761 * Called from within softdep_disk_write_complete above. Note that
10762 * this routine is always called from interrupt level with further
10763 * splbio interrupts blocked.
10764 */
10765static void
10766handle_allocdirect_partdone(adp, wkhd)
10767	struct allocdirect *adp;	/* the completed allocdirect */
10768	struct workhead *wkhd;		/* Work to do when inode is writtne. */
10769{
10770	struct allocdirectlst *listhead;
10771	struct allocdirect *listadp;
10772	struct inodedep *inodedep;
10773	long bsize;
10774
10775	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10776		return;
10777	/*
10778	 * The on-disk inode cannot claim to be any larger than the last
10779	 * fragment that has been written. Otherwise, the on-disk inode
10780	 * might have fragments that were not the last block in the file
10781	 * which would corrupt the filesystem. Thus, we cannot free any
10782	 * allocdirects after one whose ad_oldblkno claims a fragment as
10783	 * these blocks must be rolled back to zero before writing the inode.
10784	 * We check the currently active set of allocdirects in id_inoupdt
10785	 * or id_extupdt as appropriate.
10786	 */
10787	inodedep = adp->ad_inodedep;
10788	bsize = inodedep->id_fs->fs_bsize;
10789	if (adp->ad_state & EXTDATA)
10790		listhead = &inodedep->id_extupdt;
10791	else
10792		listhead = &inodedep->id_inoupdt;
10793	TAILQ_FOREACH(listadp, listhead, ad_next) {
10794		/* found our block */
10795		if (listadp == adp)
10796			break;
10797		/* continue if ad_oldlbn is not a fragment */
10798		if (listadp->ad_oldsize == 0 ||
10799		    listadp->ad_oldsize == bsize)
10800			continue;
10801		/* hit a fragment */
10802		return;
10803	}
10804	/*
10805	 * If we have reached the end of the current list without
10806	 * finding the just finished dependency, then it must be
10807	 * on the future dependency list. Future dependencies cannot
10808	 * be freed until they are moved to the current list.
10809	 */
10810	if (listadp == NULL) {
10811#ifdef DEBUG
10812		if (adp->ad_state & EXTDATA)
10813			listhead = &inodedep->id_newextupdt;
10814		else
10815			listhead = &inodedep->id_newinoupdt;
10816		TAILQ_FOREACH(listadp, listhead, ad_next)
10817			/* found our block */
10818			if (listadp == adp)
10819				break;
10820		if (listadp == NULL)
10821			panic("handle_allocdirect_partdone: lost dep");
10822#endif /* DEBUG */
10823		return;
10824	}
10825	/*
10826	 * If we have found the just finished dependency, then queue
10827	 * it along with anything that follows it that is complete.
10828	 * Since the pointer has not yet been written in the inode
10829	 * as the dependency prevents it, place the allocdirect on the
10830	 * bufwait list where it will be freed once the pointer is
10831	 * valid.
10832	 */
10833	if (wkhd == NULL)
10834		wkhd = &inodedep->id_bufwait;
10835	for (; adp; adp = listadp) {
10836		listadp = TAILQ_NEXT(adp, ad_next);
10837		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10838			return;
10839		TAILQ_REMOVE(listhead, adp, ad_next);
10840		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
10841	}
10842}
10843
10844/*
10845 * Called from within softdep_disk_write_complete above.  This routine
10846 * completes successfully written allocindirs.
10847 */
10848static void
10849handle_allocindir_partdone(aip)
10850	struct allocindir *aip;		/* the completed allocindir */
10851{
10852	struct indirdep *indirdep;
10853
10854	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
10855		return;
10856	indirdep = aip->ai_indirdep;
10857	LIST_REMOVE(aip, ai_next);
10858	/*
10859	 * Don't set a pointer while the buffer is undergoing IO or while
10860	 * we have active truncations.
10861	 */
10862	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
10863		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
10864		return;
10865	}
10866	if (indirdep->ir_state & UFS1FMT)
10867		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10868		    aip->ai_newblkno;
10869	else
10870		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10871		    aip->ai_newblkno;
10872	/*
10873	 * Await the pointer write before freeing the allocindir.
10874	 */
10875	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
10876}
10877
10878/*
10879 * Release segments held on a jwork list.
10880 */
10881static void
10882handle_jwork(wkhd)
10883	struct workhead *wkhd;
10884{
10885	struct worklist *wk;
10886
10887	while ((wk = LIST_FIRST(wkhd)) != NULL) {
10888		WORKLIST_REMOVE(wk);
10889		switch (wk->wk_type) {
10890		case D_JSEGDEP:
10891			free_jsegdep(WK_JSEGDEP(wk));
10892			continue;
10893		case D_FREEDEP:
10894			free_freedep(WK_FREEDEP(wk));
10895			continue;
10896		case D_FREEFRAG:
10897			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
10898			WORKITEM_FREE(wk, D_FREEFRAG);
10899			continue;
10900		case D_FREEWORK:
10901			handle_written_freework(WK_FREEWORK(wk));
10902			continue;
10903		default:
10904			panic("handle_jwork: Unknown type %s\n",
10905			    TYPENAME(wk->wk_type));
10906		}
10907	}
10908}
10909
10910/*
10911 * Handle the bufwait list on an inode when it is safe to release items
10912 * held there.  This normally happens after an inode block is written but
10913 * may be delayed and handled later if there are pending journal items that
10914 * are not yet safe to be released.
10915 */
10916static struct freefile *
10917handle_bufwait(inodedep, refhd)
10918	struct inodedep *inodedep;
10919	struct workhead *refhd;
10920{
10921	struct jaddref *jaddref;
10922	struct freefile *freefile;
10923	struct worklist *wk;
10924
10925	freefile = NULL;
10926	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
10927		WORKLIST_REMOVE(wk);
10928		switch (wk->wk_type) {
10929		case D_FREEFILE:
10930			/*
10931			 * We defer adding freefile to the worklist
10932			 * until all other additions have been made to
10933			 * ensure that it will be done after all the
10934			 * old blocks have been freed.
10935			 */
10936			if (freefile != NULL)
10937				panic("handle_bufwait: freefile");
10938			freefile = WK_FREEFILE(wk);
10939			continue;
10940
10941		case D_MKDIR:
10942			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
10943			continue;
10944
10945		case D_DIRADD:
10946			diradd_inode_written(WK_DIRADD(wk), inodedep);
10947			continue;
10948
10949		case D_FREEFRAG:
10950			wk->wk_state |= COMPLETE;
10951			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
10952				add_to_worklist(wk, 0);
10953			continue;
10954
10955		case D_DIRREM:
10956			wk->wk_state |= COMPLETE;
10957			add_to_worklist(wk, 0);
10958			continue;
10959
10960		case D_ALLOCDIRECT:
10961		case D_ALLOCINDIR:
10962			free_newblk(WK_NEWBLK(wk));
10963			continue;
10964
10965		case D_JNEWBLK:
10966			wk->wk_state |= COMPLETE;
10967			free_jnewblk(WK_JNEWBLK(wk));
10968			continue;
10969
10970		/*
10971		 * Save freed journal segments and add references on
10972		 * the supplied list which will delay their release
10973		 * until the cg bitmap is cleared on disk.
10974		 */
10975		case D_JSEGDEP:
10976			if (refhd == NULL)
10977				free_jsegdep(WK_JSEGDEP(wk));
10978			else
10979				WORKLIST_INSERT(refhd, wk);
10980			continue;
10981
10982		case D_JADDREF:
10983			jaddref = WK_JADDREF(wk);
10984			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
10985			    if_deps);
10986			/*
10987			 * Transfer any jaddrefs to the list to be freed with
10988			 * the bitmap if we're handling a removed file.
10989			 */
10990			if (refhd == NULL) {
10991				wk->wk_state |= COMPLETE;
10992				free_jaddref(jaddref);
10993			} else
10994				WORKLIST_INSERT(refhd, wk);
10995			continue;
10996
10997		default:
10998			panic("handle_bufwait: Unknown type %p(%s)",
10999			    wk, TYPENAME(wk->wk_type));
11000			/* NOTREACHED */
11001		}
11002	}
11003	return (freefile);
11004}
11005/*
11006 * Called from within softdep_disk_write_complete above to restore
11007 * in-memory inode block contents to their most up-to-date state. Note
11008 * that this routine is always called from interrupt level with further
11009 * splbio interrupts blocked.
11010 */
11011static int
11012handle_written_inodeblock(inodedep, bp)
11013	struct inodedep *inodedep;
11014	struct buf *bp;		/* buffer containing the inode block */
11015{
11016	struct freefile *freefile;
11017	struct allocdirect *adp, *nextadp;
11018	struct ufs1_dinode *dp1 = NULL;
11019	struct ufs2_dinode *dp2 = NULL;
11020	struct workhead wkhd;
11021	int hadchanges, fstype;
11022	ino_t freelink;
11023
11024	LIST_INIT(&wkhd);
11025	hadchanges = 0;
11026	freefile = NULL;
11027	if ((inodedep->id_state & IOSTARTED) == 0)
11028		panic("handle_written_inodeblock: not started");
11029	inodedep->id_state &= ~IOSTARTED;
11030	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11031		fstype = UFS1;
11032		dp1 = (struct ufs1_dinode *)bp->b_data +
11033		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11034		freelink = dp1->di_freelink;
11035	} else {
11036		fstype = UFS2;
11037		dp2 = (struct ufs2_dinode *)bp->b_data +
11038		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11039		freelink = dp2->di_freelink;
11040	}
11041	/*
11042	 * Leave this inodeblock dirty until it's in the list.
11043	 */
11044	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
11045		struct inodedep *inon;
11046
11047		inon = TAILQ_NEXT(inodedep, id_unlinked);
11048		if ((inon == NULL && freelink == 0) ||
11049		    (inon && inon->id_ino == freelink)) {
11050			if (inon)
11051				inon->id_state |= UNLINKPREV;
11052			inodedep->id_state |= UNLINKNEXT;
11053		}
11054		hadchanges = 1;
11055	}
11056	/*
11057	 * If we had to rollback the inode allocation because of
11058	 * bitmaps being incomplete, then simply restore it.
11059	 * Keep the block dirty so that it will not be reclaimed until
11060	 * all associated dependencies have been cleared and the
11061	 * corresponding updates written to disk.
11062	 */
11063	if (inodedep->id_savedino1 != NULL) {
11064		hadchanges = 1;
11065		if (fstype == UFS1)
11066			*dp1 = *inodedep->id_savedino1;
11067		else
11068			*dp2 = *inodedep->id_savedino2;
11069		free(inodedep->id_savedino1, M_SAVEDINO);
11070		inodedep->id_savedino1 = NULL;
11071		if ((bp->b_flags & B_DELWRI) == 0)
11072			stat_inode_bitmap++;
11073		bdirty(bp);
11074		/*
11075		 * If the inode is clear here and GOINGAWAY it will never
11076		 * be written.  Process the bufwait and clear any pending
11077		 * work which may include the freefile.
11078		 */
11079		if (inodedep->id_state & GOINGAWAY)
11080			goto bufwait;
11081		return (1);
11082	}
11083	inodedep->id_state |= COMPLETE;
11084	/*
11085	 * Roll forward anything that had to be rolled back before
11086	 * the inode could be updated.
11087	 */
11088	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11089		nextadp = TAILQ_NEXT(adp, ad_next);
11090		if (adp->ad_state & ATTACHED)
11091			panic("handle_written_inodeblock: new entry");
11092		if (fstype == UFS1) {
11093			if (adp->ad_offset < NDADDR) {
11094				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11095					panic("%s %s #%jd mismatch %d != %jd",
11096					    "handle_written_inodeblock:",
11097					    "direct pointer",
11098					    (intmax_t)adp->ad_offset,
11099					    dp1->di_db[adp->ad_offset],
11100					    (intmax_t)adp->ad_oldblkno);
11101				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11102			} else {
11103				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11104					panic("%s: %s #%jd allocated as %d",
11105					    "handle_written_inodeblock",
11106					    "indirect pointer",
11107					    (intmax_t)adp->ad_offset - NDADDR,
11108					    dp1->di_ib[adp->ad_offset - NDADDR]);
11109				dp1->di_ib[adp->ad_offset - NDADDR] =
11110				    adp->ad_newblkno;
11111			}
11112		} else {
11113			if (adp->ad_offset < NDADDR) {
11114				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11115					panic("%s: %s #%jd %s %jd != %jd",
11116					    "handle_written_inodeblock",
11117					    "direct pointer",
11118					    (intmax_t)adp->ad_offset, "mismatch",
11119					    (intmax_t)dp2->di_db[adp->ad_offset],
11120					    (intmax_t)adp->ad_oldblkno);
11121				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11122			} else {
11123				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11124					panic("%s: %s #%jd allocated as %jd",
11125					    "handle_written_inodeblock",
11126					    "indirect pointer",
11127					    (intmax_t)adp->ad_offset - NDADDR,
11128					    (intmax_t)
11129					    dp2->di_ib[adp->ad_offset - NDADDR]);
11130				dp2->di_ib[adp->ad_offset - NDADDR] =
11131				    adp->ad_newblkno;
11132			}
11133		}
11134		adp->ad_state &= ~UNDONE;
11135		adp->ad_state |= ATTACHED;
11136		hadchanges = 1;
11137	}
11138	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11139		nextadp = TAILQ_NEXT(adp, ad_next);
11140		if (adp->ad_state & ATTACHED)
11141			panic("handle_written_inodeblock: new entry");
11142		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11143			panic("%s: direct pointers #%jd %s %jd != %jd",
11144			    "handle_written_inodeblock",
11145			    (intmax_t)adp->ad_offset, "mismatch",
11146			    (intmax_t)dp2->di_extb[adp->ad_offset],
11147			    (intmax_t)adp->ad_oldblkno);
11148		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11149		adp->ad_state &= ~UNDONE;
11150		adp->ad_state |= ATTACHED;
11151		hadchanges = 1;
11152	}
11153	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11154		stat_direct_blk_ptrs++;
11155	/*
11156	 * Reset the file size to its most up-to-date value.
11157	 */
11158	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11159		panic("handle_written_inodeblock: bad size");
11160	if (inodedep->id_savednlink > LINK_MAX)
11161		panic("handle_written_inodeblock: Invalid link count "
11162		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11163	if (fstype == UFS1) {
11164		if (dp1->di_nlink != inodedep->id_savednlink) {
11165			dp1->di_nlink = inodedep->id_savednlink;
11166			hadchanges = 1;
11167		}
11168		if (dp1->di_size != inodedep->id_savedsize) {
11169			dp1->di_size = inodedep->id_savedsize;
11170			hadchanges = 1;
11171		}
11172	} else {
11173		if (dp2->di_nlink != inodedep->id_savednlink) {
11174			dp2->di_nlink = inodedep->id_savednlink;
11175			hadchanges = 1;
11176		}
11177		if (dp2->di_size != inodedep->id_savedsize) {
11178			dp2->di_size = inodedep->id_savedsize;
11179			hadchanges = 1;
11180		}
11181		if (dp2->di_extsize != inodedep->id_savedextsize) {
11182			dp2->di_extsize = inodedep->id_savedextsize;
11183			hadchanges = 1;
11184		}
11185	}
11186	inodedep->id_savedsize = -1;
11187	inodedep->id_savedextsize = -1;
11188	inodedep->id_savednlink = -1;
11189	/*
11190	 * If there were any rollbacks in the inode block, then it must be
11191	 * marked dirty so that its will eventually get written back in
11192	 * its correct form.
11193	 */
11194	if (hadchanges)
11195		bdirty(bp);
11196bufwait:
11197	/*
11198	 * Process any allocdirects that completed during the update.
11199	 */
11200	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11201		handle_allocdirect_partdone(adp, &wkhd);
11202	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11203		handle_allocdirect_partdone(adp, &wkhd);
11204	/*
11205	 * Process deallocations that were held pending until the
11206	 * inode had been written to disk. Freeing of the inode
11207	 * is delayed until after all blocks have been freed to
11208	 * avoid creation of new <vfsid, inum, lbn> triples
11209	 * before the old ones have been deleted.  Completely
11210	 * unlinked inodes are not processed until the unlinked
11211	 * inode list is written or the last reference is removed.
11212	 */
11213	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11214		freefile = handle_bufwait(inodedep, NULL);
11215		if (freefile && !LIST_EMPTY(&wkhd)) {
11216			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11217			freefile = NULL;
11218		}
11219	}
11220	/*
11221	 * Move rolled forward dependency completions to the bufwait list
11222	 * now that those that were already written have been processed.
11223	 */
11224	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11225		panic("handle_written_inodeblock: bufwait but no changes");
11226	jwork_move(&inodedep->id_bufwait, &wkhd);
11227
11228	if (freefile != NULL) {
11229		/*
11230		 * If the inode is goingaway it was never written.  Fake up
11231		 * the state here so free_inodedep() can succeed.
11232		 */
11233		if (inodedep->id_state & GOINGAWAY)
11234			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11235		if (free_inodedep(inodedep) == 0)
11236			panic("handle_written_inodeblock: live inodedep %p",
11237			    inodedep);
11238		add_to_worklist(&freefile->fx_list, 0);
11239		return (0);
11240	}
11241
11242	/*
11243	 * If no outstanding dependencies, free it.
11244	 */
11245	if (free_inodedep(inodedep) ||
11246	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11247	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11248	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11249	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11250		return (0);
11251	return (hadchanges);
11252}
11253
11254static int
11255handle_written_indirdep(indirdep, bp, bpp)
11256	struct indirdep *indirdep;
11257	struct buf *bp;
11258	struct buf **bpp;
11259{
11260	struct allocindir *aip;
11261	struct buf *sbp;
11262	int chgs;
11263
11264	if (indirdep->ir_state & GOINGAWAY)
11265		panic("handle_written_indirdep: indirdep gone");
11266	if ((indirdep->ir_state & IOSTARTED) == 0)
11267		panic("handle_written_indirdep: IO not started");
11268	chgs = 0;
11269	/*
11270	 * If there were rollbacks revert them here.
11271	 */
11272	if (indirdep->ir_saveddata) {
11273		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11274		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11275			free(indirdep->ir_saveddata, M_INDIRDEP);
11276			indirdep->ir_saveddata = NULL;
11277		}
11278		chgs = 1;
11279	}
11280	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11281	indirdep->ir_state |= ATTACHED;
11282	/*
11283	 * Move allocindirs with written pointers to the completehd if
11284	 * the indirdep's pointer is not yet written.  Otherwise
11285	 * free them here.
11286	 */
11287	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11288		LIST_REMOVE(aip, ai_next);
11289		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11290			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11291			    ai_next);
11292			newblk_freefrag(&aip->ai_block);
11293			continue;
11294		}
11295		free_newblk(&aip->ai_block);
11296	}
11297	/*
11298	 * Move allocindirs that have finished dependency processing from
11299	 * the done list to the write list after updating the pointers.
11300	 */
11301	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11302		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11303			handle_allocindir_partdone(aip);
11304			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11305				panic("disk_write_complete: not gone");
11306			chgs = 1;
11307		}
11308	}
11309	/*
11310	 * Preserve the indirdep if there were any changes or if it is not
11311	 * yet valid on disk.
11312	 */
11313	if (chgs) {
11314		stat_indir_blk_ptrs++;
11315		bdirty(bp);
11316		return (1);
11317	}
11318	/*
11319	 * If there were no changes we can discard the savedbp and detach
11320	 * ourselves from the buf.  We are only carrying completed pointers
11321	 * in this case.
11322	 */
11323	sbp = indirdep->ir_savebp;
11324	sbp->b_flags |= B_INVAL | B_NOCACHE;
11325	indirdep->ir_savebp = NULL;
11326	indirdep->ir_bp = NULL;
11327	if (*bpp != NULL)
11328		panic("handle_written_indirdep: bp already exists.");
11329	*bpp = sbp;
11330	/*
11331	 * The indirdep may not be freed until its parent points at it.
11332	 */
11333	if (indirdep->ir_state & DEPCOMPLETE)
11334		free_indirdep(indirdep);
11335
11336	return (0);
11337}
11338
11339/*
11340 * Process a diradd entry after its dependent inode has been written.
11341 * This routine must be called with splbio interrupts blocked.
11342 */
11343static void
11344diradd_inode_written(dap, inodedep)
11345	struct diradd *dap;
11346	struct inodedep *inodedep;
11347{
11348
11349	dap->da_state |= COMPLETE;
11350	complete_diradd(dap);
11351	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11352}
11353
11354/*
11355 * Returns true if the bmsafemap will have rollbacks when written.  Must
11356 * only be called with lk and the buf lock on the cg held.
11357 */
11358static int
11359bmsafemap_backgroundwrite(bmsafemap, bp)
11360	struct bmsafemap *bmsafemap;
11361	struct buf *bp;
11362{
11363	int dirty;
11364
11365	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11366	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11367	/*
11368	 * If we're initiating a background write we need to process the
11369	 * rollbacks as they exist now, not as they exist when IO starts.
11370	 * No other consumers will look at the contents of the shadowed
11371	 * buf so this is safe to do here.
11372	 */
11373	if (bp->b_xflags & BX_BKGRDMARKER)
11374		initiate_write_bmsafemap(bmsafemap, bp);
11375
11376	return (dirty);
11377}
11378
11379/*
11380 * Re-apply an allocation when a cg write is complete.
11381 */
11382static int
11383jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11384	struct jnewblk *jnewblk;
11385	struct fs *fs;
11386	struct cg *cgp;
11387	uint8_t *blksfree;
11388{
11389	ufs1_daddr_t fragno;
11390	ufs2_daddr_t blkno;
11391	long cgbno, bbase;
11392	int frags, blk;
11393	int i;
11394
11395	frags = 0;
11396	cgbno = dtogd(fs, jnewblk->jn_blkno);
11397	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11398		if (isclr(blksfree, cgbno + i))
11399			panic("jnewblk_rollforward: re-allocated fragment");
11400		frags++;
11401	}
11402	if (frags == fs->fs_frag) {
11403		blkno = fragstoblks(fs, cgbno);
11404		ffs_clrblock(fs, blksfree, (long)blkno);
11405		ffs_clusteracct(fs, cgp, blkno, -1);
11406		cgp->cg_cs.cs_nbfree--;
11407	} else {
11408		bbase = cgbno - fragnum(fs, cgbno);
11409		cgbno += jnewblk->jn_oldfrags;
11410                /* If a complete block had been reassembled, account for it. */
11411		fragno = fragstoblks(fs, bbase);
11412		if (ffs_isblock(fs, blksfree, fragno)) {
11413			cgp->cg_cs.cs_nffree += fs->fs_frag;
11414			ffs_clusteracct(fs, cgp, fragno, -1);
11415			cgp->cg_cs.cs_nbfree--;
11416		}
11417		/* Decrement the old frags.  */
11418		blk = blkmap(fs, blksfree, bbase);
11419		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11420		/* Allocate the fragment */
11421		for (i = 0; i < frags; i++)
11422			clrbit(blksfree, cgbno + i);
11423		cgp->cg_cs.cs_nffree -= frags;
11424		/* Add back in counts associated with the new frags */
11425		blk = blkmap(fs, blksfree, bbase);
11426		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11427	}
11428	return (frags);
11429}
11430
11431/*
11432 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11433 * changes if it's not a background write.  Set all written dependencies
11434 * to DEPCOMPLETE and free the structure if possible.
11435 */
11436static int
11437handle_written_bmsafemap(bmsafemap, bp)
11438	struct bmsafemap *bmsafemap;
11439	struct buf *bp;
11440{
11441	struct newblk *newblk;
11442	struct inodedep *inodedep;
11443	struct jaddref *jaddref, *jatmp;
11444	struct jnewblk *jnewblk, *jntmp;
11445	struct ufsmount *ump;
11446	uint8_t *inosused;
11447	uint8_t *blksfree;
11448	struct cg *cgp;
11449	struct fs *fs;
11450	ino_t ino;
11451	int foreground;
11452	int chgs;
11453
11454	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11455		panic("initiate_write_bmsafemap: Not started\n");
11456	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11457	chgs = 0;
11458	bmsafemap->sm_state &= ~IOSTARTED;
11459	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11460	/*
11461	 * Release journal work that was waiting on the write.
11462	 */
11463	handle_jwork(&bmsafemap->sm_freewr);
11464
11465	/*
11466	 * Restore unwritten inode allocation pending jaddref writes.
11467	 */
11468	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11469		cgp = (struct cg *)bp->b_data;
11470		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11471		inosused = cg_inosused(cgp);
11472		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11473		    ja_bmdeps, jatmp) {
11474			if ((jaddref->ja_state & UNDONE) == 0)
11475				continue;
11476			ino = jaddref->ja_ino % fs->fs_ipg;
11477			if (isset(inosused, ino))
11478				panic("handle_written_bmsafemap: "
11479				    "re-allocated inode");
11480			/* Do the roll-forward only if it's a real copy. */
11481			if (foreground) {
11482				if ((jaddref->ja_mode & IFMT) == IFDIR)
11483					cgp->cg_cs.cs_ndir++;
11484				cgp->cg_cs.cs_nifree--;
11485				setbit(inosused, ino);
11486				chgs = 1;
11487			}
11488			jaddref->ja_state &= ~UNDONE;
11489			jaddref->ja_state |= ATTACHED;
11490			free_jaddref(jaddref);
11491		}
11492	}
11493	/*
11494	 * Restore any block allocations which are pending journal writes.
11495	 */
11496	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11497		cgp = (struct cg *)bp->b_data;
11498		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11499		blksfree = cg_blksfree(cgp);
11500		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11501		    jntmp) {
11502			if ((jnewblk->jn_state & UNDONE) == 0)
11503				continue;
11504			/* Do the roll-forward only if it's a real copy. */
11505			if (foreground &&
11506			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11507				chgs = 1;
11508			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11509			jnewblk->jn_state |= ATTACHED;
11510			free_jnewblk(jnewblk);
11511		}
11512	}
11513	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11514		newblk->nb_state |= DEPCOMPLETE;
11515		newblk->nb_state &= ~ONDEPLIST;
11516		newblk->nb_bmsafemap = NULL;
11517		LIST_REMOVE(newblk, nb_deps);
11518		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11519			handle_allocdirect_partdone(
11520			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11521		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11522			handle_allocindir_partdone(
11523			    WK_ALLOCINDIR(&newblk->nb_list));
11524		else if (newblk->nb_list.wk_type != D_NEWBLK)
11525			panic("handle_written_bmsafemap: Unexpected type: %s",
11526			    TYPENAME(newblk->nb_list.wk_type));
11527	}
11528	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11529		inodedep->id_state |= DEPCOMPLETE;
11530		inodedep->id_state &= ~ONDEPLIST;
11531		LIST_REMOVE(inodedep, id_deps);
11532		inodedep->id_bmsafemap = NULL;
11533	}
11534	LIST_REMOVE(bmsafemap, sm_next);
11535	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11536	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11537	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11538	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11539	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11540		LIST_REMOVE(bmsafemap, sm_hash);
11541		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11542		return (0);
11543	}
11544	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11545	if (foreground)
11546		bdirty(bp);
11547	return (1);
11548}
11549
11550/*
11551 * Try to free a mkdir dependency.
11552 */
11553static void
11554complete_mkdir(mkdir)
11555	struct mkdir *mkdir;
11556{
11557	struct diradd *dap;
11558
11559	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11560		return;
11561	LIST_REMOVE(mkdir, md_mkdirs);
11562	dap = mkdir->md_diradd;
11563	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11564	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11565		dap->da_state |= DEPCOMPLETE;
11566		complete_diradd(dap);
11567	}
11568	WORKITEM_FREE(mkdir, D_MKDIR);
11569}
11570
11571/*
11572 * Handle the completion of a mkdir dependency.
11573 */
11574static void
11575handle_written_mkdir(mkdir, type)
11576	struct mkdir *mkdir;
11577	int type;
11578{
11579
11580	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11581		panic("handle_written_mkdir: bad type");
11582	mkdir->md_state |= COMPLETE;
11583	complete_mkdir(mkdir);
11584}
11585
11586static int
11587free_pagedep(pagedep)
11588	struct pagedep *pagedep;
11589{
11590	int i;
11591
11592	if (pagedep->pd_state & NEWBLOCK)
11593		return (0);
11594	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11595		return (0);
11596	for (i = 0; i < DAHASHSZ; i++)
11597		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11598			return (0);
11599	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11600		return (0);
11601	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11602		return (0);
11603	if (pagedep->pd_state & ONWORKLIST)
11604		WORKLIST_REMOVE(&pagedep->pd_list);
11605	LIST_REMOVE(pagedep, pd_hash);
11606	WORKITEM_FREE(pagedep, D_PAGEDEP);
11607
11608	return (1);
11609}
11610
11611/*
11612 * Called from within softdep_disk_write_complete above.
11613 * A write operation was just completed. Removed inodes can
11614 * now be freed and associated block pointers may be committed.
11615 * Note that this routine is always called from interrupt level
11616 * with further splbio interrupts blocked.
11617 */
11618static int
11619handle_written_filepage(pagedep, bp)
11620	struct pagedep *pagedep;
11621	struct buf *bp;		/* buffer containing the written page */
11622{
11623	struct dirrem *dirrem;
11624	struct diradd *dap, *nextdap;
11625	struct direct *ep;
11626	int i, chgs;
11627
11628	if ((pagedep->pd_state & IOSTARTED) == 0)
11629		panic("handle_written_filepage: not started");
11630	pagedep->pd_state &= ~IOSTARTED;
11631	/*
11632	 * Process any directory removals that have been committed.
11633	 */
11634	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11635		LIST_REMOVE(dirrem, dm_next);
11636		dirrem->dm_state |= COMPLETE;
11637		dirrem->dm_dirinum = pagedep->pd_ino;
11638		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11639		    ("handle_written_filepage: Journal entries not written."));
11640		add_to_worklist(&dirrem->dm_list, 0);
11641	}
11642	/*
11643	 * Free any directory additions that have been committed.
11644	 * If it is a newly allocated block, we have to wait until
11645	 * the on-disk directory inode claims the new block.
11646	 */
11647	if ((pagedep->pd_state & NEWBLOCK) == 0)
11648		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11649			free_diradd(dap, NULL);
11650	/*
11651	 * Uncommitted directory entries must be restored.
11652	 */
11653	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11654		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11655		     dap = nextdap) {
11656			nextdap = LIST_NEXT(dap, da_pdlist);
11657			if (dap->da_state & ATTACHED)
11658				panic("handle_written_filepage: attached");
11659			ep = (struct direct *)
11660			    ((char *)bp->b_data + dap->da_offset);
11661			ep->d_ino = dap->da_newinum;
11662			dap->da_state &= ~UNDONE;
11663			dap->da_state |= ATTACHED;
11664			chgs = 1;
11665			/*
11666			 * If the inode referenced by the directory has
11667			 * been written out, then the dependency can be
11668			 * moved to the pending list.
11669			 */
11670			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11671				LIST_REMOVE(dap, da_pdlist);
11672				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11673				    da_pdlist);
11674			}
11675		}
11676	}
11677	/*
11678	 * If there were any rollbacks in the directory, then it must be
11679	 * marked dirty so that its will eventually get written back in
11680	 * its correct form.
11681	 */
11682	if (chgs) {
11683		if ((bp->b_flags & B_DELWRI) == 0)
11684			stat_dir_entry++;
11685		bdirty(bp);
11686		return (1);
11687	}
11688	/*
11689	 * If we are not waiting for a new directory block to be
11690	 * claimed by its inode, then the pagedep will be freed.
11691	 * Otherwise it will remain to track any new entries on
11692	 * the page in case they are fsync'ed.
11693	 */
11694	free_pagedep(pagedep);
11695	return (0);
11696}
11697
11698/*
11699 * Writing back in-core inode structures.
11700 *
11701 * The filesystem only accesses an inode's contents when it occupies an
11702 * "in-core" inode structure.  These "in-core" structures are separate from
11703 * the page frames used to cache inode blocks.  Only the latter are
11704 * transferred to/from the disk.  So, when the updated contents of the
11705 * "in-core" inode structure are copied to the corresponding in-memory inode
11706 * block, the dependencies are also transferred.  The following procedure is
11707 * called when copying a dirty "in-core" inode to a cached inode block.
11708 */
11709
11710/*
11711 * Called when an inode is loaded from disk. If the effective link count
11712 * differed from the actual link count when it was last flushed, then we
11713 * need to ensure that the correct effective link count is put back.
11714 */
11715void
11716softdep_load_inodeblock(ip)
11717	struct inode *ip;	/* the "in_core" copy of the inode */
11718{
11719	struct inodedep *inodedep;
11720
11721	/*
11722	 * Check for alternate nlink count.
11723	 */
11724	ip->i_effnlink = ip->i_nlink;
11725	ACQUIRE_LOCK(&lk);
11726	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
11727	    &inodedep) == 0) {
11728		FREE_LOCK(&lk);
11729		return;
11730	}
11731	ip->i_effnlink -= inodedep->id_nlinkdelta;
11732	FREE_LOCK(&lk);
11733}
11734
11735/*
11736 * This routine is called just before the "in-core" inode
11737 * information is to be copied to the in-memory inode block.
11738 * Recall that an inode block contains several inodes. If
11739 * the force flag is set, then the dependencies will be
11740 * cleared so that the update can always be made. Note that
11741 * the buffer is locked when this routine is called, so we
11742 * will never be in the middle of writing the inode block
11743 * to disk.
11744 */
11745void
11746softdep_update_inodeblock(ip, bp, waitfor)
11747	struct inode *ip;	/* the "in_core" copy of the inode */
11748	struct buf *bp;		/* the buffer containing the inode block */
11749	int waitfor;		/* nonzero => update must be allowed */
11750{
11751	struct inodedep *inodedep;
11752	struct inoref *inoref;
11753	struct worklist *wk;
11754	struct mount *mp;
11755	struct buf *ibp;
11756	struct fs *fs;
11757	int error;
11758
11759	mp = UFSTOVFS(ip->i_ump);
11760	fs = ip->i_fs;
11761	/*
11762	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
11763	 * does not have access to the in-core ip so must write directly into
11764	 * the inode block buffer when setting freelink.
11765	 */
11766	if (fs->fs_magic == FS_UFS1_MAGIC)
11767		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
11768		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11769	else
11770		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
11771		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
11772	/*
11773	 * If the effective link count is not equal to the actual link
11774	 * count, then we must track the difference in an inodedep while
11775	 * the inode is (potentially) tossed out of the cache. Otherwise,
11776	 * if there is no existing inodedep, then there are no dependencies
11777	 * to track.
11778	 */
11779	ACQUIRE_LOCK(&lk);
11780again:
11781	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11782		FREE_LOCK(&lk);
11783		if (ip->i_effnlink != ip->i_nlink)
11784			panic("softdep_update_inodeblock: bad link count");
11785		return;
11786	}
11787	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
11788		panic("softdep_update_inodeblock: bad delta");
11789	/*
11790	 * If we're flushing all dependencies we must also move any waiting
11791	 * for journal writes onto the bufwait list prior to I/O.
11792	 */
11793	if (waitfor) {
11794		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11795			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11796			    == DEPCOMPLETE) {
11797				jwait(&inoref->if_list, MNT_WAIT);
11798				goto again;
11799			}
11800		}
11801	}
11802	/*
11803	 * Changes have been initiated. Anything depending on these
11804	 * changes cannot occur until this inode has been written.
11805	 */
11806	inodedep->id_state &= ~COMPLETE;
11807	if ((inodedep->id_state & ONWORKLIST) == 0)
11808		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
11809	/*
11810	 * Any new dependencies associated with the incore inode must
11811	 * now be moved to the list associated with the buffer holding
11812	 * the in-memory copy of the inode. Once merged process any
11813	 * allocdirects that are completed by the merger.
11814	 */
11815	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
11816	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
11817		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
11818		    NULL);
11819	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
11820	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
11821		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
11822		    NULL);
11823	/*
11824	 * Now that the inode has been pushed into the buffer, the
11825	 * operations dependent on the inode being written to disk
11826	 * can be moved to the id_bufwait so that they will be
11827	 * processed when the buffer I/O completes.
11828	 */
11829	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
11830		WORKLIST_REMOVE(wk);
11831		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
11832	}
11833	/*
11834	 * Newly allocated inodes cannot be written until the bitmap
11835	 * that allocates them have been written (indicated by
11836	 * DEPCOMPLETE being set in id_state). If we are doing a
11837	 * forced sync (e.g., an fsync on a file), we force the bitmap
11838	 * to be written so that the update can be done.
11839	 */
11840	if (waitfor == 0) {
11841		FREE_LOCK(&lk);
11842		return;
11843	}
11844retry:
11845	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
11846		FREE_LOCK(&lk);
11847		return;
11848	}
11849	ibp = inodedep->id_bmsafemap->sm_buf;
11850	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
11851	if (ibp == NULL) {
11852		/*
11853		 * If ibp came back as NULL, the dependency could have been
11854		 * freed while we slept.  Look it up again, and check to see
11855		 * that it has completed.
11856		 */
11857		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
11858			goto retry;
11859		FREE_LOCK(&lk);
11860		return;
11861	}
11862	FREE_LOCK(&lk);
11863	if ((error = bwrite(ibp)) != 0)
11864		softdep_error("softdep_update_inodeblock: bwrite", error);
11865}
11866
11867/*
11868 * Merge the a new inode dependency list (such as id_newinoupdt) into an
11869 * old inode dependency list (such as id_inoupdt). This routine must be
11870 * called with splbio interrupts blocked.
11871 */
11872static void
11873merge_inode_lists(newlisthead, oldlisthead)
11874	struct allocdirectlst *newlisthead;
11875	struct allocdirectlst *oldlisthead;
11876{
11877	struct allocdirect *listadp, *newadp;
11878
11879	newadp = TAILQ_FIRST(newlisthead);
11880	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
11881		if (listadp->ad_offset < newadp->ad_offset) {
11882			listadp = TAILQ_NEXT(listadp, ad_next);
11883			continue;
11884		}
11885		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11886		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
11887		if (listadp->ad_offset == newadp->ad_offset) {
11888			allocdirect_merge(oldlisthead, newadp,
11889			    listadp);
11890			listadp = newadp;
11891		}
11892		newadp = TAILQ_FIRST(newlisthead);
11893	}
11894	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
11895		TAILQ_REMOVE(newlisthead, newadp, ad_next);
11896		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
11897	}
11898}
11899
11900/*
11901 * If we are doing an fsync, then we must ensure that any directory
11902 * entries for the inode have been written after the inode gets to disk.
11903 */
11904int
11905softdep_fsync(vp)
11906	struct vnode *vp;	/* the "in_core" copy of the inode */
11907{
11908	struct inodedep *inodedep;
11909	struct pagedep *pagedep;
11910	struct inoref *inoref;
11911	struct worklist *wk;
11912	struct diradd *dap;
11913	struct mount *mp;
11914	struct vnode *pvp;
11915	struct inode *ip;
11916	struct buf *bp;
11917	struct fs *fs;
11918	struct thread *td = curthread;
11919	int error, flushparent, pagedep_new_block;
11920	ino_t parentino;
11921	ufs_lbn_t lbn;
11922
11923	ip = VTOI(vp);
11924	fs = ip->i_fs;
11925	mp = vp->v_mount;
11926	ACQUIRE_LOCK(&lk);
11927restart:
11928	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11929		FREE_LOCK(&lk);
11930		return (0);
11931	}
11932	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11933		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11934		    == DEPCOMPLETE) {
11935			jwait(&inoref->if_list, MNT_WAIT);
11936			goto restart;
11937		}
11938	}
11939	if (!LIST_EMPTY(&inodedep->id_inowait) ||
11940	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
11941	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
11942	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
11943	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
11944		panic("softdep_fsync: pending ops %p", inodedep);
11945	for (error = 0, flushparent = 0; ; ) {
11946		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
11947			break;
11948		if (wk->wk_type != D_DIRADD)
11949			panic("softdep_fsync: Unexpected type %s",
11950			    TYPENAME(wk->wk_type));
11951		dap = WK_DIRADD(wk);
11952		/*
11953		 * Flush our parent if this directory entry has a MKDIR_PARENT
11954		 * dependency or is contained in a newly allocated block.
11955		 */
11956		if (dap->da_state & DIRCHG)
11957			pagedep = dap->da_previous->dm_pagedep;
11958		else
11959			pagedep = dap->da_pagedep;
11960		parentino = pagedep->pd_ino;
11961		lbn = pagedep->pd_lbn;
11962		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
11963			panic("softdep_fsync: dirty");
11964		if ((dap->da_state & MKDIR_PARENT) ||
11965		    (pagedep->pd_state & NEWBLOCK))
11966			flushparent = 1;
11967		else
11968			flushparent = 0;
11969		/*
11970		 * If we are being fsync'ed as part of vgone'ing this vnode,
11971		 * then we will not be able to release and recover the
11972		 * vnode below, so we just have to give up on writing its
11973		 * directory entry out. It will eventually be written, just
11974		 * not now, but then the user was not asking to have it
11975		 * written, so we are not breaking any promises.
11976		 */
11977		if (vp->v_iflag & VI_DOOMED)
11978			break;
11979		/*
11980		 * We prevent deadlock by always fetching inodes from the
11981		 * root, moving down the directory tree. Thus, when fetching
11982		 * our parent directory, we first try to get the lock. If
11983		 * that fails, we must unlock ourselves before requesting
11984		 * the lock on our parent. See the comment in ufs_lookup
11985		 * for details on possible races.
11986		 */
11987		FREE_LOCK(&lk);
11988		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
11989		    FFSV_FORCEINSMQ)) {
11990			error = vfs_busy(mp, MBF_NOWAIT);
11991			if (error != 0) {
11992				vfs_ref(mp);
11993				VOP_UNLOCK(vp, 0);
11994				error = vfs_busy(mp, 0);
11995				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
11996				vfs_rel(mp);
11997				if (error != 0)
11998					return (ENOENT);
11999				if (vp->v_iflag & VI_DOOMED) {
12000					vfs_unbusy(mp);
12001					return (ENOENT);
12002				}
12003			}
12004			VOP_UNLOCK(vp, 0);
12005			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12006			    &pvp, FFSV_FORCEINSMQ);
12007			vfs_unbusy(mp);
12008			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12009			if (vp->v_iflag & VI_DOOMED) {
12010				if (error == 0)
12011					vput(pvp);
12012				error = ENOENT;
12013			}
12014			if (error != 0)
12015				return (error);
12016		}
12017		/*
12018		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12019		 * that are contained in direct blocks will be resolved by
12020		 * doing a ffs_update. Pagedeps contained in indirect blocks
12021		 * may require a complete sync'ing of the directory. So, we
12022		 * try the cheap and fast ffs_update first, and if that fails,
12023		 * then we do the slower ffs_syncvnode of the directory.
12024		 */
12025		if (flushparent) {
12026			int locked;
12027
12028			if ((error = ffs_update(pvp, 1)) != 0) {
12029				vput(pvp);
12030				return (error);
12031			}
12032			ACQUIRE_LOCK(&lk);
12033			locked = 1;
12034			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12035				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12036					if (wk->wk_type != D_DIRADD)
12037						panic("softdep_fsync: Unexpected type %s",
12038						      TYPENAME(wk->wk_type));
12039					dap = WK_DIRADD(wk);
12040					if (dap->da_state & DIRCHG)
12041						pagedep = dap->da_previous->dm_pagedep;
12042					else
12043						pagedep = dap->da_pagedep;
12044					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12045					FREE_LOCK(&lk);
12046					locked = 0;
12047					if (pagedep_new_block && (error =
12048					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12049						vput(pvp);
12050						return (error);
12051					}
12052				}
12053			}
12054			if (locked)
12055				FREE_LOCK(&lk);
12056		}
12057		/*
12058		 * Flush directory page containing the inode's name.
12059		 */
12060		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12061		    &bp);
12062		if (error == 0)
12063			error = bwrite(bp);
12064		else
12065			brelse(bp);
12066		vput(pvp);
12067		if (error != 0)
12068			return (error);
12069		ACQUIRE_LOCK(&lk);
12070		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12071			break;
12072	}
12073	FREE_LOCK(&lk);
12074	return (0);
12075}
12076
12077/*
12078 * Flush all the dirty bitmaps associated with the block device
12079 * before flushing the rest of the dirty blocks so as to reduce
12080 * the number of dependencies that will have to be rolled back.
12081 *
12082 * XXX Unused?
12083 */
12084void
12085softdep_fsync_mountdev(vp)
12086	struct vnode *vp;
12087{
12088	struct buf *bp, *nbp;
12089	struct worklist *wk;
12090	struct bufobj *bo;
12091
12092	if (!vn_isdisk(vp, NULL))
12093		panic("softdep_fsync_mountdev: vnode not a disk");
12094	bo = &vp->v_bufobj;
12095restart:
12096	BO_LOCK(bo);
12097	ACQUIRE_LOCK(&lk);
12098	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12099		/*
12100		 * If it is already scheduled, skip to the next buffer.
12101		 */
12102		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12103			continue;
12104
12105		if ((bp->b_flags & B_DELWRI) == 0)
12106			panic("softdep_fsync_mountdev: not dirty");
12107		/*
12108		 * We are only interested in bitmaps with outstanding
12109		 * dependencies.
12110		 */
12111		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12112		    wk->wk_type != D_BMSAFEMAP ||
12113		    (bp->b_vflags & BV_BKGRDINPROG)) {
12114			BUF_UNLOCK(bp);
12115			continue;
12116		}
12117		FREE_LOCK(&lk);
12118		BO_UNLOCK(bo);
12119		bremfree(bp);
12120		(void) bawrite(bp);
12121		goto restart;
12122	}
12123	FREE_LOCK(&lk);
12124	drain_output(vp);
12125	BO_UNLOCK(bo);
12126}
12127
12128/*
12129 * Sync all cylinder groups that were dirty at the time this function is
12130 * called.  Newly dirtied cgs will be inserted before the sintenel.  This
12131 * is used to flush freedep activity that may be holding up writes to a
12132 * indirect block.
12133 */
12134static int
12135sync_cgs(mp, waitfor)
12136	struct mount *mp;
12137	int waitfor;
12138{
12139	struct bmsafemap *bmsafemap;
12140	struct bmsafemap *sintenel;
12141	struct ufsmount *ump;
12142	struct buf *bp;
12143	int error;
12144
12145	sintenel = malloc(sizeof(*sintenel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12146	sintenel->sm_cg = -1;
12147	ump = VFSTOUFS(mp);
12148	error = 0;
12149	ACQUIRE_LOCK(&lk);
12150	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sintenel, sm_next);
12151	for (bmsafemap = LIST_NEXT(sintenel, sm_next); bmsafemap != NULL;
12152	    bmsafemap = LIST_NEXT(sintenel, sm_next)) {
12153		/* Skip sintenels and cgs with no work to release. */
12154		if (bmsafemap->sm_cg == -1 ||
12155		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12156		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12157			LIST_REMOVE(sintenel, sm_next);
12158			LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next);
12159			continue;
12160		}
12161		/*
12162		 * If we don't get the lock and we're waiting try again, if
12163		 * not move on to the next buf and try to sync it.
12164		 */
12165		bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor);
12166		if (bp == NULL && waitfor == MNT_WAIT)
12167			continue;
12168		LIST_REMOVE(sintenel, sm_next);
12169		LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next);
12170		if (bp == NULL)
12171			continue;
12172		FREE_LOCK(&lk);
12173		if (waitfor == MNT_NOWAIT)
12174			bawrite(bp);
12175		else
12176			error = bwrite(bp);
12177		ACQUIRE_LOCK(&lk);
12178		if (error)
12179			break;
12180	}
12181	LIST_REMOVE(sintenel, sm_next);
12182	FREE_LOCK(&lk);
12183	free(sintenel, M_BMSAFEMAP);
12184	return (error);
12185}
12186
12187/*
12188 * This routine is called when we are trying to synchronously flush a
12189 * file. This routine must eliminate any filesystem metadata dependencies
12190 * so that the syncing routine can succeed.
12191 */
12192int
12193softdep_sync_metadata(struct vnode *vp)
12194{
12195	int error;
12196
12197	/*
12198	 * Ensure that any direct block dependencies have been cleared,
12199	 * truncations are started, and inode references are journaled.
12200	 */
12201	ACQUIRE_LOCK(&lk);
12202	/*
12203	 * Write all journal records to prevent rollbacks on devvp.
12204	 */
12205	if (vp->v_type == VCHR)
12206		softdep_flushjournal(vp->v_mount);
12207	error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number);
12208	/*
12209	 * Ensure that all truncates are written so we won't find deps on
12210	 * indirect blocks.
12211	 */
12212	process_truncates(vp);
12213	FREE_LOCK(&lk);
12214
12215	return (error);
12216}
12217
12218/*
12219 * This routine is called when we are attempting to sync a buf with
12220 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12221 * other IO it can but returns EBUSY if the buffer is not yet able to
12222 * be written.  Dependencies which will not cause rollbacks will always
12223 * return 0.
12224 */
12225int
12226softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12227{
12228	struct indirdep *indirdep;
12229	struct pagedep *pagedep;
12230	struct allocindir *aip;
12231	struct newblk *newblk;
12232	struct buf *nbp;
12233	struct worklist *wk;
12234	int i, error;
12235
12236	/*
12237	 * For VCHR we just don't want to force flush any dependencies that
12238	 * will cause rollbacks.
12239	 */
12240	if (vp->v_type == VCHR) {
12241		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12242			return (EBUSY);
12243		return (0);
12244	}
12245	ACQUIRE_LOCK(&lk);
12246	/*
12247	 * As we hold the buffer locked, none of its dependencies
12248	 * will disappear.
12249	 */
12250	error = 0;
12251top:
12252	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12253		switch (wk->wk_type) {
12254
12255		case D_ALLOCDIRECT:
12256		case D_ALLOCINDIR:
12257			newblk = WK_NEWBLK(wk);
12258			if (newblk->nb_jnewblk != NULL) {
12259				if (waitfor == MNT_NOWAIT) {
12260					error = EBUSY;
12261					goto out_unlock;
12262				}
12263				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12264				goto top;
12265			}
12266			if (newblk->nb_state & DEPCOMPLETE ||
12267			    waitfor == MNT_NOWAIT)
12268				continue;
12269			nbp = newblk->nb_bmsafemap->sm_buf;
12270			nbp = getdirtybuf(nbp, &lk, waitfor);
12271			if (nbp == NULL)
12272				goto top;
12273			FREE_LOCK(&lk);
12274			if ((error = bwrite(nbp)) != 0)
12275				goto out;
12276			ACQUIRE_LOCK(&lk);
12277			continue;
12278
12279		case D_INDIRDEP:
12280			indirdep = WK_INDIRDEP(wk);
12281			if (waitfor == MNT_NOWAIT) {
12282				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12283				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12284					error = EBUSY;
12285					goto out_unlock;
12286				}
12287			}
12288			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12289				panic("softdep_sync_buf: truncation pending.");
12290		restart:
12291			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12292				newblk = (struct newblk *)aip;
12293				if (newblk->nb_jnewblk != NULL) {
12294					jwait(&newblk->nb_jnewblk->jn_list,
12295					    waitfor);
12296					goto restart;
12297				}
12298				if (newblk->nb_state & DEPCOMPLETE)
12299					continue;
12300				nbp = newblk->nb_bmsafemap->sm_buf;
12301				nbp = getdirtybuf(nbp, &lk, waitfor);
12302				if (nbp == NULL)
12303					goto restart;
12304				FREE_LOCK(&lk);
12305				if ((error = bwrite(nbp)) != 0)
12306					goto out;
12307				ACQUIRE_LOCK(&lk);
12308				goto restart;
12309			}
12310			continue;
12311
12312		case D_PAGEDEP:
12313			/*
12314			 * Only flush directory entries in synchronous passes.
12315			 */
12316			if (waitfor != MNT_WAIT) {
12317				error = EBUSY;
12318				goto out_unlock;
12319			}
12320			/*
12321			 * While syncing snapshots, we must allow recursive
12322			 * lookups.
12323			 */
12324			BUF_AREC(bp);
12325			/*
12326			 * We are trying to sync a directory that may
12327			 * have dependencies on both its own metadata
12328			 * and/or dependencies on the inodes of any
12329			 * recently allocated files. We walk its diradd
12330			 * lists pushing out the associated inode.
12331			 */
12332			pagedep = WK_PAGEDEP(wk);
12333			for (i = 0; i < DAHASHSZ; i++) {
12334				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12335					continue;
12336				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12337				    &pagedep->pd_diraddhd[i]))) {
12338					BUF_NOREC(bp);
12339					goto out_unlock;
12340				}
12341			}
12342			BUF_NOREC(bp);
12343			continue;
12344
12345		case D_FREEWORK:
12346		case D_FREEDEP:
12347		case D_JSEGDEP:
12348		case D_JNEWBLK:
12349			continue;
12350
12351		default:
12352			panic("softdep_sync_buf: Unknown type %s",
12353			    TYPENAME(wk->wk_type));
12354			/* NOTREACHED */
12355		}
12356	}
12357out_unlock:
12358	FREE_LOCK(&lk);
12359out:
12360	return (error);
12361}
12362
12363/*
12364 * Flush the dependencies associated with an inodedep.
12365 * Called with splbio blocked.
12366 */
12367static int
12368flush_inodedep_deps(vp, mp, ino)
12369	struct vnode *vp;
12370	struct mount *mp;
12371	ino_t ino;
12372{
12373	struct inodedep *inodedep;
12374	struct inoref *inoref;
12375	int error, waitfor;
12376
12377	/*
12378	 * This work is done in two passes. The first pass grabs most
12379	 * of the buffers and begins asynchronously writing them. The
12380	 * only way to wait for these asynchronous writes is to sleep
12381	 * on the filesystem vnode which may stay busy for a long time
12382	 * if the filesystem is active. So, instead, we make a second
12383	 * pass over the dependencies blocking on each write. In the
12384	 * usual case we will be blocking against a write that we
12385	 * initiated, so when it is done the dependency will have been
12386	 * resolved. Thus the second pass is expected to end quickly.
12387	 * We give a brief window at the top of the loop to allow
12388	 * any pending I/O to complete.
12389	 */
12390	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12391		if (error)
12392			return (error);
12393		FREE_LOCK(&lk);
12394		ACQUIRE_LOCK(&lk);
12395restart:
12396		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12397			return (0);
12398		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12399			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12400			    == DEPCOMPLETE) {
12401				jwait(&inoref->if_list, MNT_WAIT);
12402				goto restart;
12403			}
12404		}
12405		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12406		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12407		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12408		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12409			continue;
12410		/*
12411		 * If pass2, we are done, otherwise do pass 2.
12412		 */
12413		if (waitfor == MNT_WAIT)
12414			break;
12415		waitfor = MNT_WAIT;
12416	}
12417	/*
12418	 * Try freeing inodedep in case all dependencies have been removed.
12419	 */
12420	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12421		(void) free_inodedep(inodedep);
12422	return (0);
12423}
12424
12425/*
12426 * Flush an inode dependency list.
12427 * Called with splbio blocked.
12428 */
12429static int
12430flush_deplist(listhead, waitfor, errorp)
12431	struct allocdirectlst *listhead;
12432	int waitfor;
12433	int *errorp;
12434{
12435	struct allocdirect *adp;
12436	struct newblk *newblk;
12437	struct buf *bp;
12438
12439	mtx_assert(&lk, MA_OWNED);
12440	TAILQ_FOREACH(adp, listhead, ad_next) {
12441		newblk = (struct newblk *)adp;
12442		if (newblk->nb_jnewblk != NULL) {
12443			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12444			return (1);
12445		}
12446		if (newblk->nb_state & DEPCOMPLETE)
12447			continue;
12448		bp = newblk->nb_bmsafemap->sm_buf;
12449		bp = getdirtybuf(bp, &lk, waitfor);
12450		if (bp == NULL) {
12451			if (waitfor == MNT_NOWAIT)
12452				continue;
12453			return (1);
12454		}
12455		FREE_LOCK(&lk);
12456		if (waitfor == MNT_NOWAIT)
12457			bawrite(bp);
12458		else
12459			*errorp = bwrite(bp);
12460		ACQUIRE_LOCK(&lk);
12461		return (1);
12462	}
12463	return (0);
12464}
12465
12466/*
12467 * Flush dependencies associated with an allocdirect block.
12468 */
12469static int
12470flush_newblk_dep(vp, mp, lbn)
12471	struct vnode *vp;
12472	struct mount *mp;
12473	ufs_lbn_t lbn;
12474{
12475	struct newblk *newblk;
12476	struct bufobj *bo;
12477	struct inode *ip;
12478	struct buf *bp;
12479	ufs2_daddr_t blkno;
12480	int error;
12481
12482	error = 0;
12483	bo = &vp->v_bufobj;
12484	ip = VTOI(vp);
12485	blkno = DIP(ip, i_db[lbn]);
12486	if (blkno == 0)
12487		panic("flush_newblk_dep: Missing block");
12488	ACQUIRE_LOCK(&lk);
12489	/*
12490	 * Loop until all dependencies related to this block are satisfied.
12491	 * We must be careful to restart after each sleep in case a write
12492	 * completes some part of this process for us.
12493	 */
12494	for (;;) {
12495		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12496			FREE_LOCK(&lk);
12497			break;
12498		}
12499		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12500			panic("flush_newblk_deps: Bad newblk %p", newblk);
12501		/*
12502		 * Flush the journal.
12503		 */
12504		if (newblk->nb_jnewblk != NULL) {
12505			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12506			continue;
12507		}
12508		/*
12509		 * Write the bitmap dependency.
12510		 */
12511		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12512			bp = newblk->nb_bmsafemap->sm_buf;
12513			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12514			if (bp == NULL)
12515				continue;
12516			FREE_LOCK(&lk);
12517			error = bwrite(bp);
12518			if (error)
12519				break;
12520			ACQUIRE_LOCK(&lk);
12521			continue;
12522		}
12523		/*
12524		 * Write the buffer.
12525		 */
12526		FREE_LOCK(&lk);
12527		BO_LOCK(bo);
12528		bp = gbincore(bo, lbn);
12529		if (bp != NULL) {
12530			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12531			    LK_INTERLOCK, BO_MTX(bo));
12532			if (error == ENOLCK) {
12533				ACQUIRE_LOCK(&lk);
12534				continue; /* Slept, retry */
12535			}
12536			if (error != 0)
12537				break;	/* Failed */
12538			if (bp->b_flags & B_DELWRI) {
12539				bremfree(bp);
12540				error = bwrite(bp);
12541				if (error)
12542					break;
12543			} else
12544				BUF_UNLOCK(bp);
12545		} else
12546			BO_UNLOCK(bo);
12547		/*
12548		 * We have to wait for the direct pointers to
12549		 * point at the newdirblk before the dependency
12550		 * will go away.
12551		 */
12552		error = ffs_update(vp, 1);
12553		if (error)
12554			break;
12555		ACQUIRE_LOCK(&lk);
12556	}
12557	return (error);
12558}
12559
12560/*
12561 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12562 * Called with splbio blocked.
12563 */
12564static int
12565flush_pagedep_deps(pvp, mp, diraddhdp)
12566	struct vnode *pvp;
12567	struct mount *mp;
12568	struct diraddhd *diraddhdp;
12569{
12570	struct inodedep *inodedep;
12571	struct inoref *inoref;
12572	struct ufsmount *ump;
12573	struct diradd *dap;
12574	struct vnode *vp;
12575	int error = 0;
12576	struct buf *bp;
12577	ino_t inum;
12578
12579	ump = VFSTOUFS(mp);
12580restart:
12581	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12582		/*
12583		 * Flush ourselves if this directory entry
12584		 * has a MKDIR_PARENT dependency.
12585		 */
12586		if (dap->da_state & MKDIR_PARENT) {
12587			FREE_LOCK(&lk);
12588			if ((error = ffs_update(pvp, 1)) != 0)
12589				break;
12590			ACQUIRE_LOCK(&lk);
12591			/*
12592			 * If that cleared dependencies, go on to next.
12593			 */
12594			if (dap != LIST_FIRST(diraddhdp))
12595				continue;
12596			if (dap->da_state & MKDIR_PARENT)
12597				panic("flush_pagedep_deps: MKDIR_PARENT");
12598		}
12599		/*
12600		 * A newly allocated directory must have its "." and
12601		 * ".." entries written out before its name can be
12602		 * committed in its parent.
12603		 */
12604		inum = dap->da_newinum;
12605		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12606			panic("flush_pagedep_deps: lost inode1");
12607		/*
12608		 * Wait for any pending journal adds to complete so we don't
12609		 * cause rollbacks while syncing.
12610		 */
12611		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12612			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12613			    == DEPCOMPLETE) {
12614				jwait(&inoref->if_list, MNT_WAIT);
12615				goto restart;
12616			}
12617		}
12618		if (dap->da_state & MKDIR_BODY) {
12619			FREE_LOCK(&lk);
12620			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12621			    FFSV_FORCEINSMQ)))
12622				break;
12623			error = flush_newblk_dep(vp, mp, 0);
12624			/*
12625			 * If we still have the dependency we might need to
12626			 * update the vnode to sync the new link count to
12627			 * disk.
12628			 */
12629			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12630				error = ffs_update(vp, 1);
12631			vput(vp);
12632			if (error != 0)
12633				break;
12634			ACQUIRE_LOCK(&lk);
12635			/*
12636			 * If that cleared dependencies, go on to next.
12637			 */
12638			if (dap != LIST_FIRST(diraddhdp))
12639				continue;
12640			if (dap->da_state & MKDIR_BODY) {
12641				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12642				    &inodedep);
12643				panic("flush_pagedep_deps: MKDIR_BODY "
12644				    "inodedep %p dap %p vp %p",
12645				    inodedep, dap, vp);
12646			}
12647		}
12648		/*
12649		 * Flush the inode on which the directory entry depends.
12650		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12651		 * the only remaining dependency is that the updated inode
12652		 * count must get pushed to disk. The inode has already
12653		 * been pushed into its inode buffer (via VOP_UPDATE) at
12654		 * the time of the reference count change. So we need only
12655		 * locate that buffer, ensure that there will be no rollback
12656		 * caused by a bitmap dependency, then write the inode buffer.
12657		 */
12658retry:
12659		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12660			panic("flush_pagedep_deps: lost inode");
12661		/*
12662		 * If the inode still has bitmap dependencies,
12663		 * push them to disk.
12664		 */
12665		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
12666			bp = inodedep->id_bmsafemap->sm_buf;
12667			bp = getdirtybuf(bp, &lk, MNT_WAIT);
12668			if (bp == NULL)
12669				goto retry;
12670			FREE_LOCK(&lk);
12671			if ((error = bwrite(bp)) != 0)
12672				break;
12673			ACQUIRE_LOCK(&lk);
12674			if (dap != LIST_FIRST(diraddhdp))
12675				continue;
12676		}
12677		/*
12678		 * If the inode is still sitting in a buffer waiting
12679		 * to be written or waiting for the link count to be
12680		 * adjusted update it here to flush it to disk.
12681		 */
12682		if (dap == LIST_FIRST(diraddhdp)) {
12683			FREE_LOCK(&lk);
12684			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12685			    FFSV_FORCEINSMQ)))
12686				break;
12687			error = ffs_update(vp, 1);
12688			vput(vp);
12689			if (error)
12690				break;
12691			ACQUIRE_LOCK(&lk);
12692		}
12693		/*
12694		 * If we have failed to get rid of all the dependencies
12695		 * then something is seriously wrong.
12696		 */
12697		if (dap == LIST_FIRST(diraddhdp)) {
12698			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
12699			panic("flush_pagedep_deps: failed to flush "
12700			    "inodedep %p ino %ju dap %p",
12701			    inodedep, (uintmax_t)inum, dap);
12702		}
12703	}
12704	if (error)
12705		ACQUIRE_LOCK(&lk);
12706	return (error);
12707}
12708
12709/*
12710 * A large burst of file addition or deletion activity can drive the
12711 * memory load excessively high. First attempt to slow things down
12712 * using the techniques below. If that fails, this routine requests
12713 * the offending operations to fall back to running synchronously
12714 * until the memory load returns to a reasonable level.
12715 */
12716int
12717softdep_slowdown(vp)
12718	struct vnode *vp;
12719{
12720	struct ufsmount *ump;
12721	int jlow;
12722	int max_softdeps_hard;
12723
12724	ACQUIRE_LOCK(&lk);
12725	jlow = 0;
12726	/*
12727	 * Check for journal space if needed.
12728	 */
12729	if (DOINGSUJ(vp)) {
12730		ump = VFSTOUFS(vp->v_mount);
12731		if (journal_space(ump, 0) == 0)
12732			jlow = 1;
12733	}
12734	max_softdeps_hard = max_softdeps * 11 / 10;
12735	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
12736	    dep_current[D_INODEDEP] < max_softdeps_hard &&
12737	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
12738	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
12739		FREE_LOCK(&lk);
12740  		return (0);
12741	}
12742	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
12743		softdep_speedup();
12744	stat_sync_limit_hit += 1;
12745	FREE_LOCK(&lk);
12746	if (DOINGSUJ(vp))
12747		return (0);
12748	return (1);
12749}
12750
12751/*
12752 * Called by the allocation routines when they are about to fail
12753 * in the hope that we can free up the requested resource (inodes
12754 * or disk space).
12755 *
12756 * First check to see if the work list has anything on it. If it has,
12757 * clean up entries until we successfully free the requested resource.
12758 * Because this process holds inodes locked, we cannot handle any remove
12759 * requests that might block on a locked inode as that could lead to
12760 * deadlock. If the worklist yields none of the requested resource,
12761 * start syncing out vnodes to free up the needed space.
12762 */
12763int
12764softdep_request_cleanup(fs, vp, cred, resource)
12765	struct fs *fs;
12766	struct vnode *vp;
12767	struct ucred *cred;
12768	int resource;
12769{
12770	struct ufsmount *ump;
12771	struct mount *mp;
12772	struct vnode *lvp, *mvp;
12773	long starttime;
12774	ufs2_daddr_t needed;
12775	int error;
12776
12777	/*
12778	 * If we are being called because of a process doing a
12779	 * copy-on-write, then it is not safe to process any
12780	 * worklist items as we will recurse into the copyonwrite
12781	 * routine.  This will result in an incoherent snapshot.
12782	 * If the vnode that we hold is a snapshot, we must avoid
12783	 * handling other resources that could cause deadlock.
12784	 */
12785	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
12786		return (0);
12787
12788	if (resource == FLUSH_BLOCKS_WAIT)
12789		stat_cleanup_blkrequests += 1;
12790	else
12791		stat_cleanup_inorequests += 1;
12792
12793	mp = vp->v_mount;
12794	ump = VFSTOUFS(mp);
12795	mtx_assert(UFS_MTX(ump), MA_OWNED);
12796	UFS_UNLOCK(ump);
12797	error = ffs_update(vp, 1);
12798	if (error != 0) {
12799		UFS_LOCK(ump);
12800		return (0);
12801	}
12802	/*
12803	 * If we are in need of resources, consider pausing for
12804	 * tickdelay to give ourselves some breathing room.
12805	 */
12806	ACQUIRE_LOCK(&lk);
12807	process_removes(vp);
12808	process_truncates(vp);
12809	request_cleanup(UFSTOVFS(ump), resource);
12810	FREE_LOCK(&lk);
12811	/*
12812	 * Now clean up at least as many resources as we will need.
12813	 *
12814	 * When requested to clean up inodes, the number that are needed
12815	 * is set by the number of simultaneous writers (mnt_writeopcount)
12816	 * plus a bit of slop (2) in case some more writers show up while
12817	 * we are cleaning.
12818	 *
12819	 * When requested to free up space, the amount of space that
12820	 * we need is enough blocks to allocate a full-sized segment
12821	 * (fs_contigsumsize). The number of such segments that will
12822	 * be needed is set by the number of simultaneous writers
12823	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
12824	 * writers show up while we are cleaning.
12825	 *
12826	 * Additionally, if we are unpriviledged and allocating space,
12827	 * we need to ensure that we clean up enough blocks to get the
12828	 * needed number of blocks over the threshhold of the minimum
12829	 * number of blocks required to be kept free by the filesystem
12830	 * (fs_minfree).
12831	 */
12832	if (resource == FLUSH_INODES_WAIT) {
12833		needed = vp->v_mount->mnt_writeopcount + 2;
12834	} else if (resource == FLUSH_BLOCKS_WAIT) {
12835		needed = (vp->v_mount->mnt_writeopcount + 2) *
12836		    fs->fs_contigsumsize;
12837		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
12838			needed += fragstoblks(fs,
12839			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
12840			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
12841	} else {
12842		UFS_LOCK(ump);
12843		printf("softdep_request_cleanup: Unknown resource type %d\n",
12844		    resource);
12845		return (0);
12846	}
12847	starttime = time_second;
12848retry:
12849	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
12850	    fs->fs_cstotal.cs_nbfree <= needed) ||
12851	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12852	    fs->fs_cstotal.cs_nifree <= needed)) {
12853		ACQUIRE_LOCK(&lk);
12854		if (ump->softdep_on_worklist > 0 &&
12855		    process_worklist_item(UFSTOVFS(ump),
12856		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
12857			stat_worklist_push += 1;
12858		FREE_LOCK(&lk);
12859	}
12860	/*
12861	 * If we still need resources and there are no more worklist
12862	 * entries to process to obtain them, we have to start flushing
12863	 * the dirty vnodes to force the release of additional requests
12864	 * to the worklist that we can then process to reap addition
12865	 * resources. We walk the vnodes associated with the mount point
12866	 * until we get the needed worklist requests that we can reap.
12867	 */
12868	if ((resource == FLUSH_BLOCKS_WAIT &&
12869	     fs->fs_cstotal.cs_nbfree <= needed) ||
12870	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
12871	     fs->fs_cstotal.cs_nifree <= needed)) {
12872		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
12873			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
12874				VI_UNLOCK(lvp);
12875				continue;
12876			}
12877			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
12878			    curthread))
12879				continue;
12880			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
12881				vput(lvp);
12882				continue;
12883			}
12884			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
12885			vput(lvp);
12886		}
12887		lvp = ump->um_devvp;
12888		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
12889			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
12890			VOP_UNLOCK(lvp, 0);
12891		}
12892		if (ump->softdep_on_worklist > 0) {
12893			stat_cleanup_retries += 1;
12894			goto retry;
12895		}
12896		stat_cleanup_failures += 1;
12897	}
12898	if (time_second - starttime > stat_cleanup_high_delay)
12899		stat_cleanup_high_delay = time_second - starttime;
12900	UFS_LOCK(ump);
12901	return (1);
12902}
12903
12904/*
12905 * If memory utilization has gotten too high, deliberately slow things
12906 * down and speed up the I/O processing.
12907 */
12908extern struct thread *syncertd;
12909static int
12910request_cleanup(mp, resource)
12911	struct mount *mp;
12912	int resource;
12913{
12914	struct thread *td = curthread;
12915	struct ufsmount *ump;
12916
12917	mtx_assert(&lk, MA_OWNED);
12918	/*
12919	 * We never hold up the filesystem syncer or buf daemon.
12920	 */
12921	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
12922		return (0);
12923	ump = VFSTOUFS(mp);
12924	/*
12925	 * First check to see if the work list has gotten backlogged.
12926	 * If it has, co-opt this process to help clean up two entries.
12927	 * Because this process may hold inodes locked, we cannot
12928	 * handle any remove requests that might block on a locked
12929	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
12930	 * to avoid recursively processing the worklist.
12931	 */
12932	if (ump->softdep_on_worklist > max_softdeps / 10) {
12933		td->td_pflags |= TDP_SOFTDEP;
12934		process_worklist_item(mp, 2, LK_NOWAIT);
12935		td->td_pflags &= ~TDP_SOFTDEP;
12936		stat_worklist_push += 2;
12937		return(1);
12938	}
12939	/*
12940	 * Next, we attempt to speed up the syncer process. If that
12941	 * is successful, then we allow the process to continue.
12942	 */
12943	if (softdep_speedup() &&
12944	    resource != FLUSH_BLOCKS_WAIT &&
12945	    resource != FLUSH_INODES_WAIT)
12946		return(0);
12947	/*
12948	 * If we are resource constrained on inode dependencies, try
12949	 * flushing some dirty inodes. Otherwise, we are constrained
12950	 * by file deletions, so try accelerating flushes of directories
12951	 * with removal dependencies. We would like to do the cleanup
12952	 * here, but we probably hold an inode locked at this point and
12953	 * that might deadlock against one that we try to clean. So,
12954	 * the best that we can do is request the syncer daemon to do
12955	 * the cleanup for us.
12956	 */
12957	switch (resource) {
12958
12959	case FLUSH_INODES:
12960	case FLUSH_INODES_WAIT:
12961		stat_ino_limit_push += 1;
12962		req_clear_inodedeps += 1;
12963		stat_countp = &stat_ino_limit_hit;
12964		break;
12965
12966	case FLUSH_BLOCKS:
12967	case FLUSH_BLOCKS_WAIT:
12968		stat_blk_limit_push += 1;
12969		req_clear_remove += 1;
12970		stat_countp = &stat_blk_limit_hit;
12971		break;
12972
12973	default:
12974		panic("request_cleanup: unknown type");
12975	}
12976	/*
12977	 * Hopefully the syncer daemon will catch up and awaken us.
12978	 * We wait at most tickdelay before proceeding in any case.
12979	 */
12980	proc_waiting += 1;
12981	if (callout_pending(&softdep_callout) == FALSE)
12982		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
12983		    pause_timer, 0);
12984
12985	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
12986	proc_waiting -= 1;
12987	return (1);
12988}
12989
12990/*
12991 * Awaken processes pausing in request_cleanup and clear proc_waiting
12992 * to indicate that there is no longer a timer running.
12993 */
12994static void
12995pause_timer(arg)
12996	void *arg;
12997{
12998
12999	/*
13000	 * The callout_ API has acquired mtx and will hold it around this
13001	 * function call.
13002	 */
13003	*stat_countp += 1;
13004	wakeup_one(&proc_waiting);
13005	if (proc_waiting > 0)
13006		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13007		    pause_timer, 0);
13008}
13009
13010/*
13011 * Flush out a directory with at least one removal dependency in an effort to
13012 * reduce the number of dirrem, freefile, and freeblks dependency structures.
13013 */
13014static void
13015clear_remove(void)
13016{
13017	struct pagedep_hashhead *pagedephd;
13018	struct pagedep *pagedep;
13019	static int next = 0;
13020	struct mount *mp;
13021	struct vnode *vp;
13022	struct bufobj *bo;
13023	int error, cnt;
13024	ino_t ino;
13025
13026	mtx_assert(&lk, MA_OWNED);
13027
13028	for (cnt = 0; cnt <= pagedep_hash; cnt++) {
13029		pagedephd = &pagedep_hashtbl[next++];
13030		if (next > pagedep_hash)
13031			next = 0;
13032		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13033			if (LIST_EMPTY(&pagedep->pd_dirremhd))
13034				continue;
13035			mp = pagedep->pd_list.wk_mp;
13036			ino = pagedep->pd_ino;
13037			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13038				continue;
13039			FREE_LOCK(&lk);
13040
13041			/*
13042			 * Let unmount clear deps
13043			 */
13044			error = vfs_busy(mp, MBF_NOWAIT);
13045			if (error != 0)
13046				goto finish_write;
13047			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13048			     FFSV_FORCEINSMQ);
13049			vfs_unbusy(mp);
13050			if (error != 0) {
13051				softdep_error("clear_remove: vget", error);
13052				goto finish_write;
13053			}
13054			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13055				softdep_error("clear_remove: fsync", error);
13056			bo = &vp->v_bufobj;
13057			BO_LOCK(bo);
13058			drain_output(vp);
13059			BO_UNLOCK(bo);
13060			vput(vp);
13061		finish_write:
13062			vn_finished_write(mp);
13063			ACQUIRE_LOCK(&lk);
13064			return;
13065		}
13066	}
13067}
13068
13069/*
13070 * Clear out a block of dirty inodes in an effort to reduce
13071 * the number of inodedep dependency structures.
13072 */
13073static void
13074clear_inodedeps(void)
13075{
13076	struct inodedep_hashhead *inodedephd;
13077	struct inodedep *inodedep;
13078	static int next = 0;
13079	struct mount *mp;
13080	struct vnode *vp;
13081	struct fs *fs;
13082	int error, cnt;
13083	ino_t firstino, lastino, ino;
13084
13085	mtx_assert(&lk, MA_OWNED);
13086	/*
13087	 * Pick a random inode dependency to be cleared.
13088	 * We will then gather up all the inodes in its block
13089	 * that have dependencies and flush them out.
13090	 */
13091	for (cnt = 0; cnt <= inodedep_hash; cnt++) {
13092		inodedephd = &inodedep_hashtbl[next++];
13093		if (next > inodedep_hash)
13094			next = 0;
13095		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13096			break;
13097	}
13098	if (inodedep == NULL)
13099		return;
13100	fs = inodedep->id_fs;
13101	mp = inodedep->id_list.wk_mp;
13102	/*
13103	 * Find the last inode in the block with dependencies.
13104	 */
13105	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13106	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13107		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13108			break;
13109	/*
13110	 * Asynchronously push all but the last inode with dependencies.
13111	 * Synchronously push the last inode with dependencies to ensure
13112	 * that the inode block gets written to free up the inodedeps.
13113	 */
13114	for (ino = firstino; ino <= lastino; ino++) {
13115		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13116			continue;
13117		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13118			continue;
13119		FREE_LOCK(&lk);
13120		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13121		if (error != 0) {
13122			vn_finished_write(mp);
13123			ACQUIRE_LOCK(&lk);
13124			return;
13125		}
13126		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13127		    FFSV_FORCEINSMQ)) != 0) {
13128			softdep_error("clear_inodedeps: vget", error);
13129			vfs_unbusy(mp);
13130			vn_finished_write(mp);
13131			ACQUIRE_LOCK(&lk);
13132			return;
13133		}
13134		vfs_unbusy(mp);
13135		if (ino == lastino) {
13136			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13137				softdep_error("clear_inodedeps: fsync1", error);
13138		} else {
13139			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13140				softdep_error("clear_inodedeps: fsync2", error);
13141			BO_LOCK(&vp->v_bufobj);
13142			drain_output(vp);
13143			BO_UNLOCK(&vp->v_bufobj);
13144		}
13145		vput(vp);
13146		vn_finished_write(mp);
13147		ACQUIRE_LOCK(&lk);
13148	}
13149}
13150
13151void
13152softdep_buf_append(bp, wkhd)
13153	struct buf *bp;
13154	struct workhead *wkhd;
13155{
13156	struct worklist *wk;
13157
13158	ACQUIRE_LOCK(&lk);
13159	while ((wk = LIST_FIRST(wkhd)) != NULL) {
13160		WORKLIST_REMOVE(wk);
13161		WORKLIST_INSERT(&bp->b_dep, wk);
13162	}
13163	FREE_LOCK(&lk);
13164
13165}
13166
13167void
13168softdep_inode_append(ip, cred, wkhd)
13169	struct inode *ip;
13170	struct ucred *cred;
13171	struct workhead *wkhd;
13172{
13173	struct buf *bp;
13174	struct fs *fs;
13175	int error;
13176
13177	fs = ip->i_fs;
13178	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13179	    (int)fs->fs_bsize, cred, &bp);
13180	if (error) {
13181		softdep_freework(wkhd);
13182		return;
13183	}
13184	softdep_buf_append(bp, wkhd);
13185	bqrelse(bp);
13186}
13187
13188void
13189softdep_freework(wkhd)
13190	struct workhead *wkhd;
13191{
13192
13193	ACQUIRE_LOCK(&lk);
13194	handle_jwork(wkhd);
13195	FREE_LOCK(&lk);
13196}
13197
13198/*
13199 * Function to determine if the buffer has outstanding dependencies
13200 * that will cause a roll-back if the buffer is written. If wantcount
13201 * is set, return number of dependencies, otherwise just yes or no.
13202 */
13203static int
13204softdep_count_dependencies(bp, wantcount)
13205	struct buf *bp;
13206	int wantcount;
13207{
13208	struct worklist *wk;
13209	struct bmsafemap *bmsafemap;
13210	struct freework *freework;
13211	struct inodedep *inodedep;
13212	struct indirdep *indirdep;
13213	struct freeblks *freeblks;
13214	struct allocindir *aip;
13215	struct pagedep *pagedep;
13216	struct dirrem *dirrem;
13217	struct newblk *newblk;
13218	struct mkdir *mkdir;
13219	struct diradd *dap;
13220	int i, retval;
13221
13222	retval = 0;
13223	ACQUIRE_LOCK(&lk);
13224	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13225		switch (wk->wk_type) {
13226
13227		case D_INODEDEP:
13228			inodedep = WK_INODEDEP(wk);
13229			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13230				/* bitmap allocation dependency */
13231				retval += 1;
13232				if (!wantcount)
13233					goto out;
13234			}
13235			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13236				/* direct block pointer dependency */
13237				retval += 1;
13238				if (!wantcount)
13239					goto out;
13240			}
13241			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13242				/* direct block pointer dependency */
13243				retval += 1;
13244				if (!wantcount)
13245					goto out;
13246			}
13247			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13248				/* Add reference dependency. */
13249				retval += 1;
13250				if (!wantcount)
13251					goto out;
13252			}
13253			continue;
13254
13255		case D_INDIRDEP:
13256			indirdep = WK_INDIRDEP(wk);
13257
13258			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13259				/* indirect truncation dependency */
13260				retval += 1;
13261				if (!wantcount)
13262					goto out;
13263			}
13264
13265			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13266				/* indirect block pointer dependency */
13267				retval += 1;
13268				if (!wantcount)
13269					goto out;
13270			}
13271			continue;
13272
13273		case D_PAGEDEP:
13274			pagedep = WK_PAGEDEP(wk);
13275			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13276				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13277					/* Journal remove ref dependency. */
13278					retval += 1;
13279					if (!wantcount)
13280						goto out;
13281				}
13282			}
13283			for (i = 0; i < DAHASHSZ; i++) {
13284
13285				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13286					/* directory entry dependency */
13287					retval += 1;
13288					if (!wantcount)
13289						goto out;
13290				}
13291			}
13292			continue;
13293
13294		case D_BMSAFEMAP:
13295			bmsafemap = WK_BMSAFEMAP(wk);
13296			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13297				/* Add reference dependency. */
13298				retval += 1;
13299				if (!wantcount)
13300					goto out;
13301			}
13302			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13303				/* Allocate block dependency. */
13304				retval += 1;
13305				if (!wantcount)
13306					goto out;
13307			}
13308			continue;
13309
13310		case D_FREEBLKS:
13311			freeblks = WK_FREEBLKS(wk);
13312			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13313				/* Freeblk journal dependency. */
13314				retval += 1;
13315				if (!wantcount)
13316					goto out;
13317			}
13318			continue;
13319
13320		case D_ALLOCDIRECT:
13321		case D_ALLOCINDIR:
13322			newblk = WK_NEWBLK(wk);
13323			if (newblk->nb_jnewblk) {
13324				/* Journal allocate dependency. */
13325				retval += 1;
13326				if (!wantcount)
13327					goto out;
13328			}
13329			continue;
13330
13331		case D_MKDIR:
13332			mkdir = WK_MKDIR(wk);
13333			if (mkdir->md_jaddref) {
13334				/* Journal reference dependency. */
13335				retval += 1;
13336				if (!wantcount)
13337					goto out;
13338			}
13339			continue;
13340
13341		case D_FREEWORK:
13342		case D_FREEDEP:
13343		case D_JSEGDEP:
13344		case D_JSEG:
13345		case D_SBDEP:
13346			/* never a dependency on these blocks */
13347			continue;
13348
13349		default:
13350			panic("softdep_count_dependencies: Unexpected type %s",
13351			    TYPENAME(wk->wk_type));
13352			/* NOTREACHED */
13353		}
13354	}
13355out:
13356	FREE_LOCK(&lk);
13357	return retval;
13358}
13359
13360/*
13361 * Acquire exclusive access to a buffer.
13362 * Must be called with a locked mtx parameter.
13363 * Return acquired buffer or NULL on failure.
13364 */
13365static struct buf *
13366getdirtybuf(bp, mtx, waitfor)
13367	struct buf *bp;
13368	struct mtx *mtx;
13369	int waitfor;
13370{
13371	int error;
13372
13373	mtx_assert(mtx, MA_OWNED);
13374	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13375		if (waitfor != MNT_WAIT)
13376			return (NULL);
13377		error = BUF_LOCK(bp,
13378		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
13379		/*
13380		 * Even if we sucessfully acquire bp here, we have dropped
13381		 * mtx, which may violates our guarantee.
13382		 */
13383		if (error == 0)
13384			BUF_UNLOCK(bp);
13385		else if (error != ENOLCK)
13386			panic("getdirtybuf: inconsistent lock: %d", error);
13387		mtx_lock(mtx);
13388		return (NULL);
13389	}
13390	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13391		if (mtx == &lk && waitfor == MNT_WAIT) {
13392			mtx_unlock(mtx);
13393			BO_LOCK(bp->b_bufobj);
13394			BUF_UNLOCK(bp);
13395			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13396				bp->b_vflags |= BV_BKGRDWAIT;
13397				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
13398				       PRIBIO | PDROP, "getbuf", 0);
13399			} else
13400				BO_UNLOCK(bp->b_bufobj);
13401			mtx_lock(mtx);
13402			return (NULL);
13403		}
13404		BUF_UNLOCK(bp);
13405		if (waitfor != MNT_WAIT)
13406			return (NULL);
13407		/*
13408		 * The mtx argument must be bp->b_vp's mutex in
13409		 * this case.
13410		 */
13411#ifdef	DEBUG_VFS_LOCKS
13412		if (bp->b_vp->v_type != VCHR)
13413			ASSERT_BO_LOCKED(bp->b_bufobj);
13414#endif
13415		bp->b_vflags |= BV_BKGRDWAIT;
13416		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
13417		return (NULL);
13418	}
13419	if ((bp->b_flags & B_DELWRI) == 0) {
13420		BUF_UNLOCK(bp);
13421		return (NULL);
13422	}
13423	bremfree(bp);
13424	return (bp);
13425}
13426
13427
13428/*
13429 * Check if it is safe to suspend the file system now.  On entry,
13430 * the vnode interlock for devvp should be held.  Return 0 with
13431 * the mount interlock held if the file system can be suspended now,
13432 * otherwise return EAGAIN with the mount interlock held.
13433 */
13434int
13435softdep_check_suspend(struct mount *mp,
13436		      struct vnode *devvp,
13437		      int softdep_deps,
13438		      int softdep_accdeps,
13439		      int secondary_writes,
13440		      int secondary_accwrites)
13441{
13442	struct bufobj *bo;
13443	struct ufsmount *ump;
13444	int error;
13445
13446	ump = VFSTOUFS(mp);
13447	bo = &devvp->v_bufobj;
13448	ASSERT_BO_LOCKED(bo);
13449
13450	for (;;) {
13451		if (!TRY_ACQUIRE_LOCK(&lk)) {
13452			BO_UNLOCK(bo);
13453			ACQUIRE_LOCK(&lk);
13454			FREE_LOCK(&lk);
13455			BO_LOCK(bo);
13456			continue;
13457		}
13458		MNT_ILOCK(mp);
13459		if (mp->mnt_secondary_writes != 0) {
13460			FREE_LOCK(&lk);
13461			BO_UNLOCK(bo);
13462			msleep(&mp->mnt_secondary_writes,
13463			       MNT_MTX(mp),
13464			       (PUSER - 1) | PDROP, "secwr", 0);
13465			BO_LOCK(bo);
13466			continue;
13467		}
13468		break;
13469	}
13470
13471	/*
13472	 * Reasons for needing more work before suspend:
13473	 * - Dirty buffers on devvp.
13474	 * - Softdep activity occurred after start of vnode sync loop
13475	 * - Secondary writes occurred after start of vnode sync loop
13476	 */
13477	error = 0;
13478	if (bo->bo_numoutput > 0 ||
13479	    bo->bo_dirty.bv_cnt > 0 ||
13480	    softdep_deps != 0 ||
13481	    ump->softdep_deps != 0 ||
13482	    softdep_accdeps != ump->softdep_accdeps ||
13483	    secondary_writes != 0 ||
13484	    mp->mnt_secondary_writes != 0 ||
13485	    secondary_accwrites != mp->mnt_secondary_accwrites)
13486		error = EAGAIN;
13487	FREE_LOCK(&lk);
13488	BO_UNLOCK(bo);
13489	return (error);
13490}
13491
13492
13493/*
13494 * Get the number of dependency structures for the file system, both
13495 * the current number and the total number allocated.  These will
13496 * later be used to detect that softdep processing has occurred.
13497 */
13498void
13499softdep_get_depcounts(struct mount *mp,
13500		      int *softdep_depsp,
13501		      int *softdep_accdepsp)
13502{
13503	struct ufsmount *ump;
13504
13505	ump = VFSTOUFS(mp);
13506	ACQUIRE_LOCK(&lk);
13507	*softdep_depsp = ump->softdep_deps;
13508	*softdep_accdepsp = ump->softdep_accdeps;
13509	FREE_LOCK(&lk);
13510}
13511
13512/*
13513 * Wait for pending output on a vnode to complete.
13514 * Must be called with vnode lock and interlock locked.
13515 *
13516 * XXX: Should just be a call to bufobj_wwait().
13517 */
13518static void
13519drain_output(vp)
13520	struct vnode *vp;
13521{
13522	struct bufobj *bo;
13523
13524	bo = &vp->v_bufobj;
13525	ASSERT_VOP_LOCKED(vp, "drain_output");
13526	ASSERT_BO_LOCKED(bo);
13527
13528	while (bo->bo_numoutput) {
13529		bo->bo_flag |= BO_WWAIT;
13530		msleep((caddr_t)&bo->bo_numoutput,
13531		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
13532	}
13533}
13534
13535/*
13536 * Called whenever a buffer that is being invalidated or reallocated
13537 * contains dependencies. This should only happen if an I/O error has
13538 * occurred. The routine is called with the buffer locked.
13539 */
13540static void
13541softdep_deallocate_dependencies(bp)
13542	struct buf *bp;
13543{
13544
13545	if ((bp->b_ioflags & BIO_ERROR) == 0)
13546		panic("softdep_deallocate_dependencies: dangling deps");
13547	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
13548		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
13549	else
13550		printf("softdep_deallocate_dependencies: "
13551		    "got error %d while accessing filesystem\n", bp->b_error);
13552	if (bp->b_error != ENXIO)
13553		panic("softdep_deallocate_dependencies: unrecovered I/O error");
13554}
13555
13556/*
13557 * Function to handle asynchronous write errors in the filesystem.
13558 */
13559static void
13560softdep_error(func, error)
13561	char *func;
13562	int error;
13563{
13564
13565	/* XXX should do something better! */
13566	printf("%s: got error %d while accessing filesystem\n", func, error);
13567}
13568
13569#ifdef DDB
13570
13571static void
13572inodedep_print(struct inodedep *inodedep, int verbose)
13573{
13574	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
13575	    " saveino %p\n",
13576	    inodedep, inodedep->id_fs, inodedep->id_state,
13577	    (intmax_t)inodedep->id_ino,
13578	    (intmax_t)fsbtodb(inodedep->id_fs,
13579	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
13580	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
13581	    inodedep->id_savedino1);
13582
13583	if (verbose == 0)
13584		return;
13585
13586	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
13587	    "mkdiradd %p\n",
13588	    LIST_FIRST(&inodedep->id_pendinghd),
13589	    LIST_FIRST(&inodedep->id_bufwait),
13590	    LIST_FIRST(&inodedep->id_inowait),
13591	    TAILQ_FIRST(&inodedep->id_inoreflst),
13592	    inodedep->id_mkdiradd);
13593	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
13594	    TAILQ_FIRST(&inodedep->id_inoupdt),
13595	    TAILQ_FIRST(&inodedep->id_newinoupdt),
13596	    TAILQ_FIRST(&inodedep->id_extupdt),
13597	    TAILQ_FIRST(&inodedep->id_newextupdt));
13598}
13599
13600DB_SHOW_COMMAND(inodedep, db_show_inodedep)
13601{
13602
13603	if (have_addr == 0) {
13604		db_printf("Address required\n");
13605		return;
13606	}
13607	inodedep_print((struct inodedep*)addr, 1);
13608}
13609
13610DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
13611{
13612	struct inodedep_hashhead *inodedephd;
13613	struct inodedep *inodedep;
13614	struct fs *fs;
13615	int cnt;
13616
13617	fs = have_addr ? (struct fs *)addr : NULL;
13618	for (cnt = 0; cnt < inodedep_hash; cnt++) {
13619		inodedephd = &inodedep_hashtbl[cnt];
13620		LIST_FOREACH(inodedep, inodedephd, id_hash) {
13621			if (fs != NULL && fs != inodedep->id_fs)
13622				continue;
13623			inodedep_print(inodedep, 0);
13624		}
13625	}
13626}
13627
13628DB_SHOW_COMMAND(worklist, db_show_worklist)
13629{
13630	struct worklist *wk;
13631
13632	if (have_addr == 0) {
13633		db_printf("Address required\n");
13634		return;
13635	}
13636	wk = (struct worklist *)addr;
13637	printf("worklist: %p type %s state 0x%X\n",
13638	    wk, TYPENAME(wk->wk_type), wk->wk_state);
13639}
13640
13641DB_SHOW_COMMAND(workhead, db_show_workhead)
13642{
13643	struct workhead *wkhd;
13644	struct worklist *wk;
13645	int i;
13646
13647	if (have_addr == 0) {
13648		db_printf("Address required\n");
13649		return;
13650	}
13651	wkhd = (struct workhead *)addr;
13652	wk = LIST_FIRST(wkhd);
13653	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
13654		db_printf("worklist: %p type %s state 0x%X",
13655		    wk, TYPENAME(wk->wk_type), wk->wk_state);
13656	if (i == 100)
13657		db_printf("workhead overflow");
13658	printf("\n");
13659}
13660
13661
13662DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
13663{
13664	struct jaddref *jaddref;
13665	struct diradd *diradd;
13666	struct mkdir *mkdir;
13667
13668	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
13669		diradd = mkdir->md_diradd;
13670		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
13671		    mkdir, mkdir->md_state, diradd, diradd->da_state);
13672		if ((jaddref = mkdir->md_jaddref) != NULL)
13673			db_printf(" jaddref %p jaddref state 0x%X",
13674			    jaddref, jaddref->ja_state);
13675		db_printf("\n");
13676	}
13677}
13678
13679#endif /* DDB */
13680
13681#endif /* SOFTUPDATES */
13682