ffs_softdep.c revision 220282
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 220282 2011-04-02 21:52:58Z jeff $");
44
45#include "opt_ffs.h"
46#include "opt_ddb.h"
47
48/*
49 * For now we want the safety net that the DEBUG flag provides.
50 */
51#ifndef DEBUG
52#define DEBUG
53#endif
54
55#include <sys/param.h>
56#include <sys/kernel.h>
57#include <sys/systm.h>
58#include <sys/bio.h>
59#include <sys/buf.h>
60#include <sys/kdb.h>
61#include <sys/kthread.h>
62#include <sys/lock.h>
63#include <sys/malloc.h>
64#include <sys/mount.h>
65#include <sys/mutex.h>
66#include <sys/namei.h>
67#include <sys/proc.h>
68#include <sys/stat.h>
69#include <sys/sysctl.h>
70#include <sys/syslog.h>
71#include <sys/vnode.h>
72#include <sys/conf.h>
73#include <ufs/ufs/dir.h>
74#include <ufs/ufs/extattr.h>
75#include <ufs/ufs/quota.h>
76#include <ufs/ufs/inode.h>
77#include <ufs/ufs/ufsmount.h>
78#include <ufs/ffs/fs.h>
79#include <ufs/ffs/softdep.h>
80#include <ufs/ffs/ffs_extern.h>
81#include <ufs/ufs/ufs_extern.h>
82
83#include <vm/vm.h>
84
85#include <ddb/ddb.h>
86
87#ifndef SOFTUPDATES
88
89int
90softdep_flushfiles(oldmnt, flags, td)
91	struct mount *oldmnt;
92	int flags;
93	struct thread *td;
94{
95
96	panic("softdep_flushfiles called");
97}
98
99int
100softdep_mount(devvp, mp, fs, cred)
101	struct vnode *devvp;
102	struct mount *mp;
103	struct fs *fs;
104	struct ucred *cred;
105{
106
107	return (0);
108}
109
110void
111softdep_initialize()
112{
113
114	return;
115}
116
117void
118softdep_uninitialize()
119{
120
121	return;
122}
123
124void
125softdep_unmount(mp)
126	struct mount *mp;
127{
128
129}
130
131void
132softdep_setup_sbupdate(ump, fs, bp)
133	struct ufsmount *ump;
134	struct fs *fs;
135	struct buf *bp;
136{
137}
138
139void
140softdep_setup_inomapdep(bp, ip, newinum)
141	struct buf *bp;
142	struct inode *ip;
143	ino_t newinum;
144{
145
146	panic("softdep_setup_inomapdep called");
147}
148
149void
150softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
151	struct buf *bp;
152	struct mount *mp;
153	ufs2_daddr_t newblkno;
154	int frags;
155	int oldfrags;
156{
157
158	panic("softdep_setup_blkmapdep called");
159}
160
161void
162softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
163	struct inode *ip;
164	ufs_lbn_t lbn;
165	ufs2_daddr_t newblkno;
166	ufs2_daddr_t oldblkno;
167	long newsize;
168	long oldsize;
169	struct buf *bp;
170{
171
172	panic("softdep_setup_allocdirect called");
173}
174
175void
176softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
177	struct inode *ip;
178	ufs_lbn_t lbn;
179	ufs2_daddr_t newblkno;
180	ufs2_daddr_t oldblkno;
181	long newsize;
182	long oldsize;
183	struct buf *bp;
184{
185
186	panic("softdep_setup_allocext called");
187}
188
189void
190softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
191	struct inode *ip;
192	ufs_lbn_t lbn;
193	struct buf *bp;
194	int ptrno;
195	ufs2_daddr_t newblkno;
196	ufs2_daddr_t oldblkno;
197	struct buf *nbp;
198{
199
200	panic("softdep_setup_allocindir_page called");
201}
202
203void
204softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
205	struct buf *nbp;
206	struct inode *ip;
207	struct buf *bp;
208	int ptrno;
209	ufs2_daddr_t newblkno;
210{
211
212	panic("softdep_setup_allocindir_meta called");
213}
214
215void
216softdep_setup_freeblocks(ip, length, flags)
217	struct inode *ip;
218	off_t length;
219	int flags;
220{
221
222	panic("softdep_setup_freeblocks called");
223}
224
225void
226softdep_freefile(pvp, ino, mode)
227		struct vnode *pvp;
228		ino_t ino;
229		int mode;
230{
231
232	panic("softdep_freefile called");
233}
234
235int
236softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
237	struct buf *bp;
238	struct inode *dp;
239	off_t diroffset;
240	ino_t newinum;
241	struct buf *newdirbp;
242	int isnewblk;
243{
244
245	panic("softdep_setup_directory_add called");
246}
247
248void
249softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
250	struct buf *bp;
251	struct inode *dp;
252	caddr_t base;
253	caddr_t oldloc;
254	caddr_t newloc;
255	int entrysize;
256{
257
258	panic("softdep_change_directoryentry_offset called");
259}
260
261void
262softdep_setup_remove(bp, dp, ip, isrmdir)
263	struct buf *bp;
264	struct inode *dp;
265	struct inode *ip;
266	int isrmdir;
267{
268
269	panic("softdep_setup_remove called");
270}
271
272void
273softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
274	struct buf *bp;
275	struct inode *dp;
276	struct inode *ip;
277	ino_t newinum;
278	int isrmdir;
279{
280
281	panic("softdep_setup_directory_change called");
282}
283
284void *
285softdep_setup_trunc(vp, length, flags)
286	struct vnode *vp;
287	off_t length;
288	int flags;
289{
290
291	panic("%s called", __FUNCTION__);
292
293	return (NULL);
294}
295
296int
297softdep_complete_trunc(vp, cookie)
298	struct vnode *vp;
299	void *cookie;
300{
301
302	panic("%s called", __FUNCTION__);
303
304	return (0);
305}
306
307void
308softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
309	struct mount *mp;
310	struct buf *bp;
311	ufs2_daddr_t blkno;
312	int frags;
313	struct workhead *wkhd;
314{
315
316	panic("%s called", __FUNCTION__);
317}
318
319void
320softdep_setup_inofree(mp, bp, ino, wkhd)
321	struct mount *mp;
322	struct buf *bp;
323	ino_t ino;
324	struct workhead *wkhd;
325{
326
327	panic("%s called", __FUNCTION__);
328}
329
330void
331softdep_setup_unlink(dp, ip)
332	struct inode *dp;
333	struct inode *ip;
334{
335
336	panic("%s called", __FUNCTION__);
337}
338
339void
340softdep_setup_link(dp, ip)
341	struct inode *dp;
342	struct inode *ip;
343{
344
345	panic("%s called", __FUNCTION__);
346}
347
348void
349softdep_revert_link(dp, ip)
350	struct inode *dp;
351	struct inode *ip;
352{
353
354	panic("%s called", __FUNCTION__);
355}
356
357void
358softdep_setup_rmdir(dp, ip)
359	struct inode *dp;
360	struct inode *ip;
361{
362
363	panic("%s called", __FUNCTION__);
364}
365
366void
367softdep_revert_rmdir(dp, ip)
368	struct inode *dp;
369	struct inode *ip;
370{
371
372	panic("%s called", __FUNCTION__);
373}
374
375void
376softdep_setup_create(dp, ip)
377	struct inode *dp;
378	struct inode *ip;
379{
380
381	panic("%s called", __FUNCTION__);
382}
383
384void
385softdep_revert_create(dp, ip)
386	struct inode *dp;
387	struct inode *ip;
388{
389
390	panic("%s called", __FUNCTION__);
391}
392
393void
394softdep_setup_mkdir(dp, ip)
395	struct inode *dp;
396	struct inode *ip;
397{
398
399	panic("%s called", __FUNCTION__);
400}
401
402void
403softdep_revert_mkdir(dp, ip)
404	struct inode *dp;
405	struct inode *ip;
406{
407
408	panic("%s called", __FUNCTION__);
409}
410
411void
412softdep_setup_dotdot_link(dp, ip)
413	struct inode *dp;
414	struct inode *ip;
415{
416
417	panic("%s called", __FUNCTION__);
418}
419
420int
421softdep_prealloc(vp, waitok)
422	struct vnode *vp;
423	int waitok;
424{
425
426	panic("%s called", __FUNCTION__);
427
428	return (0);
429}
430
431int
432softdep_journal_lookup(mp, vpp)
433	struct mount *mp;
434	struct vnode **vpp;
435{
436
437	return (ENOENT);
438}
439
440void
441softdep_change_linkcnt(ip)
442	struct inode *ip;
443{
444
445	panic("softdep_change_linkcnt called");
446}
447
448void
449softdep_load_inodeblock(ip)
450	struct inode *ip;
451{
452
453	panic("softdep_load_inodeblock called");
454}
455
456void
457softdep_update_inodeblock(ip, bp, waitfor)
458	struct inode *ip;
459	struct buf *bp;
460	int waitfor;
461{
462
463	panic("softdep_update_inodeblock called");
464}
465
466int
467softdep_fsync(vp)
468	struct vnode *vp;	/* the "in_core" copy of the inode */
469{
470
471	return (0);
472}
473
474void
475softdep_fsync_mountdev(vp)
476	struct vnode *vp;
477{
478
479	return;
480}
481
482int
483softdep_flushworklist(oldmnt, countp, td)
484	struct mount *oldmnt;
485	int *countp;
486	struct thread *td;
487{
488
489	*countp = 0;
490	return (0);
491}
492
493int
494softdep_sync_metadata(struct vnode *vp)
495{
496
497	return (0);
498}
499
500int
501softdep_slowdown(vp)
502	struct vnode *vp;
503{
504
505	panic("softdep_slowdown called");
506}
507
508void
509softdep_releasefile(ip)
510	struct inode *ip;	/* inode with the zero effective link count */
511{
512
513	panic("softdep_releasefile called");
514}
515
516int
517softdep_request_cleanup(fs, vp, resource)
518	struct fs *fs;
519	struct vnode *vp;
520	int resource;
521{
522
523	return (0);
524}
525
526int
527softdep_check_suspend(struct mount *mp,
528		      struct vnode *devvp,
529		      int softdep_deps,
530		      int softdep_accdeps,
531		      int secondary_writes,
532		      int secondary_accwrites)
533{
534	struct bufobj *bo;
535	int error;
536
537	(void) softdep_deps,
538	(void) softdep_accdeps;
539
540	bo = &devvp->v_bufobj;
541	ASSERT_BO_LOCKED(bo);
542
543	MNT_ILOCK(mp);
544	while (mp->mnt_secondary_writes != 0) {
545		BO_UNLOCK(bo);
546		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
547		    (PUSER - 1) | PDROP, "secwr", 0);
548		BO_LOCK(bo);
549		MNT_ILOCK(mp);
550	}
551
552	/*
553	 * Reasons for needing more work before suspend:
554	 * - Dirty buffers on devvp.
555	 * - Secondary writes occurred after start of vnode sync loop
556	 */
557	error = 0;
558	if (bo->bo_numoutput > 0 ||
559	    bo->bo_dirty.bv_cnt > 0 ||
560	    secondary_writes != 0 ||
561	    mp->mnt_secondary_writes != 0 ||
562	    secondary_accwrites != mp->mnt_secondary_accwrites)
563		error = EAGAIN;
564	BO_UNLOCK(bo);
565	return (error);
566}
567
568void
569softdep_get_depcounts(struct mount *mp,
570		      int *softdepactivep,
571		      int *softdepactiveaccp)
572{
573	(void) mp;
574	*softdepactivep = 0;
575	*softdepactiveaccp = 0;
576}
577
578#else
579
580FEATURE(softupdates, "FFS soft-updates support");
581
582/*
583 * These definitions need to be adapted to the system to which
584 * this file is being ported.
585 */
586
587#define M_SOFTDEP_FLAGS	(M_WAITOK)
588
589#define	D_PAGEDEP	0
590#define	D_INODEDEP	1
591#define	D_BMSAFEMAP	2
592#define	D_NEWBLK	3
593#define	D_ALLOCDIRECT	4
594#define	D_INDIRDEP	5
595#define	D_ALLOCINDIR	6
596#define	D_FREEFRAG	7
597#define	D_FREEBLKS	8
598#define	D_FREEFILE	9
599#define	D_DIRADD	10
600#define	D_MKDIR		11
601#define	D_DIRREM	12
602#define	D_NEWDIRBLK	13
603#define	D_FREEWORK	14
604#define	D_FREEDEP	15
605#define	D_JADDREF	16
606#define	D_JREMREF	17
607#define	D_JMVREF	18
608#define	D_JNEWBLK	19
609#define	D_JFREEBLK	20
610#define	D_JFREEFRAG	21
611#define	D_JSEG		22
612#define	D_JSEGDEP	23
613#define	D_SBDEP		24
614#define	D_JTRUNC	25
615#define	D_LAST		D_JTRUNC
616
617unsigned long dep_current[D_LAST + 1];
618unsigned long dep_total[D_LAST + 1];
619
620
621SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
622SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
623    "total dependencies allocated");
624SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
625    "current dependencies allocated");
626
627#define	SOFTDEP_TYPE(type, str, long)					\
628    static MALLOC_DEFINE(M_ ## type, #str, long);			\
629    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
630	&dep_total[D_ ## type], 0, "");					\
631    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
632	&dep_current[D_ ## type], 0, "");
633
634SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
635SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
636SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
637    "Block or frag allocated from cyl group map");
638SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
639SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
640SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
641SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
642SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
643SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
644SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
645SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
646SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
647SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
648SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
649SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
650SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
651SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
652SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
653SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
654SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
655SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
656SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
657SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
658SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
659SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
660SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
661
662static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
663static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
664
665/*
666 * translate from workitem type to memory type
667 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
668 */
669static struct malloc_type *memtype[] = {
670	M_PAGEDEP,
671	M_INODEDEP,
672	M_BMSAFEMAP,
673	M_NEWBLK,
674	M_ALLOCDIRECT,
675	M_INDIRDEP,
676	M_ALLOCINDIR,
677	M_FREEFRAG,
678	M_FREEBLKS,
679	M_FREEFILE,
680	M_DIRADD,
681	M_MKDIR,
682	M_DIRREM,
683	M_NEWDIRBLK,
684	M_FREEWORK,
685	M_FREEDEP,
686	M_JADDREF,
687	M_JREMREF,
688	M_JMVREF,
689	M_JNEWBLK,
690	M_JFREEBLK,
691	M_JFREEFRAG,
692	M_JSEG,
693	M_JSEGDEP,
694	M_SBDEP,
695	M_JTRUNC
696};
697
698static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
699
700#define DtoM(type) (memtype[type])
701
702/*
703 * Names of malloc types.
704 */
705#define TYPENAME(type)  \
706	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
707/*
708 * End system adaptation definitions.
709 */
710
711#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
712#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
713
714/*
715 * Forward declarations.
716 */
717struct inodedep_hashhead;
718struct newblk_hashhead;
719struct pagedep_hashhead;
720struct bmsafemap_hashhead;
721
722/*
723 * Internal function prototypes.
724 */
725static	void softdep_error(char *, int);
726static	void drain_output(struct vnode *);
727static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
728static	void clear_remove(struct thread *);
729static	void clear_inodedeps(struct thread *);
730static	void unlinked_inodedep(struct mount *, struct inodedep *);
731static	void clear_unlinked_inodedep(struct inodedep *);
732static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
733static	int flush_pagedep_deps(struct vnode *, struct mount *,
734	    struct diraddhd *);
735static	void free_pagedep(struct pagedep *);
736static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
737static	int flush_inodedep_deps(struct mount *, ino_t);
738static	int flush_deplist(struct allocdirectlst *, int, int *);
739static	int handle_written_filepage(struct pagedep *, struct buf *);
740static	int handle_written_sbdep(struct sbdep *, struct buf *);
741static	void initiate_write_sbdep(struct sbdep *);
742static  void diradd_inode_written(struct diradd *, struct inodedep *);
743static	int handle_written_indirdep(struct indirdep *, struct buf *,
744	    struct buf**);
745static	int handle_written_inodeblock(struct inodedep *, struct buf *);
746static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
747static	void handle_written_jaddref(struct jaddref *);
748static	void handle_written_jremref(struct jremref *);
749static	void handle_written_jseg(struct jseg *, struct buf *);
750static	void handle_written_jnewblk(struct jnewblk *);
751static	void handle_written_jfreeblk(struct jfreeblk *);
752static	void handle_written_jfreefrag(struct jfreefrag *);
753static	void complete_jseg(struct jseg *);
754static	void jseg_write(struct ufsmount *ump, struct jblocks *, struct jseg *,
755	    uint8_t *);
756static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
757static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
758static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
759static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
760static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
761static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
762static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
763static	inline void inoref_write(struct inoref *, struct jseg *,
764	    struct jrefrec *);
765static	void handle_allocdirect_partdone(struct allocdirect *,
766	    struct workhead *);
767static	void cancel_newblk(struct newblk *, struct workhead *);
768static	void indirdep_complete(struct indirdep *);
769static	void handle_allocindir_partdone(struct allocindir *);
770static	void initiate_write_filepage(struct pagedep *, struct buf *);
771static	void initiate_write_indirdep(struct indirdep*, struct buf *);
772static	void handle_written_mkdir(struct mkdir *, int);
773static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
774static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
775static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
776static	void handle_workitem_freefile(struct freefile *);
777static	void handle_workitem_remove(struct dirrem *, struct vnode *);
778static	struct dirrem *newdirrem(struct buf *, struct inode *,
779	    struct inode *, int, struct dirrem **);
780static	void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
781	    struct freeblks *);
782static	void free_indirdep(struct indirdep *);
783static	void free_diradd(struct diradd *, struct workhead *);
784static	void merge_diradd(struct inodedep *, struct diradd *);
785static	void complete_diradd(struct diradd *);
786static	struct diradd *diradd_lookup(struct pagedep *, int);
787static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
788	    struct jremref *);
789static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
790	    struct jremref *);
791static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
792	    struct jremref *, struct jremref *);
793static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
794	    struct jremref *);
795static	void cancel_allocindir(struct allocindir *, struct inodedep *,
796	    struct freeblks *);
797static	void complete_mkdir(struct mkdir *);
798static	void free_newdirblk(struct newdirblk *);
799static	void free_jremref(struct jremref *);
800static	void free_jaddref(struct jaddref *);
801static	void free_jsegdep(struct jsegdep *);
802static	void free_jseg(struct jseg *);
803static	void free_jnewblk(struct jnewblk *);
804static	void free_jfreeblk(struct jfreeblk *);
805static	void free_jfreefrag(struct jfreefrag *);
806static	void free_freedep(struct freedep *);
807static	void journal_jremref(struct dirrem *, struct jremref *,
808	    struct inodedep *);
809static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
810static	int cancel_jaddref(struct jaddref *, struct inodedep *,
811	    struct workhead *);
812static	void cancel_jfreefrag(struct jfreefrag *);
813static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
814static	int deallocate_dependencies(struct buf *, struct inodedep *,
815	    struct freeblks *);
816static	void free_newblk(struct newblk *);
817static	void cancel_allocdirect(struct allocdirectlst *,
818	    struct allocdirect *, struct freeblks *, int);
819static	int check_inode_unwritten(struct inodedep *);
820static	int free_inodedep(struct inodedep *);
821static	void freework_freeblock(struct freework *);
822static	void handle_workitem_freeblocks(struct freeblks *, int);
823static	void handle_complete_freeblocks(struct freeblks *);
824static	void handle_workitem_indirblk(struct freework *);
825static	void handle_written_freework(struct freework *);
826static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
827static	void setup_allocindir_phase2(struct buf *, struct inode *,
828	    struct inodedep *, struct allocindir *, ufs_lbn_t);
829static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
830	    ufs2_daddr_t, ufs_lbn_t);
831static	void handle_workitem_freefrag(struct freefrag *);
832static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
833	    ufs_lbn_t);
834static	void allocdirect_merge(struct allocdirectlst *,
835	    struct allocdirect *, struct allocdirect *);
836static	struct freefrag *allocindir_merge(struct allocindir *,
837	    struct allocindir *);
838static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
839	    struct bmsafemap **);
840static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
841	    int cg);
842static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
843	    int, struct newblk **);
844static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
845static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
846	    struct inodedep **);
847static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
848static	int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
849	    struct pagedep **);
850static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
851	    struct mount *mp, int, struct pagedep **);
852static	void pause_timer(void *);
853static	int request_cleanup(struct mount *, int);
854static	int process_worklist_item(struct mount *, int);
855static	void process_removes(struct vnode *);
856static	void jwork_move(struct workhead *, struct workhead *);
857static	void add_to_worklist(struct worklist *, int);
858static	void remove_from_worklist(struct worklist *);
859static	void softdep_flush(void);
860static	int softdep_speedup(void);
861static	void worklist_speedup(void);
862static	int journal_mount(struct mount *, struct fs *, struct ucred *);
863static	void journal_unmount(struct mount *);
864static	int journal_space(struct ufsmount *, int);
865static	void journal_suspend(struct ufsmount *);
866static	int journal_unsuspend(struct ufsmount *ump);
867static	void softdep_prelink(struct vnode *, struct vnode *);
868static	void add_to_journal(struct worklist *);
869static	void remove_from_journal(struct worklist *);
870static	void softdep_process_journal(struct mount *, int);
871static	struct jremref *newjremref(struct dirrem *, struct inode *,
872	    struct inode *ip, off_t, nlink_t);
873static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
874	    uint16_t);
875static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
876	    uint16_t);
877static inline struct jsegdep *inoref_jseg(struct inoref *);
878static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
879static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
880	    ufs2_daddr_t, int);
881static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
882	    ufs2_daddr_t, long, ufs_lbn_t);
883static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
884	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int);
885static	void jwait(struct worklist *wk);
886static	struct inodedep *inodedep_lookup_ip(struct inode *);
887static	int bmsafemap_rollbacks(struct bmsafemap *);
888static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
889static	void handle_jwork(struct workhead *);
890static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
891	    struct mkdir **);
892static	struct jblocks *jblocks_create(void);
893static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
894static	void jblocks_free(struct jblocks *, struct mount *, int);
895static	void jblocks_destroy(struct jblocks *);
896static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
897
898/*
899 * Exported softdep operations.
900 */
901static	void softdep_disk_io_initiation(struct buf *);
902static	void softdep_disk_write_complete(struct buf *);
903static	void softdep_deallocate_dependencies(struct buf *);
904static	int softdep_count_dependencies(struct buf *bp, int);
905
906static struct mtx lk;
907MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
908
909#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
910#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
911#define FREE_LOCK(lk)			mtx_unlock(lk)
912
913#define	BUF_AREC(bp)			lockallowrecurse(&(bp)->b_lock)
914#define	BUF_NOREC(bp)			lockdisablerecurse(&(bp)->b_lock)
915
916/*
917 * Worklist queue management.
918 * These routines require that the lock be held.
919 */
920#ifndef /* NOT */ DEBUG
921#define WORKLIST_INSERT(head, item) do {	\
922	(item)->wk_state |= ONWORKLIST;		\
923	LIST_INSERT_HEAD(head, item, wk_list);	\
924} while (0)
925#define WORKLIST_REMOVE(item) do {		\
926	(item)->wk_state &= ~ONWORKLIST;	\
927	LIST_REMOVE(item, wk_list);		\
928} while (0)
929#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
930#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
931
932#else /* DEBUG */
933static	void worklist_insert(struct workhead *, struct worklist *, int);
934static	void worklist_remove(struct worklist *, int);
935
936#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
937#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
938#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
939#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
940
941static void
942worklist_insert(head, item, locked)
943	struct workhead *head;
944	struct worklist *item;
945	int locked;
946{
947
948	if (locked)
949		mtx_assert(&lk, MA_OWNED);
950	if (item->wk_state & ONWORKLIST)
951		panic("worklist_insert: %p %s(0x%X) already on list",
952		    item, TYPENAME(item->wk_type), item->wk_state);
953	item->wk_state |= ONWORKLIST;
954	LIST_INSERT_HEAD(head, item, wk_list);
955}
956
957static void
958worklist_remove(item, locked)
959	struct worklist *item;
960	int locked;
961{
962
963	if (locked)
964		mtx_assert(&lk, MA_OWNED);
965	if ((item->wk_state & ONWORKLIST) == 0)
966		panic("worklist_remove: %p %s(0x%X) not on list",
967		    item, TYPENAME(item->wk_type), item->wk_state);
968	item->wk_state &= ~ONWORKLIST;
969	LIST_REMOVE(item, wk_list);
970}
971#endif /* DEBUG */
972
973/*
974 * Merge two jsegdeps keeping only the oldest one as newer references
975 * can't be discarded until after older references.
976 */
977static inline struct jsegdep *
978jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
979{
980	struct jsegdep *swp;
981
982	if (two == NULL)
983		return (one);
984
985	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
986		swp = one;
987		one = two;
988		two = swp;
989	}
990	WORKLIST_REMOVE(&two->jd_list);
991	free_jsegdep(two);
992
993	return (one);
994}
995
996/*
997 * If two freedeps are compatible free one to reduce list size.
998 */
999static inline struct freedep *
1000freedep_merge(struct freedep *one, struct freedep *two)
1001{
1002	if (two == NULL)
1003		return (one);
1004
1005	if (one->fd_freework == two->fd_freework) {
1006		WORKLIST_REMOVE(&two->fd_list);
1007		free_freedep(two);
1008	}
1009	return (one);
1010}
1011
1012/*
1013 * Move journal work from one list to another.  Duplicate freedeps and
1014 * jsegdeps are coalesced to keep the lists as small as possible.
1015 */
1016static void
1017jwork_move(dst, src)
1018	struct workhead *dst;
1019	struct workhead *src;
1020{
1021	struct freedep *freedep;
1022	struct jsegdep *jsegdep;
1023	struct worklist *wkn;
1024	struct worklist *wk;
1025
1026	KASSERT(dst != src,
1027	    ("jwork_move: dst == src"));
1028	freedep = NULL;
1029	jsegdep = NULL;
1030	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1031		if (wk->wk_type == D_JSEGDEP)
1032			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1033		if (wk->wk_type == D_FREEDEP)
1034			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1035	}
1036
1037	mtx_assert(&lk, MA_OWNED);
1038	while ((wk = LIST_FIRST(src)) != NULL) {
1039		WORKLIST_REMOVE(wk);
1040		WORKLIST_INSERT(dst, wk);
1041		if (wk->wk_type == D_JSEGDEP) {
1042			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1043			continue;
1044		}
1045		if (wk->wk_type == D_FREEDEP)
1046			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1047	}
1048}
1049
1050/*
1051 * Routines for tracking and managing workitems.
1052 */
1053static	void workitem_free(struct worklist *, int);
1054static	void workitem_alloc(struct worklist *, int, struct mount *);
1055
1056#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
1057
1058static void
1059workitem_free(item, type)
1060	struct worklist *item;
1061	int type;
1062{
1063	struct ufsmount *ump;
1064	mtx_assert(&lk, MA_OWNED);
1065
1066#ifdef DEBUG
1067	if (item->wk_state & ONWORKLIST)
1068		panic("workitem_free: %s(0x%X) still on list",
1069		    TYPENAME(item->wk_type), item->wk_state);
1070	if (item->wk_type != type)
1071		panic("workitem_free: type mismatch %s != %s",
1072		    TYPENAME(item->wk_type), TYPENAME(type));
1073#endif
1074	ump = VFSTOUFS(item->wk_mp);
1075	if (--ump->softdep_deps == 0 && ump->softdep_req)
1076		wakeup(&ump->softdep_deps);
1077	dep_current[type]--;
1078	free(item, DtoM(type));
1079}
1080
1081static void
1082workitem_alloc(item, type, mp)
1083	struct worklist *item;
1084	int type;
1085	struct mount *mp;
1086{
1087	item->wk_type = type;
1088	item->wk_mp = mp;
1089	item->wk_state = 0;
1090	ACQUIRE_LOCK(&lk);
1091	dep_current[type]++;
1092	dep_total[type]++;
1093	VFSTOUFS(mp)->softdep_deps++;
1094	VFSTOUFS(mp)->softdep_accdeps++;
1095	FREE_LOCK(&lk);
1096}
1097
1098/*
1099 * Workitem queue management
1100 */
1101static int max_softdeps;	/* maximum number of structs before slowdown */
1102static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
1103static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1104static int proc_waiting;	/* tracks whether we have a timeout posted */
1105static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1106static struct callout softdep_callout;
1107static int req_pending;
1108static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1109static int req_clear_remove;	/* syncer process flush some freeblks */
1110static long num_freeblkdep;	/* number of freeblks workitems allocated */
1111
1112/*
1113 * runtime statistics
1114 */
1115static int stat_worklist_push;	/* number of worklist cleanups */
1116static int stat_blk_limit_push;	/* number of times block limit neared */
1117static int stat_ino_limit_push;	/* number of times inode limit neared */
1118static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1119static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1120static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1121static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1122static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1123static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1124static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1125static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1126static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1127static int stat_journal_min;	/* Times hit journal min threshold */
1128static int stat_journal_low;	/* Times hit journal low threshold */
1129static int stat_journal_wait;	/* Times blocked in jwait(). */
1130static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1131static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1132static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1133static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1134
1135SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1136    &max_softdeps, 0, "");
1137SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1138    &tickdelay, 0, "");
1139SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1140    &maxindirdeps, 0, "");
1141SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1142    &stat_worklist_push, 0,"");
1143SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1144    &stat_blk_limit_push, 0,"");
1145SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1146    &stat_ino_limit_push, 0,"");
1147SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1148    &stat_blk_limit_hit, 0, "");
1149SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1150    &stat_ino_limit_hit, 0, "");
1151SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1152    &stat_sync_limit_hit, 0, "");
1153SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1154    &stat_indir_blk_ptrs, 0, "");
1155SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1156    &stat_inode_bitmap, 0, "");
1157SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1158    &stat_direct_blk_ptrs, 0, "");
1159SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1160    &stat_dir_entry, 0, "");
1161SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1162    &stat_jaddref, 0, "");
1163SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1164    &stat_jnewblk, 0, "");
1165SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1166    &stat_journal_low, 0, "");
1167SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1168    &stat_journal_min, 0, "");
1169SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1170    &stat_journal_wait, 0, "");
1171SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1172    &stat_jwait_filepage, 0, "");
1173SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1174    &stat_jwait_freeblks, 0, "");
1175SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1176    &stat_jwait_inode, 0, "");
1177SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1178    &stat_jwait_newblk, 0, "");
1179
1180SYSCTL_DECL(_vfs_ffs);
1181
1182LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1183static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1184
1185static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1186SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1187	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1188
1189static struct proc *softdepproc;
1190static struct kproc_desc softdep_kp = {
1191	"softdepflush",
1192	softdep_flush,
1193	&softdepproc
1194};
1195SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1196    &softdep_kp);
1197
1198static void
1199softdep_flush(void)
1200{
1201	struct mount *nmp;
1202	struct mount *mp;
1203	struct ufsmount *ump;
1204	struct thread *td;
1205	int remaining;
1206	int progress;
1207	int vfslocked;
1208
1209	td = curthread;
1210	td->td_pflags |= TDP_NORUNNINGBUF;
1211
1212	for (;;) {
1213		kproc_suspend_check(softdepproc);
1214		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
1215		ACQUIRE_LOCK(&lk);
1216		/*
1217		 * If requested, try removing inode or removal dependencies.
1218		 */
1219		if (req_clear_inodedeps) {
1220			clear_inodedeps(td);
1221			req_clear_inodedeps -= 1;
1222			wakeup_one(&proc_waiting);
1223		}
1224		if (req_clear_remove) {
1225			clear_remove(td);
1226			req_clear_remove -= 1;
1227			wakeup_one(&proc_waiting);
1228		}
1229		FREE_LOCK(&lk);
1230		VFS_UNLOCK_GIANT(vfslocked);
1231		remaining = progress = 0;
1232		mtx_lock(&mountlist_mtx);
1233		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1234			nmp = TAILQ_NEXT(mp, mnt_list);
1235			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
1236				continue;
1237			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1238				continue;
1239			vfslocked = VFS_LOCK_GIANT(mp);
1240			progress += softdep_process_worklist(mp, 0);
1241			ump = VFSTOUFS(mp);
1242			remaining += ump->softdep_on_worklist -
1243				ump->softdep_on_worklist_inprogress;
1244			VFS_UNLOCK_GIANT(vfslocked);
1245			mtx_lock(&mountlist_mtx);
1246			nmp = TAILQ_NEXT(mp, mnt_list);
1247			vfs_unbusy(mp);
1248		}
1249		mtx_unlock(&mountlist_mtx);
1250		if (remaining && progress)
1251			continue;
1252		ACQUIRE_LOCK(&lk);
1253		if (!req_pending)
1254			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1255		req_pending = 0;
1256		FREE_LOCK(&lk);
1257	}
1258}
1259
1260static void
1261worklist_speedup(void)
1262{
1263	mtx_assert(&lk, MA_OWNED);
1264	if (req_pending == 0) {
1265		req_pending = 1;
1266		wakeup(&req_pending);
1267	}
1268}
1269
1270static int
1271softdep_speedup(void)
1272{
1273
1274	worklist_speedup();
1275	bd_speedup();
1276	return speedup_syncer();
1277}
1278
1279/*
1280 * Add an item to the end of the work queue.
1281 * This routine requires that the lock be held.
1282 * This is the only routine that adds items to the list.
1283 * The following routine is the only one that removes items
1284 * and does so in order from first to last.
1285 */
1286static void
1287add_to_worklist(wk, nodelay)
1288	struct worklist *wk;
1289	int nodelay;
1290{
1291	struct ufsmount *ump;
1292
1293	mtx_assert(&lk, MA_OWNED);
1294	ump = VFSTOUFS(wk->wk_mp);
1295	if (wk->wk_state & ONWORKLIST)
1296		panic("add_to_worklist: %s(0x%X) already on list",
1297		    TYPENAME(wk->wk_type), wk->wk_state);
1298	wk->wk_state |= ONWORKLIST;
1299	if (LIST_EMPTY(&ump->softdep_workitem_pending))
1300		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1301	else
1302		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1303	ump->softdep_worklist_tail = wk;
1304	ump->softdep_on_worklist += 1;
1305	if (nodelay)
1306		worklist_speedup();
1307}
1308
1309/*
1310 * Remove the item to be processed. If we are removing the last
1311 * item on the list, we need to recalculate the tail pointer.
1312 */
1313static void
1314remove_from_worklist(wk)
1315	struct worklist *wk;
1316{
1317	struct ufsmount *ump;
1318	struct worklist *wkend;
1319
1320	ump = VFSTOUFS(wk->wk_mp);
1321	WORKLIST_REMOVE(wk);
1322	if (wk == ump->softdep_worklist_tail) {
1323		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
1324			if (LIST_NEXT(wkend, wk_list) == NULL)
1325				break;
1326		ump->softdep_worklist_tail = wkend;
1327	}
1328	ump->softdep_on_worklist -= 1;
1329}
1330
1331/*
1332 * Process that runs once per second to handle items in the background queue.
1333 *
1334 * Note that we ensure that everything is done in the order in which they
1335 * appear in the queue. The code below depends on this property to ensure
1336 * that blocks of a file are freed before the inode itself is freed. This
1337 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1338 * until all the old ones have been purged from the dependency lists.
1339 */
1340int
1341softdep_process_worklist(mp, full)
1342	struct mount *mp;
1343	int full;
1344{
1345	struct thread *td = curthread;
1346	int cnt, matchcnt;
1347	struct ufsmount *ump;
1348	long starttime;
1349
1350	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1351	/*
1352	 * Record the process identifier of our caller so that we can give
1353	 * this process preferential treatment in request_cleanup below.
1354	 */
1355	matchcnt = 0;
1356	ump = VFSTOUFS(mp);
1357	ACQUIRE_LOCK(&lk);
1358	starttime = time_second;
1359	softdep_process_journal(mp, full?MNT_WAIT:0);
1360	while (ump->softdep_on_worklist > 0) {
1361		if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
1362			break;
1363		else
1364			matchcnt += cnt;
1365		/*
1366		 * If requested, try removing inode or removal dependencies.
1367		 */
1368		if (req_clear_inodedeps) {
1369			clear_inodedeps(td);
1370			req_clear_inodedeps -= 1;
1371			wakeup_one(&proc_waiting);
1372		}
1373		if (req_clear_remove) {
1374			clear_remove(td);
1375			req_clear_remove -= 1;
1376			wakeup_one(&proc_waiting);
1377		}
1378		/*
1379		 * We do not generally want to stop for buffer space, but if
1380		 * we are really being a buffer hog, we will stop and wait.
1381		 */
1382		if (should_yield()) {
1383			FREE_LOCK(&lk);
1384			kern_yield(-1);
1385			bwillwrite();
1386			ACQUIRE_LOCK(&lk);
1387		}
1388		/*
1389		 * Never allow processing to run for more than one
1390		 * second. Otherwise the other mountpoints may get
1391		 * excessively backlogged.
1392		 */
1393		if (!full && starttime != time_second)
1394			break;
1395	}
1396	if (full == 0)
1397		journal_unsuspend(ump);
1398	FREE_LOCK(&lk);
1399	return (matchcnt);
1400}
1401
1402/*
1403 * Process all removes associated with a vnode if we are running out of
1404 * journal space.  Any other process which attempts to flush these will
1405 * be unable as we have the vnodes locked.
1406 */
1407static void
1408process_removes(vp)
1409	struct vnode *vp;
1410{
1411	struct inodedep *inodedep;
1412	struct dirrem *dirrem;
1413	struct mount *mp;
1414	ino_t inum;
1415
1416	mtx_assert(&lk, MA_OWNED);
1417
1418	mp = vp->v_mount;
1419	inum = VTOI(vp)->i_number;
1420	for (;;) {
1421		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1422			return;
1423		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
1424			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1425			    (COMPLETE | ONWORKLIST))
1426				break;
1427		if (dirrem == NULL)
1428			return;
1429		/*
1430		 * If another thread is trying to lock this vnode it will
1431		 * fail but we must wait for it to do so before we can
1432		 * proceed.
1433		 */
1434		if (dirrem->dm_state & INPROGRESS) {
1435			dirrem->dm_state |= IOWAITING;
1436			msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
1437			continue;
1438		}
1439		remove_from_worklist(&dirrem->dm_list);
1440		FREE_LOCK(&lk);
1441		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1442			panic("process_removes: suspended filesystem");
1443		handle_workitem_remove(dirrem, vp);
1444		vn_finished_secondary_write(mp);
1445		ACQUIRE_LOCK(&lk);
1446	}
1447}
1448
1449/*
1450 * Process one item on the worklist.
1451 */
1452static int
1453process_worklist_item(mp, flags)
1454	struct mount *mp;
1455	int flags;
1456{
1457	struct worklist *wk;
1458	struct ufsmount *ump;
1459	struct vnode *vp;
1460	int matchcnt = 0;
1461
1462	mtx_assert(&lk, MA_OWNED);
1463	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1464	/*
1465	 * If we are being called because of a process doing a
1466	 * copy-on-write, then it is not safe to write as we may
1467	 * recurse into the copy-on-write routine.
1468	 */
1469	if (curthread->td_pflags & TDP_COWINPROGRESS)
1470		return (-1);
1471	/*
1472	 * Normally we just process each item on the worklist in order.
1473	 * However, if we are in a situation where we cannot lock any
1474	 * inodes, we have to skip over any dirrem requests whose
1475	 * vnodes are resident and locked.
1476	 */
1477	vp = NULL;
1478	ump = VFSTOUFS(mp);
1479	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
1480		if (wk->wk_state & INPROGRESS)
1481			continue;
1482		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
1483			break;
1484		wk->wk_state |= INPROGRESS;
1485		ump->softdep_on_worklist_inprogress++;
1486		FREE_LOCK(&lk);
1487		ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
1488		    LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
1489		ACQUIRE_LOCK(&lk);
1490		if (wk->wk_state & IOWAITING) {
1491			wk->wk_state &= ~IOWAITING;
1492			wakeup(wk);
1493		}
1494		wk->wk_state &= ~INPROGRESS;
1495		ump->softdep_on_worklist_inprogress--;
1496		if (vp != NULL)
1497			break;
1498	}
1499	if (wk == 0)
1500		return (-1);
1501	remove_from_worklist(wk);
1502	FREE_LOCK(&lk);
1503	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1504		panic("process_worklist_item: suspended filesystem");
1505	matchcnt++;
1506	switch (wk->wk_type) {
1507
1508	case D_DIRREM:
1509		/* removal of a directory entry */
1510		handle_workitem_remove(WK_DIRREM(wk), vp);
1511		if (vp)
1512			vput(vp);
1513		break;
1514
1515	case D_FREEBLKS:
1516		/* releasing blocks and/or fragments from a file */
1517		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
1518		break;
1519
1520	case D_FREEFRAG:
1521		/* releasing a fragment when replaced as a file grows */
1522		handle_workitem_freefrag(WK_FREEFRAG(wk));
1523		break;
1524
1525	case D_FREEFILE:
1526		/* releasing an inode when its link count drops to 0 */
1527		handle_workitem_freefile(WK_FREEFILE(wk));
1528		break;
1529
1530	case D_FREEWORK:
1531		/* Final block in an indirect was freed. */
1532		handle_workitem_indirblk(WK_FREEWORK(wk));
1533		break;
1534
1535	default:
1536		panic("%s_process_worklist: Unknown type %s",
1537		    "softdep", TYPENAME(wk->wk_type));
1538		/* NOTREACHED */
1539	}
1540	vn_finished_secondary_write(mp);
1541	ACQUIRE_LOCK(&lk);
1542	return (matchcnt);
1543}
1544
1545/*
1546 * Move dependencies from one buffer to another.
1547 */
1548int
1549softdep_move_dependencies(oldbp, newbp)
1550	struct buf *oldbp;
1551	struct buf *newbp;
1552{
1553	struct worklist *wk, *wktail;
1554	int dirty;
1555
1556	dirty = 0;
1557	wktail = NULL;
1558	ACQUIRE_LOCK(&lk);
1559	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1560		LIST_REMOVE(wk, wk_list);
1561		if (wk->wk_type == D_BMSAFEMAP &&
1562		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
1563			dirty = 1;
1564		if (wktail == 0)
1565			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1566		else
1567			LIST_INSERT_AFTER(wktail, wk, wk_list);
1568		wktail = wk;
1569	}
1570	FREE_LOCK(&lk);
1571
1572	return (dirty);
1573}
1574
1575/*
1576 * Purge the work list of all items associated with a particular mount point.
1577 */
1578int
1579softdep_flushworklist(oldmnt, countp, td)
1580	struct mount *oldmnt;
1581	int *countp;
1582	struct thread *td;
1583{
1584	struct vnode *devvp;
1585	int count, error = 0;
1586	struct ufsmount *ump;
1587
1588	/*
1589	 * Alternately flush the block device associated with the mount
1590	 * point and process any dependencies that the flushing
1591	 * creates. We continue until no more worklist dependencies
1592	 * are found.
1593	 */
1594	*countp = 0;
1595	ump = VFSTOUFS(oldmnt);
1596	devvp = ump->um_devvp;
1597	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1598		*countp += count;
1599		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1600		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1601		VOP_UNLOCK(devvp, 0);
1602		if (error)
1603			break;
1604	}
1605	return (error);
1606}
1607
1608int
1609softdep_waitidle(struct mount *mp)
1610{
1611	struct ufsmount *ump;
1612	int error;
1613	int i;
1614
1615	ump = VFSTOUFS(mp);
1616	ACQUIRE_LOCK(&lk);
1617	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1618		ump->softdep_req = 1;
1619		if (ump->softdep_on_worklist)
1620			panic("softdep_waitidle: work added after flush.");
1621		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1622	}
1623	ump->softdep_req = 0;
1624	FREE_LOCK(&lk);
1625	error = 0;
1626	if (i == 10) {
1627		error = EBUSY;
1628		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1629		    mp);
1630	}
1631
1632	return (error);
1633}
1634
1635/*
1636 * Flush all vnodes and worklist items associated with a specified mount point.
1637 */
1638int
1639softdep_flushfiles(oldmnt, flags, td)
1640	struct mount *oldmnt;
1641	int flags;
1642	struct thread *td;
1643{
1644	int error, depcount, loopcnt, retry_flush_count, retry;
1645
1646	loopcnt = 10;
1647	retry_flush_count = 3;
1648retry_flush:
1649	error = 0;
1650
1651	/*
1652	 * Alternately flush the vnodes associated with the mount
1653	 * point and process any dependencies that the flushing
1654	 * creates. In theory, this loop can happen at most twice,
1655	 * but we give it a few extra just to be sure.
1656	 */
1657	for (; loopcnt > 0; loopcnt--) {
1658		/*
1659		 * Do another flush in case any vnodes were brought in
1660		 * as part of the cleanup operations.
1661		 */
1662		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1663			break;
1664		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1665		    depcount == 0)
1666			break;
1667	}
1668	/*
1669	 * If we are unmounting then it is an error to fail. If we
1670	 * are simply trying to downgrade to read-only, then filesystem
1671	 * activity can keep us busy forever, so we just fail with EBUSY.
1672	 */
1673	if (loopcnt == 0) {
1674		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1675			panic("softdep_flushfiles: looping");
1676		error = EBUSY;
1677	}
1678	if (!error)
1679		error = softdep_waitidle(oldmnt);
1680	if (!error) {
1681		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1682			retry = 0;
1683			MNT_ILOCK(oldmnt);
1684			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1685			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1686			if (oldmnt->mnt_nvnodelistsize > 0) {
1687				if (--retry_flush_count > 0) {
1688					retry = 1;
1689					loopcnt = 3;
1690				} else
1691					error = EBUSY;
1692			}
1693			MNT_IUNLOCK(oldmnt);
1694			if (retry)
1695				goto retry_flush;
1696		}
1697	}
1698	return (error);
1699}
1700
1701/*
1702 * Structure hashing.
1703 *
1704 * There are three types of structures that can be looked up:
1705 *	1) pagedep structures identified by mount point, inode number,
1706 *	   and logical block.
1707 *	2) inodedep structures identified by mount point and inode number.
1708 *	3) newblk structures identified by mount point and
1709 *	   physical block number.
1710 *
1711 * The "pagedep" and "inodedep" dependency structures are hashed
1712 * separately from the file blocks and inodes to which they correspond.
1713 * This separation helps when the in-memory copy of an inode or
1714 * file block must be replaced. It also obviates the need to access
1715 * an inode or file page when simply updating (or de-allocating)
1716 * dependency structures. Lookup of newblk structures is needed to
1717 * find newly allocated blocks when trying to associate them with
1718 * their allocdirect or allocindir structure.
1719 *
1720 * The lookup routines optionally create and hash a new instance when
1721 * an existing entry is not found.
1722 */
1723#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
1724#define NODELAY		0x0002	/* cannot do background work */
1725
1726/*
1727 * Structures and routines associated with pagedep caching.
1728 */
1729LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1730u_long	pagedep_hash;		/* size of hash table - 1 */
1731#define	PAGEDEP_HASH(mp, inum, lbn) \
1732	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1733	    pagedep_hash])
1734
1735static int
1736pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1737	struct pagedep_hashhead *pagedephd;
1738	ino_t ino;
1739	ufs_lbn_t lbn;
1740	struct mount *mp;
1741	int flags;
1742	struct pagedep **pagedeppp;
1743{
1744	struct pagedep *pagedep;
1745
1746	LIST_FOREACH(pagedep, pagedephd, pd_hash)
1747		if (ino == pagedep->pd_ino &&
1748		    lbn == pagedep->pd_lbn &&
1749		    mp == pagedep->pd_list.wk_mp)
1750			break;
1751	if (pagedep) {
1752		*pagedeppp = pagedep;
1753		if ((flags & DEPALLOC) != 0 &&
1754		    (pagedep->pd_state & ONWORKLIST) == 0)
1755			return (0);
1756		return (1);
1757	}
1758	*pagedeppp = NULL;
1759	return (0);
1760}
1761/*
1762 * Look up a pagedep. Return 1 if found, 0 if not found or found
1763 * when asked to allocate but not associated with any buffer.
1764 * If not found, allocate if DEPALLOC flag is passed.
1765 * Found or allocated entry is returned in pagedeppp.
1766 * This routine must be called with splbio interrupts blocked.
1767 */
1768static int
1769pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
1770	struct mount *mp;
1771	ino_t ino;
1772	ufs_lbn_t lbn;
1773	int flags;
1774	struct pagedep **pagedeppp;
1775{
1776	struct pagedep *pagedep;
1777	struct pagedep_hashhead *pagedephd;
1778	int ret;
1779	int i;
1780
1781	mtx_assert(&lk, MA_OWNED);
1782	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
1783
1784	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1785	if (*pagedeppp || (flags & DEPALLOC) == 0)
1786		return (ret);
1787	FREE_LOCK(&lk);
1788	pagedep = malloc(sizeof(struct pagedep),
1789	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1790	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1791	ACQUIRE_LOCK(&lk);
1792	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1793	if (*pagedeppp) {
1794		WORKITEM_FREE(pagedep, D_PAGEDEP);
1795		return (ret);
1796	}
1797	pagedep->pd_ino = ino;
1798	pagedep->pd_lbn = lbn;
1799	LIST_INIT(&pagedep->pd_dirremhd);
1800	LIST_INIT(&pagedep->pd_pendinghd);
1801	for (i = 0; i < DAHASHSZ; i++)
1802		LIST_INIT(&pagedep->pd_diraddhd[i]);
1803	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1804	*pagedeppp = pagedep;
1805	return (0);
1806}
1807
1808/*
1809 * Structures and routines associated with inodedep caching.
1810 */
1811LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1812static u_long	inodedep_hash;	/* size of hash table - 1 */
1813static long	num_inodedep;	/* number of inodedep allocated */
1814#define	INODEDEP_HASH(fs, inum) \
1815      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1816
1817static int
1818inodedep_find(inodedephd, fs, inum, inodedeppp)
1819	struct inodedep_hashhead *inodedephd;
1820	struct fs *fs;
1821	ino_t inum;
1822	struct inodedep **inodedeppp;
1823{
1824	struct inodedep *inodedep;
1825
1826	LIST_FOREACH(inodedep, inodedephd, id_hash)
1827		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1828			break;
1829	if (inodedep) {
1830		*inodedeppp = inodedep;
1831		return (1);
1832	}
1833	*inodedeppp = NULL;
1834
1835	return (0);
1836}
1837/*
1838 * Look up an inodedep. Return 1 if found, 0 if not found.
1839 * If not found, allocate if DEPALLOC flag is passed.
1840 * Found or allocated entry is returned in inodedeppp.
1841 * This routine must be called with splbio interrupts blocked.
1842 */
1843static int
1844inodedep_lookup(mp, inum, flags, inodedeppp)
1845	struct mount *mp;
1846	ino_t inum;
1847	int flags;
1848	struct inodedep **inodedeppp;
1849{
1850	struct inodedep *inodedep;
1851	struct inodedep_hashhead *inodedephd;
1852	struct fs *fs;
1853
1854	mtx_assert(&lk, MA_OWNED);
1855	fs = VFSTOUFS(mp)->um_fs;
1856	inodedephd = INODEDEP_HASH(fs, inum);
1857
1858	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1859		return (1);
1860	if ((flags & DEPALLOC) == 0)
1861		return (0);
1862	/*
1863	 * If we are over our limit, try to improve the situation.
1864	 */
1865	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
1866		request_cleanup(mp, FLUSH_INODES);
1867	FREE_LOCK(&lk);
1868	inodedep = malloc(sizeof(struct inodedep),
1869		M_INODEDEP, M_SOFTDEP_FLAGS);
1870	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1871	ACQUIRE_LOCK(&lk);
1872	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1873		WORKITEM_FREE(inodedep, D_INODEDEP);
1874		return (1);
1875	}
1876	num_inodedep += 1;
1877	inodedep->id_fs = fs;
1878	inodedep->id_ino = inum;
1879	inodedep->id_state = ALLCOMPLETE;
1880	inodedep->id_nlinkdelta = 0;
1881	inodedep->id_savedino1 = NULL;
1882	inodedep->id_savedsize = -1;
1883	inodedep->id_savedextsize = -1;
1884	inodedep->id_savednlink = -1;
1885	inodedep->id_bmsafemap = NULL;
1886	inodedep->id_mkdiradd = NULL;
1887	LIST_INIT(&inodedep->id_dirremhd);
1888	LIST_INIT(&inodedep->id_pendinghd);
1889	LIST_INIT(&inodedep->id_inowait);
1890	LIST_INIT(&inodedep->id_bufwait);
1891	TAILQ_INIT(&inodedep->id_inoreflst);
1892	TAILQ_INIT(&inodedep->id_inoupdt);
1893	TAILQ_INIT(&inodedep->id_newinoupdt);
1894	TAILQ_INIT(&inodedep->id_extupdt);
1895	TAILQ_INIT(&inodedep->id_newextupdt);
1896	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1897	*inodedeppp = inodedep;
1898	return (0);
1899}
1900
1901/*
1902 * Structures and routines associated with newblk caching.
1903 */
1904LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1905u_long	newblk_hash;		/* size of hash table - 1 */
1906#define	NEWBLK_HASH(fs, inum) \
1907	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1908
1909static int
1910newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
1911	struct newblk_hashhead *newblkhd;
1912	struct mount *mp;
1913	ufs2_daddr_t newblkno;
1914	int flags;
1915	struct newblk **newblkpp;
1916{
1917	struct newblk *newblk;
1918
1919	LIST_FOREACH(newblk, newblkhd, nb_hash) {
1920		if (newblkno != newblk->nb_newblkno)
1921			continue;
1922		if (mp != newblk->nb_list.wk_mp)
1923			continue;
1924		/*
1925		 * If we're creating a new dependency don't match those that
1926		 * have already been converted to allocdirects.  This is for
1927		 * a frag extend.
1928		 */
1929		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
1930			continue;
1931		break;
1932	}
1933	if (newblk) {
1934		*newblkpp = newblk;
1935		return (1);
1936	}
1937	*newblkpp = NULL;
1938	return (0);
1939}
1940
1941/*
1942 * Look up a newblk. Return 1 if found, 0 if not found.
1943 * If not found, allocate if DEPALLOC flag is passed.
1944 * Found or allocated entry is returned in newblkpp.
1945 */
1946static int
1947newblk_lookup(mp, newblkno, flags, newblkpp)
1948	struct mount *mp;
1949	ufs2_daddr_t newblkno;
1950	int flags;
1951	struct newblk **newblkpp;
1952{
1953	struct newblk *newblk;
1954	struct newblk_hashhead *newblkhd;
1955
1956	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
1957	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
1958		return (1);
1959	if ((flags & DEPALLOC) == 0)
1960		return (0);
1961	FREE_LOCK(&lk);
1962	newblk = malloc(sizeof(union allblk), M_NEWBLK,
1963	    M_SOFTDEP_FLAGS | M_ZERO);
1964	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
1965	ACQUIRE_LOCK(&lk);
1966	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
1967		WORKITEM_FREE(newblk, D_NEWBLK);
1968		return (1);
1969	}
1970	newblk->nb_freefrag = NULL;
1971	LIST_INIT(&newblk->nb_indirdeps);
1972	LIST_INIT(&newblk->nb_newdirblk);
1973	LIST_INIT(&newblk->nb_jwork);
1974	newblk->nb_state = ATTACHED;
1975	newblk->nb_newblkno = newblkno;
1976	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1977	*newblkpp = newblk;
1978	return (0);
1979}
1980
1981/*
1982 * Executed during filesystem system initialization before
1983 * mounting any filesystems.
1984 */
1985void
1986softdep_initialize()
1987{
1988
1989	LIST_INIT(&mkdirlisthd);
1990	max_softdeps = desiredvnodes * 4;
1991	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
1992	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1993	newblk_hashtbl = hashinit(desiredvnodes / 5,  M_NEWBLK, &newblk_hash);
1994	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
1995
1996	/* initialise bioops hack */
1997	bioops.io_start = softdep_disk_io_initiation;
1998	bioops.io_complete = softdep_disk_write_complete;
1999	bioops.io_deallocate = softdep_deallocate_dependencies;
2000	bioops.io_countdeps = softdep_count_dependencies;
2001
2002	/* Initialize the callout with an mtx. */
2003	callout_init_mtx(&softdep_callout, &lk, 0);
2004}
2005
2006/*
2007 * Executed after all filesystems have been unmounted during
2008 * filesystem module unload.
2009 */
2010void
2011softdep_uninitialize()
2012{
2013
2014	callout_drain(&softdep_callout);
2015	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
2016	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
2017	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
2018	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
2019}
2020
2021/*
2022 * Called at mount time to notify the dependency code that a
2023 * filesystem wishes to use it.
2024 */
2025int
2026softdep_mount(devvp, mp, fs, cred)
2027	struct vnode *devvp;
2028	struct mount *mp;
2029	struct fs *fs;
2030	struct ucred *cred;
2031{
2032	struct csum_total cstotal;
2033	struct ufsmount *ump;
2034	struct cg *cgp;
2035	struct buf *bp;
2036	int error, cyl;
2037
2038	MNT_ILOCK(mp);
2039	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2040	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2041		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2042			MNTK_SOFTDEP;
2043		mp->mnt_noasync++;
2044	}
2045	MNT_IUNLOCK(mp);
2046	ump = VFSTOUFS(mp);
2047	LIST_INIT(&ump->softdep_workitem_pending);
2048	LIST_INIT(&ump->softdep_journal_pending);
2049	TAILQ_INIT(&ump->softdep_unlinked);
2050	ump->softdep_worklist_tail = NULL;
2051	ump->softdep_on_worklist = 0;
2052	ump->softdep_deps = 0;
2053	if ((fs->fs_flags & FS_SUJ) &&
2054	    (error = journal_mount(mp, fs, cred)) != 0) {
2055		printf("Failed to start journal: %d\n", error);
2056		return (error);
2057	}
2058	/*
2059	 * When doing soft updates, the counters in the
2060	 * superblock may have gotten out of sync. Recomputation
2061	 * can take a long time and can be deferred for background
2062	 * fsck.  However, the old behavior of scanning the cylinder
2063	 * groups and recalculating them at mount time is available
2064	 * by setting vfs.ffs.compute_summary_at_mount to one.
2065	 */
2066	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2067		return (0);
2068	bzero(&cstotal, sizeof cstotal);
2069	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2070		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2071		    fs->fs_cgsize, cred, &bp)) != 0) {
2072			brelse(bp);
2073			return (error);
2074		}
2075		cgp = (struct cg *)bp->b_data;
2076		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2077		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2078		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2079		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2080		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2081		brelse(bp);
2082	}
2083#ifdef DEBUG
2084	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2085		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2086#endif
2087	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2088	return (0);
2089}
2090
2091void
2092softdep_unmount(mp)
2093	struct mount *mp;
2094{
2095
2096	if (mp->mnt_kern_flag & MNTK_SUJ)
2097		journal_unmount(mp);
2098}
2099
2100struct jblocks {
2101	struct jseglst	jb_segs;	/* TAILQ of current segments. */
2102	struct jseg	*jb_writeseg;	/* Next write to complete. */
2103	struct jextent	*jb_extent;	/* Extent array. */
2104	uint64_t	jb_nextseq;	/* Next sequence number. */
2105	uint64_t	jb_oldestseq;	/* Oldest active sequence number. */
2106	int		jb_avail;	/* Available extents. */
2107	int		jb_used;	/* Last used extent. */
2108	int		jb_head;	/* Allocator head. */
2109	int		jb_off;		/* Allocator extent offset. */
2110	int		jb_blocks;	/* Total disk blocks covered. */
2111	int		jb_free;	/* Total disk blocks free. */
2112	int		jb_min;		/* Minimum free space. */
2113	int		jb_low;		/* Low on space. */
2114	int		jb_age;		/* Insertion time of oldest rec. */
2115	int		jb_suspended;	/* Did journal suspend writes? */
2116};
2117
2118struct jextent {
2119	ufs2_daddr_t	je_daddr;	/* Disk block address. */
2120	int		je_blocks;	/* Disk block count. */
2121};
2122
2123static struct jblocks *
2124jblocks_create(void)
2125{
2126	struct jblocks *jblocks;
2127
2128	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2129	TAILQ_INIT(&jblocks->jb_segs);
2130	jblocks->jb_avail = 10;
2131	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2132	    M_JBLOCKS, M_WAITOK | M_ZERO);
2133
2134	return (jblocks);
2135}
2136
2137static ufs2_daddr_t
2138jblocks_alloc(jblocks, bytes, actual)
2139	struct jblocks *jblocks;
2140	int bytes;
2141	int *actual;
2142{
2143	ufs2_daddr_t daddr;
2144	struct jextent *jext;
2145	int freecnt;
2146	int blocks;
2147
2148	blocks = bytes / DEV_BSIZE;
2149	jext = &jblocks->jb_extent[jblocks->jb_head];
2150	freecnt = jext->je_blocks - jblocks->jb_off;
2151	if (freecnt == 0) {
2152		jblocks->jb_off = 0;
2153		if (++jblocks->jb_head > jblocks->jb_used)
2154			jblocks->jb_head = 0;
2155		jext = &jblocks->jb_extent[jblocks->jb_head];
2156		freecnt = jext->je_blocks;
2157	}
2158	if (freecnt > blocks)
2159		freecnt = blocks;
2160	*actual = freecnt * DEV_BSIZE;
2161	daddr = jext->je_daddr + jblocks->jb_off;
2162	jblocks->jb_off += freecnt;
2163	jblocks->jb_free -= freecnt;
2164
2165	return (daddr);
2166}
2167
2168static void
2169jblocks_free(jblocks, mp, bytes)
2170	struct jblocks *jblocks;
2171	struct mount *mp;
2172	int bytes;
2173{
2174
2175	jblocks->jb_free += bytes / DEV_BSIZE;
2176	if (jblocks->jb_suspended)
2177		worklist_speedup();
2178	wakeup(jblocks);
2179}
2180
2181static void
2182jblocks_destroy(jblocks)
2183	struct jblocks *jblocks;
2184{
2185
2186	if (jblocks->jb_extent)
2187		free(jblocks->jb_extent, M_JBLOCKS);
2188	free(jblocks, M_JBLOCKS);
2189}
2190
2191static void
2192jblocks_add(jblocks, daddr, blocks)
2193	struct jblocks *jblocks;
2194	ufs2_daddr_t daddr;
2195	int blocks;
2196{
2197	struct jextent *jext;
2198
2199	jblocks->jb_blocks += blocks;
2200	jblocks->jb_free += blocks;
2201	jext = &jblocks->jb_extent[jblocks->jb_used];
2202	/* Adding the first block. */
2203	if (jext->je_daddr == 0) {
2204		jext->je_daddr = daddr;
2205		jext->je_blocks = blocks;
2206		return;
2207	}
2208	/* Extending the last extent. */
2209	if (jext->je_daddr + jext->je_blocks == daddr) {
2210		jext->je_blocks += blocks;
2211		return;
2212	}
2213	/* Adding a new extent. */
2214	if (++jblocks->jb_used == jblocks->jb_avail) {
2215		jblocks->jb_avail *= 2;
2216		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2217		    M_JBLOCKS, M_WAITOK | M_ZERO);
2218		memcpy(jext, jblocks->jb_extent,
2219		    sizeof(struct jextent) * jblocks->jb_used);
2220		free(jblocks->jb_extent, M_JBLOCKS);
2221		jblocks->jb_extent = jext;
2222	}
2223	jext = &jblocks->jb_extent[jblocks->jb_used];
2224	jext->je_daddr = daddr;
2225	jext->je_blocks = blocks;
2226	return;
2227}
2228
2229int
2230softdep_journal_lookup(mp, vpp)
2231	struct mount *mp;
2232	struct vnode **vpp;
2233{
2234	struct componentname cnp;
2235	struct vnode *dvp;
2236	ino_t sujournal;
2237	int error;
2238
2239	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2240	if (error)
2241		return (error);
2242	bzero(&cnp, sizeof(cnp));
2243	cnp.cn_nameiop = LOOKUP;
2244	cnp.cn_flags = ISLASTCN;
2245	cnp.cn_thread = curthread;
2246	cnp.cn_cred = curthread->td_ucred;
2247	cnp.cn_pnbuf = SUJ_FILE;
2248	cnp.cn_nameptr = SUJ_FILE;
2249	cnp.cn_namelen = strlen(SUJ_FILE);
2250	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2251	vput(dvp);
2252	if (error != 0)
2253		return (error);
2254	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2255	return (error);
2256}
2257
2258/*
2259 * Open and verify the journal file.
2260 */
2261static int
2262journal_mount(mp, fs, cred)
2263	struct mount *mp;
2264	struct fs *fs;
2265	struct ucred *cred;
2266{
2267	struct jblocks *jblocks;
2268	struct vnode *vp;
2269	struct inode *ip;
2270	ufs2_daddr_t blkno;
2271	int bcount;
2272	int error;
2273	int i;
2274
2275	error = softdep_journal_lookup(mp, &vp);
2276	if (error != 0) {
2277		printf("Failed to find journal.  Use tunefs to create one\n");
2278		return (error);
2279	}
2280	ip = VTOI(vp);
2281	if (ip->i_size < SUJ_MIN) {
2282		error = ENOSPC;
2283		goto out;
2284	}
2285	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2286	jblocks = jblocks_create();
2287	for (i = 0; i < bcount; i++) {
2288		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2289		if (error)
2290			break;
2291		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2292	}
2293	if (error) {
2294		jblocks_destroy(jblocks);
2295		goto out;
2296	}
2297	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2298	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2299	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2300out:
2301	if (error == 0) {
2302		MNT_ILOCK(mp);
2303		mp->mnt_kern_flag |= MNTK_SUJ;
2304		MNT_IUNLOCK(mp);
2305		/*
2306		 * Only validate the journal contents if the
2307		 * filesystem is clean, otherwise we write the logs
2308		 * but they'll never be used.  If the filesystem was
2309		 * still dirty when we mounted it the journal is
2310		 * invalid and a new journal can only be valid if it
2311		 * starts from a clean mount.
2312		 */
2313		if (fs->fs_clean) {
2314			DIP_SET(ip, i_modrev, fs->fs_mtime);
2315			ip->i_flags |= IN_MODIFIED;
2316			ffs_update(vp, 1);
2317		}
2318	}
2319	vput(vp);
2320	return (error);
2321}
2322
2323static void
2324journal_unmount(mp)
2325	struct mount *mp;
2326{
2327	struct ufsmount *ump;
2328
2329	ump = VFSTOUFS(mp);
2330	if (ump->softdep_jblocks)
2331		jblocks_destroy(ump->softdep_jblocks);
2332	ump->softdep_jblocks = NULL;
2333}
2334
2335/*
2336 * Called when a journal record is ready to be written.  Space is allocated
2337 * and the journal entry is created when the journal is flushed to stable
2338 * store.
2339 */
2340static void
2341add_to_journal(wk)
2342	struct worklist *wk;
2343{
2344	struct ufsmount *ump;
2345
2346	mtx_assert(&lk, MA_OWNED);
2347	ump = VFSTOUFS(wk->wk_mp);
2348	if (wk->wk_state & ONWORKLIST)
2349		panic("add_to_journal: %s(0x%X) already on list",
2350		    TYPENAME(wk->wk_type), wk->wk_state);
2351	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2352	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2353		ump->softdep_jblocks->jb_age = ticks;
2354		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2355	} else
2356		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2357	ump->softdep_journal_tail = wk;
2358	ump->softdep_on_journal += 1;
2359}
2360
2361/*
2362 * Remove an arbitrary item for the journal worklist maintain the tail
2363 * pointer.  This happens when a new operation obviates the need to
2364 * journal an old operation.
2365 */
2366static void
2367remove_from_journal(wk)
2368	struct worklist *wk;
2369{
2370	struct ufsmount *ump;
2371
2372	mtx_assert(&lk, MA_OWNED);
2373	ump = VFSTOUFS(wk->wk_mp);
2374#ifdef SUJ_DEBUG
2375	{
2376		struct worklist *wkn;
2377
2378		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2379			if (wkn == wk)
2380				break;
2381		if (wkn == NULL)
2382			panic("remove_from_journal: %p is not in journal", wk);
2383	}
2384#endif
2385	/*
2386	 * We emulate a TAILQ to save space in most structures which do not
2387	 * require TAILQ semantics.  Here we must update the tail position
2388	 * when removing the tail which is not the final entry. This works
2389	 * only if the worklist linkage are at the beginning of the structure.
2390	 */
2391	if (ump->softdep_journal_tail == wk)
2392		ump->softdep_journal_tail =
2393		    (struct worklist *)wk->wk_list.le_prev;
2394
2395	WORKLIST_REMOVE(wk);
2396	ump->softdep_on_journal -= 1;
2397}
2398
2399/*
2400 * Check for journal space as well as dependency limits so the prelink
2401 * code can throttle both journaled and non-journaled filesystems.
2402 * Threshold is 0 for low and 1 for min.
2403 */
2404static int
2405journal_space(ump, thresh)
2406	struct ufsmount *ump;
2407	int thresh;
2408{
2409	struct jblocks *jblocks;
2410	int avail;
2411
2412	jblocks = ump->softdep_jblocks;
2413	if (jblocks == NULL)
2414		return (1);
2415	/*
2416	 * We use a tighter restriction here to prevent request_cleanup()
2417	 * running in threads from running into locks we currently hold.
2418	 */
2419	if (num_inodedep > (max_softdeps / 10) * 9)
2420		return (0);
2421	if (thresh)
2422		thresh = jblocks->jb_min;
2423	else
2424		thresh = jblocks->jb_low;
2425	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2426	avail = jblocks->jb_free - avail;
2427
2428	return (avail > thresh);
2429}
2430
2431static void
2432journal_suspend(ump)
2433	struct ufsmount *ump;
2434{
2435	struct jblocks *jblocks;
2436	struct mount *mp;
2437
2438	mp = UFSTOVFS(ump);
2439	jblocks = ump->softdep_jblocks;
2440	MNT_ILOCK(mp);
2441	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2442		stat_journal_min++;
2443		mp->mnt_kern_flag |= MNTK_SUSPEND;
2444		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2445	}
2446	jblocks->jb_suspended = 1;
2447	MNT_IUNLOCK(mp);
2448}
2449
2450static int
2451journal_unsuspend(struct ufsmount *ump)
2452{
2453	struct jblocks *jblocks;
2454	struct mount *mp;
2455
2456	mp = UFSTOVFS(ump);
2457	jblocks = ump->softdep_jblocks;
2458
2459	if (jblocks != NULL && jblocks->jb_suspended &&
2460	    journal_space(ump, jblocks->jb_min)) {
2461		jblocks->jb_suspended = 0;
2462		FREE_LOCK(&lk);
2463		mp->mnt_susp_owner = curthread;
2464		vfs_write_resume(mp);
2465		ACQUIRE_LOCK(&lk);
2466		return (1);
2467	}
2468	return (0);
2469}
2470
2471/*
2472 * Called before any allocation function to be certain that there is
2473 * sufficient space in the journal prior to creating any new records.
2474 * Since in the case of block allocation we may have multiple locked
2475 * buffers at the time of the actual allocation we can not block
2476 * when the journal records are created.  Doing so would create a deadlock
2477 * if any of these buffers needed to be flushed to reclaim space.  Instead
2478 * we require a sufficiently large amount of available space such that
2479 * each thread in the system could have passed this allocation check and
2480 * still have sufficient free space.  With 20% of a minimum journal size
2481 * of 1MB we have 6553 records available.
2482 */
2483int
2484softdep_prealloc(vp, waitok)
2485	struct vnode *vp;
2486	int waitok;
2487{
2488	struct ufsmount *ump;
2489
2490	if (DOINGSUJ(vp) == 0)
2491		return (0);
2492	ump = VFSTOUFS(vp->v_mount);
2493	ACQUIRE_LOCK(&lk);
2494	if (journal_space(ump, 0)) {
2495		FREE_LOCK(&lk);
2496		return (0);
2497	}
2498	stat_journal_low++;
2499	FREE_LOCK(&lk);
2500	if (waitok == MNT_NOWAIT)
2501		return (ENOSPC);
2502	/*
2503	 * Attempt to sync this vnode once to flush any journal
2504	 * work attached to it.
2505	 */
2506	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2507		ffs_syncvnode(vp, waitok);
2508	ACQUIRE_LOCK(&lk);
2509	process_removes(vp);
2510	if (journal_space(ump, 0) == 0) {
2511		softdep_speedup();
2512		if (journal_space(ump, 1) == 0)
2513			journal_suspend(ump);
2514	}
2515	FREE_LOCK(&lk);
2516
2517	return (0);
2518}
2519
2520/*
2521 * Before adjusting a link count on a vnode verify that we have sufficient
2522 * journal space.  If not, process operations that depend on the currently
2523 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2524 * and softdep flush threads can not acquire these locks to reclaim space.
2525 */
2526static void
2527softdep_prelink(dvp, vp)
2528	struct vnode *dvp;
2529	struct vnode *vp;
2530{
2531	struct ufsmount *ump;
2532
2533	ump = VFSTOUFS(dvp->v_mount);
2534	mtx_assert(&lk, MA_OWNED);
2535	if (journal_space(ump, 0))
2536		return;
2537	stat_journal_low++;
2538	FREE_LOCK(&lk);
2539	if (vp)
2540		ffs_syncvnode(vp, MNT_NOWAIT);
2541	ffs_syncvnode(dvp, MNT_WAIT);
2542	ACQUIRE_LOCK(&lk);
2543	/* Process vp before dvp as it may create .. removes. */
2544	if (vp)
2545		process_removes(vp);
2546	process_removes(dvp);
2547	softdep_speedup();
2548	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2549	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2550	if (journal_space(ump, 0) == 0) {
2551		softdep_speedup();
2552		if (journal_space(ump, 1) == 0)
2553			journal_suspend(ump);
2554	}
2555}
2556
2557static void
2558jseg_write(ump, jblocks, jseg, data)
2559	struct ufsmount *ump;
2560	struct jblocks *jblocks;
2561	struct jseg *jseg;
2562	uint8_t *data;
2563{
2564	struct jsegrec *rec;
2565
2566	rec = (struct jsegrec *)data;
2567	rec->jsr_seq = jseg->js_seq;
2568	rec->jsr_oldest = jblocks->jb_oldestseq;
2569	rec->jsr_cnt = jseg->js_cnt;
2570	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2571	rec->jsr_crc = 0;
2572	rec->jsr_time = ump->um_fs->fs_mtime;
2573}
2574
2575static inline void
2576inoref_write(inoref, jseg, rec)
2577	struct inoref *inoref;
2578	struct jseg *jseg;
2579	struct jrefrec *rec;
2580{
2581
2582	inoref->if_jsegdep->jd_seg = jseg;
2583	rec->jr_ino = inoref->if_ino;
2584	rec->jr_parent = inoref->if_parent;
2585	rec->jr_nlink = inoref->if_nlink;
2586	rec->jr_mode = inoref->if_mode;
2587	rec->jr_diroff = inoref->if_diroff;
2588}
2589
2590static void
2591jaddref_write(jaddref, jseg, data)
2592	struct jaddref *jaddref;
2593	struct jseg *jseg;
2594	uint8_t *data;
2595{
2596	struct jrefrec *rec;
2597
2598	rec = (struct jrefrec *)data;
2599	rec->jr_op = JOP_ADDREF;
2600	inoref_write(&jaddref->ja_ref, jseg, rec);
2601}
2602
2603static void
2604jremref_write(jremref, jseg, data)
2605	struct jremref *jremref;
2606	struct jseg *jseg;
2607	uint8_t *data;
2608{
2609	struct jrefrec *rec;
2610
2611	rec = (struct jrefrec *)data;
2612	rec->jr_op = JOP_REMREF;
2613	inoref_write(&jremref->jr_ref, jseg, rec);
2614}
2615
2616static void
2617jmvref_write(jmvref, jseg, data)
2618	struct jmvref *jmvref;
2619	struct jseg *jseg;
2620	uint8_t *data;
2621{
2622	struct jmvrec *rec;
2623
2624	rec = (struct jmvrec *)data;
2625	rec->jm_op = JOP_MVREF;
2626	rec->jm_ino = jmvref->jm_ino;
2627	rec->jm_parent = jmvref->jm_parent;
2628	rec->jm_oldoff = jmvref->jm_oldoff;
2629	rec->jm_newoff = jmvref->jm_newoff;
2630}
2631
2632static void
2633jnewblk_write(jnewblk, jseg, data)
2634	struct jnewblk *jnewblk;
2635	struct jseg *jseg;
2636	uint8_t *data;
2637{
2638	struct jblkrec *rec;
2639
2640	jnewblk->jn_jsegdep->jd_seg = jseg;
2641	rec = (struct jblkrec *)data;
2642	rec->jb_op = JOP_NEWBLK;
2643	rec->jb_ino = jnewblk->jn_ino;
2644	rec->jb_blkno = jnewblk->jn_blkno;
2645	rec->jb_lbn = jnewblk->jn_lbn;
2646	rec->jb_frags = jnewblk->jn_frags;
2647	rec->jb_oldfrags = jnewblk->jn_oldfrags;
2648}
2649
2650static void
2651jfreeblk_write(jfreeblk, jseg, data)
2652	struct jfreeblk *jfreeblk;
2653	struct jseg *jseg;
2654	uint8_t *data;
2655{
2656	struct jblkrec *rec;
2657
2658	jfreeblk->jf_jsegdep->jd_seg = jseg;
2659	rec = (struct jblkrec *)data;
2660	rec->jb_op = JOP_FREEBLK;
2661	rec->jb_ino = jfreeblk->jf_ino;
2662	rec->jb_blkno = jfreeblk->jf_blkno;
2663	rec->jb_lbn = jfreeblk->jf_lbn;
2664	rec->jb_frags = jfreeblk->jf_frags;
2665	rec->jb_oldfrags = 0;
2666}
2667
2668static void
2669jfreefrag_write(jfreefrag, jseg, data)
2670	struct jfreefrag *jfreefrag;
2671	struct jseg *jseg;
2672	uint8_t *data;
2673{
2674	struct jblkrec *rec;
2675
2676	jfreefrag->fr_jsegdep->jd_seg = jseg;
2677	rec = (struct jblkrec *)data;
2678	rec->jb_op = JOP_FREEBLK;
2679	rec->jb_ino = jfreefrag->fr_ino;
2680	rec->jb_blkno = jfreefrag->fr_blkno;
2681	rec->jb_lbn = jfreefrag->fr_lbn;
2682	rec->jb_frags = jfreefrag->fr_frags;
2683	rec->jb_oldfrags = 0;
2684}
2685
2686static void
2687jtrunc_write(jtrunc, jseg, data)
2688	struct jtrunc *jtrunc;
2689	struct jseg *jseg;
2690	uint8_t *data;
2691{
2692	struct jtrncrec *rec;
2693
2694	rec = (struct jtrncrec *)data;
2695	rec->jt_op = JOP_TRUNC;
2696	rec->jt_ino = jtrunc->jt_ino;
2697	rec->jt_size = jtrunc->jt_size;
2698	rec->jt_extsize = jtrunc->jt_extsize;
2699}
2700
2701/*
2702 * Flush some journal records to disk.
2703 */
2704static void
2705softdep_process_journal(mp, flags)
2706	struct mount *mp;
2707	int flags;
2708{
2709	struct jblocks *jblocks;
2710	struct ufsmount *ump;
2711	struct worklist *wk;
2712	struct jseg *jseg;
2713	struct buf *bp;
2714	uint8_t *data;
2715	struct fs *fs;
2716	int segwritten;
2717	int jrecmin;	/* Minimum records per block. */
2718	int jrecmax;	/* Maximum records per block. */
2719	int size;
2720	int cnt;
2721	int off;
2722	int devbsize;
2723
2724	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
2725		return;
2726	ump = VFSTOUFS(mp);
2727	fs = ump->um_fs;
2728	jblocks = ump->softdep_jblocks;
2729	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
2730	/*
2731	 * We write anywhere between a disk block and fs block.  The upper
2732	 * bound is picked to prevent buffer cache fragmentation and limit
2733	 * processing time per I/O.
2734	 */
2735	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
2736	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
2737	segwritten = 0;
2738	while ((cnt = ump->softdep_on_journal) != 0) {
2739		/*
2740		 * Create a new segment to hold as many as 'cnt' journal
2741		 * entries and add them to the segment.  Notice cnt is
2742		 * off by one to account for the space required by the
2743		 * jsegrec.  If we don't have a full block to log skip it
2744		 * unless we haven't written anything.
2745		 */
2746		cnt++;
2747		if (cnt < jrecmax && segwritten)
2748			break;
2749		/*
2750		 * Verify some free journal space.  softdep_prealloc() should
2751	 	 * guarantee that we don't run out so this is indicative of
2752		 * a problem with the flow control.  Try to recover
2753		 * gracefully in any event.
2754		 */
2755		while (jblocks->jb_free == 0) {
2756			if (flags != MNT_WAIT)
2757				break;
2758			printf("softdep: Out of journal space!\n");
2759			softdep_speedup();
2760			msleep(jblocks, &lk, PRIBIO, "jblocks", hz);
2761		}
2762		FREE_LOCK(&lk);
2763		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
2764		workitem_alloc(&jseg->js_list, D_JSEG, mp);
2765		LIST_INIT(&jseg->js_entries);
2766		jseg->js_state = ATTACHED;
2767		jseg->js_jblocks = jblocks;
2768		bp = geteblk(fs->fs_bsize, 0);
2769		ACQUIRE_LOCK(&lk);
2770		/*
2771		 * If there was a race while we were allocating the block
2772		 * and jseg the entry we care about was likely written.
2773		 * We bail out in both the WAIT and NOWAIT case and assume
2774		 * the caller will loop if the entry it cares about is
2775		 * not written.
2776		 */
2777		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
2778			bp->b_flags |= B_INVAL | B_NOCACHE;
2779			WORKITEM_FREE(jseg, D_JSEG);
2780			FREE_LOCK(&lk);
2781			brelse(bp);
2782			ACQUIRE_LOCK(&lk);
2783			break;
2784		}
2785		/*
2786		 * Calculate the disk block size required for the available
2787		 * records rounded to the min size.
2788		 */
2789		cnt = ump->softdep_on_journal;
2790		if (cnt < jrecmax)
2791			size = howmany(cnt, jrecmin) * devbsize;
2792		else
2793			size = fs->fs_bsize;
2794		/*
2795		 * Allocate a disk block for this journal data and account
2796		 * for truncation of the requested size if enough contiguous
2797		 * space was not available.
2798		 */
2799		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
2800		bp->b_lblkno = bp->b_blkno;
2801		bp->b_offset = bp->b_blkno * DEV_BSIZE;
2802		bp->b_bcount = size;
2803		bp->b_bufobj = &ump->um_devvp->v_bufobj;
2804		bp->b_flags &= ~B_INVAL;
2805		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
2806		/*
2807		 * Initialize our jseg with cnt records.  Assign the next
2808		 * sequence number to it and link it in-order.
2809		 */
2810		cnt = MIN(ump->softdep_on_journal,
2811		    (size / devbsize) * jrecmin);
2812		jseg->js_buf = bp;
2813		jseg->js_cnt = cnt;
2814		jseg->js_refs = cnt + 1;	/* Self ref. */
2815		jseg->js_size = size;
2816		jseg->js_seq = jblocks->jb_nextseq++;
2817		if (TAILQ_EMPTY(&jblocks->jb_segs))
2818			jblocks->jb_oldestseq = jseg->js_seq;
2819		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
2820		if (jblocks->jb_writeseg == NULL)
2821			jblocks->jb_writeseg = jseg;
2822		/*
2823		 * Start filling in records from the pending list.
2824		 */
2825		data = bp->b_data;
2826		off = 0;
2827		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
2828		    != NULL) {
2829			/* Place a segment header on every device block. */
2830			if ((off % devbsize) == 0) {
2831				jseg_write(ump, jblocks, jseg, data);
2832				off += JREC_SIZE;
2833				data = bp->b_data + off;
2834			}
2835			remove_from_journal(wk);
2836			wk->wk_state |= IOSTARTED;
2837			WORKLIST_INSERT(&jseg->js_entries, wk);
2838			switch (wk->wk_type) {
2839			case D_JADDREF:
2840				jaddref_write(WK_JADDREF(wk), jseg, data);
2841				break;
2842			case D_JREMREF:
2843				jremref_write(WK_JREMREF(wk), jseg, data);
2844				break;
2845			case D_JMVREF:
2846				jmvref_write(WK_JMVREF(wk), jseg, data);
2847				break;
2848			case D_JNEWBLK:
2849				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
2850				break;
2851			case D_JFREEBLK:
2852				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
2853				break;
2854			case D_JFREEFRAG:
2855				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
2856				break;
2857			case D_JTRUNC:
2858				jtrunc_write(WK_JTRUNC(wk), jseg, data);
2859				break;
2860			default:
2861				panic("process_journal: Unknown type %s",
2862				    TYPENAME(wk->wk_type));
2863				/* NOTREACHED */
2864			}
2865			if (--cnt == 0)
2866				break;
2867			off += JREC_SIZE;
2868			data = bp->b_data + off;
2869		}
2870		/*
2871		 * Write this one buffer and continue.
2872		 */
2873		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
2874		FREE_LOCK(&lk);
2875		BO_LOCK(bp->b_bufobj);
2876		bgetvp(ump->um_devvp, bp);
2877		BO_UNLOCK(bp->b_bufobj);
2878		if (flags == MNT_NOWAIT)
2879			bawrite(bp);
2880		else
2881			bwrite(bp);
2882		ACQUIRE_LOCK(&lk);
2883	}
2884	/*
2885	 * If we've suspended the filesystem because we ran out of journal
2886	 * space either try to sync it here to make some progress or
2887	 * unsuspend it if we already have.
2888	 */
2889	if (flags == 0 && jblocks->jb_suspended) {
2890		if (journal_unsuspend(ump))
2891			return;
2892		FREE_LOCK(&lk);
2893		VFS_SYNC(mp, MNT_NOWAIT);
2894		ffs_sbupdate(ump, MNT_WAIT, 0);
2895		ACQUIRE_LOCK(&lk);
2896	}
2897}
2898
2899/*
2900 * Complete a jseg, allowing all dependencies awaiting journal writes
2901 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
2902 * structures so that the journal segment can be freed to reclaim space.
2903 */
2904static void
2905complete_jseg(jseg)
2906	struct jseg *jseg;
2907{
2908	struct worklist *wk;
2909	struct jmvref *jmvref;
2910	int waiting;
2911#ifdef INVARIANTS
2912	int i = 0;
2913#endif
2914
2915	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
2916		WORKLIST_REMOVE(wk);
2917		waiting = wk->wk_state & IOWAITING;
2918		wk->wk_state &= ~(IOSTARTED | IOWAITING);
2919		wk->wk_state |= COMPLETE;
2920		KASSERT(i++ < jseg->js_cnt,
2921		    ("handle_written_jseg: overflow %d >= %d",
2922		    i - 1, jseg->js_cnt));
2923		switch (wk->wk_type) {
2924		case D_JADDREF:
2925			handle_written_jaddref(WK_JADDREF(wk));
2926			break;
2927		case D_JREMREF:
2928			handle_written_jremref(WK_JREMREF(wk));
2929			break;
2930		case D_JMVREF:
2931			/* No jsegdep here. */
2932			free_jseg(jseg);
2933			jmvref = WK_JMVREF(wk);
2934			LIST_REMOVE(jmvref, jm_deps);
2935			free_pagedep(jmvref->jm_pagedep);
2936			WORKITEM_FREE(jmvref, D_JMVREF);
2937			break;
2938		case D_JNEWBLK:
2939			handle_written_jnewblk(WK_JNEWBLK(wk));
2940			break;
2941		case D_JFREEBLK:
2942			handle_written_jfreeblk(WK_JFREEBLK(wk));
2943			break;
2944		case D_JFREEFRAG:
2945			handle_written_jfreefrag(WK_JFREEFRAG(wk));
2946			break;
2947		case D_JTRUNC:
2948			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
2949			WORKITEM_FREE(wk, D_JTRUNC);
2950			break;
2951		default:
2952			panic("handle_written_jseg: Unknown type %s",
2953			    TYPENAME(wk->wk_type));
2954			/* NOTREACHED */
2955		}
2956		if (waiting)
2957			wakeup(wk);
2958	}
2959	/* Release the self reference so the structure may be freed. */
2960	free_jseg(jseg);
2961}
2962
2963/*
2964 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
2965 * completions in order only.
2966 */
2967static void
2968handle_written_jseg(jseg, bp)
2969	struct jseg *jseg;
2970	struct buf *bp;
2971{
2972	struct jblocks *jblocks;
2973	struct jseg *jsegn;
2974
2975	if (jseg->js_refs == 0)
2976		panic("handle_written_jseg: No self-reference on %p", jseg);
2977	jseg->js_state |= DEPCOMPLETE;
2978	/*
2979	 * We'll never need this buffer again, set flags so it will be
2980	 * discarded.
2981	 */
2982	bp->b_flags |= B_INVAL | B_NOCACHE;
2983	jblocks = jseg->js_jblocks;
2984	/*
2985	 * Don't allow out of order completions.  If this isn't the first
2986	 * block wait for it to write before we're done.
2987	 */
2988	if (jseg != jblocks->jb_writeseg)
2989		return;
2990	/* Iterate through available jsegs processing their entries. */
2991	do {
2992		jsegn = TAILQ_NEXT(jseg, js_next);
2993		complete_jseg(jseg);
2994		jseg = jsegn;
2995	} while (jseg && jseg->js_state & DEPCOMPLETE);
2996	jblocks->jb_writeseg = jseg;
2997}
2998
2999static inline struct jsegdep *
3000inoref_jseg(inoref)
3001	struct inoref *inoref;
3002{
3003	struct jsegdep *jsegdep;
3004
3005	jsegdep = inoref->if_jsegdep;
3006	inoref->if_jsegdep = NULL;
3007
3008	return (jsegdep);
3009}
3010
3011/*
3012 * Called once a jremref has made it to stable store.  The jremref is marked
3013 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3014 * for the jremref to complete will be awoken by free_jremref.
3015 */
3016static void
3017handle_written_jremref(jremref)
3018	struct jremref *jremref;
3019{
3020	struct inodedep *inodedep;
3021	struct jsegdep *jsegdep;
3022	struct dirrem *dirrem;
3023
3024	/* Grab the jsegdep. */
3025	jsegdep = inoref_jseg(&jremref->jr_ref);
3026	/*
3027	 * Remove us from the inoref list.
3028	 */
3029	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3030	    0, &inodedep) == 0)
3031		panic("handle_written_jremref: Lost inodedep");
3032	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3033	/*
3034	 * Complete the dirrem.
3035	 */
3036	dirrem = jremref->jr_dirrem;
3037	jremref->jr_dirrem = NULL;
3038	LIST_REMOVE(jremref, jr_deps);
3039	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3040	WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
3041	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3042	    (dirrem->dm_state & COMPLETE) != 0)
3043		add_to_worklist(&dirrem->dm_list, 0);
3044	free_jremref(jremref);
3045}
3046
3047/*
3048 * Called once a jaddref has made it to stable store.  The dependency is
3049 * marked complete and any dependent structures are added to the inode
3050 * bufwait list to be completed as soon as it is written.  If a bitmap write
3051 * depends on this entry we move the inode into the inodedephd of the
3052 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3053 */
3054static void
3055handle_written_jaddref(jaddref)
3056	struct jaddref *jaddref;
3057{
3058	struct jsegdep *jsegdep;
3059	struct inodedep *inodedep;
3060	struct diradd *diradd;
3061	struct mkdir *mkdir;
3062
3063	/* Grab the jsegdep. */
3064	jsegdep = inoref_jseg(&jaddref->ja_ref);
3065	mkdir = NULL;
3066	diradd = NULL;
3067	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3068	    0, &inodedep) == 0)
3069		panic("handle_written_jaddref: Lost inodedep.");
3070	if (jaddref->ja_diradd == NULL)
3071		panic("handle_written_jaddref: No dependency");
3072	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3073		diradd = jaddref->ja_diradd;
3074		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3075	} else if (jaddref->ja_state & MKDIR_PARENT) {
3076		mkdir = jaddref->ja_mkdir;
3077		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3078	} else if (jaddref->ja_state & MKDIR_BODY)
3079		mkdir = jaddref->ja_mkdir;
3080	else
3081		panic("handle_written_jaddref: Unknown dependency %p",
3082		    jaddref->ja_diradd);
3083	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3084	/*
3085	 * Remove us from the inode list.
3086	 */
3087	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3088	/*
3089	 * The mkdir may be waiting on the jaddref to clear before freeing.
3090	 */
3091	if (mkdir) {
3092		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3093		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3094		    TYPENAME(mkdir->md_list.wk_type)));
3095		mkdir->md_jaddref = NULL;
3096		diradd = mkdir->md_diradd;
3097		mkdir->md_state |= DEPCOMPLETE;
3098		complete_mkdir(mkdir);
3099	}
3100	WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
3101	if (jaddref->ja_state & NEWBLOCK) {
3102		inodedep->id_state |= ONDEPLIST;
3103		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3104		    inodedep, id_deps);
3105	}
3106	free_jaddref(jaddref);
3107}
3108
3109/*
3110 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3111 * is placed in the bmsafemap to await notification of a written bitmap.
3112 */
3113static void
3114handle_written_jnewblk(jnewblk)
3115	struct jnewblk *jnewblk;
3116{
3117	struct bmsafemap *bmsafemap;
3118	struct jsegdep *jsegdep;
3119	struct newblk *newblk;
3120
3121	/* Grab the jsegdep. */
3122	jsegdep = jnewblk->jn_jsegdep;
3123	jnewblk->jn_jsegdep = NULL;
3124	/*
3125	 * Add the written block to the bmsafemap so it can be notified when
3126	 * the bitmap is on disk.
3127	 */
3128	newblk = jnewblk->jn_newblk;
3129	jnewblk->jn_newblk = NULL;
3130	if (newblk == NULL)
3131		panic("handle_written_jnewblk: No dependency for the segdep.");
3132
3133	newblk->nb_jnewblk = NULL;
3134	bmsafemap = newblk->nb_bmsafemap;
3135	WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
3136	newblk->nb_state |= ONDEPLIST;
3137	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
3138	free_jnewblk(jnewblk);
3139}
3140
3141/*
3142 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3143 * an in-flight allocation that has not yet been committed.  Divorce us
3144 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3145 * to the worklist.
3146 */
3147static void
3148cancel_jfreefrag(jfreefrag)
3149	struct jfreefrag *jfreefrag;
3150{
3151	struct freefrag *freefrag;
3152
3153	if (jfreefrag->fr_jsegdep) {
3154		free_jsegdep(jfreefrag->fr_jsegdep);
3155		jfreefrag->fr_jsegdep = NULL;
3156	}
3157	freefrag = jfreefrag->fr_freefrag;
3158	jfreefrag->fr_freefrag = NULL;
3159	freefrag->ff_jfreefrag = NULL;
3160	free_jfreefrag(jfreefrag);
3161	freefrag->ff_state |= DEPCOMPLETE;
3162}
3163
3164/*
3165 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3166 */
3167static void
3168free_jfreefrag(jfreefrag)
3169	struct jfreefrag *jfreefrag;
3170{
3171
3172	if (jfreefrag->fr_state & IOSTARTED)
3173		WORKLIST_REMOVE(&jfreefrag->fr_list);
3174	else if (jfreefrag->fr_state & ONWORKLIST)
3175		remove_from_journal(&jfreefrag->fr_list);
3176	if (jfreefrag->fr_freefrag != NULL)
3177		panic("free_jfreefrag:  Still attached to a freefrag.");
3178	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3179}
3180
3181/*
3182 * Called when the journal write for a jfreefrag completes.  The parent
3183 * freefrag is added to the worklist if this completes its dependencies.
3184 */
3185static void
3186handle_written_jfreefrag(jfreefrag)
3187	struct jfreefrag *jfreefrag;
3188{
3189	struct jsegdep *jsegdep;
3190	struct freefrag *freefrag;
3191
3192	/* Grab the jsegdep. */
3193	jsegdep = jfreefrag->fr_jsegdep;
3194	jfreefrag->fr_jsegdep = NULL;
3195	freefrag = jfreefrag->fr_freefrag;
3196	if (freefrag == NULL)
3197		panic("handle_written_jfreefrag: No freefrag.");
3198	freefrag->ff_state |= DEPCOMPLETE;
3199	freefrag->ff_jfreefrag = NULL;
3200	WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
3201	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3202		add_to_worklist(&freefrag->ff_list, 0);
3203	jfreefrag->fr_freefrag = NULL;
3204	free_jfreefrag(jfreefrag);
3205}
3206
3207/*
3208 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3209 * is removed from the freeblks list of pending journal writes and the
3210 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3211 * have been reclaimed.
3212 */
3213static void
3214handle_written_jfreeblk(jfreeblk)
3215	struct jfreeblk *jfreeblk;
3216{
3217	struct freeblks *freeblks;
3218	struct jsegdep *jsegdep;
3219
3220	/* Grab the jsegdep. */
3221	jsegdep = jfreeblk->jf_jsegdep;
3222	jfreeblk->jf_jsegdep = NULL;
3223	freeblks = jfreeblk->jf_freeblks;
3224	LIST_REMOVE(jfreeblk, jf_deps);
3225	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
3226	/*
3227	 * If the freeblks is all journaled, we can add it to the worklist.
3228	 */
3229	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
3230	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
3231		/* Remove from the b_dep that is waiting on this write. */
3232		if (freeblks->fb_state & ONWORKLIST)
3233			WORKLIST_REMOVE(&freeblks->fb_list);
3234		add_to_worklist(&freeblks->fb_list, 1);
3235	}
3236
3237	free_jfreeblk(jfreeblk);
3238}
3239
3240static struct jsegdep *
3241newjsegdep(struct worklist *wk)
3242{
3243	struct jsegdep *jsegdep;
3244
3245	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3246	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3247	jsegdep->jd_seg = NULL;
3248
3249	return (jsegdep);
3250}
3251
3252static struct jmvref *
3253newjmvref(dp, ino, oldoff, newoff)
3254	struct inode *dp;
3255	ino_t ino;
3256	off_t oldoff;
3257	off_t newoff;
3258{
3259	struct jmvref *jmvref;
3260
3261	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3262	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3263	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3264	jmvref->jm_parent = dp->i_number;
3265	jmvref->jm_ino = ino;
3266	jmvref->jm_oldoff = oldoff;
3267	jmvref->jm_newoff = newoff;
3268
3269	return (jmvref);
3270}
3271
3272/*
3273 * Allocate a new jremref that tracks the removal of ip from dp with the
3274 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3275 * DEPCOMPLETE as we have all the information required for the journal write
3276 * and the directory has already been removed from the buffer.  The caller
3277 * is responsible for linking the jremref into the pagedep and adding it
3278 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3279 * a DOTDOT addition so handle_workitem_remove() can properly assign
3280 * the jsegdep when we're done.
3281 */
3282static struct jremref *
3283newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3284    off_t diroff, nlink_t nlink)
3285{
3286	struct jremref *jremref;
3287
3288	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3289	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3290	jremref->jr_state = ATTACHED;
3291	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3292	   nlink, ip->i_mode);
3293	jremref->jr_dirrem = dirrem;
3294
3295	return (jremref);
3296}
3297
3298static inline void
3299newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3300    nlink_t nlink, uint16_t mode)
3301{
3302
3303	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3304	inoref->if_diroff = diroff;
3305	inoref->if_ino = ino;
3306	inoref->if_parent = parent;
3307	inoref->if_nlink = nlink;
3308	inoref->if_mode = mode;
3309}
3310
3311/*
3312 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3313 * directory offset may not be known until later.  The caller is responsible
3314 * adding the entry to the journal when this information is available.  nlink
3315 * should be the link count prior to the addition and mode is only required
3316 * to have the correct FMT.
3317 */
3318static struct jaddref *
3319newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3320    uint16_t mode)
3321{
3322	struct jaddref *jaddref;
3323
3324	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3325	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3326	jaddref->ja_state = ATTACHED;
3327	jaddref->ja_mkdir = NULL;
3328	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3329
3330	return (jaddref);
3331}
3332
3333/*
3334 * Create a new free dependency for a freework.  The caller is responsible
3335 * for adjusting the reference count when it has the lock held.  The freedep
3336 * will track an outstanding bitmap write that will ultimately clear the
3337 * freework to continue.
3338 */
3339static struct freedep *
3340newfreedep(struct freework *freework)
3341{
3342	struct freedep *freedep;
3343
3344	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3345	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3346	freedep->fd_freework = freework;
3347
3348	return (freedep);
3349}
3350
3351/*
3352 * Free a freedep structure once the buffer it is linked to is written.  If
3353 * this is the last reference to the freework schedule it for completion.
3354 */
3355static void
3356free_freedep(freedep)
3357	struct freedep *freedep;
3358{
3359
3360	if (--freedep->fd_freework->fw_ref == 0)
3361		add_to_worklist(&freedep->fd_freework->fw_list, 1);
3362	WORKITEM_FREE(freedep, D_FREEDEP);
3363}
3364
3365/*
3366 * Allocate a new freework structure that may be a level in an indirect
3367 * when parent is not NULL or a top level block when it is.  The top level
3368 * freework structures are allocated without lk held and before the freeblks
3369 * is visible outside of softdep_setup_freeblocks().
3370 */
3371static struct freework *
3372newfreework(ump, freeblks, parent, lbn, nb, frags, journal)
3373	struct ufsmount *ump;
3374	struct freeblks *freeblks;
3375	struct freework *parent;
3376	ufs_lbn_t lbn;
3377	ufs2_daddr_t nb;
3378	int frags;
3379	int journal;
3380{
3381	struct freework *freework;
3382
3383	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3384	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3385	freework->fw_freeblks = freeblks;
3386	freework->fw_parent = parent;
3387	freework->fw_lbn = lbn;
3388	freework->fw_blkno = nb;
3389	freework->fw_frags = frags;
3390	freework->fw_ref = ((UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ) == 0 ||
3391	    lbn >= -NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
3392	freework->fw_off = 0;
3393	LIST_INIT(&freework->fw_jwork);
3394
3395	if (parent == NULL) {
3396		WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
3397		    &freework->fw_list);
3398		freeblks->fb_ref++;
3399	}
3400	if (journal)
3401		newjfreeblk(freeblks, lbn, nb, frags);
3402
3403	return (freework);
3404}
3405
3406/*
3407 * Allocate a new jfreeblk to journal top level block pointer when truncating
3408 * a file.  The caller must add this to the worklist when lk is held.
3409 */
3410static struct jfreeblk *
3411newjfreeblk(freeblks, lbn, blkno, frags)
3412	struct freeblks *freeblks;
3413	ufs_lbn_t lbn;
3414	ufs2_daddr_t blkno;
3415	int frags;
3416{
3417	struct jfreeblk *jfreeblk;
3418
3419	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
3420	workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
3421	jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
3422	jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
3423	jfreeblk->jf_ino = freeblks->fb_previousinum;
3424	jfreeblk->jf_lbn = lbn;
3425	jfreeblk->jf_blkno = blkno;
3426	jfreeblk->jf_frags = frags;
3427	jfreeblk->jf_freeblks = freeblks;
3428	LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);
3429
3430	return (jfreeblk);
3431}
3432
3433static void move_newblock_dep(struct jaddref *, struct inodedep *);
3434/*
3435 * If we're canceling a new bitmap we have to search for another ref
3436 * to move into the bmsafemap dep.  This might be better expressed
3437 * with another structure.
3438 */
3439static void
3440move_newblock_dep(jaddref, inodedep)
3441	struct jaddref *jaddref;
3442	struct inodedep *inodedep;
3443{
3444	struct inoref *inoref;
3445	struct jaddref *jaddrefn;
3446
3447	jaddrefn = NULL;
3448	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3449	    inoref = TAILQ_NEXT(inoref, if_deps)) {
3450		if ((jaddref->ja_state & NEWBLOCK) &&
3451		    inoref->if_list.wk_type == D_JADDREF) {
3452			jaddrefn = (struct jaddref *)inoref;
3453			break;
3454		}
3455	}
3456	if (jaddrefn == NULL)
3457		return;
3458	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
3459	jaddrefn->ja_state |= jaddref->ja_state &
3460	    (ATTACHED | UNDONE | NEWBLOCK);
3461	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
3462	jaddref->ja_state |= ATTACHED;
3463	LIST_REMOVE(jaddref, ja_bmdeps);
3464	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
3465	    ja_bmdeps);
3466}
3467
3468/*
3469 * Cancel a jaddref either before it has been written or while it is being
3470 * written.  This happens when a link is removed before the add reaches
3471 * the disk.  The jaddref dependency is kept linked into the bmsafemap
3472 * and inode to prevent the link count or bitmap from reaching the disk
3473 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
3474 * required.
3475 *
3476 * Returns 1 if the canceled addref requires journaling of the remove and
3477 * 0 otherwise.
3478 */
3479static int
3480cancel_jaddref(jaddref, inodedep, wkhd)
3481	struct jaddref *jaddref;
3482	struct inodedep *inodedep;
3483	struct workhead *wkhd;
3484{
3485	struct inoref *inoref;
3486	struct jsegdep *jsegdep;
3487	int needsj;
3488
3489	KASSERT((jaddref->ja_state & COMPLETE) == 0,
3490	    ("cancel_jaddref: Canceling complete jaddref"));
3491	if (jaddref->ja_state & (IOSTARTED | COMPLETE))
3492		needsj = 1;
3493	else
3494		needsj = 0;
3495	if (inodedep == NULL)
3496		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3497		    0, &inodedep) == 0)
3498			panic("cancel_jaddref: Lost inodedep");
3499	/*
3500	 * We must adjust the nlink of any reference operation that follows
3501	 * us so that it is consistent with the in-memory reference.  This
3502	 * ensures that inode nlink rollbacks always have the correct link.
3503	 */
3504	if (needsj == 0) {
3505		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3506		    inoref = TAILQ_NEXT(inoref, if_deps)) {
3507			if (inoref->if_state & GOINGAWAY)
3508				break;
3509			inoref->if_nlink--;
3510		}
3511	}
3512	jsegdep = inoref_jseg(&jaddref->ja_ref);
3513	if (jaddref->ja_state & NEWBLOCK)
3514		move_newblock_dep(jaddref, inodedep);
3515	if (jaddref->ja_state & IOWAITING) {
3516		jaddref->ja_state &= ~IOWAITING;
3517		wakeup(&jaddref->ja_list);
3518	}
3519	jaddref->ja_mkdir = NULL;
3520	if (jaddref->ja_state & IOSTARTED) {
3521		jaddref->ja_state &= ~IOSTARTED;
3522		WORKLIST_REMOVE(&jaddref->ja_list);
3523		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
3524	} else {
3525		free_jsegdep(jsegdep);
3526		if (jaddref->ja_state & DEPCOMPLETE)
3527			remove_from_journal(&jaddref->ja_list);
3528	}
3529	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
3530	/*
3531	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
3532	 * can arrange for them to be freed with the bitmap.  Otherwise we
3533	 * no longer need this addref attached to the inoreflst and it
3534	 * will incorrectly adjust nlink if we leave it.
3535	 */
3536	if ((jaddref->ja_state & NEWBLOCK) == 0) {
3537		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
3538		    if_deps);
3539		jaddref->ja_state |= COMPLETE;
3540		free_jaddref(jaddref);
3541		return (needsj);
3542	}
3543	/*
3544	 * Leave the head of the list for jsegdeps for fast merging.
3545	 */
3546	if (LIST_FIRST(wkhd) != NULL) {
3547		jaddref->ja_state |= ONWORKLIST;
3548		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
3549	} else
3550		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
3551
3552	return (needsj);
3553}
3554
3555/*
3556 * Attempt to free a jaddref structure when some work completes.  This
3557 * should only succeed once the entry is written and all dependencies have
3558 * been notified.
3559 */
3560static void
3561free_jaddref(jaddref)
3562	struct jaddref *jaddref;
3563{
3564
3565	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
3566		return;
3567	if (jaddref->ja_ref.if_jsegdep)
3568		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
3569		    jaddref, jaddref->ja_state);
3570	if (jaddref->ja_state & NEWBLOCK)
3571		LIST_REMOVE(jaddref, ja_bmdeps);
3572	if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
3573		panic("free_jaddref: Bad state %p(0x%X)",
3574		    jaddref, jaddref->ja_state);
3575	if (jaddref->ja_mkdir != NULL)
3576		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
3577	WORKITEM_FREE(jaddref, D_JADDREF);
3578}
3579
3580/*
3581 * Free a jremref structure once it has been written or discarded.
3582 */
3583static void
3584free_jremref(jremref)
3585	struct jremref *jremref;
3586{
3587
3588	if (jremref->jr_ref.if_jsegdep)
3589		free_jsegdep(jremref->jr_ref.if_jsegdep);
3590	if (jremref->jr_state & IOSTARTED)
3591		panic("free_jremref: IO still pending");
3592	WORKITEM_FREE(jremref, D_JREMREF);
3593}
3594
3595/*
3596 * Free a jnewblk structure.
3597 */
3598static void
3599free_jnewblk(jnewblk)
3600	struct jnewblk *jnewblk;
3601{
3602
3603	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
3604		return;
3605	LIST_REMOVE(jnewblk, jn_deps);
3606	if (jnewblk->jn_newblk != NULL)
3607		panic("free_jnewblk: Dependency still attached.");
3608	WORKITEM_FREE(jnewblk, D_JNEWBLK);
3609}
3610
3611/*
3612 * Cancel a jnewblk which has been superseded by a freeblk.  The jnewblk
3613 * is kept linked into the bmsafemap until the free completes, thus
3614 * preventing the modified state from ever reaching disk.  The free
3615 * routine must pass this structure via ffs_blkfree() to
3616 * softdep_setup_freeblks() so there is no race in releasing the space.
3617 */
3618static void
3619cancel_jnewblk(jnewblk, wkhd)
3620	struct jnewblk *jnewblk;
3621	struct workhead *wkhd;
3622{
3623	struct jsegdep *jsegdep;
3624
3625	jsegdep = jnewblk->jn_jsegdep;
3626	jnewblk->jn_jsegdep  = NULL;
3627	free_jsegdep(jsegdep);
3628	jnewblk->jn_newblk = NULL;
3629	jnewblk->jn_state |= GOINGAWAY;
3630	if (jnewblk->jn_state & IOSTARTED) {
3631		jnewblk->jn_state &= ~IOSTARTED;
3632		WORKLIST_REMOVE(&jnewblk->jn_list);
3633	} else
3634		remove_from_journal(&jnewblk->jn_list);
3635	/*
3636	 * Leave the head of the list for jsegdeps for fast merging.
3637	 */
3638	if (LIST_FIRST(wkhd) != NULL) {
3639		jnewblk->jn_state |= ONWORKLIST;
3640		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
3641	} else
3642		WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
3643	if (jnewblk->jn_state & IOWAITING) {
3644		jnewblk->jn_state &= ~IOWAITING;
3645		wakeup(&jnewblk->jn_list);
3646	}
3647}
3648
3649static void
3650free_jfreeblk(jfreeblk)
3651	struct jfreeblk *jfreeblk;
3652{
3653
3654	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
3655}
3656
3657/*
3658 * Release one reference to a jseg and free it if the count reaches 0.  This
3659 * should eventually reclaim journal space as well.
3660 */
3661static void
3662free_jseg(jseg)
3663	struct jseg *jseg;
3664{
3665	struct jblocks *jblocks;
3666
3667	KASSERT(jseg->js_refs > 0,
3668	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
3669	if (--jseg->js_refs != 0)
3670		return;
3671	/*
3672	 * Free only those jsegs which have none allocated before them to
3673	 * preserve the journal space ordering.
3674	 */
3675	jblocks = jseg->js_jblocks;
3676	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
3677		jblocks->jb_oldestseq = jseg->js_seq;
3678		if (jseg->js_refs != 0)
3679			break;
3680		TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
3681		jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
3682		KASSERT(LIST_EMPTY(&jseg->js_entries),
3683		    ("free_jseg: Freed jseg has valid entries."));
3684		WORKITEM_FREE(jseg, D_JSEG);
3685	}
3686}
3687
3688/*
3689 * Release a jsegdep and decrement the jseg count.
3690 */
3691static void
3692free_jsegdep(jsegdep)
3693	struct jsegdep *jsegdep;
3694{
3695
3696	if (jsegdep->jd_seg)
3697		free_jseg(jsegdep->jd_seg);
3698	WORKITEM_FREE(jsegdep, D_JSEGDEP);
3699}
3700
3701/*
3702 * Wait for a journal item to make it to disk.  Initiate journal processing
3703 * if required.
3704 */
3705static void
3706jwait(wk)
3707	struct worklist *wk;
3708{
3709
3710	stat_journal_wait++;
3711	/*
3712	 * If IO has not started we process the journal.  We can't mark the
3713	 * worklist item as IOWAITING because we drop the lock while
3714	 * processing the journal and the worklist entry may be freed after
3715	 * this point.  The caller may call back in and re-issue the request.
3716	 */
3717	if ((wk->wk_state & IOSTARTED) == 0) {
3718		softdep_process_journal(wk->wk_mp, MNT_WAIT);
3719		return;
3720	}
3721	wk->wk_state |= IOWAITING;
3722	msleep(wk, &lk, PRIBIO, "jwait", 0);
3723}
3724
3725/*
3726 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
3727 * appropriate.  This is a convenience function to reduce duplicate code
3728 * for the setup and revert functions below.
3729 */
3730static struct inodedep *
3731inodedep_lookup_ip(ip)
3732	struct inode *ip;
3733{
3734	struct inodedep *inodedep;
3735
3736	KASSERT(ip->i_nlink >= ip->i_effnlink,
3737	    ("inodedep_lookup_ip: bad delta"));
3738	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
3739	    DEPALLOC, &inodedep);
3740	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3741
3742	return (inodedep);
3743}
3744
3745/*
3746 * Create a journal entry that describes a truncate that we're about to
3747 * perform.  The inode allocations and frees between here and the completion
3748 * of the operation are done asynchronously and without journaling.  At
3749 * the end of the operation the vnode is sync'd and the journal space
3750 * is released.  Recovery will discover the partially completed truncate
3751 * and complete it.
3752 */
3753void *
3754softdep_setup_trunc(vp, length, flags)
3755	struct vnode *vp;
3756	off_t length;
3757	int flags;
3758{
3759	struct jsegdep *jsegdep;
3760	struct jtrunc *jtrunc;
3761	struct ufsmount *ump;
3762	struct inode *ip;
3763
3764	softdep_prealloc(vp, MNT_WAIT);
3765	ip = VTOI(vp);
3766	ump = VFSTOUFS(vp->v_mount);
3767	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
3768	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
3769	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
3770	jtrunc->jt_ino = ip->i_number;
3771	jtrunc->jt_extsize = 0;
3772	jtrunc->jt_size = length;
3773	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
3774		jtrunc->jt_extsize = ip->i_din2->di_extsize;
3775	if ((flags & IO_NORMAL) == 0)
3776		jtrunc->jt_size = DIP(ip, i_size);
3777	ACQUIRE_LOCK(&lk);
3778	add_to_journal(&jtrunc->jt_list);
3779	while (jsegdep->jd_seg == NULL) {
3780		stat_jwait_freeblks++;
3781		jwait(&jtrunc->jt_list);
3782	}
3783	FREE_LOCK(&lk);
3784
3785	return (jsegdep);
3786}
3787
3788/*
3789 * After synchronous truncation is complete we free sync the vnode and
3790 * release the jsegdep so the journal space can be freed.
3791 */
3792int
3793softdep_complete_trunc(vp, cookie)
3794	struct vnode *vp;
3795	void *cookie;
3796{
3797	int error;
3798
3799	error = ffs_syncvnode(vp, MNT_WAIT);
3800	ACQUIRE_LOCK(&lk);
3801	free_jsegdep((struct jsegdep *)cookie);
3802	FREE_LOCK(&lk);
3803
3804	return (error);
3805}
3806
3807/*
3808 * Called prior to creating a new inode and linking it to a directory.  The
3809 * jaddref structure must already be allocated by softdep_setup_inomapdep
3810 * and it is discovered here so we can initialize the mode and update
3811 * nlinkdelta.
3812 */
3813void
3814softdep_setup_create(dp, ip)
3815	struct inode *dp;
3816	struct inode *ip;
3817{
3818	struct inodedep *inodedep;
3819	struct jaddref *jaddref;
3820	struct vnode *dvp;
3821
3822	KASSERT(ip->i_nlink == 1,
3823	    ("softdep_setup_create: Invalid link count."));
3824	dvp = ITOV(dp);
3825	ACQUIRE_LOCK(&lk);
3826	inodedep = inodedep_lookup_ip(ip);
3827	if (DOINGSUJ(dvp)) {
3828		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3829		    inoreflst);
3830		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
3831		    ("softdep_setup_create: No addref structure present."));
3832		jaddref->ja_mode = ip->i_mode;
3833	}
3834	softdep_prelink(dvp, NULL);
3835	FREE_LOCK(&lk);
3836}
3837
3838/*
3839 * Create a jaddref structure to track the addition of a DOTDOT link when
3840 * we are reparenting an inode as part of a rename.  This jaddref will be
3841 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
3842 * non-journaling softdep.
3843 */
3844void
3845softdep_setup_dotdot_link(dp, ip)
3846	struct inode *dp;
3847	struct inode *ip;
3848{
3849	struct inodedep *inodedep;
3850	struct jaddref *jaddref;
3851	struct vnode *dvp;
3852	struct vnode *vp;
3853
3854	dvp = ITOV(dp);
3855	vp = ITOV(ip);
3856	jaddref = NULL;
3857	/*
3858	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
3859	 * is used as a normal link would be.
3860	 */
3861	if (DOINGSUJ(dvp))
3862		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
3863		    dp->i_effnlink - 1, dp->i_mode);
3864	ACQUIRE_LOCK(&lk);
3865	inodedep = inodedep_lookup_ip(dp);
3866	if (jaddref)
3867		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
3868		    if_deps);
3869	softdep_prelink(dvp, ITOV(ip));
3870	FREE_LOCK(&lk);
3871}
3872
3873/*
3874 * Create a jaddref structure to track a new link to an inode.  The directory
3875 * offset is not known until softdep_setup_directory_add or
3876 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
3877 * softdep.
3878 */
3879void
3880softdep_setup_link(dp, ip)
3881	struct inode *dp;
3882	struct inode *ip;
3883{
3884	struct inodedep *inodedep;
3885	struct jaddref *jaddref;
3886	struct vnode *dvp;
3887
3888	dvp = ITOV(dp);
3889	jaddref = NULL;
3890	if (DOINGSUJ(dvp))
3891		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
3892		    ip->i_mode);
3893	ACQUIRE_LOCK(&lk);
3894	inodedep = inodedep_lookup_ip(ip);
3895	if (jaddref)
3896		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
3897		    if_deps);
3898	softdep_prelink(dvp, ITOV(ip));
3899	FREE_LOCK(&lk);
3900}
3901
3902/*
3903 * Called to create the jaddref structures to track . and .. references as
3904 * well as lookup and further initialize the incomplete jaddref created
3905 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
3906 * nlinkdelta for non-journaling softdep.
3907 */
3908void
3909softdep_setup_mkdir(dp, ip)
3910	struct inode *dp;
3911	struct inode *ip;
3912{
3913	struct inodedep *inodedep;
3914	struct jaddref *dotdotaddref;
3915	struct jaddref *dotaddref;
3916	struct jaddref *jaddref;
3917	struct vnode *dvp;
3918
3919	dvp = ITOV(dp);
3920	dotaddref = dotdotaddref = NULL;
3921	if (DOINGSUJ(dvp)) {
3922		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
3923		    ip->i_mode);
3924		dotaddref->ja_state |= MKDIR_BODY;
3925		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
3926		    dp->i_effnlink - 1, dp->i_mode);
3927		dotdotaddref->ja_state |= MKDIR_PARENT;
3928	}
3929	ACQUIRE_LOCK(&lk);
3930	inodedep = inodedep_lookup_ip(ip);
3931	if (DOINGSUJ(dvp)) {
3932		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3933		    inoreflst);
3934		KASSERT(jaddref != NULL,
3935		    ("softdep_setup_mkdir: No addref structure present."));
3936		KASSERT(jaddref->ja_parent == dp->i_number,
3937		    ("softdep_setup_mkdir: bad parent %d",
3938		    jaddref->ja_parent));
3939		jaddref->ja_mode = ip->i_mode;
3940		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
3941		    if_deps);
3942	}
3943	inodedep = inodedep_lookup_ip(dp);
3944	if (DOINGSUJ(dvp))
3945		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
3946		    &dotdotaddref->ja_ref, if_deps);
3947	softdep_prelink(ITOV(dp), NULL);
3948	FREE_LOCK(&lk);
3949}
3950
3951/*
3952 * Called to track nlinkdelta of the inode and parent directories prior to
3953 * unlinking a directory.
3954 */
3955void
3956softdep_setup_rmdir(dp, ip)
3957	struct inode *dp;
3958	struct inode *ip;
3959{
3960	struct vnode *dvp;
3961
3962	dvp = ITOV(dp);
3963	ACQUIRE_LOCK(&lk);
3964	(void) inodedep_lookup_ip(ip);
3965	(void) inodedep_lookup_ip(dp);
3966	softdep_prelink(dvp, ITOV(ip));
3967	FREE_LOCK(&lk);
3968}
3969
3970/*
3971 * Called to track nlinkdelta of the inode and parent directories prior to
3972 * unlink.
3973 */
3974void
3975softdep_setup_unlink(dp, ip)
3976	struct inode *dp;
3977	struct inode *ip;
3978{
3979	struct vnode *dvp;
3980
3981	dvp = ITOV(dp);
3982	ACQUIRE_LOCK(&lk);
3983	(void) inodedep_lookup_ip(ip);
3984	(void) inodedep_lookup_ip(dp);
3985	softdep_prelink(dvp, ITOV(ip));
3986	FREE_LOCK(&lk);
3987}
3988
3989/*
3990 * Called to release the journal structures created by a failed non-directory
3991 * creation.  Adjusts nlinkdelta for non-journaling softdep.
3992 */
3993void
3994softdep_revert_create(dp, ip)
3995	struct inode *dp;
3996	struct inode *ip;
3997{
3998	struct inodedep *inodedep;
3999	struct jaddref *jaddref;
4000	struct vnode *dvp;
4001
4002	dvp = ITOV(dp);
4003	ACQUIRE_LOCK(&lk);
4004	inodedep = inodedep_lookup_ip(ip);
4005	if (DOINGSUJ(dvp)) {
4006		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4007		    inoreflst);
4008		KASSERT(jaddref->ja_parent == dp->i_number,
4009		    ("softdep_revert_create: addref parent mismatch"));
4010		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4011	}
4012	FREE_LOCK(&lk);
4013}
4014
4015/*
4016 * Called to release the journal structures created by a failed dotdot link
4017 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4018 */
4019void
4020softdep_revert_dotdot_link(dp, ip)
4021	struct inode *dp;
4022	struct inode *ip;
4023{
4024	struct inodedep *inodedep;
4025	struct jaddref *jaddref;
4026	struct vnode *dvp;
4027
4028	dvp = ITOV(dp);
4029	ACQUIRE_LOCK(&lk);
4030	inodedep = inodedep_lookup_ip(dp);
4031	if (DOINGSUJ(dvp)) {
4032		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4033		    inoreflst);
4034		KASSERT(jaddref->ja_parent == ip->i_number,
4035		    ("softdep_revert_dotdot_link: addref parent mismatch"));
4036		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4037	}
4038	FREE_LOCK(&lk);
4039}
4040
4041/*
4042 * Called to release the journal structures created by a failed link
4043 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4044 */
4045void
4046softdep_revert_link(dp, ip)
4047	struct inode *dp;
4048	struct inode *ip;
4049{
4050	struct inodedep *inodedep;
4051	struct jaddref *jaddref;
4052	struct vnode *dvp;
4053
4054	dvp = ITOV(dp);
4055	ACQUIRE_LOCK(&lk);
4056	inodedep = inodedep_lookup_ip(ip);
4057	if (DOINGSUJ(dvp)) {
4058		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4059		    inoreflst);
4060		KASSERT(jaddref->ja_parent == dp->i_number,
4061		    ("softdep_revert_link: addref parent mismatch"));
4062		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4063	}
4064	FREE_LOCK(&lk);
4065}
4066
4067/*
4068 * Called to release the journal structures created by a failed mkdir
4069 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4070 */
4071void
4072softdep_revert_mkdir(dp, ip)
4073	struct inode *dp;
4074	struct inode *ip;
4075{
4076	struct inodedep *inodedep;
4077	struct jaddref *jaddref;
4078	struct jaddref *dotaddref;
4079	struct vnode *dvp;
4080
4081	dvp = ITOV(dp);
4082
4083	ACQUIRE_LOCK(&lk);
4084	inodedep = inodedep_lookup_ip(dp);
4085	if (DOINGSUJ(dvp)) {
4086		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4087		    inoreflst);
4088		KASSERT(jaddref->ja_parent == ip->i_number,
4089		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4090		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4091	}
4092	inodedep = inodedep_lookup_ip(ip);
4093	if (DOINGSUJ(dvp)) {
4094		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4095		    inoreflst);
4096		KASSERT(jaddref->ja_parent == dp->i_number,
4097		    ("softdep_revert_mkdir: addref parent mismatch"));
4098		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4099		    inoreflst, if_deps);
4100		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4101		KASSERT(dotaddref->ja_parent == ip->i_number,
4102		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4103		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4104	}
4105	FREE_LOCK(&lk);
4106}
4107
4108/*
4109 * Called to correct nlinkdelta after a failed rmdir.
4110 */
4111void
4112softdep_revert_rmdir(dp, ip)
4113	struct inode *dp;
4114	struct inode *ip;
4115{
4116
4117	ACQUIRE_LOCK(&lk);
4118	(void) inodedep_lookup_ip(ip);
4119	(void) inodedep_lookup_ip(dp);
4120	FREE_LOCK(&lk);
4121}
4122
4123/*
4124 * Protecting the freemaps (or bitmaps).
4125 *
4126 * To eliminate the need to execute fsck before mounting a filesystem
4127 * after a power failure, one must (conservatively) guarantee that the
4128 * on-disk copy of the bitmaps never indicate that a live inode or block is
4129 * free.  So, when a block or inode is allocated, the bitmap should be
4130 * updated (on disk) before any new pointers.  When a block or inode is
4131 * freed, the bitmap should not be updated until all pointers have been
4132 * reset.  The latter dependency is handled by the delayed de-allocation
4133 * approach described below for block and inode de-allocation.  The former
4134 * dependency is handled by calling the following procedure when a block or
4135 * inode is allocated. When an inode is allocated an "inodedep" is created
4136 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4137 * Each "inodedep" is also inserted into the hash indexing structure so
4138 * that any additional link additions can be made dependent on the inode
4139 * allocation.
4140 *
4141 * The ufs filesystem maintains a number of free block counts (e.g., per
4142 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4143 * in addition to the bitmaps.  These counts are used to improve efficiency
4144 * during allocation and therefore must be consistent with the bitmaps.
4145 * There is no convenient way to guarantee post-crash consistency of these
4146 * counts with simple update ordering, for two main reasons: (1) The counts
4147 * and bitmaps for a single cylinder group block are not in the same disk
4148 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4149 * be written and the other not.  (2) Some of the counts are located in the
4150 * superblock rather than the cylinder group block. So, we focus our soft
4151 * updates implementation on protecting the bitmaps. When mounting a
4152 * filesystem, we recompute the auxiliary counts from the bitmaps.
4153 */
4154
4155/*
4156 * Called just after updating the cylinder group block to allocate an inode.
4157 */
4158void
4159softdep_setup_inomapdep(bp, ip, newinum)
4160	struct buf *bp;		/* buffer for cylgroup block with inode map */
4161	struct inode *ip;	/* inode related to allocation */
4162	ino_t newinum;		/* new inode number being allocated */
4163{
4164	struct inodedep *inodedep;
4165	struct bmsafemap *bmsafemap;
4166	struct jaddref *jaddref;
4167	struct mount *mp;
4168	struct fs *fs;
4169
4170	mp = UFSTOVFS(ip->i_ump);
4171	fs = ip->i_ump->um_fs;
4172	jaddref = NULL;
4173
4174	/*
4175	 * Allocate the journal reference add structure so that the bitmap
4176	 * can be dependent on it.
4177	 */
4178	if (mp->mnt_kern_flag & MNTK_SUJ) {
4179		jaddref = newjaddref(ip, newinum, 0, 0, 0);
4180		jaddref->ja_state |= NEWBLOCK;
4181	}
4182
4183	/*
4184	 * Create a dependency for the newly allocated inode.
4185	 * Panic if it already exists as something is seriously wrong.
4186	 * Otherwise add it to the dependency list for the buffer holding
4187	 * the cylinder group map from which it was allocated.
4188	 */
4189	ACQUIRE_LOCK(&lk);
4190	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
4191		panic("softdep_setup_inomapdep: dependency %p for new"
4192		    "inode already exists", inodedep);
4193	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
4194	if (jaddref) {
4195		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4196		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4197		    if_deps);
4198	} else {
4199		inodedep->id_state |= ONDEPLIST;
4200		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4201	}
4202	inodedep->id_bmsafemap = bmsafemap;
4203	inodedep->id_state &= ~DEPCOMPLETE;
4204	FREE_LOCK(&lk);
4205}
4206
4207/*
4208 * Called just after updating the cylinder group block to
4209 * allocate block or fragment.
4210 */
4211void
4212softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4213	struct buf *bp;		/* buffer for cylgroup block with block map */
4214	struct mount *mp;	/* filesystem doing allocation */
4215	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4216	int frags;		/* Number of fragments. */
4217	int oldfrags;		/* Previous number of fragments for extend. */
4218{
4219	struct newblk *newblk;
4220	struct bmsafemap *bmsafemap;
4221	struct jnewblk *jnewblk;
4222	struct fs *fs;
4223
4224	fs = VFSTOUFS(mp)->um_fs;
4225	jnewblk = NULL;
4226	/*
4227	 * Create a dependency for the newly allocated block.
4228	 * Add it to the dependency list for the buffer holding
4229	 * the cylinder group map from which it was allocated.
4230	 */
4231	if (mp->mnt_kern_flag & MNTK_SUJ) {
4232		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4233		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4234		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4235		jnewblk->jn_state = ATTACHED;
4236		jnewblk->jn_blkno = newblkno;
4237		jnewblk->jn_frags = frags;
4238		jnewblk->jn_oldfrags = oldfrags;
4239#ifdef SUJ_DEBUG
4240		{
4241			struct cg *cgp;
4242			uint8_t *blksfree;
4243			long bno;
4244			int i;
4245
4246			cgp = (struct cg *)bp->b_data;
4247			blksfree = cg_blksfree(cgp);
4248			bno = dtogd(fs, jnewblk->jn_blkno);
4249			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4250			    i++) {
4251				if (isset(blksfree, bno + i))
4252					panic("softdep_setup_blkmapdep: "
4253					    "free fragment %d from %d-%d "
4254					    "state 0x%X dep %p", i,
4255					    jnewblk->jn_oldfrags,
4256					    jnewblk->jn_frags,
4257					    jnewblk->jn_state,
4258					    jnewblk->jn_newblk);
4259			}
4260		}
4261#endif
4262	}
4263	ACQUIRE_LOCK(&lk);
4264	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4265		panic("softdep_setup_blkmapdep: found block");
4266	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4267	    dtog(fs, newblkno));
4268	if (jnewblk) {
4269		jnewblk->jn_newblk = newblk;
4270		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4271	} else {
4272		newblk->nb_state |= ONDEPLIST;
4273		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4274	}
4275	newblk->nb_bmsafemap = bmsafemap;
4276	newblk->nb_jnewblk = jnewblk;
4277	FREE_LOCK(&lk);
4278}
4279
4280#define	BMSAFEMAP_HASH(fs, cg) \
4281      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4282
4283static int
4284bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4285	struct bmsafemap_hashhead *bmsafemaphd;
4286	struct mount *mp;
4287	int cg;
4288	struct bmsafemap **bmsafemapp;
4289{
4290	struct bmsafemap *bmsafemap;
4291
4292	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4293		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4294			break;
4295	if (bmsafemap) {
4296		*bmsafemapp = bmsafemap;
4297		return (1);
4298	}
4299	*bmsafemapp = NULL;
4300
4301	return (0);
4302}
4303
4304/*
4305 * Find the bmsafemap associated with a cylinder group buffer.
4306 * If none exists, create one. The buffer must be locked when
4307 * this routine is called and this routine must be called with
4308 * splbio interrupts blocked.
4309 */
4310static struct bmsafemap *
4311bmsafemap_lookup(mp, bp, cg)
4312	struct mount *mp;
4313	struct buf *bp;
4314	int cg;
4315{
4316	struct bmsafemap_hashhead *bmsafemaphd;
4317	struct bmsafemap *bmsafemap, *collision;
4318	struct worklist *wk;
4319	struct fs *fs;
4320
4321	mtx_assert(&lk, MA_OWNED);
4322	if (bp)
4323		LIST_FOREACH(wk, &bp->b_dep, wk_list)
4324			if (wk->wk_type == D_BMSAFEMAP)
4325				return (WK_BMSAFEMAP(wk));
4326	fs = VFSTOUFS(mp)->um_fs;
4327	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
4328	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1)
4329		return (bmsafemap);
4330	FREE_LOCK(&lk);
4331	bmsafemap = malloc(sizeof(struct bmsafemap),
4332		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4333	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4334	bmsafemap->sm_buf = bp;
4335	LIST_INIT(&bmsafemap->sm_inodedephd);
4336	LIST_INIT(&bmsafemap->sm_inodedepwr);
4337	LIST_INIT(&bmsafemap->sm_newblkhd);
4338	LIST_INIT(&bmsafemap->sm_newblkwr);
4339	LIST_INIT(&bmsafemap->sm_jaddrefhd);
4340	LIST_INIT(&bmsafemap->sm_jnewblkhd);
4341	ACQUIRE_LOCK(&lk);
4342	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
4343		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4344		return (collision);
4345	}
4346	bmsafemap->sm_cg = cg;
4347	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
4348	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
4349	return (bmsafemap);
4350}
4351
4352/*
4353 * Direct block allocation dependencies.
4354 *
4355 * When a new block is allocated, the corresponding disk locations must be
4356 * initialized (with zeros or new data) before the on-disk inode points to
4357 * them.  Also, the freemap from which the block was allocated must be
4358 * updated (on disk) before the inode's pointer. These two dependencies are
4359 * independent of each other and are needed for all file blocks and indirect
4360 * blocks that are pointed to directly by the inode.  Just before the
4361 * "in-core" version of the inode is updated with a newly allocated block
4362 * number, a procedure (below) is called to setup allocation dependency
4363 * structures.  These structures are removed when the corresponding
4364 * dependencies are satisfied or when the block allocation becomes obsolete
4365 * (i.e., the file is deleted, the block is de-allocated, or the block is a
4366 * fragment that gets upgraded).  All of these cases are handled in
4367 * procedures described later.
4368 *
4369 * When a file extension causes a fragment to be upgraded, either to a larger
4370 * fragment or to a full block, the on-disk location may change (if the
4371 * previous fragment could not simply be extended). In this case, the old
4372 * fragment must be de-allocated, but not until after the inode's pointer has
4373 * been updated. In most cases, this is handled by later procedures, which
4374 * will construct a "freefrag" structure to be added to the workitem queue
4375 * when the inode update is complete (or obsolete).  The main exception to
4376 * this is when an allocation occurs while a pending allocation dependency
4377 * (for the same block pointer) remains.  This case is handled in the main
4378 * allocation dependency setup procedure by immediately freeing the
4379 * unreferenced fragments.
4380 */
4381void
4382softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4383	struct inode *ip;	/* inode to which block is being added */
4384	ufs_lbn_t off;		/* block pointer within inode */
4385	ufs2_daddr_t newblkno;	/* disk block number being added */
4386	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
4387	long newsize;		/* size of new block */
4388	long oldsize;		/* size of new block */
4389	struct buf *bp;		/* bp for allocated block */
4390{
4391	struct allocdirect *adp, *oldadp;
4392	struct allocdirectlst *adphead;
4393	struct freefrag *freefrag;
4394	struct inodedep *inodedep;
4395	struct pagedep *pagedep;
4396	struct jnewblk *jnewblk;
4397	struct newblk *newblk;
4398	struct mount *mp;
4399	ufs_lbn_t lbn;
4400
4401	lbn = bp->b_lblkno;
4402	mp = UFSTOVFS(ip->i_ump);
4403	if (oldblkno && oldblkno != newblkno)
4404		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4405	else
4406		freefrag = NULL;
4407
4408	ACQUIRE_LOCK(&lk);
4409	if (off >= NDADDR) {
4410		if (lbn > 0)
4411			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4412			    lbn, off);
4413		/* allocating an indirect block */
4414		if (oldblkno != 0)
4415			panic("softdep_setup_allocdirect: non-zero indir");
4416	} else {
4417		if (off != lbn)
4418			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4419			    lbn, off);
4420		/*
4421		 * Allocating a direct block.
4422		 *
4423		 * If we are allocating a directory block, then we must
4424		 * allocate an associated pagedep to track additions and
4425		 * deletions.
4426		 */
4427		if ((ip->i_mode & IFMT) == IFDIR &&
4428		    pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
4429		    &pagedep) == 0)
4430			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
4431	}
4432	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4433		panic("softdep_setup_allocdirect: lost block");
4434	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4435	    ("softdep_setup_allocdirect: newblk already initialized"));
4436	/*
4437	 * Convert the newblk to an allocdirect.
4438	 */
4439	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4440	adp = (struct allocdirect *)newblk;
4441	newblk->nb_freefrag = freefrag;
4442	adp->ad_offset = off;
4443	adp->ad_oldblkno = oldblkno;
4444	adp->ad_newsize = newsize;
4445	adp->ad_oldsize = oldsize;
4446
4447	/*
4448	 * Finish initializing the journal.
4449	 */
4450	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4451		jnewblk->jn_ino = ip->i_number;
4452		jnewblk->jn_lbn = lbn;
4453		add_to_journal(&jnewblk->jn_list);
4454	}
4455	if (freefrag && freefrag->ff_jfreefrag != NULL)
4456		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4457	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4458	adp->ad_inodedep = inodedep;
4459
4460	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4461	/*
4462	 * The list of allocdirects must be kept in sorted and ascending
4463	 * order so that the rollback routines can quickly determine the
4464	 * first uncommitted block (the size of the file stored on disk
4465	 * ends at the end of the lowest committed fragment, or if there
4466	 * are no fragments, at the end of the highest committed block).
4467	 * Since files generally grow, the typical case is that the new
4468	 * block is to be added at the end of the list. We speed this
4469	 * special case by checking against the last allocdirect in the
4470	 * list before laboriously traversing the list looking for the
4471	 * insertion point.
4472	 */
4473	adphead = &inodedep->id_newinoupdt;
4474	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4475	if (oldadp == NULL || oldadp->ad_offset <= off) {
4476		/* insert at end of list */
4477		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4478		if (oldadp != NULL && oldadp->ad_offset == off)
4479			allocdirect_merge(adphead, adp, oldadp);
4480		FREE_LOCK(&lk);
4481		return;
4482	}
4483	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4484		if (oldadp->ad_offset >= off)
4485			break;
4486	}
4487	if (oldadp == NULL)
4488		panic("softdep_setup_allocdirect: lost entry");
4489	/* insert in middle of list */
4490	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4491	if (oldadp->ad_offset == off)
4492		allocdirect_merge(adphead, adp, oldadp);
4493
4494	FREE_LOCK(&lk);
4495}
4496
4497/*
4498 * Replace an old allocdirect dependency with a newer one.
4499 * This routine must be called with splbio interrupts blocked.
4500 */
4501static void
4502allocdirect_merge(adphead, newadp, oldadp)
4503	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
4504	struct allocdirect *newadp;	/* allocdirect being added */
4505	struct allocdirect *oldadp;	/* existing allocdirect being checked */
4506{
4507	struct worklist *wk;
4508	struct freefrag *freefrag;
4509	struct newdirblk *newdirblk;
4510
4511	freefrag = NULL;
4512	mtx_assert(&lk, MA_OWNED);
4513	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
4514	    newadp->ad_oldsize != oldadp->ad_newsize ||
4515	    newadp->ad_offset >= NDADDR)
4516		panic("%s %jd != new %jd || old size %ld != new %ld",
4517		    "allocdirect_merge: old blkno",
4518		    (intmax_t)newadp->ad_oldblkno,
4519		    (intmax_t)oldadp->ad_newblkno,
4520		    newadp->ad_oldsize, oldadp->ad_newsize);
4521	newadp->ad_oldblkno = oldadp->ad_oldblkno;
4522	newadp->ad_oldsize = oldadp->ad_oldsize;
4523	/*
4524	 * If the old dependency had a fragment to free or had never
4525	 * previously had a block allocated, then the new dependency
4526	 * can immediately post its freefrag and adopt the old freefrag.
4527	 * This action is done by swapping the freefrag dependencies.
4528	 * The new dependency gains the old one's freefrag, and the
4529	 * old one gets the new one and then immediately puts it on
4530	 * the worklist when it is freed by free_newblk. It is
4531	 * not possible to do this swap when the old dependency had a
4532	 * non-zero size but no previous fragment to free. This condition
4533	 * arises when the new block is an extension of the old block.
4534	 * Here, the first part of the fragment allocated to the new
4535	 * dependency is part of the block currently claimed on disk by
4536	 * the old dependency, so cannot legitimately be freed until the
4537	 * conditions for the new dependency are fulfilled.
4538	 */
4539	freefrag = newadp->ad_freefrag;
4540	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
4541		newadp->ad_freefrag = oldadp->ad_freefrag;
4542		oldadp->ad_freefrag = freefrag;
4543	}
4544	/*
4545	 * If we are tracking a new directory-block allocation,
4546	 * move it from the old allocdirect to the new allocdirect.
4547	 */
4548	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
4549		newdirblk = WK_NEWDIRBLK(wk);
4550		WORKLIST_REMOVE(&newdirblk->db_list);
4551		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
4552			panic("allocdirect_merge: extra newdirblk");
4553		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
4554	}
4555	TAILQ_REMOVE(adphead, oldadp, ad_next);
4556	/*
4557	 * We need to move any journal dependencies over to the freefrag
4558	 * that releases this block if it exists.  Otherwise we are
4559	 * extending an existing block and we'll wait until that is
4560	 * complete to release the journal space and extend the
4561	 * new journal to cover this old space as well.
4562	 */
4563	if (freefrag == NULL) {
4564		struct jnewblk *jnewblk;
4565		struct jnewblk *njnewblk;
4566
4567		if (oldadp->ad_newblkno != newadp->ad_newblkno)
4568			panic("allocdirect_merge: %jd != %jd",
4569			    oldadp->ad_newblkno, newadp->ad_newblkno);
4570		jnewblk = oldadp->ad_block.nb_jnewblk;
4571		cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
4572		/*
4573		 * We have an unwritten jnewblk, we need to merge the
4574		 * frag bits with our own.  The newer adp's journal can not
4575		 * be written prior to the old one so no need to check for
4576		 * it here.
4577		 */
4578		if (jnewblk) {
4579			njnewblk = newadp->ad_block.nb_jnewblk;
4580			if (njnewblk == NULL)
4581				panic("allocdirect_merge: No jnewblk");
4582			if (jnewblk->jn_state & UNDONE) {
4583				njnewblk->jn_state |= UNDONE | NEWBLOCK;
4584				njnewblk->jn_state &= ~ATTACHED;
4585				jnewblk->jn_state &= ~UNDONE;
4586			}
4587			njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
4588			WORKLIST_REMOVE(&jnewblk->jn_list);
4589			jnewblk->jn_state |= ATTACHED | COMPLETE;
4590			free_jnewblk(jnewblk);
4591		}
4592	} else {
4593		/*
4594		 * We can skip journaling for this freefrag and just complete
4595		 * any pending journal work for the allocdirect that is being
4596		 * removed after the freefrag completes.
4597		 */
4598		if (freefrag->ff_jfreefrag)
4599			cancel_jfreefrag(freefrag->ff_jfreefrag);
4600		cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
4601	}
4602	free_newblk(&oldadp->ad_block);
4603}
4604
4605/*
4606 * Allocate a jfreefrag structure to journal a single block free.
4607 */
4608static struct jfreefrag *
4609newjfreefrag(freefrag, ip, blkno, size, lbn)
4610	struct freefrag *freefrag;
4611	struct inode *ip;
4612	ufs2_daddr_t blkno;
4613	long size;
4614	ufs_lbn_t lbn;
4615{
4616	struct jfreefrag *jfreefrag;
4617	struct fs *fs;
4618
4619	fs = ip->i_fs;
4620	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
4621	    M_SOFTDEP_FLAGS);
4622	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
4623	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
4624	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
4625	jfreefrag->fr_ino = ip->i_number;
4626	jfreefrag->fr_lbn = lbn;
4627	jfreefrag->fr_blkno = blkno;
4628	jfreefrag->fr_frags = numfrags(fs, size);
4629	jfreefrag->fr_freefrag = freefrag;
4630
4631	return (jfreefrag);
4632}
4633
4634/*
4635 * Allocate a new freefrag structure.
4636 */
4637static struct freefrag *
4638newfreefrag(ip, blkno, size, lbn)
4639	struct inode *ip;
4640	ufs2_daddr_t blkno;
4641	long size;
4642	ufs_lbn_t lbn;
4643{
4644	struct freefrag *freefrag;
4645	struct fs *fs;
4646
4647	fs = ip->i_fs;
4648	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
4649		panic("newfreefrag: frag size");
4650	freefrag = malloc(sizeof(struct freefrag),
4651	    M_FREEFRAG, M_SOFTDEP_FLAGS);
4652	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
4653	freefrag->ff_state = ATTACHED;
4654	LIST_INIT(&freefrag->ff_jwork);
4655	freefrag->ff_inum = ip->i_number;
4656	freefrag->ff_blkno = blkno;
4657	freefrag->ff_fragsize = size;
4658
4659	if (fs->fs_flags & FS_SUJ) {
4660		freefrag->ff_jfreefrag =
4661		    newjfreefrag(freefrag, ip, blkno, size, lbn);
4662	} else {
4663		freefrag->ff_state |= DEPCOMPLETE;
4664		freefrag->ff_jfreefrag = NULL;
4665	}
4666
4667	return (freefrag);
4668}
4669
4670/*
4671 * This workitem de-allocates fragments that were replaced during
4672 * file block allocation.
4673 */
4674static void
4675handle_workitem_freefrag(freefrag)
4676	struct freefrag *freefrag;
4677{
4678	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
4679	struct workhead wkhd;
4680
4681	/*
4682	 * It would be illegal to add new completion items to the
4683	 * freefrag after it was schedule to be done so it must be
4684	 * safe to modify the list head here.
4685	 */
4686	LIST_INIT(&wkhd);
4687	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
4688	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
4689	    freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
4690	ACQUIRE_LOCK(&lk);
4691	WORKITEM_FREE(freefrag, D_FREEFRAG);
4692	FREE_LOCK(&lk);
4693}
4694
4695/*
4696 * Set up a dependency structure for an external attributes data block.
4697 * This routine follows much of the structure of softdep_setup_allocdirect.
4698 * See the description of softdep_setup_allocdirect above for details.
4699 */
4700void
4701softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4702	struct inode *ip;
4703	ufs_lbn_t off;
4704	ufs2_daddr_t newblkno;
4705	ufs2_daddr_t oldblkno;
4706	long newsize;
4707	long oldsize;
4708	struct buf *bp;
4709{
4710	struct allocdirect *adp, *oldadp;
4711	struct allocdirectlst *adphead;
4712	struct freefrag *freefrag;
4713	struct inodedep *inodedep;
4714	struct jnewblk *jnewblk;
4715	struct newblk *newblk;
4716	struct mount *mp;
4717	ufs_lbn_t lbn;
4718
4719	if (off >= NXADDR)
4720		panic("softdep_setup_allocext: lbn %lld > NXADDR",
4721		    (long long)off);
4722
4723	lbn = bp->b_lblkno;
4724	mp = UFSTOVFS(ip->i_ump);
4725	if (oldblkno && oldblkno != newblkno)
4726		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4727	else
4728		freefrag = NULL;
4729
4730	ACQUIRE_LOCK(&lk);
4731	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4732		panic("softdep_setup_allocext: lost block");
4733	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4734	    ("softdep_setup_allocext: newblk already initialized"));
4735	/*
4736	 * Convert the newblk to an allocdirect.
4737	 */
4738	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4739	adp = (struct allocdirect *)newblk;
4740	newblk->nb_freefrag = freefrag;
4741	adp->ad_offset = off;
4742	adp->ad_oldblkno = oldblkno;
4743	adp->ad_newsize = newsize;
4744	adp->ad_oldsize = oldsize;
4745	adp->ad_state |=  EXTDATA;
4746
4747	/*
4748	 * Finish initializing the journal.
4749	 */
4750	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4751		jnewblk->jn_ino = ip->i_number;
4752		jnewblk->jn_lbn = lbn;
4753		add_to_journal(&jnewblk->jn_list);
4754	}
4755	if (freefrag && freefrag->ff_jfreefrag != NULL)
4756		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4757	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4758	adp->ad_inodedep = inodedep;
4759
4760	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4761	/*
4762	 * The list of allocdirects must be kept in sorted and ascending
4763	 * order so that the rollback routines can quickly determine the
4764	 * first uncommitted block (the size of the file stored on disk
4765	 * ends at the end of the lowest committed fragment, or if there
4766	 * are no fragments, at the end of the highest committed block).
4767	 * Since files generally grow, the typical case is that the new
4768	 * block is to be added at the end of the list. We speed this
4769	 * special case by checking against the last allocdirect in the
4770	 * list before laboriously traversing the list looking for the
4771	 * insertion point.
4772	 */
4773	adphead = &inodedep->id_newextupdt;
4774	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4775	if (oldadp == NULL || oldadp->ad_offset <= off) {
4776		/* insert at end of list */
4777		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4778		if (oldadp != NULL && oldadp->ad_offset == off)
4779			allocdirect_merge(adphead, adp, oldadp);
4780		FREE_LOCK(&lk);
4781		return;
4782	}
4783	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4784		if (oldadp->ad_offset >= off)
4785			break;
4786	}
4787	if (oldadp == NULL)
4788		panic("softdep_setup_allocext: lost entry");
4789	/* insert in middle of list */
4790	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4791	if (oldadp->ad_offset == off)
4792		allocdirect_merge(adphead, adp, oldadp);
4793	FREE_LOCK(&lk);
4794}
4795
4796/*
4797 * Indirect block allocation dependencies.
4798 *
4799 * The same dependencies that exist for a direct block also exist when
4800 * a new block is allocated and pointed to by an entry in a block of
4801 * indirect pointers. The undo/redo states described above are also
4802 * used here. Because an indirect block contains many pointers that
4803 * may have dependencies, a second copy of the entire in-memory indirect
4804 * block is kept. The buffer cache copy is always completely up-to-date.
4805 * The second copy, which is used only as a source for disk writes,
4806 * contains only the safe pointers (i.e., those that have no remaining
4807 * update dependencies). The second copy is freed when all pointers
4808 * are safe. The cache is not allowed to replace indirect blocks with
4809 * pending update dependencies. If a buffer containing an indirect
4810 * block with dependencies is written, these routines will mark it
4811 * dirty again. It can only be successfully written once all the
4812 * dependencies are removed. The ffs_fsync routine in conjunction with
4813 * softdep_sync_metadata work together to get all the dependencies
4814 * removed so that a file can be successfully written to disk. Three
4815 * procedures are used when setting up indirect block pointer
4816 * dependencies. The division is necessary because of the organization
4817 * of the "balloc" routine and because of the distinction between file
4818 * pages and file metadata blocks.
4819 */
4820
4821/*
4822 * Allocate a new allocindir structure.
4823 */
4824static struct allocindir *
4825newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
4826	struct inode *ip;	/* inode for file being extended */
4827	int ptrno;		/* offset of pointer in indirect block */
4828	ufs2_daddr_t newblkno;	/* disk block number being added */
4829	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
4830	ufs_lbn_t lbn;
4831{
4832	struct newblk *newblk;
4833	struct allocindir *aip;
4834	struct freefrag *freefrag;
4835	struct jnewblk *jnewblk;
4836
4837	if (oldblkno)
4838		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
4839	else
4840		freefrag = NULL;
4841	ACQUIRE_LOCK(&lk);
4842	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
4843		panic("new_allocindir: lost block");
4844	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4845	    ("newallocindir: newblk already initialized"));
4846	newblk->nb_list.wk_type = D_ALLOCINDIR;
4847	newblk->nb_freefrag = freefrag;
4848	aip = (struct allocindir *)newblk;
4849	aip->ai_offset = ptrno;
4850	aip->ai_oldblkno = oldblkno;
4851	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4852		jnewblk->jn_ino = ip->i_number;
4853		jnewblk->jn_lbn = lbn;
4854		add_to_journal(&jnewblk->jn_list);
4855	}
4856	if (freefrag && freefrag->ff_jfreefrag != NULL)
4857		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4858	return (aip);
4859}
4860
4861/*
4862 * Called just before setting an indirect block pointer
4863 * to a newly allocated file page.
4864 */
4865void
4866softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
4867	struct inode *ip;	/* inode for file being extended */
4868	ufs_lbn_t lbn;		/* allocated block number within file */
4869	struct buf *bp;		/* buffer with indirect blk referencing page */
4870	int ptrno;		/* offset of pointer in indirect block */
4871	ufs2_daddr_t newblkno;	/* disk block number being added */
4872	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
4873	struct buf *nbp;	/* buffer holding allocated page */
4874{
4875	struct inodedep *inodedep;
4876	struct allocindir *aip;
4877	struct pagedep *pagedep;
4878	struct mount *mp;
4879
4880	if (lbn != nbp->b_lblkno)
4881		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
4882		    lbn, bp->b_lblkno);
4883	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
4884	mp = UFSTOVFS(ip->i_ump);
4885	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
4886	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
4887	/*
4888	 * If we are allocating a directory page, then we must
4889	 * allocate an associated pagedep to track additions and
4890	 * deletions.
4891	 */
4892	if ((ip->i_mode & IFMT) == IFDIR &&
4893	    pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
4894		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
4895	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4896	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4897	FREE_LOCK(&lk);
4898}
4899
4900/*
4901 * Called just before setting an indirect block pointer to a
4902 * newly allocated indirect block.
4903 */
4904void
4905softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
4906	struct buf *nbp;	/* newly allocated indirect block */
4907	struct inode *ip;	/* inode for file being extended */
4908	struct buf *bp;		/* indirect block referencing allocated block */
4909	int ptrno;		/* offset of pointer in indirect block */
4910	ufs2_daddr_t newblkno;	/* disk block number being added */
4911{
4912	struct inodedep *inodedep;
4913	struct allocindir *aip;
4914	ufs_lbn_t lbn;
4915
4916	lbn = nbp->b_lblkno;
4917	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
4918	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
4919	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
4920	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4921	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4922	FREE_LOCK(&lk);
4923}
4924
4925static void
4926indirdep_complete(indirdep)
4927	struct indirdep *indirdep;
4928{
4929	struct allocindir *aip;
4930
4931	LIST_REMOVE(indirdep, ir_next);
4932	indirdep->ir_state &= ~ONDEPLIST;
4933
4934	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
4935		LIST_REMOVE(aip, ai_next);
4936		free_newblk(&aip->ai_block);
4937	}
4938	/*
4939	 * If this indirdep is not attached to a buf it was simply waiting
4940	 * on completion to clear completehd.  free_indirdep() asserts
4941	 * that nothing is dangling.
4942	 */
4943	if ((indirdep->ir_state & ONWORKLIST) == 0)
4944		free_indirdep(indirdep);
4945}
4946
4947/*
4948 * Called to finish the allocation of the "aip" allocated
4949 * by one of the two routines above.
4950 */
4951static void
4952setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
4953	struct buf *bp;		/* in-memory copy of the indirect block */
4954	struct inode *ip;	/* inode for file being extended */
4955	struct inodedep *inodedep; /* Inodedep for ip */
4956	struct allocindir *aip;	/* allocindir allocated by the above routines */
4957	ufs_lbn_t lbn;		/* Logical block number for this block. */
4958{
4959	struct worklist *wk;
4960	struct fs *fs;
4961	struct newblk *newblk;
4962	struct indirdep *indirdep, *newindirdep;
4963	struct allocindir *oldaip;
4964	struct freefrag *freefrag;
4965	struct mount *mp;
4966	ufs2_daddr_t blkno;
4967
4968	mp = UFSTOVFS(ip->i_ump);
4969	fs = ip->i_fs;
4970	mtx_assert(&lk, MA_OWNED);
4971	if (bp->b_lblkno >= 0)
4972		panic("setup_allocindir_phase2: not indir blk");
4973	for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
4974		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4975			if (wk->wk_type != D_INDIRDEP)
4976				continue;
4977			indirdep = WK_INDIRDEP(wk);
4978			break;
4979		}
4980		if (indirdep == NULL && newindirdep) {
4981			indirdep = newindirdep;
4982			newindirdep = NULL;
4983			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
4984			if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
4985			    &newblk)) {
4986				indirdep->ir_state |= ONDEPLIST;
4987				LIST_INSERT_HEAD(&newblk->nb_indirdeps,
4988				    indirdep, ir_next);
4989			} else
4990				indirdep->ir_state |= DEPCOMPLETE;
4991		}
4992		if (indirdep) {
4993			aip->ai_indirdep = indirdep;
4994			/*
4995			 * Check to see if there is an existing dependency
4996			 * for this block. If there is, merge the old
4997			 * dependency into the new one.  This happens
4998			 * as a result of reallocblk only.
4999			 */
5000			if (aip->ai_oldblkno == 0)
5001				oldaip = NULL;
5002			else
5003
5004				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
5005				    ai_next)
5006					if (oldaip->ai_offset == aip->ai_offset)
5007						break;
5008			if (oldaip != NULL)
5009				freefrag = allocindir_merge(aip, oldaip);
5010			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5011			KASSERT(aip->ai_offset >= 0 &&
5012			    aip->ai_offset < NINDIR(ip->i_ump->um_fs),
5013			    ("setup_allocindir_phase2: Bad offset %d",
5014			    aip->ai_offset));
5015			KASSERT(indirdep->ir_savebp != NULL,
5016			    ("setup_allocindir_phase2 NULL ir_savebp"));
5017			if (ip->i_ump->um_fstype == UFS1)
5018				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
5019				    [aip->ai_offset] = aip->ai_oldblkno;
5020			else
5021				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
5022				    [aip->ai_offset] = aip->ai_oldblkno;
5023			FREE_LOCK(&lk);
5024			if (freefrag != NULL)
5025				handle_workitem_freefrag(freefrag);
5026		} else
5027			FREE_LOCK(&lk);
5028		if (newindirdep) {
5029			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
5030			brelse(newindirdep->ir_savebp);
5031			ACQUIRE_LOCK(&lk);
5032			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
5033			if (indirdep)
5034				break;
5035			FREE_LOCK(&lk);
5036		}
5037		if (indirdep) {
5038			ACQUIRE_LOCK(&lk);
5039			break;
5040		}
5041		newindirdep = malloc(sizeof(struct indirdep),
5042			M_INDIRDEP, M_SOFTDEP_FLAGS);
5043		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5044		newindirdep->ir_state = ATTACHED;
5045		if (ip->i_ump->um_fstype == UFS1)
5046			newindirdep->ir_state |= UFS1FMT;
5047		newindirdep->ir_saveddata = NULL;
5048		LIST_INIT(&newindirdep->ir_deplisthd);
5049		LIST_INIT(&newindirdep->ir_donehd);
5050		LIST_INIT(&newindirdep->ir_writehd);
5051		LIST_INIT(&newindirdep->ir_completehd);
5052		LIST_INIT(&newindirdep->ir_jwork);
5053		if (bp->b_blkno == bp->b_lblkno) {
5054			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5055			    NULL, NULL);
5056			bp->b_blkno = blkno;
5057		}
5058		newindirdep->ir_savebp =
5059		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5060		BUF_KERNPROC(newindirdep->ir_savebp);
5061		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5062		ACQUIRE_LOCK(&lk);
5063	}
5064}
5065
5066/*
5067 * Merge two allocindirs which refer to the same block.  Move newblock
5068 * dependencies and setup the freefrags appropriately.
5069 */
5070static struct freefrag *
5071allocindir_merge(aip, oldaip)
5072	struct allocindir *aip;
5073	struct allocindir *oldaip;
5074{
5075	struct newdirblk *newdirblk;
5076	struct freefrag *freefrag;
5077	struct worklist *wk;
5078
5079	if (oldaip->ai_newblkno != aip->ai_oldblkno)
5080		panic("allocindir_merge: blkno");
5081	aip->ai_oldblkno = oldaip->ai_oldblkno;
5082	freefrag = aip->ai_freefrag;
5083	aip->ai_freefrag = oldaip->ai_freefrag;
5084	oldaip->ai_freefrag = NULL;
5085	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
5086	/*
5087	 * If we are tracking a new directory-block allocation,
5088	 * move it from the old allocindir to the new allocindir.
5089	 */
5090	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5091		newdirblk = WK_NEWDIRBLK(wk);
5092		WORKLIST_REMOVE(&newdirblk->db_list);
5093		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5094			panic("allocindir_merge: extra newdirblk");
5095		WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
5096	}
5097	/*
5098	 * We can skip journaling for this freefrag and just complete
5099	 * any pending journal work for the allocindir that is being
5100	 * removed after the freefrag completes.
5101	 */
5102	if (freefrag->ff_jfreefrag)
5103		cancel_jfreefrag(freefrag->ff_jfreefrag);
5104	LIST_REMOVE(oldaip, ai_next);
5105	cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork);
5106	free_newblk(&oldaip->ai_block);
5107
5108	return (freefrag);
5109}
5110
5111/*
5112 * Block de-allocation dependencies.
5113 *
5114 * When blocks are de-allocated, the on-disk pointers must be nullified before
5115 * the blocks are made available for use by other files.  (The true
5116 * requirement is that old pointers must be nullified before new on-disk
5117 * pointers are set.  We chose this slightly more stringent requirement to
5118 * reduce complexity.) Our implementation handles this dependency by updating
5119 * the inode (or indirect block) appropriately but delaying the actual block
5120 * de-allocation (i.e., freemap and free space count manipulation) until
5121 * after the updated versions reach stable storage.  After the disk is
5122 * updated, the blocks can be safely de-allocated whenever it is convenient.
5123 * This implementation handles only the common case of reducing a file's
5124 * length to zero. Other cases are handled by the conventional synchronous
5125 * write approach.
5126 *
5127 * The ffs implementation with which we worked double-checks
5128 * the state of the block pointers and file size as it reduces
5129 * a file's length.  Some of this code is replicated here in our
5130 * soft updates implementation.  The freeblks->fb_chkcnt field is
5131 * used to transfer a part of this information to the procedure
5132 * that eventually de-allocates the blocks.
5133 *
5134 * This routine should be called from the routine that shortens
5135 * a file's length, before the inode's size or block pointers
5136 * are modified. It will save the block pointer information for
5137 * later release and zero the inode so that the calling routine
5138 * can release it.
5139 */
5140void
5141softdep_setup_freeblocks(ip, length, flags)
5142	struct inode *ip;	/* The inode whose length is to be reduced */
5143	off_t length;		/* The new length for the file */
5144	int flags;		/* IO_EXT and/or IO_NORMAL */
5145{
5146	struct ufs1_dinode *dp1;
5147	struct ufs2_dinode *dp2;
5148	struct freeblks *freeblks;
5149	struct inodedep *inodedep;
5150	struct allocdirect *adp;
5151	struct jfreeblk *jfreeblk;
5152	struct bufobj *bo;
5153	struct vnode *vp;
5154	struct buf *bp;
5155	struct fs *fs;
5156	ufs2_daddr_t extblocks, datablocks;
5157	struct mount *mp;
5158	int i, delay, error;
5159	ufs2_daddr_t blkno;
5160	ufs_lbn_t tmpval;
5161	ufs_lbn_t lbn;
5162	long oldextsize;
5163	long oldsize;
5164	int frags;
5165	int needj;
5166
5167	fs = ip->i_fs;
5168	mp = UFSTOVFS(ip->i_ump);
5169	if (length != 0)
5170		panic("softdep_setup_freeblocks: non-zero length");
5171	freeblks = malloc(sizeof(struct freeblks),
5172		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5173	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5174	LIST_INIT(&freeblks->fb_jfreeblkhd);
5175	LIST_INIT(&freeblks->fb_jwork);
5176	freeblks->fb_state = ATTACHED;
5177	freeblks->fb_uid = ip->i_uid;
5178	freeblks->fb_previousinum = ip->i_number;
5179	freeblks->fb_devvp = ip->i_devvp;
5180	freeblks->fb_chkcnt = 0;
5181	ACQUIRE_LOCK(&lk);
5182	/*
5183	 * If we're truncating a removed file that will never be written
5184	 * we don't need to journal the block frees.  The canceled journals
5185	 * for the allocations will suffice.
5186	 */
5187	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5188	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
5189	    (fs->fs_flags & FS_SUJ) == 0)
5190		needj = 0;
5191	else
5192		needj = 1;
5193	num_freeblkdep++;
5194	FREE_LOCK(&lk);
5195	extblocks = 0;
5196	if (fs->fs_magic == FS_UFS2_MAGIC)
5197		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
5198	datablocks = DIP(ip, i_blocks) - extblocks;
5199	if ((flags & IO_NORMAL) != 0) {
5200		oldsize = ip->i_size;
5201		ip->i_size = 0;
5202		DIP_SET(ip, i_size, 0);
5203		freeblks->fb_chkcnt = datablocks;
5204		for (i = 0; i < NDADDR; i++) {
5205			blkno = DIP(ip, i_db[i]);
5206			DIP_SET(ip, i_db[i], 0);
5207			if (blkno == 0)
5208				continue;
5209			frags = sblksize(fs, oldsize, i);
5210			frags = numfrags(fs, frags);
5211			newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags,
5212			    needj);
5213		}
5214		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
5215		    i++, tmpval *= NINDIR(fs)) {
5216			blkno = DIP(ip, i_ib[i]);
5217			DIP_SET(ip, i_ib[i], 0);
5218			if (blkno)
5219				newfreework(ip->i_ump, freeblks, NULL, -lbn - i,
5220				    blkno, fs->fs_frag, needj);
5221			lbn += tmpval;
5222		}
5223		UFS_LOCK(ip->i_ump);
5224		fs->fs_pendingblocks += datablocks;
5225		UFS_UNLOCK(ip->i_ump);
5226	}
5227	if ((flags & IO_EXT) != 0) {
5228		oldextsize = ip->i_din2->di_extsize;
5229		ip->i_din2->di_extsize = 0;
5230		freeblks->fb_chkcnt += extblocks;
5231		for (i = 0; i < NXADDR; i++) {
5232			blkno = ip->i_din2->di_extb[i];
5233			ip->i_din2->di_extb[i] = 0;
5234			if (blkno == 0)
5235				continue;
5236			frags = sblksize(fs, oldextsize, i);
5237			frags = numfrags(fs, frags);
5238			newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno,
5239			    frags, needj);
5240		}
5241	}
5242	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
5243		needj = 0;
5244	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
5245	/*
5246	 * Push the zero'ed inode to to its disk buffer so that we are free
5247	 * to delete its dependencies below. Once the dependencies are gone
5248	 * the buffer can be safely released.
5249	 */
5250	if ((error = bread(ip->i_devvp,
5251	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
5252	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
5253		brelse(bp);
5254		softdep_error("softdep_setup_freeblocks", error);
5255	}
5256	if (ip->i_ump->um_fstype == UFS1) {
5257		dp1 = ((struct ufs1_dinode *)bp->b_data +
5258		    ino_to_fsbo(fs, ip->i_number));
5259		ip->i_din1->di_freelink = dp1->di_freelink;
5260		*dp1 = *ip->i_din1;
5261	} else {
5262		dp2 = ((struct ufs2_dinode *)bp->b_data +
5263		    ino_to_fsbo(fs, ip->i_number));
5264		ip->i_din2->di_freelink = dp2->di_freelink;
5265		*dp2 = *ip->i_din2;
5266	}
5267	/*
5268	 * Find and eliminate any inode dependencies.
5269	 */
5270	ACQUIRE_LOCK(&lk);
5271	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5272	if ((inodedep->id_state & IOSTARTED) != 0)
5273		panic("softdep_setup_freeblocks: inode busy");
5274	/*
5275	 * Add the freeblks structure to the list of operations that
5276	 * must await the zero'ed inode being written to disk. If we
5277	 * still have a bitmap dependency (delay == 0), then the inode
5278	 * has never been written to disk, so we can process the
5279	 * freeblks below once we have deleted the dependencies.
5280	 */
5281	delay = (inodedep->id_state & DEPCOMPLETE);
5282	if (delay)
5283		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
5284	else if (needj)
5285		freeblks->fb_state |= COMPLETE;
5286	/*
5287	 * Because the file length has been truncated to zero, any
5288	 * pending block allocation dependency structures associated
5289	 * with this inode are obsolete and can simply be de-allocated.
5290	 * We must first merge the two dependency lists to get rid of
5291	 * any duplicate freefrag structures, then purge the merged list.
5292	 * If we still have a bitmap dependency, then the inode has never
5293	 * been written to disk, so we can free any fragments without delay.
5294	 */
5295	if (flags & IO_NORMAL) {
5296		merge_inode_lists(&inodedep->id_newinoupdt,
5297		    &inodedep->id_inoupdt);
5298		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
5299			cancel_allocdirect(&inodedep->id_inoupdt, adp,
5300			    freeblks, delay);
5301	}
5302	if (flags & IO_EXT) {
5303		merge_inode_lists(&inodedep->id_newextupdt,
5304		    &inodedep->id_extupdt);
5305		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
5306			cancel_allocdirect(&inodedep->id_extupdt, adp,
5307			    freeblks, delay);
5308	}
5309	LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
5310		add_to_journal(&jfreeblk->jf_list);
5311
5312	FREE_LOCK(&lk);
5313	bdwrite(bp);
5314	/*
5315	 * We must wait for any I/O in progress to finish so that
5316	 * all potential buffers on the dirty list will be visible.
5317	 * Once they are all there, walk the list and get rid of
5318	 * any dependencies.
5319	 */
5320	vp = ITOV(ip);
5321	bo = &vp->v_bufobj;
5322	BO_LOCK(bo);
5323	drain_output(vp);
5324restart:
5325	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
5326		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
5327		    ((flags & IO_NORMAL) == 0 &&
5328		      (bp->b_xflags & BX_ALTDATA) == 0))
5329			continue;
5330		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
5331			goto restart;
5332		BO_UNLOCK(bo);
5333		ACQUIRE_LOCK(&lk);
5334		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
5335		if (deallocate_dependencies(bp, inodedep, freeblks))
5336			bp->b_flags |= B_INVAL | B_NOCACHE;
5337		FREE_LOCK(&lk);
5338		brelse(bp);
5339		BO_LOCK(bo);
5340		goto restart;
5341	}
5342	BO_UNLOCK(bo);
5343	ACQUIRE_LOCK(&lk);
5344	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
5345		(void) free_inodedep(inodedep);
5346
5347	if (delay || needj)
5348		freeblks->fb_state |= DEPCOMPLETE;
5349	if (delay) {
5350		/*
5351		 * If the inode with zeroed block pointers is now on disk
5352		 * we can start freeing blocks. Add freeblks to the worklist
5353		 * instead of calling  handle_workitem_freeblocks directly as
5354		 * it is more likely that additional IO is needed to complete
5355		 * the request here than in the !delay case.
5356		 */
5357		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
5358			add_to_worklist(&freeblks->fb_list, 1);
5359	}
5360	if (needj && LIST_EMPTY(&freeblks->fb_jfreeblkhd))
5361		needj = 0;
5362
5363	FREE_LOCK(&lk);
5364	/*
5365	 * If the inode has never been written to disk (delay == 0) and
5366	 * we're not waiting on any journal writes, then we can process the
5367	 * freeblks now that we have deleted the dependencies.
5368	 */
5369	if (!delay && !needj)
5370		handle_workitem_freeblocks(freeblks, 0);
5371}
5372
5373/*
5374 * Reclaim any dependency structures from a buffer that is about to
5375 * be reallocated to a new vnode. The buffer must be locked, thus,
5376 * no I/O completion operations can occur while we are manipulating
5377 * its associated dependencies. The mutex is held so that other I/O's
5378 * associated with related dependencies do not occur.  Returns 1 if
5379 * all dependencies were cleared, 0 otherwise.
5380 */
5381static int
5382deallocate_dependencies(bp, inodedep, freeblks)
5383	struct buf *bp;
5384	struct inodedep *inodedep;
5385	struct freeblks *freeblks;
5386{
5387	struct worklist *wk;
5388	struct indirdep *indirdep;
5389	struct newdirblk *newdirblk;
5390	struct allocindir *aip;
5391	struct pagedep *pagedep;
5392	struct jremref *jremref;
5393	struct jmvref *jmvref;
5394	struct dirrem *dirrem;
5395	int i;
5396
5397	mtx_assert(&lk, MA_OWNED);
5398	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
5399		switch (wk->wk_type) {
5400
5401		case D_INDIRDEP:
5402			indirdep = WK_INDIRDEP(wk);
5403			if (bp->b_lblkno >= 0 ||
5404			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
5405				panic("deallocate_dependencies: not indir");
5406			cancel_indirdep(indirdep, bp, inodedep, freeblks);
5407			continue;
5408
5409		case D_PAGEDEP:
5410			pagedep = WK_PAGEDEP(wk);
5411			/*
5412			 * There should be no directory add dependencies present
5413			 * as the directory could not be truncated until all
5414			 * children were removed.
5415			 */
5416			KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
5417			    ("deallocate_dependencies: pendinghd != NULL"));
5418			for (i = 0; i < DAHASHSZ; i++)
5419				KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
5420				    ("deallocate_dependencies: diraddhd != NULL"));
5421			/*
5422			 * Copy any directory remove dependencies to the list
5423			 * to be processed after the zero'ed inode is written.
5424			 * If the inode has already been written, then they
5425			 * can be dumped directly onto the work list.
5426			 */
5427			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
5428				/*
5429				 * If there are any dirrems we wait for
5430				 * the journal write to complete and
5431				 * then restart the buf scan as the lock
5432				 * has been dropped.
5433				 */
5434				while ((jremref =
5435				    LIST_FIRST(&dirrem->dm_jremrefhd))
5436				    != NULL) {
5437					stat_jwait_filepage++;
5438					jwait(&jremref->jr_list);
5439					return (0);
5440				}
5441				LIST_REMOVE(dirrem, dm_next);
5442				dirrem->dm_dirinum = pagedep->pd_ino;
5443				if (inodedep == NULL ||
5444				    (inodedep->id_state & ALLCOMPLETE) ==
5445				     ALLCOMPLETE) {
5446					dirrem->dm_state |= COMPLETE;
5447					add_to_worklist(&dirrem->dm_list, 0);
5448				} else
5449					WORKLIST_INSERT(&inodedep->id_bufwait,
5450					    &dirrem->dm_list);
5451			}
5452			if ((pagedep->pd_state & NEWBLOCK) != 0) {
5453				newdirblk = pagedep->pd_newdirblk;
5454				WORKLIST_REMOVE(&newdirblk->db_list);
5455				free_newdirblk(newdirblk);
5456			}
5457			while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
5458			    != NULL) {
5459				stat_jwait_filepage++;
5460				jwait(&jmvref->jm_list);
5461				return (0);
5462			}
5463			WORKLIST_REMOVE(&pagedep->pd_list);
5464			LIST_REMOVE(pagedep, pd_hash);
5465			WORKITEM_FREE(pagedep, D_PAGEDEP);
5466			continue;
5467
5468		case D_ALLOCINDIR:
5469			aip = WK_ALLOCINDIR(wk);
5470			cancel_allocindir(aip, inodedep, freeblks);
5471			continue;
5472
5473		case D_ALLOCDIRECT:
5474		case D_INODEDEP:
5475			panic("deallocate_dependencies: Unexpected type %s",
5476			    TYPENAME(wk->wk_type));
5477			/* NOTREACHED */
5478
5479		default:
5480			panic("deallocate_dependencies: Unknown type %s",
5481			    TYPENAME(wk->wk_type));
5482			/* NOTREACHED */
5483		}
5484	}
5485
5486	return (1);
5487}
5488
5489/*
5490 * An allocdirect is being canceled due to a truncate.  We must make sure
5491 * the journal entry is released in concert with the blkfree that releases
5492 * the storage.  Completed journal entries must not be released until the
5493 * space is no longer pointed to by the inode or in the bitmap.
5494 */
5495static void
5496cancel_allocdirect(adphead, adp, freeblks, delay)
5497	struct allocdirectlst *adphead;
5498	struct allocdirect *adp;
5499	struct freeblks *freeblks;
5500	int delay;
5501{
5502	struct freework *freework;
5503	struct newblk *newblk;
5504	struct worklist *wk;
5505	ufs_lbn_t lbn;
5506
5507	TAILQ_REMOVE(adphead, adp, ad_next);
5508	newblk = (struct newblk *)adp;
5509	/*
5510	 * If the journal hasn't been written the jnewblk must be passed
5511	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
5512	 * this by linking the journal dependency into the freework to be
5513	 * freed when freework_freeblock() is called.  If the journal has
5514	 * been written we can simply reclaim the journal space when the
5515	 * freeblks work is complete.
5516	 */
5517	if (newblk->nb_jnewblk == NULL) {
5518		cancel_newblk(newblk, &freeblks->fb_jwork);
5519		goto found;
5520	}
5521	lbn = newblk->nb_jnewblk->jn_lbn;
5522	/*
5523	 * Find the correct freework structure so it releases the canceled
5524	 * journal when the bitmap is cleared.  This preserves rollback
5525	 * until the allocation is reverted.
5526	 */
5527	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
5528		freework = WK_FREEWORK(wk);
5529		if (freework->fw_lbn != lbn)
5530			continue;
5531		cancel_newblk(newblk, &freework->fw_jwork);
5532		goto found;
5533	}
5534	panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
5535found:
5536	if (delay)
5537		WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
5538		    &newblk->nb_list);
5539	else
5540		free_newblk(newblk);
5541	return;
5542}
5543
5544
5545static void
5546cancel_newblk(newblk, wkhd)
5547	struct newblk *newblk;
5548	struct workhead *wkhd;
5549{
5550	struct indirdep *indirdep;
5551	struct allocindir *aip;
5552
5553	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5554		indirdep->ir_state &= ~ONDEPLIST;
5555		LIST_REMOVE(indirdep, ir_next);
5556		/*
5557		 * If an indirdep is not on the buf worklist we need to
5558		 * free it here as deallocate_dependencies() will never
5559		 * find it.  These pointers were never visible on disk and
5560		 * can be discarded immediately.
5561		 */
5562		while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5563			LIST_REMOVE(aip, ai_next);
5564			cancel_newblk(&aip->ai_block, wkhd);
5565			free_newblk(&aip->ai_block);
5566		}
5567		/*
5568		 * If this indirdep is not attached to a buf it was simply
5569		 * waiting on completion to clear completehd.  free_indirdep()
5570		 * asserts that nothing is dangling.
5571		 */
5572		if ((indirdep->ir_state & ONWORKLIST) == 0)
5573			free_indirdep(indirdep);
5574	}
5575	if (newblk->nb_state & ONDEPLIST) {
5576		newblk->nb_state &= ~ONDEPLIST;
5577		LIST_REMOVE(newblk, nb_deps);
5578	}
5579	if (newblk->nb_state & ONWORKLIST)
5580		WORKLIST_REMOVE(&newblk->nb_list);
5581	/*
5582	 * If the journal entry hasn't been written we hold onto the dep
5583	 * until it is safe to free along with the other journal work.
5584	 */
5585	if (newblk->nb_jnewblk != NULL) {
5586		cancel_jnewblk(newblk->nb_jnewblk, wkhd);
5587		newblk->nb_jnewblk = NULL;
5588	}
5589	if (!LIST_EMPTY(&newblk->nb_jwork))
5590		jwork_move(wkhd, &newblk->nb_jwork);
5591}
5592
5593/*
5594 * Free a newblk. Generate a new freefrag work request if appropriate.
5595 * This must be called after the inode pointer and any direct block pointers
5596 * are valid or fully removed via truncate or frag extension.
5597 */
5598static void
5599free_newblk(newblk)
5600	struct newblk *newblk;
5601{
5602	struct indirdep *indirdep;
5603	struct newdirblk *newdirblk;
5604	struct freefrag *freefrag;
5605	struct worklist *wk;
5606
5607	mtx_assert(&lk, MA_OWNED);
5608	if (newblk->nb_state & ONDEPLIST)
5609		LIST_REMOVE(newblk, nb_deps);
5610	if (newblk->nb_state & ONWORKLIST)
5611		WORKLIST_REMOVE(&newblk->nb_list);
5612	LIST_REMOVE(newblk, nb_hash);
5613	if ((freefrag = newblk->nb_freefrag) != NULL) {
5614		freefrag->ff_state |= COMPLETE;
5615		if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
5616			add_to_worklist(&freefrag->ff_list, 0);
5617	}
5618	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
5619		newdirblk = WK_NEWDIRBLK(wk);
5620		WORKLIST_REMOVE(&newdirblk->db_list);
5621		if (!LIST_EMPTY(&newblk->nb_newdirblk))
5622			panic("free_newblk: extra newdirblk");
5623		free_newdirblk(newdirblk);
5624	}
5625	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5626		indirdep->ir_state |= DEPCOMPLETE;
5627		indirdep_complete(indirdep);
5628	}
5629	KASSERT(newblk->nb_jnewblk == NULL,
5630	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
5631	handle_jwork(&newblk->nb_jwork);
5632	newblk->nb_list.wk_type = D_NEWBLK;
5633	WORKITEM_FREE(newblk, D_NEWBLK);
5634}
5635
5636/*
5637 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
5638 * This routine must be called with splbio interrupts blocked.
5639 */
5640static void
5641free_newdirblk(newdirblk)
5642	struct newdirblk *newdirblk;
5643{
5644	struct pagedep *pagedep;
5645	struct diradd *dap;
5646	struct worklist *wk;
5647	int i;
5648
5649	mtx_assert(&lk, MA_OWNED);
5650	/*
5651	 * If the pagedep is still linked onto the directory buffer
5652	 * dependency chain, then some of the entries on the
5653	 * pd_pendinghd list may not be committed to disk yet. In
5654	 * this case, we will simply clear the NEWBLOCK flag and
5655	 * let the pd_pendinghd list be processed when the pagedep
5656	 * is next written. If the pagedep is no longer on the buffer
5657	 * dependency chain, then all the entries on the pd_pending
5658	 * list are committed to disk and we can free them here.
5659	 */
5660	pagedep = newdirblk->db_pagedep;
5661	pagedep->pd_state &= ~NEWBLOCK;
5662	if ((pagedep->pd_state & ONWORKLIST) == 0)
5663		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
5664			free_diradd(dap, NULL);
5665	/*
5666	 * If no dependencies remain, the pagedep will be freed.
5667	 */
5668	for (i = 0; i < DAHASHSZ; i++)
5669		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
5670			break;
5671	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
5672	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
5673		KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
5674		    ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
5675		LIST_REMOVE(pagedep, pd_hash);
5676		WORKITEM_FREE(pagedep, D_PAGEDEP);
5677	}
5678	/* Should only ever be one item in the list. */
5679	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
5680		WORKLIST_REMOVE(wk);
5681		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
5682	}
5683	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
5684}
5685
5686/*
5687 * Prepare an inode to be freed. The actual free operation is not
5688 * done until the zero'ed inode has been written to disk.
5689 */
5690void
5691softdep_freefile(pvp, ino, mode)
5692	struct vnode *pvp;
5693	ino_t ino;
5694	int mode;
5695{
5696	struct inode *ip = VTOI(pvp);
5697	struct inodedep *inodedep;
5698	struct freefile *freefile;
5699
5700	/*
5701	 * This sets up the inode de-allocation dependency.
5702	 */
5703	freefile = malloc(sizeof(struct freefile),
5704		M_FREEFILE, M_SOFTDEP_FLAGS);
5705	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
5706	freefile->fx_mode = mode;
5707	freefile->fx_oldinum = ino;
5708	freefile->fx_devvp = ip->i_devvp;
5709	LIST_INIT(&freefile->fx_jwork);
5710	UFS_LOCK(ip->i_ump);
5711	ip->i_fs->fs_pendinginodes += 1;
5712	UFS_UNLOCK(ip->i_ump);
5713
5714	/*
5715	 * If the inodedep does not exist, then the zero'ed inode has
5716	 * been written to disk. If the allocated inode has never been
5717	 * written to disk, then the on-disk inode is zero'ed. In either
5718	 * case we can free the file immediately.  If the journal was
5719	 * canceled before being written the inode will never make it to
5720	 * disk and we must send the canceled journal entrys to
5721	 * ffs_freefile() to be cleared in conjunction with the bitmap.
5722	 * Any blocks waiting on the inode to write can be safely freed
5723	 * here as it will never been written.
5724	 */
5725	ACQUIRE_LOCK(&lk);
5726	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5727	/*
5728	 * Remove this inode from the unlinked list and set
5729	 * GOINGAWAY as appropriate to indicate that this inode
5730	 * will never be written.
5731	 */
5732	if (inodedep && inodedep->id_state & UNLINKED) {
5733		/*
5734		 * Save the journal work to be freed with the bitmap
5735		 * before we clear UNLINKED.  Otherwise it can be lost
5736		 * if the inode block is written.
5737		 */
5738		handle_bufwait(inodedep, &freefile->fx_jwork);
5739		clear_unlinked_inodedep(inodedep);
5740		/* Re-acquire inodedep as we've dropped lk. */
5741		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5742	}
5743	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
5744		FREE_LOCK(&lk);
5745		handle_workitem_freefile(freefile);
5746		return;
5747	}
5748	if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
5749		inodedep->id_state |= GOINGAWAY;
5750	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
5751	FREE_LOCK(&lk);
5752	if (ip->i_number == ino)
5753		ip->i_flag |= IN_MODIFIED;
5754}
5755
5756/*
5757 * Check to see if an inode has never been written to disk. If
5758 * so free the inodedep and return success, otherwise return failure.
5759 * This routine must be called with splbio interrupts blocked.
5760 *
5761 * If we still have a bitmap dependency, then the inode has never
5762 * been written to disk. Drop the dependency as it is no longer
5763 * necessary since the inode is being deallocated. We set the
5764 * ALLCOMPLETE flags since the bitmap now properly shows that the
5765 * inode is not allocated. Even if the inode is actively being
5766 * written, it has been rolled back to its zero'ed state, so we
5767 * are ensured that a zero inode is what is on the disk. For short
5768 * lived files, this change will usually result in removing all the
5769 * dependencies from the inode so that it can be freed immediately.
5770 */
5771static int
5772check_inode_unwritten(inodedep)
5773	struct inodedep *inodedep;
5774{
5775
5776	mtx_assert(&lk, MA_OWNED);
5777
5778	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
5779	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
5780	    !LIST_EMPTY(&inodedep->id_bufwait) ||
5781	    !LIST_EMPTY(&inodedep->id_inowait) ||
5782	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5783	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
5784	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5785	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5786	    inodedep->id_mkdiradd != NULL ||
5787	    inodedep->id_nlinkdelta != 0)
5788		return (0);
5789	/*
5790	 * Another process might be in initiate_write_inodeblock_ufs[12]
5791	 * trying to allocate memory without holding "Softdep Lock".
5792	 */
5793	if ((inodedep->id_state & IOSTARTED) != 0 &&
5794	    inodedep->id_savedino1 == NULL)
5795		return (0);
5796
5797	if (inodedep->id_state & ONDEPLIST)
5798		LIST_REMOVE(inodedep, id_deps);
5799	inodedep->id_state &= ~ONDEPLIST;
5800	inodedep->id_state |= ALLCOMPLETE;
5801	inodedep->id_bmsafemap = NULL;
5802	if (inodedep->id_state & ONWORKLIST)
5803		WORKLIST_REMOVE(&inodedep->id_list);
5804	if (inodedep->id_savedino1 != NULL) {
5805		free(inodedep->id_savedino1, M_SAVEDINO);
5806		inodedep->id_savedino1 = NULL;
5807	}
5808	if (free_inodedep(inodedep) == 0)
5809		panic("check_inode_unwritten: busy inode");
5810	return (1);
5811}
5812
5813/*
5814 * Try to free an inodedep structure. Return 1 if it could be freed.
5815 */
5816static int
5817free_inodedep(inodedep)
5818	struct inodedep *inodedep;
5819{
5820
5821	mtx_assert(&lk, MA_OWNED);
5822	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
5823	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
5824	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
5825	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
5826	    !LIST_EMPTY(&inodedep->id_bufwait) ||
5827	    !LIST_EMPTY(&inodedep->id_inowait) ||
5828	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
5829	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5830	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
5831	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5832	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5833	    inodedep->id_mkdiradd != NULL ||
5834	    inodedep->id_nlinkdelta != 0 ||
5835	    inodedep->id_savedino1 != NULL)
5836		return (0);
5837	if (inodedep->id_state & ONDEPLIST)
5838		LIST_REMOVE(inodedep, id_deps);
5839	LIST_REMOVE(inodedep, id_hash);
5840	WORKITEM_FREE(inodedep, D_INODEDEP);
5841	num_inodedep -= 1;
5842	return (1);
5843}
5844
5845/*
5846 * Free the block referenced by a freework structure.  The parent freeblks
5847 * structure is released and completed when the final cg bitmap reaches
5848 * the disk.  This routine may be freeing a jnewblk which never made it to
5849 * disk in which case we do not have to wait as the operation is undone
5850 * in memory immediately.
5851 */
5852static void
5853freework_freeblock(freework)
5854	struct freework *freework;
5855{
5856	struct freeblks *freeblks;
5857	struct ufsmount *ump;
5858	struct workhead wkhd;
5859	struct fs *fs;
5860	int complete;
5861	int pending;
5862	int bsize;
5863	int needj;
5864
5865	freeblks = freework->fw_freeblks;
5866	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5867	fs = ump->um_fs;
5868	needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
5869	complete = 0;
5870	LIST_INIT(&wkhd);
5871	/*
5872	 * If we are canceling an existing jnewblk pass it to the free
5873	 * routine, otherwise pass the freeblk which will ultimately
5874	 * release the freeblks.  If we're not journaling, we can just
5875	 * free the freeblks immediately.
5876	 */
5877	if (!LIST_EMPTY(&freework->fw_jwork)) {
5878		LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
5879		complete = 1;
5880	} else if (needj)
5881		WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
5882	bsize = lfragtosize(fs, freework->fw_frags);
5883	pending = btodb(bsize);
5884	ACQUIRE_LOCK(&lk);
5885	freeblks->fb_chkcnt -= pending;
5886	FREE_LOCK(&lk);
5887	/*
5888	 * extattr blocks don't show up in pending blocks.  XXX why?
5889	 */
5890	if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
5891		UFS_LOCK(ump);
5892		fs->fs_pendingblocks -= pending;
5893		UFS_UNLOCK(ump);
5894	}
5895	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
5896	    bsize, freeblks->fb_previousinum, &wkhd);
5897	if (complete == 0 && needj)
5898		return;
5899	/*
5900	 * The jnewblk will be discarded and the bits in the map never
5901	 * made it to disk.  We can immediately free the freeblk.
5902	 */
5903	ACQUIRE_LOCK(&lk);
5904	handle_written_freework(freework);
5905	FREE_LOCK(&lk);
5906}
5907
5908/*
5909 * Start, continue, or finish the process of freeing an indirect block tree.
5910 * The free operation may be paused at any point with fw_off containing the
5911 * offset to restart from.  This enables us to implement some flow control
5912 * for large truncates which may fan out and generate a huge number of
5913 * dependencies.
5914 */
5915static void
5916handle_workitem_indirblk(freework)
5917	struct freework *freework;
5918{
5919	struct freeblks *freeblks;
5920	struct ufsmount *ump;
5921	struct fs *fs;
5922
5923
5924	freeblks = freework->fw_freeblks;
5925	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5926	fs = ump->um_fs;
5927	if (freework->fw_off == NINDIR(fs))
5928		freework_freeblock(freework);
5929	else
5930		indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
5931		    freework->fw_lbn);
5932}
5933
5934/*
5935 * Called when a freework structure attached to a cg buf is written.  The
5936 * ref on either the parent or the freeblks structure is released and
5937 * either may be added to the worklist if it is the final ref.
5938 */
5939static void
5940handle_written_freework(freework)
5941	struct freework *freework;
5942{
5943	struct freeblks *freeblks;
5944	struct freework *parent;
5945
5946	freeblks = freework->fw_freeblks;
5947	parent = freework->fw_parent;
5948	if (parent) {
5949		if (--parent->fw_ref != 0)
5950			parent = NULL;
5951		freeblks = NULL;
5952	} else if (--freeblks->fb_ref != 0)
5953		freeblks = NULL;
5954	WORKITEM_FREE(freework, D_FREEWORK);
5955	/*
5956	 * Don't delay these block frees or it takes an intolerable amount
5957	 * of time to process truncates and free their journal entries.
5958	 */
5959	if (freeblks)
5960		add_to_worklist(&freeblks->fb_list, 1);
5961	if (parent)
5962		add_to_worklist(&parent->fw_list, 1);
5963}
5964
5965/*
5966 * This workitem routine performs the block de-allocation.
5967 * The workitem is added to the pending list after the updated
5968 * inode block has been written to disk.  As mentioned above,
5969 * checks regarding the number of blocks de-allocated (compared
5970 * to the number of blocks allocated for the file) are also
5971 * performed in this function.
5972 */
5973static void
5974handle_workitem_freeblocks(freeblks, flags)
5975	struct freeblks *freeblks;
5976	int flags;
5977{
5978	struct freework *freework;
5979	struct worklist *wk;
5980
5981	KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
5982	    ("handle_workitem_freeblocks: Journal entries not written."));
5983	if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
5984		handle_complete_freeblocks(freeblks);
5985		return;
5986	}
5987	freeblks->fb_ref++;
5988	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
5989		KASSERT(wk->wk_type == D_FREEWORK,
5990		    ("handle_workitem_freeblocks: Unknown type %s",
5991		    TYPENAME(wk->wk_type)));
5992		WORKLIST_REMOVE_UNLOCKED(wk);
5993		freework = WK_FREEWORK(wk);
5994		if (freework->fw_lbn <= -NDADDR)
5995			handle_workitem_indirblk(freework);
5996		else
5997			freework_freeblock(freework);
5998	}
5999	ACQUIRE_LOCK(&lk);
6000	if (--freeblks->fb_ref != 0)
6001		freeblks = NULL;
6002	FREE_LOCK(&lk);
6003	if (freeblks)
6004		handle_complete_freeblocks(freeblks);
6005}
6006
6007/*
6008 * Once all of the freework workitems are complete we can retire the
6009 * freeblocks dependency and any journal work awaiting completion.  This
6010 * can not be called until all other dependencies are stable on disk.
6011 */
6012static void
6013handle_complete_freeblocks(freeblks)
6014	struct freeblks *freeblks;
6015{
6016	struct inode *ip;
6017	struct vnode *vp;
6018	struct fs *fs;
6019	struct ufsmount *ump;
6020	int flags;
6021
6022	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6023	fs = ump->um_fs;
6024	flags = LK_NOWAIT;
6025
6026	/*
6027	 * If we still have not finished background cleanup, then check
6028	 * to see if the block count needs to be adjusted.
6029	 */
6030	if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
6031	    ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
6032	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
6033		ip = VTOI(vp);
6034		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
6035		ip->i_flag |= IN_CHANGE;
6036		vput(vp);
6037	}
6038
6039	if (!(freeblks->fb_chkcnt == 0 ||
6040	    ((fs->fs_flags & FS_UNCLEAN) != 0 && (flags & LK_NOWAIT) == 0)))
6041	        printf(
6042	"handle_workitem_freeblocks: inode %ju block count %jd\n",
6043		   (uintmax_t)freeblks->fb_previousinum,
6044		   (intmax_t)freeblks->fb_chkcnt);
6045
6046	ACQUIRE_LOCK(&lk);
6047	/*
6048	 * All of the freeblock deps must be complete prior to this call
6049	 * so it's now safe to complete earlier outstanding journal entries.
6050	 */
6051	handle_jwork(&freeblks->fb_jwork);
6052	WORKITEM_FREE(freeblks, D_FREEBLKS);
6053	num_freeblkdep--;
6054	FREE_LOCK(&lk);
6055}
6056
6057/*
6058 * Release blocks associated with the inode ip and stored in the indirect
6059 * block dbn. If level is greater than SINGLE, the block is an indirect block
6060 * and recursive calls to indirtrunc must be used to cleanse other indirect
6061 * blocks.
6062 */
6063static void
6064indir_trunc(freework, dbn, lbn)
6065	struct freework *freework;
6066	ufs2_daddr_t dbn;
6067	ufs_lbn_t lbn;
6068{
6069	struct freework *nfreework;
6070	struct workhead wkhd;
6071	struct jnewblk *jnewblk;
6072	struct freeblks *freeblks;
6073	struct buf *bp;
6074	struct fs *fs;
6075	struct worklist *wkn;
6076	struct worklist *wk;
6077	struct indirdep *indirdep;
6078	struct ufsmount *ump;
6079	ufs1_daddr_t *bap1 = 0;
6080	ufs2_daddr_t nb, nnb, *bap2 = 0;
6081	ufs_lbn_t lbnadd;
6082	int i, nblocks, ufs1fmt;
6083	int fs_pendingblocks;
6084	int freedeps;
6085	int needj;
6086	int level;
6087	int cnt;
6088
6089	LIST_INIT(&wkhd);
6090	level = lbn_level(lbn);
6091	if (level == -1)
6092		panic("indir_trunc: Invalid lbn %jd\n", lbn);
6093	freeblks = freework->fw_freeblks;
6094	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
6095	fs = ump->um_fs;
6096	fs_pendingblocks = 0;
6097	freedeps = 0;
6098	needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
6099	lbnadd = lbn_offset(fs, level);
6100	/*
6101	 * Get buffer of block pointers to be freed. This routine is not
6102	 * called until the zero'ed inode has been written, so it is safe
6103	 * to free blocks as they are encountered. Because the inode has
6104	 * been zero'ed, calls to bmap on these blocks will fail. So, we
6105	 * have to use the on-disk address and the block device for the
6106	 * filesystem to look them up. If the file was deleted before its
6107	 * indirect blocks were all written to disk, the routine that set
6108	 * us up (deallocate_dependencies) will have arranged to leave
6109	 * a complete copy of the indirect block in memory for our use.
6110	 * Otherwise we have to read the blocks in from the disk.
6111	 */
6112#ifdef notyet
6113	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
6114	    GB_NOCREAT);
6115#else
6116	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
6117#endif
6118	ACQUIRE_LOCK(&lk);
6119	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
6120		if (wk->wk_type != D_INDIRDEP ||
6121		    (wk->wk_state & GOINGAWAY) == 0)
6122			panic("indir_trunc: lost indirdep %p", wk);
6123		indirdep = WK_INDIRDEP(wk);
6124		LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
6125		free_indirdep(indirdep);
6126		if (!LIST_EMPTY(&bp->b_dep))
6127			panic("indir_trunc: dangling dep %p",
6128			    LIST_FIRST(&bp->b_dep));
6129		ump->um_numindirdeps -= 1;
6130		FREE_LOCK(&lk);
6131	} else {
6132#ifdef notyet
6133		if (bp)
6134			brelse(bp);
6135#endif
6136		FREE_LOCK(&lk);
6137		if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
6138		    NOCRED, &bp) != 0) {
6139			brelse(bp);
6140			return;
6141		}
6142	}
6143	/*
6144	 * Recursively free indirect blocks.
6145	 */
6146	if (ump->um_fstype == UFS1) {
6147		ufs1fmt = 1;
6148		bap1 = (ufs1_daddr_t *)bp->b_data;
6149	} else {
6150		ufs1fmt = 0;
6151		bap2 = (ufs2_daddr_t *)bp->b_data;
6152	}
6153
6154	/*
6155	 * Reclaim indirect blocks which never made it to disk.
6156	 */
6157	cnt = 0;
6158	LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
6159		if (wk->wk_type != D_JNEWBLK)
6160			continue;
6161		ACQUIRE_LOCK(&lk);
6162		WORKLIST_REMOVE(wk);
6163		FREE_LOCK(&lk);
6164		jnewblk = WK_JNEWBLK(wk);
6165		if (jnewblk->jn_lbn > 0)
6166			i = (jnewblk->jn_lbn - -lbn) / lbnadd;
6167		else
6168			i = (-(jnewblk->jn_lbn + level - 1) - -(lbn + level)) /
6169			    lbnadd;
6170		KASSERT(i >= 0 && i < NINDIR(fs),
6171		    ("indir_trunc: Index out of range %d parent %jd lbn %jd level %d",
6172		    i, lbn, jnewblk->jn_lbn, level));
6173		/* Clear the pointer so it isn't found below. */
6174		if (ufs1fmt) {
6175			nb = bap1[i];
6176			bap1[i] = 0;
6177		} else {
6178			nb = bap2[i];
6179			bap2[i] = 0;
6180		}
6181		KASSERT(nb == jnewblk->jn_blkno,
6182		    ("indir_trunc: Block mismatch %jd != %jd",
6183		    nb, jnewblk->jn_blkno));
6184		if (level != 0) {
6185			ufs_lbn_t nlbn;
6186
6187			nlbn = (lbn + 1) - (i * lbnadd);
6188			nfreework = newfreework(ump, freeblks, freework,
6189			    nlbn, nb, fs->fs_frag, 0);
6190			WORKLIST_INSERT_UNLOCKED(&nfreework->fw_jwork, wk);
6191			freedeps++;
6192			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
6193		} else {
6194			struct workhead freewk;
6195
6196			LIST_INIT(&freewk);
6197			ACQUIRE_LOCK(&lk);
6198			WORKLIST_INSERT(&freewk, wk);
6199			FREE_LOCK(&lk);
6200			ffs_blkfree(ump, fs, freeblks->fb_devvp,
6201			    jnewblk->jn_blkno, fs->fs_bsize,
6202			    freeblks->fb_previousinum, &freewk);
6203		}
6204		cnt++;
6205	}
6206	ACQUIRE_LOCK(&lk);
6207	/* Any remaining journal work can be completed with freeblks. */
6208	jwork_move(&freeblks->fb_jwork, &wkhd);
6209	FREE_LOCK(&lk);
6210	nblocks = btodb(fs->fs_bsize);
6211	if (ufs1fmt)
6212		nb = bap1[0];
6213	else
6214		nb = bap2[0];
6215	nfreework = freework;
6216	/*
6217	 * Reclaim on disk blocks.
6218	 */
6219	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
6220		if (i != NINDIR(fs) - 1) {
6221			if (ufs1fmt)
6222				nnb = bap1[i+1];
6223			else
6224				nnb = bap2[i+1];
6225		} else
6226			nnb = 0;
6227		if (nb == 0)
6228			continue;
6229		cnt++;
6230		if (level != 0) {
6231			ufs_lbn_t nlbn;
6232
6233			nlbn = (lbn + 1) - (i * lbnadd);
6234			if (needj != 0) {
6235				nfreework = newfreework(ump, freeblks, freework,
6236				    nlbn, nb, fs->fs_frag, 0);
6237				freedeps++;
6238			}
6239			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
6240		} else {
6241			struct freedep *freedep;
6242
6243			/*
6244			 * Attempt to aggregate freedep dependencies for
6245			 * all blocks being released to the same CG.
6246			 */
6247			LIST_INIT(&wkhd);
6248			if (needj != 0 &&
6249			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
6250				freedep = newfreedep(freework);
6251				WORKLIST_INSERT_UNLOCKED(&wkhd,
6252				    &freedep->fd_list);
6253				freedeps++;
6254			}
6255			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
6256			    fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
6257		}
6258	}
6259	if (level == 0)
6260		fs_pendingblocks = (nblocks * cnt);
6261	/*
6262	 * If we're not journaling we can free the indirect now.  Otherwise
6263	 * setup the ref counts and offset so this indirect can be completed
6264	 * when its children are free.
6265	 */
6266	if (needj == 0) {
6267		fs_pendingblocks += nblocks;
6268		dbn = dbtofsb(fs, dbn);
6269		ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
6270		    freeblks->fb_previousinum, NULL);
6271		ACQUIRE_LOCK(&lk);
6272		freeblks->fb_chkcnt -= fs_pendingblocks;
6273		if (freework->fw_blkno == dbn)
6274			handle_written_freework(freework);
6275		FREE_LOCK(&lk);
6276		freework = NULL;
6277	} else {
6278		ACQUIRE_LOCK(&lk);
6279		freework->fw_off = i;
6280		freework->fw_ref += freedeps;
6281		freework->fw_ref -= NINDIR(fs) + 1;
6282		if (freework->fw_ref != 0)
6283			freework = NULL;
6284		freeblks->fb_chkcnt -= fs_pendingblocks;
6285		FREE_LOCK(&lk);
6286	}
6287	if (fs_pendingblocks) {
6288		UFS_LOCK(ump);
6289		fs->fs_pendingblocks -= fs_pendingblocks;
6290		UFS_UNLOCK(ump);
6291	}
6292	bp->b_flags |= B_INVAL | B_NOCACHE;
6293	brelse(bp);
6294	if (freework)
6295		handle_workitem_indirblk(freework);
6296	return;
6297}
6298
6299/*
6300 * Cancel an allocindir when it is removed via truncation.
6301 */
6302static void
6303cancel_allocindir(aip, inodedep, freeblks)
6304	struct allocindir *aip;
6305	struct inodedep *inodedep;
6306	struct freeblks *freeblks;
6307{
6308	struct newblk *newblk;
6309
6310	/*
6311	 * If the journal hasn't been written the jnewblk must be passed
6312	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
6313	 * this by linking the journal dependency into the indirdep to be
6314	 * freed when indir_trunc() is called.  If the journal has already
6315	 * been written we can simply reclaim the journal space when the
6316	 * freeblks work is complete.
6317	 */
6318	LIST_REMOVE(aip, ai_next);
6319	newblk = (struct newblk *)aip;
6320	if (newblk->nb_jnewblk == NULL)
6321		cancel_newblk(newblk, &freeblks->fb_jwork);
6322	else
6323		cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
6324	if (inodedep && inodedep->id_state & DEPCOMPLETE)
6325		WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
6326	else
6327		free_newblk(newblk);
6328}
6329
6330/*
6331 * Create the mkdir dependencies for . and .. in a new directory.  Link them
6332 * in to a newdirblk so any subsequent additions are tracked properly.  The
6333 * caller is responsible for adding the mkdir1 dependency to the journal
6334 * and updating id_mkdiradd.  This function returns with lk held.
6335 */
6336static struct mkdir *
6337setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
6338	struct diradd *dap;
6339	ino_t newinum;
6340	ino_t dinum;
6341	struct buf *newdirbp;
6342	struct mkdir **mkdirp;
6343{
6344	struct newblk *newblk;
6345	struct pagedep *pagedep;
6346	struct inodedep *inodedep;
6347	struct newdirblk *newdirblk = 0;
6348	struct mkdir *mkdir1, *mkdir2;
6349	struct worklist *wk;
6350	struct jaddref *jaddref;
6351	struct mount *mp;
6352
6353	mp = dap->da_list.wk_mp;
6354	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
6355	    M_SOFTDEP_FLAGS);
6356	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6357	LIST_INIT(&newdirblk->db_mkdir);
6358	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6359	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
6360	mkdir1->md_state = ATTACHED | MKDIR_BODY;
6361	mkdir1->md_diradd = dap;
6362	mkdir1->md_jaddref = NULL;
6363	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6364	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
6365	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
6366	mkdir2->md_diradd = dap;
6367	mkdir2->md_jaddref = NULL;
6368	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
6369		mkdir1->md_state |= DEPCOMPLETE;
6370		mkdir2->md_state |= DEPCOMPLETE;
6371	}
6372	/*
6373	 * Dependency on "." and ".." being written to disk.
6374	 */
6375	mkdir1->md_buf = newdirbp;
6376	ACQUIRE_LOCK(&lk);
6377	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
6378	/*
6379	 * We must link the pagedep, allocdirect, and newdirblk for
6380	 * the initial file page so the pointer to the new directory
6381	 * is not written until the directory contents are live and
6382	 * any subsequent additions are not marked live until the
6383	 * block is reachable via the inode.
6384	 */
6385	if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
6386		panic("setup_newdir: lost pagedep");
6387	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
6388		if (wk->wk_type == D_ALLOCDIRECT)
6389			break;
6390	if (wk == NULL)
6391		panic("setup_newdir: lost allocdirect");
6392	newblk = WK_NEWBLK(wk);
6393	pagedep->pd_state |= NEWBLOCK;
6394	pagedep->pd_newdirblk = newdirblk;
6395	newdirblk->db_pagedep = pagedep;
6396	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6397	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
6398	/*
6399	 * Look up the inodedep for the parent directory so that we
6400	 * can link mkdir2 into the pending dotdot jaddref or
6401	 * the inode write if there is none.  If the inode is
6402	 * ALLCOMPLETE and no jaddref is present all dependencies have
6403	 * been satisfied and mkdir2 can be freed.
6404	 */
6405	inodedep_lookup(mp, dinum, 0, &inodedep);
6406	if (mp->mnt_kern_flag & MNTK_SUJ) {
6407		if (inodedep == NULL)
6408			panic("setup_newdir: Lost parent.");
6409		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6410		    inoreflst);
6411		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
6412		    (jaddref->ja_state & MKDIR_PARENT),
6413		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
6414		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6415		mkdir2->md_jaddref = jaddref;
6416		jaddref->ja_mkdir = mkdir2;
6417	} else if (inodedep == NULL ||
6418	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
6419		dap->da_state &= ~MKDIR_PARENT;
6420		WORKITEM_FREE(mkdir2, D_MKDIR);
6421	} else {
6422		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6423		WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
6424	}
6425	*mkdirp = mkdir2;
6426
6427	return (mkdir1);
6428}
6429
6430/*
6431 * Directory entry addition dependencies.
6432 *
6433 * When adding a new directory entry, the inode (with its incremented link
6434 * count) must be written to disk before the directory entry's pointer to it.
6435 * Also, if the inode is newly allocated, the corresponding freemap must be
6436 * updated (on disk) before the directory entry's pointer. These requirements
6437 * are met via undo/redo on the directory entry's pointer, which consists
6438 * simply of the inode number.
6439 *
6440 * As directory entries are added and deleted, the free space within a
6441 * directory block can become fragmented.  The ufs filesystem will compact
6442 * a fragmented directory block to make space for a new entry. When this
6443 * occurs, the offsets of previously added entries change. Any "diradd"
6444 * dependency structures corresponding to these entries must be updated with
6445 * the new offsets.
6446 */
6447
6448/*
6449 * This routine is called after the in-memory inode's link
6450 * count has been incremented, but before the directory entry's
6451 * pointer to the inode has been set.
6452 */
6453int
6454softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
6455	struct buf *bp;		/* buffer containing directory block */
6456	struct inode *dp;	/* inode for directory */
6457	off_t diroffset;	/* offset of new entry in directory */
6458	ino_t newinum;		/* inode referenced by new directory entry */
6459	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
6460	int isnewblk;		/* entry is in a newly allocated block */
6461{
6462	int offset;		/* offset of new entry within directory block */
6463	ufs_lbn_t lbn;		/* block in directory containing new entry */
6464	struct fs *fs;
6465	struct diradd *dap;
6466	struct newblk *newblk;
6467	struct pagedep *pagedep;
6468	struct inodedep *inodedep;
6469	struct newdirblk *newdirblk = 0;
6470	struct mkdir *mkdir1, *mkdir2;
6471	struct jaddref *jaddref;
6472	struct mount *mp;
6473	int isindir;
6474
6475	/*
6476	 * Whiteouts have no dependencies.
6477	 */
6478	if (newinum == WINO) {
6479		if (newdirbp != NULL)
6480			bdwrite(newdirbp);
6481		return (0);
6482	}
6483	jaddref = NULL;
6484	mkdir1 = mkdir2 = NULL;
6485	mp = UFSTOVFS(dp->i_ump);
6486	fs = dp->i_fs;
6487	lbn = lblkno(fs, diroffset);
6488	offset = blkoff(fs, diroffset);
6489	dap = malloc(sizeof(struct diradd), M_DIRADD,
6490		M_SOFTDEP_FLAGS|M_ZERO);
6491	workitem_alloc(&dap->da_list, D_DIRADD, mp);
6492	dap->da_offset = offset;
6493	dap->da_newinum = newinum;
6494	dap->da_state = ATTACHED;
6495	LIST_INIT(&dap->da_jwork);
6496	isindir = bp->b_lblkno >= NDADDR;
6497	if (isnewblk &&
6498	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
6499		newdirblk = malloc(sizeof(struct newdirblk),
6500		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
6501		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6502		LIST_INIT(&newdirblk->db_mkdir);
6503	}
6504	/*
6505	 * If we're creating a new directory setup the dependencies and set
6506	 * the dap state to wait for them.  Otherwise it's COMPLETE and
6507	 * we can move on.
6508	 */
6509	if (newdirbp == NULL) {
6510		dap->da_state |= DEPCOMPLETE;
6511		ACQUIRE_LOCK(&lk);
6512	} else {
6513		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
6514		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
6515		    &mkdir2);
6516	}
6517	/*
6518	 * Link into parent directory pagedep to await its being written.
6519	 */
6520	if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
6521		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6522#ifdef DEBUG
6523	if (diradd_lookup(pagedep, offset) != NULL)
6524		panic("softdep_setup_directory_add: %p already at off %d\n",
6525		    diradd_lookup(pagedep, offset), offset);
6526#endif
6527	dap->da_pagedep = pagedep;
6528	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
6529	    da_pdlist);
6530	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
6531	/*
6532	 * If we're journaling, link the diradd into the jaddref so it
6533	 * may be completed after the journal entry is written.  Otherwise,
6534	 * link the diradd into its inodedep.  If the inode is not yet
6535	 * written place it on the bufwait list, otherwise do the post-inode
6536	 * write processing to put it on the id_pendinghd list.
6537	 */
6538	if (mp->mnt_kern_flag & MNTK_SUJ) {
6539		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6540		    inoreflst);
6541		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
6542		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
6543		jaddref->ja_diroff = diroffset;
6544		jaddref->ja_diradd = dap;
6545		add_to_journal(&jaddref->ja_list);
6546	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
6547		diradd_inode_written(dap, inodedep);
6548	else
6549		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
6550	/*
6551	 * Add the journal entries for . and .. links now that the primary
6552	 * link is written.
6553	 */
6554	if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
6555		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
6556		    inoreflst, if_deps);
6557		KASSERT(jaddref != NULL &&
6558		    jaddref->ja_ino == jaddref->ja_parent &&
6559		    (jaddref->ja_state & MKDIR_BODY),
6560		    ("softdep_setup_directory_add: bad dot jaddref %p",
6561		    jaddref));
6562		mkdir1->md_jaddref = jaddref;
6563		jaddref->ja_mkdir = mkdir1;
6564		/*
6565		 * It is important that the dotdot journal entry
6566		 * is added prior to the dot entry since dot writes
6567		 * both the dot and dotdot links.  These both must
6568		 * be added after the primary link for the journal
6569		 * to remain consistent.
6570		 */
6571		add_to_journal(&mkdir2->md_jaddref->ja_list);
6572		add_to_journal(&jaddref->ja_list);
6573	}
6574	/*
6575	 * If we are adding a new directory remember this diradd so that if
6576	 * we rename it we can keep the dot and dotdot dependencies.  If
6577	 * we are adding a new name for an inode that has a mkdiradd we
6578	 * must be in rename and we have to move the dot and dotdot
6579	 * dependencies to this new name.  The old name is being orphaned
6580	 * soon.
6581	 */
6582	if (mkdir1 != NULL) {
6583		if (inodedep->id_mkdiradd != NULL)
6584			panic("softdep_setup_directory_add: Existing mkdir");
6585		inodedep->id_mkdiradd = dap;
6586	} else if (inodedep->id_mkdiradd)
6587		merge_diradd(inodedep, dap);
6588	if (newdirblk) {
6589		/*
6590		 * There is nothing to do if we are already tracking
6591		 * this block.
6592		 */
6593		if ((pagedep->pd_state & NEWBLOCK) != 0) {
6594			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
6595			FREE_LOCK(&lk);
6596			return (0);
6597		}
6598		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
6599		    == 0)
6600			panic("softdep_setup_directory_add: lost entry");
6601		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6602		pagedep->pd_state |= NEWBLOCK;
6603		pagedep->pd_newdirblk = newdirblk;
6604		newdirblk->db_pagedep = pagedep;
6605		FREE_LOCK(&lk);
6606		/*
6607		 * If we extended into an indirect signal direnter to sync.
6608		 */
6609		if (isindir)
6610			return (1);
6611		return (0);
6612	}
6613	FREE_LOCK(&lk);
6614	return (0);
6615}
6616
6617/*
6618 * This procedure is called to change the offset of a directory
6619 * entry when compacting a directory block which must be owned
6620 * exclusively by the caller. Note that the actual entry movement
6621 * must be done in this procedure to ensure that no I/O completions
6622 * occur while the move is in progress.
6623 */
6624void
6625softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
6626	struct buf *bp;		/* Buffer holding directory block. */
6627	struct inode *dp;	/* inode for directory */
6628	caddr_t base;		/* address of dp->i_offset */
6629	caddr_t oldloc;		/* address of old directory location */
6630	caddr_t newloc;		/* address of new directory location */
6631	int entrysize;		/* size of directory entry */
6632{
6633	int offset, oldoffset, newoffset;
6634	struct pagedep *pagedep;
6635	struct jmvref *jmvref;
6636	struct diradd *dap;
6637	struct direct *de;
6638	struct mount *mp;
6639	ufs_lbn_t lbn;
6640	int flags;
6641
6642	mp = UFSTOVFS(dp->i_ump);
6643	de = (struct direct *)oldloc;
6644	jmvref = NULL;
6645	flags = 0;
6646	/*
6647	 * Moves are always journaled as it would be too complex to
6648	 * determine if any affected adds or removes are present in the
6649	 * journal.
6650	 */
6651	if (mp->mnt_kern_flag & MNTK_SUJ)  {
6652		flags = DEPALLOC;
6653		jmvref = newjmvref(dp, de->d_ino,
6654		    dp->i_offset + (oldloc - base),
6655		    dp->i_offset + (newloc - base));
6656	}
6657	lbn = lblkno(dp->i_fs, dp->i_offset);
6658	offset = blkoff(dp->i_fs, dp->i_offset);
6659	oldoffset = offset + (oldloc - base);
6660	newoffset = offset + (newloc - base);
6661	ACQUIRE_LOCK(&lk);
6662	if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
6663		if (pagedep)
6664			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6665		goto done;
6666	}
6667	dap = diradd_lookup(pagedep, oldoffset);
6668	if (dap) {
6669		dap->da_offset = newoffset;
6670		newoffset = DIRADDHASH(newoffset);
6671		oldoffset = DIRADDHASH(oldoffset);
6672		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
6673		    newoffset != oldoffset) {
6674			LIST_REMOVE(dap, da_pdlist);
6675			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
6676			    dap, da_pdlist);
6677		}
6678	}
6679done:
6680	if (jmvref) {
6681		jmvref->jm_pagedep = pagedep;
6682		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
6683		add_to_journal(&jmvref->jm_list);
6684	}
6685	bcopy(oldloc, newloc, entrysize);
6686	FREE_LOCK(&lk);
6687}
6688
6689/*
6690 * Move the mkdir dependencies and journal work from one diradd to another
6691 * when renaming a directory.  The new name must depend on the mkdir deps
6692 * completing as the old name did.  Directories can only have one valid link
6693 * at a time so one must be canonical.
6694 */
6695static void
6696merge_diradd(inodedep, newdap)
6697	struct inodedep *inodedep;
6698	struct diradd *newdap;
6699{
6700	struct diradd *olddap;
6701	struct mkdir *mkdir, *nextmd;
6702	short state;
6703
6704	olddap = inodedep->id_mkdiradd;
6705	inodedep->id_mkdiradd = newdap;
6706	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6707		newdap->da_state &= ~DEPCOMPLETE;
6708		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
6709			nextmd = LIST_NEXT(mkdir, md_mkdirs);
6710			if (mkdir->md_diradd != olddap)
6711				continue;
6712			mkdir->md_diradd = newdap;
6713			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
6714			newdap->da_state |= state;
6715			olddap->da_state &= ~state;
6716			if ((olddap->da_state &
6717			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
6718				break;
6719		}
6720		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
6721			panic("merge_diradd: unfound ref");
6722	}
6723	/*
6724	 * Any mkdir related journal items are not safe to be freed until
6725	 * the new name is stable.
6726	 */
6727	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
6728	olddap->da_state |= DEPCOMPLETE;
6729	complete_diradd(olddap);
6730}
6731
6732/*
6733 * Move the diradd to the pending list when all diradd dependencies are
6734 * complete.
6735 */
6736static void
6737complete_diradd(dap)
6738	struct diradd *dap;
6739{
6740	struct pagedep *pagedep;
6741
6742	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
6743		if (dap->da_state & DIRCHG)
6744			pagedep = dap->da_previous->dm_pagedep;
6745		else
6746			pagedep = dap->da_pagedep;
6747		LIST_REMOVE(dap, da_pdlist);
6748		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
6749	}
6750}
6751
6752/*
6753 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
6754 * add entries and conditonally journal the remove.
6755 */
6756static void
6757cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
6758	struct diradd *dap;
6759	struct dirrem *dirrem;
6760	struct jremref *jremref;
6761	struct jremref *dotremref;
6762	struct jremref *dotdotremref;
6763{
6764	struct inodedep *inodedep;
6765	struct jaddref *jaddref;
6766	struct inoref *inoref;
6767	struct mkdir *mkdir;
6768
6769	/*
6770	 * If no remove references were allocated we're on a non-journaled
6771	 * filesystem and can skip the cancel step.
6772	 */
6773	if (jremref == NULL) {
6774		free_diradd(dap, NULL);
6775		return;
6776	}
6777	/*
6778	 * Cancel the primary name an free it if it does not require
6779	 * journaling.
6780	 */
6781	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
6782	    0, &inodedep) != 0) {
6783		/* Abort the addref that reference this diradd.  */
6784		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
6785			if (inoref->if_list.wk_type != D_JADDREF)
6786				continue;
6787			jaddref = (struct jaddref *)inoref;
6788			if (jaddref->ja_diradd != dap)
6789				continue;
6790			if (cancel_jaddref(jaddref, inodedep,
6791			    &dirrem->dm_jwork) == 0) {
6792				free_jremref(jremref);
6793				jremref = NULL;
6794			}
6795			break;
6796		}
6797	}
6798	/*
6799	 * Cancel subordinate names and free them if they do not require
6800	 * journaling.
6801	 */
6802	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6803		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
6804			if (mkdir->md_diradd != dap)
6805				continue;
6806			if ((jaddref = mkdir->md_jaddref) == NULL)
6807				continue;
6808			mkdir->md_jaddref = NULL;
6809			if (mkdir->md_state & MKDIR_PARENT) {
6810				if (cancel_jaddref(jaddref, NULL,
6811				    &dirrem->dm_jwork) == 0) {
6812					free_jremref(dotdotremref);
6813					dotdotremref = NULL;
6814				}
6815			} else {
6816				if (cancel_jaddref(jaddref, inodedep,
6817				    &dirrem->dm_jwork) == 0) {
6818					free_jremref(dotremref);
6819					dotremref = NULL;
6820				}
6821			}
6822		}
6823	}
6824
6825	if (jremref)
6826		journal_jremref(dirrem, jremref, inodedep);
6827	if (dotremref)
6828		journal_jremref(dirrem, dotremref, inodedep);
6829	if (dotdotremref)
6830		journal_jremref(dirrem, dotdotremref, NULL);
6831	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
6832	free_diradd(dap, &dirrem->dm_jwork);
6833}
6834
6835/*
6836 * Free a diradd dependency structure. This routine must be called
6837 * with splbio interrupts blocked.
6838 */
6839static void
6840free_diradd(dap, wkhd)
6841	struct diradd *dap;
6842	struct workhead *wkhd;
6843{
6844	struct dirrem *dirrem;
6845	struct pagedep *pagedep;
6846	struct inodedep *inodedep;
6847	struct mkdir *mkdir, *nextmd;
6848
6849	mtx_assert(&lk, MA_OWNED);
6850	LIST_REMOVE(dap, da_pdlist);
6851	if (dap->da_state & ONWORKLIST)
6852		WORKLIST_REMOVE(&dap->da_list);
6853	if ((dap->da_state & DIRCHG) == 0) {
6854		pagedep = dap->da_pagedep;
6855	} else {
6856		dirrem = dap->da_previous;
6857		pagedep = dirrem->dm_pagedep;
6858		dirrem->dm_dirinum = pagedep->pd_ino;
6859		dirrem->dm_state |= COMPLETE;
6860		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
6861			add_to_worklist(&dirrem->dm_list, 0);
6862	}
6863	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
6864	    0, &inodedep) != 0)
6865		if (inodedep->id_mkdiradd == dap)
6866			inodedep->id_mkdiradd = NULL;
6867	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6868		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
6869			nextmd = LIST_NEXT(mkdir, md_mkdirs);
6870			if (mkdir->md_diradd != dap)
6871				continue;
6872			dap->da_state &=
6873			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
6874			LIST_REMOVE(mkdir, md_mkdirs);
6875			if (mkdir->md_state & ONWORKLIST)
6876				WORKLIST_REMOVE(&mkdir->md_list);
6877			if (mkdir->md_jaddref != NULL)
6878				panic("free_diradd: Unexpected jaddref");
6879			WORKITEM_FREE(mkdir, D_MKDIR);
6880			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
6881				break;
6882		}
6883		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
6884			panic("free_diradd: unfound ref");
6885	}
6886	if (inodedep)
6887		free_inodedep(inodedep);
6888	/*
6889	 * Free any journal segments waiting for the directory write.
6890	 */
6891	handle_jwork(&dap->da_jwork);
6892	WORKITEM_FREE(dap, D_DIRADD);
6893}
6894
6895/*
6896 * Directory entry removal dependencies.
6897 *
6898 * When removing a directory entry, the entry's inode pointer must be
6899 * zero'ed on disk before the corresponding inode's link count is decremented
6900 * (possibly freeing the inode for re-use). This dependency is handled by
6901 * updating the directory entry but delaying the inode count reduction until
6902 * after the directory block has been written to disk. After this point, the
6903 * inode count can be decremented whenever it is convenient.
6904 */
6905
6906/*
6907 * This routine should be called immediately after removing
6908 * a directory entry.  The inode's link count should not be
6909 * decremented by the calling procedure -- the soft updates
6910 * code will do this task when it is safe.
6911 */
6912void
6913softdep_setup_remove(bp, dp, ip, isrmdir)
6914	struct buf *bp;		/* buffer containing directory block */
6915	struct inode *dp;	/* inode for the directory being modified */
6916	struct inode *ip;	/* inode for directory entry being removed */
6917	int isrmdir;		/* indicates if doing RMDIR */
6918{
6919	struct dirrem *dirrem, *prevdirrem;
6920	struct inodedep *inodedep;
6921	int direct;
6922
6923	/*
6924	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
6925	 * newdirrem() to setup the full directory remove which requires
6926	 * isrmdir > 1.
6927	 */
6928	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
6929	/*
6930	 * Add the dirrem to the inodedep's pending remove list for quick
6931	 * discovery later.
6932	 */
6933	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
6934	    &inodedep) == 0)
6935		panic("softdep_setup_remove: Lost inodedep.");
6936	dirrem->dm_state |= ONDEPLIST;
6937	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
6938
6939	/*
6940	 * If the COMPLETE flag is clear, then there were no active
6941	 * entries and we want to roll back to a zeroed entry until
6942	 * the new inode is committed to disk. If the COMPLETE flag is
6943	 * set then we have deleted an entry that never made it to
6944	 * disk. If the entry we deleted resulted from a name change,
6945	 * then the old name still resides on disk. We cannot delete
6946	 * its inode (returned to us in prevdirrem) until the zeroed
6947	 * directory entry gets to disk. The new inode has never been
6948	 * referenced on the disk, so can be deleted immediately.
6949	 */
6950	if ((dirrem->dm_state & COMPLETE) == 0) {
6951		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
6952		    dm_next);
6953		FREE_LOCK(&lk);
6954	} else {
6955		if (prevdirrem != NULL)
6956			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
6957			    prevdirrem, dm_next);
6958		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
6959		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
6960		FREE_LOCK(&lk);
6961		if (direct)
6962			handle_workitem_remove(dirrem, NULL);
6963	}
6964}
6965
6966/*
6967 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
6968 * pd_pendinghd list of a pagedep.
6969 */
6970static struct diradd *
6971diradd_lookup(pagedep, offset)
6972	struct pagedep *pagedep;
6973	int offset;
6974{
6975	struct diradd *dap;
6976
6977	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
6978		if (dap->da_offset == offset)
6979			return (dap);
6980	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
6981		if (dap->da_offset == offset)
6982			return (dap);
6983	return (NULL);
6984}
6985
6986/*
6987 * Search for a .. diradd dependency in a directory that is being removed.
6988 * If the directory was renamed to a new parent we have a diradd rather
6989 * than a mkdir for the .. entry.  We need to cancel it now before
6990 * it is found in truncate().
6991 */
6992static struct jremref *
6993cancel_diradd_dotdot(ip, dirrem, jremref)
6994	struct inode *ip;
6995	struct dirrem *dirrem;
6996	struct jremref *jremref;
6997{
6998	struct pagedep *pagedep;
6999	struct diradd *dap;
7000	struct worklist *wk;
7001
7002	if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
7003	    &pagedep) == 0)
7004		return (jremref);
7005	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
7006	if (dap == NULL)
7007		return (jremref);
7008	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
7009	/*
7010	 * Mark any journal work as belonging to the parent so it is freed
7011	 * with the .. reference.
7012	 */
7013	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
7014		wk->wk_state |= MKDIR_PARENT;
7015	return (NULL);
7016}
7017
7018/*
7019 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
7020 * replace it with a dirrem/diradd pair as a result of re-parenting a
7021 * directory.  This ensures that we don't simultaneously have a mkdir and
7022 * a diradd for the same .. entry.
7023 */
7024static struct jremref *
7025cancel_mkdir_dotdot(ip, dirrem, jremref)
7026	struct inode *ip;
7027	struct dirrem *dirrem;
7028	struct jremref *jremref;
7029{
7030	struct inodedep *inodedep;
7031	struct jaddref *jaddref;
7032	struct mkdir *mkdir;
7033	struct diradd *dap;
7034
7035	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
7036	    &inodedep) == 0)
7037		panic("cancel_mkdir_dotdot: Lost inodedep");
7038	dap = inodedep->id_mkdiradd;
7039	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
7040		return (jremref);
7041	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
7042	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
7043		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
7044			break;
7045	if (mkdir == NULL)
7046		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
7047	if ((jaddref = mkdir->md_jaddref) != NULL) {
7048		mkdir->md_jaddref = NULL;
7049		jaddref->ja_state &= ~MKDIR_PARENT;
7050		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
7051		    &inodedep) == 0)
7052			panic("cancel_mkdir_dotdot: Lost parent inodedep");
7053		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
7054			journal_jremref(dirrem, jremref, inodedep);
7055			jremref = NULL;
7056		}
7057	}
7058	if (mkdir->md_state & ONWORKLIST)
7059		WORKLIST_REMOVE(&mkdir->md_list);
7060	mkdir->md_state |= ALLCOMPLETE;
7061	complete_mkdir(mkdir);
7062	return (jremref);
7063}
7064
7065static void
7066journal_jremref(dirrem, jremref, inodedep)
7067	struct dirrem *dirrem;
7068	struct jremref *jremref;
7069	struct inodedep *inodedep;
7070{
7071
7072	if (inodedep == NULL)
7073		if (inodedep_lookup(jremref->jr_list.wk_mp,
7074		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
7075			panic("journal_jremref: Lost inodedep");
7076	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
7077	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
7078	add_to_journal(&jremref->jr_list);
7079}
7080
7081static void
7082dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
7083	struct dirrem *dirrem;
7084	struct jremref *jremref;
7085	struct jremref *dotremref;
7086	struct jremref *dotdotremref;
7087{
7088	struct inodedep *inodedep;
7089
7090
7091	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
7092	    &inodedep) == 0)
7093		panic("dirrem_journal: Lost inodedep");
7094	journal_jremref(dirrem, jremref, inodedep);
7095	if (dotremref)
7096		journal_jremref(dirrem, dotremref, inodedep);
7097	if (dotdotremref)
7098		journal_jremref(dirrem, dotdotremref, NULL);
7099}
7100
7101/*
7102 * Allocate a new dirrem if appropriate and return it along with
7103 * its associated pagedep. Called without a lock, returns with lock.
7104 */
7105static long num_dirrem;		/* number of dirrem allocated */
7106static struct dirrem *
7107newdirrem(bp, dp, ip, isrmdir, prevdirremp)
7108	struct buf *bp;		/* buffer containing directory block */
7109	struct inode *dp;	/* inode for the directory being modified */
7110	struct inode *ip;	/* inode for directory entry being removed */
7111	int isrmdir;		/* indicates if doing RMDIR */
7112	struct dirrem **prevdirremp; /* previously referenced inode, if any */
7113{
7114	int offset;
7115	ufs_lbn_t lbn;
7116	struct diradd *dap;
7117	struct dirrem *dirrem;
7118	struct pagedep *pagedep;
7119	struct jremref *jremref;
7120	struct jremref *dotremref;
7121	struct jremref *dotdotremref;
7122	struct vnode *dvp;
7123
7124	/*
7125	 * Whiteouts have no deletion dependencies.
7126	 */
7127	if (ip == NULL)
7128		panic("newdirrem: whiteout");
7129	dvp = ITOV(dp);
7130	/*
7131	 * If we are over our limit, try to improve the situation.
7132	 * Limiting the number of dirrem structures will also limit
7133	 * the number of freefile and freeblks structures.
7134	 */
7135	ACQUIRE_LOCK(&lk);
7136	if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2)
7137		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
7138	num_dirrem += 1;
7139	FREE_LOCK(&lk);
7140	dirrem = malloc(sizeof(struct dirrem),
7141		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
7142	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
7143	LIST_INIT(&dirrem->dm_jremrefhd);
7144	LIST_INIT(&dirrem->dm_jwork);
7145	dirrem->dm_state = isrmdir ? RMDIR : 0;
7146	dirrem->dm_oldinum = ip->i_number;
7147	*prevdirremp = NULL;
7148	/*
7149	 * Allocate remove reference structures to track journal write
7150	 * dependencies.  We will always have one for the link and
7151	 * when doing directories we will always have one more for dot.
7152	 * When renaming a directory we skip the dotdot link change so
7153	 * this is not needed.
7154	 */
7155	jremref = dotremref = dotdotremref = NULL;
7156	if (DOINGSUJ(dvp)) {
7157		if (isrmdir) {
7158			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
7159			    ip->i_effnlink + 2);
7160			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
7161			    ip->i_effnlink + 1);
7162			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
7163			    dp->i_effnlink + 1);
7164			dotdotremref->jr_state |= MKDIR_PARENT;
7165		} else
7166			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
7167			    ip->i_effnlink + 1);
7168	}
7169	ACQUIRE_LOCK(&lk);
7170	lbn = lblkno(dp->i_fs, dp->i_offset);
7171	offset = blkoff(dp->i_fs, dp->i_offset);
7172	if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
7173	    &pagedep) == 0)
7174		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
7175	dirrem->dm_pagedep = pagedep;
7176	/*
7177	 * If we're renaming a .. link to a new directory, cancel any
7178	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
7179	 * the jremref is preserved for any potential diradd in this
7180	 * location.  This can not coincide with a rmdir.
7181	 */
7182	if (dp->i_offset == DOTDOT_OFFSET) {
7183		if (isrmdir)
7184			panic("newdirrem: .. directory change during remove?");
7185		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
7186	}
7187	/*
7188	 * If we're removing a directory search for the .. dependency now and
7189	 * cancel it.  Any pending journal work will be added to the dirrem
7190	 * to be completed when the workitem remove completes.
7191	 */
7192	if (isrmdir)
7193		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
7194	/*
7195	 * Check for a diradd dependency for the same directory entry.
7196	 * If present, then both dependencies become obsolete and can
7197	 * be de-allocated.
7198	 */
7199	dap = diradd_lookup(pagedep, offset);
7200	if (dap == NULL) {
7201		/*
7202		 * Link the jremref structures into the dirrem so they are
7203		 * written prior to the pagedep.
7204		 */
7205		if (jremref)
7206			dirrem_journal(dirrem, jremref, dotremref,
7207			    dotdotremref);
7208		return (dirrem);
7209	}
7210	/*
7211	 * Must be ATTACHED at this point.
7212	 */
7213	if ((dap->da_state & ATTACHED) == 0)
7214		panic("newdirrem: not ATTACHED");
7215	if (dap->da_newinum != ip->i_number)
7216		panic("newdirrem: inum %d should be %d",
7217		    ip->i_number, dap->da_newinum);
7218	/*
7219	 * If we are deleting a changed name that never made it to disk,
7220	 * then return the dirrem describing the previous inode (which
7221	 * represents the inode currently referenced from this entry on disk).
7222	 */
7223	if ((dap->da_state & DIRCHG) != 0) {
7224		*prevdirremp = dap->da_previous;
7225		dap->da_state &= ~DIRCHG;
7226		dap->da_pagedep = pagedep;
7227	}
7228	/*
7229	 * We are deleting an entry that never made it to disk.
7230	 * Mark it COMPLETE so we can delete its inode immediately.
7231	 */
7232	dirrem->dm_state |= COMPLETE;
7233	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
7234#ifdef SUJ_DEBUG
7235	if (isrmdir == 0) {
7236		struct worklist *wk;
7237
7238		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
7239			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
7240				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
7241	}
7242#endif
7243
7244	return (dirrem);
7245}
7246
7247/*
7248 * Directory entry change dependencies.
7249 *
7250 * Changing an existing directory entry requires that an add operation
7251 * be completed first followed by a deletion. The semantics for the addition
7252 * are identical to the description of adding a new entry above except
7253 * that the rollback is to the old inode number rather than zero. Once
7254 * the addition dependency is completed, the removal is done as described
7255 * in the removal routine above.
7256 */
7257
7258/*
7259 * This routine should be called immediately after changing
7260 * a directory entry.  The inode's link count should not be
7261 * decremented by the calling procedure -- the soft updates
7262 * code will perform this task when it is safe.
7263 */
7264void
7265softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
7266	struct buf *bp;		/* buffer containing directory block */
7267	struct inode *dp;	/* inode for the directory being modified */
7268	struct inode *ip;	/* inode for directory entry being removed */
7269	ino_t newinum;		/* new inode number for changed entry */
7270	int isrmdir;		/* indicates if doing RMDIR */
7271{
7272	int offset;
7273	struct diradd *dap = NULL;
7274	struct dirrem *dirrem, *prevdirrem;
7275	struct pagedep *pagedep;
7276	struct inodedep *inodedep;
7277	struct jaddref *jaddref;
7278	struct mount *mp;
7279
7280	offset = blkoff(dp->i_fs, dp->i_offset);
7281	mp = UFSTOVFS(dp->i_ump);
7282
7283	/*
7284	 * Whiteouts do not need diradd dependencies.
7285	 */
7286	if (newinum != WINO) {
7287		dap = malloc(sizeof(struct diradd),
7288		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
7289		workitem_alloc(&dap->da_list, D_DIRADD, mp);
7290		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
7291		dap->da_offset = offset;
7292		dap->da_newinum = newinum;
7293		LIST_INIT(&dap->da_jwork);
7294	}
7295
7296	/*
7297	 * Allocate a new dirrem and ACQUIRE_LOCK.
7298	 */
7299	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
7300	pagedep = dirrem->dm_pagedep;
7301	/*
7302	 * The possible values for isrmdir:
7303	 *	0 - non-directory file rename
7304	 *	1 - directory rename within same directory
7305	 *   inum - directory rename to new directory of given inode number
7306	 * When renaming to a new directory, we are both deleting and
7307	 * creating a new directory entry, so the link count on the new
7308	 * directory should not change. Thus we do not need the followup
7309	 * dirrem which is usually done in handle_workitem_remove. We set
7310	 * the DIRCHG flag to tell handle_workitem_remove to skip the
7311	 * followup dirrem.
7312	 */
7313	if (isrmdir > 1)
7314		dirrem->dm_state |= DIRCHG;
7315
7316	/*
7317	 * Whiteouts have no additional dependencies,
7318	 * so just put the dirrem on the correct list.
7319	 */
7320	if (newinum == WINO) {
7321		if ((dirrem->dm_state & COMPLETE) == 0) {
7322			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
7323			    dm_next);
7324		} else {
7325			dirrem->dm_dirinum = pagedep->pd_ino;
7326			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7327				add_to_worklist(&dirrem->dm_list, 0);
7328		}
7329		FREE_LOCK(&lk);
7330		return;
7331	}
7332	/*
7333	 * Add the dirrem to the inodedep's pending remove list for quick
7334	 * discovery later.  A valid nlinkdelta ensures that this lookup
7335	 * will not fail.
7336	 */
7337	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
7338		panic("softdep_setup_directory_change: Lost inodedep.");
7339	dirrem->dm_state |= ONDEPLIST;
7340	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7341
7342	/*
7343	 * If the COMPLETE flag is clear, then there were no active
7344	 * entries and we want to roll back to the previous inode until
7345	 * the new inode is committed to disk. If the COMPLETE flag is
7346	 * set, then we have deleted an entry that never made it to disk.
7347	 * If the entry we deleted resulted from a name change, then the old
7348	 * inode reference still resides on disk. Any rollback that we do
7349	 * needs to be to that old inode (returned to us in prevdirrem). If
7350	 * the entry we deleted resulted from a create, then there is
7351	 * no entry on the disk, so we want to roll back to zero rather
7352	 * than the uncommitted inode. In either of the COMPLETE cases we
7353	 * want to immediately free the unwritten and unreferenced inode.
7354	 */
7355	if ((dirrem->dm_state & COMPLETE) == 0) {
7356		dap->da_previous = dirrem;
7357	} else {
7358		if (prevdirrem != NULL) {
7359			dap->da_previous = prevdirrem;
7360		} else {
7361			dap->da_state &= ~DIRCHG;
7362			dap->da_pagedep = pagedep;
7363		}
7364		dirrem->dm_dirinum = pagedep->pd_ino;
7365		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7366			add_to_worklist(&dirrem->dm_list, 0);
7367	}
7368	/*
7369	 * Lookup the jaddref for this journal entry.  We must finish
7370	 * initializing it and make the diradd write dependent on it.
7371	 * If we're not journaling Put it on the id_bufwait list if the inode
7372	 * is not yet written. If it is written, do the post-inode write
7373	 * processing to put it on the id_pendinghd list.
7374	 */
7375	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
7376	if (mp->mnt_kern_flag & MNTK_SUJ) {
7377		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
7378		    inoreflst);
7379		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
7380		    ("softdep_setup_directory_change: bad jaddref %p",
7381		    jaddref));
7382		jaddref->ja_diroff = dp->i_offset;
7383		jaddref->ja_diradd = dap;
7384		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7385		    dap, da_pdlist);
7386		add_to_journal(&jaddref->ja_list);
7387	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
7388		dap->da_state |= COMPLETE;
7389		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
7390		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
7391	} else {
7392		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7393		    dap, da_pdlist);
7394		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
7395	}
7396	/*
7397	 * If we're making a new name for a directory that has not been
7398	 * committed when need to move the dot and dotdot references to
7399	 * this new name.
7400	 */
7401	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
7402		merge_diradd(inodedep, dap);
7403	FREE_LOCK(&lk);
7404}
7405
7406/*
7407 * Called whenever the link count on an inode is changed.
7408 * It creates an inode dependency so that the new reference(s)
7409 * to the inode cannot be committed to disk until the updated
7410 * inode has been written.
7411 */
7412void
7413softdep_change_linkcnt(ip)
7414	struct inode *ip;	/* the inode with the increased link count */
7415{
7416	struct inodedep *inodedep;
7417
7418	ACQUIRE_LOCK(&lk);
7419	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
7420	if (ip->i_nlink < ip->i_effnlink)
7421		panic("softdep_change_linkcnt: bad delta");
7422	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7423	FREE_LOCK(&lk);
7424}
7425
7426/*
7427 * Attach a sbdep dependency to the superblock buf so that we can keep
7428 * track of the head of the linked list of referenced but unlinked inodes.
7429 */
7430void
7431softdep_setup_sbupdate(ump, fs, bp)
7432	struct ufsmount *ump;
7433	struct fs *fs;
7434	struct buf *bp;
7435{
7436	struct sbdep *sbdep;
7437	struct worklist *wk;
7438
7439	if ((fs->fs_flags & FS_SUJ) == 0)
7440		return;
7441	LIST_FOREACH(wk, &bp->b_dep, wk_list)
7442		if (wk->wk_type == D_SBDEP)
7443			break;
7444	if (wk != NULL)
7445		return;
7446	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
7447	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
7448	sbdep->sb_fs = fs;
7449	sbdep->sb_ump = ump;
7450	ACQUIRE_LOCK(&lk);
7451	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
7452	FREE_LOCK(&lk);
7453}
7454
7455/*
7456 * Return the first unlinked inodedep which is ready to be the head of the
7457 * list.  The inodedep and all those after it must have valid next pointers.
7458 */
7459static struct inodedep *
7460first_unlinked_inodedep(ump)
7461	struct ufsmount *ump;
7462{
7463	struct inodedep *inodedep;
7464	struct inodedep *idp;
7465
7466	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
7467	    inodedep; inodedep = idp) {
7468		if ((inodedep->id_state & UNLINKNEXT) == 0)
7469			return (NULL);
7470		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7471		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
7472			break;
7473		if ((inodedep->id_state & UNLINKPREV) == 0)
7474			panic("first_unlinked_inodedep: prev != next");
7475	}
7476	if (inodedep == NULL)
7477		return (NULL);
7478
7479	return (inodedep);
7480}
7481
7482/*
7483 * Set the sujfree unlinked head pointer prior to writing a superblock.
7484 */
7485static void
7486initiate_write_sbdep(sbdep)
7487	struct sbdep *sbdep;
7488{
7489	struct inodedep *inodedep;
7490	struct fs *bpfs;
7491	struct fs *fs;
7492
7493	bpfs = sbdep->sb_fs;
7494	fs = sbdep->sb_ump->um_fs;
7495	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7496	if (inodedep) {
7497		fs->fs_sujfree = inodedep->id_ino;
7498		inodedep->id_state |= UNLINKPREV;
7499	} else
7500		fs->fs_sujfree = 0;
7501	bpfs->fs_sujfree = fs->fs_sujfree;
7502}
7503
7504/*
7505 * After a superblock is written determine whether it must be written again
7506 * due to a changing unlinked list head.
7507 */
7508static int
7509handle_written_sbdep(sbdep, bp)
7510	struct sbdep *sbdep;
7511	struct buf *bp;
7512{
7513	struct inodedep *inodedep;
7514	struct mount *mp;
7515	struct fs *fs;
7516
7517	fs = sbdep->sb_fs;
7518	mp = UFSTOVFS(sbdep->sb_ump);
7519	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7520	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
7521	    (inodedep == NULL && fs->fs_sujfree != 0)) {
7522		bdirty(bp);
7523		return (1);
7524	}
7525	WORKITEM_FREE(sbdep, D_SBDEP);
7526	if (fs->fs_sujfree == 0)
7527		return (0);
7528	if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
7529		panic("handle_written_sbdep: lost inodedep");
7530	/*
7531	 * Now that we have a record of this inode in stable store allow it
7532	 * to be written to free up pending work.  Inodes may see a lot of
7533	 * write activity after they are unlinked which we must not hold up.
7534	 */
7535	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
7536		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
7537			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
7538			    inodedep, inodedep->id_state);
7539		if (inodedep->id_state & UNLINKONLIST)
7540			break;
7541		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
7542	}
7543
7544	return (0);
7545}
7546
7547/*
7548 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
7549 */
7550static void
7551unlinked_inodedep(mp, inodedep)
7552	struct mount *mp;
7553	struct inodedep *inodedep;
7554{
7555	struct ufsmount *ump;
7556
7557	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
7558		return;
7559	ump = VFSTOUFS(mp);
7560	ump->um_fs->fs_fmod = 1;
7561	inodedep->id_state |= UNLINKED;
7562	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
7563}
7564
7565/*
7566 * Remove an inodedep from the unlinked inodedep list.  This may require
7567 * disk writes if the inode has made it that far.
7568 */
7569static void
7570clear_unlinked_inodedep(inodedep)
7571	struct inodedep *inodedep;
7572{
7573	struct ufsmount *ump;
7574	struct inodedep *idp;
7575	struct inodedep *idn;
7576	struct fs *fs;
7577	struct buf *bp;
7578	ino_t ino;
7579	ino_t nino;
7580	ino_t pino;
7581	int error;
7582
7583	ump = VFSTOUFS(inodedep->id_list.wk_mp);
7584	fs = ump->um_fs;
7585	ino = inodedep->id_ino;
7586	error = 0;
7587	for (;;) {
7588		/*
7589		 * If nothing has yet been written simply remove us from
7590		 * the in memory list and return.  This is the most common
7591		 * case where handle_workitem_remove() loses the final
7592		 * reference.
7593		 */
7594		if ((inodedep->id_state & UNLINKLINKS) == 0)
7595			break;
7596		/*
7597		 * If we have a NEXT pointer and no PREV pointer we can simply
7598		 * clear NEXT's PREV and remove ourselves from the list.  Be
7599		 * careful not to clear PREV if the superblock points at
7600		 * next as well.
7601		 */
7602		idn = TAILQ_NEXT(inodedep, id_unlinked);
7603		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
7604			if (idn && fs->fs_sujfree != idn->id_ino)
7605				idn->id_state &= ~UNLINKPREV;
7606			break;
7607		}
7608		/*
7609		 * Here we have an inodedep which is actually linked into
7610		 * the list.  We must remove it by forcing a write to the
7611		 * link before us, whether it be the superblock or an inode.
7612		 * Unfortunately the list may change while we're waiting
7613		 * on the buf lock for either resource so we must loop until
7614		 * we lock the right one.  If both the superblock and an
7615		 * inode point to this inode we must clear the inode first
7616		 * followed by the superblock.
7617		 */
7618		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7619		pino = 0;
7620		if (idp && (idp->id_state & UNLINKNEXT))
7621			pino = idp->id_ino;
7622		FREE_LOCK(&lk);
7623		if (pino == 0)
7624			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7625			    (int)fs->fs_sbsize, 0, 0, 0);
7626		else
7627			error = bread(ump->um_devvp,
7628			    fsbtodb(fs, ino_to_fsba(fs, pino)),
7629			    (int)fs->fs_bsize, NOCRED, &bp);
7630		ACQUIRE_LOCK(&lk);
7631		if (error)
7632			break;
7633		/* If the list has changed restart the loop. */
7634		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7635		nino = 0;
7636		if (idp && (idp->id_state & UNLINKNEXT))
7637			nino = idp->id_ino;
7638		if (nino != pino ||
7639		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
7640			FREE_LOCK(&lk);
7641			brelse(bp);
7642			ACQUIRE_LOCK(&lk);
7643			continue;
7644		}
7645		/*
7646		 * Remove us from the in memory list.  After this we cannot
7647		 * access the inodedep.
7648		 */
7649		idn = TAILQ_NEXT(inodedep, id_unlinked);
7650		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
7651		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
7652		/*
7653		 * Determine the next inode number.
7654		 */
7655		nino = 0;
7656		if (idn) {
7657			/*
7658			 * If next isn't on the list we can just clear prev's
7659			 * state and schedule it to be fixed later.  No need
7660			 * to synchronously write if we're not in the real
7661			 * list.
7662			 */
7663			if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
7664				idp->id_state &= ~UNLINKNEXT;
7665				if ((idp->id_state & ONWORKLIST) == 0)
7666					WORKLIST_INSERT(&bp->b_dep,
7667					    &idp->id_list);
7668				FREE_LOCK(&lk);
7669				bawrite(bp);
7670				ACQUIRE_LOCK(&lk);
7671				return;
7672			}
7673			nino = idn->id_ino;
7674		}
7675		FREE_LOCK(&lk);
7676		/*
7677		 * The predecessor's next pointer is manually updated here
7678		 * so that the NEXT flag is never cleared for an element
7679		 * that is in the list.
7680		 */
7681		if (pino == 0) {
7682			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
7683			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
7684			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
7685			    bp);
7686		} else if (fs->fs_magic == FS_UFS1_MAGIC)
7687			((struct ufs1_dinode *)bp->b_data +
7688			    ino_to_fsbo(fs, pino))->di_freelink = nino;
7689		else
7690			((struct ufs2_dinode *)bp->b_data +
7691			    ino_to_fsbo(fs, pino))->di_freelink = nino;
7692		/*
7693		 * If the bwrite fails we have no recourse to recover.  The
7694		 * filesystem is corrupted already.
7695		 */
7696		bwrite(bp);
7697		ACQUIRE_LOCK(&lk);
7698		/*
7699		 * If the superblock pointer still needs to be cleared force
7700		 * a write here.
7701		 */
7702		if (fs->fs_sujfree == ino) {
7703			FREE_LOCK(&lk);
7704			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7705			    (int)fs->fs_sbsize, 0, 0, 0);
7706			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
7707			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
7708			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
7709			    bp);
7710			bwrite(bp);
7711			ACQUIRE_LOCK(&lk);
7712		}
7713		if (fs->fs_sujfree != ino)
7714			return;
7715		panic("clear_unlinked_inodedep: Failed to clear free head");
7716	}
7717	if (inodedep->id_ino == fs->fs_sujfree)
7718		panic("clear_unlinked_inodedep: Freeing head of free list");
7719	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
7720	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
7721	return;
7722}
7723
7724/*
7725 * This workitem decrements the inode's link count.
7726 * If the link count reaches zero, the file is removed.
7727 */
7728static void
7729handle_workitem_remove(dirrem, xp)
7730	struct dirrem *dirrem;
7731	struct vnode *xp;
7732{
7733	struct inodedep *inodedep;
7734	struct workhead dotdotwk;
7735	struct worklist *wk;
7736	struct ufsmount *ump;
7737	struct mount *mp;
7738	struct vnode *vp;
7739	struct inode *ip;
7740	ino_t oldinum;
7741	int error;
7742
7743	if (dirrem->dm_state & ONWORKLIST)
7744		panic("handle_workitem_remove: dirrem %p still on worklist",
7745		    dirrem);
7746	oldinum = dirrem->dm_oldinum;
7747	mp = dirrem->dm_list.wk_mp;
7748	ump = VFSTOUFS(mp);
7749	if ((vp = xp) == NULL &&
7750	    (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
7751	    FFSV_FORCEINSMQ)) != 0) {
7752		softdep_error("handle_workitem_remove: vget", error);
7753		return;
7754	}
7755	ip = VTOI(vp);
7756	ACQUIRE_LOCK(&lk);
7757	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
7758		panic("handle_workitem_remove: lost inodedep");
7759	if (dirrem->dm_state & ONDEPLIST)
7760		LIST_REMOVE(dirrem, dm_inonext);
7761	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
7762	    ("handle_workitem_remove:  Journal entries not written."));
7763
7764	/*
7765	 * Move all dependencies waiting on the remove to complete
7766	 * from the dirrem to the inode inowait list to be completed
7767	 * after the inode has been updated and written to disk.  Any
7768	 * marked MKDIR_PARENT are saved to be completed when the .. ref
7769	 * is removed.
7770	 */
7771	LIST_INIT(&dotdotwk);
7772	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
7773		WORKLIST_REMOVE(wk);
7774		if (wk->wk_state & MKDIR_PARENT) {
7775			wk->wk_state &= ~MKDIR_PARENT;
7776			WORKLIST_INSERT(&dotdotwk, wk);
7777			continue;
7778		}
7779		WORKLIST_INSERT(&inodedep->id_inowait, wk);
7780	}
7781	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
7782	/*
7783	 * Normal file deletion.
7784	 */
7785	if ((dirrem->dm_state & RMDIR) == 0) {
7786		ip->i_nlink--;
7787		DIP_SET(ip, i_nlink, ip->i_nlink);
7788		ip->i_flag |= IN_CHANGE;
7789		if (ip->i_nlink < ip->i_effnlink)
7790			panic("handle_workitem_remove: bad file delta");
7791		if (ip->i_nlink == 0)
7792			unlinked_inodedep(mp, inodedep);
7793		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7794		num_dirrem -= 1;
7795		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
7796		    ("handle_workitem_remove: worklist not empty. %s",
7797		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
7798		WORKITEM_FREE(dirrem, D_DIRREM);
7799		FREE_LOCK(&lk);
7800		goto out;
7801	}
7802	/*
7803	 * Directory deletion. Decrement reference count for both the
7804	 * just deleted parent directory entry and the reference for ".".
7805	 * Arrange to have the reference count on the parent decremented
7806	 * to account for the loss of "..".
7807	 */
7808	ip->i_nlink -= 2;
7809	DIP_SET(ip, i_nlink, ip->i_nlink);
7810	ip->i_flag |= IN_CHANGE;
7811	if (ip->i_nlink < ip->i_effnlink)
7812		panic("handle_workitem_remove: bad dir delta");
7813	if (ip->i_nlink == 0)
7814		unlinked_inodedep(mp, inodedep);
7815	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7816	/*
7817	 * Rename a directory to a new parent. Since, we are both deleting
7818	 * and creating a new directory entry, the link count on the new
7819	 * directory should not change. Thus we skip the followup dirrem.
7820	 */
7821	if (dirrem->dm_state & DIRCHG) {
7822		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
7823		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
7824		num_dirrem -= 1;
7825		WORKITEM_FREE(dirrem, D_DIRREM);
7826		FREE_LOCK(&lk);
7827		goto out;
7828	}
7829	dirrem->dm_state = ONDEPLIST;
7830	dirrem->dm_oldinum = dirrem->dm_dirinum;
7831	/*
7832	 * Place the dirrem on the parent's diremhd list.
7833	 */
7834	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
7835		panic("handle_workitem_remove: lost dir inodedep");
7836	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7837	/*
7838	 * If the allocated inode has never been written to disk, then
7839	 * the on-disk inode is zero'ed and we can remove the file
7840	 * immediately.  When journaling if the inode has been marked
7841	 * unlinked and not DEPCOMPLETE we know it can never be written.
7842	 */
7843	inodedep_lookup(mp, oldinum, 0, &inodedep);
7844	if (inodedep == NULL ||
7845	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
7846	    check_inode_unwritten(inodedep)) {
7847		if (xp != NULL)
7848			add_to_worklist(&dirrem->dm_list, 0);
7849		FREE_LOCK(&lk);
7850		if (xp == NULL) {
7851			vput(vp);
7852			handle_workitem_remove(dirrem, NULL);
7853		}
7854		return;
7855	}
7856	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
7857	FREE_LOCK(&lk);
7858	ip->i_flag |= IN_CHANGE;
7859out:
7860	ffs_update(vp, 0);
7861	if (xp == NULL)
7862		vput(vp);
7863}
7864
7865/*
7866 * Inode de-allocation dependencies.
7867 *
7868 * When an inode's link count is reduced to zero, it can be de-allocated. We
7869 * found it convenient to postpone de-allocation until after the inode is
7870 * written to disk with its new link count (zero).  At this point, all of the
7871 * on-disk inode's block pointers are nullified and, with careful dependency
7872 * list ordering, all dependencies related to the inode will be satisfied and
7873 * the corresponding dependency structures de-allocated.  So, if/when the
7874 * inode is reused, there will be no mixing of old dependencies with new
7875 * ones.  This artificial dependency is set up by the block de-allocation
7876 * procedure above (softdep_setup_freeblocks) and completed by the
7877 * following procedure.
7878 */
7879static void
7880handle_workitem_freefile(freefile)
7881	struct freefile *freefile;
7882{
7883	struct workhead wkhd;
7884	struct fs *fs;
7885	struct inodedep *idp;
7886	struct ufsmount *ump;
7887	int error;
7888
7889	ump = VFSTOUFS(freefile->fx_list.wk_mp);
7890	fs = ump->um_fs;
7891#ifdef DEBUG
7892	ACQUIRE_LOCK(&lk);
7893	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
7894	FREE_LOCK(&lk);
7895	if (error)
7896		panic("handle_workitem_freefile: inodedep %p survived", idp);
7897#endif
7898	UFS_LOCK(ump);
7899	fs->fs_pendinginodes -= 1;
7900	UFS_UNLOCK(ump);
7901	LIST_INIT(&wkhd);
7902	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
7903	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
7904	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
7905		softdep_error("handle_workitem_freefile", error);
7906	ACQUIRE_LOCK(&lk);
7907	WORKITEM_FREE(freefile, D_FREEFILE);
7908	FREE_LOCK(&lk);
7909}
7910
7911
7912/*
7913 * Helper function which unlinks marker element from work list and returns
7914 * the next element on the list.
7915 */
7916static __inline struct worklist *
7917markernext(struct worklist *marker)
7918{
7919	struct worklist *next;
7920
7921	next = LIST_NEXT(marker, wk_list);
7922	LIST_REMOVE(marker, wk_list);
7923	return next;
7924}
7925
7926/*
7927 * Disk writes.
7928 *
7929 * The dependency structures constructed above are most actively used when file
7930 * system blocks are written to disk.  No constraints are placed on when a
7931 * block can be written, but unsatisfied update dependencies are made safe by
7932 * modifying (or replacing) the source memory for the duration of the disk
7933 * write.  When the disk write completes, the memory block is again brought
7934 * up-to-date.
7935 *
7936 * In-core inode structure reclamation.
7937 *
7938 * Because there are a finite number of "in-core" inode structures, they are
7939 * reused regularly.  By transferring all inode-related dependencies to the
7940 * in-memory inode block and indexing them separately (via "inodedep"s), we
7941 * can allow "in-core" inode structures to be reused at any time and avoid
7942 * any increase in contention.
7943 *
7944 * Called just before entering the device driver to initiate a new disk I/O.
7945 * The buffer must be locked, thus, no I/O completion operations can occur
7946 * while we are manipulating its associated dependencies.
7947 */
7948static void
7949softdep_disk_io_initiation(bp)
7950	struct buf *bp;		/* structure describing disk write to occur */
7951{
7952	struct worklist *wk;
7953	struct worklist marker;
7954	struct inodedep *inodedep;
7955	struct freeblks *freeblks;
7956	struct jfreeblk *jfreeblk;
7957	struct newblk *newblk;
7958
7959	/*
7960	 * We only care about write operations. There should never
7961	 * be dependencies for reads.
7962	 */
7963	if (bp->b_iocmd != BIO_WRITE)
7964		panic("softdep_disk_io_initiation: not write");
7965
7966	if (bp->b_vflags & BV_BKGRDINPROG)
7967		panic("softdep_disk_io_initiation: Writing buffer with "
7968		    "background write in progress: %p", bp);
7969
7970	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
7971	PHOLD(curproc);			/* Don't swap out kernel stack */
7972
7973	ACQUIRE_LOCK(&lk);
7974	/*
7975	 * Do any necessary pre-I/O processing.
7976	 */
7977	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
7978	     wk = markernext(&marker)) {
7979		LIST_INSERT_AFTER(wk, &marker, wk_list);
7980		switch (wk->wk_type) {
7981
7982		case D_PAGEDEP:
7983			initiate_write_filepage(WK_PAGEDEP(wk), bp);
7984			continue;
7985
7986		case D_INODEDEP:
7987			inodedep = WK_INODEDEP(wk);
7988			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
7989				initiate_write_inodeblock_ufs1(inodedep, bp);
7990			else
7991				initiate_write_inodeblock_ufs2(inodedep, bp);
7992			continue;
7993
7994		case D_INDIRDEP:
7995			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
7996			continue;
7997
7998		case D_BMSAFEMAP:
7999			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
8000			continue;
8001
8002		case D_JSEG:
8003			WK_JSEG(wk)->js_buf = NULL;
8004			continue;
8005
8006		case D_FREEBLKS:
8007			freeblks = WK_FREEBLKS(wk);
8008			jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
8009			/*
8010			 * We have to wait for the jfreeblks to be journaled
8011			 * before we can write an inodeblock with updated
8012			 * pointers.  Be careful to arrange the marker so
8013			 * we revisit the jfreeblk if it's not removed by
8014			 * the first jwait().
8015			 */
8016			if (jfreeblk != NULL) {
8017				LIST_REMOVE(&marker, wk_list);
8018				LIST_INSERT_BEFORE(wk, &marker, wk_list);
8019				jwait(&jfreeblk->jf_list);
8020			}
8021			continue;
8022		case D_ALLOCDIRECT:
8023		case D_ALLOCINDIR:
8024			/*
8025			 * We have to wait for the jnewblk to be journaled
8026			 * before we can write to a block otherwise the
8027			 * contents may be confused with an earlier file
8028			 * at recovery time.  Handle the marker as described
8029			 * above.
8030			 */
8031			newblk = WK_NEWBLK(wk);
8032			if (newblk->nb_jnewblk != NULL) {
8033				LIST_REMOVE(&marker, wk_list);
8034				LIST_INSERT_BEFORE(wk, &marker, wk_list);
8035				jwait(&newblk->nb_jnewblk->jn_list);
8036			}
8037			continue;
8038
8039		case D_SBDEP:
8040			initiate_write_sbdep(WK_SBDEP(wk));
8041			continue;
8042
8043		case D_MKDIR:
8044		case D_FREEWORK:
8045		case D_FREEDEP:
8046		case D_JSEGDEP:
8047			continue;
8048
8049		default:
8050			panic("handle_disk_io_initiation: Unexpected type %s",
8051			    TYPENAME(wk->wk_type));
8052			/* NOTREACHED */
8053		}
8054	}
8055	FREE_LOCK(&lk);
8056	PRELE(curproc);			/* Allow swapout of kernel stack */
8057}
8058
8059/*
8060 * Called from within the procedure above to deal with unsatisfied
8061 * allocation dependencies in a directory. The buffer must be locked,
8062 * thus, no I/O completion operations can occur while we are
8063 * manipulating its associated dependencies.
8064 */
8065static void
8066initiate_write_filepage(pagedep, bp)
8067	struct pagedep *pagedep;
8068	struct buf *bp;
8069{
8070	struct jremref *jremref;
8071	struct jmvref *jmvref;
8072	struct dirrem *dirrem;
8073	struct diradd *dap;
8074	struct direct *ep;
8075	int i;
8076
8077	if (pagedep->pd_state & IOSTARTED) {
8078		/*
8079		 * This can only happen if there is a driver that does not
8080		 * understand chaining. Here biodone will reissue the call
8081		 * to strategy for the incomplete buffers.
8082		 */
8083		printf("initiate_write_filepage: already started\n");
8084		return;
8085	}
8086	pagedep->pd_state |= IOSTARTED;
8087	/*
8088	 * Wait for all journal remove dependencies to hit the disk.
8089	 * We can not allow any potentially conflicting directory adds
8090	 * to be visible before removes and rollback is too difficult.
8091	 * lk may be dropped and re-acquired, however we hold the buf
8092	 * locked so the dependency can not go away.
8093	 */
8094	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
8095		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
8096			stat_jwait_filepage++;
8097			jwait(&jremref->jr_list);
8098		}
8099	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
8100		stat_jwait_filepage++;
8101		jwait(&jmvref->jm_list);
8102	}
8103	for (i = 0; i < DAHASHSZ; i++) {
8104		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
8105			ep = (struct direct *)
8106			    ((char *)bp->b_data + dap->da_offset);
8107			if (ep->d_ino != dap->da_newinum)
8108				panic("%s: dir inum %d != new %d",
8109				    "initiate_write_filepage",
8110				    ep->d_ino, dap->da_newinum);
8111			if (dap->da_state & DIRCHG)
8112				ep->d_ino = dap->da_previous->dm_oldinum;
8113			else
8114				ep->d_ino = 0;
8115			dap->da_state &= ~ATTACHED;
8116			dap->da_state |= UNDONE;
8117		}
8118	}
8119}
8120
8121/*
8122 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
8123 * Note that any bug fixes made to this routine must be done in the
8124 * version found below.
8125 *
8126 * Called from within the procedure above to deal with unsatisfied
8127 * allocation dependencies in an inodeblock. The buffer must be
8128 * locked, thus, no I/O completion operations can occur while we
8129 * are manipulating its associated dependencies.
8130 */
8131static void
8132initiate_write_inodeblock_ufs1(inodedep, bp)
8133	struct inodedep *inodedep;
8134	struct buf *bp;			/* The inode block */
8135{
8136	struct allocdirect *adp, *lastadp;
8137	struct ufs1_dinode *dp;
8138	struct ufs1_dinode *sip;
8139	struct inoref *inoref;
8140	struct fs *fs;
8141	ufs_lbn_t i;
8142#ifdef INVARIANTS
8143	ufs_lbn_t prevlbn = 0;
8144#endif
8145	int deplist;
8146
8147	if (inodedep->id_state & IOSTARTED)
8148		panic("initiate_write_inodeblock_ufs1: already started");
8149	inodedep->id_state |= IOSTARTED;
8150	fs = inodedep->id_fs;
8151	dp = (struct ufs1_dinode *)bp->b_data +
8152	    ino_to_fsbo(fs, inodedep->id_ino);
8153
8154	/*
8155	 * If we're on the unlinked list but have not yet written our
8156	 * next pointer initialize it here.
8157	 */
8158	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8159		struct inodedep *inon;
8160
8161		inon = TAILQ_NEXT(inodedep, id_unlinked);
8162		dp->di_freelink = inon ? inon->id_ino : 0;
8163	}
8164	/*
8165	 * If the bitmap is not yet written, then the allocated
8166	 * inode cannot be written to disk.
8167	 */
8168	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8169		if (inodedep->id_savedino1 != NULL)
8170			panic("initiate_write_inodeblock_ufs1: I/O underway");
8171		FREE_LOCK(&lk);
8172		sip = malloc(sizeof(struct ufs1_dinode),
8173		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8174		ACQUIRE_LOCK(&lk);
8175		inodedep->id_savedino1 = sip;
8176		*inodedep->id_savedino1 = *dp;
8177		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
8178		dp->di_gen = inodedep->id_savedino1->di_gen;
8179		dp->di_freelink = inodedep->id_savedino1->di_freelink;
8180		return;
8181	}
8182	/*
8183	 * If no dependencies, then there is nothing to roll back.
8184	 */
8185	inodedep->id_savedsize = dp->di_size;
8186	inodedep->id_savedextsize = 0;
8187	inodedep->id_savednlink = dp->di_nlink;
8188	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8189	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8190		return;
8191	/*
8192	 * Revert the link count to that of the first unwritten journal entry.
8193	 */
8194	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8195	if (inoref)
8196		dp->di_nlink = inoref->if_nlink;
8197	/*
8198	 * Set the dependencies to busy.
8199	 */
8200	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8201	     adp = TAILQ_NEXT(adp, ad_next)) {
8202#ifdef INVARIANTS
8203		if (deplist != 0 && prevlbn >= adp->ad_offset)
8204			panic("softdep_write_inodeblock: lbn order");
8205		prevlbn = adp->ad_offset;
8206		if (adp->ad_offset < NDADDR &&
8207		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8208			panic("%s: direct pointer #%jd mismatch %d != %jd",
8209			    "softdep_write_inodeblock",
8210			    (intmax_t)adp->ad_offset,
8211			    dp->di_db[adp->ad_offset],
8212			    (intmax_t)adp->ad_newblkno);
8213		if (adp->ad_offset >= NDADDR &&
8214		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8215			panic("%s: indirect pointer #%jd mismatch %d != %jd",
8216			    "softdep_write_inodeblock",
8217			    (intmax_t)adp->ad_offset - NDADDR,
8218			    dp->di_ib[adp->ad_offset - NDADDR],
8219			    (intmax_t)adp->ad_newblkno);
8220		deplist |= 1 << adp->ad_offset;
8221		if ((adp->ad_state & ATTACHED) == 0)
8222			panic("softdep_write_inodeblock: Unknown state 0x%x",
8223			    adp->ad_state);
8224#endif /* INVARIANTS */
8225		adp->ad_state &= ~ATTACHED;
8226		adp->ad_state |= UNDONE;
8227	}
8228	/*
8229	 * The on-disk inode cannot claim to be any larger than the last
8230	 * fragment that has been written. Otherwise, the on-disk inode
8231	 * might have fragments that were not the last block in the file
8232	 * which would corrupt the filesystem.
8233	 */
8234	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8235	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8236		if (adp->ad_offset >= NDADDR)
8237			break;
8238		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8239		/* keep going until hitting a rollback to a frag */
8240		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8241			continue;
8242		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8243		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8244#ifdef INVARIANTS
8245			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8246				panic("softdep_write_inodeblock: lost dep1");
8247#endif /* INVARIANTS */
8248			dp->di_db[i] = 0;
8249		}
8250		for (i = 0; i < NIADDR; i++) {
8251#ifdef INVARIANTS
8252			if (dp->di_ib[i] != 0 &&
8253			    (deplist & ((1 << NDADDR) << i)) == 0)
8254				panic("softdep_write_inodeblock: lost dep2");
8255#endif /* INVARIANTS */
8256			dp->di_ib[i] = 0;
8257		}
8258		return;
8259	}
8260	/*
8261	 * If we have zero'ed out the last allocated block of the file,
8262	 * roll back the size to the last currently allocated block.
8263	 * We know that this last allocated block is a full-sized as
8264	 * we already checked for fragments in the loop above.
8265	 */
8266	if (lastadp != NULL &&
8267	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8268		for (i = lastadp->ad_offset; i >= 0; i--)
8269			if (dp->di_db[i] != 0)
8270				break;
8271		dp->di_size = (i + 1) * fs->fs_bsize;
8272	}
8273	/*
8274	 * The only dependencies are for indirect blocks.
8275	 *
8276	 * The file size for indirect block additions is not guaranteed.
8277	 * Such a guarantee would be non-trivial to achieve. The conventional
8278	 * synchronous write implementation also does not make this guarantee.
8279	 * Fsck should catch and fix discrepancies. Arguably, the file size
8280	 * can be over-estimated without destroying integrity when the file
8281	 * moves into the indirect blocks (i.e., is large). If we want to
8282	 * postpone fsck, we are stuck with this argument.
8283	 */
8284	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8285		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8286}
8287
8288/*
8289 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
8290 * Note that any bug fixes made to this routine must be done in the
8291 * version found above.
8292 *
8293 * Called from within the procedure above to deal with unsatisfied
8294 * allocation dependencies in an inodeblock. The buffer must be
8295 * locked, thus, no I/O completion operations can occur while we
8296 * are manipulating its associated dependencies.
8297 */
8298static void
8299initiate_write_inodeblock_ufs2(inodedep, bp)
8300	struct inodedep *inodedep;
8301	struct buf *bp;			/* The inode block */
8302{
8303	struct allocdirect *adp, *lastadp;
8304	struct ufs2_dinode *dp;
8305	struct ufs2_dinode *sip;
8306	struct inoref *inoref;
8307	struct fs *fs;
8308	ufs_lbn_t i;
8309#ifdef INVARIANTS
8310	ufs_lbn_t prevlbn = 0;
8311#endif
8312	int deplist;
8313
8314	if (inodedep->id_state & IOSTARTED)
8315		panic("initiate_write_inodeblock_ufs2: already started");
8316	inodedep->id_state |= IOSTARTED;
8317	fs = inodedep->id_fs;
8318	dp = (struct ufs2_dinode *)bp->b_data +
8319	    ino_to_fsbo(fs, inodedep->id_ino);
8320
8321	/*
8322	 * If we're on the unlinked list but have not yet written our
8323	 * next pointer initialize it here.
8324	 */
8325	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8326		struct inodedep *inon;
8327
8328		inon = TAILQ_NEXT(inodedep, id_unlinked);
8329		dp->di_freelink = inon ? inon->id_ino : 0;
8330	}
8331	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
8332	    (UNLINKED | UNLINKNEXT)) {
8333		struct inodedep *inon;
8334		ino_t freelink;
8335
8336		inon = TAILQ_NEXT(inodedep, id_unlinked);
8337		freelink = inon ? inon->id_ino : 0;
8338		if (freelink != dp->di_freelink)
8339			panic("ino %p(0x%X) %d, %d != %d",
8340			    inodedep, inodedep->id_state, inodedep->id_ino,
8341			    freelink, dp->di_freelink);
8342	}
8343	/*
8344	 * If the bitmap is not yet written, then the allocated
8345	 * inode cannot be written to disk.
8346	 */
8347	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8348		if (inodedep->id_savedino2 != NULL)
8349			panic("initiate_write_inodeblock_ufs2: I/O underway");
8350		FREE_LOCK(&lk);
8351		sip = malloc(sizeof(struct ufs2_dinode),
8352		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8353		ACQUIRE_LOCK(&lk);
8354		inodedep->id_savedino2 = sip;
8355		*inodedep->id_savedino2 = *dp;
8356		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
8357		dp->di_gen = inodedep->id_savedino2->di_gen;
8358		dp->di_freelink = inodedep->id_savedino2->di_freelink;
8359		return;
8360	}
8361	/*
8362	 * If no dependencies, then there is nothing to roll back.
8363	 */
8364	inodedep->id_savedsize = dp->di_size;
8365	inodedep->id_savedextsize = dp->di_extsize;
8366	inodedep->id_savednlink = dp->di_nlink;
8367	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8368	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
8369	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8370		return;
8371	/*
8372	 * Revert the link count to that of the first unwritten journal entry.
8373	 */
8374	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8375	if (inoref)
8376		dp->di_nlink = inoref->if_nlink;
8377
8378	/*
8379	 * Set the ext data dependencies to busy.
8380	 */
8381	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8382	     adp = TAILQ_NEXT(adp, ad_next)) {
8383#ifdef INVARIANTS
8384		if (deplist != 0 && prevlbn >= adp->ad_offset)
8385			panic("softdep_write_inodeblock: lbn order");
8386		prevlbn = adp->ad_offset;
8387		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
8388			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8389			    "softdep_write_inodeblock",
8390			    (intmax_t)adp->ad_offset,
8391			    (intmax_t)dp->di_extb[adp->ad_offset],
8392			    (intmax_t)adp->ad_newblkno);
8393		deplist |= 1 << adp->ad_offset;
8394		if ((adp->ad_state & ATTACHED) == 0)
8395			panic("softdep_write_inodeblock: Unknown state 0x%x",
8396			    adp->ad_state);
8397#endif /* INVARIANTS */
8398		adp->ad_state &= ~ATTACHED;
8399		adp->ad_state |= UNDONE;
8400	}
8401	/*
8402	 * The on-disk inode cannot claim to be any larger than the last
8403	 * fragment that has been written. Otherwise, the on-disk inode
8404	 * might have fragments that were not the last block in the ext
8405	 * data which would corrupt the filesystem.
8406	 */
8407	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8408	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8409		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
8410		/* keep going until hitting a rollback to a frag */
8411		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8412			continue;
8413		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8414		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
8415#ifdef INVARIANTS
8416			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
8417				panic("softdep_write_inodeblock: lost dep1");
8418#endif /* INVARIANTS */
8419			dp->di_extb[i] = 0;
8420		}
8421		lastadp = NULL;
8422		break;
8423	}
8424	/*
8425	 * If we have zero'ed out the last allocated block of the ext
8426	 * data, roll back the size to the last currently allocated block.
8427	 * We know that this last allocated block is a full-sized as
8428	 * we already checked for fragments in the loop above.
8429	 */
8430	if (lastadp != NULL &&
8431	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8432		for (i = lastadp->ad_offset; i >= 0; i--)
8433			if (dp->di_extb[i] != 0)
8434				break;
8435		dp->di_extsize = (i + 1) * fs->fs_bsize;
8436	}
8437	/*
8438	 * Set the file data dependencies to busy.
8439	 */
8440	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8441	     adp = TAILQ_NEXT(adp, ad_next)) {
8442#ifdef INVARIANTS
8443		if (deplist != 0 && prevlbn >= adp->ad_offset)
8444			panic("softdep_write_inodeblock: lbn order");
8445		prevlbn = adp->ad_offset;
8446		if (adp->ad_offset < NDADDR &&
8447		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8448			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8449			    "softdep_write_inodeblock",
8450			    (intmax_t)adp->ad_offset,
8451			    (intmax_t)dp->di_db[adp->ad_offset],
8452			    (intmax_t)adp->ad_newblkno);
8453		if (adp->ad_offset >= NDADDR &&
8454		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8455			panic("%s indirect pointer #%jd mismatch %jd != %jd",
8456			    "softdep_write_inodeblock:",
8457			    (intmax_t)adp->ad_offset - NDADDR,
8458			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
8459			    (intmax_t)adp->ad_newblkno);
8460		deplist |= 1 << adp->ad_offset;
8461		if ((adp->ad_state & ATTACHED) == 0)
8462			panic("softdep_write_inodeblock: Unknown state 0x%x",
8463			    adp->ad_state);
8464#endif /* INVARIANTS */
8465		adp->ad_state &= ~ATTACHED;
8466		adp->ad_state |= UNDONE;
8467	}
8468	/*
8469	 * The on-disk inode cannot claim to be any larger than the last
8470	 * fragment that has been written. Otherwise, the on-disk inode
8471	 * might have fragments that were not the last block in the file
8472	 * which would corrupt the filesystem.
8473	 */
8474	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8475	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8476		if (adp->ad_offset >= NDADDR)
8477			break;
8478		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8479		/* keep going until hitting a rollback to a frag */
8480		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8481			continue;
8482		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8483		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8484#ifdef INVARIANTS
8485			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8486				panic("softdep_write_inodeblock: lost dep2");
8487#endif /* INVARIANTS */
8488			dp->di_db[i] = 0;
8489		}
8490		for (i = 0; i < NIADDR; i++) {
8491#ifdef INVARIANTS
8492			if (dp->di_ib[i] != 0 &&
8493			    (deplist & ((1 << NDADDR) << i)) == 0)
8494				panic("softdep_write_inodeblock: lost dep3");
8495#endif /* INVARIANTS */
8496			dp->di_ib[i] = 0;
8497		}
8498		return;
8499	}
8500	/*
8501	 * If we have zero'ed out the last allocated block of the file,
8502	 * roll back the size to the last currently allocated block.
8503	 * We know that this last allocated block is a full-sized as
8504	 * we already checked for fragments in the loop above.
8505	 */
8506	if (lastadp != NULL &&
8507	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8508		for (i = lastadp->ad_offset; i >= 0; i--)
8509			if (dp->di_db[i] != 0)
8510				break;
8511		dp->di_size = (i + 1) * fs->fs_bsize;
8512	}
8513	/*
8514	 * The only dependencies are for indirect blocks.
8515	 *
8516	 * The file size for indirect block additions is not guaranteed.
8517	 * Such a guarantee would be non-trivial to achieve. The conventional
8518	 * synchronous write implementation also does not make this guarantee.
8519	 * Fsck should catch and fix discrepancies. Arguably, the file size
8520	 * can be over-estimated without destroying integrity when the file
8521	 * moves into the indirect blocks (i.e., is large). If we want to
8522	 * postpone fsck, we are stuck with this argument.
8523	 */
8524	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8525		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8526}
8527
8528/*
8529 * Cancel an indirdep as a result of truncation.  Release all of the
8530 * children allocindirs and place their journal work on the appropriate
8531 * list.
8532 */
8533static void
8534cancel_indirdep(indirdep, bp, inodedep, freeblks)
8535	struct indirdep *indirdep;
8536	struct buf *bp;
8537	struct inodedep *inodedep;
8538	struct freeblks *freeblks;
8539{
8540	struct allocindir *aip;
8541
8542	/*
8543	 * None of the indirect pointers will ever be visible,
8544	 * so they can simply be tossed. GOINGAWAY ensures
8545	 * that allocated pointers will be saved in the buffer
8546	 * cache until they are freed. Note that they will
8547	 * only be able to be found by their physical address
8548	 * since the inode mapping the logical address will
8549	 * be gone. The save buffer used for the safe copy
8550	 * was allocated in setup_allocindir_phase2 using
8551	 * the physical address so it could be used for this
8552	 * purpose. Hence we swap the safe copy with the real
8553	 * copy, allowing the safe copy to be freed and holding
8554	 * on to the real copy for later use in indir_trunc.
8555	 */
8556	if (indirdep->ir_state & GOINGAWAY)
8557		panic("cancel_indirdep: already gone");
8558	if (indirdep->ir_state & ONDEPLIST) {
8559		indirdep->ir_state &= ~ONDEPLIST;
8560		LIST_REMOVE(indirdep, ir_next);
8561	}
8562	indirdep->ir_state |= GOINGAWAY;
8563	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
8564	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
8565		cancel_allocindir(aip, inodedep, freeblks);
8566	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
8567		cancel_allocindir(aip, inodedep, freeblks);
8568	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
8569		cancel_allocindir(aip, inodedep, freeblks);
8570	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
8571		cancel_allocindir(aip, inodedep, freeblks);
8572	bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
8573	WORKLIST_REMOVE(&indirdep->ir_list);
8574	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
8575	indirdep->ir_savebp = NULL;
8576}
8577
8578/*
8579 * Free an indirdep once it no longer has new pointers to track.
8580 */
8581static void
8582free_indirdep(indirdep)
8583	struct indirdep *indirdep;
8584{
8585
8586	KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
8587	    ("free_indirdep: Journal work not empty."));
8588	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
8589	    ("free_indirdep: Complete head not empty."));
8590	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
8591	    ("free_indirdep: write head not empty."));
8592	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
8593	    ("free_indirdep: done head not empty."));
8594	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
8595	    ("free_indirdep: deplist head not empty."));
8596	KASSERT(indirdep->ir_savebp == NULL,
8597	    ("free_indirdep: %p ir_savebp != NULL", indirdep));
8598	KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
8599	    ("free_indirdep: %p still on deplist.", indirdep));
8600	if (indirdep->ir_state & ONWORKLIST)
8601		WORKLIST_REMOVE(&indirdep->ir_list);
8602	WORKITEM_FREE(indirdep, D_INDIRDEP);
8603}
8604
8605/*
8606 * Called before a write to an indirdep.  This routine is responsible for
8607 * rolling back pointers to a safe state which includes only those
8608 * allocindirs which have been completed.
8609 */
8610static void
8611initiate_write_indirdep(indirdep, bp)
8612	struct indirdep *indirdep;
8613	struct buf *bp;
8614{
8615
8616	if (indirdep->ir_state & GOINGAWAY)
8617		panic("disk_io_initiation: indirdep gone");
8618
8619	/*
8620	 * If there are no remaining dependencies, this will be writing
8621	 * the real pointers.
8622	 */
8623	if (LIST_EMPTY(&indirdep->ir_deplisthd))
8624		return;
8625	/*
8626	 * Replace up-to-date version with safe version.
8627	 */
8628	FREE_LOCK(&lk);
8629	indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
8630	    M_SOFTDEP_FLAGS);
8631	ACQUIRE_LOCK(&lk);
8632	indirdep->ir_state &= ~ATTACHED;
8633	indirdep->ir_state |= UNDONE;
8634	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
8635	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
8636	    bp->b_bcount);
8637}
8638
8639/*
8640 * Called when an inode has been cleared in a cg bitmap.  This finally
8641 * eliminates any canceled jaddrefs
8642 */
8643void
8644softdep_setup_inofree(mp, bp, ino, wkhd)
8645	struct mount *mp;
8646	struct buf *bp;
8647	ino_t ino;
8648	struct workhead *wkhd;
8649{
8650	struct worklist *wk, *wkn;
8651	struct inodedep *inodedep;
8652	uint8_t *inosused;
8653	struct cg *cgp;
8654	struct fs *fs;
8655
8656	ACQUIRE_LOCK(&lk);
8657	fs = VFSTOUFS(mp)->um_fs;
8658	cgp = (struct cg *)bp->b_data;
8659	inosused = cg_inosused(cgp);
8660	if (isset(inosused, ino % fs->fs_ipg))
8661		panic("softdep_setup_inofree: inode %d not freed.", ino);
8662	if (inodedep_lookup(mp, ino, 0, &inodedep))
8663		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
8664		    ino, inodedep);
8665	if (wkhd) {
8666		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
8667			if (wk->wk_type != D_JADDREF)
8668				continue;
8669			WORKLIST_REMOVE(wk);
8670			/*
8671			 * We can free immediately even if the jaddref
8672			 * isn't attached in a background write as now
8673			 * the bitmaps are reconciled.
8674		 	 */
8675			wk->wk_state |= COMPLETE | ATTACHED;
8676			free_jaddref(WK_JADDREF(wk));
8677		}
8678		jwork_move(&bp->b_dep, wkhd);
8679	}
8680	FREE_LOCK(&lk);
8681}
8682
8683
8684/*
8685 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
8686 * map.  Any dependencies waiting for the write to clear are added to the
8687 * buf's list and any jnewblks that are being canceled are discarded
8688 * immediately.
8689 */
8690void
8691softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
8692	struct mount *mp;
8693	struct buf *bp;
8694	ufs2_daddr_t blkno;
8695	int frags;
8696	struct workhead *wkhd;
8697{
8698	struct jnewblk *jnewblk;
8699	struct worklist *wk, *wkn;
8700#ifdef SUJ_DEBUG
8701	struct bmsafemap *bmsafemap;
8702	struct fs *fs;
8703	uint8_t *blksfree;
8704	struct cg *cgp;
8705	ufs2_daddr_t jstart;
8706	ufs2_daddr_t jend;
8707	ufs2_daddr_t end;
8708	long bno;
8709	int i;
8710#endif
8711
8712	ACQUIRE_LOCK(&lk);
8713	/*
8714	 * Detach any jnewblks which have been canceled.  They must linger
8715	 * until the bitmap is cleared again by ffs_blkfree() to prevent
8716	 * an unjournaled allocation from hitting the disk.
8717	 */
8718	if (wkhd) {
8719		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
8720			if (wk->wk_type != D_JNEWBLK)
8721				continue;
8722			jnewblk = WK_JNEWBLK(wk);
8723			KASSERT(jnewblk->jn_state & GOINGAWAY,
8724			    ("softdep_setup_blkfree: jnewblk not canceled."));
8725			WORKLIST_REMOVE(wk);
8726#ifdef SUJ_DEBUG
8727			/*
8728			 * Assert that this block is free in the bitmap
8729			 * before we discard the jnewblk.
8730			 */
8731			fs = VFSTOUFS(mp)->um_fs;
8732			cgp = (struct cg *)bp->b_data;
8733			blksfree = cg_blksfree(cgp);
8734			bno = dtogd(fs, jnewblk->jn_blkno);
8735			for (i = jnewblk->jn_oldfrags;
8736			    i < jnewblk->jn_frags; i++) {
8737				if (isset(blksfree, bno + i))
8738					continue;
8739				panic("softdep_setup_blkfree: not free");
8740			}
8741#endif
8742			/*
8743			 * Even if it's not attached we can free immediately
8744			 * as the new bitmap is correct.
8745			 */
8746			wk->wk_state |= COMPLETE | ATTACHED;
8747			free_jnewblk(jnewblk);
8748		}
8749		/*
8750		 * The buf must be locked by the caller otherwise these could
8751		 * be added while it's being written and the write would
8752		 * complete them before they made it to disk.
8753		 */
8754		jwork_move(&bp->b_dep, wkhd);
8755	}
8756
8757#ifdef SUJ_DEBUG
8758	/*
8759	 * Assert that we are not freeing a block which has an outstanding
8760	 * allocation dependency.
8761	 */
8762	fs = VFSTOUFS(mp)->um_fs;
8763	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
8764	end = blkno + frags;
8765	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
8766		/*
8767		 * Don't match against blocks that will be freed when the
8768		 * background write is done.
8769		 */
8770		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
8771		    (COMPLETE | DEPCOMPLETE))
8772			continue;
8773		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
8774		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
8775		if ((blkno >= jstart && blkno < jend) ||
8776		    (end > jstart && end <= jend)) {
8777			printf("state 0x%X %jd - %d %d dep %p\n",
8778			    jnewblk->jn_state, jnewblk->jn_blkno,
8779			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
8780			    jnewblk->jn_newblk);
8781			panic("softdep_setup_blkfree: "
8782			    "%jd-%jd(%d) overlaps with %jd-%jd",
8783			    blkno, end, frags, jstart, jend);
8784		}
8785	}
8786#endif
8787	FREE_LOCK(&lk);
8788}
8789
8790static void
8791initiate_write_bmsafemap(bmsafemap, bp)
8792	struct bmsafemap *bmsafemap;
8793	struct buf *bp;			/* The cg block. */
8794{
8795	struct jaddref *jaddref;
8796	struct jnewblk *jnewblk;
8797	uint8_t *inosused;
8798	uint8_t *blksfree;
8799	struct cg *cgp;
8800	struct fs *fs;
8801	int cleared;
8802	ino_t ino;
8803	long bno;
8804	int i;
8805
8806	if (bmsafemap->sm_state & IOSTARTED)
8807		panic("initiate_write_bmsafemap: Already started\n");
8808	bmsafemap->sm_state |= IOSTARTED;
8809	/*
8810	 * Clear any inode allocations which are pending journal writes.
8811	 */
8812	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
8813		cgp = (struct cg *)bp->b_data;
8814		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
8815		inosused = cg_inosused(cgp);
8816		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
8817			ino = jaddref->ja_ino % fs->fs_ipg;
8818			/*
8819			 * If this is a background copy the inode may not
8820			 * be marked used yet.
8821			 */
8822			if (isset(inosused, ino)) {
8823				if ((jaddref->ja_mode & IFMT) == IFDIR)
8824					cgp->cg_cs.cs_ndir--;
8825				cgp->cg_cs.cs_nifree++;
8826				clrbit(inosused, ino);
8827				jaddref->ja_state &= ~ATTACHED;
8828				jaddref->ja_state |= UNDONE;
8829				stat_jaddref++;
8830			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
8831				panic("initiate_write_bmsafemap: inode %d "
8832				    "marked free", jaddref->ja_ino);
8833		}
8834	}
8835	/*
8836	 * Clear any block allocations which are pending journal writes.
8837	 */
8838	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
8839		cgp = (struct cg *)bp->b_data;
8840		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
8841		blksfree = cg_blksfree(cgp);
8842		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
8843			bno = dtogd(fs, jnewblk->jn_blkno);
8844			cleared = 0;
8845			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
8846			    i++) {
8847				if (isclr(blksfree, bno + i)) {
8848					cleared = 1;
8849					setbit(blksfree, bno + i);
8850				}
8851			}
8852			/*
8853			 * We may not clear the block if it's a background
8854			 * copy.  In that case there is no reason to detach
8855			 * it.
8856			 */
8857			if (cleared) {
8858				stat_jnewblk++;
8859				jnewblk->jn_state &= ~ATTACHED;
8860				jnewblk->jn_state |= UNDONE;
8861			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
8862				panic("initiate_write_bmsafemap: block %jd "
8863				    "marked free", jnewblk->jn_blkno);
8864		}
8865	}
8866	/*
8867	 * Move allocation lists to the written lists so they can be
8868	 * cleared once the block write is complete.
8869	 */
8870	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
8871	    inodedep, id_deps);
8872	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
8873	    newblk, nb_deps);
8874}
8875
8876/*
8877 * This routine is called during the completion interrupt
8878 * service routine for a disk write (from the procedure called
8879 * by the device driver to inform the filesystem caches of
8880 * a request completion).  It should be called early in this
8881 * procedure, before the block is made available to other
8882 * processes or other routines are called.
8883 *
8884 */
8885static void
8886softdep_disk_write_complete(bp)
8887	struct buf *bp;		/* describes the completed disk write */
8888{
8889	struct worklist *wk;
8890	struct worklist *owk;
8891	struct workhead reattach;
8892	struct buf *sbp;
8893
8894	/*
8895	 * If an error occurred while doing the write, then the data
8896	 * has not hit the disk and the dependencies cannot be unrolled.
8897	 */
8898	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
8899		return;
8900	LIST_INIT(&reattach);
8901	/*
8902	 * This lock must not be released anywhere in this code segment.
8903	 */
8904	sbp = NULL;
8905	owk = NULL;
8906	ACQUIRE_LOCK(&lk);
8907	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
8908		WORKLIST_REMOVE(wk);
8909		if (wk == owk)
8910			panic("duplicate worklist: %p\n", wk);
8911		owk = wk;
8912		switch (wk->wk_type) {
8913
8914		case D_PAGEDEP:
8915			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
8916				WORKLIST_INSERT(&reattach, wk);
8917			continue;
8918
8919		case D_INODEDEP:
8920			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
8921				WORKLIST_INSERT(&reattach, wk);
8922			continue;
8923
8924		case D_BMSAFEMAP:
8925			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
8926				WORKLIST_INSERT(&reattach, wk);
8927			continue;
8928
8929		case D_MKDIR:
8930			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
8931			continue;
8932
8933		case D_ALLOCDIRECT:
8934			wk->wk_state |= COMPLETE;
8935			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
8936			continue;
8937
8938		case D_ALLOCINDIR:
8939			wk->wk_state |= COMPLETE;
8940			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
8941			continue;
8942
8943		case D_INDIRDEP:
8944			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
8945				WORKLIST_INSERT(&reattach, wk);
8946			continue;
8947
8948		case D_FREEBLKS:
8949			wk->wk_state |= COMPLETE;
8950			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
8951				add_to_worklist(wk, 1);
8952			continue;
8953
8954		case D_FREEWORK:
8955			handle_written_freework(WK_FREEWORK(wk));
8956			break;
8957
8958		case D_FREEDEP:
8959			free_freedep(WK_FREEDEP(wk));
8960			continue;
8961
8962		case D_JSEGDEP:
8963			free_jsegdep(WK_JSEGDEP(wk));
8964			continue;
8965
8966		case D_JSEG:
8967			handle_written_jseg(WK_JSEG(wk), bp);
8968			continue;
8969
8970		case D_SBDEP:
8971			if (handle_written_sbdep(WK_SBDEP(wk), bp))
8972				WORKLIST_INSERT(&reattach, wk);
8973			continue;
8974
8975		default:
8976			panic("handle_disk_write_complete: Unknown type %s",
8977			    TYPENAME(wk->wk_type));
8978			/* NOTREACHED */
8979		}
8980	}
8981	/*
8982	 * Reattach any requests that must be redone.
8983	 */
8984	while ((wk = LIST_FIRST(&reattach)) != NULL) {
8985		WORKLIST_REMOVE(wk);
8986		WORKLIST_INSERT(&bp->b_dep, wk);
8987	}
8988	FREE_LOCK(&lk);
8989	if (sbp)
8990		brelse(sbp);
8991}
8992
8993/*
8994 * Called from within softdep_disk_write_complete above. Note that
8995 * this routine is always called from interrupt level with further
8996 * splbio interrupts blocked.
8997 */
8998static void
8999handle_allocdirect_partdone(adp, wkhd)
9000	struct allocdirect *adp;	/* the completed allocdirect */
9001	struct workhead *wkhd;		/* Work to do when inode is writtne. */
9002{
9003	struct allocdirectlst *listhead;
9004	struct allocdirect *listadp;
9005	struct inodedep *inodedep;
9006	long bsize;
9007
9008	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9009		return;
9010	/*
9011	 * The on-disk inode cannot claim to be any larger than the last
9012	 * fragment that has been written. Otherwise, the on-disk inode
9013	 * might have fragments that were not the last block in the file
9014	 * which would corrupt the filesystem. Thus, we cannot free any
9015	 * allocdirects after one whose ad_oldblkno claims a fragment as
9016	 * these blocks must be rolled back to zero before writing the inode.
9017	 * We check the currently active set of allocdirects in id_inoupdt
9018	 * or id_extupdt as appropriate.
9019	 */
9020	inodedep = adp->ad_inodedep;
9021	bsize = inodedep->id_fs->fs_bsize;
9022	if (adp->ad_state & EXTDATA)
9023		listhead = &inodedep->id_extupdt;
9024	else
9025		listhead = &inodedep->id_inoupdt;
9026	TAILQ_FOREACH(listadp, listhead, ad_next) {
9027		/* found our block */
9028		if (listadp == adp)
9029			break;
9030		/* continue if ad_oldlbn is not a fragment */
9031		if (listadp->ad_oldsize == 0 ||
9032		    listadp->ad_oldsize == bsize)
9033			continue;
9034		/* hit a fragment */
9035		return;
9036	}
9037	/*
9038	 * If we have reached the end of the current list without
9039	 * finding the just finished dependency, then it must be
9040	 * on the future dependency list. Future dependencies cannot
9041	 * be freed until they are moved to the current list.
9042	 */
9043	if (listadp == NULL) {
9044#ifdef DEBUG
9045		if (adp->ad_state & EXTDATA)
9046			listhead = &inodedep->id_newextupdt;
9047		else
9048			listhead = &inodedep->id_newinoupdt;
9049		TAILQ_FOREACH(listadp, listhead, ad_next)
9050			/* found our block */
9051			if (listadp == adp)
9052				break;
9053		if (listadp == NULL)
9054			panic("handle_allocdirect_partdone: lost dep");
9055#endif /* DEBUG */
9056		return;
9057	}
9058	/*
9059	 * If we have found the just finished dependency, then queue
9060	 * it along with anything that follows it that is complete.
9061	 * Since the pointer has not yet been written in the inode
9062	 * as the dependency prevents it, place the allocdirect on the
9063	 * bufwait list where it will be freed once the pointer is
9064	 * valid.
9065	 */
9066	if (wkhd == NULL)
9067		wkhd = &inodedep->id_bufwait;
9068	for (; adp; adp = listadp) {
9069		listadp = TAILQ_NEXT(adp, ad_next);
9070		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
9071			return;
9072		TAILQ_REMOVE(listhead, adp, ad_next);
9073		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
9074	}
9075}
9076
9077/*
9078 * Called from within softdep_disk_write_complete above.  This routine
9079 * completes successfully written allocindirs.
9080 */
9081static void
9082handle_allocindir_partdone(aip)
9083	struct allocindir *aip;		/* the completed allocindir */
9084{
9085	struct indirdep *indirdep;
9086
9087	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
9088		return;
9089	indirdep = aip->ai_indirdep;
9090	LIST_REMOVE(aip, ai_next);
9091	if (indirdep->ir_state & UNDONE) {
9092		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
9093		return;
9094	}
9095	if (indirdep->ir_state & UFS1FMT)
9096		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9097		    aip->ai_newblkno;
9098	else
9099		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
9100		    aip->ai_newblkno;
9101	/*
9102	 * Await the pointer write before freeing the allocindir.
9103	 */
9104	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
9105}
9106
9107/*
9108 * Release segments held on a jwork list.
9109 */
9110static void
9111handle_jwork(wkhd)
9112	struct workhead *wkhd;
9113{
9114	struct worklist *wk;
9115
9116	while ((wk = LIST_FIRST(wkhd)) != NULL) {
9117		WORKLIST_REMOVE(wk);
9118		switch (wk->wk_type) {
9119		case D_JSEGDEP:
9120			free_jsegdep(WK_JSEGDEP(wk));
9121			continue;
9122		default:
9123			panic("handle_jwork: Unknown type %s\n",
9124			    TYPENAME(wk->wk_type));
9125		}
9126	}
9127}
9128
9129/*
9130 * Handle the bufwait list on an inode when it is safe to release items
9131 * held there.  This normally happens after an inode block is written but
9132 * may be delayed and handled later if there are pending journal items that
9133 * are not yet safe to be released.
9134 */
9135static struct freefile *
9136handle_bufwait(inodedep, refhd)
9137	struct inodedep *inodedep;
9138	struct workhead *refhd;
9139{
9140	struct jaddref *jaddref;
9141	struct freefile *freefile;
9142	struct worklist *wk;
9143
9144	freefile = NULL;
9145	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
9146		WORKLIST_REMOVE(wk);
9147		switch (wk->wk_type) {
9148		case D_FREEFILE:
9149			/*
9150			 * We defer adding freefile to the worklist
9151			 * until all other additions have been made to
9152			 * ensure that it will be done after all the
9153			 * old blocks have been freed.
9154			 */
9155			if (freefile != NULL)
9156				panic("handle_bufwait: freefile");
9157			freefile = WK_FREEFILE(wk);
9158			continue;
9159
9160		case D_MKDIR:
9161			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
9162			continue;
9163
9164		case D_DIRADD:
9165			diradd_inode_written(WK_DIRADD(wk), inodedep);
9166			continue;
9167
9168		case D_FREEFRAG:
9169			wk->wk_state |= COMPLETE;
9170			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
9171				add_to_worklist(wk, 0);
9172			continue;
9173
9174		case D_DIRREM:
9175			wk->wk_state |= COMPLETE;
9176			add_to_worklist(wk, 0);
9177			continue;
9178
9179		case D_ALLOCDIRECT:
9180		case D_ALLOCINDIR:
9181			free_newblk(WK_NEWBLK(wk));
9182			continue;
9183
9184		case D_JNEWBLK:
9185			wk->wk_state |= COMPLETE;
9186			free_jnewblk(WK_JNEWBLK(wk));
9187			continue;
9188
9189		/*
9190		 * Save freed journal segments and add references on
9191		 * the supplied list which will delay their release
9192		 * until the cg bitmap is cleared on disk.
9193		 */
9194		case D_JSEGDEP:
9195			if (refhd == NULL)
9196				free_jsegdep(WK_JSEGDEP(wk));
9197			else
9198				WORKLIST_INSERT(refhd, wk);
9199			continue;
9200
9201		case D_JADDREF:
9202			jaddref = WK_JADDREF(wk);
9203			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
9204			    if_deps);
9205			/*
9206			 * Transfer any jaddrefs to the list to be freed with
9207			 * the bitmap if we're handling a removed file.
9208			 */
9209			if (refhd == NULL) {
9210				wk->wk_state |= COMPLETE;
9211				free_jaddref(jaddref);
9212			} else
9213				WORKLIST_INSERT(refhd, wk);
9214			continue;
9215
9216		default:
9217			panic("handle_bufwait: Unknown type %p(%s)",
9218			    wk, TYPENAME(wk->wk_type));
9219			/* NOTREACHED */
9220		}
9221	}
9222	return (freefile);
9223}
9224/*
9225 * Called from within softdep_disk_write_complete above to restore
9226 * in-memory inode block contents to their most up-to-date state. Note
9227 * that this routine is always called from interrupt level with further
9228 * splbio interrupts blocked.
9229 */
9230static int
9231handle_written_inodeblock(inodedep, bp)
9232	struct inodedep *inodedep;
9233	struct buf *bp;		/* buffer containing the inode block */
9234{
9235	struct freefile *freefile;
9236	struct allocdirect *adp, *nextadp;
9237	struct ufs1_dinode *dp1 = NULL;
9238	struct ufs2_dinode *dp2 = NULL;
9239	struct workhead wkhd;
9240	int hadchanges, fstype;
9241	ino_t freelink;
9242
9243	LIST_INIT(&wkhd);
9244	hadchanges = 0;
9245	freefile = NULL;
9246	if ((inodedep->id_state & IOSTARTED) == 0)
9247		panic("handle_written_inodeblock: not started");
9248	inodedep->id_state &= ~IOSTARTED;
9249	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
9250		fstype = UFS1;
9251		dp1 = (struct ufs1_dinode *)bp->b_data +
9252		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9253		freelink = dp1->di_freelink;
9254	} else {
9255		fstype = UFS2;
9256		dp2 = (struct ufs2_dinode *)bp->b_data +
9257		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9258		freelink = dp2->di_freelink;
9259	}
9260	/*
9261	 * If we wrote a valid freelink pointer during the last write
9262	 * record it here.
9263	 */
9264	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9265		struct inodedep *inon;
9266
9267		inon = TAILQ_NEXT(inodedep, id_unlinked);
9268		if ((inon == NULL && freelink == 0) ||
9269		    (inon && inon->id_ino == freelink)) {
9270			if (inon)
9271				inon->id_state |= UNLINKPREV;
9272			inodedep->id_state |= UNLINKNEXT;
9273		} else
9274			hadchanges = 1;
9275	}
9276	/* Leave this inodeblock dirty until it's in the list. */
9277	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED)
9278		hadchanges = 1;
9279	/*
9280	 * If we had to rollback the inode allocation because of
9281	 * bitmaps being incomplete, then simply restore it.
9282	 * Keep the block dirty so that it will not be reclaimed until
9283	 * all associated dependencies have been cleared and the
9284	 * corresponding updates written to disk.
9285	 */
9286	if (inodedep->id_savedino1 != NULL) {
9287		hadchanges = 1;
9288		if (fstype == UFS1)
9289			*dp1 = *inodedep->id_savedino1;
9290		else
9291			*dp2 = *inodedep->id_savedino2;
9292		free(inodedep->id_savedino1, M_SAVEDINO);
9293		inodedep->id_savedino1 = NULL;
9294		if ((bp->b_flags & B_DELWRI) == 0)
9295			stat_inode_bitmap++;
9296		bdirty(bp);
9297		/*
9298		 * If the inode is clear here and GOINGAWAY it will never
9299		 * be written.  Process the bufwait and clear any pending
9300		 * work which may include the freefile.
9301		 */
9302		if (inodedep->id_state & GOINGAWAY)
9303			goto bufwait;
9304		return (1);
9305	}
9306	inodedep->id_state |= COMPLETE;
9307	/*
9308	 * Roll forward anything that had to be rolled back before
9309	 * the inode could be updated.
9310	 */
9311	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
9312		nextadp = TAILQ_NEXT(adp, ad_next);
9313		if (adp->ad_state & ATTACHED)
9314			panic("handle_written_inodeblock: new entry");
9315		if (fstype == UFS1) {
9316			if (adp->ad_offset < NDADDR) {
9317				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9318					panic("%s %s #%jd mismatch %d != %jd",
9319					    "handle_written_inodeblock:",
9320					    "direct pointer",
9321					    (intmax_t)adp->ad_offset,
9322					    dp1->di_db[adp->ad_offset],
9323					    (intmax_t)adp->ad_oldblkno);
9324				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
9325			} else {
9326				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
9327					panic("%s: %s #%jd allocated as %d",
9328					    "handle_written_inodeblock",
9329					    "indirect pointer",
9330					    (intmax_t)adp->ad_offset - NDADDR,
9331					    dp1->di_ib[adp->ad_offset - NDADDR]);
9332				dp1->di_ib[adp->ad_offset - NDADDR] =
9333				    adp->ad_newblkno;
9334			}
9335		} else {
9336			if (adp->ad_offset < NDADDR) {
9337				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9338					panic("%s: %s #%jd %s %jd != %jd",
9339					    "handle_written_inodeblock",
9340					    "direct pointer",
9341					    (intmax_t)adp->ad_offset, "mismatch",
9342					    (intmax_t)dp2->di_db[adp->ad_offset],
9343					    (intmax_t)adp->ad_oldblkno);
9344				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
9345			} else {
9346				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
9347					panic("%s: %s #%jd allocated as %jd",
9348					    "handle_written_inodeblock",
9349					    "indirect pointer",
9350					    (intmax_t)adp->ad_offset - NDADDR,
9351					    (intmax_t)
9352					    dp2->di_ib[adp->ad_offset - NDADDR]);
9353				dp2->di_ib[adp->ad_offset - NDADDR] =
9354				    adp->ad_newblkno;
9355			}
9356		}
9357		adp->ad_state &= ~UNDONE;
9358		adp->ad_state |= ATTACHED;
9359		hadchanges = 1;
9360	}
9361	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
9362		nextadp = TAILQ_NEXT(adp, ad_next);
9363		if (adp->ad_state & ATTACHED)
9364			panic("handle_written_inodeblock: new entry");
9365		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
9366			panic("%s: direct pointers #%jd %s %jd != %jd",
9367			    "handle_written_inodeblock",
9368			    (intmax_t)adp->ad_offset, "mismatch",
9369			    (intmax_t)dp2->di_extb[adp->ad_offset],
9370			    (intmax_t)adp->ad_oldblkno);
9371		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
9372		adp->ad_state &= ~UNDONE;
9373		adp->ad_state |= ATTACHED;
9374		hadchanges = 1;
9375	}
9376	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
9377		stat_direct_blk_ptrs++;
9378	/*
9379	 * Reset the file size to its most up-to-date value.
9380	 */
9381	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
9382		panic("handle_written_inodeblock: bad size");
9383	if (inodedep->id_savednlink > LINK_MAX)
9384		panic("handle_written_inodeblock: Invalid link count "
9385		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
9386	if (fstype == UFS1) {
9387		if (dp1->di_nlink != inodedep->id_savednlink) {
9388			dp1->di_nlink = inodedep->id_savednlink;
9389			hadchanges = 1;
9390		}
9391		if (dp1->di_size != inodedep->id_savedsize) {
9392			dp1->di_size = inodedep->id_savedsize;
9393			hadchanges = 1;
9394		}
9395	} else {
9396		if (dp2->di_nlink != inodedep->id_savednlink) {
9397			dp2->di_nlink = inodedep->id_savednlink;
9398			hadchanges = 1;
9399		}
9400		if (dp2->di_size != inodedep->id_savedsize) {
9401			dp2->di_size = inodedep->id_savedsize;
9402			hadchanges = 1;
9403		}
9404		if (dp2->di_extsize != inodedep->id_savedextsize) {
9405			dp2->di_extsize = inodedep->id_savedextsize;
9406			hadchanges = 1;
9407		}
9408	}
9409	inodedep->id_savedsize = -1;
9410	inodedep->id_savedextsize = -1;
9411	inodedep->id_savednlink = -1;
9412	/*
9413	 * If there were any rollbacks in the inode block, then it must be
9414	 * marked dirty so that its will eventually get written back in
9415	 * its correct form.
9416	 */
9417	if (hadchanges)
9418		bdirty(bp);
9419bufwait:
9420	/*
9421	 * Process any allocdirects that completed during the update.
9422	 */
9423	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
9424		handle_allocdirect_partdone(adp, &wkhd);
9425	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
9426		handle_allocdirect_partdone(adp, &wkhd);
9427	/*
9428	 * Process deallocations that were held pending until the
9429	 * inode had been written to disk. Freeing of the inode
9430	 * is delayed until after all blocks have been freed to
9431	 * avoid creation of new <vfsid, inum, lbn> triples
9432	 * before the old ones have been deleted.  Completely
9433	 * unlinked inodes are not processed until the unlinked
9434	 * inode list is written or the last reference is removed.
9435	 */
9436	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
9437		freefile = handle_bufwait(inodedep, NULL);
9438		if (freefile && !LIST_EMPTY(&wkhd)) {
9439			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
9440			freefile = NULL;
9441		}
9442	}
9443	/*
9444	 * Move rolled forward dependency completions to the bufwait list
9445	 * now that those that were already written have been processed.
9446	 */
9447	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
9448		panic("handle_written_inodeblock: bufwait but no changes");
9449	jwork_move(&inodedep->id_bufwait, &wkhd);
9450
9451	if (freefile != NULL) {
9452		/*
9453		 * If the inode is goingaway it was never written.  Fake up
9454		 * the state here so free_inodedep() can succeed.
9455		 */
9456		if (inodedep->id_state & GOINGAWAY)
9457			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
9458		if (free_inodedep(inodedep) == 0)
9459			panic("handle_written_inodeblock: live inodedep %p",
9460			    inodedep);
9461		add_to_worklist(&freefile->fx_list, 0);
9462		return (0);
9463	}
9464
9465	/*
9466	 * If no outstanding dependencies, free it.
9467	 */
9468	if (free_inodedep(inodedep) ||
9469	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
9470	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
9471	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
9472	     LIST_FIRST(&inodedep->id_bufwait) == 0))
9473		return (0);
9474	return (hadchanges);
9475}
9476
9477static int
9478handle_written_indirdep(indirdep, bp, bpp)
9479	struct indirdep *indirdep;
9480	struct buf *bp;
9481	struct buf **bpp;
9482{
9483	struct allocindir *aip;
9484	int chgs;
9485
9486	if (indirdep->ir_state & GOINGAWAY)
9487		panic("disk_write_complete: indirdep gone");
9488	chgs = 0;
9489	/*
9490	 * If there were rollbacks revert them here.
9491	 */
9492	if (indirdep->ir_saveddata) {
9493		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
9494		free(indirdep->ir_saveddata, M_INDIRDEP);
9495		indirdep->ir_saveddata = 0;
9496		chgs = 1;
9497	}
9498	indirdep->ir_state &= ~UNDONE;
9499	indirdep->ir_state |= ATTACHED;
9500	/*
9501	 * Move allocindirs with written pointers to the completehd if
9502	 * the indirdep's pointer is not yet written.  Otherwise
9503	 * free them here.
9504	 */
9505	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
9506		LIST_REMOVE(aip, ai_next);
9507		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
9508			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
9509			    ai_next);
9510			continue;
9511		}
9512		free_newblk(&aip->ai_block);
9513	}
9514	/*
9515	 * Move allocindirs that have finished dependency processing from
9516	 * the done list to the write list after updating the pointers.
9517	 */
9518	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
9519		handle_allocindir_partdone(aip);
9520		if (aip == LIST_FIRST(&indirdep->ir_donehd))
9521			panic("disk_write_complete: not gone");
9522		chgs = 1;
9523	}
9524	/*
9525	 * If this indirdep has been detached from its newblk during
9526	 * I/O we need to keep this dep attached to the buffer so
9527	 * deallocate_dependencies can find it and properly resolve
9528	 * any outstanding dependencies.
9529	 */
9530	if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
9531		chgs = 1;
9532	if ((bp->b_flags & B_DELWRI) == 0)
9533		stat_indir_blk_ptrs++;
9534	/*
9535	 * If there were no changes we can discard the savedbp and detach
9536	 * ourselves from the buf.  We are only carrying completed pointers
9537	 * in this case.
9538	 */
9539	if (chgs == 0) {
9540		struct buf *sbp;
9541
9542		sbp = indirdep->ir_savebp;
9543		sbp->b_flags |= B_INVAL | B_NOCACHE;
9544		indirdep->ir_savebp = NULL;
9545		if (*bpp != NULL)
9546			panic("handle_written_indirdep: bp already exists.");
9547		*bpp = sbp;
9548	} else
9549		bdirty(bp);
9550	/*
9551	 * If there are no fresh dependencies and none waiting on writes
9552	 * we can free the indirdep.
9553	 */
9554	if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
9555		if (indirdep->ir_state & ONDEPLIST)
9556			LIST_REMOVE(indirdep, ir_next);
9557		free_indirdep(indirdep);
9558		return (0);
9559	}
9560
9561	return (chgs);
9562}
9563
9564/*
9565 * Process a diradd entry after its dependent inode has been written.
9566 * This routine must be called with splbio interrupts blocked.
9567 */
9568static void
9569diradd_inode_written(dap, inodedep)
9570	struct diradd *dap;
9571	struct inodedep *inodedep;
9572{
9573
9574	dap->da_state |= COMPLETE;
9575	complete_diradd(dap);
9576	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9577}
9578
9579/*
9580 * Returns true if the bmsafemap will have rollbacks when written.  Must
9581 * only be called with lk and the buf lock on the cg held.
9582 */
9583static int
9584bmsafemap_rollbacks(bmsafemap)
9585	struct bmsafemap *bmsafemap;
9586{
9587
9588	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
9589	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
9590}
9591
9592/*
9593 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
9594 * changes if it's not a background write.  Set all written dependencies
9595 * to DEPCOMPLETE and free the structure if possible.
9596 */
9597static int
9598handle_written_bmsafemap(bmsafemap, bp)
9599	struct bmsafemap *bmsafemap;
9600	struct buf *bp;
9601{
9602	struct newblk *newblk;
9603	struct inodedep *inodedep;
9604	struct jaddref *jaddref, *jatmp;
9605	struct jnewblk *jnewblk, *jntmp;
9606	uint8_t *inosused;
9607	uint8_t *blksfree;
9608	struct cg *cgp;
9609	struct fs *fs;
9610	ino_t ino;
9611	long bno;
9612	int chgs;
9613	int i;
9614
9615	if ((bmsafemap->sm_state & IOSTARTED) == 0)
9616		panic("initiate_write_bmsafemap: Not started\n");
9617	chgs = 0;
9618	bmsafemap->sm_state &= ~IOSTARTED;
9619	/*
9620	 * Restore unwritten inode allocation pending jaddref writes.
9621	 */
9622	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
9623		cgp = (struct cg *)bp->b_data;
9624		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9625		inosused = cg_inosused(cgp);
9626		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
9627		    ja_bmdeps, jatmp) {
9628			if ((jaddref->ja_state & UNDONE) == 0)
9629				continue;
9630			ino = jaddref->ja_ino % fs->fs_ipg;
9631			if (isset(inosused, ino))
9632				panic("handle_written_bmsafemap: "
9633				    "re-allocated inode");
9634			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
9635				if ((jaddref->ja_mode & IFMT) == IFDIR)
9636					cgp->cg_cs.cs_ndir++;
9637				cgp->cg_cs.cs_nifree--;
9638				setbit(inosused, ino);
9639				chgs = 1;
9640			}
9641			jaddref->ja_state &= ~UNDONE;
9642			jaddref->ja_state |= ATTACHED;
9643			free_jaddref(jaddref);
9644		}
9645	}
9646	/*
9647	 * Restore any block allocations which are pending journal writes.
9648	 */
9649	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
9650		cgp = (struct cg *)bp->b_data;
9651		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9652		blksfree = cg_blksfree(cgp);
9653		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
9654		    jntmp) {
9655			if ((jnewblk->jn_state & UNDONE) == 0)
9656				continue;
9657			bno = dtogd(fs, jnewblk->jn_blkno);
9658			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
9659			    i++) {
9660				if (bp->b_xflags & BX_BKGRDMARKER)
9661					break;
9662				if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
9663				    isclr(blksfree, bno + i))
9664					panic("handle_written_bmsafemap: "
9665					    "re-allocated fragment");
9666				clrbit(blksfree, bno + i);
9667				chgs = 1;
9668			}
9669			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
9670			jnewblk->jn_state |= ATTACHED;
9671			free_jnewblk(jnewblk);
9672		}
9673	}
9674	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
9675		newblk->nb_state |= DEPCOMPLETE;
9676		newblk->nb_state &= ~ONDEPLIST;
9677		newblk->nb_bmsafemap = NULL;
9678		LIST_REMOVE(newblk, nb_deps);
9679		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
9680			handle_allocdirect_partdone(
9681			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
9682		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
9683			handle_allocindir_partdone(
9684			    WK_ALLOCINDIR(&newblk->nb_list));
9685		else if (newblk->nb_list.wk_type != D_NEWBLK)
9686			panic("handle_written_bmsafemap: Unexpected type: %s",
9687			    TYPENAME(newblk->nb_list.wk_type));
9688	}
9689	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
9690		inodedep->id_state |= DEPCOMPLETE;
9691		inodedep->id_state &= ~ONDEPLIST;
9692		LIST_REMOVE(inodedep, id_deps);
9693		inodedep->id_bmsafemap = NULL;
9694	}
9695	if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
9696	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
9697	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
9698	    LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
9699		if (chgs)
9700			bdirty(bp);
9701		LIST_REMOVE(bmsafemap, sm_hash);
9702		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
9703		return (0);
9704	}
9705	bdirty(bp);
9706	return (1);
9707}
9708
9709/*
9710 * Try to free a mkdir dependency.
9711 */
9712static void
9713complete_mkdir(mkdir)
9714	struct mkdir *mkdir;
9715{
9716	struct diradd *dap;
9717
9718	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
9719		return;
9720	LIST_REMOVE(mkdir, md_mkdirs);
9721	dap = mkdir->md_diradd;
9722	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
9723	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
9724		dap->da_state |= DEPCOMPLETE;
9725		complete_diradd(dap);
9726	}
9727	WORKITEM_FREE(mkdir, D_MKDIR);
9728}
9729
9730/*
9731 * Handle the completion of a mkdir dependency.
9732 */
9733static void
9734handle_written_mkdir(mkdir, type)
9735	struct mkdir *mkdir;
9736	int type;
9737{
9738
9739	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
9740		panic("handle_written_mkdir: bad type");
9741	mkdir->md_state |= COMPLETE;
9742	complete_mkdir(mkdir);
9743}
9744
9745static void
9746free_pagedep(pagedep)
9747	struct pagedep *pagedep;
9748{
9749	int i;
9750
9751	if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
9752		return;
9753	for (i = 0; i < DAHASHSZ; i++)
9754		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
9755			return;
9756	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
9757		return;
9758	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
9759		return;
9760	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
9761		return;
9762	LIST_REMOVE(pagedep, pd_hash);
9763	WORKITEM_FREE(pagedep, D_PAGEDEP);
9764}
9765
9766/*
9767 * Called from within softdep_disk_write_complete above.
9768 * A write operation was just completed. Removed inodes can
9769 * now be freed and associated block pointers may be committed.
9770 * Note that this routine is always called from interrupt level
9771 * with further splbio interrupts blocked.
9772 */
9773static int
9774handle_written_filepage(pagedep, bp)
9775	struct pagedep *pagedep;
9776	struct buf *bp;		/* buffer containing the written page */
9777{
9778	struct dirrem *dirrem;
9779	struct diradd *dap, *nextdap;
9780	struct direct *ep;
9781	int i, chgs;
9782
9783	if ((pagedep->pd_state & IOSTARTED) == 0)
9784		panic("handle_written_filepage: not started");
9785	pagedep->pd_state &= ~IOSTARTED;
9786	/*
9787	 * Process any directory removals that have been committed.
9788	 */
9789	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
9790		LIST_REMOVE(dirrem, dm_next);
9791		dirrem->dm_state |= COMPLETE;
9792		dirrem->dm_dirinum = pagedep->pd_ino;
9793		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9794		    ("handle_written_filepage: Journal entries not written."));
9795		add_to_worklist(&dirrem->dm_list, 0);
9796	}
9797	/*
9798	 * Free any directory additions that have been committed.
9799	 * If it is a newly allocated block, we have to wait until
9800	 * the on-disk directory inode claims the new block.
9801	 */
9802	if ((pagedep->pd_state & NEWBLOCK) == 0)
9803		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
9804			free_diradd(dap, NULL);
9805	/*
9806	 * Uncommitted directory entries must be restored.
9807	 */
9808	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
9809		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
9810		     dap = nextdap) {
9811			nextdap = LIST_NEXT(dap, da_pdlist);
9812			if (dap->da_state & ATTACHED)
9813				panic("handle_written_filepage: attached");
9814			ep = (struct direct *)
9815			    ((char *)bp->b_data + dap->da_offset);
9816			ep->d_ino = dap->da_newinum;
9817			dap->da_state &= ~UNDONE;
9818			dap->da_state |= ATTACHED;
9819			chgs = 1;
9820			/*
9821			 * If the inode referenced by the directory has
9822			 * been written out, then the dependency can be
9823			 * moved to the pending list.
9824			 */
9825			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
9826				LIST_REMOVE(dap, da_pdlist);
9827				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
9828				    da_pdlist);
9829			}
9830		}
9831	}
9832	/*
9833	 * If there were any rollbacks in the directory, then it must be
9834	 * marked dirty so that its will eventually get written back in
9835	 * its correct form.
9836	 */
9837	if (chgs) {
9838		if ((bp->b_flags & B_DELWRI) == 0)
9839			stat_dir_entry++;
9840		bdirty(bp);
9841		return (1);
9842	}
9843	/*
9844	 * If we are not waiting for a new directory block to be
9845	 * claimed by its inode, then the pagedep will be freed.
9846	 * Otherwise it will remain to track any new entries on
9847	 * the page in case they are fsync'ed.
9848	 */
9849	if ((pagedep->pd_state & NEWBLOCK) == 0 &&
9850	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
9851		LIST_REMOVE(pagedep, pd_hash);
9852		WORKITEM_FREE(pagedep, D_PAGEDEP);
9853	}
9854	return (0);
9855}
9856
9857/*
9858 * Writing back in-core inode structures.
9859 *
9860 * The filesystem only accesses an inode's contents when it occupies an
9861 * "in-core" inode structure.  These "in-core" structures are separate from
9862 * the page frames used to cache inode blocks.  Only the latter are
9863 * transferred to/from the disk.  So, when the updated contents of the
9864 * "in-core" inode structure are copied to the corresponding in-memory inode
9865 * block, the dependencies are also transferred.  The following procedure is
9866 * called when copying a dirty "in-core" inode to a cached inode block.
9867 */
9868
9869/*
9870 * Called when an inode is loaded from disk. If the effective link count
9871 * differed from the actual link count when it was last flushed, then we
9872 * need to ensure that the correct effective link count is put back.
9873 */
9874void
9875softdep_load_inodeblock(ip)
9876	struct inode *ip;	/* the "in_core" copy of the inode */
9877{
9878	struct inodedep *inodedep;
9879
9880	/*
9881	 * Check for alternate nlink count.
9882	 */
9883	ip->i_effnlink = ip->i_nlink;
9884	ACQUIRE_LOCK(&lk);
9885	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9886	    &inodedep) == 0) {
9887		FREE_LOCK(&lk);
9888		return;
9889	}
9890	ip->i_effnlink -= inodedep->id_nlinkdelta;
9891	FREE_LOCK(&lk);
9892}
9893
9894/*
9895 * This routine is called just before the "in-core" inode
9896 * information is to be copied to the in-memory inode block.
9897 * Recall that an inode block contains several inodes. If
9898 * the force flag is set, then the dependencies will be
9899 * cleared so that the update can always be made. Note that
9900 * the buffer is locked when this routine is called, so we
9901 * will never be in the middle of writing the inode block
9902 * to disk.
9903 */
9904void
9905softdep_update_inodeblock(ip, bp, waitfor)
9906	struct inode *ip;	/* the "in_core" copy of the inode */
9907	struct buf *bp;		/* the buffer containing the inode block */
9908	int waitfor;		/* nonzero => update must be allowed */
9909{
9910	struct inodedep *inodedep;
9911	struct inoref *inoref;
9912	struct worklist *wk;
9913	struct mount *mp;
9914	struct buf *ibp;
9915	struct fs *fs;
9916	int error;
9917
9918	mp = UFSTOVFS(ip->i_ump);
9919	fs = ip->i_fs;
9920	/*
9921	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
9922	 * does not have access to the in-core ip so must write directly into
9923	 * the inode block buffer when setting freelink.
9924	 */
9925	if (fs->fs_magic == FS_UFS1_MAGIC)
9926		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
9927		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
9928	else
9929		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
9930		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
9931	/*
9932	 * If the effective link count is not equal to the actual link
9933	 * count, then we must track the difference in an inodedep while
9934	 * the inode is (potentially) tossed out of the cache. Otherwise,
9935	 * if there is no existing inodedep, then there are no dependencies
9936	 * to track.
9937	 */
9938	ACQUIRE_LOCK(&lk);
9939again:
9940	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
9941		FREE_LOCK(&lk);
9942		if (ip->i_effnlink != ip->i_nlink)
9943			panic("softdep_update_inodeblock: bad link count");
9944		return;
9945	}
9946	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
9947		panic("softdep_update_inodeblock: bad delta");
9948	/*
9949	 * If we're flushing all dependencies we must also move any waiting
9950	 * for journal writes onto the bufwait list prior to I/O.
9951	 */
9952	if (waitfor) {
9953		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
9954			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
9955			    == DEPCOMPLETE) {
9956				stat_jwait_inode++;
9957				jwait(&inoref->if_list);
9958				goto again;
9959			}
9960		}
9961	}
9962	/*
9963	 * Changes have been initiated. Anything depending on these
9964	 * changes cannot occur until this inode has been written.
9965	 */
9966	inodedep->id_state &= ~COMPLETE;
9967	if ((inodedep->id_state & ONWORKLIST) == 0)
9968		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
9969	/*
9970	 * Any new dependencies associated with the incore inode must
9971	 * now be moved to the list associated with the buffer holding
9972	 * the in-memory copy of the inode. Once merged process any
9973	 * allocdirects that are completed by the merger.
9974	 */
9975	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
9976	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
9977		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
9978		    NULL);
9979	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
9980	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
9981		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
9982		    NULL);
9983	/*
9984	 * Now that the inode has been pushed into the buffer, the
9985	 * operations dependent on the inode being written to disk
9986	 * can be moved to the id_bufwait so that they will be
9987	 * processed when the buffer I/O completes.
9988	 */
9989	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
9990		WORKLIST_REMOVE(wk);
9991		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
9992	}
9993	/*
9994	 * Newly allocated inodes cannot be written until the bitmap
9995	 * that allocates them have been written (indicated by
9996	 * DEPCOMPLETE being set in id_state). If we are doing a
9997	 * forced sync (e.g., an fsync on a file), we force the bitmap
9998	 * to be written so that the update can be done.
9999	 */
10000	if (waitfor == 0) {
10001		FREE_LOCK(&lk);
10002		return;
10003	}
10004retry:
10005	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
10006		FREE_LOCK(&lk);
10007		return;
10008	}
10009	ibp = inodedep->id_bmsafemap->sm_buf;
10010	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
10011	if (ibp == NULL) {
10012		/*
10013		 * If ibp came back as NULL, the dependency could have been
10014		 * freed while we slept.  Look it up again, and check to see
10015		 * that it has completed.
10016		 */
10017		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
10018			goto retry;
10019		FREE_LOCK(&lk);
10020		return;
10021	}
10022	FREE_LOCK(&lk);
10023	if ((error = bwrite(ibp)) != 0)
10024		softdep_error("softdep_update_inodeblock: bwrite", error);
10025}
10026
10027/*
10028 * Merge the a new inode dependency list (such as id_newinoupdt) into an
10029 * old inode dependency list (such as id_inoupdt). This routine must be
10030 * called with splbio interrupts blocked.
10031 */
10032static void
10033merge_inode_lists(newlisthead, oldlisthead)
10034	struct allocdirectlst *newlisthead;
10035	struct allocdirectlst *oldlisthead;
10036{
10037	struct allocdirect *listadp, *newadp;
10038
10039	newadp = TAILQ_FIRST(newlisthead);
10040	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
10041		if (listadp->ad_offset < newadp->ad_offset) {
10042			listadp = TAILQ_NEXT(listadp, ad_next);
10043			continue;
10044		}
10045		TAILQ_REMOVE(newlisthead, newadp, ad_next);
10046		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
10047		if (listadp->ad_offset == newadp->ad_offset) {
10048			allocdirect_merge(oldlisthead, newadp,
10049			    listadp);
10050			listadp = newadp;
10051		}
10052		newadp = TAILQ_FIRST(newlisthead);
10053	}
10054	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
10055		TAILQ_REMOVE(newlisthead, newadp, ad_next);
10056		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
10057	}
10058}
10059
10060/*
10061 * If we are doing an fsync, then we must ensure that any directory
10062 * entries for the inode have been written after the inode gets to disk.
10063 */
10064int
10065softdep_fsync(vp)
10066	struct vnode *vp;	/* the "in_core" copy of the inode */
10067{
10068	struct inodedep *inodedep;
10069	struct pagedep *pagedep;
10070	struct inoref *inoref;
10071	struct worklist *wk;
10072	struct diradd *dap;
10073	struct mount *mp;
10074	struct vnode *pvp;
10075	struct inode *ip;
10076	struct buf *bp;
10077	struct fs *fs;
10078	struct thread *td = curthread;
10079	int error, flushparent, pagedep_new_block;
10080	ino_t parentino;
10081	ufs_lbn_t lbn;
10082
10083	ip = VTOI(vp);
10084	fs = ip->i_fs;
10085	mp = vp->v_mount;
10086	ACQUIRE_LOCK(&lk);
10087restart:
10088	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
10089		FREE_LOCK(&lk);
10090		return (0);
10091	}
10092	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10093		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10094		    == DEPCOMPLETE) {
10095			stat_jwait_inode++;
10096			jwait(&inoref->if_list);
10097			goto restart;
10098		}
10099	}
10100	if (!LIST_EMPTY(&inodedep->id_inowait) ||
10101	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
10102	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
10103	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
10104	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
10105		panic("softdep_fsync: pending ops %p", inodedep);
10106	for (error = 0, flushparent = 0; ; ) {
10107		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
10108			break;
10109		if (wk->wk_type != D_DIRADD)
10110			panic("softdep_fsync: Unexpected type %s",
10111			    TYPENAME(wk->wk_type));
10112		dap = WK_DIRADD(wk);
10113		/*
10114		 * Flush our parent if this directory entry has a MKDIR_PARENT
10115		 * dependency or is contained in a newly allocated block.
10116		 */
10117		if (dap->da_state & DIRCHG)
10118			pagedep = dap->da_previous->dm_pagedep;
10119		else
10120			pagedep = dap->da_pagedep;
10121		parentino = pagedep->pd_ino;
10122		lbn = pagedep->pd_lbn;
10123		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
10124			panic("softdep_fsync: dirty");
10125		if ((dap->da_state & MKDIR_PARENT) ||
10126		    (pagedep->pd_state & NEWBLOCK))
10127			flushparent = 1;
10128		else
10129			flushparent = 0;
10130		/*
10131		 * If we are being fsync'ed as part of vgone'ing this vnode,
10132		 * then we will not be able to release and recover the
10133		 * vnode below, so we just have to give up on writing its
10134		 * directory entry out. It will eventually be written, just
10135		 * not now, but then the user was not asking to have it
10136		 * written, so we are not breaking any promises.
10137		 */
10138		if (vp->v_iflag & VI_DOOMED)
10139			break;
10140		/*
10141		 * We prevent deadlock by always fetching inodes from the
10142		 * root, moving down the directory tree. Thus, when fetching
10143		 * our parent directory, we first try to get the lock. If
10144		 * that fails, we must unlock ourselves before requesting
10145		 * the lock on our parent. See the comment in ufs_lookup
10146		 * for details on possible races.
10147		 */
10148		FREE_LOCK(&lk);
10149		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
10150		    FFSV_FORCEINSMQ)) {
10151			error = vfs_busy(mp, MBF_NOWAIT);
10152			if (error != 0) {
10153				vfs_ref(mp);
10154				VOP_UNLOCK(vp, 0);
10155				error = vfs_busy(mp, 0);
10156				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10157				vfs_rel(mp);
10158				if (error != 0)
10159					return (ENOENT);
10160				if (vp->v_iflag & VI_DOOMED) {
10161					vfs_unbusy(mp);
10162					return (ENOENT);
10163				}
10164			}
10165			VOP_UNLOCK(vp, 0);
10166			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
10167			    &pvp, FFSV_FORCEINSMQ);
10168			vfs_unbusy(mp);
10169			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10170			if (vp->v_iflag & VI_DOOMED) {
10171				if (error == 0)
10172					vput(pvp);
10173				error = ENOENT;
10174			}
10175			if (error != 0)
10176				return (error);
10177		}
10178		/*
10179		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
10180		 * that are contained in direct blocks will be resolved by
10181		 * doing a ffs_update. Pagedeps contained in indirect blocks
10182		 * may require a complete sync'ing of the directory. So, we
10183		 * try the cheap and fast ffs_update first, and if that fails,
10184		 * then we do the slower ffs_syncvnode of the directory.
10185		 */
10186		if (flushparent) {
10187			int locked;
10188
10189			if ((error = ffs_update(pvp, 1)) != 0) {
10190				vput(pvp);
10191				return (error);
10192			}
10193			ACQUIRE_LOCK(&lk);
10194			locked = 1;
10195			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
10196				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
10197					if (wk->wk_type != D_DIRADD)
10198						panic("softdep_fsync: Unexpected type %s",
10199						      TYPENAME(wk->wk_type));
10200					dap = WK_DIRADD(wk);
10201					if (dap->da_state & DIRCHG)
10202						pagedep = dap->da_previous->dm_pagedep;
10203					else
10204						pagedep = dap->da_pagedep;
10205					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
10206					FREE_LOCK(&lk);
10207					locked = 0;
10208					if (pagedep_new_block &&
10209					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
10210						vput(pvp);
10211						return (error);
10212					}
10213				}
10214			}
10215			if (locked)
10216				FREE_LOCK(&lk);
10217		}
10218		/*
10219		 * Flush directory page containing the inode's name.
10220		 */
10221		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
10222		    &bp);
10223		if (error == 0)
10224			error = bwrite(bp);
10225		else
10226			brelse(bp);
10227		vput(pvp);
10228		if (error != 0)
10229			return (error);
10230		ACQUIRE_LOCK(&lk);
10231		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
10232			break;
10233	}
10234	FREE_LOCK(&lk);
10235	return (0);
10236}
10237
10238/*
10239 * Flush all the dirty bitmaps associated with the block device
10240 * before flushing the rest of the dirty blocks so as to reduce
10241 * the number of dependencies that will have to be rolled back.
10242 */
10243void
10244softdep_fsync_mountdev(vp)
10245	struct vnode *vp;
10246{
10247	struct buf *bp, *nbp;
10248	struct worklist *wk;
10249	struct bufobj *bo;
10250
10251	if (!vn_isdisk(vp, NULL))
10252		panic("softdep_fsync_mountdev: vnode not a disk");
10253	bo = &vp->v_bufobj;
10254restart:
10255	BO_LOCK(bo);
10256	ACQUIRE_LOCK(&lk);
10257	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
10258		/*
10259		 * If it is already scheduled, skip to the next buffer.
10260		 */
10261		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
10262			continue;
10263
10264		if ((bp->b_flags & B_DELWRI) == 0)
10265			panic("softdep_fsync_mountdev: not dirty");
10266		/*
10267		 * We are only interested in bitmaps with outstanding
10268		 * dependencies.
10269		 */
10270		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
10271		    wk->wk_type != D_BMSAFEMAP ||
10272		    (bp->b_vflags & BV_BKGRDINPROG)) {
10273			BUF_UNLOCK(bp);
10274			continue;
10275		}
10276		FREE_LOCK(&lk);
10277		BO_UNLOCK(bo);
10278		bremfree(bp);
10279		(void) bawrite(bp);
10280		goto restart;
10281	}
10282	FREE_LOCK(&lk);
10283	drain_output(vp);
10284	BO_UNLOCK(bo);
10285}
10286
10287/*
10288 * This routine is called when we are trying to synchronously flush a
10289 * file. This routine must eliminate any filesystem metadata dependencies
10290 * so that the syncing routine can succeed by pushing the dirty blocks
10291 * associated with the file. If any I/O errors occur, they are returned.
10292 */
10293int
10294softdep_sync_metadata(struct vnode *vp)
10295{
10296	struct pagedep *pagedep;
10297	struct allocindir *aip;
10298	struct newblk *newblk;
10299	struct buf *bp, *nbp;
10300	struct worklist *wk;
10301	struct bufobj *bo;
10302	int i, error, waitfor;
10303
10304	if (!DOINGSOFTDEP(vp))
10305		return (0);
10306	/*
10307	 * Ensure that any direct block dependencies have been cleared.
10308	 */
10309	ACQUIRE_LOCK(&lk);
10310	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
10311		FREE_LOCK(&lk);
10312		return (error);
10313	}
10314	FREE_LOCK(&lk);
10315	/*
10316	 * For most files, the only metadata dependencies are the
10317	 * cylinder group maps that allocate their inode or blocks.
10318	 * The block allocation dependencies can be found by traversing
10319	 * the dependency lists for any buffers that remain on their
10320	 * dirty buffer list. The inode allocation dependency will
10321	 * be resolved when the inode is updated with MNT_WAIT.
10322	 * This work is done in two passes. The first pass grabs most
10323	 * of the buffers and begins asynchronously writing them. The
10324	 * only way to wait for these asynchronous writes is to sleep
10325	 * on the filesystem vnode which may stay busy for a long time
10326	 * if the filesystem is active. So, instead, we make a second
10327	 * pass over the dependencies blocking on each write. In the
10328	 * usual case we will be blocking against a write that we
10329	 * initiated, so when it is done the dependency will have been
10330	 * resolved. Thus the second pass is expected to end quickly.
10331	 */
10332	waitfor = MNT_NOWAIT;
10333	bo = &vp->v_bufobj;
10334
10335top:
10336	/*
10337	 * We must wait for any I/O in progress to finish so that
10338	 * all potential buffers on the dirty list will be visible.
10339	 */
10340	BO_LOCK(bo);
10341	drain_output(vp);
10342	while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) {
10343		bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT);
10344		if (bp)
10345			break;
10346	}
10347	BO_UNLOCK(bo);
10348	if (bp == NULL)
10349		return (0);
10350loop:
10351	/* While syncing snapshots, we must allow recursive lookups */
10352	BUF_AREC(bp);
10353	ACQUIRE_LOCK(&lk);
10354	/*
10355	 * As we hold the buffer locked, none of its dependencies
10356	 * will disappear.
10357	 */
10358	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
10359		switch (wk->wk_type) {
10360
10361		case D_ALLOCDIRECT:
10362		case D_ALLOCINDIR:
10363			newblk = WK_NEWBLK(wk);
10364			if (newblk->nb_jnewblk != NULL) {
10365				stat_jwait_newblk++;
10366				jwait(&newblk->nb_jnewblk->jn_list);
10367				goto restart;
10368			}
10369			if (newblk->nb_state & DEPCOMPLETE)
10370				continue;
10371			nbp = newblk->nb_bmsafemap->sm_buf;
10372			nbp = getdirtybuf(nbp, &lk, waitfor);
10373			if (nbp == NULL)
10374				continue;
10375			FREE_LOCK(&lk);
10376			if (waitfor == MNT_NOWAIT) {
10377				bawrite(nbp);
10378			} else if ((error = bwrite(nbp)) != 0) {
10379				break;
10380			}
10381			ACQUIRE_LOCK(&lk);
10382			continue;
10383
10384		case D_INDIRDEP:
10385		restart:
10386
10387			LIST_FOREACH(aip,
10388			    &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
10389				newblk = (struct newblk *)aip;
10390				if (newblk->nb_jnewblk != NULL) {
10391					stat_jwait_newblk++;
10392					jwait(&newblk->nb_jnewblk->jn_list);
10393					goto restart;
10394				}
10395				if (newblk->nb_state & DEPCOMPLETE)
10396					continue;
10397				nbp = newblk->nb_bmsafemap->sm_buf;
10398				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
10399				if (nbp == NULL)
10400					goto restart;
10401				FREE_LOCK(&lk);
10402				if ((error = bwrite(nbp)) != 0) {
10403					goto loop_end;
10404				}
10405				ACQUIRE_LOCK(&lk);
10406				goto restart;
10407			}
10408			continue;
10409
10410		case D_PAGEDEP:
10411			/*
10412			 * We are trying to sync a directory that may
10413			 * have dependencies on both its own metadata
10414			 * and/or dependencies on the inodes of any
10415			 * recently allocated files. We walk its diradd
10416			 * lists pushing out the associated inode.
10417			 */
10418			pagedep = WK_PAGEDEP(wk);
10419			for (i = 0; i < DAHASHSZ; i++) {
10420				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
10421					continue;
10422				if ((error =
10423				    flush_pagedep_deps(vp, wk->wk_mp,
10424						&pagedep->pd_diraddhd[i]))) {
10425					FREE_LOCK(&lk);
10426					goto loop_end;
10427				}
10428			}
10429			continue;
10430
10431		default:
10432			panic("softdep_sync_metadata: Unknown type %s",
10433			    TYPENAME(wk->wk_type));
10434			/* NOTREACHED */
10435		}
10436	loop_end:
10437		/* We reach here only in error and unlocked */
10438		if (error == 0)
10439			panic("softdep_sync_metadata: zero error");
10440		BUF_NOREC(bp);
10441		bawrite(bp);
10442		return (error);
10443	}
10444	FREE_LOCK(&lk);
10445	BO_LOCK(bo);
10446	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
10447		nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT);
10448		if (nbp)
10449			break;
10450	}
10451	BO_UNLOCK(bo);
10452	BUF_NOREC(bp);
10453	bawrite(bp);
10454	if (nbp != NULL) {
10455		bp = nbp;
10456		goto loop;
10457	}
10458	/*
10459	 * The brief unlock is to allow any pent up dependency
10460	 * processing to be done. Then proceed with the second pass.
10461	 */
10462	if (waitfor == MNT_NOWAIT) {
10463		waitfor = MNT_WAIT;
10464		goto top;
10465	}
10466
10467	/*
10468	 * If we have managed to get rid of all the dirty buffers,
10469	 * then we are done. For certain directories and block
10470	 * devices, we may need to do further work.
10471	 *
10472	 * We must wait for any I/O in progress to finish so that
10473	 * all potential buffers on the dirty list will be visible.
10474	 */
10475	BO_LOCK(bo);
10476	drain_output(vp);
10477	BO_UNLOCK(bo);
10478	return ffs_update(vp, 1);
10479	/* return (0); */
10480}
10481
10482/*
10483 * Flush the dependencies associated with an inodedep.
10484 * Called with splbio blocked.
10485 */
10486static int
10487flush_inodedep_deps(mp, ino)
10488	struct mount *mp;
10489	ino_t ino;
10490{
10491	struct inodedep *inodedep;
10492	struct inoref *inoref;
10493	int error, waitfor;
10494
10495	/*
10496	 * This work is done in two passes. The first pass grabs most
10497	 * of the buffers and begins asynchronously writing them. The
10498	 * only way to wait for these asynchronous writes is to sleep
10499	 * on the filesystem vnode which may stay busy for a long time
10500	 * if the filesystem is active. So, instead, we make a second
10501	 * pass over the dependencies blocking on each write. In the
10502	 * usual case we will be blocking against a write that we
10503	 * initiated, so when it is done the dependency will have been
10504	 * resolved. Thus the second pass is expected to end quickly.
10505	 * We give a brief window at the top of the loop to allow
10506	 * any pending I/O to complete.
10507	 */
10508	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
10509		if (error)
10510			return (error);
10511		FREE_LOCK(&lk);
10512		ACQUIRE_LOCK(&lk);
10513restart:
10514		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
10515			return (0);
10516		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10517			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10518			    == DEPCOMPLETE) {
10519				stat_jwait_inode++;
10520				jwait(&inoref->if_list);
10521				goto restart;
10522			}
10523		}
10524		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
10525		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
10526		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
10527		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
10528			continue;
10529		/*
10530		 * If pass2, we are done, otherwise do pass 2.
10531		 */
10532		if (waitfor == MNT_WAIT)
10533			break;
10534		waitfor = MNT_WAIT;
10535	}
10536	/*
10537	 * Try freeing inodedep in case all dependencies have been removed.
10538	 */
10539	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
10540		(void) free_inodedep(inodedep);
10541	return (0);
10542}
10543
10544/*
10545 * Flush an inode dependency list.
10546 * Called with splbio blocked.
10547 */
10548static int
10549flush_deplist(listhead, waitfor, errorp)
10550	struct allocdirectlst *listhead;
10551	int waitfor;
10552	int *errorp;
10553{
10554	struct allocdirect *adp;
10555	struct newblk *newblk;
10556	struct buf *bp;
10557
10558	mtx_assert(&lk, MA_OWNED);
10559	TAILQ_FOREACH(adp, listhead, ad_next) {
10560		newblk = (struct newblk *)adp;
10561		if (newblk->nb_jnewblk != NULL) {
10562			stat_jwait_newblk++;
10563			jwait(&newblk->nb_jnewblk->jn_list);
10564			return (1);
10565		}
10566		if (newblk->nb_state & DEPCOMPLETE)
10567			continue;
10568		bp = newblk->nb_bmsafemap->sm_buf;
10569		bp = getdirtybuf(bp, &lk, waitfor);
10570		if (bp == NULL) {
10571			if (waitfor == MNT_NOWAIT)
10572				continue;
10573			return (1);
10574		}
10575		FREE_LOCK(&lk);
10576		if (waitfor == MNT_NOWAIT) {
10577			bawrite(bp);
10578		} else if ((*errorp = bwrite(bp)) != 0) {
10579			ACQUIRE_LOCK(&lk);
10580			return (1);
10581		}
10582		ACQUIRE_LOCK(&lk);
10583		return (1);
10584	}
10585	return (0);
10586}
10587
10588/*
10589 * Flush dependencies associated with an allocdirect block.
10590 */
10591static int
10592flush_newblk_dep(vp, mp, lbn)
10593	struct vnode *vp;
10594	struct mount *mp;
10595	ufs_lbn_t lbn;
10596{
10597	struct newblk *newblk;
10598	struct bufobj *bo;
10599	struct inode *ip;
10600	struct buf *bp;
10601	ufs2_daddr_t blkno;
10602	int error;
10603
10604	error = 0;
10605	bo = &vp->v_bufobj;
10606	ip = VTOI(vp);
10607	blkno = DIP(ip, i_db[lbn]);
10608	if (blkno == 0)
10609		panic("flush_newblk_dep: Missing block");
10610	ACQUIRE_LOCK(&lk);
10611	/*
10612	 * Loop until all dependencies related to this block are satisfied.
10613	 * We must be careful to restart after each sleep in case a write
10614	 * completes some part of this process for us.
10615	 */
10616	for (;;) {
10617		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
10618			FREE_LOCK(&lk);
10619			break;
10620		}
10621		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
10622			panic("flush_newblk_deps: Bad newblk %p", newblk);
10623		/*
10624		 * Flush the journal.
10625		 */
10626		if (newblk->nb_jnewblk != NULL) {
10627			stat_jwait_newblk++;
10628			jwait(&newblk->nb_jnewblk->jn_list);
10629			continue;
10630		}
10631		/*
10632		 * Write the bitmap dependency.
10633		 */
10634		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
10635			bp = newblk->nb_bmsafemap->sm_buf;
10636			bp = getdirtybuf(bp, &lk, MNT_WAIT);
10637			if (bp == NULL)
10638				continue;
10639			FREE_LOCK(&lk);
10640			error = bwrite(bp);
10641			if (error)
10642				break;
10643			ACQUIRE_LOCK(&lk);
10644			continue;
10645		}
10646		/*
10647		 * Write the buffer.
10648		 */
10649		FREE_LOCK(&lk);
10650		BO_LOCK(bo);
10651		bp = gbincore(bo, lbn);
10652		if (bp != NULL) {
10653			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
10654			    LK_INTERLOCK, BO_MTX(bo));
10655			if (error == ENOLCK) {
10656				ACQUIRE_LOCK(&lk);
10657				continue; /* Slept, retry */
10658			}
10659			if (error != 0)
10660				break;	/* Failed */
10661			if (bp->b_flags & B_DELWRI) {
10662				bremfree(bp);
10663				error = bwrite(bp);
10664				if (error)
10665					break;
10666			} else
10667				BUF_UNLOCK(bp);
10668		} else
10669			BO_UNLOCK(bo);
10670		/*
10671		 * We have to wait for the direct pointers to
10672		 * point at the newdirblk before the dependency
10673		 * will go away.
10674		 */
10675		error = ffs_update(vp, MNT_WAIT);
10676		if (error)
10677			break;
10678		ACQUIRE_LOCK(&lk);
10679	}
10680	return (error);
10681}
10682
10683/*
10684 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
10685 * Called with splbio blocked.
10686 */
10687static int
10688flush_pagedep_deps(pvp, mp, diraddhdp)
10689	struct vnode *pvp;
10690	struct mount *mp;
10691	struct diraddhd *diraddhdp;
10692{
10693	struct inodedep *inodedep;
10694	struct inoref *inoref;
10695	struct ufsmount *ump;
10696	struct diradd *dap;
10697	struct vnode *vp;
10698	int error = 0;
10699	struct buf *bp;
10700	ino_t inum;
10701
10702	ump = VFSTOUFS(mp);
10703restart:
10704	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
10705		/*
10706		 * Flush ourselves if this directory entry
10707		 * has a MKDIR_PARENT dependency.
10708		 */
10709		if (dap->da_state & MKDIR_PARENT) {
10710			FREE_LOCK(&lk);
10711			if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
10712				break;
10713			ACQUIRE_LOCK(&lk);
10714			/*
10715			 * If that cleared dependencies, go on to next.
10716			 */
10717			if (dap != LIST_FIRST(diraddhdp))
10718				continue;
10719			if (dap->da_state & MKDIR_PARENT)
10720				panic("flush_pagedep_deps: MKDIR_PARENT");
10721		}
10722		/*
10723		 * A newly allocated directory must have its "." and
10724		 * ".." entries written out before its name can be
10725		 * committed in its parent.
10726		 */
10727		inum = dap->da_newinum;
10728		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
10729			panic("flush_pagedep_deps: lost inode1");
10730		/*
10731		 * Wait for any pending journal adds to complete so we don't
10732		 * cause rollbacks while syncing.
10733		 */
10734		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10735			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10736			    == DEPCOMPLETE) {
10737				stat_jwait_inode++;
10738				jwait(&inoref->if_list);
10739				goto restart;
10740			}
10741		}
10742		if (dap->da_state & MKDIR_BODY) {
10743			FREE_LOCK(&lk);
10744			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
10745			    FFSV_FORCEINSMQ)))
10746				break;
10747			error = flush_newblk_dep(vp, mp, 0);
10748			/*
10749			 * If we still have the dependency we might need to
10750			 * update the vnode to sync the new link count to
10751			 * disk.
10752			 */
10753			if (error == 0 && dap == LIST_FIRST(diraddhdp))
10754				error = ffs_update(vp, MNT_WAIT);
10755			vput(vp);
10756			if (error != 0)
10757				break;
10758			ACQUIRE_LOCK(&lk);
10759			/*
10760			 * If that cleared dependencies, go on to next.
10761			 */
10762			if (dap != LIST_FIRST(diraddhdp))
10763				continue;
10764			if (dap->da_state & MKDIR_BODY) {
10765				inodedep_lookup(UFSTOVFS(ump), inum, 0,
10766				    &inodedep);
10767				panic("flush_pagedep_deps: MKDIR_BODY "
10768				    "inodedep %p dap %p vp %p",
10769				    inodedep, dap, vp);
10770			}
10771		}
10772		/*
10773		 * Flush the inode on which the directory entry depends.
10774		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
10775		 * the only remaining dependency is that the updated inode
10776		 * count must get pushed to disk. The inode has already
10777		 * been pushed into its inode buffer (via VOP_UPDATE) at
10778		 * the time of the reference count change. So we need only
10779		 * locate that buffer, ensure that there will be no rollback
10780		 * caused by a bitmap dependency, then write the inode buffer.
10781		 */
10782retry:
10783		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
10784			panic("flush_pagedep_deps: lost inode");
10785		/*
10786		 * If the inode still has bitmap dependencies,
10787		 * push them to disk.
10788		 */
10789		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
10790			bp = inodedep->id_bmsafemap->sm_buf;
10791			bp = getdirtybuf(bp, &lk, MNT_WAIT);
10792			if (bp == NULL)
10793				goto retry;
10794			FREE_LOCK(&lk);
10795			if ((error = bwrite(bp)) != 0)
10796				break;
10797			ACQUIRE_LOCK(&lk);
10798			if (dap != LIST_FIRST(diraddhdp))
10799				continue;
10800		}
10801		/*
10802		 * If the inode is still sitting in a buffer waiting
10803		 * to be written or waiting for the link count to be
10804		 * adjusted update it here to flush it to disk.
10805		 */
10806		if (dap == LIST_FIRST(diraddhdp)) {
10807			FREE_LOCK(&lk);
10808			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
10809			    FFSV_FORCEINSMQ)))
10810				break;
10811			error = ffs_update(vp, MNT_WAIT);
10812			vput(vp);
10813			if (error)
10814				break;
10815			ACQUIRE_LOCK(&lk);
10816		}
10817		/*
10818		 * If we have failed to get rid of all the dependencies
10819		 * then something is seriously wrong.
10820		 */
10821		if (dap == LIST_FIRST(diraddhdp)) {
10822			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
10823			panic("flush_pagedep_deps: failed to flush "
10824			    "inodedep %p ino %d dap %p", inodedep, inum, dap);
10825		}
10826	}
10827	if (error)
10828		ACQUIRE_LOCK(&lk);
10829	return (error);
10830}
10831
10832/*
10833 * A large burst of file addition or deletion activity can drive the
10834 * memory load excessively high. First attempt to slow things down
10835 * using the techniques below. If that fails, this routine requests
10836 * the offending operations to fall back to running synchronously
10837 * until the memory load returns to a reasonable level.
10838 */
10839int
10840softdep_slowdown(vp)
10841	struct vnode *vp;
10842{
10843	struct ufsmount *ump;
10844	int jlow;
10845	int max_softdeps_hard;
10846
10847	ACQUIRE_LOCK(&lk);
10848	jlow = 0;
10849	/*
10850	 * Check for journal space if needed.
10851	 */
10852	if (DOINGSUJ(vp)) {
10853		ump = VFSTOUFS(vp->v_mount);
10854		if (journal_space(ump, 0) == 0)
10855			jlow = 1;
10856	}
10857	max_softdeps_hard = max_softdeps * 11 / 10;
10858	if (num_dirrem < max_softdeps_hard / 2 &&
10859	    num_inodedep < max_softdeps_hard &&
10860	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
10861	    num_freeblkdep < max_softdeps_hard && jlow == 0) {
10862		FREE_LOCK(&lk);
10863  		return (0);
10864	}
10865	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow)
10866		softdep_speedup();
10867	stat_sync_limit_hit += 1;
10868	FREE_LOCK(&lk);
10869	return (1);
10870}
10871
10872/*
10873 * Called by the allocation routines when they are about to fail
10874 * in the hope that we can free up the requested resource (inodes
10875 * or disk space).
10876 *
10877 * First check to see if the work list has anything on it. If it has,
10878 * clean up entries until we successfully free the requested resource.
10879 * Because this process holds inodes locked, we cannot handle any remove
10880 * requests that might block on a locked inode as that could lead to
10881 * deadlock. If the worklist yields none of the requested resource,
10882 * encourage the syncer daemon to help us. In no event will we try for
10883 * longer than tickdelay seconds.
10884 */
10885int
10886softdep_request_cleanup(fs, vp, resource)
10887	struct fs *fs;
10888	struct vnode *vp;
10889	int resource;
10890{
10891	struct ufsmount *ump;
10892	long starttime;
10893	ufs2_daddr_t needed;
10894	int error;
10895
10896	ump = VTOI(vp)->i_ump;
10897	mtx_assert(UFS_MTX(ump), MA_OWNED);
10898	if (resource == FLUSH_BLOCKS_WAIT)
10899		needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
10900	else if (resource == FLUSH_INODES_WAIT)
10901		needed = fs->fs_cstotal.cs_nifree + 2;
10902	else
10903		return (0);
10904	starttime = time_second + tickdelay;
10905	/*
10906	 * If we are being called because of a process doing a
10907	 * copy-on-write, then it is not safe to update the vnode
10908	 * as we may recurse into the copy-on-write routine.
10909	 */
10910	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
10911		UFS_UNLOCK(ump);
10912		error = ffs_update(vp, 1);
10913		UFS_LOCK(ump);
10914		if (error != 0)
10915			return (0);
10916	}
10917	while ((resource == FLUSH_BLOCKS_WAIT && fs->fs_pendingblocks > 0 &&
10918		fs->fs_cstotal.cs_nbfree <= needed) ||
10919	       (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
10920		fs->fs_cstotal.cs_nifree <= needed)) {
10921		if (time_second > starttime)
10922			return (0);
10923		UFS_UNLOCK(ump);
10924		ACQUIRE_LOCK(&lk);
10925		process_removes(vp);
10926		if (ump->softdep_on_worklist > 0 &&
10927		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
10928			stat_worklist_push += 1;
10929			FREE_LOCK(&lk);
10930			UFS_LOCK(ump);
10931			continue;
10932		}
10933		request_cleanup(UFSTOVFS(ump), resource);
10934		FREE_LOCK(&lk);
10935		UFS_LOCK(ump);
10936	}
10937	return (1);
10938}
10939
10940/*
10941 * If memory utilization has gotten too high, deliberately slow things
10942 * down and speed up the I/O processing.
10943 */
10944extern struct thread *syncertd;
10945static int
10946request_cleanup(mp, resource)
10947	struct mount *mp;
10948	int resource;
10949{
10950	struct thread *td = curthread;
10951	struct ufsmount *ump;
10952
10953	mtx_assert(&lk, MA_OWNED);
10954	/*
10955	 * We never hold up the filesystem syncer or buf daemon.
10956	 */
10957	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
10958		return (0);
10959	ump = VFSTOUFS(mp);
10960	/*
10961	 * First check to see if the work list has gotten backlogged.
10962	 * If it has, co-opt this process to help clean up two entries.
10963	 * Because this process may hold inodes locked, we cannot
10964	 * handle any remove requests that might block on a locked
10965	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
10966	 * to avoid recursively processing the worklist.
10967	 */
10968	if (ump->softdep_on_worklist > max_softdeps / 10) {
10969		td->td_pflags |= TDP_SOFTDEP;
10970		process_worklist_item(mp, LK_NOWAIT);
10971		process_worklist_item(mp, LK_NOWAIT);
10972		td->td_pflags &= ~TDP_SOFTDEP;
10973		stat_worklist_push += 2;
10974		return(1);
10975	}
10976	/*
10977	 * Next, we attempt to speed up the syncer process. If that
10978	 * is successful, then we allow the process to continue.
10979	 */
10980	if (softdep_speedup() &&
10981	    resource != FLUSH_BLOCKS_WAIT &&
10982	    resource != FLUSH_INODES_WAIT)
10983		return(0);
10984	/*
10985	 * If we are resource constrained on inode dependencies, try
10986	 * flushing some dirty inodes. Otherwise, we are constrained
10987	 * by file deletions, so try accelerating flushes of directories
10988	 * with removal dependencies. We would like to do the cleanup
10989	 * here, but we probably hold an inode locked at this point and
10990	 * that might deadlock against one that we try to clean. So,
10991	 * the best that we can do is request the syncer daemon to do
10992	 * the cleanup for us.
10993	 */
10994	switch (resource) {
10995
10996	case FLUSH_INODES:
10997	case FLUSH_INODES_WAIT:
10998		stat_ino_limit_push += 1;
10999		req_clear_inodedeps += 1;
11000		stat_countp = &stat_ino_limit_hit;
11001		break;
11002
11003	case FLUSH_BLOCKS:
11004	case FLUSH_BLOCKS_WAIT:
11005		stat_blk_limit_push += 1;
11006		req_clear_remove += 1;
11007		stat_countp = &stat_blk_limit_hit;
11008		break;
11009
11010	default:
11011		panic("request_cleanup: unknown type");
11012	}
11013	/*
11014	 * Hopefully the syncer daemon will catch up and awaken us.
11015	 * We wait at most tickdelay before proceeding in any case.
11016	 */
11017	proc_waiting += 1;
11018	if (callout_pending(&softdep_callout) == FALSE)
11019		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
11020		    pause_timer, 0);
11021
11022	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
11023	proc_waiting -= 1;
11024	return (1);
11025}
11026
11027/*
11028 * Awaken processes pausing in request_cleanup and clear proc_waiting
11029 * to indicate that there is no longer a timer running.
11030 */
11031static void
11032pause_timer(arg)
11033	void *arg;
11034{
11035
11036	/*
11037	 * The callout_ API has acquired mtx and will hold it around this
11038	 * function call.
11039	 */
11040	*stat_countp += 1;
11041	wakeup_one(&proc_waiting);
11042	if (proc_waiting > 0)
11043		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
11044		    pause_timer, 0);
11045}
11046
11047/*
11048 * Flush out a directory with at least one removal dependency in an effort to
11049 * reduce the number of dirrem, freefile, and freeblks dependency structures.
11050 */
11051static void
11052clear_remove(td)
11053	struct thread *td;
11054{
11055	struct pagedep_hashhead *pagedephd;
11056	struct pagedep *pagedep;
11057	static int next = 0;
11058	struct mount *mp;
11059	struct vnode *vp;
11060	struct bufobj *bo;
11061	int error, cnt;
11062	ino_t ino;
11063
11064	mtx_assert(&lk, MA_OWNED);
11065
11066	for (cnt = 0; cnt < pagedep_hash; cnt++) {
11067		pagedephd = &pagedep_hashtbl[next++];
11068		if (next >= pagedep_hash)
11069			next = 0;
11070		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
11071			if (LIST_EMPTY(&pagedep->pd_dirremhd))
11072				continue;
11073			mp = pagedep->pd_list.wk_mp;
11074			ino = pagedep->pd_ino;
11075			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
11076				continue;
11077			FREE_LOCK(&lk);
11078
11079			/*
11080			 * Let unmount clear deps
11081			 */
11082			error = vfs_busy(mp, MBF_NOWAIT);
11083			if (error != 0)
11084				goto finish_write;
11085			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
11086			     FFSV_FORCEINSMQ);
11087			vfs_unbusy(mp);
11088			if (error != 0) {
11089				softdep_error("clear_remove: vget", error);
11090				goto finish_write;
11091			}
11092			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11093				softdep_error("clear_remove: fsync", error);
11094			bo = &vp->v_bufobj;
11095			BO_LOCK(bo);
11096			drain_output(vp);
11097			BO_UNLOCK(bo);
11098			vput(vp);
11099		finish_write:
11100			vn_finished_write(mp);
11101			ACQUIRE_LOCK(&lk);
11102			return;
11103		}
11104	}
11105}
11106
11107/*
11108 * Clear out a block of dirty inodes in an effort to reduce
11109 * the number of inodedep dependency structures.
11110 */
11111static void
11112clear_inodedeps(td)
11113	struct thread *td;
11114{
11115	struct inodedep_hashhead *inodedephd;
11116	struct inodedep *inodedep;
11117	static int next = 0;
11118	struct mount *mp;
11119	struct vnode *vp;
11120	struct fs *fs;
11121	int error, cnt;
11122	ino_t firstino, lastino, ino;
11123
11124	mtx_assert(&lk, MA_OWNED);
11125	/*
11126	 * Pick a random inode dependency to be cleared.
11127	 * We will then gather up all the inodes in its block
11128	 * that have dependencies and flush them out.
11129	 */
11130	for (cnt = 0; cnt < inodedep_hash; cnt++) {
11131		inodedephd = &inodedep_hashtbl[next++];
11132		if (next >= inodedep_hash)
11133			next = 0;
11134		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
11135			break;
11136	}
11137	if (inodedep == NULL)
11138		return;
11139	fs = inodedep->id_fs;
11140	mp = inodedep->id_list.wk_mp;
11141	/*
11142	 * Find the last inode in the block with dependencies.
11143	 */
11144	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
11145	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
11146		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
11147			break;
11148	/*
11149	 * Asynchronously push all but the last inode with dependencies.
11150	 * Synchronously push the last inode with dependencies to ensure
11151	 * that the inode block gets written to free up the inodedeps.
11152	 */
11153	for (ino = firstino; ino <= lastino; ino++) {
11154		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
11155			continue;
11156		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
11157			continue;
11158		FREE_LOCK(&lk);
11159		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
11160		if (error != 0) {
11161			vn_finished_write(mp);
11162			ACQUIRE_LOCK(&lk);
11163			return;
11164		}
11165		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
11166		    FFSV_FORCEINSMQ)) != 0) {
11167			softdep_error("clear_inodedeps: vget", error);
11168			vfs_unbusy(mp);
11169			vn_finished_write(mp);
11170			ACQUIRE_LOCK(&lk);
11171			return;
11172		}
11173		vfs_unbusy(mp);
11174		if (ino == lastino) {
11175			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
11176				softdep_error("clear_inodedeps: fsync1", error);
11177		} else {
11178			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11179				softdep_error("clear_inodedeps: fsync2", error);
11180			BO_LOCK(&vp->v_bufobj);
11181			drain_output(vp);
11182			BO_UNLOCK(&vp->v_bufobj);
11183		}
11184		vput(vp);
11185		vn_finished_write(mp);
11186		ACQUIRE_LOCK(&lk);
11187	}
11188}
11189
11190/*
11191 * Function to determine if the buffer has outstanding dependencies
11192 * that will cause a roll-back if the buffer is written. If wantcount
11193 * is set, return number of dependencies, otherwise just yes or no.
11194 */
11195static int
11196softdep_count_dependencies(bp, wantcount)
11197	struct buf *bp;
11198	int wantcount;
11199{
11200	struct worklist *wk;
11201	struct bmsafemap *bmsafemap;
11202	struct inodedep *inodedep;
11203	struct indirdep *indirdep;
11204	struct freeblks *freeblks;
11205	struct allocindir *aip;
11206	struct pagedep *pagedep;
11207	struct dirrem *dirrem;
11208	struct newblk *newblk;
11209	struct mkdir *mkdir;
11210	struct diradd *dap;
11211	int i, retval;
11212
11213	retval = 0;
11214	ACQUIRE_LOCK(&lk);
11215	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11216		switch (wk->wk_type) {
11217
11218		case D_INODEDEP:
11219			inodedep = WK_INODEDEP(wk);
11220			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
11221				/* bitmap allocation dependency */
11222				retval += 1;
11223				if (!wantcount)
11224					goto out;
11225			}
11226			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
11227				/* direct block pointer dependency */
11228				retval += 1;
11229				if (!wantcount)
11230					goto out;
11231			}
11232			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
11233				/* direct block pointer dependency */
11234				retval += 1;
11235				if (!wantcount)
11236					goto out;
11237			}
11238			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
11239				/* Add reference dependency. */
11240				retval += 1;
11241				if (!wantcount)
11242					goto out;
11243			}
11244			continue;
11245
11246		case D_INDIRDEP:
11247			indirdep = WK_INDIRDEP(wk);
11248
11249			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
11250				/* indirect block pointer dependency */
11251				retval += 1;
11252				if (!wantcount)
11253					goto out;
11254			}
11255			continue;
11256
11257		case D_PAGEDEP:
11258			pagedep = WK_PAGEDEP(wk);
11259			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
11260				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
11261					/* Journal remove ref dependency. */
11262					retval += 1;
11263					if (!wantcount)
11264						goto out;
11265				}
11266			}
11267			for (i = 0; i < DAHASHSZ; i++) {
11268
11269				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
11270					/* directory entry dependency */
11271					retval += 1;
11272					if (!wantcount)
11273						goto out;
11274				}
11275			}
11276			continue;
11277
11278		case D_BMSAFEMAP:
11279			bmsafemap = WK_BMSAFEMAP(wk);
11280			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
11281				/* Add reference dependency. */
11282				retval += 1;
11283				if (!wantcount)
11284					goto out;
11285			}
11286			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
11287				/* Allocate block dependency. */
11288				retval += 1;
11289				if (!wantcount)
11290					goto out;
11291			}
11292			continue;
11293
11294		case D_FREEBLKS:
11295			freeblks = WK_FREEBLKS(wk);
11296			if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
11297				/* Freeblk journal dependency. */
11298				retval += 1;
11299				if (!wantcount)
11300					goto out;
11301			}
11302			continue;
11303
11304		case D_ALLOCDIRECT:
11305		case D_ALLOCINDIR:
11306			newblk = WK_NEWBLK(wk);
11307			if (newblk->nb_jnewblk) {
11308				/* Journal allocate dependency. */
11309				retval += 1;
11310				if (!wantcount)
11311					goto out;
11312			}
11313			continue;
11314
11315		case D_MKDIR:
11316			mkdir = WK_MKDIR(wk);
11317			if (mkdir->md_jaddref) {
11318				/* Journal reference dependency. */
11319				retval += 1;
11320				if (!wantcount)
11321					goto out;
11322			}
11323			continue;
11324
11325		case D_FREEWORK:
11326		case D_FREEDEP:
11327		case D_JSEGDEP:
11328		case D_JSEG:
11329		case D_SBDEP:
11330			/* never a dependency on these blocks */
11331			continue;
11332
11333		default:
11334			panic("softdep_count_dependencies: Unexpected type %s",
11335			    TYPENAME(wk->wk_type));
11336			/* NOTREACHED */
11337		}
11338	}
11339out:
11340	FREE_LOCK(&lk);
11341	return retval;
11342}
11343
11344/*
11345 * Acquire exclusive access to a buffer.
11346 * Must be called with a locked mtx parameter.
11347 * Return acquired buffer or NULL on failure.
11348 */
11349static struct buf *
11350getdirtybuf(bp, mtx, waitfor)
11351	struct buf *bp;
11352	struct mtx *mtx;
11353	int waitfor;
11354{
11355	int error;
11356
11357	mtx_assert(mtx, MA_OWNED);
11358	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
11359		if (waitfor != MNT_WAIT)
11360			return (NULL);
11361		error = BUF_LOCK(bp,
11362		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
11363		/*
11364		 * Even if we sucessfully acquire bp here, we have dropped
11365		 * mtx, which may violates our guarantee.
11366		 */
11367		if (error == 0)
11368			BUF_UNLOCK(bp);
11369		else if (error != ENOLCK)
11370			panic("getdirtybuf: inconsistent lock: %d", error);
11371		mtx_lock(mtx);
11372		return (NULL);
11373	}
11374	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11375		if (mtx == &lk && waitfor == MNT_WAIT) {
11376			mtx_unlock(mtx);
11377			BO_LOCK(bp->b_bufobj);
11378			BUF_UNLOCK(bp);
11379			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11380				bp->b_vflags |= BV_BKGRDWAIT;
11381				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
11382				       PRIBIO | PDROP, "getbuf", 0);
11383			} else
11384				BO_UNLOCK(bp->b_bufobj);
11385			mtx_lock(mtx);
11386			return (NULL);
11387		}
11388		BUF_UNLOCK(bp);
11389		if (waitfor != MNT_WAIT)
11390			return (NULL);
11391		/*
11392		 * The mtx argument must be bp->b_vp's mutex in
11393		 * this case.
11394		 */
11395#ifdef	DEBUG_VFS_LOCKS
11396		if (bp->b_vp->v_type != VCHR)
11397			ASSERT_BO_LOCKED(bp->b_bufobj);
11398#endif
11399		bp->b_vflags |= BV_BKGRDWAIT;
11400		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
11401		return (NULL);
11402	}
11403	if ((bp->b_flags & B_DELWRI) == 0) {
11404		BUF_UNLOCK(bp);
11405		return (NULL);
11406	}
11407	bremfree(bp);
11408	return (bp);
11409}
11410
11411
11412/*
11413 * Check if it is safe to suspend the file system now.  On entry,
11414 * the vnode interlock for devvp should be held.  Return 0 with
11415 * the mount interlock held if the file system can be suspended now,
11416 * otherwise return EAGAIN with the mount interlock held.
11417 */
11418int
11419softdep_check_suspend(struct mount *mp,
11420		      struct vnode *devvp,
11421		      int softdep_deps,
11422		      int softdep_accdeps,
11423		      int secondary_writes,
11424		      int secondary_accwrites)
11425{
11426	struct bufobj *bo;
11427	struct ufsmount *ump;
11428	int error;
11429
11430	ump = VFSTOUFS(mp);
11431	bo = &devvp->v_bufobj;
11432	ASSERT_BO_LOCKED(bo);
11433
11434	for (;;) {
11435		if (!TRY_ACQUIRE_LOCK(&lk)) {
11436			BO_UNLOCK(bo);
11437			ACQUIRE_LOCK(&lk);
11438			FREE_LOCK(&lk);
11439			BO_LOCK(bo);
11440			continue;
11441		}
11442		MNT_ILOCK(mp);
11443		if (mp->mnt_secondary_writes != 0) {
11444			FREE_LOCK(&lk);
11445			BO_UNLOCK(bo);
11446			msleep(&mp->mnt_secondary_writes,
11447			       MNT_MTX(mp),
11448			       (PUSER - 1) | PDROP, "secwr", 0);
11449			BO_LOCK(bo);
11450			continue;
11451		}
11452		break;
11453	}
11454
11455	/*
11456	 * Reasons for needing more work before suspend:
11457	 * - Dirty buffers on devvp.
11458	 * - Softdep activity occurred after start of vnode sync loop
11459	 * - Secondary writes occurred after start of vnode sync loop
11460	 */
11461	error = 0;
11462	if (bo->bo_numoutput > 0 ||
11463	    bo->bo_dirty.bv_cnt > 0 ||
11464	    softdep_deps != 0 ||
11465	    ump->softdep_deps != 0 ||
11466	    softdep_accdeps != ump->softdep_accdeps ||
11467	    secondary_writes != 0 ||
11468	    mp->mnt_secondary_writes != 0 ||
11469	    secondary_accwrites != mp->mnt_secondary_accwrites)
11470		error = EAGAIN;
11471	FREE_LOCK(&lk);
11472	BO_UNLOCK(bo);
11473	return (error);
11474}
11475
11476
11477/*
11478 * Get the number of dependency structures for the file system, both
11479 * the current number and the total number allocated.  These will
11480 * later be used to detect that softdep processing has occurred.
11481 */
11482void
11483softdep_get_depcounts(struct mount *mp,
11484		      int *softdep_depsp,
11485		      int *softdep_accdepsp)
11486{
11487	struct ufsmount *ump;
11488
11489	ump = VFSTOUFS(mp);
11490	ACQUIRE_LOCK(&lk);
11491	*softdep_depsp = ump->softdep_deps;
11492	*softdep_accdepsp = ump->softdep_accdeps;
11493	FREE_LOCK(&lk);
11494}
11495
11496/*
11497 * Wait for pending output on a vnode to complete.
11498 * Must be called with vnode lock and interlock locked.
11499 *
11500 * XXX: Should just be a call to bufobj_wwait().
11501 */
11502static void
11503drain_output(vp)
11504	struct vnode *vp;
11505{
11506	struct bufobj *bo;
11507
11508	bo = &vp->v_bufobj;
11509	ASSERT_VOP_LOCKED(vp, "drain_output");
11510	ASSERT_BO_LOCKED(bo);
11511
11512	while (bo->bo_numoutput) {
11513		bo->bo_flag |= BO_WWAIT;
11514		msleep((caddr_t)&bo->bo_numoutput,
11515		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
11516	}
11517}
11518
11519/*
11520 * Called whenever a buffer that is being invalidated or reallocated
11521 * contains dependencies. This should only happen if an I/O error has
11522 * occurred. The routine is called with the buffer locked.
11523 */
11524static void
11525softdep_deallocate_dependencies(bp)
11526	struct buf *bp;
11527{
11528
11529	if ((bp->b_ioflags & BIO_ERROR) == 0)
11530		panic("softdep_deallocate_dependencies: dangling deps");
11531	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
11532	panic("softdep_deallocate_dependencies: unrecovered I/O error");
11533}
11534
11535/*
11536 * Function to handle asynchronous write errors in the filesystem.
11537 */
11538static void
11539softdep_error(func, error)
11540	char *func;
11541	int error;
11542{
11543
11544	/* XXX should do something better! */
11545	printf("%s: got error %d while accessing filesystem\n", func, error);
11546}
11547
11548#ifdef DDB
11549
11550static void
11551inodedep_print(struct inodedep *inodedep, int verbose)
11552{
11553	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
11554	    " saveino %p\n",
11555	    inodedep, inodedep->id_fs, inodedep->id_state,
11556	    (intmax_t)inodedep->id_ino,
11557	    (intmax_t)fsbtodb(inodedep->id_fs,
11558	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
11559	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
11560	    inodedep->id_savedino1);
11561
11562	if (verbose == 0)
11563		return;
11564
11565	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
11566	    "mkdiradd %p\n",
11567	    LIST_FIRST(&inodedep->id_pendinghd),
11568	    LIST_FIRST(&inodedep->id_bufwait),
11569	    LIST_FIRST(&inodedep->id_inowait),
11570	    TAILQ_FIRST(&inodedep->id_inoreflst),
11571	    inodedep->id_mkdiradd);
11572	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
11573	    TAILQ_FIRST(&inodedep->id_inoupdt),
11574	    TAILQ_FIRST(&inodedep->id_newinoupdt),
11575	    TAILQ_FIRST(&inodedep->id_extupdt),
11576	    TAILQ_FIRST(&inodedep->id_newextupdt));
11577}
11578
11579DB_SHOW_COMMAND(inodedep, db_show_inodedep)
11580{
11581
11582	if (have_addr == 0) {
11583		db_printf("Address required\n");
11584		return;
11585	}
11586	inodedep_print((struct inodedep*)addr, 1);
11587}
11588
11589DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
11590{
11591	struct inodedep_hashhead *inodedephd;
11592	struct inodedep *inodedep;
11593	struct fs *fs;
11594	int cnt;
11595
11596	fs = have_addr ? (struct fs *)addr : NULL;
11597	for (cnt = 0; cnt < inodedep_hash; cnt++) {
11598		inodedephd = &inodedep_hashtbl[cnt];
11599		LIST_FOREACH(inodedep, inodedephd, id_hash) {
11600			if (fs != NULL && fs != inodedep->id_fs)
11601				continue;
11602			inodedep_print(inodedep, 0);
11603		}
11604	}
11605}
11606
11607DB_SHOW_COMMAND(worklist, db_show_worklist)
11608{
11609	struct worklist *wk;
11610
11611	if (have_addr == 0) {
11612		db_printf("Address required\n");
11613		return;
11614	}
11615	wk = (struct worklist *)addr;
11616	printf("worklist: %p type %s state 0x%X\n",
11617	    wk, TYPENAME(wk->wk_type), wk->wk_state);
11618}
11619
11620DB_SHOW_COMMAND(workhead, db_show_workhead)
11621{
11622	struct workhead *wkhd;
11623	struct worklist *wk;
11624	int i;
11625
11626	if (have_addr == 0) {
11627		db_printf("Address required\n");
11628		return;
11629	}
11630	wkhd = (struct workhead *)addr;
11631	wk = LIST_FIRST(wkhd);
11632	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
11633		db_printf("worklist: %p type %s state 0x%X",
11634		    wk, TYPENAME(wk->wk_type), wk->wk_state);
11635	if (i == 100)
11636		db_printf("workhead overflow");
11637	printf("\n");
11638}
11639
11640
11641DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
11642{
11643	struct jaddref *jaddref;
11644	struct diradd *diradd;
11645	struct mkdir *mkdir;
11646
11647	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
11648		diradd = mkdir->md_diradd;
11649		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
11650		    mkdir, mkdir->md_state, diradd, diradd->da_state);
11651		if ((jaddref = mkdir->md_jaddref) != NULL)
11652			db_printf(" jaddref %p jaddref state 0x%X",
11653			    jaddref, jaddref->ja_state);
11654		db_printf("\n");
11655	}
11656}
11657
11658#endif /* DDB */
11659
11660#endif /* SOFTUPDATES */
11661