ffs_softdep.c revision 207142
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 207142 2010-04-24 07:36:33Z pjd $");
44
45#include "opt_ffs.h"
46#include "opt_ddb.h"
47
48/*
49 * For now we want the safety net that the DEBUG flag provides.
50 */
51#ifndef DEBUG
52#define DEBUG
53#endif
54#define	SUJ_DEBUG
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/lock.h>
64#include <sys/malloc.h>
65#include <sys/mount.h>
66#include <sys/mutex.h>
67#include <sys/namei.h>
68#include <sys/proc.h>
69#include <sys/stat.h>
70#include <sys/sysctl.h>
71#include <sys/syslog.h>
72#include <sys/vnode.h>
73#include <sys/conf.h>
74#include <ufs/ufs/dir.h>
75#include <ufs/ufs/extattr.h>
76#include <ufs/ufs/quota.h>
77#include <ufs/ufs/inode.h>
78#include <ufs/ufs/ufsmount.h>
79#include <ufs/ffs/fs.h>
80#include <ufs/ffs/softdep.h>
81#include <ufs/ffs/ffs_extern.h>
82#include <ufs/ufs/ufs_extern.h>
83
84#include <vm/vm.h>
85
86#include <ddb/ddb.h>
87
88#ifndef SOFTUPDATES
89
90int
91softdep_flushfiles(oldmnt, flags, td)
92	struct mount *oldmnt;
93	int flags;
94	struct thread *td;
95{
96
97	panic("softdep_flushfiles called");
98}
99
100int
101softdep_mount(devvp, mp, fs, cred)
102	struct vnode *devvp;
103	struct mount *mp;
104	struct fs *fs;
105	struct ucred *cred;
106{
107
108	return (0);
109}
110
111void
112softdep_initialize()
113{
114
115	return;
116}
117
118void
119softdep_uninitialize()
120{
121
122	return;
123}
124
125void
126softdep_setup_inomapdep(bp, ip, newinum)
127	struct buf *bp;
128	struct inode *ip;
129	ino_t newinum;
130{
131
132	panic("softdep_setup_inomapdep called");
133}
134
135void
136softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
137	struct buf *bp;
138	struct mount *mp;
139	ufs2_daddr_t newblkno;
140	int frags;
141	int oldfrags;
142{
143
144	panic("softdep_setup_blkmapdep called");
145}
146
147void
148softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
149	struct inode *ip;
150	ufs_lbn_t lbn;
151	ufs2_daddr_t newblkno;
152	ufs2_daddr_t oldblkno;
153	long newsize;
154	long oldsize;
155	struct buf *bp;
156{
157
158	panic("softdep_setup_allocdirect called");
159}
160
161void
162softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
163	struct inode *ip;
164	ufs_lbn_t lbn;
165	ufs2_daddr_t newblkno;
166	ufs2_daddr_t oldblkno;
167	long newsize;
168	long oldsize;
169	struct buf *bp;
170{
171
172	panic("softdep_setup_allocext called");
173}
174
175void
176softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
177	struct inode *ip;
178	ufs_lbn_t lbn;
179	struct buf *bp;
180	int ptrno;
181	ufs2_daddr_t newblkno;
182	ufs2_daddr_t oldblkno;
183	struct buf *nbp;
184{
185
186	panic("softdep_setup_allocindir_page called");
187}
188
189void
190softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
191	struct buf *nbp;
192	struct inode *ip;
193	struct buf *bp;
194	int ptrno;
195	ufs2_daddr_t newblkno;
196{
197
198	panic("softdep_setup_allocindir_meta called");
199}
200
201void
202softdep_setup_freeblocks(ip, length, flags)
203	struct inode *ip;
204	off_t length;
205	int flags;
206{
207
208	panic("softdep_setup_freeblocks called");
209}
210
211void
212softdep_freefile(pvp, ino, mode)
213		struct vnode *pvp;
214		ino_t ino;
215		int mode;
216{
217
218	panic("softdep_freefile called");
219}
220
221int
222softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
223	struct buf *bp;
224	struct inode *dp;
225	off_t diroffset;
226	ino_t newinum;
227	struct buf *newdirbp;
228	int isnewblk;
229{
230
231	panic("softdep_setup_directory_add called");
232}
233
234void
235softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
236	struct buf *bp;
237	struct inode *dp;
238	caddr_t base;
239	caddr_t oldloc;
240	caddr_t newloc;
241	int entrysize;
242{
243
244	panic("softdep_change_directoryentry_offset called");
245}
246
247void
248softdep_setup_remove(bp, dp, ip, isrmdir)
249	struct buf *bp;
250	struct inode *dp;
251	struct inode *ip;
252	int isrmdir;
253{
254
255	panic("softdep_setup_remove called");
256}
257
258void
259softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
260	struct buf *bp;
261	struct inode *dp;
262	struct inode *ip;
263	ino_t newinum;
264	int isrmdir;
265{
266
267	panic("softdep_setup_directory_change called");
268}
269
270void
271softdep_change_linkcnt(ip)
272	struct inode *ip;
273{
274
275	panic("softdep_change_linkcnt called");
276}
277
278void
279softdep_load_inodeblock(ip)
280	struct inode *ip;
281{
282
283	panic("softdep_load_inodeblock called");
284}
285
286void
287softdep_update_inodeblock(ip, bp, waitfor)
288	struct inode *ip;
289	struct buf *bp;
290	int waitfor;
291{
292
293	panic("softdep_update_inodeblock called");
294}
295
296int
297softdep_fsync(vp)
298	struct vnode *vp;	/* the "in_core" copy of the inode */
299{
300
301	return (0);
302}
303
304void
305softdep_fsync_mountdev(vp)
306	struct vnode *vp;
307{
308
309	return;
310}
311
312int
313softdep_flushworklist(oldmnt, countp, td)
314	struct mount *oldmnt;
315	int *countp;
316	struct thread *td;
317{
318
319	*countp = 0;
320	return (0);
321}
322
323int
324softdep_sync_metadata(struct vnode *vp)
325{
326
327	return (0);
328}
329
330int
331softdep_slowdown(vp)
332	struct vnode *vp;
333{
334
335	panic("softdep_slowdown called");
336}
337
338void
339softdep_releasefile(ip)
340	struct inode *ip;	/* inode with the zero effective link count */
341{
342
343	panic("softdep_releasefile called");
344}
345
346int
347softdep_request_cleanup(fs, vp)
348	struct fs *fs;
349	struct vnode *vp;
350{
351
352	return (0);
353}
354
355int
356softdep_check_suspend(struct mount *mp,
357		      struct vnode *devvp,
358		      int softdep_deps,
359		      int softdep_accdeps,
360		      int secondary_writes,
361		      int secondary_accwrites)
362{
363	struct bufobj *bo;
364	int error;
365
366	(void) softdep_deps,
367	(void) softdep_accdeps;
368
369	bo = &devvp->v_bufobj;
370	ASSERT_BO_LOCKED(bo);
371
372	MNT_ILOCK(mp);
373	while (mp->mnt_secondary_writes != 0) {
374		BO_UNLOCK(bo);
375		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
376		    (PUSER - 1) | PDROP, "secwr", 0);
377		BO_LOCK(bo);
378		MNT_ILOCK(mp);
379	}
380
381	/*
382	 * Reasons for needing more work before suspend:
383	 * - Dirty buffers on devvp.
384	 * - Secondary writes occurred after start of vnode sync loop
385	 */
386	error = 0;
387	if (bo->bo_numoutput > 0 ||
388	    bo->bo_dirty.bv_cnt > 0 ||
389	    secondary_writes != 0 ||
390	    mp->mnt_secondary_writes != 0 ||
391	    secondary_accwrites != mp->mnt_secondary_accwrites)
392		error = EAGAIN;
393	BO_UNLOCK(bo);
394	return (error);
395}
396
397void
398softdep_get_depcounts(struct mount *mp,
399		      int *softdepactivep,
400		      int *softdepactiveaccp)
401{
402	(void) mp;
403	*softdepactivep = 0;
404	*softdepactiveaccp = 0;
405}
406
407#else
408/*
409 * These definitions need to be adapted to the system to which
410 * this file is being ported.
411 */
412
413#define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)
414
415#define	D_PAGEDEP	0
416#define	D_INODEDEP	1
417#define	D_BMSAFEMAP	2
418#define	D_NEWBLK	3
419#define	D_ALLOCDIRECT	4
420#define	D_INDIRDEP	5
421#define	D_ALLOCINDIR	6
422#define	D_FREEFRAG	7
423#define	D_FREEBLKS	8
424#define	D_FREEFILE	9
425#define	D_DIRADD	10
426#define	D_MKDIR		11
427#define	D_DIRREM	12
428#define	D_NEWDIRBLK	13
429#define	D_FREEWORK	14
430#define	D_FREEDEP	15
431#define	D_JADDREF	16
432#define	D_JREMREF	17
433#define	D_JMVREF	18
434#define	D_JNEWBLK	19
435#define	D_JFREEBLK	20
436#define	D_JFREEFRAG	21
437#define	D_JSEG		22
438#define	D_JSEGDEP	23
439#define	D_SBDEP		24
440#define	D_JTRUNC	25
441#define	D_LAST		D_JTRUNC
442
443unsigned long dep_current[D_LAST + 1];
444unsigned long dep_total[D_LAST + 1];
445
446
447SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
448SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
449    "total dependencies allocated");
450SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
451    "current dependencies allocated");
452
453#define	SOFTDEP_TYPE(type, str, long)					\
454    static MALLOC_DEFINE(M_ ## type, #str, long);			\
455    SYSCTL_LONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
456	&dep_total[D_ ## type], 0, "");					\
457    SYSCTL_LONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
458	&dep_current[D_ ## type], 0, "");
459
460SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
461SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
462SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
463    "Block or frag allocated from cyl group map");
464SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
465SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
466SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
467SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
468SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
469SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
470SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
471SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
472SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
473SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
474SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
475SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
476SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
477SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
478SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
479SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
480SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
481SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
482SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
483SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
484SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
485SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
486SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
487
488static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
489static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
490
491/*
492 * translate from workitem type to memory type
493 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
494 */
495static struct malloc_type *memtype[] = {
496	M_PAGEDEP,
497	M_INODEDEP,
498	M_BMSAFEMAP,
499	M_NEWBLK,
500	M_ALLOCDIRECT,
501	M_INDIRDEP,
502	M_ALLOCINDIR,
503	M_FREEFRAG,
504	M_FREEBLKS,
505	M_FREEFILE,
506	M_DIRADD,
507	M_MKDIR,
508	M_DIRREM,
509	M_NEWDIRBLK,
510	M_FREEWORK,
511	M_FREEDEP,
512	M_JADDREF,
513	M_JREMREF,
514	M_JMVREF,
515	M_JNEWBLK,
516	M_JFREEBLK,
517	M_JFREEFRAG,
518	M_JSEG,
519	M_JSEGDEP,
520	M_SBDEP,
521	M_JTRUNC
522};
523
524#define DtoM(type) (memtype[type])
525
526/*
527 * Names of malloc types.
528 */
529#define TYPENAME(type)  \
530	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
531/*
532 * End system adaptation definitions.
533 */
534
535#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
536#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
537
538/*
539 * Forward declarations.
540 */
541struct inodedep_hashhead;
542struct newblk_hashhead;
543struct pagedep_hashhead;
544struct bmsafemap_hashhead;
545
546/*
547 * Internal function prototypes.
548 */
549static	void softdep_error(char *, int);
550static	void drain_output(struct vnode *);
551static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
552static	void clear_remove(struct thread *);
553static	void clear_inodedeps(struct thread *);
554static	void unlinked_inodedep(struct mount *, struct inodedep *);
555static	void clear_unlinked_inodedep(struct inodedep *);
556static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
557static	int flush_pagedep_deps(struct vnode *, struct mount *,
558	    struct diraddhd *);
559static	void free_pagedep(struct pagedep *);
560static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
561static	int flush_inodedep_deps(struct mount *, ino_t);
562static	int flush_deplist(struct allocdirectlst *, int, int *);
563static	int handle_written_filepage(struct pagedep *, struct buf *);
564static	int handle_written_sbdep(struct sbdep *, struct buf *);
565static	void initiate_write_sbdep(struct sbdep *);
566static  void diradd_inode_written(struct diradd *, struct inodedep *);
567static	int handle_written_indirdep(struct indirdep *, struct buf *,
568	    struct buf**);
569static	int handle_written_inodeblock(struct inodedep *, struct buf *);
570static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
571static	void handle_written_jaddref(struct jaddref *);
572static	void handle_written_jremref(struct jremref *);
573static	void handle_written_jseg(struct jseg *, struct buf *);
574static	void handle_written_jnewblk(struct jnewblk *);
575static	void handle_written_jfreeblk(struct jfreeblk *);
576static	void handle_written_jfreefrag(struct jfreefrag *);
577static	void complete_jseg(struct jseg *);
578static	void jseg_write(struct fs *, struct jblocks *, struct jseg *,
579	    uint8_t *);
580static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
581static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
582static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
583static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
584static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
585static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
586static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
587static	inline void inoref_write(struct inoref *, struct jseg *,
588	    struct jrefrec *);
589static	void handle_allocdirect_partdone(struct allocdirect *,
590	    struct workhead *);
591static	void cancel_newblk(struct newblk *, struct workhead *);
592static	void indirdep_complete(struct indirdep *);
593static	void handle_allocindir_partdone(struct allocindir *);
594static	void initiate_write_filepage(struct pagedep *, struct buf *);
595static	void initiate_write_indirdep(struct indirdep*, struct buf *);
596static	void handle_written_mkdir(struct mkdir *, int);
597static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
598static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
599static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
600static	void handle_workitem_freefile(struct freefile *);
601static	void handle_workitem_remove(struct dirrem *, struct vnode *);
602static	struct dirrem *newdirrem(struct buf *, struct inode *,
603	    struct inode *, int, struct dirrem **);
604static	void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
605	    struct freeblks *);
606static	void free_indirdep(struct indirdep *);
607static	void free_diradd(struct diradd *, struct workhead *);
608static	void merge_diradd(struct inodedep *, struct diradd *);
609static	void complete_diradd(struct diradd *);
610static	struct diradd *diradd_lookup(struct pagedep *, int);
611static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
612	    struct jremref *);
613static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
614	    struct jremref *);
615static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
616	    struct jremref *, struct jremref *);
617static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
618	    struct jremref *);
619static	void cancel_allocindir(struct allocindir *, struct inodedep *,
620	    struct freeblks *);
621static	void complete_mkdir(struct mkdir *);
622static	void free_newdirblk(struct newdirblk *);
623static	void free_jremref(struct jremref *);
624static	void free_jaddref(struct jaddref *);
625static	void free_jsegdep(struct jsegdep *);
626static	void free_jseg(struct jseg *);
627static	void free_jnewblk(struct jnewblk *);
628static	void free_jfreeblk(struct jfreeblk *);
629static	void free_jfreefrag(struct jfreefrag *);
630static	void free_freedep(struct freedep *);
631static	void journal_jremref(struct dirrem *, struct jremref *,
632	    struct inodedep *);
633static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
634static	int cancel_jaddref(struct jaddref *, struct inodedep *,
635	    struct workhead *);
636static	void cancel_jfreefrag(struct jfreefrag *);
637static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
638static	int deallocate_dependencies(struct buf *, struct inodedep *,
639	    struct freeblks *);
640static	void free_newblk(struct newblk *);
641static	void cancel_allocdirect(struct allocdirectlst *,
642	    struct allocdirect *, struct freeblks *, int);
643static	int check_inode_unwritten(struct inodedep *);
644static	int free_inodedep(struct inodedep *);
645static	void freework_freeblock(struct freework *);
646static	void handle_workitem_freeblocks(struct freeblks *, int);
647static	void handle_complete_freeblocks(struct freeblks *);
648static	void handle_workitem_indirblk(struct freework *);
649static	void handle_written_freework(struct freework *);
650static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
651static	void setup_allocindir_phase2(struct buf *, struct inode *,
652	    struct inodedep *, struct allocindir *, ufs_lbn_t);
653static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
654	    ufs2_daddr_t, ufs_lbn_t);
655static	void handle_workitem_freefrag(struct freefrag *);
656static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
657	    ufs_lbn_t);
658static	void allocdirect_merge(struct allocdirectlst *,
659	    struct allocdirect *, struct allocdirect *);
660static	struct freefrag *allocindir_merge(struct allocindir *,
661	    struct allocindir *);
662static	int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int,
663	    struct bmsafemap **);
664static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
665	    int cg);
666static	int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t,
667	    int, struct newblk **);
668static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
669static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
670	    struct inodedep **);
671static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
672static	int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
673	    struct pagedep **);
674static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
675	    struct mount *mp, int, struct pagedep **);
676static	void pause_timer(void *);
677static	int request_cleanup(struct mount *, int);
678static	int process_worklist_item(struct mount *, int);
679static	void process_removes(struct vnode *);
680static	void jwork_move(struct workhead *, struct workhead *);
681static	void add_to_worklist(struct worklist *, int);
682static	void remove_from_worklist(struct worklist *);
683static	void softdep_flush(void);
684static	int softdep_speedup(void);
685static	void worklist_speedup(void);
686static	int journal_mount(struct mount *, struct fs *, struct ucred *);
687static	void journal_unmount(struct mount *);
688static	int journal_space(struct ufsmount *, int);
689static	void journal_suspend(struct ufsmount *);
690static	void softdep_prelink(struct vnode *, struct vnode *);
691static	void add_to_journal(struct worklist *);
692static	void remove_from_journal(struct worklist *);
693static	void softdep_process_journal(struct mount *, int);
694static	struct jremref *newjremref(struct dirrem *, struct inode *,
695	    struct inode *ip, off_t, nlink_t);
696static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
697	    uint16_t);
698static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
699	    uint16_t);
700static inline struct jsegdep *inoref_jseg(struct inoref *);
701static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
702static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
703	    ufs2_daddr_t, int);
704static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
705	    ufs2_daddr_t, long, ufs_lbn_t);
706static	struct freework *newfreework(struct freeblks *, struct freework *,
707	    ufs_lbn_t, ufs2_daddr_t, int, int);
708static	void jwait(struct worklist *wk);
709static	struct inodedep *inodedep_lookup_ip(struct inode *);
710static	int bmsafemap_rollbacks(struct bmsafemap *);
711static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
712static	void handle_jwork(struct workhead *);
713static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
714	    struct mkdir **);
715static	struct jblocks *jblocks_create(void);
716static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
717static	void jblocks_free(struct jblocks *, struct mount *, int);
718static	void jblocks_destroy(struct jblocks *);
719static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
720
721/*
722 * Exported softdep operations.
723 */
724static	void softdep_disk_io_initiation(struct buf *);
725static	void softdep_disk_write_complete(struct buf *);
726static	void softdep_deallocate_dependencies(struct buf *);
727static	int softdep_count_dependencies(struct buf *bp, int);
728
729static struct mtx lk;
730MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
731
732#define TRY_ACQUIRE_LOCK(lk)		mtx_trylock(lk)
733#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
734#define FREE_LOCK(lk)			mtx_unlock(lk)
735
736#define	BUF_AREC(bp)	((bp)->b_lock.lock_object.lo_flags |= LO_RECURSABLE)
737#define	BUF_NOREC(bp)	((bp)->b_lock.lock_object.lo_flags &= ~LO_RECURSABLE)
738
739/*
740 * Worklist queue management.
741 * These routines require that the lock be held.
742 */
743#ifndef /* NOT */ DEBUG
744#define WORKLIST_INSERT(head, item) do {	\
745	(item)->wk_state |= ONWORKLIST;		\
746	LIST_INSERT_HEAD(head, item, wk_list);	\
747} while (0)
748#define WORKLIST_REMOVE(item) do {		\
749	(item)->wk_state &= ~ONWORKLIST;	\
750	LIST_REMOVE(item, wk_list);		\
751} while (0)
752#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
753#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
754
755#else /* DEBUG */
756static	void worklist_insert(struct workhead *, struct worklist *, int);
757static	void worklist_remove(struct worklist *, int);
758
759#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
760#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
761#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
762#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
763
764static void
765worklist_insert(head, item, locked)
766	struct workhead *head;
767	struct worklist *item;
768	int locked;
769{
770
771	if (locked)
772		mtx_assert(&lk, MA_OWNED);
773	if (item->wk_state & ONWORKLIST)
774		panic("worklist_insert: %p %s(0x%X) already on list",
775		    item, TYPENAME(item->wk_type), item->wk_state);
776	item->wk_state |= ONWORKLIST;
777	LIST_INSERT_HEAD(head, item, wk_list);
778}
779
780static void
781worklist_remove(item, locked)
782	struct worklist *item;
783	int locked;
784{
785
786	if (locked)
787		mtx_assert(&lk, MA_OWNED);
788	if ((item->wk_state & ONWORKLIST) == 0)
789		panic("worklist_remove: %p %s(0x%X) not on list",
790		    item, TYPENAME(item->wk_type), item->wk_state);
791	item->wk_state &= ~ONWORKLIST;
792	LIST_REMOVE(item, wk_list);
793}
794#endif /* DEBUG */
795
796/*
797 * Merge two jsegdeps keeping only the oldest one as newer references
798 * can't be discarded until after older references.
799 */
800static inline struct jsegdep *
801jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
802{
803	struct jsegdep *swp;
804
805	if (two == NULL)
806		return (one);
807
808	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
809		swp = one;
810		one = two;
811		two = swp;
812	}
813	WORKLIST_REMOVE(&two->jd_list);
814	free_jsegdep(two);
815
816	return (one);
817}
818
819/*
820 * If two freedeps are compatible free one to reduce list size.
821 */
822static inline struct freedep *
823freedep_merge(struct freedep *one, struct freedep *two)
824{
825	if (two == NULL)
826		return (one);
827
828	if (one->fd_freework == two->fd_freework) {
829		WORKLIST_REMOVE(&two->fd_list);
830		free_freedep(two);
831	}
832	return (one);
833}
834
835/*
836 * Move journal work from one list to another.  Duplicate freedeps and
837 * jsegdeps are coalesced to keep the lists as small as possible.
838 */
839static void
840jwork_move(dst, src)
841	struct workhead *dst;
842	struct workhead *src;
843{
844	struct freedep *freedep;
845	struct jsegdep *jsegdep;
846	struct worklist *wkn;
847	struct worklist *wk;
848
849	KASSERT(dst != src,
850	    ("jwork_move: dst == src"));
851	freedep = NULL;
852	jsegdep = NULL;
853	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
854		if (wk->wk_type == D_JSEGDEP)
855			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
856		if (wk->wk_type == D_FREEDEP)
857			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
858	}
859
860	mtx_assert(&lk, MA_OWNED);
861	while ((wk = LIST_FIRST(src)) != NULL) {
862		WORKLIST_REMOVE(wk);
863		WORKLIST_INSERT(dst, wk);
864		if (wk->wk_type == D_JSEGDEP) {
865			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
866			continue;
867		}
868		if (wk->wk_type == D_FREEDEP)
869			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
870	}
871}
872
873/*
874 * Routines for tracking and managing workitems.
875 */
876static	void workitem_free(struct worklist *, int);
877static	void workitem_alloc(struct worklist *, int, struct mount *);
878
879#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
880
881static void
882workitem_free(item, type)
883	struct worklist *item;
884	int type;
885{
886	struct ufsmount *ump;
887	mtx_assert(&lk, MA_OWNED);
888
889#ifdef DEBUG
890	if (item->wk_state & ONWORKLIST)
891		panic("workitem_free: %s(0x%X) still on list",
892		    TYPENAME(item->wk_type), item->wk_state);
893	if (item->wk_type != type)
894		panic("workitem_free: type mismatch %s != %s",
895		    TYPENAME(item->wk_type), TYPENAME(type));
896#endif
897	ump = VFSTOUFS(item->wk_mp);
898	if (--ump->softdep_deps == 0 && ump->softdep_req)
899		wakeup(&ump->softdep_deps);
900	dep_current[type]--;
901	free(item, DtoM(type));
902}
903
904static void
905workitem_alloc(item, type, mp)
906	struct worklist *item;
907	int type;
908	struct mount *mp;
909{
910	item->wk_type = type;
911	item->wk_mp = mp;
912	item->wk_state = 0;
913	ACQUIRE_LOCK(&lk);
914	dep_current[type]++;
915	dep_total[type]++;
916	VFSTOUFS(mp)->softdep_deps++;
917	VFSTOUFS(mp)->softdep_accdeps++;
918	FREE_LOCK(&lk);
919}
920
921/*
922 * Workitem queue management
923 */
924static int max_softdeps;	/* maximum number of structs before slowdown */
925static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
926static int tickdelay = 2;	/* number of ticks to pause during slowdown */
927static int proc_waiting;	/* tracks whether we have a timeout posted */
928static int *stat_countp;	/* statistic to count in proc_waiting timeout */
929static struct callout softdep_callout;
930static int req_pending;
931static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
932#define FLUSH_INODES		1
933static int req_clear_remove;	/* syncer process flush some freeblks */
934#define FLUSH_REMOVE		2
935#define FLUSH_REMOVE_WAIT	3
936static long num_freeblkdep;	/* number of freeblks workitems allocated */
937
938/*
939 * runtime statistics
940 */
941static int stat_worklist_push;	/* number of worklist cleanups */
942static int stat_blk_limit_push;	/* number of times block limit neared */
943static int stat_ino_limit_push;	/* number of times inode limit neared */
944static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
945static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
946static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
947static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
948static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
949static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
950static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
951static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
952static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
953static int stat_journal_min;	/* Times hit journal min threshold */
954static int stat_journal_low;	/* Times hit journal low threshold */
955static int stat_journal_wait;	/* Times blocked in jwait(). */
956static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
957static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
958static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
959static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
960
961SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
962    &max_softdeps, 0, "");
963SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
964    &tickdelay, 0, "");
965SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
966    &maxindirdeps, 0, "");
967SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
968    &stat_worklist_push, 0,"");
969SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
970    &stat_blk_limit_push, 0,"");
971SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
972    &stat_ino_limit_push, 0,"");
973SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
974    &stat_blk_limit_hit, 0, "");
975SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
976    &stat_ino_limit_hit, 0, "");
977SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
978    &stat_sync_limit_hit, 0, "");
979SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
980    &stat_indir_blk_ptrs, 0, "");
981SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
982    &stat_inode_bitmap, 0, "");
983SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
984    &stat_direct_blk_ptrs, 0, "");
985SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
986    &stat_dir_entry, 0, "");
987SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
988    &stat_jaddref, 0, "");
989SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
990    &stat_jnewblk, 0, "");
991SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
992    &stat_journal_low, 0, "");
993SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
994    &stat_journal_min, 0, "");
995SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
996    &stat_journal_wait, 0, "");
997SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
998    &stat_jwait_filepage, 0, "");
999SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1000    &stat_jwait_freeblks, 0, "");
1001SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1002    &stat_jwait_inode, 0, "");
1003SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1004    &stat_jwait_newblk, 0, "");
1005
1006SYSCTL_DECL(_vfs_ffs);
1007
1008LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl;
1009static u_long	bmsafemap_hash;	/* size of hash table - 1 */
1010
1011static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
1012SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1013	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1014
1015static struct proc *softdepproc;
1016static struct kproc_desc softdep_kp = {
1017	"softdepflush",
1018	softdep_flush,
1019	&softdepproc
1020};
1021SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1022    &softdep_kp);
1023
1024static void
1025softdep_flush(void)
1026{
1027	struct mount *nmp;
1028	struct mount *mp;
1029	struct ufsmount *ump;
1030	struct thread *td;
1031	int remaining;
1032	int vfslocked;
1033
1034	td = curthread;
1035	td->td_pflags |= TDP_NORUNNINGBUF;
1036
1037	for (;;) {
1038		kproc_suspend_check(softdepproc);
1039		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
1040		ACQUIRE_LOCK(&lk);
1041		/*
1042		 * If requested, try removing inode or removal dependencies.
1043		 */
1044		if (req_clear_inodedeps) {
1045			clear_inodedeps(td);
1046			req_clear_inodedeps -= 1;
1047			wakeup_one(&proc_waiting);
1048		}
1049		if (req_clear_remove) {
1050			clear_remove(td);
1051			req_clear_remove -= 1;
1052			wakeup_one(&proc_waiting);
1053		}
1054		FREE_LOCK(&lk);
1055		VFS_UNLOCK_GIANT(vfslocked);
1056		remaining = 0;
1057		mtx_lock(&mountlist_mtx);
1058		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1059			nmp = TAILQ_NEXT(mp, mnt_list);
1060			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
1061				continue;
1062			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1063				continue;
1064			vfslocked = VFS_LOCK_GIANT(mp);
1065			softdep_process_worklist(mp, 0);
1066			ump = VFSTOUFS(mp);
1067			remaining += ump->softdep_on_worklist -
1068				ump->softdep_on_worklist_inprogress;
1069			VFS_UNLOCK_GIANT(vfslocked);
1070			mtx_lock(&mountlist_mtx);
1071			nmp = TAILQ_NEXT(mp, mnt_list);
1072			vfs_unbusy(mp);
1073		}
1074		mtx_unlock(&mountlist_mtx);
1075		if (remaining)
1076			continue;
1077		ACQUIRE_LOCK(&lk);
1078		if (!req_pending)
1079			msleep(&req_pending, &lk, PVM, "sdflush", hz);
1080		req_pending = 0;
1081		FREE_LOCK(&lk);
1082	}
1083}
1084
1085static void
1086worklist_speedup(void)
1087{
1088	mtx_assert(&lk, MA_OWNED);
1089	if (req_pending == 0) {
1090		req_pending = 1;
1091		wakeup(&req_pending);
1092	}
1093}
1094
1095static int
1096softdep_speedup(void)
1097{
1098
1099	worklist_speedup();
1100	bd_speedup();
1101	return speedup_syncer();
1102}
1103
1104/*
1105 * Add an item to the end of the work queue.
1106 * This routine requires that the lock be held.
1107 * This is the only routine that adds items to the list.
1108 * The following routine is the only one that removes items
1109 * and does so in order from first to last.
1110 */
1111static void
1112add_to_worklist(wk, nodelay)
1113	struct worklist *wk;
1114	int nodelay;
1115{
1116	struct ufsmount *ump;
1117
1118	mtx_assert(&lk, MA_OWNED);
1119	ump = VFSTOUFS(wk->wk_mp);
1120	if (wk->wk_state & ONWORKLIST)
1121		panic("add_to_worklist: %s(0x%X) already on list",
1122		    TYPENAME(wk->wk_type), wk->wk_state);
1123	wk->wk_state |= ONWORKLIST;
1124	if (LIST_EMPTY(&ump->softdep_workitem_pending))
1125		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1126	else
1127		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1128	ump->softdep_worklist_tail = wk;
1129	ump->softdep_on_worklist += 1;
1130	if (nodelay)
1131		worklist_speedup();
1132}
1133
1134/*
1135 * Remove the item to be processed. If we are removing the last
1136 * item on the list, we need to recalculate the tail pointer.
1137 */
1138static void
1139remove_from_worklist(wk)
1140	struct worklist *wk;
1141{
1142	struct ufsmount *ump;
1143	struct worklist *wkend;
1144
1145	ump = VFSTOUFS(wk->wk_mp);
1146	WORKLIST_REMOVE(wk);
1147	if (wk == ump->softdep_worklist_tail) {
1148		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
1149			if (LIST_NEXT(wkend, wk_list) == NULL)
1150				break;
1151		ump->softdep_worklist_tail = wkend;
1152	}
1153	ump->softdep_on_worklist -= 1;
1154}
1155
1156/*
1157 * Process that runs once per second to handle items in the background queue.
1158 *
1159 * Note that we ensure that everything is done in the order in which they
1160 * appear in the queue. The code below depends on this property to ensure
1161 * that blocks of a file are freed before the inode itself is freed. This
1162 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1163 * until all the old ones have been purged from the dependency lists.
1164 */
1165int
1166softdep_process_worklist(mp, full)
1167	struct mount *mp;
1168	int full;
1169{
1170	struct thread *td = curthread;
1171	int cnt, matchcnt, loopcount;
1172	struct ufsmount *ump;
1173	long starttime;
1174
1175	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1176	/*
1177	 * Record the process identifier of our caller so that we can give
1178	 * this process preferential treatment in request_cleanup below.
1179	 */
1180	matchcnt = 0;
1181	ump = VFSTOUFS(mp);
1182	ACQUIRE_LOCK(&lk);
1183	loopcount = 1;
1184	starttime = time_second;
1185	softdep_process_journal(mp, full?MNT_WAIT:0);
1186	while (ump->softdep_on_worklist > 0) {
1187		if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
1188			break;
1189		else
1190			matchcnt += cnt;
1191		/*
1192		 * If requested, try removing inode or removal dependencies.
1193		 */
1194		if (req_clear_inodedeps) {
1195			clear_inodedeps(td);
1196			req_clear_inodedeps -= 1;
1197			wakeup_one(&proc_waiting);
1198		}
1199		if (req_clear_remove) {
1200			clear_remove(td);
1201			req_clear_remove -= 1;
1202			wakeup_one(&proc_waiting);
1203		}
1204		/*
1205		 * We do not generally want to stop for buffer space, but if
1206		 * we are really being a buffer hog, we will stop and wait.
1207		 */
1208		if (loopcount++ % 128 == 0) {
1209			FREE_LOCK(&lk);
1210			uio_yield();
1211			bwillwrite();
1212			ACQUIRE_LOCK(&lk);
1213		}
1214		/*
1215		 * Never allow processing to run for more than one
1216		 * second. Otherwise the other mountpoints may get
1217		 * excessively backlogged.
1218		 */
1219		if (!full && starttime != time_second)
1220			break;
1221	}
1222	FREE_LOCK(&lk);
1223	return (matchcnt);
1224}
1225
1226/*
1227 * Process all removes associated with a vnode if we are running out of
1228 * journal space.  Any other process which attempts to flush these will
1229 * be unable as we have the vnodes locked.
1230 */
1231static void
1232process_removes(vp)
1233	struct vnode *vp;
1234{
1235	struct inodedep *inodedep;
1236	struct dirrem *dirrem;
1237	struct mount *mp;
1238	ino_t inum;
1239
1240	mtx_assert(&lk, MA_OWNED);
1241
1242	mp = vp->v_mount;
1243	inum = VTOI(vp)->i_number;
1244	for (;;) {
1245		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1246			return;
1247		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext)
1248			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1249			    (COMPLETE | ONWORKLIST))
1250				break;
1251		if (dirrem == NULL)
1252			return;
1253		/*
1254		 * If another thread is trying to lock this vnode it will
1255		 * fail but we must wait for it to do so before we can
1256		 * proceed.
1257		 */
1258		if (dirrem->dm_state & INPROGRESS) {
1259			dirrem->dm_state |= IOWAITING;
1260			msleep(&dirrem->dm_list, &lk, PVM, "pwrwait", 0);
1261			continue;
1262		}
1263		remove_from_worklist(&dirrem->dm_list);
1264		FREE_LOCK(&lk);
1265		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1266			panic("process_removes: suspended filesystem");
1267		handle_workitem_remove(dirrem, vp);
1268		vn_finished_secondary_write(mp);
1269		ACQUIRE_LOCK(&lk);
1270	}
1271}
1272
1273/*
1274 * Process one item on the worklist.
1275 */
1276static int
1277process_worklist_item(mp, flags)
1278	struct mount *mp;
1279	int flags;
1280{
1281	struct worklist *wk, *wkXXX;
1282	struct ufsmount *ump;
1283	struct vnode *vp;
1284	int matchcnt = 0;
1285
1286	mtx_assert(&lk, MA_OWNED);
1287	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1288	/*
1289	 * If we are being called because of a process doing a
1290	 * copy-on-write, then it is not safe to write as we may
1291	 * recurse into the copy-on-write routine.
1292	 */
1293	if (curthread->td_pflags & TDP_COWINPROGRESS)
1294		return (-1);
1295	/*
1296	 * Normally we just process each item on the worklist in order.
1297	 * However, if we are in a situation where we cannot lock any
1298	 * inodes, we have to skip over any dirrem requests whose
1299	 * vnodes are resident and locked.
1300	 */
1301	vp = NULL;
1302	ump = VFSTOUFS(mp);
1303	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
1304		if (wk->wk_state & INPROGRESS) {
1305			wkXXX = wk;
1306			continue;
1307		}
1308		wkXXX = wk;	/* Record the last valid wk pointer. */
1309		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
1310			break;
1311		wk->wk_state |= INPROGRESS;
1312		ump->softdep_on_worklist_inprogress++;
1313		FREE_LOCK(&lk);
1314		ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
1315		    LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
1316		ACQUIRE_LOCK(&lk);
1317		if (wk->wk_state & IOWAITING) {
1318			wk->wk_state &= ~IOWAITING;
1319			wakeup(wk);
1320		}
1321		wk->wk_state &= ~INPROGRESS;
1322		ump->softdep_on_worklist_inprogress--;
1323		if (vp != NULL)
1324			break;
1325	}
1326	if (wk == 0)
1327		return (-1);
1328	remove_from_worklist(wk);
1329	FREE_LOCK(&lk);
1330	if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1331		panic("process_worklist_item: suspended filesystem");
1332	matchcnt++;
1333	switch (wk->wk_type) {
1334
1335	case D_DIRREM:
1336		/* removal of a directory entry */
1337		handle_workitem_remove(WK_DIRREM(wk), vp);
1338		if (vp)
1339			vput(vp);
1340		break;
1341
1342	case D_FREEBLKS:
1343		/* releasing blocks and/or fragments from a file */
1344		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
1345		break;
1346
1347	case D_FREEFRAG:
1348		/* releasing a fragment when replaced as a file grows */
1349		handle_workitem_freefrag(WK_FREEFRAG(wk));
1350		break;
1351
1352	case D_FREEFILE:
1353		/* releasing an inode when its link count drops to 0 */
1354		handle_workitem_freefile(WK_FREEFILE(wk));
1355		break;
1356
1357	case D_FREEWORK:
1358		/* Final block in an indirect was freed. */
1359		handle_workitem_indirblk(WK_FREEWORK(wk));
1360		break;
1361
1362	default:
1363		panic("%s_process_worklist: Unknown type %s",
1364		    "softdep", TYPENAME(wk->wk_type));
1365		/* NOTREACHED */
1366	}
1367	vn_finished_secondary_write(mp);
1368	ACQUIRE_LOCK(&lk);
1369	return (matchcnt);
1370}
1371
1372/*
1373 * Move dependencies from one buffer to another.
1374 */
1375int
1376softdep_move_dependencies(oldbp, newbp)
1377	struct buf *oldbp;
1378	struct buf *newbp;
1379{
1380	struct worklist *wk, *wktail;
1381	int dirty;
1382
1383	dirty = 0;
1384	wktail = NULL;
1385	ACQUIRE_LOCK(&lk);
1386	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1387		LIST_REMOVE(wk, wk_list);
1388		if (wk->wk_type == D_BMSAFEMAP &&
1389		    bmsafemap_rollbacks(WK_BMSAFEMAP(wk)))
1390			dirty = 1;
1391		if (wktail == 0)
1392			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1393		else
1394			LIST_INSERT_AFTER(wktail, wk, wk_list);
1395		wktail = wk;
1396	}
1397	FREE_LOCK(&lk);
1398
1399	return (dirty);
1400}
1401
1402/*
1403 * Purge the work list of all items associated with a particular mount point.
1404 */
1405int
1406softdep_flushworklist(oldmnt, countp, td)
1407	struct mount *oldmnt;
1408	int *countp;
1409	struct thread *td;
1410{
1411	struct vnode *devvp;
1412	int count, error = 0;
1413	struct ufsmount *ump;
1414
1415	/*
1416	 * Alternately flush the block device associated with the mount
1417	 * point and process any dependencies that the flushing
1418	 * creates. We continue until no more worklist dependencies
1419	 * are found.
1420	 */
1421	*countp = 0;
1422	ump = VFSTOUFS(oldmnt);
1423	devvp = ump->um_devvp;
1424	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1425		*countp += count;
1426		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1427		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1428		VOP_UNLOCK(devvp, 0);
1429		if (error)
1430			break;
1431	}
1432	return (error);
1433}
1434
1435int
1436softdep_waitidle(struct mount *mp)
1437{
1438	struct ufsmount *ump;
1439	int error;
1440	int i;
1441
1442	ump = VFSTOUFS(mp);
1443	ACQUIRE_LOCK(&lk);
1444	for (i = 0; i < 10 && ump->softdep_deps; i++) {
1445		ump->softdep_req = 1;
1446		if (ump->softdep_on_worklist)
1447			panic("softdep_waitidle: work added after flush.");
1448		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1449	}
1450	ump->softdep_req = 0;
1451	FREE_LOCK(&lk);
1452	error = 0;
1453	if (i == 10) {
1454		error = EBUSY;
1455		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1456		    mp);
1457	}
1458
1459	return (error);
1460}
1461
1462/*
1463 * Flush all vnodes and worklist items associated with a specified mount point.
1464 */
1465int
1466softdep_flushfiles(oldmnt, flags, td)
1467	struct mount *oldmnt;
1468	int flags;
1469	struct thread *td;
1470{
1471	int error, depcount, loopcnt, retry_flush_count, retry;
1472
1473	loopcnt = 10;
1474	retry_flush_count = 3;
1475retry_flush:
1476	error = 0;
1477
1478	/*
1479	 * Alternately flush the vnodes associated with the mount
1480	 * point and process any dependencies that the flushing
1481	 * creates. In theory, this loop can happen at most twice,
1482	 * but we give it a few extra just to be sure.
1483	 */
1484	for (; loopcnt > 0; loopcnt--) {
1485		/*
1486		 * Do another flush in case any vnodes were brought in
1487		 * as part of the cleanup operations.
1488		 */
1489		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1490			break;
1491		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1492		    depcount == 0)
1493			break;
1494	}
1495	/*
1496	 * If we are unmounting then it is an error to fail. If we
1497	 * are simply trying to downgrade to read-only, then filesystem
1498	 * activity can keep us busy forever, so we just fail with EBUSY.
1499	 */
1500	if (loopcnt == 0) {
1501		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1502			panic("softdep_flushfiles: looping");
1503		error = EBUSY;
1504	}
1505	if (!error)
1506		error = softdep_waitidle(oldmnt);
1507	if (!error) {
1508		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1509			retry = 0;
1510			MNT_ILOCK(oldmnt);
1511			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1512			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1513			if (oldmnt->mnt_nvnodelistsize > 0) {
1514				if (--retry_flush_count > 0) {
1515					retry = 1;
1516					loopcnt = 3;
1517				} else
1518					error = EBUSY;
1519			}
1520			MNT_IUNLOCK(oldmnt);
1521			if (retry)
1522				goto retry_flush;
1523		}
1524	}
1525	return (error);
1526}
1527
1528/*
1529 * Structure hashing.
1530 *
1531 * There are three types of structures that can be looked up:
1532 *	1) pagedep structures identified by mount point, inode number,
1533 *	   and logical block.
1534 *	2) inodedep structures identified by mount point and inode number.
1535 *	3) newblk structures identified by mount point and
1536 *	   physical block number.
1537 *
1538 * The "pagedep" and "inodedep" dependency structures are hashed
1539 * separately from the file blocks and inodes to which they correspond.
1540 * This separation helps when the in-memory copy of an inode or
1541 * file block must be replaced. It also obviates the need to access
1542 * an inode or file page when simply updating (or de-allocating)
1543 * dependency structures. Lookup of newblk structures is needed to
1544 * find newly allocated blocks when trying to associate them with
1545 * their allocdirect or allocindir structure.
1546 *
1547 * The lookup routines optionally create and hash a new instance when
1548 * an existing entry is not found.
1549 */
1550#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
1551#define NODELAY		0x0002	/* cannot do background work */
1552
1553/*
1554 * Structures and routines associated with pagedep caching.
1555 */
1556LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1557u_long	pagedep_hash;		/* size of hash table - 1 */
1558#define	PAGEDEP_HASH(mp, inum, lbn) \
1559	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1560	    pagedep_hash])
1561
1562static int
1563pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1564	struct pagedep_hashhead *pagedephd;
1565	ino_t ino;
1566	ufs_lbn_t lbn;
1567	struct mount *mp;
1568	int flags;
1569	struct pagedep **pagedeppp;
1570{
1571	struct pagedep *pagedep;
1572
1573	LIST_FOREACH(pagedep, pagedephd, pd_hash)
1574		if (ino == pagedep->pd_ino &&
1575		    lbn == pagedep->pd_lbn &&
1576		    mp == pagedep->pd_list.wk_mp)
1577			break;
1578	if (pagedep) {
1579		*pagedeppp = pagedep;
1580		if ((flags & DEPALLOC) != 0 &&
1581		    (pagedep->pd_state & ONWORKLIST) == 0)
1582			return (0);
1583		return (1);
1584	}
1585	*pagedeppp = NULL;
1586	return (0);
1587}
1588/*
1589 * Look up a pagedep. Return 1 if found, 0 if not found or found
1590 * when asked to allocate but not associated with any buffer.
1591 * If not found, allocate if DEPALLOC flag is passed.
1592 * Found or allocated entry is returned in pagedeppp.
1593 * This routine must be called with splbio interrupts blocked.
1594 */
1595static int
1596pagedep_lookup(mp, ino, lbn, flags, pagedeppp)
1597	struct mount *mp;
1598	ino_t ino;
1599	ufs_lbn_t lbn;
1600	int flags;
1601	struct pagedep **pagedeppp;
1602{
1603	struct pagedep *pagedep;
1604	struct pagedep_hashhead *pagedephd;
1605	int ret;
1606	int i;
1607
1608	mtx_assert(&lk, MA_OWNED);
1609	pagedephd = PAGEDEP_HASH(mp, ino, lbn);
1610
1611	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1612	if (*pagedeppp || (flags & DEPALLOC) == 0)
1613		return (ret);
1614	FREE_LOCK(&lk);
1615	pagedep = malloc(sizeof(struct pagedep),
1616	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1617	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1618	ACQUIRE_LOCK(&lk);
1619	ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp);
1620	if (*pagedeppp) {
1621		WORKITEM_FREE(pagedep, D_PAGEDEP);
1622		return (ret);
1623	}
1624	pagedep->pd_ino = ino;
1625	pagedep->pd_lbn = lbn;
1626	LIST_INIT(&pagedep->pd_dirremhd);
1627	LIST_INIT(&pagedep->pd_pendinghd);
1628	for (i = 0; i < DAHASHSZ; i++)
1629		LIST_INIT(&pagedep->pd_diraddhd[i]);
1630	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1631	*pagedeppp = pagedep;
1632	return (0);
1633}
1634
1635/*
1636 * Structures and routines associated with inodedep caching.
1637 */
1638LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1639static u_long	inodedep_hash;	/* size of hash table - 1 */
1640static long	num_inodedep;	/* number of inodedep allocated */
1641#define	INODEDEP_HASH(fs, inum) \
1642      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1643
1644static int
1645inodedep_find(inodedephd, fs, inum, inodedeppp)
1646	struct inodedep_hashhead *inodedephd;
1647	struct fs *fs;
1648	ino_t inum;
1649	struct inodedep **inodedeppp;
1650{
1651	struct inodedep *inodedep;
1652
1653	LIST_FOREACH(inodedep, inodedephd, id_hash)
1654		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1655			break;
1656	if (inodedep) {
1657		*inodedeppp = inodedep;
1658		return (1);
1659	}
1660	*inodedeppp = NULL;
1661
1662	return (0);
1663}
1664/*
1665 * Look up an inodedep. Return 1 if found, 0 if not found.
1666 * If not found, allocate if DEPALLOC flag is passed.
1667 * Found or allocated entry is returned in inodedeppp.
1668 * This routine must be called with splbio interrupts blocked.
1669 */
1670static int
1671inodedep_lookup(mp, inum, flags, inodedeppp)
1672	struct mount *mp;
1673	ino_t inum;
1674	int flags;
1675	struct inodedep **inodedeppp;
1676{
1677	struct inodedep *inodedep;
1678	struct inodedep_hashhead *inodedephd;
1679	struct fs *fs;
1680
1681	mtx_assert(&lk, MA_OWNED);
1682	fs = VFSTOUFS(mp)->um_fs;
1683	inodedephd = INODEDEP_HASH(fs, inum);
1684
1685	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1686		return (1);
1687	if ((flags & DEPALLOC) == 0)
1688		return (0);
1689	/*
1690	 * If we are over our limit, try to improve the situation.
1691	 */
1692	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
1693		request_cleanup(mp, FLUSH_INODES);
1694	FREE_LOCK(&lk);
1695	inodedep = malloc(sizeof(struct inodedep),
1696		M_INODEDEP, M_SOFTDEP_FLAGS);
1697	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1698	ACQUIRE_LOCK(&lk);
1699	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1700		WORKITEM_FREE(inodedep, D_INODEDEP);
1701		return (1);
1702	}
1703	num_inodedep += 1;
1704	inodedep->id_fs = fs;
1705	inodedep->id_ino = inum;
1706	inodedep->id_state = ALLCOMPLETE;
1707	inodedep->id_nlinkdelta = 0;
1708	inodedep->id_savedino1 = NULL;
1709	inodedep->id_savedsize = -1;
1710	inodedep->id_savedextsize = -1;
1711	inodedep->id_savednlink = -1;
1712	inodedep->id_bmsafemap = NULL;
1713	inodedep->id_mkdiradd = NULL;
1714	LIST_INIT(&inodedep->id_dirremhd);
1715	LIST_INIT(&inodedep->id_pendinghd);
1716	LIST_INIT(&inodedep->id_inowait);
1717	LIST_INIT(&inodedep->id_bufwait);
1718	TAILQ_INIT(&inodedep->id_inoreflst);
1719	TAILQ_INIT(&inodedep->id_inoupdt);
1720	TAILQ_INIT(&inodedep->id_newinoupdt);
1721	TAILQ_INIT(&inodedep->id_extupdt);
1722	TAILQ_INIT(&inodedep->id_newextupdt);
1723	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1724	*inodedeppp = inodedep;
1725	return (0);
1726}
1727
1728/*
1729 * Structures and routines associated with newblk caching.
1730 */
1731LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1732u_long	newblk_hash;		/* size of hash table - 1 */
1733#define	NEWBLK_HASH(fs, inum) \
1734	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1735
1736static int
1737newblk_find(newblkhd, mp, newblkno, flags, newblkpp)
1738	struct newblk_hashhead *newblkhd;
1739	struct mount *mp;
1740	ufs2_daddr_t newblkno;
1741	int flags;
1742	struct newblk **newblkpp;
1743{
1744	struct newblk *newblk;
1745
1746	LIST_FOREACH(newblk, newblkhd, nb_hash) {
1747		if (newblkno != newblk->nb_newblkno)
1748			continue;
1749		if (mp != newblk->nb_list.wk_mp)
1750			continue;
1751		/*
1752		 * If we're creating a new dependency don't match those that
1753		 * have already been converted to allocdirects.  This is for
1754		 * a frag extend.
1755		 */
1756		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
1757			continue;
1758		break;
1759	}
1760	if (newblk) {
1761		*newblkpp = newblk;
1762		return (1);
1763	}
1764	*newblkpp = NULL;
1765	return (0);
1766}
1767
1768/*
1769 * Look up a newblk. Return 1 if found, 0 if not found.
1770 * If not found, allocate if DEPALLOC flag is passed.
1771 * Found or allocated entry is returned in newblkpp.
1772 */
1773static int
1774newblk_lookup(mp, newblkno, flags, newblkpp)
1775	struct mount *mp;
1776	ufs2_daddr_t newblkno;
1777	int flags;
1778	struct newblk **newblkpp;
1779{
1780	struct newblk *newblk;
1781	struct newblk_hashhead *newblkhd;
1782
1783	newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno);
1784	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp))
1785		return (1);
1786	if ((flags & DEPALLOC) == 0)
1787		return (0);
1788	FREE_LOCK(&lk);
1789	newblk = malloc(sizeof(union allblk), M_NEWBLK,
1790	    M_SOFTDEP_FLAGS | M_ZERO);
1791	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
1792	ACQUIRE_LOCK(&lk);
1793	if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) {
1794		WORKITEM_FREE(newblk, D_NEWBLK);
1795		return (1);
1796	}
1797	newblk->nb_freefrag = NULL;
1798	LIST_INIT(&newblk->nb_indirdeps);
1799	LIST_INIT(&newblk->nb_newdirblk);
1800	LIST_INIT(&newblk->nb_jwork);
1801	newblk->nb_state = ATTACHED;
1802	newblk->nb_newblkno = newblkno;
1803	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1804	*newblkpp = newblk;
1805	return (0);
1806}
1807
1808/*
1809 * Executed during filesystem system initialization before
1810 * mounting any filesystems.
1811 */
1812void
1813softdep_initialize()
1814{
1815
1816	LIST_INIT(&mkdirlisthd);
1817	max_softdeps = desiredvnodes * 4;
1818	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash);
1819	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1820	newblk_hashtbl = hashinit(desiredvnodes / 5,  M_NEWBLK, &newblk_hash);
1821	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
1822
1823	/* initialise bioops hack */
1824	bioops.io_start = softdep_disk_io_initiation;
1825	bioops.io_complete = softdep_disk_write_complete;
1826	bioops.io_deallocate = softdep_deallocate_dependencies;
1827	bioops.io_countdeps = softdep_count_dependencies;
1828
1829	/* Initialize the callout with an mtx. */
1830	callout_init_mtx(&softdep_callout, &lk, 0);
1831}
1832
1833/*
1834 * Executed after all filesystems have been unmounted during
1835 * filesystem module unload.
1836 */
1837void
1838softdep_uninitialize()
1839{
1840
1841	callout_drain(&softdep_callout);
1842	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
1843	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
1844	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
1845	hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash);
1846}
1847
1848/*
1849 * Called at mount time to notify the dependency code that a
1850 * filesystem wishes to use it.
1851 */
1852int
1853softdep_mount(devvp, mp, fs, cred)
1854	struct vnode *devvp;
1855	struct mount *mp;
1856	struct fs *fs;
1857	struct ucred *cred;
1858{
1859	struct csum_total cstotal;
1860	struct ufsmount *ump;
1861	struct cg *cgp;
1862	struct buf *bp;
1863	int error, cyl;
1864
1865	MNT_ILOCK(mp);
1866	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
1867	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
1868		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
1869			MNTK_SOFTDEP;
1870		mp->mnt_noasync++;
1871	}
1872	MNT_IUNLOCK(mp);
1873	ump = VFSTOUFS(mp);
1874	LIST_INIT(&ump->softdep_workitem_pending);
1875	LIST_INIT(&ump->softdep_journal_pending);
1876	TAILQ_INIT(&ump->softdep_unlinked);
1877	ump->softdep_worklist_tail = NULL;
1878	ump->softdep_on_worklist = 0;
1879	ump->softdep_deps = 0;
1880	if ((fs->fs_flags & FS_SUJ) &&
1881	    (error = journal_mount(mp, fs, cred)) != 0) {
1882		printf("Failed to start journal: %d\n", error);
1883		return (error);
1884	}
1885	/*
1886	 * When doing soft updates, the counters in the
1887	 * superblock may have gotten out of sync. Recomputation
1888	 * can take a long time and can be deferred for background
1889	 * fsck.  However, the old behavior of scanning the cylinder
1890	 * groups and recalculating them at mount time is available
1891	 * by setting vfs.ffs.compute_summary_at_mount to one.
1892	 */
1893	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
1894		return (0);
1895	bzero(&cstotal, sizeof cstotal);
1896	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1897		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1898		    fs->fs_cgsize, cred, &bp)) != 0) {
1899			brelse(bp);
1900			return (error);
1901		}
1902		cgp = (struct cg *)bp->b_data;
1903		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1904		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1905		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1906		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1907		fs->fs_cs(fs, cyl) = cgp->cg_cs;
1908		brelse(bp);
1909	}
1910#ifdef DEBUG
1911	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1912		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1913#endif
1914	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1915	return (0);
1916}
1917
1918void
1919softdep_unmount(mp)
1920	struct mount *mp;
1921{
1922
1923	if (mp->mnt_kern_flag & MNTK_SUJ)
1924		journal_unmount(mp);
1925}
1926
1927struct jblocks {
1928	struct jseglst	jb_segs;	/* TAILQ of current segments. */
1929	struct jseg	*jb_writeseg;	/* Next write to complete. */
1930	struct jextent	*jb_extent;	/* Extent array. */
1931	uint64_t	jb_nextseq;	/* Next sequence number. */
1932	uint64_t	jb_oldestseq;	/* Oldest active sequence number. */
1933	int		jb_avail;	/* Available extents. */
1934	int		jb_used;	/* Last used extent. */
1935	int		jb_head;	/* Allocator head. */
1936	int		jb_off;		/* Allocator extent offset. */
1937	int		jb_blocks;	/* Total disk blocks covered. */
1938	int		jb_free;	/* Total disk blocks free. */
1939	int		jb_min;		/* Minimum free space. */
1940	int		jb_low;		/* Low on space. */
1941	int		jb_age;		/* Insertion time of oldest rec. */
1942	int		jb_suspended;	/* Did journal suspend writes? */
1943};
1944
1945struct jextent {
1946	ufs2_daddr_t	je_daddr;	/* Disk block address. */
1947	int		je_blocks;	/* Disk block count. */
1948};
1949
1950static struct jblocks *
1951jblocks_create(void)
1952{
1953	struct jblocks *jblocks;
1954
1955	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
1956	TAILQ_INIT(&jblocks->jb_segs);
1957	jblocks->jb_avail = 10;
1958	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
1959	    M_JBLOCKS, M_WAITOK | M_ZERO);
1960
1961	return (jblocks);
1962}
1963
1964static ufs2_daddr_t
1965jblocks_alloc(jblocks, bytes, actual)
1966	struct jblocks *jblocks;
1967	int bytes;
1968	int *actual;
1969{
1970	ufs2_daddr_t daddr;
1971	struct jextent *jext;
1972	int freecnt;
1973	int blocks;
1974
1975	blocks = bytes / DEV_BSIZE;
1976	jext = &jblocks->jb_extent[jblocks->jb_head];
1977	freecnt = jext->je_blocks - jblocks->jb_off;
1978	if (freecnt == 0) {
1979		jblocks->jb_off = 0;
1980		if (++jblocks->jb_head > jblocks->jb_used)
1981			jblocks->jb_head = 0;
1982		jext = &jblocks->jb_extent[jblocks->jb_head];
1983		freecnt = jext->je_blocks;
1984	}
1985	if (freecnt > blocks)
1986		freecnt = blocks;
1987	*actual = freecnt * DEV_BSIZE;
1988	daddr = jext->je_daddr + jblocks->jb_off;
1989	jblocks->jb_off += freecnt;
1990	jblocks->jb_free -= freecnt;
1991
1992	return (daddr);
1993}
1994
1995static void
1996jblocks_free(jblocks, mp, bytes)
1997	struct jblocks *jblocks;
1998	struct mount *mp;
1999	int bytes;
2000{
2001
2002	jblocks->jb_free += bytes / DEV_BSIZE;
2003	if (jblocks->jb_suspended)
2004		worklist_speedup();
2005	wakeup(jblocks);
2006}
2007
2008static void
2009jblocks_destroy(jblocks)
2010	struct jblocks *jblocks;
2011{
2012
2013	if (jblocks->jb_extent)
2014		free(jblocks->jb_extent, M_JBLOCKS);
2015	free(jblocks, M_JBLOCKS);
2016}
2017
2018static void
2019jblocks_add(jblocks, daddr, blocks)
2020	struct jblocks *jblocks;
2021	ufs2_daddr_t daddr;
2022	int blocks;
2023{
2024	struct jextent *jext;
2025
2026	jblocks->jb_blocks += blocks;
2027	jblocks->jb_free += blocks;
2028	jext = &jblocks->jb_extent[jblocks->jb_used];
2029	/* Adding the first block. */
2030	if (jext->je_daddr == 0) {
2031		jext->je_daddr = daddr;
2032		jext->je_blocks = blocks;
2033		return;
2034	}
2035	/* Extending the last extent. */
2036	if (jext->je_daddr + jext->je_blocks == daddr) {
2037		jext->je_blocks += blocks;
2038		return;
2039	}
2040	/* Adding a new extent. */
2041	if (++jblocks->jb_used == jblocks->jb_avail) {
2042		jblocks->jb_avail *= 2;
2043		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2044		    M_JBLOCKS, M_WAITOK | M_ZERO);
2045		memcpy(jext, jblocks->jb_extent,
2046		    sizeof(struct jextent) * jblocks->jb_used);
2047		free(jblocks->jb_extent, M_JBLOCKS);
2048		jblocks->jb_extent = jext;
2049	}
2050	jext = &jblocks->jb_extent[jblocks->jb_used];
2051	jext->je_daddr = daddr;
2052	jext->je_blocks = blocks;
2053	return;
2054}
2055
2056int
2057softdep_journal_lookup(mp, vpp)
2058	struct mount *mp;
2059	struct vnode **vpp;
2060{
2061	struct componentname cnp;
2062	struct vnode *dvp;
2063	ino_t sujournal;
2064	int error;
2065
2066	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2067	if (error)
2068		return (error);
2069	bzero(&cnp, sizeof(cnp));
2070	cnp.cn_nameiop = LOOKUP;
2071	cnp.cn_flags = ISLASTCN;
2072	cnp.cn_thread = curthread;
2073	cnp.cn_cred = curthread->td_ucred;
2074	cnp.cn_pnbuf = SUJ_FILE;
2075	cnp.cn_nameptr = SUJ_FILE;
2076	cnp.cn_namelen = strlen(SUJ_FILE);
2077	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2078	vput(dvp);
2079	if (error != 0)
2080		return (error);
2081	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2082	return (error);
2083}
2084
2085/*
2086 * Open and verify the journal file.
2087 */
2088static int
2089journal_mount(mp, fs, cred)
2090	struct mount *mp;
2091	struct fs *fs;
2092	struct ucred *cred;
2093{
2094	struct jblocks *jblocks;
2095	struct vnode *vp;
2096	struct inode *ip;
2097	ufs2_daddr_t blkno;
2098	int bcount;
2099	int error;
2100	int i;
2101
2102	mp->mnt_kern_flag |= MNTK_SUJ;
2103	error = softdep_journal_lookup(mp, &vp);
2104	if (error != 0) {
2105		printf("Failed to find journal.  Use tunefs to create one\n");
2106		return (error);
2107	}
2108	ip = VTOI(vp);
2109	if (ip->i_size < SUJ_MIN) {
2110		error = ENOSPC;
2111		goto out;
2112	}
2113	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2114	jblocks = jblocks_create();
2115	for (i = 0; i < bcount; i++) {
2116		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2117		if (error)
2118			break;
2119		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2120	}
2121	if (error) {
2122		jblocks_destroy(jblocks);
2123		goto out;
2124	}
2125	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2126	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2127	/*
2128	 * Only validate the journal contents if the filesystem is clean,
2129	 * otherwise we write the logs but they'll never be used.  If the
2130	 * filesystem was still dirty when we mounted it the journal is
2131	 * invalid and a new journal can only be valid if it starts from a
2132	 * clean mount.
2133	 */
2134	if (fs->fs_clean) {
2135		DIP_SET(ip, i_modrev, fs->fs_mtime);
2136		ip->i_flags |= IN_MODIFIED;
2137		ffs_update(vp, 1);
2138	}
2139	VFSTOUFS(mp)->softdep_jblocks = jblocks;
2140out:
2141	vput(vp);
2142	return (error);
2143}
2144
2145static void
2146journal_unmount(mp)
2147	struct mount *mp;
2148{
2149	struct ufsmount *ump;
2150
2151	ump = VFSTOUFS(mp);
2152	if (ump->softdep_jblocks)
2153		jblocks_destroy(ump->softdep_jblocks);
2154	ump->softdep_jblocks = NULL;
2155}
2156
2157/*
2158 * Called when a journal record is ready to be written.  Space is allocated
2159 * and the journal entry is created when the journal is flushed to stable
2160 * store.
2161 */
2162static void
2163add_to_journal(wk)
2164	struct worklist *wk;
2165{
2166	struct ufsmount *ump;
2167
2168	mtx_assert(&lk, MA_OWNED);
2169	ump = VFSTOUFS(wk->wk_mp);
2170	if (wk->wk_state & ONWORKLIST)
2171		panic("add_to_journal: %s(0x%X) already on list",
2172		    TYPENAME(wk->wk_type), wk->wk_state);
2173	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2174	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2175		ump->softdep_jblocks->jb_age = ticks;
2176		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2177	} else
2178		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2179	ump->softdep_journal_tail = wk;
2180	ump->softdep_on_journal += 1;
2181}
2182
2183/*
2184 * Remove an arbitrary item for the journal worklist maintain the tail
2185 * pointer.  This happens when a new operation obviates the need to
2186 * journal an old operation.
2187 */
2188static void
2189remove_from_journal(wk)
2190	struct worklist *wk;
2191{
2192	struct ufsmount *ump;
2193
2194	mtx_assert(&lk, MA_OWNED);
2195	ump = VFSTOUFS(wk->wk_mp);
2196#ifdef DEBUG	/* XXX Expensive, temporary. */
2197	{
2198		struct worklist *wkn;
2199
2200		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2201			if (wkn == wk)
2202				break;
2203		if (wkn == NULL)
2204			panic("remove_from_journal: %p is not in journal", wk);
2205	}
2206#endif
2207	/*
2208	 * We emulate a TAILQ to save space in most structures which do not
2209	 * require TAILQ semantics.  Here we must update the tail position
2210	 * when removing the tail which is not the final entry.
2211	 */
2212	if (ump->softdep_journal_tail == wk)
2213		ump->softdep_journal_tail =
2214		    (struct worklist *)wk->wk_list.le_prev;
2215
2216	WORKLIST_REMOVE(wk);
2217	ump->softdep_on_journal -= 1;
2218}
2219
2220/*
2221 * Check for journal space as well as dependency limits so the prelink
2222 * code can throttle both journaled and non-journaled filesystems.
2223 * Threshold is 0 for low and 1 for min.
2224 */
2225static int
2226journal_space(ump, thresh)
2227	struct ufsmount *ump;
2228	int thresh;
2229{
2230	struct jblocks *jblocks;
2231	int avail;
2232
2233	/*
2234	 * We use a tighter restriction here to prevent request_cleanup()
2235	 * running in threads from running into locks we currently hold.
2236	 */
2237	if (num_inodedep > (max_softdeps / 10) * 9)
2238		return (0);
2239
2240	jblocks = ump->softdep_jblocks;
2241	if (jblocks == NULL)
2242		return (1);
2243	if (thresh)
2244		thresh = jblocks->jb_min;
2245	else
2246		thresh = jblocks->jb_low;
2247	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2248	avail = jblocks->jb_free - avail;
2249
2250	return (avail > thresh);
2251}
2252
2253static void
2254journal_suspend(ump)
2255	struct ufsmount *ump;
2256{
2257	struct jblocks *jblocks;
2258	struct mount *mp;
2259
2260	mp = UFSTOVFS(ump);
2261	jblocks = ump->softdep_jblocks;
2262	MNT_ILOCK(mp);
2263	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2264		stat_journal_min++;
2265		mp->mnt_kern_flag |= MNTK_SUSPEND;
2266		mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2267	}
2268	jblocks->jb_suspended = 1;
2269	MNT_IUNLOCK(mp);
2270}
2271
2272/*
2273 * Called before any allocation function to be certain that there is
2274 * sufficient space in the journal prior to creating any new records.
2275 * Since in the case of block allocation we may have multiple locked
2276 * buffers at the time of the actual allocation we can not block
2277 * when the journal records are created.  Doing so would create a deadlock
2278 * if any of these buffers needed to be flushed to reclaim space.  Instead
2279 * we require a sufficiently large amount of available space such that
2280 * each thread in the system could have passed this allocation check and
2281 * still have sufficient free space.  With 20% of a minimum journal size
2282 * of 1MB we have 6553 records available.
2283 */
2284int
2285softdep_prealloc(vp, waitok)
2286	struct vnode *vp;
2287	int waitok;
2288{
2289	struct ufsmount *ump;
2290
2291	if (DOINGSUJ(vp) == 0)
2292		return (0);
2293	ump = VFSTOUFS(vp->v_mount);
2294	ACQUIRE_LOCK(&lk);
2295	if (journal_space(ump, 0)) {
2296		FREE_LOCK(&lk);
2297		return (0);
2298	}
2299	stat_journal_low++;
2300	FREE_LOCK(&lk);
2301	if (waitok == MNT_NOWAIT)
2302		return (ENOSPC);
2303	/*
2304	 * Attempt to sync this vnode once to flush any journal
2305	 * work attached to it.
2306	 */
2307	ffs_syncvnode(vp, waitok);
2308	ACQUIRE_LOCK(&lk);
2309	process_removes(vp);
2310	if (journal_space(ump, 0) == 0) {
2311		softdep_speedup();
2312		if (journal_space(ump, 1) == 0)
2313			journal_suspend(ump);
2314	}
2315	FREE_LOCK(&lk);
2316
2317	return (0);
2318}
2319
2320/*
2321 * Before adjusting a link count on a vnode verify that we have sufficient
2322 * journal space.  If not, process operations that depend on the currently
2323 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2324 * and softdep flush threads can not acquire these locks to reclaim space.
2325 */
2326static void
2327softdep_prelink(dvp, vp)
2328	struct vnode *dvp;
2329	struct vnode *vp;
2330{
2331	struct ufsmount *ump;
2332
2333	ump = VFSTOUFS(dvp->v_mount);
2334	mtx_assert(&lk, MA_OWNED);
2335	if (journal_space(ump, 0))
2336		return;
2337	stat_journal_low++;
2338	FREE_LOCK(&lk);
2339	if (vp)
2340		ffs_syncvnode(vp, MNT_NOWAIT);
2341	ffs_syncvnode(dvp, MNT_WAIT);
2342	ACQUIRE_LOCK(&lk);
2343	/* Process vp before dvp as it may create .. removes. */
2344	if (vp)
2345		process_removes(vp);
2346	process_removes(dvp);
2347	softdep_speedup();
2348	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2349	process_worklist_item(UFSTOVFS(ump), LK_NOWAIT);
2350	if (journal_space(ump, 0) == 0) {
2351		softdep_speedup();
2352		if (journal_space(ump, 1) == 0)
2353			journal_suspend(ump);
2354	}
2355}
2356
2357static void
2358jseg_write(fs, jblocks, jseg, data)
2359	struct fs *fs;
2360	struct jblocks *jblocks;
2361	struct jseg *jseg;
2362	uint8_t *data;
2363{
2364	struct jsegrec *rec;
2365
2366	rec = (struct jsegrec *)data;
2367	rec->jsr_seq = jseg->js_seq;
2368	rec->jsr_oldest = jblocks->jb_oldestseq;
2369	rec->jsr_cnt = jseg->js_cnt;
2370	rec->jsr_blocks = jseg->js_size / DEV_BSIZE;
2371	rec->jsr_crc = 0;
2372	rec->jsr_time = fs->fs_mtime;
2373}
2374
2375static inline void
2376inoref_write(inoref, jseg, rec)
2377	struct inoref *inoref;
2378	struct jseg *jseg;
2379	struct jrefrec *rec;
2380{
2381
2382	inoref->if_jsegdep->jd_seg = jseg;
2383	rec->jr_ino = inoref->if_ino;
2384	rec->jr_parent = inoref->if_parent;
2385	rec->jr_nlink = inoref->if_nlink;
2386	rec->jr_mode = inoref->if_mode;
2387	rec->jr_diroff = inoref->if_diroff;
2388}
2389
2390static void
2391jaddref_write(jaddref, jseg, data)
2392	struct jaddref *jaddref;
2393	struct jseg *jseg;
2394	uint8_t *data;
2395{
2396	struct jrefrec *rec;
2397
2398	rec = (struct jrefrec *)data;
2399	rec->jr_op = JOP_ADDREF;
2400	inoref_write(&jaddref->ja_ref, jseg, rec);
2401}
2402
2403static void
2404jremref_write(jremref, jseg, data)
2405	struct jremref *jremref;
2406	struct jseg *jseg;
2407	uint8_t *data;
2408{
2409	struct jrefrec *rec;
2410
2411	rec = (struct jrefrec *)data;
2412	rec->jr_op = JOP_REMREF;
2413	inoref_write(&jremref->jr_ref, jseg, rec);
2414}
2415
2416static	void
2417jmvref_write(jmvref, jseg, data)
2418	struct jmvref *jmvref;
2419	struct jseg *jseg;
2420	uint8_t *data;
2421{
2422	struct jmvrec *rec;
2423
2424	rec = (struct jmvrec *)data;
2425	rec->jm_op = JOP_MVREF;
2426	rec->jm_ino = jmvref->jm_ino;
2427	rec->jm_parent = jmvref->jm_parent;
2428	rec->jm_oldoff = jmvref->jm_oldoff;
2429	rec->jm_newoff = jmvref->jm_newoff;
2430}
2431
2432static void
2433jnewblk_write(jnewblk, jseg, data)
2434	struct jnewblk *jnewblk;
2435	struct jseg *jseg;
2436	uint8_t *data;
2437{
2438	struct jblkrec *rec;
2439
2440	jnewblk->jn_jsegdep->jd_seg = jseg;
2441	rec = (struct jblkrec *)data;
2442	rec->jb_op = JOP_NEWBLK;
2443	rec->jb_ino = jnewblk->jn_ino;
2444	rec->jb_blkno = jnewblk->jn_blkno;
2445	rec->jb_lbn = jnewblk->jn_lbn;
2446	rec->jb_frags = jnewblk->jn_frags;
2447	rec->jb_oldfrags = jnewblk->jn_oldfrags;
2448}
2449
2450static void
2451jfreeblk_write(jfreeblk, jseg, data)
2452	struct jfreeblk *jfreeblk;
2453	struct jseg *jseg;
2454	uint8_t *data;
2455{
2456	struct jblkrec *rec;
2457
2458	jfreeblk->jf_jsegdep->jd_seg = jseg;
2459	rec = (struct jblkrec *)data;
2460	rec->jb_op = JOP_FREEBLK;
2461	rec->jb_ino = jfreeblk->jf_ino;
2462	rec->jb_blkno = jfreeblk->jf_blkno;
2463	rec->jb_lbn = jfreeblk->jf_lbn;
2464	rec->jb_frags = jfreeblk->jf_frags;
2465	rec->jb_oldfrags = 0;
2466}
2467
2468static void
2469jfreefrag_write(jfreefrag, jseg, data)
2470	struct jfreefrag *jfreefrag;
2471	struct jseg *jseg;
2472	uint8_t *data;
2473{
2474	struct jblkrec *rec;
2475
2476	jfreefrag->fr_jsegdep->jd_seg = jseg;
2477	rec = (struct jblkrec *)data;
2478	rec->jb_op = JOP_FREEBLK;
2479	rec->jb_ino = jfreefrag->fr_ino;
2480	rec->jb_blkno = jfreefrag->fr_blkno;
2481	rec->jb_lbn = jfreefrag->fr_lbn;
2482	rec->jb_frags = jfreefrag->fr_frags;
2483	rec->jb_oldfrags = 0;
2484}
2485
2486static void
2487jtrunc_write(jtrunc, jseg, data)
2488	struct jtrunc *jtrunc;
2489	struct jseg *jseg;
2490	uint8_t *data;
2491{
2492	struct jtrncrec *rec;
2493
2494	rec = (struct jtrncrec *)data;
2495	rec->jt_op = JOP_TRUNC;
2496	rec->jt_ino = jtrunc->jt_ino;
2497	rec->jt_size = jtrunc->jt_size;
2498	rec->jt_extsize = jtrunc->jt_extsize;
2499}
2500
2501/*
2502 * Flush some journal records to disk.
2503 */
2504static void
2505softdep_process_journal(mp, flags)
2506	struct mount *mp;
2507	int flags;
2508{
2509	struct jblocks *jblocks;
2510	struct ufsmount *ump;
2511	struct worklist *wk;
2512	struct jseg *jseg;
2513	struct buf *bp;
2514	uint8_t *data;
2515	struct fs *fs;
2516	int segwritten;
2517	int jrecmin;	/* Minimum records per block. */
2518	int jrecmax;	/* Maximum records per block. */
2519	int size;
2520	int cnt;
2521	int off;
2522
2523	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
2524		return;
2525	ump = VFSTOUFS(mp);
2526	fs = ump->um_fs;
2527	jblocks = ump->softdep_jblocks;
2528	/*
2529	 * We write anywhere between a disk block and fs block.  The upper
2530	 * bound is picked to prevent buffer cache fragmentation and limit
2531	 * processing time per I/O.
2532	 */
2533	jrecmin = (DEV_BSIZE / JREC_SIZE) - 1; /* -1 for seg header */
2534	jrecmax = (fs->fs_bsize / DEV_BSIZE) * jrecmin;
2535	segwritten = 0;
2536	while ((cnt = ump->softdep_on_journal) != 0) {
2537		/*
2538		 * Create a new segment to hold as many as 'cnt' journal
2539		 * entries and add them to the segment.  Notice cnt is
2540		 * off by one to account for the space required by the
2541		 * jsegrec.  If we don't have a full block to log skip it
2542		 * unless we haven't written anything.
2543		 */
2544		cnt++;
2545		if (cnt < jrecmax && segwritten)
2546			break;
2547		/*
2548		 * Verify some free journal space.  softdep_prealloc() should
2549	 	 * guarantee that we don't run out so this is indicative of
2550		 * a problem with the flow control.  Try to recover
2551		 * gracefully in any event.
2552		 */
2553		while (jblocks->jb_free == 0) {
2554			if (flags != MNT_WAIT)
2555				break;
2556			printf("softdep: Out of journal space!\n");
2557			softdep_speedup();
2558			msleep(jblocks, &lk, PRIBIO, "jblocks", 1);
2559		}
2560		FREE_LOCK(&lk);
2561		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
2562		workitem_alloc(&jseg->js_list, D_JSEG, mp);
2563		LIST_INIT(&jseg->js_entries);
2564		jseg->js_state = ATTACHED;
2565		jseg->js_jblocks = jblocks;
2566		bp = geteblk(fs->fs_bsize, 0);
2567		ACQUIRE_LOCK(&lk);
2568		/*
2569		 * If there was a race while we were allocating the block
2570		 * and jseg the entry we care about was likely written.
2571		 * We bail out in both the WAIT and NOWAIT case and assume
2572		 * the caller will loop if the entry it cares about is
2573		 * not written.
2574		 */
2575		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
2576			bp->b_flags |= B_INVAL | B_NOCACHE;
2577			WORKITEM_FREE(jseg, D_JSEG);
2578			FREE_LOCK(&lk);
2579			brelse(bp);
2580			ACQUIRE_LOCK(&lk);
2581			break;
2582		}
2583		/*
2584		 * Calculate the disk block size required for the available
2585		 * records rounded to the min size.
2586		 */
2587		cnt = ump->softdep_on_journal;
2588		if (cnt < jrecmax)
2589			size = howmany(cnt, jrecmin) * DEV_BSIZE;
2590		else
2591			size = fs->fs_bsize;
2592		/*
2593		 * Allocate a disk block for this journal data and account
2594		 * for truncation of the requested size if enough contiguous
2595		 * space was not available.
2596		 */
2597		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
2598		bp->b_lblkno = bp->b_blkno;
2599		bp->b_offset = bp->b_blkno * DEV_BSIZE;
2600		bp->b_bcount = size;
2601		bp->b_bufobj = &ump->um_devvp->v_bufobj;
2602		bp->b_flags &= ~B_INVAL;
2603		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
2604		/*
2605		 * Initialize our jseg with cnt records.  Assign the next
2606		 * sequence number to it and link it in-order.
2607		 */
2608		cnt = MIN(ump->softdep_on_journal,
2609		    (size / DEV_BSIZE) * jrecmin);
2610		jseg->js_buf = bp;
2611		jseg->js_cnt = cnt;
2612		jseg->js_refs = cnt + 1;	/* Self ref. */
2613		jseg->js_size = size;
2614		jseg->js_seq = jblocks->jb_nextseq++;
2615		if (TAILQ_EMPTY(&jblocks->jb_segs))
2616			jblocks->jb_oldestseq = jseg->js_seq;
2617		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
2618		if (jblocks->jb_writeseg == NULL)
2619			jblocks->jb_writeseg = jseg;
2620		/*
2621		 * Start filling in records from the pending list.
2622		 */
2623		data = bp->b_data;
2624		off = 0;
2625		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
2626		    != NULL) {
2627			/* Place a segment header on every device block. */
2628			if ((off % DEV_BSIZE) == 0) {
2629				jseg_write(fs, jblocks, jseg, data);
2630				off += JREC_SIZE;
2631				data = bp->b_data + off;
2632			}
2633			remove_from_journal(wk);
2634			wk->wk_state |= IOSTARTED;
2635			WORKLIST_INSERT(&jseg->js_entries, wk);
2636			switch (wk->wk_type) {
2637			case D_JADDREF:
2638				jaddref_write(WK_JADDREF(wk), jseg, data);
2639				break;
2640			case D_JREMREF:
2641				jremref_write(WK_JREMREF(wk), jseg, data);
2642				break;
2643			case D_JMVREF:
2644				jmvref_write(WK_JMVREF(wk), jseg, data);
2645				break;
2646			case D_JNEWBLK:
2647				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
2648				break;
2649			case D_JFREEBLK:
2650				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
2651				break;
2652			case D_JFREEFRAG:
2653				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
2654				break;
2655			case D_JTRUNC:
2656				jtrunc_write(WK_JTRUNC(wk), jseg, data);
2657				break;
2658			default:
2659				panic("process_journal: Unknown type %s",
2660				    TYPENAME(wk->wk_type));
2661				/* NOTREACHED */
2662			}
2663			if (--cnt == 0)
2664				break;
2665			off += JREC_SIZE;
2666			data = bp->b_data + off;
2667		}
2668		/*
2669		 * Write this one buffer and continue.
2670		 */
2671		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
2672		FREE_LOCK(&lk);
2673		BO_LOCK(bp->b_bufobj);
2674		bgetvp(ump->um_devvp, bp);
2675		BO_UNLOCK(bp->b_bufobj);
2676		if (flags == MNT_NOWAIT)
2677			bawrite(bp);
2678		else
2679			bwrite(bp);
2680		ACQUIRE_LOCK(&lk);
2681	}
2682	/*
2683	 * If we've suspended the filesystem because we ran out of journal
2684	 * space either try to sync it here to make some progress or
2685	 * unsuspend it if we already have.
2686	 */
2687	if (flags == 0 && jblocks && jblocks->jb_suspended) {
2688		if (journal_space(ump, jblocks->jb_min)) {
2689			FREE_LOCK(&lk);
2690			jblocks->jb_suspended = 0;
2691			mp->mnt_susp_owner = curthread;
2692			vfs_write_resume(mp);
2693			ACQUIRE_LOCK(&lk);
2694			return;
2695		}
2696		FREE_LOCK(&lk);
2697		VFS_SYNC(mp, MNT_NOWAIT);
2698		ffs_sbupdate(ump, MNT_WAIT, 0);
2699		ACQUIRE_LOCK(&lk);
2700	}
2701}
2702
2703/*
2704 * Complete a jseg, allowing all dependencies awaiting journal writes
2705 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
2706 * structures so that the journal segment can be freed to reclaim space.
2707 */
2708static void
2709complete_jseg(jseg)
2710	struct jseg *jseg;
2711{
2712	struct worklist *wk;
2713	struct jmvref *jmvref;
2714	int waiting;
2715	int i;
2716
2717	i = 0;
2718	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
2719		WORKLIST_REMOVE(wk);
2720		waiting = wk->wk_state & IOWAITING;
2721		wk->wk_state &= ~(IOSTARTED | IOWAITING);
2722		wk->wk_state |= COMPLETE;
2723		KASSERT(i < jseg->js_cnt,
2724		    ("handle_written_jseg: overflow %d >= %d",
2725		    i, jseg->js_cnt));
2726		switch (wk->wk_type) {
2727		case D_JADDREF:
2728			handle_written_jaddref(WK_JADDREF(wk));
2729			break;
2730		case D_JREMREF:
2731			handle_written_jremref(WK_JREMREF(wk));
2732			break;
2733		case D_JMVREF:
2734			/* No jsegdep here. */
2735			free_jseg(jseg);
2736			jmvref = WK_JMVREF(wk);
2737			LIST_REMOVE(jmvref, jm_deps);
2738			free_pagedep(jmvref->jm_pagedep);
2739			WORKITEM_FREE(jmvref, D_JMVREF);
2740			break;
2741		case D_JNEWBLK:
2742			handle_written_jnewblk(WK_JNEWBLK(wk));
2743			break;
2744		case D_JFREEBLK:
2745			handle_written_jfreeblk(WK_JFREEBLK(wk));
2746			break;
2747		case D_JFREEFRAG:
2748			handle_written_jfreefrag(WK_JFREEFRAG(wk));
2749			break;
2750		case D_JTRUNC:
2751			WK_JTRUNC(wk)->jt_jsegdep->jd_seg = jseg;
2752			WORKITEM_FREE(wk, D_JTRUNC);
2753			break;
2754		default:
2755			panic("handle_written_jseg: Unknown type %s",
2756			    TYPENAME(wk->wk_type));
2757			/* NOTREACHED */
2758		}
2759		if (waiting)
2760			wakeup(wk);
2761	}
2762	/* Release the self reference so the structure may be freed. */
2763	free_jseg(jseg);
2764}
2765
2766/*
2767 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
2768 * completions in order only.
2769 */
2770static void
2771handle_written_jseg(jseg, bp)
2772	struct jseg *jseg;
2773	struct buf *bp;
2774{
2775	struct jblocks *jblocks;
2776	struct jseg *jsegn;
2777
2778	if (jseg->js_refs == 0)
2779		panic("handle_written_jseg: No self-reference on %p", jseg);
2780	jseg->js_state |= DEPCOMPLETE;
2781	/*
2782	 * We'll never need this buffer again, set flags so it will be
2783	 * discarded.
2784	 */
2785	bp->b_flags |= B_INVAL | B_NOCACHE;
2786	jblocks = jseg->js_jblocks;
2787	/*
2788	 * Don't allow out of order completions.  If this isn't the first
2789	 * block wait for it to write before we're done.
2790	 */
2791	if (jseg != jblocks->jb_writeseg)
2792		return;
2793	/* Iterate through available jsegs processing their entries. */
2794	do {
2795		jsegn = TAILQ_NEXT(jseg, js_next);
2796		complete_jseg(jseg);
2797		jseg = jsegn;
2798	} while (jseg && jseg->js_state & DEPCOMPLETE);
2799	jblocks->jb_writeseg = jseg;
2800}
2801
2802static inline struct jsegdep *
2803inoref_jseg(inoref)
2804	struct inoref *inoref;
2805{
2806	struct jsegdep *jsegdep;
2807
2808	jsegdep = inoref->if_jsegdep;
2809	inoref->if_jsegdep = NULL;
2810
2811	return (jsegdep);
2812}
2813
2814/*
2815 * Called once a jremref has made it to stable store.  The jremref is marked
2816 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
2817 * for the jremref to complete will be awoken by free_jremref.
2818 */
2819static void
2820handle_written_jremref(jremref)
2821	struct jremref *jremref;
2822{
2823	struct inodedep *inodedep;
2824	struct jsegdep *jsegdep;
2825	struct dirrem *dirrem;
2826
2827	/* Grab the jsegdep. */
2828	jsegdep = inoref_jseg(&jremref->jr_ref);
2829	/*
2830	 * Remove us from the inoref list.
2831	 */
2832	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
2833	    0, &inodedep) == 0)
2834		panic("handle_written_jremref: Lost inodedep");
2835	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
2836	/*
2837	 * Complete the dirrem.
2838	 */
2839	dirrem = jremref->jr_dirrem;
2840	jremref->jr_dirrem = NULL;
2841	LIST_REMOVE(jremref, jr_deps);
2842	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
2843	WORKLIST_INSERT(&dirrem->dm_jwork, &jsegdep->jd_list);
2844	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
2845	    (dirrem->dm_state & COMPLETE) != 0)
2846		add_to_worklist(&dirrem->dm_list, 0);
2847	free_jremref(jremref);
2848}
2849
2850/*
2851 * Called once a jaddref has made it to stable store.  The dependency is
2852 * marked complete and any dependent structures are added to the inode
2853 * bufwait list to be completed as soon as it is written.  If a bitmap write
2854 * depends on this entry we move the inode into the inodedephd of the
2855 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
2856 */
2857static void
2858handle_written_jaddref(jaddref)
2859	struct jaddref *jaddref;
2860{
2861	struct jsegdep *jsegdep;
2862	struct inodedep *inodedep;
2863	struct diradd *diradd;
2864	struct mkdir *mkdir;
2865
2866	/* Grab the jsegdep. */
2867	jsegdep = inoref_jseg(&jaddref->ja_ref);
2868	mkdir = NULL;
2869	diradd = NULL;
2870	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
2871	    0, &inodedep) == 0)
2872		panic("handle_written_jaddref: Lost inodedep.");
2873	if (jaddref->ja_diradd == NULL)
2874		panic("handle_written_jaddref: No dependency");
2875	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
2876		diradd = jaddref->ja_diradd;
2877		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
2878	} else if (jaddref->ja_state & MKDIR_PARENT) {
2879		mkdir = jaddref->ja_mkdir;
2880		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
2881	} else if (jaddref->ja_state & MKDIR_BODY)
2882		mkdir = jaddref->ja_mkdir;
2883	else
2884		panic("handle_written_jaddref: Unknown dependency %p",
2885		    jaddref->ja_diradd);
2886	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
2887	/*
2888	 * Remove us from the inode list.
2889	 */
2890	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
2891	/*
2892	 * The mkdir may be waiting on the jaddref to clear before freeing.
2893	 */
2894	if (mkdir) {
2895		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
2896		    ("handle_written_jaddref: Incorrect type for mkdir %s",
2897		    TYPENAME(mkdir->md_list.wk_type)));
2898		mkdir->md_jaddref = NULL;
2899		diradd = mkdir->md_diradd;
2900		mkdir->md_state |= DEPCOMPLETE;
2901		complete_mkdir(mkdir);
2902	}
2903	WORKLIST_INSERT(&diradd->da_jwork, &jsegdep->jd_list);
2904	if (jaddref->ja_state & NEWBLOCK) {
2905		inodedep->id_state |= ONDEPLIST;
2906		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
2907		    inodedep, id_deps);
2908	}
2909	free_jaddref(jaddref);
2910}
2911
2912/*
2913 * Called once a jnewblk journal is written.  The allocdirect or allocindir
2914 * is placed in the bmsafemap to await notification of a written bitmap.
2915 */
2916static void
2917handle_written_jnewblk(jnewblk)
2918	struct jnewblk *jnewblk;
2919{
2920	struct bmsafemap *bmsafemap;
2921	struct jsegdep *jsegdep;
2922	struct newblk *newblk;
2923
2924	/* Grab the jsegdep. */
2925	jsegdep = jnewblk->jn_jsegdep;
2926	jnewblk->jn_jsegdep = NULL;
2927	/*
2928	 * Add the written block to the bmsafemap so it can be notified when
2929	 * the bitmap is on disk.
2930	 */
2931	newblk = jnewblk->jn_newblk;
2932	jnewblk->jn_newblk = NULL;
2933	if (newblk == NULL)
2934		panic("handle_written_jnewblk: No dependency for the segdep.");
2935
2936	newblk->nb_jnewblk = NULL;
2937	bmsafemap = newblk->nb_bmsafemap;
2938	WORKLIST_INSERT(&newblk->nb_jwork, &jsegdep->jd_list);
2939	newblk->nb_state |= ONDEPLIST;
2940	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
2941	free_jnewblk(jnewblk);
2942}
2943
2944/*
2945 * Cancel a jfreefrag that won't be needed, probably due to colliding with
2946 * an in-flight allocation that has not yet been committed.  Divorce us
2947 * from the freefrag and mark it DEPCOMPLETE so that it may be added
2948 * to the worklist.
2949 */
2950static void
2951cancel_jfreefrag(jfreefrag)
2952	struct jfreefrag *jfreefrag;
2953{
2954	struct freefrag *freefrag;
2955
2956	if (jfreefrag->fr_jsegdep) {
2957		free_jsegdep(jfreefrag->fr_jsegdep);
2958		jfreefrag->fr_jsegdep = NULL;
2959	}
2960	freefrag = jfreefrag->fr_freefrag;
2961	jfreefrag->fr_freefrag = NULL;
2962	freefrag->ff_jfreefrag = NULL;
2963	free_jfreefrag(jfreefrag);
2964	freefrag->ff_state |= DEPCOMPLETE;
2965}
2966
2967/*
2968 * Free a jfreefrag when the parent freefrag is rendered obsolete.
2969 */
2970static void
2971free_jfreefrag(jfreefrag)
2972	struct jfreefrag *jfreefrag;
2973{
2974
2975	if (jfreefrag->fr_state & IOSTARTED)
2976		WORKLIST_REMOVE(&jfreefrag->fr_list);
2977	else if (jfreefrag->fr_state & ONWORKLIST)
2978		remove_from_journal(&jfreefrag->fr_list);
2979	if (jfreefrag->fr_freefrag != NULL)
2980		panic("free_jfreefrag:  Still attached to a freefrag.");
2981	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
2982}
2983
2984/*
2985 * Called when the journal write for a jfreefrag completes.  The parent
2986 * freefrag is added to the worklist if this completes its dependencies.
2987 */
2988static void
2989handle_written_jfreefrag(jfreefrag)
2990	struct jfreefrag *jfreefrag;
2991{
2992	struct jsegdep *jsegdep;
2993	struct freefrag *freefrag;
2994
2995	/* Grab the jsegdep. */
2996	jsegdep = jfreefrag->fr_jsegdep;
2997	jfreefrag->fr_jsegdep = NULL;
2998	freefrag = jfreefrag->fr_freefrag;
2999	if (freefrag == NULL)
3000		panic("handle_written_jfreefrag: No freefrag.");
3001	freefrag->ff_state |= DEPCOMPLETE;
3002	freefrag->ff_jfreefrag = NULL;
3003	WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
3004	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3005		add_to_worklist(&freefrag->ff_list, 0);
3006	jfreefrag->fr_freefrag = NULL;
3007	free_jfreefrag(jfreefrag);
3008}
3009
3010/*
3011 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3012 * is removed from the freeblks list of pending journal writes and the
3013 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3014 * have been reclaimed.
3015 */
3016static void
3017handle_written_jfreeblk(jfreeblk)
3018	struct jfreeblk *jfreeblk;
3019{
3020	struct freeblks *freeblks;
3021	struct jsegdep *jsegdep;
3022
3023	/* Grab the jsegdep. */
3024	jsegdep = jfreeblk->jf_jsegdep;
3025	jfreeblk->jf_jsegdep = NULL;
3026	freeblks = jfreeblk->jf_freeblks;
3027	LIST_REMOVE(jfreeblk, jf_deps);
3028	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
3029	/*
3030	 * If the freeblks is all journaled, we can add it to the worklist.
3031	 */
3032	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd) &&
3033	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) {
3034		/* Remove from the b_dep that is waiting on this write. */
3035		if (freeblks->fb_state & ONWORKLIST)
3036			WORKLIST_REMOVE(&freeblks->fb_list);
3037		add_to_worklist(&freeblks->fb_list, 1);
3038	}
3039
3040	free_jfreeblk(jfreeblk);
3041}
3042
3043static struct jsegdep *
3044newjsegdep(struct worklist *wk)
3045{
3046	struct jsegdep *jsegdep;
3047
3048	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3049	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3050	jsegdep->jd_seg = NULL;
3051
3052	return (jsegdep);
3053}
3054
3055static struct jmvref *
3056newjmvref(dp, ino, oldoff, newoff)
3057	struct inode *dp;
3058	ino_t ino;
3059	off_t oldoff;
3060	off_t newoff;
3061{
3062	struct jmvref *jmvref;
3063
3064	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3065	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3066	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3067	jmvref->jm_parent = dp->i_number;
3068	jmvref->jm_ino = ino;
3069	jmvref->jm_oldoff = oldoff;
3070	jmvref->jm_newoff = newoff;
3071
3072	return (jmvref);
3073}
3074
3075/*
3076 * Allocate a new jremref that tracks the removal of ip from dp with the
3077 * directory entry offset of diroff.  Mark the entry as ATTACHED and
3078 * DEPCOMPLETE as we have all the information required for the journal write
3079 * and the directory has already been removed from the buffer.  The caller
3080 * is responsible for linking the jremref into the pagedep and adding it
3081 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3082 * a DOTDOT addition so handle_workitem_remove() can properly assign
3083 * the jsegdep when we're done.
3084 */
3085static struct jremref *
3086newjremref(dirrem, dp, ip, diroff, nlink)
3087	struct dirrem *dirrem;
3088	struct inode *dp;
3089	struct inode *ip;
3090	off_t diroff;
3091	nlink_t nlink;
3092{
3093	struct jremref *jremref;
3094
3095	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3096	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3097	jremref->jr_state = ATTACHED;
3098	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3099	   nlink, ip->i_mode);
3100	jremref->jr_dirrem = dirrem;
3101
3102	return (jremref);
3103}
3104
3105static inline void
3106newinoref(inoref, ino, parent, diroff, nlink, mode)
3107	struct inoref *inoref;
3108	ino_t ino;
3109	ino_t parent;
3110	off_t diroff;
3111	nlink_t nlink;
3112	uint16_t mode;
3113{
3114
3115	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3116	inoref->if_diroff = diroff;
3117	inoref->if_ino = ino;
3118	inoref->if_parent = parent;
3119	inoref->if_nlink = nlink;
3120	inoref->if_mode = mode;
3121}
3122
3123/*
3124 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3125 * directory offset may not be known until later.  The caller is responsible
3126 * adding the entry to the journal when this information is available.  nlink
3127 * should be the link count prior to the addition and mode is only required
3128 * to have the correct FMT.
3129 */
3130static struct jaddref *
3131newjaddref(dp, ino, diroff, nlink, mode)
3132	struct inode *dp;
3133	ino_t ino;
3134	off_t diroff;
3135	int16_t nlink;
3136	uint16_t mode;
3137{
3138	struct jaddref *jaddref;
3139
3140	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3141	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3142	jaddref->ja_state = ATTACHED;
3143	jaddref->ja_mkdir = NULL;
3144	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3145
3146	return (jaddref);
3147}
3148
3149/*
3150 * Create a new free dependency for a freework.  The caller is responsible
3151 * for adjusting the reference count when it has the lock held.  The freedep
3152 * will track an outstanding bitmap write that will ultimately clear the
3153 * freework to continue.
3154 */
3155static struct freedep *
3156newfreedep(struct freework *freework)
3157{
3158	struct freedep *freedep;
3159
3160	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3161	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3162	freedep->fd_freework = freework;
3163
3164	return (freedep);
3165}
3166
3167/*
3168 * Free a freedep structure once the buffer it is linked to is written.  If
3169 * this is the last reference to the freework schedule it for completion.
3170 */
3171static void
3172free_freedep(freedep)
3173	struct freedep *freedep;
3174{
3175
3176	if (--freedep->fd_freework->fw_ref == 0)
3177		add_to_worklist(&freedep->fd_freework->fw_list, 1);
3178	WORKITEM_FREE(freedep, D_FREEDEP);
3179}
3180
3181/*
3182 * Allocate a new freework structure that may be a level in an indirect
3183 * when parent is not NULL or a top level block when it is.  The top level
3184 * freework structures are allocated without lk held and before the freeblks
3185 * is visible outside of softdep_setup_freeblocks().
3186 */
3187static struct freework *
3188newfreework(freeblks, parent, lbn, nb, frags, journal)
3189	struct freeblks *freeblks;
3190	struct freework *parent;
3191	ufs_lbn_t lbn;
3192	ufs2_daddr_t nb;
3193	int frags;
3194	int journal;
3195{
3196	struct freework *freework;
3197
3198	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3199	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3200	freework->fw_freeblks = freeblks;
3201	freework->fw_parent = parent;
3202	freework->fw_lbn = lbn;
3203	freework->fw_blkno = nb;
3204	freework->fw_frags = frags;
3205	freework->fw_ref = 0;
3206	freework->fw_off = 0;
3207	LIST_INIT(&freework->fw_jwork);
3208
3209	if (parent == NULL) {
3210		WORKLIST_INSERT_UNLOCKED(&freeblks->fb_freeworkhd,
3211		    &freework->fw_list);
3212		freeblks->fb_ref++;
3213	}
3214	if (journal)
3215		newjfreeblk(freeblks, lbn, nb, frags);
3216
3217	return (freework);
3218}
3219
3220/*
3221 * Allocate a new jfreeblk to journal top level block pointer when truncating
3222 * a file.  The caller must add this to the worklist when lk is held.
3223 */
3224static struct jfreeblk *
3225newjfreeblk(freeblks, lbn, blkno, frags)
3226	struct freeblks *freeblks;
3227	ufs_lbn_t lbn;
3228	ufs2_daddr_t blkno;
3229	int frags;
3230{
3231	struct jfreeblk *jfreeblk;
3232
3233	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
3234	workitem_alloc(&jfreeblk->jf_list, D_JFREEBLK, freeblks->fb_list.wk_mp);
3235	jfreeblk->jf_jsegdep = newjsegdep(&jfreeblk->jf_list);
3236	jfreeblk->jf_state = ATTACHED | DEPCOMPLETE;
3237	jfreeblk->jf_ino = freeblks->fb_previousinum;
3238	jfreeblk->jf_lbn = lbn;
3239	jfreeblk->jf_blkno = blkno;
3240	jfreeblk->jf_frags = frags;
3241	jfreeblk->jf_freeblks = freeblks;
3242	LIST_INSERT_HEAD(&freeblks->fb_jfreeblkhd, jfreeblk, jf_deps);
3243
3244	return (jfreeblk);
3245}
3246
3247static void move_newblock_dep(struct jaddref *, struct inodedep *);
3248/*
3249 * If we're canceling a new bitmap we have to search for another ref
3250 * to move into the bmsafemap dep.  This might be better expressed
3251 * with another structure.
3252 */
3253static void
3254move_newblock_dep(jaddref, inodedep)
3255	struct jaddref *jaddref;
3256	struct inodedep *inodedep;
3257{
3258	struct inoref *inoref;
3259	struct jaddref *jaddrefn;
3260
3261	jaddrefn = NULL;
3262	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3263	    inoref = TAILQ_NEXT(inoref, if_deps)) {
3264		if ((jaddref->ja_state & NEWBLOCK) &&
3265		    inoref->if_list.wk_type == D_JADDREF) {
3266			jaddrefn = (struct jaddref *)inoref;
3267			break;
3268		}
3269	}
3270	if (jaddrefn == NULL)
3271		return;
3272	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
3273	jaddrefn->ja_state |= jaddref->ja_state &
3274	    (ATTACHED | UNDONE | NEWBLOCK);
3275	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
3276	jaddref->ja_state |= ATTACHED;
3277	LIST_REMOVE(jaddref, ja_bmdeps);
3278	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
3279	    ja_bmdeps);
3280}
3281
3282/*
3283 * Cancel a jaddref either before it has been written or while it is being
3284 * written.  This happens when a link is removed before the add reaches
3285 * the disk.  The jaddref dependency is kept linked into the bmsafemap
3286 * and inode to prevent the link count or bitmap from reaching the disk
3287 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
3288 * required.
3289 *
3290 * Returns 1 if the canceled addref requires journaling of the remove and
3291 * 0 otherwise.
3292 */
3293static int
3294cancel_jaddref(jaddref, inodedep, wkhd)
3295	struct jaddref *jaddref;
3296	struct inodedep *inodedep;
3297	struct workhead *wkhd;
3298{
3299	struct inoref *inoref;
3300	struct jsegdep *jsegdep;
3301	int needsj;
3302
3303	KASSERT((jaddref->ja_state & COMPLETE) == 0,
3304	    ("cancel_jaddref: Canceling complete jaddref"));
3305	if (jaddref->ja_state & (IOSTARTED | COMPLETE))
3306		needsj = 1;
3307	else
3308		needsj = 0;
3309	if (inodedep == NULL)
3310		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3311		    0, &inodedep) == 0)
3312			panic("cancel_jaddref: Lost inodedep");
3313	/*
3314	 * We must adjust the nlink of any reference operation that follows
3315	 * us so that it is consistent with the in-memory reference.  This
3316	 * ensures that inode nlink rollbacks always have the correct link.
3317	 */
3318	if (needsj == 0)
3319		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3320		    inoref = TAILQ_NEXT(inoref, if_deps))
3321			inoref->if_nlink--;
3322	jsegdep = inoref_jseg(&jaddref->ja_ref);
3323	if (jaddref->ja_state & NEWBLOCK)
3324		move_newblock_dep(jaddref, inodedep);
3325	if (jaddref->ja_state & IOWAITING) {
3326		jaddref->ja_state &= ~IOWAITING;
3327		wakeup(&jaddref->ja_list);
3328	}
3329	jaddref->ja_mkdir = NULL;
3330	if (jaddref->ja_state & IOSTARTED) {
3331		jaddref->ja_state &= ~IOSTARTED;
3332		WORKLIST_REMOVE(&jaddref->ja_list);
3333		WORKLIST_INSERT(wkhd, &jsegdep->jd_list);
3334	} else {
3335		free_jsegdep(jsegdep);
3336		remove_from_journal(&jaddref->ja_list);
3337	}
3338	/*
3339	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
3340	 * can arrange for them to be freed with the bitmap.  Otherwise we
3341	 * no longer need this addref attached to the inoreflst and it
3342	 * will incorrectly adjust nlink if we leave it.
3343	 */
3344	if ((jaddref->ja_state & NEWBLOCK) == 0) {
3345		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
3346		    if_deps);
3347		jaddref->ja_state |= COMPLETE;
3348		free_jaddref(jaddref);
3349		return (needsj);
3350	}
3351	jaddref->ja_state |= GOINGAWAY;
3352	/*
3353	 * Leave the head of the list for jsegdeps for fast merging.
3354	 */
3355	if (LIST_FIRST(wkhd) != NULL) {
3356		jaddref->ja_state |= ONWORKLIST;
3357		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
3358	} else
3359		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
3360
3361	return (needsj);
3362}
3363
3364/*
3365 * Attempt to free a jaddref structure when some work completes.  This
3366 * should only succeed once the entry is written and all dependencies have
3367 * been notified.
3368 */
3369static void
3370free_jaddref(jaddref)
3371	struct jaddref *jaddref;
3372{
3373
3374	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
3375		return;
3376	if (jaddref->ja_ref.if_jsegdep)
3377		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
3378		    jaddref, jaddref->ja_state);
3379	if (jaddref->ja_state & NEWBLOCK)
3380		LIST_REMOVE(jaddref, ja_bmdeps);
3381	if (jaddref->ja_state & (IOSTARTED | ONWORKLIST))
3382		panic("free_jaddref: Bad state %p(0x%X)",
3383		    jaddref, jaddref->ja_state);
3384	if (jaddref->ja_mkdir != NULL)
3385		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
3386	WORKITEM_FREE(jaddref, D_JADDREF);
3387}
3388
3389/*
3390 * Free a jremref structure once it has been written or discarded.
3391 */
3392static void
3393free_jremref(jremref)
3394	struct jremref *jremref;
3395{
3396
3397	if (jremref->jr_ref.if_jsegdep)
3398		free_jsegdep(jremref->jr_ref.if_jsegdep);
3399	if (jremref->jr_state & IOSTARTED)
3400		panic("free_jremref: IO still pending");
3401	WORKITEM_FREE(jremref, D_JREMREF);
3402}
3403
3404/*
3405 * Free a jnewblk structure.
3406 */
3407static void
3408free_jnewblk(jnewblk)
3409	struct jnewblk *jnewblk;
3410{
3411
3412	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
3413		return;
3414	LIST_REMOVE(jnewblk, jn_deps);
3415	if (jnewblk->jn_newblk != NULL)
3416		panic("free_jnewblk: Dependency still attached.");
3417	WORKITEM_FREE(jnewblk, D_JNEWBLK);
3418}
3419
3420/*
3421 * Cancel a jnewblk which has been superseded by a freeblk.  The jnewblk
3422 * is kept linked into the bmsafemap until the free completes, thus
3423 * preventing the modified state from ever reaching disk.  The free
3424 * routine must pass this structure via ffs_blkfree() to
3425 * softdep_setup_freeblks() so there is no race in releasing the space.
3426 */
3427static void
3428cancel_jnewblk(jnewblk, wkhd)
3429	struct jnewblk *jnewblk;
3430	struct workhead *wkhd;
3431{
3432	struct jsegdep *jsegdep;
3433
3434	jsegdep = jnewblk->jn_jsegdep;
3435	jnewblk->jn_jsegdep  = NULL;
3436	free_jsegdep(jsegdep);
3437	jnewblk->jn_newblk = NULL;
3438	jnewblk->jn_state |= GOINGAWAY;
3439	if (jnewblk->jn_state & IOSTARTED) {
3440		jnewblk->jn_state &= ~IOSTARTED;
3441		WORKLIST_REMOVE(&jnewblk->jn_list);
3442	} else
3443		remove_from_journal(&jnewblk->jn_list);
3444	/*
3445	 * Leave the head of the list for jsegdeps for fast merging.
3446	 */
3447	if (LIST_FIRST(wkhd) != NULL) {
3448		jnewblk->jn_state |= ONWORKLIST;
3449		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jnewblk->jn_list, wk_list);
3450	} else
3451		WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
3452	if (jnewblk->jn_state & IOWAITING) {
3453		jnewblk->jn_state &= ~IOWAITING;
3454		wakeup(&jnewblk->jn_list);
3455	}
3456}
3457
3458static void
3459free_jfreeblk(jfreeblk)
3460	struct jfreeblk *jfreeblk;
3461{
3462
3463	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
3464}
3465
3466/*
3467 * Release one reference to a jseg and free it if the count reaches 0.  This
3468 * should eventually reclaim journal space as well.
3469 */
3470static void
3471free_jseg(jseg)
3472	struct jseg *jseg;
3473{
3474	struct jblocks *jblocks;
3475
3476	KASSERT(jseg->js_refs > 0,
3477	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
3478	if (--jseg->js_refs != 0)
3479		return;
3480	/*
3481	 * Free only those jsegs which have none allocated before them to
3482	 * preserve the journal space ordering.
3483	 */
3484	jblocks = jseg->js_jblocks;
3485	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
3486		jblocks->jb_oldestseq = jseg->js_seq;
3487		if (jseg->js_refs != 0)
3488			break;
3489		TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
3490		jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
3491		KASSERT(LIST_EMPTY(&jseg->js_entries),
3492		    ("free_jseg: Freed jseg has valid entries."));
3493		WORKITEM_FREE(jseg, D_JSEG);
3494	}
3495}
3496
3497/*
3498 * Release a jsegdep and decrement the jseg count.
3499 */
3500static void
3501free_jsegdep(jsegdep)
3502	struct jsegdep *jsegdep;
3503{
3504
3505	if (jsegdep->jd_seg)
3506		free_jseg(jsegdep->jd_seg);
3507	WORKITEM_FREE(jsegdep, D_JSEGDEP);
3508}
3509
3510/*
3511 * Wait for a journal item to make it to disk.  Initiate journal processing
3512 * if required.
3513 */
3514static void
3515jwait(wk)
3516	struct worklist *wk;
3517{
3518
3519	stat_journal_wait++;
3520	/*
3521	 * If IO has not started we process the journal.  We can't mark the
3522	 * worklist item as IOWAITING because we drop the lock while
3523	 * processing the journal and the worklist entry may be freed after
3524	 * this point.  The caller may call back in and re-issue the request.
3525	 */
3526	if ((wk->wk_state & IOSTARTED) == 0) {
3527		softdep_process_journal(wk->wk_mp, MNT_WAIT);
3528		return;
3529	}
3530	wk->wk_state |= IOWAITING;
3531	msleep(wk, &lk, PRIBIO, "jwait", 0);
3532}
3533
3534/*
3535 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
3536 * appropriate.  This is a convenience function to reduce duplicate code
3537 * for the setup and revert functions below.
3538 */
3539static struct inodedep *
3540inodedep_lookup_ip(ip)
3541	struct inode *ip;
3542{
3543	struct inodedep *inodedep;
3544
3545	KASSERT(ip->i_nlink >= ip->i_effnlink,
3546	    ("inodedep_lookup_ip: bad delta"));
3547	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
3548	    DEPALLOC, &inodedep);
3549	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3550
3551	return (inodedep);
3552}
3553
3554/*
3555 * Create a journal entry that describes a truncate that we're about to
3556 * perform.  The inode allocations and frees between here and the completion
3557 * of the operation are done asynchronously and without journaling.  At
3558 * the end of the operation the vnode is sync'd and the journal space
3559 * is released.  Recovery will discover the partially completed truncate
3560 * and complete it.
3561 */
3562void *
3563softdep_setup_trunc(vp, length, flags)
3564	struct vnode *vp;
3565	off_t length;
3566	int flags;
3567{
3568	struct jsegdep *jsegdep;
3569	struct jtrunc *jtrunc;
3570	struct ufsmount *ump;
3571	struct inode *ip;
3572
3573	softdep_prealloc(vp, MNT_WAIT);
3574	ip = VTOI(vp);
3575	ump = VFSTOUFS(vp->v_mount);
3576	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
3577	workitem_alloc(&jtrunc->jt_list, D_JTRUNC, vp->v_mount);
3578	jsegdep = jtrunc->jt_jsegdep = newjsegdep(&jtrunc->jt_list);
3579	jtrunc->jt_ino = ip->i_number;
3580	jtrunc->jt_extsize = 0;
3581	jtrunc->jt_size = length;
3582	if ((flags & IO_EXT) == 0 && ump->um_fstype == UFS2)
3583		jtrunc->jt_extsize = ip->i_din2->di_extsize;
3584	if ((flags & IO_NORMAL) == 0)
3585		jtrunc->jt_size = DIP(ip, i_size);
3586	ACQUIRE_LOCK(&lk);
3587	add_to_journal(&jtrunc->jt_list);
3588	while (jsegdep->jd_seg == NULL) {
3589		stat_jwait_freeblks++;
3590		jwait(&jtrunc->jt_list);
3591	}
3592	FREE_LOCK(&lk);
3593
3594	return (jsegdep);
3595}
3596
3597/*
3598 * After synchronous truncation is complete we free sync the vnode and
3599 * release the jsegdep so the journal space can be freed.
3600 */
3601int
3602softdep_complete_trunc(vp, cookie)
3603	struct vnode *vp;
3604	void *cookie;
3605{
3606	int error;
3607
3608	error = ffs_syncvnode(vp, MNT_WAIT);
3609	ACQUIRE_LOCK(&lk);
3610	free_jsegdep((struct jsegdep *)cookie);
3611	FREE_LOCK(&lk);
3612
3613	return (error);
3614}
3615
3616/*
3617 * Called prior to creating a new inode and linking it to a directory.  The
3618 * jaddref structure must already be allocated by softdep_setup_inomapdep
3619 * and it is discovered here so we can initialize the mode and update
3620 * nlinkdelta.
3621 */
3622void
3623softdep_setup_create(dp, ip)
3624	struct inode *dp;
3625	struct inode *ip;
3626{
3627	struct inodedep *inodedep;
3628	struct jaddref *jaddref;
3629	struct vnode *dvp;
3630
3631	KASSERT(ip->i_nlink == 1,
3632	    ("softdep_setup_create: Invalid link count."));
3633	dvp = ITOV(dp);
3634	ACQUIRE_LOCK(&lk);
3635	inodedep = inodedep_lookup_ip(ip);
3636	if (DOINGSUJ(dvp)) {
3637		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3638		    inoreflst);
3639		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
3640		    ("softdep_setup_create: No addref structure present."));
3641		jaddref->ja_mode = ip->i_mode;
3642	}
3643	softdep_prelink(dvp, NULL);
3644	FREE_LOCK(&lk);
3645}
3646
3647/*
3648 * Create a jaddref structure to track the addition of a DOTDOT link when
3649 * we are reparenting an inode as part of a rename.  This jaddref will be
3650 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
3651 * non-journaling softdep.
3652 */
3653void
3654softdep_setup_dotdot_link(dp, ip)
3655	struct inode *dp;
3656	struct inode *ip;
3657{
3658	struct inodedep *inodedep;
3659	struct jaddref *jaddref;
3660	struct vnode *dvp;
3661	struct vnode *vp;
3662
3663	dvp = ITOV(dp);
3664	vp = ITOV(ip);
3665	jaddref = NULL;
3666	/*
3667	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
3668	 * is used as a normal link would be.
3669	 */
3670	if (DOINGSUJ(dvp))
3671		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
3672		    dp->i_effnlink - 1, dp->i_mode);
3673	ACQUIRE_LOCK(&lk);
3674	inodedep = inodedep_lookup_ip(dp);
3675	if (jaddref)
3676		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
3677		    if_deps);
3678	softdep_prelink(dvp, ITOV(ip));
3679	FREE_LOCK(&lk);
3680}
3681
3682/*
3683 * Create a jaddref structure to track a new link to an inode.  The directory
3684 * offset is not known until softdep_setup_directory_add or
3685 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
3686 * softdep.
3687 */
3688void
3689softdep_setup_link(dp, ip)
3690	struct inode *dp;
3691	struct inode *ip;
3692{
3693	struct inodedep *inodedep;
3694	struct jaddref *jaddref;
3695	struct vnode *dvp;
3696
3697	dvp = ITOV(dp);
3698	jaddref = NULL;
3699	if (DOINGSUJ(dvp))
3700		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
3701		    ip->i_mode);
3702	ACQUIRE_LOCK(&lk);
3703	inodedep = inodedep_lookup_ip(ip);
3704	if (jaddref)
3705		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
3706		    if_deps);
3707	softdep_prelink(dvp, ITOV(ip));
3708	FREE_LOCK(&lk);
3709}
3710
3711/*
3712 * Called to create the jaddref structures to track . and .. references as
3713 * well as lookup and further initialize the incomplete jaddref created
3714 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
3715 * nlinkdelta for non-journaling softdep.
3716 */
3717void
3718softdep_setup_mkdir(dp, ip)
3719	struct inode *dp;
3720	struct inode *ip;
3721{
3722	struct inodedep *inodedep;
3723	struct jaddref *dotdotaddref;
3724	struct jaddref *dotaddref;
3725	struct jaddref *jaddref;
3726	struct vnode *dvp;
3727
3728	dvp = ITOV(dp);
3729	dotaddref = dotdotaddref = NULL;
3730	if (DOINGSUJ(dvp)) {
3731		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
3732		    ip->i_mode);
3733		dotaddref->ja_state |= MKDIR_BODY;
3734		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
3735		    dp->i_effnlink - 1, dp->i_mode);
3736		dotdotaddref->ja_state |= MKDIR_PARENT;
3737	}
3738	ACQUIRE_LOCK(&lk);
3739	inodedep = inodedep_lookup_ip(ip);
3740	if (DOINGSUJ(dvp)) {
3741		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3742		    inoreflst);
3743		KASSERT(jaddref != NULL,
3744		    ("softdep_setup_mkdir: No addref structure present."));
3745		KASSERT(jaddref->ja_parent == dp->i_number,
3746		    ("softdep_setup_mkdir: bad parent %d",
3747		    jaddref->ja_parent));
3748		jaddref->ja_mode = ip->i_mode;
3749		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
3750		    if_deps);
3751	}
3752	inodedep = inodedep_lookup_ip(dp);
3753	if (DOINGSUJ(dvp))
3754		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
3755		    &dotdotaddref->ja_ref, if_deps);
3756	softdep_prelink(ITOV(dp), NULL);
3757	FREE_LOCK(&lk);
3758}
3759
3760/*
3761 * Called to track nlinkdelta of the inode and parent directories prior to
3762 * unlinking a directory.
3763 */
3764void
3765softdep_setup_rmdir(dp, ip)
3766	struct inode *dp;
3767	struct inode *ip;
3768{
3769	struct vnode *dvp;
3770
3771	dvp = ITOV(dp);
3772	ACQUIRE_LOCK(&lk);
3773	(void) inodedep_lookup_ip(ip);
3774	(void) inodedep_lookup_ip(dp);
3775	softdep_prelink(dvp, ITOV(ip));
3776	FREE_LOCK(&lk);
3777}
3778
3779/*
3780 * Called to track nlinkdelta of the inode and parent directories prior to
3781 * unlink.
3782 */
3783void
3784softdep_setup_unlink(dp, ip)
3785	struct inode *dp;
3786	struct inode *ip;
3787{
3788	struct vnode *dvp;
3789
3790	dvp = ITOV(dp);
3791	ACQUIRE_LOCK(&lk);
3792	(void) inodedep_lookup_ip(ip);
3793	(void) inodedep_lookup_ip(dp);
3794	softdep_prelink(dvp, ITOV(ip));
3795	FREE_LOCK(&lk);
3796}
3797
3798/*
3799 * Called to release the journal structures created by a failed non-directory
3800 * creation.  Adjusts nlinkdelta for non-journaling softdep.
3801 */
3802void
3803softdep_revert_create(dp, ip)
3804	struct inode *dp;
3805	struct inode *ip;
3806{
3807	struct inodedep *inodedep;
3808	struct jaddref *jaddref;
3809	struct vnode *dvp;
3810
3811	dvp = ITOV(dp);
3812	ACQUIRE_LOCK(&lk);
3813	inodedep = inodedep_lookup_ip(ip);
3814	if (DOINGSUJ(dvp)) {
3815		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3816		    inoreflst);
3817		KASSERT(jaddref->ja_parent == dp->i_number,
3818		    ("softdep_revert_create: addref parent mismatch"));
3819		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
3820	}
3821	FREE_LOCK(&lk);
3822}
3823
3824/*
3825 * Called to release the journal structures created by a failed dotdot link
3826 * creation.  Adjusts nlinkdelta for non-journaling softdep.
3827 */
3828void
3829softdep_revert_dotdot_link(dp, ip)
3830	struct inode *dp;
3831	struct inode *ip;
3832{
3833	struct inodedep *inodedep;
3834	struct jaddref *jaddref;
3835	struct vnode *dvp;
3836
3837	dvp = ITOV(dp);
3838	ACQUIRE_LOCK(&lk);
3839	inodedep = inodedep_lookup_ip(dp);
3840	if (DOINGSUJ(dvp)) {
3841		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3842		    inoreflst);
3843		KASSERT(jaddref->ja_parent == ip->i_number,
3844		    ("softdep_revert_dotdot_link: addref parent mismatch"));
3845		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
3846	}
3847	FREE_LOCK(&lk);
3848}
3849
3850/*
3851 * Called to release the journal structures created by a failed link
3852 * addition.  Adjusts nlinkdelta for non-journaling softdep.
3853 */
3854void
3855softdep_revert_link(dp, ip)
3856	struct inode *dp;
3857	struct inode *ip;
3858{
3859	struct inodedep *inodedep;
3860	struct jaddref *jaddref;
3861	struct vnode *dvp;
3862
3863	dvp = ITOV(dp);
3864	ACQUIRE_LOCK(&lk);
3865	inodedep = inodedep_lookup_ip(ip);
3866	if (DOINGSUJ(dvp)) {
3867		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3868		    inoreflst);
3869		KASSERT(jaddref->ja_parent == dp->i_number,
3870		    ("softdep_revert_link: addref parent mismatch"));
3871		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
3872	}
3873	FREE_LOCK(&lk);
3874}
3875
3876/*
3877 * Called to release the journal structures created by a failed mkdir
3878 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
3879 */
3880void
3881softdep_revert_mkdir(dp, ip)
3882	struct inode *dp;
3883	struct inode *ip;
3884{
3885	struct inodedep *inodedep;
3886	struct jaddref *jaddref;
3887	struct vnode *dvp;
3888
3889	dvp = ITOV(dp);
3890
3891	ACQUIRE_LOCK(&lk);
3892	inodedep = inodedep_lookup_ip(dp);
3893	if (DOINGSUJ(dvp)) {
3894		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3895		    inoreflst);
3896		KASSERT(jaddref->ja_parent == ip->i_number,
3897		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
3898		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
3899	}
3900	inodedep = inodedep_lookup_ip(ip);
3901	if (DOINGSUJ(dvp)) {
3902		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3903		    inoreflst);
3904		KASSERT(jaddref->ja_parent == dp->i_number,
3905		    ("softdep_revert_mkdir: addref parent mismatch"));
3906		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
3907		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
3908		    inoreflst);
3909		KASSERT(jaddref->ja_parent == ip->i_number,
3910		    ("softdep_revert_mkdir: dot addref parent mismatch"));
3911		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
3912	}
3913	FREE_LOCK(&lk);
3914}
3915
3916/*
3917 * Called to correct nlinkdelta after a failed rmdir.
3918 */
3919void
3920softdep_revert_rmdir(dp, ip)
3921	struct inode *dp;
3922	struct inode *ip;
3923{
3924
3925	ACQUIRE_LOCK(&lk);
3926	(void) inodedep_lookup_ip(ip);
3927	(void) inodedep_lookup_ip(dp);
3928	FREE_LOCK(&lk);
3929}
3930
3931/*
3932 * Protecting the freemaps (or bitmaps).
3933 *
3934 * To eliminate the need to execute fsck before mounting a filesystem
3935 * after a power failure, one must (conservatively) guarantee that the
3936 * on-disk copy of the bitmaps never indicate that a live inode or block is
3937 * free.  So, when a block or inode is allocated, the bitmap should be
3938 * updated (on disk) before any new pointers.  When a block or inode is
3939 * freed, the bitmap should not be updated until all pointers have been
3940 * reset.  The latter dependency is handled by the delayed de-allocation
3941 * approach described below for block and inode de-allocation.  The former
3942 * dependency is handled by calling the following procedure when a block or
3943 * inode is allocated. When an inode is allocated an "inodedep" is created
3944 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
3945 * Each "inodedep" is also inserted into the hash indexing structure so
3946 * that any additional link additions can be made dependent on the inode
3947 * allocation.
3948 *
3949 * The ufs filesystem maintains a number of free block counts (e.g., per
3950 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
3951 * in addition to the bitmaps.  These counts are used to improve efficiency
3952 * during allocation and therefore must be consistent with the bitmaps.
3953 * There is no convenient way to guarantee post-crash consistency of these
3954 * counts with simple update ordering, for two main reasons: (1) The counts
3955 * and bitmaps for a single cylinder group block are not in the same disk
3956 * sector.  If a disk write is interrupted (e.g., by power failure), one may
3957 * be written and the other not.  (2) Some of the counts are located in the
3958 * superblock rather than the cylinder group block. So, we focus our soft
3959 * updates implementation on protecting the bitmaps. When mounting a
3960 * filesystem, we recompute the auxiliary counts from the bitmaps.
3961 */
3962
3963/*
3964 * Called just after updating the cylinder group block to allocate an inode.
3965 */
3966void
3967softdep_setup_inomapdep(bp, ip, newinum)
3968	struct buf *bp;		/* buffer for cylgroup block with inode map */
3969	struct inode *ip;	/* inode related to allocation */
3970	ino_t newinum;		/* new inode number being allocated */
3971{
3972	struct inodedep *inodedep;
3973	struct bmsafemap *bmsafemap;
3974	struct jaddref *jaddref;
3975	struct mount *mp;
3976	struct fs *fs;
3977
3978	mp = UFSTOVFS(ip->i_ump);
3979	fs = ip->i_ump->um_fs;
3980	jaddref = NULL;
3981
3982	/*
3983	 * Allocate the journal reference add structure so that the bitmap
3984	 * can be dependent on it.
3985	 */
3986	if (mp->mnt_kern_flag & MNTK_SUJ) {
3987		jaddref = newjaddref(ip, newinum, 0, 0, 0);
3988		jaddref->ja_state |= NEWBLOCK;
3989	}
3990
3991	/*
3992	 * Create a dependency for the newly allocated inode.
3993	 * Panic if it already exists as something is seriously wrong.
3994	 * Otherwise add it to the dependency list for the buffer holding
3995	 * the cylinder group map from which it was allocated.
3996	 */
3997	ACQUIRE_LOCK(&lk);
3998	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
3999		panic("softdep_setup_inomapdep: dependency %p for new"
4000		    "inode already exists", inodedep);
4001	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
4002	if (jaddref) {
4003		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4004		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4005		    if_deps);
4006	} else {
4007		inodedep->id_state |= ONDEPLIST;
4008		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4009	}
4010	inodedep->id_bmsafemap = bmsafemap;
4011	inodedep->id_state &= ~DEPCOMPLETE;
4012	FREE_LOCK(&lk);
4013}
4014
4015/*
4016 * Called just after updating the cylinder group block to
4017 * allocate block or fragment.
4018 */
4019void
4020softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4021	struct buf *bp;		/* buffer for cylgroup block with block map */
4022	struct mount *mp;	/* filesystem doing allocation */
4023	ufs2_daddr_t newblkno;	/* number of newly allocated block */
4024	int frags;		/* Number of fragments. */
4025	int oldfrags;		/* Previous number of fragments for extend. */
4026{
4027	struct newblk *newblk;
4028	struct bmsafemap *bmsafemap;
4029	struct jnewblk *jnewblk;
4030	struct fs *fs;
4031
4032	fs = VFSTOUFS(mp)->um_fs;
4033	jnewblk = NULL;
4034	/*
4035	 * Create a dependency for the newly allocated block.
4036	 * Add it to the dependency list for the buffer holding
4037	 * the cylinder group map from which it was allocated.
4038	 */
4039	if (mp->mnt_kern_flag & MNTK_SUJ) {
4040		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4041		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4042		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4043		jnewblk->jn_state = ATTACHED;
4044		jnewblk->jn_blkno = newblkno;
4045		jnewblk->jn_frags = frags;
4046		jnewblk->jn_oldfrags = oldfrags;
4047#ifdef SUJ_DEBUG
4048		{
4049			struct cg *cgp;
4050			uint8_t *blksfree;
4051			long bno;
4052			int i;
4053
4054			cgp = (struct cg *)bp->b_data;
4055			blksfree = cg_blksfree(cgp);
4056			bno = dtogd(fs, jnewblk->jn_blkno);
4057			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4058			    i++) {
4059				if (isset(blksfree, bno + i))
4060					panic("softdep_setup_blkmapdep: "
4061					    "free fragment %d from %d-%d "
4062					    "state 0x%X dep %p", i,
4063					    jnewblk->jn_oldfrags,
4064					    jnewblk->jn_frags,
4065					    jnewblk->jn_state,
4066					    jnewblk->jn_newblk);
4067			}
4068		}
4069#endif
4070	}
4071	ACQUIRE_LOCK(&lk);
4072	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4073		panic("softdep_setup_blkmapdep: found block");
4074	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4075	    dtog(fs, newblkno));
4076	if (jnewblk) {
4077		jnewblk->jn_newblk = newblk;
4078		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4079	} else {
4080		newblk->nb_state |= ONDEPLIST;
4081		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4082	}
4083	newblk->nb_bmsafemap = bmsafemap;
4084	newblk->nb_jnewblk = jnewblk;
4085	FREE_LOCK(&lk);
4086}
4087
4088#define	BMSAFEMAP_HASH(fs, cg) \
4089      (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash])
4090
4091static int
4092bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp)
4093	struct bmsafemap_hashhead *bmsafemaphd;
4094	struct mount *mp;
4095	int cg;
4096	struct bmsafemap **bmsafemapp;
4097{
4098	struct bmsafemap *bmsafemap;
4099
4100	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4101		if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg)
4102			break;
4103	if (bmsafemap) {
4104		*bmsafemapp = bmsafemap;
4105		return (1);
4106	}
4107	*bmsafemapp = NULL;
4108
4109	return (0);
4110}
4111
4112/*
4113 * Find the bmsafemap associated with a cylinder group buffer.
4114 * If none exists, create one. The buffer must be locked when
4115 * this routine is called and this routine must be called with
4116 * splbio interrupts blocked.
4117 */
4118static struct bmsafemap *
4119bmsafemap_lookup(mp, bp, cg)
4120	struct mount *mp;
4121	struct buf *bp;
4122	int cg;
4123{
4124	struct bmsafemap_hashhead *bmsafemaphd;
4125	struct bmsafemap *bmsafemap, *collision;
4126	struct worklist *wk;
4127	struct fs *fs;
4128
4129	mtx_assert(&lk, MA_OWNED);
4130	if (bp)
4131		LIST_FOREACH(wk, &bp->b_dep, wk_list)
4132			if (wk->wk_type == D_BMSAFEMAP)
4133				return (WK_BMSAFEMAP(wk));
4134	fs = VFSTOUFS(mp)->um_fs;
4135	bmsafemaphd = BMSAFEMAP_HASH(fs, cg);
4136	if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1)
4137		return (bmsafemap);
4138	FREE_LOCK(&lk);
4139	bmsafemap = malloc(sizeof(struct bmsafemap),
4140		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4141	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4142	bmsafemap->sm_buf = bp;
4143	LIST_INIT(&bmsafemap->sm_inodedephd);
4144	LIST_INIT(&bmsafemap->sm_inodedepwr);
4145	LIST_INIT(&bmsafemap->sm_newblkhd);
4146	LIST_INIT(&bmsafemap->sm_newblkwr);
4147	LIST_INIT(&bmsafemap->sm_jaddrefhd);
4148	LIST_INIT(&bmsafemap->sm_jnewblkhd);
4149	ACQUIRE_LOCK(&lk);
4150	if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) {
4151		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4152		return (collision);
4153	}
4154	bmsafemap->sm_cg = cg;
4155	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
4156	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
4157	return (bmsafemap);
4158}
4159
4160/*
4161 * Direct block allocation dependencies.
4162 *
4163 * When a new block is allocated, the corresponding disk locations must be
4164 * initialized (with zeros or new data) before the on-disk inode points to
4165 * them.  Also, the freemap from which the block was allocated must be
4166 * updated (on disk) before the inode's pointer. These two dependencies are
4167 * independent of each other and are needed for all file blocks and indirect
4168 * blocks that are pointed to directly by the inode.  Just before the
4169 * "in-core" version of the inode is updated with a newly allocated block
4170 * number, a procedure (below) is called to setup allocation dependency
4171 * structures.  These structures are removed when the corresponding
4172 * dependencies are satisfied or when the block allocation becomes obsolete
4173 * (i.e., the file is deleted, the block is de-allocated, or the block is a
4174 * fragment that gets upgraded).  All of these cases are handled in
4175 * procedures described later.
4176 *
4177 * When a file extension causes a fragment to be upgraded, either to a larger
4178 * fragment or to a full block, the on-disk location may change (if the
4179 * previous fragment could not simply be extended). In this case, the old
4180 * fragment must be de-allocated, but not until after the inode's pointer has
4181 * been updated. In most cases, this is handled by later procedures, which
4182 * will construct a "freefrag" structure to be added to the workitem queue
4183 * when the inode update is complete (or obsolete).  The main exception to
4184 * this is when an allocation occurs while a pending allocation dependency
4185 * (for the same block pointer) remains.  This case is handled in the main
4186 * allocation dependency setup procedure by immediately freeing the
4187 * unreferenced fragments.
4188 */
4189void
4190softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4191	struct inode *ip;	/* inode to which block is being added */
4192	ufs_lbn_t off;		/* block pointer within inode */
4193	ufs2_daddr_t newblkno;	/* disk block number being added */
4194	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
4195	long newsize;		/* size of new block */
4196	long oldsize;		/* size of new block */
4197	struct buf *bp;		/* bp for allocated block */
4198{
4199	struct allocdirect *adp, *oldadp;
4200	struct allocdirectlst *adphead;
4201	struct freefrag *freefrag;
4202	struct inodedep *inodedep;
4203	struct pagedep *pagedep;
4204	struct jnewblk *jnewblk;
4205	struct newblk *newblk;
4206	struct mount *mp;
4207	ufs_lbn_t lbn;
4208
4209	lbn = bp->b_lblkno;
4210	mp = UFSTOVFS(ip->i_ump);
4211	if (oldblkno && oldblkno != newblkno)
4212		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4213	else
4214		freefrag = NULL;
4215
4216	ACQUIRE_LOCK(&lk);
4217	if (off >= NDADDR) {
4218		if (lbn > 0)
4219			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4220			    lbn, off);
4221		/* allocating an indirect block */
4222		if (oldblkno != 0)
4223			panic("softdep_setup_allocdirect: non-zero indir");
4224	} else {
4225		if (off != lbn)
4226			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4227			    lbn, off);
4228		/*
4229		 * Allocating a direct block.
4230		 *
4231		 * If we are allocating a directory block, then we must
4232		 * allocate an associated pagedep to track additions and
4233		 * deletions.
4234		 */
4235		if ((ip->i_mode & IFMT) == IFDIR &&
4236		    pagedep_lookup(mp, ip->i_number, off, DEPALLOC,
4237		    &pagedep) == 0)
4238			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
4239	}
4240	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4241		panic("softdep_setup_allocdirect: lost block");
4242	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4243	    ("softdep_setup_allocdirect: newblk already initialized"));
4244	/*
4245	 * Convert the newblk to an allocdirect.
4246	 */
4247	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4248	adp = (struct allocdirect *)newblk;
4249	newblk->nb_freefrag = freefrag;
4250	adp->ad_offset = off;
4251	adp->ad_oldblkno = oldblkno;
4252	adp->ad_newsize = newsize;
4253	adp->ad_oldsize = oldsize;
4254
4255	/*
4256	 * Finish initializing the journal.
4257	 */
4258	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4259		jnewblk->jn_ino = ip->i_number;
4260		jnewblk->jn_lbn = lbn;
4261		add_to_journal(&jnewblk->jn_list);
4262	}
4263	if (freefrag && freefrag->ff_jfreefrag != NULL)
4264		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4265	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4266	adp->ad_inodedep = inodedep;
4267
4268	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4269	/*
4270	 * The list of allocdirects must be kept in sorted and ascending
4271	 * order so that the rollback routines can quickly determine the
4272	 * first uncommitted block (the size of the file stored on disk
4273	 * ends at the end of the lowest committed fragment, or if there
4274	 * are no fragments, at the end of the highest committed block).
4275	 * Since files generally grow, the typical case is that the new
4276	 * block is to be added at the end of the list. We speed this
4277	 * special case by checking against the last allocdirect in the
4278	 * list before laboriously traversing the list looking for the
4279	 * insertion point.
4280	 */
4281	adphead = &inodedep->id_newinoupdt;
4282	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4283	if (oldadp == NULL || oldadp->ad_offset <= off) {
4284		/* insert at end of list */
4285		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4286		if (oldadp != NULL && oldadp->ad_offset == off)
4287			allocdirect_merge(adphead, adp, oldadp);
4288		FREE_LOCK(&lk);
4289		return;
4290	}
4291	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4292		if (oldadp->ad_offset >= off)
4293			break;
4294	}
4295	if (oldadp == NULL)
4296		panic("softdep_setup_allocdirect: lost entry");
4297	/* insert in middle of list */
4298	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4299	if (oldadp->ad_offset == off)
4300		allocdirect_merge(adphead, adp, oldadp);
4301
4302	FREE_LOCK(&lk);
4303}
4304
4305/*
4306 * Replace an old allocdirect dependency with a newer one.
4307 * This routine must be called with splbio interrupts blocked.
4308 */
4309static void
4310allocdirect_merge(adphead, newadp, oldadp)
4311	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
4312	struct allocdirect *newadp;	/* allocdirect being added */
4313	struct allocdirect *oldadp;	/* existing allocdirect being checked */
4314{
4315	struct worklist *wk;
4316	struct freefrag *freefrag;
4317	struct newdirblk *newdirblk;
4318
4319	freefrag = NULL;
4320	mtx_assert(&lk, MA_OWNED);
4321	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
4322	    newadp->ad_oldsize != oldadp->ad_newsize ||
4323	    newadp->ad_offset >= NDADDR)
4324		panic("%s %jd != new %jd || old size %ld != new %ld",
4325		    "allocdirect_merge: old blkno",
4326		    (intmax_t)newadp->ad_oldblkno,
4327		    (intmax_t)oldadp->ad_newblkno,
4328		    newadp->ad_oldsize, oldadp->ad_newsize);
4329	newadp->ad_oldblkno = oldadp->ad_oldblkno;
4330	newadp->ad_oldsize = oldadp->ad_oldsize;
4331	/*
4332	 * If the old dependency had a fragment to free or had never
4333	 * previously had a block allocated, then the new dependency
4334	 * can immediately post its freefrag and adopt the old freefrag.
4335	 * This action is done by swapping the freefrag dependencies.
4336	 * The new dependency gains the old one's freefrag, and the
4337	 * old one gets the new one and then immediately puts it on
4338	 * the worklist when it is freed by free_newblk. It is
4339	 * not possible to do this swap when the old dependency had a
4340	 * non-zero size but no previous fragment to free. This condition
4341	 * arises when the new block is an extension of the old block.
4342	 * Here, the first part of the fragment allocated to the new
4343	 * dependency is part of the block currently claimed on disk by
4344	 * the old dependency, so cannot legitimately be freed until the
4345	 * conditions for the new dependency are fulfilled.
4346	 */
4347	freefrag = newadp->ad_freefrag;
4348	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
4349		newadp->ad_freefrag = oldadp->ad_freefrag;
4350		oldadp->ad_freefrag = freefrag;
4351	}
4352	/*
4353	 * If we are tracking a new directory-block allocation,
4354	 * move it from the old allocdirect to the new allocdirect.
4355	 */
4356	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
4357		newdirblk = WK_NEWDIRBLK(wk);
4358		WORKLIST_REMOVE(&newdirblk->db_list);
4359		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
4360			panic("allocdirect_merge: extra newdirblk");
4361		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
4362	}
4363	TAILQ_REMOVE(adphead, oldadp, ad_next);
4364	/*
4365	 * We need to move any journal dependencies over to the freefrag
4366	 * that releases this block if it exists.  Otherwise we are
4367	 * extending an existing block and we'll wait until that is
4368	 * complete to release the journal space and extend the
4369	 * new journal to cover this old space as well.
4370	 */
4371	if (freefrag == NULL) {
4372		struct jnewblk *jnewblk;
4373		struct jnewblk *njnewblk;
4374
4375		if (oldadp->ad_newblkno != newadp->ad_newblkno)
4376			panic("allocdirect_merge: %jd != %jd",
4377			    oldadp->ad_newblkno, newadp->ad_newblkno);
4378		jnewblk = oldadp->ad_block.nb_jnewblk;
4379		cancel_newblk(&oldadp->ad_block, &newadp->ad_block.nb_jwork);
4380		/*
4381		 * We have an unwritten jnewblk, we need to merge the
4382		 * frag bits with our own.  The newer adp's journal can not
4383		 * be written prior to the old one so no need to check for
4384		 * it here.
4385		 */
4386		if (jnewblk) {
4387			njnewblk = newadp->ad_block.nb_jnewblk;
4388			if (njnewblk == NULL)
4389				panic("allocdirect_merge: No jnewblk");
4390			if (jnewblk->jn_state & UNDONE) {
4391				njnewblk->jn_state |= UNDONE | NEWBLOCK;
4392				njnewblk->jn_state &= ~ATTACHED;
4393				jnewblk->jn_state &= ~UNDONE;
4394			}
4395			njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
4396			WORKLIST_REMOVE(&jnewblk->jn_list);
4397			jnewblk->jn_state |= ATTACHED | COMPLETE;
4398			free_jnewblk(jnewblk);
4399		}
4400	} else {
4401		/*
4402		 * We can skip journaling for this freefrag and just complete
4403		 * any pending journal work for the allocdirect that is being
4404		 * removed after the freefrag completes.
4405		 */
4406		if (freefrag->ff_jfreefrag)
4407			cancel_jfreefrag(freefrag->ff_jfreefrag);
4408		cancel_newblk(&oldadp->ad_block, &freefrag->ff_jwork);
4409	}
4410	free_newblk(&oldadp->ad_block);
4411}
4412
4413/*
4414 * Allocate a jfreefrag structure to journal a single block free.
4415 */
4416static struct jfreefrag *
4417newjfreefrag(freefrag, ip, blkno, size, lbn)
4418	struct freefrag *freefrag;
4419	struct inode *ip;
4420	ufs2_daddr_t blkno;
4421	long size;
4422	ufs_lbn_t lbn;
4423{
4424	struct jfreefrag *jfreefrag;
4425	struct fs *fs;
4426
4427	fs = ip->i_fs;
4428	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
4429	    M_SOFTDEP_FLAGS);
4430	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
4431	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
4432	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
4433	jfreefrag->fr_ino = ip->i_number;
4434	jfreefrag->fr_lbn = lbn;
4435	jfreefrag->fr_blkno = blkno;
4436	jfreefrag->fr_frags = numfrags(fs, size);
4437	jfreefrag->fr_freefrag = freefrag;
4438
4439	return (jfreefrag);
4440}
4441
4442/*
4443 * Allocate a new freefrag structure.
4444 */
4445static struct freefrag *
4446newfreefrag(ip, blkno, size, lbn)
4447	struct inode *ip;
4448	ufs2_daddr_t blkno;
4449	long size;
4450	ufs_lbn_t lbn;
4451{
4452	struct freefrag *freefrag;
4453	struct fs *fs;
4454
4455	fs = ip->i_fs;
4456	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
4457		panic("newfreefrag: frag size");
4458	freefrag = malloc(sizeof(struct freefrag),
4459	    M_FREEFRAG, M_SOFTDEP_FLAGS);
4460	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
4461	freefrag->ff_state = ATTACHED;
4462	LIST_INIT(&freefrag->ff_jwork);
4463	freefrag->ff_inum = ip->i_number;
4464	freefrag->ff_blkno = blkno;
4465	freefrag->ff_fragsize = size;
4466
4467	if (fs->fs_flags & FS_SUJ) {
4468		freefrag->ff_jfreefrag =
4469		    newjfreefrag(freefrag, ip, blkno, size, lbn);
4470	} else {
4471		freefrag->ff_state |= DEPCOMPLETE;
4472		freefrag->ff_jfreefrag = NULL;
4473	}
4474
4475	return (freefrag);
4476}
4477
4478/*
4479 * This workitem de-allocates fragments that were replaced during
4480 * file block allocation.
4481 */
4482static void
4483handle_workitem_freefrag(freefrag)
4484	struct freefrag *freefrag;
4485{
4486	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
4487	struct workhead wkhd;
4488
4489	/*
4490	 * It would be illegal to add new completion items to the
4491	 * freefrag after it was schedule to be done so it must be
4492	 * safe to modify the list head here.
4493	 */
4494	LIST_INIT(&wkhd);
4495	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
4496	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
4497	    freefrag->ff_fragsize, freefrag->ff_inum, &wkhd);
4498	ACQUIRE_LOCK(&lk);
4499	WORKITEM_FREE(freefrag, D_FREEFRAG);
4500	FREE_LOCK(&lk);
4501}
4502
4503/*
4504 * Set up a dependency structure for an external attributes data block.
4505 * This routine follows much of the structure of softdep_setup_allocdirect.
4506 * See the description of softdep_setup_allocdirect above for details.
4507 */
4508void
4509softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4510	struct inode *ip;
4511	ufs_lbn_t off;
4512	ufs2_daddr_t newblkno;
4513	ufs2_daddr_t oldblkno;
4514	long newsize;
4515	long oldsize;
4516	struct buf *bp;
4517{
4518	struct allocdirect *adp, *oldadp;
4519	struct allocdirectlst *adphead;
4520	struct freefrag *freefrag;
4521	struct inodedep *inodedep;
4522	struct jnewblk *jnewblk;
4523	struct newblk *newblk;
4524	struct mount *mp;
4525	ufs_lbn_t lbn;
4526
4527	if (off >= NXADDR)
4528		panic("softdep_setup_allocext: lbn %lld > NXADDR",
4529		    (long long)off);
4530
4531	lbn = bp->b_lblkno;
4532	mp = UFSTOVFS(ip->i_ump);
4533	if (oldblkno && oldblkno != newblkno)
4534		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4535	else
4536		freefrag = NULL;
4537
4538	ACQUIRE_LOCK(&lk);
4539	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4540		panic("softdep_setup_allocext: lost block");
4541	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4542	    ("softdep_setup_allocext: newblk already initialized"));
4543	/*
4544	 * Convert the newblk to an allocdirect.
4545	 */
4546	newblk->nb_list.wk_type = D_ALLOCDIRECT;
4547	adp = (struct allocdirect *)newblk;
4548	newblk->nb_freefrag = freefrag;
4549	adp->ad_offset = off;
4550	adp->ad_oldblkno = oldblkno;
4551	adp->ad_newsize = newsize;
4552	adp->ad_oldsize = oldsize;
4553	adp->ad_state |=  EXTDATA;
4554
4555	/*
4556	 * Finish initializing the journal.
4557	 */
4558	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4559		jnewblk->jn_ino = ip->i_number;
4560		jnewblk->jn_lbn = lbn;
4561		add_to_journal(&jnewblk->jn_list);
4562	}
4563	if (freefrag && freefrag->ff_jfreefrag != NULL)
4564		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4565	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4566	adp->ad_inodedep = inodedep;
4567
4568	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4569	/*
4570	 * The list of allocdirects must be kept in sorted and ascending
4571	 * order so that the rollback routines can quickly determine the
4572	 * first uncommitted block (the size of the file stored on disk
4573	 * ends at the end of the lowest committed fragment, or if there
4574	 * are no fragments, at the end of the highest committed block).
4575	 * Since files generally grow, the typical case is that the new
4576	 * block is to be added at the end of the list. We speed this
4577	 * special case by checking against the last allocdirect in the
4578	 * list before laboriously traversing the list looking for the
4579	 * insertion point.
4580	 */
4581	adphead = &inodedep->id_newextupdt;
4582	oldadp = TAILQ_LAST(adphead, allocdirectlst);
4583	if (oldadp == NULL || oldadp->ad_offset <= off) {
4584		/* insert at end of list */
4585		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
4586		if (oldadp != NULL && oldadp->ad_offset == off)
4587			allocdirect_merge(adphead, adp, oldadp);
4588		FREE_LOCK(&lk);
4589		return;
4590	}
4591	TAILQ_FOREACH(oldadp, adphead, ad_next) {
4592		if (oldadp->ad_offset >= off)
4593			break;
4594	}
4595	if (oldadp == NULL)
4596		panic("softdep_setup_allocext: lost entry");
4597	/* insert in middle of list */
4598	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
4599	if (oldadp->ad_offset == off)
4600		allocdirect_merge(adphead, adp, oldadp);
4601	FREE_LOCK(&lk);
4602}
4603
4604/*
4605 * Indirect block allocation dependencies.
4606 *
4607 * The same dependencies that exist for a direct block also exist when
4608 * a new block is allocated and pointed to by an entry in a block of
4609 * indirect pointers. The undo/redo states described above are also
4610 * used here. Because an indirect block contains many pointers that
4611 * may have dependencies, a second copy of the entire in-memory indirect
4612 * block is kept. The buffer cache copy is always completely up-to-date.
4613 * The second copy, which is used only as a source for disk writes,
4614 * contains only the safe pointers (i.e., those that have no remaining
4615 * update dependencies). The second copy is freed when all pointers
4616 * are safe. The cache is not allowed to replace indirect blocks with
4617 * pending update dependencies. If a buffer containing an indirect
4618 * block with dependencies is written, these routines will mark it
4619 * dirty again. It can only be successfully written once all the
4620 * dependencies are removed. The ffs_fsync routine in conjunction with
4621 * softdep_sync_metadata work together to get all the dependencies
4622 * removed so that a file can be successfully written to disk. Three
4623 * procedures are used when setting up indirect block pointer
4624 * dependencies. The division is necessary because of the organization
4625 * of the "balloc" routine and because of the distinction between file
4626 * pages and file metadata blocks.
4627 */
4628
4629/*
4630 * Allocate a new allocindir structure.
4631 */
4632static struct allocindir *
4633newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
4634	struct inode *ip;	/* inode for file being extended */
4635	int ptrno;		/* offset of pointer in indirect block */
4636	ufs2_daddr_t newblkno;	/* disk block number being added */
4637	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
4638	ufs_lbn_t lbn;
4639{
4640	struct newblk *newblk;
4641	struct allocindir *aip;
4642	struct freefrag *freefrag;
4643	struct jnewblk *jnewblk;
4644
4645	if (oldblkno)
4646		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
4647	else
4648		freefrag = NULL;
4649	ACQUIRE_LOCK(&lk);
4650	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
4651		panic("new_allocindir: lost block");
4652	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4653	    ("newallocindir: newblk already initialized"));
4654	newblk->nb_list.wk_type = D_ALLOCINDIR;
4655	newblk->nb_freefrag = freefrag;
4656	aip = (struct allocindir *)newblk;
4657	aip->ai_offset = ptrno;
4658	aip->ai_oldblkno = oldblkno;
4659	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4660		jnewblk->jn_ino = ip->i_number;
4661		jnewblk->jn_lbn = lbn;
4662		add_to_journal(&jnewblk->jn_list);
4663	}
4664	if (freefrag && freefrag->ff_jfreefrag != NULL)
4665		add_to_journal(&freefrag->ff_jfreefrag->fr_list);
4666	return (aip);
4667}
4668
4669/*
4670 * Called just before setting an indirect block pointer
4671 * to a newly allocated file page.
4672 */
4673void
4674softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
4675	struct inode *ip;	/* inode for file being extended */
4676	ufs_lbn_t lbn;		/* allocated block number within file */
4677	struct buf *bp;		/* buffer with indirect blk referencing page */
4678	int ptrno;		/* offset of pointer in indirect block */
4679	ufs2_daddr_t newblkno;	/* disk block number being added */
4680	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
4681	struct buf *nbp;	/* buffer holding allocated page */
4682{
4683	struct inodedep *inodedep;
4684	struct allocindir *aip;
4685	struct pagedep *pagedep;
4686	struct mount *mp;
4687
4688	if (lbn != nbp->b_lblkno)
4689		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
4690		    lbn, bp->b_lblkno);
4691	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
4692	mp = UFSTOVFS(ip->i_ump);
4693	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
4694	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
4695	/*
4696	 * If we are allocating a directory page, then we must
4697	 * allocate an associated pagedep to track additions and
4698	 * deletions.
4699	 */
4700	if ((ip->i_mode & IFMT) == IFDIR &&
4701	    pagedep_lookup(mp, ip->i_number, lbn, DEPALLOC, &pagedep) == 0)
4702		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
4703	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4704	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4705	FREE_LOCK(&lk);
4706}
4707
4708/*
4709 * Called just before setting an indirect block pointer to a
4710 * newly allocated indirect block.
4711 */
4712void
4713softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
4714	struct buf *nbp;	/* newly allocated indirect block */
4715	struct inode *ip;	/* inode for file being extended */
4716	struct buf *bp;		/* indirect block referencing allocated block */
4717	int ptrno;		/* offset of pointer in indirect block */
4718	ufs2_daddr_t newblkno;	/* disk block number being added */
4719{
4720	struct inodedep *inodedep;
4721	struct allocindir *aip;
4722	ufs_lbn_t lbn;
4723
4724	lbn = nbp->b_lblkno;
4725	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
4726	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
4727	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
4728	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
4729	setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
4730	FREE_LOCK(&lk);
4731}
4732
4733static void
4734indirdep_complete(indirdep)
4735	struct indirdep *indirdep;
4736{
4737	struct allocindir *aip;
4738
4739	LIST_REMOVE(indirdep, ir_next);
4740	indirdep->ir_state &= ~ONDEPLIST;
4741
4742	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
4743		LIST_REMOVE(aip, ai_next);
4744		free_newblk(&aip->ai_block);
4745	}
4746	/*
4747	 * If this indirdep is not attached to a buf it was simply waiting
4748	 * on completion to clear completehd.  free_indirdep() asserts
4749	 * that nothing is dangling.
4750	 */
4751	if ((indirdep->ir_state & ONWORKLIST) == 0)
4752		free_indirdep(indirdep);
4753}
4754
4755/*
4756 * Called to finish the allocation of the "aip" allocated
4757 * by one of the two routines above.
4758 */
4759static void
4760setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
4761	struct buf *bp;		/* in-memory copy of the indirect block */
4762	struct inode *ip;	/* inode for file being extended */
4763	struct inodedep *inodedep; /* Inodedep for ip */
4764	struct allocindir *aip;	/* allocindir allocated by the above routines */
4765	ufs_lbn_t lbn;		/* Logical block number for this block. */
4766{
4767	struct worklist *wk;
4768	struct fs *fs;
4769	struct newblk *newblk;
4770	struct indirdep *indirdep, *newindirdep;
4771	struct allocindir *oldaip;
4772	struct freefrag *freefrag;
4773	struct mount *mp;
4774	ufs2_daddr_t blkno;
4775
4776	mp = UFSTOVFS(ip->i_ump);
4777	fs = ip->i_fs;
4778	mtx_assert(&lk, MA_OWNED);
4779	if (bp->b_lblkno >= 0)
4780		panic("setup_allocindir_phase2: not indir blk");
4781	for (freefrag = NULL, indirdep = NULL, newindirdep = NULL; ; ) {
4782		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4783			if (wk->wk_type != D_INDIRDEP)
4784				continue;
4785			indirdep = WK_INDIRDEP(wk);
4786			break;
4787		}
4788		if (indirdep == NULL && newindirdep) {
4789			indirdep = newindirdep;
4790			newindirdep = NULL;
4791			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
4792			if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0,
4793			    &newblk)) {
4794				indirdep->ir_state |= ONDEPLIST;
4795				LIST_INSERT_HEAD(&newblk->nb_indirdeps,
4796				    indirdep, ir_next);
4797			} else
4798				indirdep->ir_state |= DEPCOMPLETE;
4799		}
4800		if (indirdep) {
4801			aip->ai_indirdep = indirdep;
4802			/*
4803			 * Check to see if there is an existing dependency
4804			 * for this block. If there is, merge the old
4805			 * dependency into the new one.  This happens
4806			 * as a result of reallocblk only.
4807			 */
4808			if (aip->ai_oldblkno == 0)
4809				oldaip = NULL;
4810			else
4811
4812				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
4813				    ai_next)
4814					if (oldaip->ai_offset == aip->ai_offset)
4815						break;
4816			if (oldaip != NULL)
4817				freefrag = allocindir_merge(aip, oldaip);
4818			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
4819			KASSERT(aip->ai_offset >= 0 &&
4820			    aip->ai_offset < NINDIR(ip->i_ump->um_fs),
4821			    ("setup_allocindir_phase2: Bad offset %d",
4822			    aip->ai_offset));
4823			KASSERT(indirdep->ir_savebp != NULL,
4824			    ("setup_allocindir_phase2 NULL ir_savebp"));
4825			if (ip->i_ump->um_fstype == UFS1)
4826				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
4827				    [aip->ai_offset] = aip->ai_oldblkno;
4828			else
4829				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
4830				    [aip->ai_offset] = aip->ai_oldblkno;
4831			FREE_LOCK(&lk);
4832			if (freefrag != NULL)
4833				handle_workitem_freefrag(freefrag);
4834		} else
4835			FREE_LOCK(&lk);
4836		if (newindirdep) {
4837			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
4838			brelse(newindirdep->ir_savebp);
4839			ACQUIRE_LOCK(&lk);
4840			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
4841			if (indirdep)
4842				break;
4843			FREE_LOCK(&lk);
4844		}
4845		if (indirdep) {
4846			ACQUIRE_LOCK(&lk);
4847			break;
4848		}
4849		newindirdep = malloc(sizeof(struct indirdep),
4850			M_INDIRDEP, M_SOFTDEP_FLAGS);
4851		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
4852		newindirdep->ir_state = ATTACHED;
4853		if (ip->i_ump->um_fstype == UFS1)
4854			newindirdep->ir_state |= UFS1FMT;
4855		newindirdep->ir_saveddata = NULL;
4856		LIST_INIT(&newindirdep->ir_deplisthd);
4857		LIST_INIT(&newindirdep->ir_donehd);
4858		LIST_INIT(&newindirdep->ir_writehd);
4859		LIST_INIT(&newindirdep->ir_completehd);
4860		LIST_INIT(&newindirdep->ir_jwork);
4861		if (bp->b_blkno == bp->b_lblkno) {
4862			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
4863			    NULL, NULL);
4864			bp->b_blkno = blkno;
4865		}
4866		newindirdep->ir_savebp =
4867		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
4868		BUF_KERNPROC(newindirdep->ir_savebp);
4869		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
4870		ACQUIRE_LOCK(&lk);
4871	}
4872}
4873
4874/*
4875 * Merge two allocindirs which refer to the same block.  Move newblock
4876 * dependencies and setup the freefrags appropriately.
4877 */
4878static struct freefrag *
4879allocindir_merge(aip, oldaip)
4880	struct allocindir *aip;
4881	struct allocindir *oldaip;
4882{
4883	struct newdirblk *newdirblk;
4884	struct freefrag *freefrag;
4885	struct worklist *wk;
4886
4887	if (oldaip->ai_newblkno != aip->ai_oldblkno)
4888		panic("allocindir_merge: blkno");
4889	aip->ai_oldblkno = oldaip->ai_oldblkno;
4890	freefrag = aip->ai_freefrag;
4891	aip->ai_freefrag = oldaip->ai_freefrag;
4892	oldaip->ai_freefrag = NULL;
4893	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
4894	/*
4895	 * If we are tracking a new directory-block allocation,
4896	 * move it from the old allocindir to the new allocindir.
4897	 */
4898	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
4899		newdirblk = WK_NEWDIRBLK(wk);
4900		WORKLIST_REMOVE(&newdirblk->db_list);
4901		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
4902			panic("allocindir_merge: extra newdirblk");
4903		WORKLIST_INSERT(&aip->ai_newdirblk, &newdirblk->db_list);
4904	}
4905	/*
4906	 * We can skip journaling for this freefrag and just complete
4907	 * any pending journal work for the allocindir that is being
4908	 * removed after the freefrag completes.
4909	 */
4910	if (freefrag->ff_jfreefrag)
4911		cancel_jfreefrag(freefrag->ff_jfreefrag);
4912	LIST_REMOVE(oldaip, ai_next);
4913	cancel_newblk(&oldaip->ai_block, &freefrag->ff_jwork);
4914	free_newblk(&oldaip->ai_block);
4915
4916	return (freefrag);
4917}
4918
4919/*
4920 * Block de-allocation dependencies.
4921 *
4922 * When blocks are de-allocated, the on-disk pointers must be nullified before
4923 * the blocks are made available for use by other files.  (The true
4924 * requirement is that old pointers must be nullified before new on-disk
4925 * pointers are set.  We chose this slightly more stringent requirement to
4926 * reduce complexity.) Our implementation handles this dependency by updating
4927 * the inode (or indirect block) appropriately but delaying the actual block
4928 * de-allocation (i.e., freemap and free space count manipulation) until
4929 * after the updated versions reach stable storage.  After the disk is
4930 * updated, the blocks can be safely de-allocated whenever it is convenient.
4931 * This implementation handles only the common case of reducing a file's
4932 * length to zero. Other cases are handled by the conventional synchronous
4933 * write approach.
4934 *
4935 * The ffs implementation with which we worked double-checks
4936 * the state of the block pointers and file size as it reduces
4937 * a file's length.  Some of this code is replicated here in our
4938 * soft updates implementation.  The freeblks->fb_chkcnt field is
4939 * used to transfer a part of this information to the procedure
4940 * that eventually de-allocates the blocks.
4941 *
4942 * This routine should be called from the routine that shortens
4943 * a file's length, before the inode's size or block pointers
4944 * are modified. It will save the block pointer information for
4945 * later release and zero the inode so that the calling routine
4946 * can release it.
4947 */
4948void
4949softdep_setup_freeblocks(ip, length, flags)
4950	struct inode *ip;	/* The inode whose length is to be reduced */
4951	off_t length;		/* The new length for the file */
4952	int flags;		/* IO_EXT and/or IO_NORMAL */
4953{
4954	struct ufs1_dinode *dp1;
4955	struct ufs2_dinode *dp2;
4956	struct freeblks *freeblks;
4957	struct inodedep *inodedep;
4958	struct allocdirect *adp;
4959	struct jfreeblk *jfreeblk;
4960	struct bufobj *bo;
4961	struct vnode *vp;
4962	struct buf *bp;
4963	struct fs *fs;
4964	ufs2_daddr_t extblocks, datablocks;
4965	struct mount *mp;
4966	int i, delay, error;
4967	ufs2_daddr_t blkno;
4968	ufs_lbn_t tmpval;
4969	ufs_lbn_t lbn;
4970	long oldextsize;
4971	long oldsize;
4972	int frags;
4973	int needj;
4974
4975	fs = ip->i_fs;
4976	mp = UFSTOVFS(ip->i_ump);
4977	if (length != 0)
4978		panic("softdep_setup_freeblocks: non-zero length");
4979	freeblks = malloc(sizeof(struct freeblks),
4980		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
4981	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
4982	LIST_INIT(&freeblks->fb_jfreeblkhd);
4983	LIST_INIT(&freeblks->fb_jwork);
4984	freeblks->fb_state = ATTACHED;
4985	freeblks->fb_uid = ip->i_uid;
4986	freeblks->fb_previousinum = ip->i_number;
4987	freeblks->fb_devvp = ip->i_devvp;
4988	freeblks->fb_chkcnt = 0;
4989	ACQUIRE_LOCK(&lk);
4990	/*
4991	 * If we're truncating a removed file that will never be written
4992	 * we don't need to journal the block frees.  The canceled journals
4993	 * for the allocations will suffice.
4994	 */
4995	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
4996	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED ||
4997	    (fs->fs_flags & FS_SUJ) == 0)
4998		needj = 0;
4999	else
5000		needj = 1;
5001	num_freeblkdep++;
5002	FREE_LOCK(&lk);
5003	extblocks = 0;
5004	if (fs->fs_magic == FS_UFS2_MAGIC)
5005		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
5006	datablocks = DIP(ip, i_blocks) - extblocks;
5007	if ((flags & IO_NORMAL) != 0) {
5008		oldsize = ip->i_size;
5009		ip->i_size = 0;
5010		DIP_SET(ip, i_size, 0);
5011		freeblks->fb_chkcnt = datablocks;
5012		for (i = 0; i < NDADDR; i++) {
5013			blkno = DIP(ip, i_db[i]);
5014			DIP_SET(ip, i_db[i], 0);
5015			if (blkno == 0)
5016				continue;
5017			frags = sblksize(fs, oldsize, i);
5018			frags = numfrags(fs, frags);
5019			newfreework(freeblks, NULL, i, blkno, frags, needj);
5020		}
5021		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
5022		    i++, tmpval *= NINDIR(fs)) {
5023			blkno = DIP(ip, i_ib[i]);
5024			DIP_SET(ip, i_ib[i], 0);
5025			if (blkno)
5026				newfreework(freeblks, NULL, -lbn - i, blkno,
5027				    fs->fs_frag, needj);
5028			lbn += tmpval;
5029		}
5030		/*
5031		 * If the file was removed, then the space being freed was
5032		 * accounted for then (see softdep_releasefile()). If the
5033		 * file is merely being truncated, then we account for it now.
5034		 */
5035		if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
5036			UFS_LOCK(ip->i_ump);
5037			fs->fs_pendingblocks += datablocks;
5038			UFS_UNLOCK(ip->i_ump);
5039		}
5040	}
5041	if ((flags & IO_EXT) != 0) {
5042		oldextsize = ip->i_din2->di_extsize;
5043		ip->i_din2->di_extsize = 0;
5044		freeblks->fb_chkcnt += extblocks;
5045		for (i = 0; i < NXADDR; i++) {
5046			blkno = ip->i_din2->di_extb[i];
5047			ip->i_din2->di_extb[i] = 0;
5048			if (blkno == 0)
5049				continue;
5050			frags = sblksize(fs, oldextsize, i);
5051			frags = numfrags(fs, frags);
5052			newfreework(freeblks, NULL, -1 - i, blkno, frags,
5053			    needj);
5054		}
5055	}
5056	if (LIST_EMPTY(&freeblks->fb_jfreeblkhd))
5057		needj = 0;
5058	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
5059	/*
5060	 * Push the zero'ed inode to to its disk buffer so that we are free
5061	 * to delete its dependencies below. Once the dependencies are gone
5062	 * the buffer can be safely released.
5063	 */
5064	if ((error = bread(ip->i_devvp,
5065	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
5066	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
5067		brelse(bp);
5068		softdep_error("softdep_setup_freeblocks", error);
5069	}
5070	if (ip->i_ump->um_fstype == UFS1) {
5071		dp1 = ((struct ufs1_dinode *)bp->b_data +
5072		    ino_to_fsbo(fs, ip->i_number));
5073		ip->i_din1->di_freelink = dp1->di_freelink;
5074		*dp1 = *ip->i_din1;
5075	} else {
5076		dp2 = ((struct ufs2_dinode *)bp->b_data +
5077		    ino_to_fsbo(fs, ip->i_number));
5078		ip->i_din2->di_freelink = dp2->di_freelink;
5079		*dp2 = *ip->i_din2;
5080	}
5081	/*
5082	 * Find and eliminate any inode dependencies.
5083	 */
5084	ACQUIRE_LOCK(&lk);
5085	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5086	if ((inodedep->id_state & IOSTARTED) != 0)
5087		panic("softdep_setup_freeblocks: inode busy");
5088	/*
5089	 * Add the freeblks structure to the list of operations that
5090	 * must await the zero'ed inode being written to disk. If we
5091	 * still have a bitmap dependency (delay == 0), then the inode
5092	 * has never been written to disk, so we can process the
5093	 * freeblks below once we have deleted the dependencies.
5094	 */
5095	delay = (inodedep->id_state & DEPCOMPLETE);
5096	if (delay)
5097		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
5098	else if (needj)
5099		freeblks->fb_state |= DEPCOMPLETE | COMPLETE;
5100	/*
5101	 * Because the file length has been truncated to zero, any
5102	 * pending block allocation dependency structures associated
5103	 * with this inode are obsolete and can simply be de-allocated.
5104	 * We must first merge the two dependency lists to get rid of
5105	 * any duplicate freefrag structures, then purge the merged list.
5106	 * If we still have a bitmap dependency, then the inode has never
5107	 * been written to disk, so we can free any fragments without delay.
5108	 */
5109	if (flags & IO_NORMAL) {
5110		merge_inode_lists(&inodedep->id_newinoupdt,
5111		    &inodedep->id_inoupdt);
5112		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
5113			cancel_allocdirect(&inodedep->id_inoupdt, adp,
5114			    freeblks, delay);
5115	}
5116	if (flags & IO_EXT) {
5117		merge_inode_lists(&inodedep->id_newextupdt,
5118		    &inodedep->id_extupdt);
5119		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
5120			cancel_allocdirect(&inodedep->id_extupdt, adp,
5121			    freeblks, delay);
5122	}
5123	LIST_FOREACH(jfreeblk, &freeblks->fb_jfreeblkhd, jf_deps)
5124		add_to_journal(&jfreeblk->jf_list);
5125
5126	FREE_LOCK(&lk);
5127	bdwrite(bp);
5128	/*
5129	 * We must wait for any I/O in progress to finish so that
5130	 * all potential buffers on the dirty list will be visible.
5131	 * Once they are all there, walk the list and get rid of
5132	 * any dependencies.
5133	 */
5134	vp = ITOV(ip);
5135	bo = &vp->v_bufobj;
5136	BO_LOCK(bo);
5137	drain_output(vp);
5138restart:
5139	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
5140		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
5141		    ((flags & IO_NORMAL) == 0 &&
5142		      (bp->b_xflags & BX_ALTDATA) == 0))
5143			continue;
5144		if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
5145			goto restart;
5146		BO_UNLOCK(bo);
5147		ACQUIRE_LOCK(&lk);
5148		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
5149		if (deallocate_dependencies(bp, inodedep, freeblks))
5150			bp->b_flags |= B_INVAL | B_NOCACHE;
5151		FREE_LOCK(&lk);
5152		brelse(bp);
5153		BO_LOCK(bo);
5154		goto restart;
5155	}
5156	BO_UNLOCK(bo);
5157	ACQUIRE_LOCK(&lk);
5158	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
5159		(void) free_inodedep(inodedep);
5160
5161	if (delay) {
5162		freeblks->fb_state |= DEPCOMPLETE;
5163		/*
5164		 * If the inode with zeroed block pointers is now on disk
5165		 * we can start freeing blocks. Add freeblks to the worklist
5166		 * instead of calling  handle_workitem_freeblocks directly as
5167		 * it is more likely that additional IO is needed to complete
5168		 * the request here than in the !delay case.
5169		 */
5170		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
5171			add_to_worklist(&freeblks->fb_list, 1);
5172	}
5173
5174	FREE_LOCK(&lk);
5175	/*
5176	 * If the inode has never been written to disk (delay == 0) and
5177	 * we're not waiting on any journal writes, then we can process the
5178	 * freeblks now that we have deleted the dependencies.
5179	 */
5180	if (!delay && !needj)
5181		handle_workitem_freeblocks(freeblks, 0);
5182}
5183
5184/*
5185 * Reclaim any dependency structures from a buffer that is about to
5186 * be reallocated to a new vnode. The buffer must be locked, thus,
5187 * no I/O completion operations can occur while we are manipulating
5188 * its associated dependencies. The mutex is held so that other I/O's
5189 * associated with related dependencies do not occur.  Returns 1 if
5190 * all dependencies were cleared, 0 otherwise.
5191 */
5192static int
5193deallocate_dependencies(bp, inodedep, freeblks)
5194	struct buf *bp;
5195	struct inodedep *inodedep;
5196	struct freeblks *freeblks;
5197{
5198	struct worklist *wk;
5199	struct indirdep *indirdep;
5200	struct newdirblk *newdirblk;
5201	struct allocindir *aip;
5202	struct pagedep *pagedep;
5203	struct jremref *jremref;
5204	struct jmvref *jmvref;
5205	struct dirrem *dirrem;
5206	int i;
5207
5208	mtx_assert(&lk, MA_OWNED);
5209	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
5210		switch (wk->wk_type) {
5211
5212		case D_INDIRDEP:
5213			indirdep = WK_INDIRDEP(wk);
5214			if (bp->b_lblkno >= 0 ||
5215			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
5216				panic("deallocate_dependencies: not indir");
5217			cancel_indirdep(indirdep, bp, inodedep, freeblks);
5218			continue;
5219
5220		case D_PAGEDEP:
5221			pagedep = WK_PAGEDEP(wk);
5222			/*
5223			 * There should be no directory add dependencies present
5224			 * as the directory could not be truncated until all
5225			 * children were removed.
5226			 */
5227			KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
5228			    ("deallocate_dependencies: pendinghd != NULL"));
5229			for (i = 0; i < DAHASHSZ; i++)
5230				KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
5231				    ("deallocate_dependencies: diraddhd != NULL"));
5232			/*
5233			 * Copy any directory remove dependencies to the list
5234			 * to be processed after the zero'ed inode is written.
5235			 * If the inode has already been written, then they
5236			 * can be dumped directly onto the work list.
5237			 */
5238			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
5239				/*
5240				 * If there are any dirrems we wait for
5241				 * the journal write to complete and
5242				 * then restart the buf scan as the lock
5243				 * has been dropped.
5244				 */
5245				while ((jremref =
5246				    LIST_FIRST(&dirrem->dm_jremrefhd))
5247				    != NULL) {
5248					stat_jwait_filepage++;
5249					jwait(&jremref->jr_list);
5250					return (0);
5251				}
5252				LIST_REMOVE(dirrem, dm_next);
5253				dirrem->dm_dirinum = pagedep->pd_ino;
5254				if (inodedep == NULL ||
5255				    (inodedep->id_state & ALLCOMPLETE) ==
5256				     ALLCOMPLETE) {
5257					dirrem->dm_state |= COMPLETE;
5258					add_to_worklist(&dirrem->dm_list, 0);
5259				} else
5260					WORKLIST_INSERT(&inodedep->id_bufwait,
5261					    &dirrem->dm_list);
5262			}
5263			if ((pagedep->pd_state & NEWBLOCK) != 0) {
5264				newdirblk = pagedep->pd_newdirblk;
5265				WORKLIST_REMOVE(&newdirblk->db_list);
5266				free_newdirblk(newdirblk);
5267			}
5268			while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd))
5269			    != NULL) {
5270				stat_jwait_filepage++;
5271				jwait(&jmvref->jm_list);
5272				return (0);
5273			}
5274			WORKLIST_REMOVE(&pagedep->pd_list);
5275			LIST_REMOVE(pagedep, pd_hash);
5276			WORKITEM_FREE(pagedep, D_PAGEDEP);
5277			continue;
5278
5279		case D_ALLOCINDIR:
5280			aip = WK_ALLOCINDIR(wk);
5281			cancel_allocindir(aip, inodedep, freeblks);
5282			continue;
5283
5284		case D_ALLOCDIRECT:
5285		case D_INODEDEP:
5286			panic("deallocate_dependencies: Unexpected type %s",
5287			    TYPENAME(wk->wk_type));
5288			/* NOTREACHED */
5289
5290		default:
5291			panic("deallocate_dependencies: Unknown type %s",
5292			    TYPENAME(wk->wk_type));
5293			/* NOTREACHED */
5294		}
5295	}
5296
5297	return (1);
5298}
5299
5300/*
5301 * An allocdirect is being canceled due to a truncate.  We must make sure
5302 * the journal entry is released in concert with the blkfree that releases
5303 * the storage.  Completed journal entries must not be released until the
5304 * space is no longer pointed to by the inode or in the bitmap.
5305 */
5306static void
5307cancel_allocdirect(adphead, adp, freeblks, delay)
5308	struct allocdirectlst *adphead;
5309	struct allocdirect *adp;
5310	struct freeblks *freeblks;
5311	int delay;
5312{
5313	struct freework *freework;
5314	struct newblk *newblk;
5315	struct worklist *wk;
5316	ufs_lbn_t lbn;
5317
5318	TAILQ_REMOVE(adphead, adp, ad_next);
5319	newblk = (struct newblk *)adp;
5320	/*
5321	 * If the journal hasn't been written the jnewblk must be passed
5322	 * to the call to ffs_freeblk that reclaims the space.  We accomplish
5323	 * this by linking the journal dependency into the freework to be
5324	 * freed when freework_freeblock() is called.  If the journal has
5325	 * been written we can simply reclaim the journal space when the
5326	 * freeblks work is complete.
5327	 */
5328	if (newblk->nb_jnewblk == NULL) {
5329		cancel_newblk(newblk, &freeblks->fb_jwork);
5330		goto found;
5331	}
5332	lbn = newblk->nb_jnewblk->jn_lbn;
5333	/*
5334	 * Find the correct freework structure so it releases the canceled
5335	 * journal when the bitmap is cleared.  This preserves rollback
5336	 * until the allocation is reverted.
5337	 */
5338	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
5339		freework = WK_FREEWORK(wk);
5340		if (freework->fw_lbn != lbn)
5341			continue;
5342		cancel_newblk(newblk, &freework->fw_jwork);
5343		goto found;
5344	}
5345	panic("cancel_allocdirect: Freework not found for lbn %jd\n", lbn);
5346found:
5347	if (delay)
5348		WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
5349		    &newblk->nb_list);
5350	else
5351		free_newblk(newblk);
5352	return;
5353}
5354
5355
5356static void
5357cancel_newblk(newblk, wkhd)
5358	struct newblk *newblk;
5359	struct workhead *wkhd;
5360{
5361	struct indirdep *indirdep;
5362	struct allocindir *aip;
5363
5364	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5365		indirdep->ir_state &= ~ONDEPLIST;
5366		LIST_REMOVE(indirdep, ir_next);
5367		/*
5368		 * If an indirdep is not on the buf worklist we need to
5369		 * free it here as deallocate_dependencies() will never
5370		 * find it.  These pointers were never visible on disk and
5371		 * can be discarded immediately.
5372		 */
5373		while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5374			LIST_REMOVE(aip, ai_next);
5375			cancel_newblk(&aip->ai_block, wkhd);
5376			free_newblk(&aip->ai_block);
5377		}
5378		/*
5379		 * If this indirdep is not attached to a buf it was simply
5380		 * waiting on completion to clear completehd.  free_indirdep()
5381		 * asserts that nothing is dangling.
5382		 */
5383		if ((indirdep->ir_state & ONWORKLIST) == 0)
5384			free_indirdep(indirdep);
5385	}
5386	if (newblk->nb_state & ONDEPLIST) {
5387		newblk->nb_state &= ~ONDEPLIST;
5388		LIST_REMOVE(newblk, nb_deps);
5389	}
5390	if (newblk->nb_state & ONWORKLIST)
5391		WORKLIST_REMOVE(&newblk->nb_list);
5392	/*
5393	 * If the journal entry hasn't been written we hold onto the dep
5394	 * until it is safe to free along with the other journal work.
5395	 */
5396	if (newblk->nb_jnewblk != NULL) {
5397		cancel_jnewblk(newblk->nb_jnewblk, wkhd);
5398		newblk->nb_jnewblk = NULL;
5399	}
5400	if (!LIST_EMPTY(&newblk->nb_jwork))
5401		jwork_move(wkhd, &newblk->nb_jwork);
5402}
5403
5404/*
5405 * Free a newblk. Generate a new freefrag work request if appropriate.
5406 * This must be called after the inode pointer and any direct block pointers
5407 * are valid or fully removed via truncate or frag extension.
5408 */
5409static void
5410free_newblk(newblk)
5411	struct newblk *newblk;
5412{
5413	struct indirdep *indirdep;
5414	struct newdirblk *newdirblk;
5415	struct freefrag *freefrag;
5416	struct worklist *wk;
5417
5418	mtx_assert(&lk, MA_OWNED);
5419	if (newblk->nb_state & ONDEPLIST)
5420		LIST_REMOVE(newblk, nb_deps);
5421	if (newblk->nb_state & ONWORKLIST)
5422		WORKLIST_REMOVE(&newblk->nb_list);
5423	LIST_REMOVE(newblk, nb_hash);
5424	if ((freefrag = newblk->nb_freefrag) != NULL) {
5425		freefrag->ff_state |= COMPLETE;
5426		if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
5427			add_to_worklist(&freefrag->ff_list, 0);
5428	}
5429	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) {
5430		newdirblk = WK_NEWDIRBLK(wk);
5431		WORKLIST_REMOVE(&newdirblk->db_list);
5432		if (!LIST_EMPTY(&newblk->nb_newdirblk))
5433			panic("free_newblk: extra newdirblk");
5434		free_newdirblk(newdirblk);
5435	}
5436	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) {
5437		indirdep->ir_state |= DEPCOMPLETE;
5438		indirdep_complete(indirdep);
5439	}
5440	KASSERT(newblk->nb_jnewblk == NULL,
5441	    ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk));
5442	handle_jwork(&newblk->nb_jwork);
5443	newblk->nb_list.wk_type = D_NEWBLK;
5444	WORKITEM_FREE(newblk, D_NEWBLK);
5445}
5446
5447/*
5448 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
5449 * This routine must be called with splbio interrupts blocked.
5450 */
5451static void
5452free_newdirblk(newdirblk)
5453	struct newdirblk *newdirblk;
5454{
5455	struct pagedep *pagedep;
5456	struct diradd *dap;
5457	struct worklist *wk;
5458	int i;
5459
5460	mtx_assert(&lk, MA_OWNED);
5461	/*
5462	 * If the pagedep is still linked onto the directory buffer
5463	 * dependency chain, then some of the entries on the
5464	 * pd_pendinghd list may not be committed to disk yet. In
5465	 * this case, we will simply clear the NEWBLOCK flag and
5466	 * let the pd_pendinghd list be processed when the pagedep
5467	 * is next written. If the pagedep is no longer on the buffer
5468	 * dependency chain, then all the entries on the pd_pending
5469	 * list are committed to disk and we can free them here.
5470	 */
5471	pagedep = newdirblk->db_pagedep;
5472	pagedep->pd_state &= ~NEWBLOCK;
5473	if ((pagedep->pd_state & ONWORKLIST) == 0)
5474		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
5475			free_diradd(dap, NULL);
5476	/*
5477	 * If no dependencies remain, the pagedep will be freed.
5478	 */
5479	for (i = 0; i < DAHASHSZ; i++)
5480		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
5481			break;
5482	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0 &&
5483	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
5484		KASSERT(LIST_FIRST(&pagedep->pd_dirremhd) == NULL,
5485		    ("free_newdirblk: Freeing non-free pagedep %p", pagedep));
5486		LIST_REMOVE(pagedep, pd_hash);
5487		WORKITEM_FREE(pagedep, D_PAGEDEP);
5488	}
5489	/* Should only ever be one item in the list. */
5490	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
5491		WORKLIST_REMOVE(wk);
5492		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
5493	}
5494	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
5495}
5496
5497/*
5498 * Prepare an inode to be freed. The actual free operation is not
5499 * done until the zero'ed inode has been written to disk.
5500 */
5501void
5502softdep_freefile(pvp, ino, mode)
5503	struct vnode *pvp;
5504	ino_t ino;
5505	int mode;
5506{
5507	struct inode *ip = VTOI(pvp);
5508	struct inodedep *inodedep;
5509	struct freefile *freefile;
5510
5511	/*
5512	 * This sets up the inode de-allocation dependency.
5513	 */
5514	freefile = malloc(sizeof(struct freefile),
5515		M_FREEFILE, M_SOFTDEP_FLAGS);
5516	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
5517	freefile->fx_mode = mode;
5518	freefile->fx_oldinum = ino;
5519	freefile->fx_devvp = ip->i_devvp;
5520	LIST_INIT(&freefile->fx_jwork);
5521	if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
5522		UFS_LOCK(ip->i_ump);
5523		ip->i_fs->fs_pendinginodes += 1;
5524		UFS_UNLOCK(ip->i_ump);
5525	}
5526
5527	/*
5528	 * If the inodedep does not exist, then the zero'ed inode has
5529	 * been written to disk. If the allocated inode has never been
5530	 * written to disk, then the on-disk inode is zero'ed. In either
5531	 * case we can free the file immediately.  If the journal was
5532	 * canceled before being written the inode will never make it to
5533	 * disk and we must send the canceled journal entrys to
5534	 * ffs_freefile() to be cleared in conjunction with the bitmap.
5535	 * Any blocks waiting on the inode to write can be safely freed
5536	 * here as it will never been written.
5537	 */
5538	ACQUIRE_LOCK(&lk);
5539	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5540	/*
5541	 * Remove this inode from the unlinked list and set
5542	 * GOINGAWAY as appropriate to indicate that this inode
5543	 * will never be written.
5544	 */
5545	if (inodedep && inodedep->id_state & UNLINKED) {
5546		/*
5547		 * Save the journal work to be freed with the bitmap
5548		 * before we clear UNLINKED.  Otherwise it can be lost
5549		 * if the inode block is written.
5550		 */
5551		handle_bufwait(inodedep, &freefile->fx_jwork);
5552		clear_unlinked_inodedep(inodedep);
5553		/* Re-acquire inodedep as we've dropped lk. */
5554		inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
5555		if (inodedep && (inodedep->id_state & DEPCOMPLETE) == 0)
5556			inodedep->id_state |= GOINGAWAY;
5557	}
5558	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
5559		FREE_LOCK(&lk);
5560		handle_workitem_freefile(freefile);
5561		return;
5562	}
5563	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
5564	FREE_LOCK(&lk);
5565	if (ip->i_number == ino)
5566		ip->i_flag |= IN_MODIFIED;
5567}
5568
5569/*
5570 * Check to see if an inode has never been written to disk. If
5571 * so free the inodedep and return success, otherwise return failure.
5572 * This routine must be called with splbio interrupts blocked.
5573 *
5574 * If we still have a bitmap dependency, then the inode has never
5575 * been written to disk. Drop the dependency as it is no longer
5576 * necessary since the inode is being deallocated. We set the
5577 * ALLCOMPLETE flags since the bitmap now properly shows that the
5578 * inode is not allocated. Even if the inode is actively being
5579 * written, it has been rolled back to its zero'ed state, so we
5580 * are ensured that a zero inode is what is on the disk. For short
5581 * lived files, this change will usually result in removing all the
5582 * dependencies from the inode so that it can be freed immediately.
5583 */
5584static int
5585check_inode_unwritten(inodedep)
5586	struct inodedep *inodedep;
5587{
5588
5589	mtx_assert(&lk, MA_OWNED);
5590
5591	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
5592	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
5593	    !LIST_EMPTY(&inodedep->id_bufwait) ||
5594	    !LIST_EMPTY(&inodedep->id_inowait) ||
5595	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5596	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
5597	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5598	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5599	    inodedep->id_mkdiradd != NULL ||
5600	    inodedep->id_nlinkdelta != 0)
5601		return (0);
5602	/*
5603	 * Another process might be in initiate_write_inodeblock_ufs[12]
5604	 * trying to allocate memory without holding "Softdep Lock".
5605	 */
5606	if ((inodedep->id_state & IOSTARTED) != 0 &&
5607	    inodedep->id_savedino1 == NULL)
5608		return (0);
5609
5610	if (inodedep->id_state & ONDEPLIST)
5611		LIST_REMOVE(inodedep, id_deps);
5612	inodedep->id_state &= ~ONDEPLIST;
5613	inodedep->id_state |= ALLCOMPLETE;
5614	inodedep->id_bmsafemap = NULL;
5615	if (inodedep->id_state & ONWORKLIST)
5616		WORKLIST_REMOVE(&inodedep->id_list);
5617	if (inodedep->id_savedino1 != NULL) {
5618		free(inodedep->id_savedino1, M_SAVEDINO);
5619		inodedep->id_savedino1 = NULL;
5620	}
5621	if (free_inodedep(inodedep) == 0)
5622		panic("check_inode_unwritten: busy inode");
5623	return (1);
5624}
5625
5626/*
5627 * Try to free an inodedep structure. Return 1 if it could be freed.
5628 */
5629static int
5630free_inodedep(inodedep)
5631	struct inodedep *inodedep;
5632{
5633
5634	mtx_assert(&lk, MA_OWNED);
5635	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
5636	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
5637	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
5638	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
5639	    !LIST_EMPTY(&inodedep->id_bufwait) ||
5640	    !LIST_EMPTY(&inodedep->id_inowait) ||
5641	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
5642	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5643	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
5644	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5645	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5646	    inodedep->id_mkdiradd != NULL ||
5647	    inodedep->id_nlinkdelta != 0 ||
5648	    inodedep->id_savedino1 != NULL)
5649		return (0);
5650	if (inodedep->id_state & ONDEPLIST)
5651		LIST_REMOVE(inodedep, id_deps);
5652	LIST_REMOVE(inodedep, id_hash);
5653	WORKITEM_FREE(inodedep, D_INODEDEP);
5654	num_inodedep -= 1;
5655	return (1);
5656}
5657
5658/*
5659 * Free the block referenced by a freework structure.  The parent freeblks
5660 * structure is released and completed when the final cg bitmap reaches
5661 * the disk.  This routine may be freeing a jnewblk which never made it to
5662 * disk in which case we do not have to wait as the operation is undone
5663 * in memory immediately.
5664 */
5665static void
5666freework_freeblock(freework)
5667	struct freework *freework;
5668{
5669	struct freeblks *freeblks;
5670	struct ufsmount *ump;
5671	struct workhead wkhd;
5672	struct fs *fs;
5673	int complete;
5674	int pending;
5675	int bsize;
5676	int needj;
5677
5678	freeblks = freework->fw_freeblks;
5679	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5680	fs = ump->um_fs;
5681	needj = freeblks->fb_list.wk_mp->mnt_kern_flag & MNTK_SUJ;
5682	complete = 0;
5683	LIST_INIT(&wkhd);
5684	/*
5685	 * If we are canceling an existing jnewblk pass it to the free
5686	 * routine, otherwise pass the freeblk which will ultimately
5687	 * release the freeblks.  If we're not journaling, we can just
5688	 * free the freeblks immediately.
5689	 */
5690	if (!LIST_EMPTY(&freework->fw_jwork)) {
5691		LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
5692		complete = 1;
5693	} else if (needj)
5694		WORKLIST_INSERT_UNLOCKED(&wkhd, &freework->fw_list);
5695	bsize = lfragtosize(fs, freework->fw_frags);
5696	pending = btodb(bsize);
5697	ACQUIRE_LOCK(&lk);
5698	freeblks->fb_chkcnt -= pending;
5699	FREE_LOCK(&lk);
5700	/*
5701	 * extattr blocks don't show up in pending blocks.  XXX why?
5702	 */
5703	if (freework->fw_lbn >= 0 || freework->fw_lbn <= -NDADDR) {
5704		UFS_LOCK(ump);
5705		fs->fs_pendingblocks -= pending;
5706		UFS_UNLOCK(ump);
5707	}
5708	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno,
5709	    bsize, freeblks->fb_previousinum, &wkhd);
5710	if (complete == 0 && needj)
5711		return;
5712	/*
5713	 * The jnewblk will be discarded and the bits in the map never
5714	 * made it to disk.  We can immediately free the freeblk.
5715	 */
5716	ACQUIRE_LOCK(&lk);
5717	handle_written_freework(freework);
5718	FREE_LOCK(&lk);
5719}
5720
5721/*
5722 * Start, continue, or finish the process of freeing an indirect block tree.
5723 * The free operation may be paused at any point with fw_off containing the
5724 * offset to restart from.  This enables us to implement some flow control
5725 * for large truncates which may fan out and generate a huge number of
5726 * dependencies.
5727 */
5728static void
5729handle_workitem_indirblk(freework)
5730	struct freework *freework;
5731{
5732	struct freeblks *freeblks;
5733	struct ufsmount *ump;
5734	struct fs *fs;
5735
5736
5737	freeblks = freework->fw_freeblks;
5738	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5739	fs = ump->um_fs;
5740	if (freework->fw_off == NINDIR(fs))
5741		freework_freeblock(freework);
5742	else
5743		indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
5744		    freework->fw_lbn);
5745}
5746
5747/*
5748 * Called when a freework structure attached to a cg buf is written.  The
5749 * ref on either the parent or the freeblks structure is released and
5750 * either may be added to the worklist if it is the final ref.
5751 */
5752static void
5753handle_written_freework(freework)
5754	struct freework *freework;
5755{
5756	struct freeblks *freeblks;
5757	struct freework *parent;
5758
5759	freeblks = freework->fw_freeblks;
5760	parent = freework->fw_parent;
5761	if (parent) {
5762		if (--parent->fw_ref != 0)
5763			parent = NULL;
5764		freeblks = NULL;
5765	} else if (--freeblks->fb_ref != 0)
5766		freeblks = NULL;
5767	WORKITEM_FREE(freework, D_FREEWORK);
5768	/*
5769	 * Don't delay these block frees or it takes an intolerable amount
5770	 * of time to process truncates and free their journal entries.
5771	 */
5772	if (freeblks)
5773		add_to_worklist(&freeblks->fb_list, 1);
5774	if (parent)
5775		add_to_worklist(&parent->fw_list, 1);
5776}
5777
5778/*
5779 * This workitem routine performs the block de-allocation.
5780 * The workitem is added to the pending list after the updated
5781 * inode block has been written to disk.  As mentioned above,
5782 * checks regarding the number of blocks de-allocated (compared
5783 * to the number of blocks allocated for the file) are also
5784 * performed in this function.
5785 */
5786static void
5787handle_workitem_freeblocks(freeblks, flags)
5788	struct freeblks *freeblks;
5789	int flags;
5790{
5791	struct freework *freework;
5792	struct worklist *wk;
5793
5794	KASSERT(LIST_EMPTY(&freeblks->fb_jfreeblkhd),
5795	    ("handle_workitem_freeblocks: Journal entries not written."));
5796	if (LIST_EMPTY(&freeblks->fb_freeworkhd)) {
5797		handle_complete_freeblocks(freeblks);
5798		return;
5799	}
5800	freeblks->fb_ref++;
5801	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
5802		KASSERT(wk->wk_type == D_FREEWORK,
5803		    ("handle_workitem_freeblocks: Unknown type %s",
5804		    TYPENAME(wk->wk_type)));
5805		WORKLIST_REMOVE_UNLOCKED(wk);
5806		freework = WK_FREEWORK(wk);
5807		if (freework->fw_lbn <= -NDADDR)
5808			handle_workitem_indirblk(freework);
5809		else
5810			freework_freeblock(freework);
5811	}
5812	ACQUIRE_LOCK(&lk);
5813	if (--freeblks->fb_ref != 0)
5814		freeblks = NULL;
5815	FREE_LOCK(&lk);
5816	if (freeblks)
5817		handle_complete_freeblocks(freeblks);
5818}
5819
5820/*
5821 * Once all of the freework workitems are complete we can retire the
5822 * freeblocks dependency and any journal work awaiting completion.  This
5823 * can not be called until all other dependencies are stable on disk.
5824 */
5825static void
5826handle_complete_freeblocks(freeblks)
5827	struct freeblks *freeblks;
5828{
5829	struct inode *ip;
5830	struct vnode *vp;
5831	struct fs *fs;
5832	struct ufsmount *ump;
5833	int flags;
5834
5835	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5836	fs = ump->um_fs;
5837	flags = LK_NOWAIT;
5838
5839	/*
5840	 * If we still have not finished background cleanup, then check
5841	 * to see if the block count needs to be adjusted.
5842	 */
5843	if (freeblks->fb_chkcnt != 0 && (fs->fs_flags & FS_UNCLEAN) != 0 &&
5844	    ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
5845	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ) == 0) {
5846		ip = VTOI(vp);
5847		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + freeblks->fb_chkcnt);
5848		ip->i_flag |= IN_CHANGE;
5849		vput(vp);
5850	}
5851
5852#ifdef INVARIANTS
5853	if (freeblks->fb_chkcnt != 0 &&
5854	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
5855		printf("handle_workitem_freeblocks: block count\n");
5856#endif /* INVARIANTS */
5857
5858	ACQUIRE_LOCK(&lk);
5859	/*
5860	 * All of the freeblock deps must be complete prior to this call
5861	 * so it's now safe to complete earlier outstanding journal entries.
5862	 */
5863	handle_jwork(&freeblks->fb_jwork);
5864	WORKITEM_FREE(freeblks, D_FREEBLKS);
5865	num_freeblkdep--;
5866	FREE_LOCK(&lk);
5867}
5868
5869/*
5870 * Release blocks associated with the inode ip and stored in the indirect
5871 * block dbn. If level is greater than SINGLE, the block is an indirect block
5872 * and recursive calls to indirtrunc must be used to cleanse other indirect
5873 * blocks.
5874 */
5875static void
5876indir_trunc(freework, dbn, lbn)
5877	struct freework *freework;
5878	ufs2_daddr_t dbn;
5879	ufs_lbn_t lbn;
5880{
5881	struct freework *nfreework;
5882	struct workhead wkhd;
5883	struct jnewblk *jnewblk;
5884	struct freeblks *freeblks;
5885	struct buf *bp;
5886	struct fs *fs;
5887	struct worklist *wkn;
5888	struct worklist *wk;
5889	struct indirdep *indirdep;
5890	struct ufsmount *ump;
5891	ufs1_daddr_t *bap1 = 0;
5892	ufs2_daddr_t nb, nnb, *bap2 = 0;
5893	ufs_lbn_t lbnadd;
5894	int i, nblocks, ufs1fmt;
5895	int fs_pendingblocks;
5896	int freedeps;
5897	int needj;
5898	int level;
5899	int cnt;
5900
5901	LIST_INIT(&wkhd);
5902	level = lbn_level(lbn);
5903	if (level == -1)
5904		panic("indir_trunc: Invalid lbn %jd\n", lbn);
5905	freeblks = freework->fw_freeblks;
5906	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
5907	fs = ump->um_fs;
5908	fs_pendingblocks = 0;
5909	freedeps = 0;
5910	needj = UFSTOVFS(ump)->mnt_kern_flag & MNTK_SUJ;
5911	lbnadd = 1;
5912	for (i = level; i > 0; i--)
5913		lbnadd *= NINDIR(fs);
5914	/*
5915	 * Get buffer of block pointers to be freed. This routine is not
5916	 * called until the zero'ed inode has been written, so it is safe
5917	 * to free blocks as they are encountered. Because the inode has
5918	 * been zero'ed, calls to bmap on these blocks will fail. So, we
5919	 * have to use the on-disk address and the block device for the
5920	 * filesystem to look them up. If the file was deleted before its
5921	 * indirect blocks were all written to disk, the routine that set
5922	 * us up (deallocate_dependencies) will have arranged to leave
5923	 * a complete copy of the indirect block in memory for our use.
5924	 * Otherwise we have to read the blocks in from the disk.
5925	 */
5926#ifdef notyet
5927	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
5928	    GB_NOCREAT);
5929#else
5930	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
5931#endif
5932	ACQUIRE_LOCK(&lk);
5933	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
5934		if (wk->wk_type != D_INDIRDEP ||
5935		    (wk->wk_state & GOINGAWAY) == 0)
5936			panic("indir_trunc: lost indirdep %p", wk);
5937		indirdep = WK_INDIRDEP(wk);
5938		LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
5939		free_indirdep(indirdep);
5940		if (!LIST_EMPTY(&bp->b_dep))
5941			panic("indir_trunc: dangling dep %p",
5942			    LIST_FIRST(&bp->b_dep));
5943		ump->um_numindirdeps -= 1;
5944		FREE_LOCK(&lk);
5945	} else {
5946#ifdef notyet
5947		if (bp)
5948			brelse(bp);
5949#endif
5950		FREE_LOCK(&lk);
5951		if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
5952		    NOCRED, &bp) != 0) {
5953			brelse(bp);
5954			return;
5955		}
5956	}
5957	/*
5958	 * Recursively free indirect blocks.
5959	 */
5960	if (ump->um_fstype == UFS1) {
5961		ufs1fmt = 1;
5962		bap1 = (ufs1_daddr_t *)bp->b_data;
5963	} else {
5964		ufs1fmt = 0;
5965		bap2 = (ufs2_daddr_t *)bp->b_data;
5966	}
5967	/*
5968	 * Reclaim indirect blocks which never made it to disk.
5969	 */
5970	cnt = 0;
5971	LIST_FOREACH_SAFE(wk, &wkhd, wk_list, wkn) {
5972		struct workhead freewk;
5973		if (wk->wk_type != D_JNEWBLK)
5974			continue;
5975		WORKLIST_REMOVE_UNLOCKED(wk);
5976		LIST_INIT(&freewk);
5977		WORKLIST_INSERT_UNLOCKED(&freewk, wk);
5978		jnewblk = WK_JNEWBLK(wk);
5979		if (jnewblk->jn_lbn > 0)
5980			i = (jnewblk->jn_lbn - -lbn) / lbnadd;
5981		else
5982			i = (jnewblk->jn_lbn - (lbn + 1)) / lbnadd;
5983		KASSERT(i >= 0 && i < NINDIR(fs),
5984		    ("indir_trunc: Index out of range %d parent %jd lbn %jd",
5985		    i, lbn, jnewblk->jn_lbn));
5986		/* Clear the pointer so it isn't found below. */
5987		if (ufs1fmt) {
5988			nb = bap1[i];
5989			bap1[i] = 0;
5990		} else {
5991			nb = bap2[i];
5992			bap2[i] = 0;
5993		}
5994		KASSERT(nb == jnewblk->jn_blkno,
5995		    ("indir_trunc: Block mismatch %jd != %jd",
5996		    nb, jnewblk->jn_blkno));
5997		ffs_blkfree(ump, fs, freeblks->fb_devvp, jnewblk->jn_blkno,
5998		    fs->fs_bsize, freeblks->fb_previousinum, &freewk);
5999		cnt++;
6000	}
6001	ACQUIRE_LOCK(&lk);
6002	if (needj)
6003		freework->fw_ref += NINDIR(fs) + 1;
6004	/* Any remaining journal work can be completed with freeblks. */
6005	jwork_move(&freeblks->fb_jwork, &wkhd);
6006	FREE_LOCK(&lk);
6007	nblocks = btodb(fs->fs_bsize);
6008	if (ufs1fmt)
6009		nb = bap1[0];
6010	else
6011		nb = bap2[0];
6012	nfreework = freework;
6013	/*
6014	 * Reclaim on disk blocks.
6015	 */
6016	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
6017		if (i != NINDIR(fs) - 1) {
6018			if (ufs1fmt)
6019				nnb = bap1[i+1];
6020			else
6021				nnb = bap2[i+1];
6022		} else
6023			nnb = 0;
6024		if (nb == 0)
6025			continue;
6026		cnt++;
6027		if (level != 0) {
6028			ufs_lbn_t nlbn;
6029
6030			nlbn = (lbn + 1) - (i * lbnadd);
6031			if (needj != 0) {
6032				nfreework = newfreework(freeblks, freework,
6033				    nlbn, nb, fs->fs_frag, 0);
6034				freedeps++;
6035			}
6036			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
6037		} else {
6038			struct freedep *freedep;
6039
6040			/*
6041			 * Attempt to aggregate freedep dependencies for
6042			 * all blocks being released to the same CG.
6043			 */
6044			LIST_INIT(&wkhd);
6045			if (needj != 0 &&
6046			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
6047				freedep = newfreedep(freework);
6048				WORKLIST_INSERT_UNLOCKED(&wkhd,
6049				    &freedep->fd_list);
6050				freedeps++;
6051			}
6052			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
6053			    fs->fs_bsize, freeblks->fb_previousinum, &wkhd);
6054		}
6055	}
6056	if (level == 0)
6057		fs_pendingblocks = (nblocks * cnt);
6058	/*
6059	 * If we're not journaling we can free the indirect now.  Otherwise
6060	 * setup the ref counts and offset so this indirect can be completed
6061	 * when its children are free.
6062	 */
6063	if (needj == 0) {
6064		fs_pendingblocks += nblocks;
6065		dbn = dbtofsb(fs, dbn);
6066		ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
6067		    freeblks->fb_previousinum, NULL);
6068		ACQUIRE_LOCK(&lk);
6069		freeblks->fb_chkcnt -= fs_pendingblocks;
6070		if (freework->fw_blkno == dbn)
6071			handle_written_freework(freework);
6072		FREE_LOCK(&lk);
6073		freework = NULL;
6074	} else {
6075		ACQUIRE_LOCK(&lk);
6076		freework->fw_off = i;
6077		freework->fw_ref += freedeps;
6078		freework->fw_ref -= NINDIR(fs) + 1;
6079		if (freework->fw_ref != 0)
6080			freework = NULL;
6081		freeblks->fb_chkcnt -= fs_pendingblocks;
6082		FREE_LOCK(&lk);
6083	}
6084	if (fs_pendingblocks) {
6085		UFS_LOCK(ump);
6086		fs->fs_pendingblocks -= fs_pendingblocks;
6087		UFS_UNLOCK(ump);
6088	}
6089	bp->b_flags |= B_INVAL | B_NOCACHE;
6090	brelse(bp);
6091	if (freework)
6092		handle_workitem_indirblk(freework);
6093	return;
6094}
6095
6096/*
6097 * Cancel an allocindir when it is removed via truncation.
6098 */
6099static void
6100cancel_allocindir(aip, inodedep, freeblks)
6101	struct allocindir *aip;
6102	struct inodedep *inodedep;
6103	struct freeblks *freeblks;
6104{
6105	struct newblk *newblk;
6106
6107	/*
6108	 * If the journal hasn't been written the jnewblk must be passed
6109	 * to the call to ffs_freeblk that reclaims the space.  We accomplish
6110	 * this by linking the journal dependency into the indirdep to be
6111	 * freed when indir_trunc() is called.  If the journal has already
6112	 * been written we can simply reclaim the journal space when the
6113	 * freeblks work is complete.
6114	 */
6115	LIST_REMOVE(aip, ai_next);
6116	newblk = (struct newblk *)aip;
6117	if (newblk->nb_jnewblk == NULL)
6118		cancel_newblk(newblk, &freeblks->fb_jwork);
6119	else
6120		cancel_newblk(newblk, &aip->ai_indirdep->ir_jwork);
6121	if (inodedep && inodedep->id_state & DEPCOMPLETE)
6122		WORKLIST_INSERT(&inodedep->id_bufwait, &newblk->nb_list);
6123	else
6124		free_newblk(newblk);
6125}
6126
6127/*
6128 * Create the mkdir dependencies for . and .. in a new directory.  Link them
6129 * in to a newdirblk so any subsequent additions are tracked properly.  The
6130 * caller is responsible for adding the mkdir1 dependency to the journal
6131 * and updating id_mkdiradd.  This function returns with lk held.
6132 */
6133static struct mkdir *
6134setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
6135	struct diradd *dap;
6136	ino_t newinum;
6137	ino_t dinum;
6138	struct buf *newdirbp;
6139	struct mkdir **mkdirp;
6140{
6141	struct newblk *newblk;
6142	struct pagedep *pagedep;
6143	struct inodedep *inodedep;
6144	struct newdirblk *newdirblk = 0;
6145	struct mkdir *mkdir1, *mkdir2;
6146	struct worklist *wk;
6147	struct jaddref *jaddref;
6148	struct mount *mp;
6149
6150	mp = dap->da_list.wk_mp;
6151	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
6152	    M_SOFTDEP_FLAGS);
6153	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6154	LIST_INIT(&newdirblk->db_mkdir);
6155	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6156	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
6157	mkdir1->md_state = ATTACHED | MKDIR_BODY;
6158	mkdir1->md_diradd = dap;
6159	mkdir1->md_jaddref = NULL;
6160	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
6161	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
6162	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
6163	mkdir2->md_diradd = dap;
6164	mkdir2->md_jaddref = NULL;
6165	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) {
6166		mkdir1->md_state |= DEPCOMPLETE;
6167		mkdir2->md_state |= DEPCOMPLETE;
6168	}
6169	/*
6170	 * Dependency on "." and ".." being written to disk.
6171	 */
6172	mkdir1->md_buf = newdirbp;
6173	ACQUIRE_LOCK(&lk);
6174	LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
6175	/*
6176	 * We must link the pagedep, allocdirect, and newdirblk for
6177	 * the initial file page so the pointer to the new directory
6178	 * is not written until the directory contents are live and
6179	 * any subsequent additions are not marked live until the
6180	 * block is reachable via the inode.
6181	 */
6182	if (pagedep_lookup(mp, newinum, 0, 0, &pagedep) == 0)
6183		panic("setup_newdir: lost pagedep");
6184	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
6185		if (wk->wk_type == D_ALLOCDIRECT)
6186			break;
6187	if (wk == NULL)
6188		panic("setup_newdir: lost allocdirect");
6189	newblk = WK_NEWBLK(wk);
6190	pagedep->pd_state |= NEWBLOCK;
6191	pagedep->pd_newdirblk = newdirblk;
6192	newdirblk->db_pagedep = pagedep;
6193	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6194	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
6195	/*
6196	 * Look up the inodedep for the parent directory so that we
6197	 * can link mkdir2 into the pending dotdot jaddref or
6198	 * the inode write if there is none.  If the inode is
6199	 * ALLCOMPLETE and no jaddref is present all dependencies have
6200	 * been satisfied and mkdir2 can be freed.
6201	 */
6202	inodedep_lookup(mp, dinum, 0, &inodedep);
6203	if (mp->mnt_kern_flag & MNTK_SUJ) {
6204		if (inodedep == NULL)
6205			panic("setup_newdir: Lost parent.");
6206		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6207		    inoreflst);
6208		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
6209		    (jaddref->ja_state & MKDIR_PARENT),
6210		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
6211		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6212		mkdir2->md_jaddref = jaddref;
6213		jaddref->ja_mkdir = mkdir2;
6214	} else if (inodedep == NULL ||
6215	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
6216		dap->da_state &= ~MKDIR_PARENT;
6217		WORKITEM_FREE(mkdir2, D_MKDIR);
6218	} else {
6219		LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
6220		WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
6221	}
6222	*mkdirp = mkdir2;
6223
6224	return (mkdir1);
6225}
6226
6227/*
6228 * Directory entry addition dependencies.
6229 *
6230 * When adding a new directory entry, the inode (with its incremented link
6231 * count) must be written to disk before the directory entry's pointer to it.
6232 * Also, if the inode is newly allocated, the corresponding freemap must be
6233 * updated (on disk) before the directory entry's pointer. These requirements
6234 * are met via undo/redo on the directory entry's pointer, which consists
6235 * simply of the inode number.
6236 *
6237 * As directory entries are added and deleted, the free space within a
6238 * directory block can become fragmented.  The ufs filesystem will compact
6239 * a fragmented directory block to make space for a new entry. When this
6240 * occurs, the offsets of previously added entries change. Any "diradd"
6241 * dependency structures corresponding to these entries must be updated with
6242 * the new offsets.
6243 */
6244
6245/*
6246 * This routine is called after the in-memory inode's link
6247 * count has been incremented, but before the directory entry's
6248 * pointer to the inode has been set.
6249 */
6250int
6251softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
6252	struct buf *bp;		/* buffer containing directory block */
6253	struct inode *dp;	/* inode for directory */
6254	off_t diroffset;	/* offset of new entry in directory */
6255	ino_t newinum;		/* inode referenced by new directory entry */
6256	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
6257	int isnewblk;		/* entry is in a newly allocated block */
6258{
6259	int offset;		/* offset of new entry within directory block */
6260	ufs_lbn_t lbn;		/* block in directory containing new entry */
6261	struct fs *fs;
6262	struct diradd *dap;
6263	struct newblk *newblk;
6264	struct pagedep *pagedep;
6265	struct inodedep *inodedep;
6266	struct newdirblk *newdirblk = 0;
6267	struct mkdir *mkdir1, *mkdir2;
6268	struct jaddref *jaddref;
6269	struct mount *mp;
6270	int isindir;
6271
6272	/*
6273	 * Whiteouts have no dependencies.
6274	 */
6275	if (newinum == WINO) {
6276		if (newdirbp != NULL)
6277			bdwrite(newdirbp);
6278		return (0);
6279	}
6280	jaddref = NULL;
6281	mkdir1 = mkdir2 = NULL;
6282	mp = UFSTOVFS(dp->i_ump);
6283	fs = dp->i_fs;
6284	lbn = lblkno(fs, diroffset);
6285	offset = blkoff(fs, diroffset);
6286	dap = malloc(sizeof(struct diradd), M_DIRADD,
6287		M_SOFTDEP_FLAGS|M_ZERO);
6288	workitem_alloc(&dap->da_list, D_DIRADD, mp);
6289	dap->da_offset = offset;
6290	dap->da_newinum = newinum;
6291	dap->da_state = ATTACHED;
6292	LIST_INIT(&dap->da_jwork);
6293	isindir = bp->b_lblkno >= NDADDR;
6294	if (isnewblk &&
6295	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
6296		newdirblk = malloc(sizeof(struct newdirblk),
6297		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
6298		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
6299		LIST_INIT(&newdirblk->db_mkdir);
6300	}
6301	/*
6302	 * If we're creating a new directory setup the dependencies and set
6303	 * the dap state to wait for them.  Otherwise it's COMPLETE and
6304	 * we can move on.
6305	 */
6306	if (newdirbp == NULL) {
6307		dap->da_state |= DEPCOMPLETE;
6308		ACQUIRE_LOCK(&lk);
6309	} else {
6310		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
6311		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
6312		    &mkdir2);
6313	}
6314	/*
6315	 * Link into parent directory pagedep to await its being written.
6316	 */
6317	if (pagedep_lookup(mp, dp->i_number, lbn, DEPALLOC, &pagedep) == 0)
6318		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6319#ifdef DEBUG
6320	if (diradd_lookup(pagedep, offset) != NULL)
6321		panic("softdep_setup_directory_add: %p already at off %d\n",
6322		    diradd_lookup(pagedep, offset), offset);
6323#endif
6324	dap->da_pagedep = pagedep;
6325	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
6326	    da_pdlist);
6327	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
6328	/*
6329	 * If we're journaling, link the diradd into the jaddref so it
6330	 * may be completed after the journal entry is written.  Otherwise,
6331	 * link the diradd into its inodedep.  If the inode is not yet
6332	 * written place it on the bufwait list, otherwise do the post-inode
6333	 * write processing to put it on the id_pendinghd list.
6334	 */
6335	if (mp->mnt_kern_flag & MNTK_SUJ) {
6336		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
6337		    inoreflst);
6338		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
6339		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
6340		jaddref->ja_diroff = diroffset;
6341		jaddref->ja_diradd = dap;
6342		add_to_journal(&jaddref->ja_list);
6343	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
6344		diradd_inode_written(dap, inodedep);
6345	else
6346		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
6347	/*
6348	 * Add the journal entries for . and .. links now that the primary
6349	 * link is written.
6350	 */
6351	if (mkdir1 != NULL && mp->mnt_kern_flag & MNTK_SUJ) {
6352		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
6353		    inoreflst, if_deps);
6354		KASSERT(jaddref != NULL &&
6355		    jaddref->ja_ino == jaddref->ja_parent &&
6356		    (jaddref->ja_state & MKDIR_BODY),
6357		    ("softdep_setup_directory_add: bad dot jaddref %p",
6358		    jaddref));
6359		mkdir1->md_jaddref = jaddref;
6360		jaddref->ja_mkdir = mkdir1;
6361		/*
6362		 * It is important that the dotdot journal entry
6363		 * is added prior to the dot entry since dot writes
6364		 * both the dot and dotdot links.  These both must
6365		 * be added after the primary link for the journal
6366		 * to remain consistent.
6367		 */
6368		add_to_journal(&mkdir2->md_jaddref->ja_list);
6369		add_to_journal(&jaddref->ja_list);
6370	}
6371	/*
6372	 * If we are adding a new directory remember this diradd so that if
6373	 * we rename it we can keep the dot and dotdot dependencies.  If
6374	 * we are adding a new name for an inode that has a mkdiradd we
6375	 * must be in rename and we have to move the dot and dotdot
6376	 * dependencies to this new name.  The old name is being orphaned
6377	 * soon.
6378	 */
6379	if (mkdir1 != NULL) {
6380		if (inodedep->id_mkdiradd != NULL)
6381			panic("softdep_setup_directory_add: Existing mkdir");
6382		inodedep->id_mkdiradd = dap;
6383	} else if (inodedep->id_mkdiradd)
6384		merge_diradd(inodedep, dap);
6385	if (newdirblk) {
6386		/*
6387		 * There is nothing to do if we are already tracking
6388		 * this block.
6389		 */
6390		if ((pagedep->pd_state & NEWBLOCK) != 0) {
6391			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
6392			FREE_LOCK(&lk);
6393			return (0);
6394		}
6395		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
6396		    == 0)
6397			panic("softdep_setup_directory_add: lost entry");
6398		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
6399		pagedep->pd_state |= NEWBLOCK;
6400		pagedep->pd_newdirblk = newdirblk;
6401		newdirblk->db_pagedep = pagedep;
6402		FREE_LOCK(&lk);
6403		/*
6404		 * If we extended into an indirect signal direnter to sync.
6405		 */
6406		if (isindir)
6407			return (1);
6408		return (0);
6409	}
6410	FREE_LOCK(&lk);
6411	return (0);
6412}
6413
6414/*
6415 * This procedure is called to change the offset of a directory
6416 * entry when compacting a directory block which must be owned
6417 * exclusively by the caller. Note that the actual entry movement
6418 * must be done in this procedure to ensure that no I/O completions
6419 * occur while the move is in progress.
6420 */
6421void
6422softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
6423	struct buf *bp;		/* Buffer holding directory block. */
6424	struct inode *dp;	/* inode for directory */
6425	caddr_t base;		/* address of dp->i_offset */
6426	caddr_t oldloc;		/* address of old directory location */
6427	caddr_t newloc;		/* address of new directory location */
6428	int entrysize;		/* size of directory entry */
6429{
6430	int offset, oldoffset, newoffset;
6431	struct pagedep *pagedep;
6432	struct jmvref *jmvref;
6433	struct diradd *dap;
6434	struct direct *de;
6435	struct mount *mp;
6436	ufs_lbn_t lbn;
6437	int flags;
6438
6439	mp = UFSTOVFS(dp->i_ump);
6440	de = (struct direct *)oldloc;
6441	jmvref = NULL;
6442	flags = 0;
6443	/*
6444	 * Moves are always journaled as it would be too complex to
6445	 * determine if any affected adds or removes are present in the
6446	 * journal.
6447	 */
6448	if (mp->mnt_kern_flag & MNTK_SUJ)  {
6449		flags = DEPALLOC;
6450		jmvref = newjmvref(dp, de->d_ino,
6451		    dp->i_offset + (oldloc - base),
6452		    dp->i_offset + (newloc - base));
6453	}
6454	lbn = lblkno(dp->i_fs, dp->i_offset);
6455	offset = blkoff(dp->i_fs, dp->i_offset);
6456	oldoffset = offset + (oldloc - base);
6457	newoffset = offset + (newloc - base);
6458	ACQUIRE_LOCK(&lk);
6459	if (pagedep_lookup(mp, dp->i_number, lbn, flags, &pagedep) == 0) {
6460		if (pagedep)
6461			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6462		goto done;
6463	}
6464	dap = diradd_lookup(pagedep, oldoffset);
6465	if (dap) {
6466		dap->da_offset = newoffset;
6467		newoffset = DIRADDHASH(newoffset);
6468		oldoffset = DIRADDHASH(oldoffset);
6469		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
6470		    newoffset != oldoffset) {
6471			LIST_REMOVE(dap, da_pdlist);
6472			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
6473			    dap, da_pdlist);
6474		}
6475	}
6476done:
6477	if (jmvref) {
6478		jmvref->jm_pagedep = pagedep;
6479		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
6480		add_to_journal(&jmvref->jm_list);
6481	}
6482	bcopy(oldloc, newloc, entrysize);
6483	FREE_LOCK(&lk);
6484}
6485
6486/*
6487 * Move the mkdir dependencies and journal work from one diradd to another
6488 * when renaming a directory.  The new name must depend on the mkdir deps
6489 * completing as the old name did.  Directories can only have one valid link
6490 * at a time so one must be canonical.
6491 */
6492static void
6493merge_diradd(inodedep, newdap)
6494	struct inodedep *inodedep;
6495	struct diradd *newdap;
6496{
6497	struct diradd *olddap;
6498	struct mkdir *mkdir, *nextmd;
6499	short state;
6500
6501	olddap = inodedep->id_mkdiradd;
6502	inodedep->id_mkdiradd = newdap;
6503	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6504		newdap->da_state &= ~DEPCOMPLETE;
6505		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
6506			nextmd = LIST_NEXT(mkdir, md_mkdirs);
6507			if (mkdir->md_diradd != olddap)
6508				continue;
6509			mkdir->md_diradd = newdap;
6510			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
6511			newdap->da_state |= state;
6512			olddap->da_state &= ~state;
6513			if ((olddap->da_state &
6514			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
6515				break;
6516		}
6517		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
6518			panic("merge_diradd: unfound ref");
6519	}
6520	/*
6521	 * Any mkdir related journal items are not safe to be freed until
6522	 * the new name is stable.
6523	 */
6524	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
6525	olddap->da_state |= DEPCOMPLETE;
6526	complete_diradd(olddap);
6527}
6528
6529/*
6530 * Move the diradd to the pending list when all diradd dependencies are
6531 * complete.
6532 */
6533static void
6534complete_diradd(dap)
6535	struct diradd *dap;
6536{
6537	struct pagedep *pagedep;
6538
6539	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
6540		if (dap->da_state & DIRCHG)
6541			pagedep = dap->da_previous->dm_pagedep;
6542		else
6543			pagedep = dap->da_pagedep;
6544		LIST_REMOVE(dap, da_pdlist);
6545		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
6546	}
6547}
6548
6549/*
6550 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
6551 * add entries and conditonally journal the remove.
6552 */
6553static void
6554cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
6555	struct diradd *dap;
6556	struct dirrem *dirrem;
6557	struct jremref *jremref;
6558	struct jremref *dotremref;
6559	struct jremref *dotdotremref;
6560{
6561	struct inodedep *inodedep;
6562	struct jaddref *jaddref;
6563	struct inoref *inoref;
6564	struct mkdir *mkdir;
6565
6566	/*
6567	 * If no remove references were allocated we're on a non-journaled
6568	 * filesystem and can skip the cancel step.
6569	 */
6570	if (jremref == NULL) {
6571		free_diradd(dap, NULL);
6572		return;
6573	}
6574	/*
6575	 * Cancel the primary name an free it if it does not require
6576	 * journaling.
6577	 */
6578	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
6579	    0, &inodedep) != 0) {
6580		/* Abort the addref that reference this diradd.  */
6581		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
6582			if (inoref->if_list.wk_type != D_JADDREF)
6583				continue;
6584			jaddref = (struct jaddref *)inoref;
6585			if (jaddref->ja_diradd != dap)
6586				continue;
6587			if (cancel_jaddref(jaddref, inodedep,
6588			    &dirrem->dm_jwork) == 0) {
6589				free_jremref(jremref);
6590				jremref = NULL;
6591			}
6592			break;
6593		}
6594	}
6595	/*
6596	 * Cancel subordinate names and free them if they do not require
6597	 * journaling.
6598	 */
6599	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6600		LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
6601			if (mkdir->md_diradd != dap)
6602				continue;
6603			if ((jaddref = mkdir->md_jaddref) == NULL)
6604				continue;
6605			mkdir->md_jaddref = NULL;
6606			if (mkdir->md_state & MKDIR_PARENT) {
6607				if (cancel_jaddref(jaddref, NULL,
6608				    &dirrem->dm_jwork) == 0) {
6609					free_jremref(dotdotremref);
6610					dotdotremref = NULL;
6611				}
6612			} else {
6613				if (cancel_jaddref(jaddref, inodedep,
6614				    &dirrem->dm_jwork) == 0) {
6615					free_jremref(dotremref);
6616					dotremref = NULL;
6617				}
6618			}
6619		}
6620	}
6621
6622	if (jremref)
6623		journal_jremref(dirrem, jremref, inodedep);
6624	if (dotremref)
6625		journal_jremref(dirrem, dotremref, inodedep);
6626	if (dotdotremref)
6627		journal_jremref(dirrem, dotdotremref, NULL);
6628	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
6629	free_diradd(dap, &dirrem->dm_jwork);
6630}
6631
6632/*
6633 * Free a diradd dependency structure. This routine must be called
6634 * with splbio interrupts blocked.
6635 */
6636static void
6637free_diradd(dap, wkhd)
6638	struct diradd *dap;
6639	struct workhead *wkhd;
6640{
6641	struct dirrem *dirrem;
6642	struct pagedep *pagedep;
6643	struct inodedep *inodedep;
6644	struct mkdir *mkdir, *nextmd;
6645
6646	mtx_assert(&lk, MA_OWNED);
6647	LIST_REMOVE(dap, da_pdlist);
6648	if (dap->da_state & ONWORKLIST)
6649		WORKLIST_REMOVE(&dap->da_list);
6650	if ((dap->da_state & DIRCHG) == 0) {
6651		pagedep = dap->da_pagedep;
6652	} else {
6653		dirrem = dap->da_previous;
6654		pagedep = dirrem->dm_pagedep;
6655		dirrem->dm_dirinum = pagedep->pd_ino;
6656		dirrem->dm_state |= COMPLETE;
6657		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
6658			add_to_worklist(&dirrem->dm_list, 0);
6659	}
6660	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
6661	    0, &inodedep) != 0)
6662		if (inodedep->id_mkdiradd == dap)
6663			inodedep->id_mkdiradd = NULL;
6664	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
6665		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
6666			nextmd = LIST_NEXT(mkdir, md_mkdirs);
6667			if (mkdir->md_diradd != dap)
6668				continue;
6669			dap->da_state &=
6670			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
6671			LIST_REMOVE(mkdir, md_mkdirs);
6672			if (mkdir->md_state & ONWORKLIST)
6673				WORKLIST_REMOVE(&mkdir->md_list);
6674			if (mkdir->md_jaddref != NULL)
6675				panic("free_diradd: Unexpected jaddref");
6676			WORKITEM_FREE(mkdir, D_MKDIR);
6677			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
6678				break;
6679		}
6680		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
6681			panic("free_diradd: unfound ref");
6682	}
6683	if (inodedep)
6684		free_inodedep(inodedep);
6685	/*
6686	 * Free any journal segments waiting for the directory write.
6687	 */
6688	handle_jwork(&dap->da_jwork);
6689	WORKITEM_FREE(dap, D_DIRADD);
6690}
6691
6692/*
6693 * Directory entry removal dependencies.
6694 *
6695 * When removing a directory entry, the entry's inode pointer must be
6696 * zero'ed on disk before the corresponding inode's link count is decremented
6697 * (possibly freeing the inode for re-use). This dependency is handled by
6698 * updating the directory entry but delaying the inode count reduction until
6699 * after the directory block has been written to disk. After this point, the
6700 * inode count can be decremented whenever it is convenient.
6701 */
6702
6703/*
6704 * This routine should be called immediately after removing
6705 * a directory entry.  The inode's link count should not be
6706 * decremented by the calling procedure -- the soft updates
6707 * code will do this task when it is safe.
6708 */
6709void
6710softdep_setup_remove(bp, dp, ip, isrmdir)
6711	struct buf *bp;		/* buffer containing directory block */
6712	struct inode *dp;	/* inode for the directory being modified */
6713	struct inode *ip;	/* inode for directory entry being removed */
6714	int isrmdir;		/* indicates if doing RMDIR */
6715{
6716	struct dirrem *dirrem, *prevdirrem;
6717	struct inodedep *inodedep;
6718	int direct;
6719
6720	/*
6721	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
6722	 * newdirrem() to setup the full directory remove which requires
6723	 * isrmdir > 1.
6724	 */
6725	dirrem = newdirrem(bp, dp, ip, isrmdir?2:0, &prevdirrem);
6726	/*
6727	 * Add the dirrem to the inodedep's pending remove list for quick
6728	 * discovery later.
6729	 */
6730	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
6731	    &inodedep) == 0)
6732		panic("softdep_setup_remove: Lost inodedep.");
6733	dirrem->dm_state |= ONDEPLIST;
6734	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
6735
6736	/*
6737	 * If the COMPLETE flag is clear, then there were no active
6738	 * entries and we want to roll back to a zeroed entry until
6739	 * the new inode is committed to disk. If the COMPLETE flag is
6740	 * set then we have deleted an entry that never made it to
6741	 * disk. If the entry we deleted resulted from a name change,
6742	 * then the old name still resides on disk. We cannot delete
6743	 * its inode (returned to us in prevdirrem) until the zeroed
6744	 * directory entry gets to disk. The new inode has never been
6745	 * referenced on the disk, so can be deleted immediately.
6746	 */
6747	if ((dirrem->dm_state & COMPLETE) == 0) {
6748		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
6749		    dm_next);
6750		FREE_LOCK(&lk);
6751	} else {
6752		if (prevdirrem != NULL)
6753			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
6754			    prevdirrem, dm_next);
6755		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
6756		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
6757		FREE_LOCK(&lk);
6758		if (direct)
6759			handle_workitem_remove(dirrem, NULL);
6760	}
6761}
6762
6763/*
6764 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
6765 * pd_pendinghd list of a pagedep.
6766 */
6767static struct diradd *
6768diradd_lookup(pagedep, offset)
6769	struct pagedep *pagedep;
6770	int offset;
6771{
6772	struct diradd *dap;
6773
6774	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
6775		if (dap->da_offset == offset)
6776			return (dap);
6777	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
6778		if (dap->da_offset == offset)
6779			return (dap);
6780	return (NULL);
6781}
6782
6783/*
6784 * Search for a .. diradd dependency in a directory that is being removed.
6785 * If the directory was renamed to a new parent we have a diradd rather
6786 * than a mkdir for the .. entry.  We need to cancel it now before
6787 * it is found in truncate().
6788 */
6789static struct jremref *
6790cancel_diradd_dotdot(ip, dirrem, jremref)
6791	struct inode *ip;
6792	struct dirrem *dirrem;
6793	struct jremref *jremref;
6794{
6795	struct pagedep *pagedep;
6796	struct diradd *dap;
6797	struct worklist *wk;
6798
6799	if (pagedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 0,
6800	    &pagedep) == 0)
6801		return (jremref);
6802	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
6803	if (dap == NULL)
6804		return (jremref);
6805	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
6806	/*
6807	 * Mark any journal work as belonging to the parent so it is freed
6808	 * with the .. reference.
6809	 */
6810	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
6811		wk->wk_state |= MKDIR_PARENT;
6812	return (NULL);
6813}
6814
6815/*
6816 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
6817 * replace it with a dirrem/diradd pair as a result of re-parenting a
6818 * directory.  This ensures that we don't simultaneously have a mkdir and
6819 * a diradd for the same .. entry.
6820 */
6821static struct jremref *
6822cancel_mkdir_dotdot(ip, dirrem, jremref)
6823	struct inode *ip;
6824	struct dirrem *dirrem;
6825	struct jremref *jremref;
6826{
6827	struct inodedep *inodedep;
6828	struct jaddref *jaddref;
6829	struct mkdir *mkdir;
6830	struct diradd *dap;
6831
6832	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
6833	    &inodedep) == 0)
6834		panic("cancel_mkdir_dotdot: Lost inodedep");
6835	dap = inodedep->id_mkdiradd;
6836	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
6837		return (jremref);
6838	for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir;
6839	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
6840		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
6841			break;
6842	if (mkdir == NULL)
6843		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
6844	if ((jaddref = mkdir->md_jaddref) != NULL) {
6845		mkdir->md_jaddref = NULL;
6846		jaddref->ja_state &= ~MKDIR_PARENT;
6847		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
6848		    &inodedep) == 0)
6849			panic("cancel_mkdir_dotdot: Lost parent inodedep");
6850		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
6851			journal_jremref(dirrem, jremref, inodedep);
6852			jremref = NULL;
6853		}
6854	}
6855	if (mkdir->md_state & ONWORKLIST)
6856		WORKLIST_REMOVE(&mkdir->md_list);
6857	mkdir->md_state |= ALLCOMPLETE;
6858	complete_mkdir(mkdir);
6859	return (jremref);
6860}
6861
6862static void
6863journal_jremref(dirrem, jremref, inodedep)
6864	struct dirrem *dirrem;
6865	struct jremref *jremref;
6866	struct inodedep *inodedep;
6867{
6868
6869	if (inodedep == NULL)
6870		if (inodedep_lookup(jremref->jr_list.wk_mp,
6871		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
6872			panic("journal_jremref: Lost inodedep");
6873	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
6874	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
6875	add_to_journal(&jremref->jr_list);
6876}
6877
6878static void
6879dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
6880	struct dirrem *dirrem;
6881	struct jremref *jremref;
6882	struct jremref *dotremref;
6883	struct jremref *dotdotremref;
6884{
6885	struct inodedep *inodedep;
6886
6887
6888	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
6889	    &inodedep) == 0)
6890		panic("dirrem_journal: Lost inodedep");
6891	journal_jremref(dirrem, jremref, inodedep);
6892	if (dotremref)
6893		journal_jremref(dirrem, dotremref, inodedep);
6894	if (dotdotremref)
6895		journal_jremref(dirrem, dotdotremref, NULL);
6896}
6897
6898/*
6899 * Allocate a new dirrem if appropriate and return it along with
6900 * its associated pagedep. Called without a lock, returns with lock.
6901 */
6902static long num_dirrem;		/* number of dirrem allocated */
6903static struct dirrem *
6904newdirrem(bp, dp, ip, isrmdir, prevdirremp)
6905	struct buf *bp;		/* buffer containing directory block */
6906	struct inode *dp;	/* inode for the directory being modified */
6907	struct inode *ip;	/* inode for directory entry being removed */
6908	int isrmdir;		/* indicates if doing RMDIR */
6909	struct dirrem **prevdirremp; /* previously referenced inode, if any */
6910{
6911	int offset;
6912	ufs_lbn_t lbn;
6913	struct diradd *dap;
6914	struct dirrem *dirrem;
6915	struct pagedep *pagedep;
6916	struct jremref *jremref;
6917	struct jremref *dotremref;
6918	struct jremref *dotdotremref;
6919	struct vnode *dvp;
6920
6921	/*
6922	 * Whiteouts have no deletion dependencies.
6923	 */
6924	if (ip == NULL)
6925		panic("newdirrem: whiteout");
6926	dvp = ITOV(dp);
6927	/*
6928	 * If we are over our limit, try to improve the situation.
6929	 * Limiting the number of dirrem structures will also limit
6930	 * the number of freefile and freeblks structures.
6931	 */
6932	ACQUIRE_LOCK(&lk);
6933	if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2)
6934		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
6935	num_dirrem += 1;
6936	FREE_LOCK(&lk);
6937	dirrem = malloc(sizeof(struct dirrem),
6938		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
6939	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
6940	LIST_INIT(&dirrem->dm_jremrefhd);
6941	LIST_INIT(&dirrem->dm_jwork);
6942	dirrem->dm_state = isrmdir ? RMDIR : 0;
6943	dirrem->dm_oldinum = ip->i_number;
6944	*prevdirremp = NULL;
6945	/*
6946	 * Allocate remove reference structures to track journal write
6947	 * dependencies.  We will always have one for the link and
6948	 * when doing directories we will always have one more for dot.
6949	 * When renaming a directory we skip the dotdot link change so
6950	 * this is not needed.
6951	 */
6952	jremref = dotremref = dotdotremref = NULL;
6953	if (DOINGSUJ(dvp)) {
6954		if (isrmdir) {
6955			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
6956			    ip->i_effnlink + 2);
6957			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
6958			    ip->i_effnlink + 1);
6959		} else
6960			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
6961			    ip->i_effnlink + 1);
6962		if (isrmdir > 1) {
6963			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
6964			    dp->i_effnlink + 1);
6965			dotdotremref->jr_state |= MKDIR_PARENT;
6966		}
6967	}
6968	ACQUIRE_LOCK(&lk);
6969	lbn = lblkno(dp->i_fs, dp->i_offset);
6970	offset = blkoff(dp->i_fs, dp->i_offset);
6971	if (pagedep_lookup(UFSTOVFS(dp->i_ump), dp->i_number, lbn, DEPALLOC,
6972	    &pagedep) == 0)
6973		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
6974	dirrem->dm_pagedep = pagedep;
6975	/*
6976	 * If we're renaming a .. link to a new directory, cancel any
6977	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
6978	 * the jremref is preserved for any potential diradd in this
6979	 * location.  This can not coincide with a rmdir.
6980	 */
6981	if (dp->i_offset == DOTDOT_OFFSET) {
6982		if (isrmdir)
6983			panic("newdirrem: .. directory change during remove?");
6984		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
6985	}
6986	/*
6987	 * If we're removing a directory search for the .. dependency now and
6988	 * cancel it.  Any pending journal work will be added to the dirrem
6989	 * to be completed when the workitem remove completes.
6990	 */
6991	if (isrmdir > 1)
6992		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
6993	/*
6994	 * Check for a diradd dependency for the same directory entry.
6995	 * If present, then both dependencies become obsolete and can
6996	 * be de-allocated.
6997	 */
6998	dap = diradd_lookup(pagedep, offset);
6999	if (dap == NULL) {
7000		/*
7001		 * Link the jremref structures into the dirrem so they are
7002		 * written prior to the pagedep.
7003		 */
7004		if (jremref)
7005			dirrem_journal(dirrem, jremref, dotremref,
7006			    dotdotremref);
7007		return (dirrem);
7008	}
7009	/*
7010	 * Must be ATTACHED at this point.
7011	 */
7012	if ((dap->da_state & ATTACHED) == 0)
7013		panic("newdirrem: not ATTACHED");
7014	if (dap->da_newinum != ip->i_number)
7015		panic("newdirrem: inum %d should be %d",
7016		    ip->i_number, dap->da_newinum);
7017	/*
7018	 * If we are deleting a changed name that never made it to disk,
7019	 * then return the dirrem describing the previous inode (which
7020	 * represents the inode currently referenced from this entry on disk).
7021	 */
7022	if ((dap->da_state & DIRCHG) != 0) {
7023		*prevdirremp = dap->da_previous;
7024		dap->da_state &= ~DIRCHG;
7025		dap->da_pagedep = pagedep;
7026	}
7027	/*
7028	 * We are deleting an entry that never made it to disk.
7029	 * Mark it COMPLETE so we can delete its inode immediately.
7030	 */
7031	dirrem->dm_state |= COMPLETE;
7032	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
7033#ifdef SUJ_DEBUG
7034	if (isrmdir == 0) {
7035		struct worklist *wk;
7036
7037		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
7038			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
7039				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
7040	}
7041#endif
7042
7043	return (dirrem);
7044}
7045
7046/*
7047 * Directory entry change dependencies.
7048 *
7049 * Changing an existing directory entry requires that an add operation
7050 * be completed first followed by a deletion. The semantics for the addition
7051 * are identical to the description of adding a new entry above except
7052 * that the rollback is to the old inode number rather than zero. Once
7053 * the addition dependency is completed, the removal is done as described
7054 * in the removal routine above.
7055 */
7056
7057/*
7058 * This routine should be called immediately after changing
7059 * a directory entry.  The inode's link count should not be
7060 * decremented by the calling procedure -- the soft updates
7061 * code will perform this task when it is safe.
7062 */
7063void
7064softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
7065	struct buf *bp;		/* buffer containing directory block */
7066	struct inode *dp;	/* inode for the directory being modified */
7067	struct inode *ip;	/* inode for directory entry being removed */
7068	ino_t newinum;		/* new inode number for changed entry */
7069	int isrmdir;		/* indicates if doing RMDIR */
7070{
7071	int offset;
7072	struct diradd *dap = NULL;
7073	struct dirrem *dirrem, *prevdirrem;
7074	struct pagedep *pagedep;
7075	struct inodedep *inodedep;
7076	struct jaddref *jaddref;
7077	struct mount *mp;
7078
7079	offset = blkoff(dp->i_fs, dp->i_offset);
7080	mp = UFSTOVFS(dp->i_ump);
7081
7082	/*
7083	 * Whiteouts do not need diradd dependencies.
7084	 */
7085	if (newinum != WINO) {
7086		dap = malloc(sizeof(struct diradd),
7087		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
7088		workitem_alloc(&dap->da_list, D_DIRADD, mp);
7089		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
7090		dap->da_offset = offset;
7091		dap->da_newinum = newinum;
7092		LIST_INIT(&dap->da_jwork);
7093	}
7094
7095	/*
7096	 * Allocate a new dirrem and ACQUIRE_LOCK.
7097	 */
7098	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
7099	pagedep = dirrem->dm_pagedep;
7100	/*
7101	 * The possible values for isrmdir:
7102	 *	0 - non-directory file rename
7103	 *	1 - directory rename within same directory
7104	 *   inum - directory rename to new directory of given inode number
7105	 * When renaming to a new directory, we are both deleting and
7106	 * creating a new directory entry, so the link count on the new
7107	 * directory should not change. Thus we do not need the followup
7108	 * dirrem which is usually done in handle_workitem_remove. We set
7109	 * the DIRCHG flag to tell handle_workitem_remove to skip the
7110	 * followup dirrem.
7111	 */
7112	if (isrmdir > 1)
7113		dirrem->dm_state |= DIRCHG;
7114
7115	/*
7116	 * Whiteouts have no additional dependencies,
7117	 * so just put the dirrem on the correct list.
7118	 */
7119	if (newinum == WINO) {
7120		if ((dirrem->dm_state & COMPLETE) == 0) {
7121			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
7122			    dm_next);
7123		} else {
7124			dirrem->dm_dirinum = pagedep->pd_ino;
7125			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7126				add_to_worklist(&dirrem->dm_list, 0);
7127		}
7128		FREE_LOCK(&lk);
7129		return;
7130	}
7131	/*
7132	 * Add the dirrem to the inodedep's pending remove list for quick
7133	 * discovery later.  A valid nlinkdelta ensures that this lookup
7134	 * will not fail.
7135	 */
7136	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
7137		panic("softdep_setup_directory_change: Lost inodedep.");
7138	dirrem->dm_state |= ONDEPLIST;
7139	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7140
7141	/*
7142	 * If the COMPLETE flag is clear, then there were no active
7143	 * entries and we want to roll back to the previous inode until
7144	 * the new inode is committed to disk. If the COMPLETE flag is
7145	 * set, then we have deleted an entry that never made it to disk.
7146	 * If the entry we deleted resulted from a name change, then the old
7147	 * inode reference still resides on disk. Any rollback that we do
7148	 * needs to be to that old inode (returned to us in prevdirrem). If
7149	 * the entry we deleted resulted from a create, then there is
7150	 * no entry on the disk, so we want to roll back to zero rather
7151	 * than the uncommitted inode. In either of the COMPLETE cases we
7152	 * want to immediately free the unwritten and unreferenced inode.
7153	 */
7154	if ((dirrem->dm_state & COMPLETE) == 0) {
7155		dap->da_previous = dirrem;
7156	} else {
7157		if (prevdirrem != NULL) {
7158			dap->da_previous = prevdirrem;
7159		} else {
7160			dap->da_state &= ~DIRCHG;
7161			dap->da_pagedep = pagedep;
7162		}
7163		dirrem->dm_dirinum = pagedep->pd_ino;
7164		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
7165			add_to_worklist(&dirrem->dm_list, 0);
7166	}
7167	/*
7168	 * Lookup the jaddref for this journal entry.  We must finish
7169	 * initializing it and make the diradd write dependent on it.
7170	 * If we're not journaling Put it on the id_bufwait list if the inode
7171	 * is not yet written. If it is written, do the post-inode write
7172	 * processing to put it on the id_pendinghd list.
7173	 */
7174	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
7175	if (mp->mnt_kern_flag & MNTK_SUJ) {
7176		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
7177		    inoreflst);
7178		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
7179		    ("softdep_setup_directory_change: bad jaddref %p",
7180		    jaddref));
7181		jaddref->ja_diroff = dp->i_offset;
7182		jaddref->ja_diradd = dap;
7183		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7184		    dap, da_pdlist);
7185		add_to_journal(&jaddref->ja_list);
7186	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
7187		dap->da_state |= COMPLETE;
7188		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
7189		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
7190	} else {
7191		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
7192		    dap, da_pdlist);
7193		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
7194	}
7195	/*
7196	 * If we're making a new name for a directory that has not been
7197	 * committed when need to move the dot and dotdot references to
7198	 * this new name.
7199	 */
7200	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
7201		merge_diradd(inodedep, dap);
7202	FREE_LOCK(&lk);
7203}
7204
7205/*
7206 * Called whenever the link count on an inode is changed.
7207 * It creates an inode dependency so that the new reference(s)
7208 * to the inode cannot be committed to disk until the updated
7209 * inode has been written.
7210 */
7211void
7212softdep_change_linkcnt(ip)
7213	struct inode *ip;	/* the inode with the increased link count */
7214{
7215	struct inodedep *inodedep;
7216
7217	ACQUIRE_LOCK(&lk);
7218	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
7219	if (ip->i_nlink < ip->i_effnlink)
7220		panic("softdep_change_linkcnt: bad delta");
7221	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7222	FREE_LOCK(&lk);
7223}
7224
7225/*
7226 * Called when the effective link count and the reference count
7227 * on an inode drops to zero. At this point there are no names
7228 * referencing the file in the filesystem and no active file
7229 * references. The space associated with the file will be freed
7230 * as soon as the necessary soft dependencies are cleared.
7231 */
7232void
7233softdep_releasefile(ip)
7234	struct inode *ip;	/* inode with the zero effective link count */
7235{
7236	struct inodedep *inodedep;
7237	struct fs *fs;
7238	int extblocks;
7239
7240	if (ip->i_effnlink > 0)
7241		panic("softdep_releasefile: file still referenced");
7242	/*
7243	 * We may be called several times as the on-disk link count
7244	 * drops to zero. We only want to account for the space once.
7245	 */
7246	if (ip->i_flag & IN_SPACECOUNTED)
7247		return;
7248	/*
7249	 * We have to deactivate a snapshot otherwise copyonwrites may
7250	 * add blocks and the cleanup may remove blocks after we have
7251	 * tried to account for them.
7252	 */
7253	if ((ip->i_flags & SF_SNAPSHOT) != 0)
7254		ffs_snapremove(ITOV(ip));
7255	/*
7256	 * If we are tracking an nlinkdelta, we have to also remember
7257	 * whether we accounted for the freed space yet.
7258	 */
7259	ACQUIRE_LOCK(&lk);
7260	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
7261		inodedep->id_state |= SPACECOUNTED;
7262	FREE_LOCK(&lk);
7263	fs = ip->i_fs;
7264	extblocks = 0;
7265	if (fs->fs_magic == FS_UFS2_MAGIC)
7266		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
7267	UFS_LOCK(ip->i_ump);
7268	ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
7269	ip->i_fs->fs_pendinginodes += 1;
7270	UFS_UNLOCK(ip->i_ump);
7271	ip->i_flag |= IN_SPACECOUNTED;
7272}
7273
7274/*
7275 * Attach a sbdep dependency to the superblock buf so that we can keep
7276 * track of the head of the linked list of referenced but unlinked inodes.
7277 */
7278void
7279softdep_setup_sbupdate(ump, fs, bp)
7280	struct ufsmount *ump;
7281	struct fs *fs;
7282	struct buf *bp;
7283{
7284	struct sbdep *sbdep;
7285	struct worklist *wk;
7286
7287	if ((fs->fs_flags & FS_SUJ) == 0)
7288		return;
7289	LIST_FOREACH(wk, &bp->b_dep, wk_list)
7290		if (wk->wk_type == D_SBDEP)
7291			break;
7292	if (wk != NULL)
7293		return;
7294	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
7295	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
7296	sbdep->sb_fs = fs;
7297	sbdep->sb_ump = ump;
7298	ACQUIRE_LOCK(&lk);
7299	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
7300	FREE_LOCK(&lk);
7301}
7302
7303/*
7304 * Return the first unlinked inodedep which is ready to be the head of the
7305 * list.  The inodedep and all those after it must have valid next pointers.
7306 */
7307static struct inodedep *
7308first_unlinked_inodedep(ump)
7309	struct ufsmount *ump;
7310{
7311	struct inodedep *inodedep;
7312	struct inodedep *idp;
7313
7314	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
7315	    inodedep; inodedep = idp) {
7316		if ((inodedep->id_state & UNLINKNEXT) == 0)
7317			return (NULL);
7318		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7319		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
7320			break;
7321		if ((inodedep->id_state & UNLINKPREV) == 0)
7322			panic("first_unlinked_inodedep: prev != next");
7323	}
7324	if (inodedep == NULL)
7325		return (NULL);
7326
7327	return (inodedep);
7328}
7329
7330/*
7331 * Set the sujfree unlinked head pointer prior to writing a superblock.
7332 */
7333static void
7334initiate_write_sbdep(sbdep)
7335	struct sbdep *sbdep;
7336{
7337	struct inodedep *inodedep;
7338	struct fs *bpfs;
7339	struct fs *fs;
7340
7341	bpfs = sbdep->sb_fs;
7342	fs = sbdep->sb_ump->um_fs;
7343	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7344	if (inodedep) {
7345		fs->fs_sujfree = inodedep->id_ino;
7346		inodedep->id_state |= UNLINKPREV;
7347	} else
7348		fs->fs_sujfree = 0;
7349	bpfs->fs_sujfree = fs->fs_sujfree;
7350}
7351
7352/*
7353 * After a superblock is written determine whether it must be written again
7354 * due to a changing unlinked list head.
7355 */
7356static int
7357handle_written_sbdep(sbdep, bp)
7358	struct sbdep *sbdep;
7359	struct buf *bp;
7360{
7361	struct inodedep *inodedep;
7362	struct mount *mp;
7363	struct fs *fs;
7364
7365	fs = sbdep->sb_fs;
7366	mp = UFSTOVFS(sbdep->sb_ump);
7367	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
7368	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
7369	    (inodedep == NULL && fs->fs_sujfree != 0)) {
7370		bdirty(bp);
7371		return (1);
7372	}
7373	WORKITEM_FREE(sbdep, D_SBDEP);
7374	if (fs->fs_sujfree == 0)
7375		return (0);
7376	if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
7377		panic("handle_written_sbdep: lost inodedep");
7378	/*
7379	 * Now that we have a record of this indode in stable store allow it
7380	 * to be written to free up pending work.  Inodes may see a lot of
7381	 * write activity after they are unlinked which we must not hold up.
7382	 */
7383	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
7384		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
7385			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
7386			    inodedep, inodedep->id_state);
7387		if (inodedep->id_state & UNLINKONLIST)
7388			break;
7389		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
7390	}
7391
7392	return (0);
7393}
7394
7395/*
7396 * Mark an inodedep has unlinked and insert it into the in-memory unlinked
7397 * list.
7398 */
7399static void
7400unlinked_inodedep(mp, inodedep)
7401	struct mount *mp;
7402	struct inodedep *inodedep;
7403{
7404	struct ufsmount *ump;
7405
7406	if ((mp->mnt_kern_flag & MNTK_SUJ) == 0)
7407		return;
7408	ump = VFSTOUFS(mp);
7409	ump->um_fs->fs_fmod = 1;
7410	inodedep->id_state |= UNLINKED;
7411	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
7412}
7413
7414/*
7415 * Remove an inodedep from the unlinked inodedep list.  This may require
7416 * disk writes if the inode has made it that far.
7417 */
7418static void
7419clear_unlinked_inodedep(inodedep)
7420	struct inodedep *inodedep;
7421{
7422	struct ufsmount *ump;
7423	struct inodedep *idp;
7424	struct inodedep *idn;
7425	struct fs *fs;
7426	struct buf *bp;
7427	ino_t ino;
7428	ino_t nino;
7429	ino_t pino;
7430	int error;
7431
7432	ump = VFSTOUFS(inodedep->id_list.wk_mp);
7433	fs = ump->um_fs;
7434	ino = inodedep->id_ino;
7435	error = 0;
7436	for (;;) {
7437		/*
7438		 * If nothing has yet been written simply remove us from
7439		 * the in memory list and return.  This is the most common
7440		 * case where handle_workitem_remove() loses the final
7441		 * reference.
7442		 */
7443		if ((inodedep->id_state & UNLINKLINKS) == 0)
7444			break;
7445		/*
7446		 * If we have a NEXT pointer and no PREV pointer we can simply
7447		 * clear NEXT's PREV and remove ourselves from the list.  Be
7448		 * careful not to clear PREV if the superblock points at
7449		 * next as well.
7450		 */
7451		idn = TAILQ_NEXT(inodedep, id_unlinked);
7452		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
7453			if (idn && fs->fs_sujfree != idn->id_ino)
7454				idn->id_state &= ~UNLINKPREV;
7455			break;
7456		}
7457		/*
7458		 * Here we have an inodedep which is actually linked into
7459		 * the list.  We must remove it by forcing a write to the
7460		 * link before us, whether it be the superblock or an inode.
7461		 * Unfortunately the list may change while we're waiting
7462		 * on the buf lock for either resource so we must loop until
7463		 * we lock. the right one.  If both the superblock and an
7464		 * inode point to this inode we must clear the inode first
7465		 * followed by the superblock.
7466		 */
7467		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7468		pino = 0;
7469		if (idp && (idp->id_state & UNLINKNEXT))
7470			pino = idp->id_ino;
7471		FREE_LOCK(&lk);
7472		if (pino == 0)
7473			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7474			    (int)fs->fs_sbsize, 0, 0, 0);
7475		else
7476			error = bread(ump->um_devvp,
7477			    fsbtodb(fs, ino_to_fsba(fs, pino)),
7478			    (int)fs->fs_bsize, NOCRED, &bp);
7479		ACQUIRE_LOCK(&lk);
7480		if (error)
7481			break;
7482		/* If the list has changed restart the loop. */
7483		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
7484		nino = 0;
7485		if (idp && (idp->id_state & UNLINKNEXT))
7486			nino = idp->id_ino;
7487		if (nino != pino ||
7488		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
7489			FREE_LOCK(&lk);
7490			brelse(bp);
7491			ACQUIRE_LOCK(&lk);
7492			continue;
7493		}
7494		/*
7495		 * Remove us from the in memory list.  After this we cannot
7496		 * access the inodedep.
7497		 */
7498		idn = TAILQ_NEXT(inodedep, id_unlinked);
7499		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
7500		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
7501		/*
7502		 * Determine the next inode number.
7503		 */
7504		nino = 0;
7505		if (idn) {
7506			/*
7507			 * If next isn't on the list we can just clear prev's
7508			 * state and schedule it to be fixed later.  No need
7509			 * to synchronously write if we're not in the real
7510			 * list.
7511			 */
7512			if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
7513				idp->id_state &= ~UNLINKNEXT;
7514				if ((idp->id_state & ONWORKLIST) == 0)
7515					WORKLIST_INSERT(&bp->b_dep,
7516					    &idp->id_list);
7517				FREE_LOCK(&lk);
7518				bawrite(bp);
7519				ACQUIRE_LOCK(&lk);
7520				return;
7521			}
7522			nino = idn->id_ino;
7523		}
7524		FREE_LOCK(&lk);
7525		/*
7526		 * The predecessor's next pointer is manually updated here
7527		 * so that the NEXT flag is never cleared for an element
7528		 * that is in the list.
7529		 */
7530		if (pino == 0) {
7531			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
7532			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
7533			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
7534			    bp);
7535		} else if (fs->fs_magic == FS_UFS1_MAGIC)
7536			((struct ufs1_dinode *)bp->b_data +
7537			    ino_to_fsbo(fs, pino))->di_freelink = nino;
7538		else
7539			((struct ufs2_dinode *)bp->b_data +
7540			    ino_to_fsbo(fs, pino))->di_freelink = nino;
7541		/*
7542		 * If the bwrite fails we have no recourse to recover.  The
7543		 * filesystem is corrupted already.
7544		 */
7545		bwrite(bp);
7546		ACQUIRE_LOCK(&lk);
7547		/*
7548		 * If the superblock pointer still needs to be cleared force
7549		 * a write here.
7550		 */
7551		if (fs->fs_sujfree == ino) {
7552			FREE_LOCK(&lk);
7553			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
7554			    (int)fs->fs_sbsize, 0, 0, 0);
7555			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
7556			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
7557			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
7558			    bp);
7559			bwrite(bp);
7560			ACQUIRE_LOCK(&lk);
7561		}
7562		if (fs->fs_sujfree != ino)
7563			return;
7564		panic("clear_unlinked_inodedep: Failed to clear free head");
7565	}
7566	if (inodedep->id_ino == fs->fs_sujfree)
7567		panic("clear_unlinked_inodedep: Freeing head of free list");
7568	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
7569	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
7570	return;
7571}
7572
7573/*
7574 * This workitem decrements the inode's link count.
7575 * If the link count reaches zero, the file is removed.
7576 */
7577static void
7578handle_workitem_remove(dirrem, xp)
7579	struct dirrem *dirrem;
7580	struct vnode *xp;
7581{
7582	struct thread *td = curthread;
7583	struct inodedep *inodedep;
7584	struct workhead dotdotwk;
7585	struct worklist *wk;
7586	struct ufsmount *ump;
7587	struct mount *mp;
7588	struct vnode *vp;
7589	struct inode *ip;
7590	ino_t oldinum;
7591	int error;
7592
7593	if (dirrem->dm_state & ONWORKLIST)
7594		panic("handle_workitem_remove: dirrem %p still on worklist",
7595		    dirrem);
7596	oldinum = dirrem->dm_oldinum;
7597	mp = dirrem->dm_list.wk_mp;
7598	ump = VFSTOUFS(mp);
7599	if ((vp = xp) == NULL &&
7600	    (error = ffs_vgetf(mp, oldinum, LK_EXCLUSIVE, &vp,
7601	    FFSV_FORCEINSMQ)) != 0) {
7602		softdep_error("handle_workitem_remove: vget", error);
7603		return;
7604	}
7605	ip = VTOI(vp);
7606	ACQUIRE_LOCK(&lk);
7607	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
7608		panic("handle_workitem_remove: lost inodedep");
7609	if (dirrem->dm_state & ONDEPLIST)
7610		LIST_REMOVE(dirrem, dm_inonext);
7611	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
7612	    ("handle_workitem_remove:  Journal entries not written."));
7613
7614	/*
7615	 * Move all dependencies waiting on the remove to complete
7616	 * from the dirrem to the inode inowait list to be completed
7617	 * after the inode has been updated and written to disk.  Any
7618	 * marked MKDIR_PARENT are saved to be completed when the .. ref
7619	 * is removed.
7620	 */
7621	LIST_INIT(&dotdotwk);
7622	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
7623		WORKLIST_REMOVE(wk);
7624		if (wk->wk_state & MKDIR_PARENT) {
7625			wk->wk_state &= ~MKDIR_PARENT;
7626			WORKLIST_INSERT(&dotdotwk, wk);
7627			continue;
7628		}
7629		WORKLIST_INSERT(&inodedep->id_inowait, wk);
7630	}
7631	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
7632	/*
7633	 * Normal file deletion.
7634	 */
7635	if ((dirrem->dm_state & RMDIR) == 0) {
7636		ip->i_nlink--;
7637		DIP_SET(ip, i_nlink, ip->i_nlink);
7638		ip->i_flag |= IN_CHANGE;
7639		if (ip->i_nlink < ip->i_effnlink)
7640			panic("handle_workitem_remove: bad file delta");
7641		if (ip->i_nlink == 0)
7642			unlinked_inodedep(mp, inodedep);
7643		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7644		num_dirrem -= 1;
7645		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
7646		    ("handle_workitem_remove: worklist not empty. %s",
7647		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
7648		WORKITEM_FREE(dirrem, D_DIRREM);
7649		FREE_LOCK(&lk);
7650		goto out;
7651	}
7652	/*
7653	 * Directory deletion. Decrement reference count for both the
7654	 * just deleted parent directory entry and the reference for ".".
7655	 * Next truncate the directory to length zero. When the
7656	 * truncation completes, arrange to have the reference count on
7657	 * the parent decremented to account for the loss of "..".
7658	 */
7659	ip->i_nlink -= 2;
7660	DIP_SET(ip, i_nlink, ip->i_nlink);
7661	ip->i_flag |= IN_CHANGE;
7662	if (ip->i_nlink < ip->i_effnlink)
7663		panic("handle_workitem_remove: bad dir delta");
7664	if (ip->i_nlink == 0)
7665		unlinked_inodedep(mp, inodedep);
7666	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
7667	FREE_LOCK(&lk);
7668	if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
7669		softdep_error("handle_workitem_remove: truncate", error);
7670	ACQUIRE_LOCK(&lk);
7671	/*
7672	 * Rename a directory to a new parent. Since, we are both deleting
7673	 * and creating a new directory entry, the link count on the new
7674	 * directory should not change. Thus we skip the followup dirrem.
7675	 */
7676	if (dirrem->dm_state & DIRCHG) {
7677		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
7678		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
7679		num_dirrem -= 1;
7680		WORKITEM_FREE(dirrem, D_DIRREM);
7681		FREE_LOCK(&lk);
7682		goto out;
7683	}
7684	dirrem->dm_state = ONDEPLIST;
7685	dirrem->dm_oldinum = dirrem->dm_dirinum;
7686	/*
7687	 * Place the dirrem on the parent's diremhd list.
7688	 */
7689	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
7690		panic("handle_workitem_remove: lost dir inodedep");
7691	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
7692	/*
7693	 * If the allocated inode has never been written to disk, then
7694	 * the on-disk inode is zero'ed and we can remove the file
7695	 * immediately.  When journaling if the inode has been marked
7696	 * unlinked and not DEPCOMPLETE we know it can never be written.
7697	 */
7698	inodedep_lookup(mp, oldinum, 0, &inodedep);
7699	if (inodedep == NULL ||
7700	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
7701	    check_inode_unwritten(inodedep)) {
7702		if (xp != NULL)
7703			add_to_worklist(&dirrem->dm_list, 0);
7704		FREE_LOCK(&lk);
7705		if (xp == NULL) {
7706			vput(vp);
7707			handle_workitem_remove(dirrem, NULL);
7708		}
7709		return;
7710	}
7711	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
7712	FREE_LOCK(&lk);
7713	ip->i_flag |= IN_CHANGE;
7714out:
7715	ffs_update(vp, 0);
7716	if (xp == NULL)
7717		vput(vp);
7718}
7719
7720/*
7721 * Inode de-allocation dependencies.
7722 *
7723 * When an inode's link count is reduced to zero, it can be de-allocated. We
7724 * found it convenient to postpone de-allocation until after the inode is
7725 * written to disk with its new link count (zero).  At this point, all of the
7726 * on-disk inode's block pointers are nullified and, with careful dependency
7727 * list ordering, all dependencies related to the inode will be satisfied and
7728 * the corresponding dependency structures de-allocated.  So, if/when the
7729 * inode is reused, there will be no mixing of old dependencies with new
7730 * ones.  This artificial dependency is set up by the block de-allocation
7731 * procedure above (softdep_setup_freeblocks) and completed by the
7732 * following procedure.
7733 */
7734static void
7735handle_workitem_freefile(freefile)
7736	struct freefile *freefile;
7737{
7738	struct workhead wkhd;
7739	struct fs *fs;
7740	struct inodedep *idp;
7741	struct ufsmount *ump;
7742	int error;
7743
7744	ump = VFSTOUFS(freefile->fx_list.wk_mp);
7745	fs = ump->um_fs;
7746#ifdef DEBUG
7747	ACQUIRE_LOCK(&lk);
7748	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
7749	FREE_LOCK(&lk);
7750	if (error)
7751		panic("handle_workitem_freefile: inodedep %p survived", idp);
7752#endif
7753	UFS_LOCK(ump);
7754	fs->fs_pendinginodes -= 1;
7755	UFS_UNLOCK(ump);
7756	LIST_INIT(&wkhd);
7757	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
7758	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
7759	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
7760		softdep_error("handle_workitem_freefile", error);
7761	ACQUIRE_LOCK(&lk);
7762	WORKITEM_FREE(freefile, D_FREEFILE);
7763	FREE_LOCK(&lk);
7764}
7765
7766
7767/*
7768 * Helper function which unlinks marker element from work list and returns
7769 * the next element on the list.
7770 */
7771static __inline struct worklist *
7772markernext(struct worklist *marker)
7773{
7774	struct worklist *next;
7775
7776	next = LIST_NEXT(marker, wk_list);
7777	LIST_REMOVE(marker, wk_list);
7778	return next;
7779}
7780
7781/*
7782 * Disk writes.
7783 *
7784 * The dependency structures constructed above are most actively used when file
7785 * system blocks are written to disk.  No constraints are placed on when a
7786 * block can be written, but unsatisfied update dependencies are made safe by
7787 * modifying (or replacing) the source memory for the duration of the disk
7788 * write.  When the disk write completes, the memory block is again brought
7789 * up-to-date.
7790 *
7791 * In-core inode structure reclamation.
7792 *
7793 * Because there are a finite number of "in-core" inode structures, they are
7794 * reused regularly.  By transferring all inode-related dependencies to the
7795 * in-memory inode block and indexing them separately (via "inodedep"s), we
7796 * can allow "in-core" inode structures to be reused at any time and avoid
7797 * any increase in contention.
7798 *
7799 * Called just before entering the device driver to initiate a new disk I/O.
7800 * The buffer must be locked, thus, no I/O completion operations can occur
7801 * while we are manipulating its associated dependencies.
7802 */
7803static void
7804softdep_disk_io_initiation(bp)
7805	struct buf *bp;		/* structure describing disk write to occur */
7806{
7807	struct worklist *wk;
7808	struct worklist marker;
7809	struct inodedep *inodedep;
7810	struct freeblks *freeblks;
7811	struct jfreeblk *jfreeblk;
7812	struct newblk *newblk;
7813
7814	/*
7815	 * We only care about write operations. There should never
7816	 * be dependencies for reads.
7817	 */
7818	if (bp->b_iocmd != BIO_WRITE)
7819		panic("softdep_disk_io_initiation: not write");
7820
7821	if (bp->b_vflags & BV_BKGRDINPROG)
7822		panic("softdep_disk_io_initiation: Writing buffer with "
7823		    "background write in progress: %p", bp);
7824
7825	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
7826	PHOLD(curproc);			/* Don't swap out kernel stack */
7827
7828	ACQUIRE_LOCK(&lk);
7829	/*
7830	 * Do any necessary pre-I/O processing.
7831	 */
7832	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
7833	     wk = markernext(&marker)) {
7834		LIST_INSERT_AFTER(wk, &marker, wk_list);
7835		switch (wk->wk_type) {
7836
7837		case D_PAGEDEP:
7838			initiate_write_filepage(WK_PAGEDEP(wk), bp);
7839			continue;
7840
7841		case D_INODEDEP:
7842			inodedep = WK_INODEDEP(wk);
7843			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
7844				initiate_write_inodeblock_ufs1(inodedep, bp);
7845			else
7846				initiate_write_inodeblock_ufs2(inodedep, bp);
7847			continue;
7848
7849		case D_INDIRDEP:
7850			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
7851			continue;
7852
7853		case D_BMSAFEMAP:
7854			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
7855			continue;
7856
7857		case D_JSEG:
7858			WK_JSEG(wk)->js_buf = NULL;
7859			continue;
7860
7861		case D_FREEBLKS:
7862			freeblks = WK_FREEBLKS(wk);
7863			jfreeblk = LIST_FIRST(&freeblks->fb_jfreeblkhd);
7864			/*
7865			 * We have to wait for the jfreeblks to be journaled
7866			 * before we can write an inodeblock with updated
7867			 * pointers.  Be careful to arrange the marker so
7868			 * we revisit the jfreeblk if it's not removed by
7869			 * the first jwait().
7870			 */
7871			if (jfreeblk != NULL) {
7872				LIST_REMOVE(&marker, wk_list);
7873				LIST_INSERT_BEFORE(wk, &marker, wk_list);
7874				jwait(&jfreeblk->jf_list);
7875			}
7876			continue;
7877		case D_ALLOCDIRECT:
7878		case D_ALLOCINDIR:
7879			/*
7880			 * We have to wait for the jnewblk to be journaled
7881			 * before we can write to a block otherwise the
7882			 * contents may be confused with an earlier file
7883			 * at recovery time.  Handle the marker as described
7884			 * above.
7885			 */
7886			newblk = WK_NEWBLK(wk);
7887			if (newblk->nb_jnewblk != NULL) {
7888				LIST_REMOVE(&marker, wk_list);
7889				LIST_INSERT_BEFORE(wk, &marker, wk_list);
7890				jwait(&newblk->nb_jnewblk->jn_list);
7891			}
7892			continue;
7893
7894		case D_SBDEP:
7895			initiate_write_sbdep(WK_SBDEP(wk));
7896			continue;
7897
7898		case D_MKDIR:
7899		case D_FREEWORK:
7900		case D_FREEDEP:
7901		case D_JSEGDEP:
7902			continue;
7903
7904		default:
7905			panic("handle_disk_io_initiation: Unexpected type %s",
7906			    TYPENAME(wk->wk_type));
7907			/* NOTREACHED */
7908		}
7909	}
7910	FREE_LOCK(&lk);
7911	PRELE(curproc);			/* Allow swapout of kernel stack */
7912}
7913
7914/*
7915 * Called from within the procedure above to deal with unsatisfied
7916 * allocation dependencies in a directory. The buffer must be locked,
7917 * thus, no I/O completion operations can occur while we are
7918 * manipulating its associated dependencies.
7919 */
7920static void
7921initiate_write_filepage(pagedep, bp)
7922	struct pagedep *pagedep;
7923	struct buf *bp;
7924{
7925	struct jremref *jremref;
7926	struct jmvref *jmvref;
7927	struct dirrem *dirrem;
7928	struct diradd *dap;
7929	struct direct *ep;
7930	int i;
7931
7932	if (pagedep->pd_state & IOSTARTED) {
7933		/*
7934		 * This can only happen if there is a driver that does not
7935		 * understand chaining. Here biodone will reissue the call
7936		 * to strategy for the incomplete buffers.
7937		 */
7938		printf("initiate_write_filepage: already started\n");
7939		return;
7940	}
7941	pagedep->pd_state |= IOSTARTED;
7942	/*
7943	 * Wait for all journal remove dependencies to hit the disk.
7944	 * We can not allow any potentially conflicting directory adds
7945	 * to be visible before removes and rollback is too difficult.
7946	 * lk may be dropped and re-acquired, however we hold the buf
7947	 * locked so the dependency can not go away.
7948	 */
7949	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
7950		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7951			stat_jwait_filepage++;
7952			jwait(&jremref->jr_list);
7953		}
7954	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7955		stat_jwait_filepage++;
7956		jwait(&jmvref->jm_list);
7957	}
7958	for (i = 0; i < DAHASHSZ; i++) {
7959		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
7960			ep = (struct direct *)
7961			    ((char *)bp->b_data + dap->da_offset);
7962			if (ep->d_ino != dap->da_newinum)
7963				panic("%s: dir inum %d != new %d",
7964				    "initiate_write_filepage",
7965				    ep->d_ino, dap->da_newinum);
7966			if (dap->da_state & DIRCHG)
7967				ep->d_ino = dap->da_previous->dm_oldinum;
7968			else
7969				ep->d_ino = 0;
7970			dap->da_state &= ~ATTACHED;
7971			dap->da_state |= UNDONE;
7972		}
7973	}
7974}
7975
7976/*
7977 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
7978 * Note that any bug fixes made to this routine must be done in the
7979 * version found below.
7980 *
7981 * Called from within the procedure above to deal with unsatisfied
7982 * allocation dependencies in an inodeblock. The buffer must be
7983 * locked, thus, no I/O completion operations can occur while we
7984 * are manipulating its associated dependencies.
7985 */
7986static void
7987initiate_write_inodeblock_ufs1(inodedep, bp)
7988	struct inodedep *inodedep;
7989	struct buf *bp;			/* The inode block */
7990{
7991	struct allocdirect *adp, *lastadp;
7992	struct ufs1_dinode *dp;
7993	struct ufs1_dinode *sip;
7994	struct inoref *inoref;
7995	struct fs *fs;
7996	ufs_lbn_t i;
7997#ifdef INVARIANTS
7998	ufs_lbn_t prevlbn = 0;
7999#endif
8000	int deplist;
8001
8002	if (inodedep->id_state & IOSTARTED)
8003		panic("initiate_write_inodeblock_ufs1: already started");
8004	inodedep->id_state |= IOSTARTED;
8005	fs = inodedep->id_fs;
8006	dp = (struct ufs1_dinode *)bp->b_data +
8007	    ino_to_fsbo(fs, inodedep->id_ino);
8008
8009	/*
8010	 * If we're on the unlinked list but have not yet written our
8011	 * next pointer initialize it here.
8012	 */
8013	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8014		struct inodedep *inon;
8015
8016		inon = TAILQ_NEXT(inodedep, id_unlinked);
8017		dp->di_freelink = inon ? inon->id_ino : 0;
8018	}
8019	/*
8020	 * If the bitmap is not yet written, then the allocated
8021	 * inode cannot be written to disk.
8022	 */
8023	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8024		if (inodedep->id_savedino1 != NULL)
8025			panic("initiate_write_inodeblock_ufs1: I/O underway");
8026		FREE_LOCK(&lk);
8027		sip = malloc(sizeof(struct ufs1_dinode),
8028		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8029		ACQUIRE_LOCK(&lk);
8030		inodedep->id_savedino1 = sip;
8031		*inodedep->id_savedino1 = *dp;
8032		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
8033		dp->di_gen = inodedep->id_savedino1->di_gen;
8034		dp->di_freelink = inodedep->id_savedino1->di_freelink;
8035		return;
8036	}
8037	/*
8038	 * If no dependencies, then there is nothing to roll back.
8039	 */
8040	inodedep->id_savedsize = dp->di_size;
8041	inodedep->id_savedextsize = 0;
8042	inodedep->id_savednlink = dp->di_nlink;
8043	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8044	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8045		return;
8046	/*
8047	 * Revert the link count to that of the first unwritten journal entry.
8048	 */
8049	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8050	if (inoref)
8051		dp->di_nlink = inoref->if_nlink;
8052	/*
8053	 * Set the dependencies to busy.
8054	 */
8055	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8056	     adp = TAILQ_NEXT(adp, ad_next)) {
8057#ifdef INVARIANTS
8058		if (deplist != 0 && prevlbn >= adp->ad_offset)
8059			panic("softdep_write_inodeblock: lbn order");
8060		prevlbn = adp->ad_offset;
8061		if (adp->ad_offset < NDADDR &&
8062		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8063			panic("%s: direct pointer #%jd mismatch %d != %jd",
8064			    "softdep_write_inodeblock",
8065			    (intmax_t)adp->ad_offset,
8066			    dp->di_db[adp->ad_offset],
8067			    (intmax_t)adp->ad_newblkno);
8068		if (adp->ad_offset >= NDADDR &&
8069		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8070			panic("%s: indirect pointer #%jd mismatch %d != %jd",
8071			    "softdep_write_inodeblock",
8072			    (intmax_t)adp->ad_offset - NDADDR,
8073			    dp->di_ib[adp->ad_offset - NDADDR],
8074			    (intmax_t)adp->ad_newblkno);
8075		deplist |= 1 << adp->ad_offset;
8076		if ((adp->ad_state & ATTACHED) == 0)
8077			panic("softdep_write_inodeblock: Unknown state 0x%x",
8078			    adp->ad_state);
8079#endif /* INVARIANTS */
8080		adp->ad_state &= ~ATTACHED;
8081		adp->ad_state |= UNDONE;
8082	}
8083	/*
8084	 * The on-disk inode cannot claim to be any larger than the last
8085	 * fragment that has been written. Otherwise, the on-disk inode
8086	 * might have fragments that were not the last block in the file
8087	 * which would corrupt the filesystem.
8088	 */
8089	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8090	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8091		if (adp->ad_offset >= NDADDR)
8092			break;
8093		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8094		/* keep going until hitting a rollback to a frag */
8095		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8096			continue;
8097		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8098		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8099#ifdef INVARIANTS
8100			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8101				panic("softdep_write_inodeblock: lost dep1");
8102#endif /* INVARIANTS */
8103			dp->di_db[i] = 0;
8104		}
8105		for (i = 0; i < NIADDR; i++) {
8106#ifdef INVARIANTS
8107			if (dp->di_ib[i] != 0 &&
8108			    (deplist & ((1 << NDADDR) << i)) == 0)
8109				panic("softdep_write_inodeblock: lost dep2");
8110#endif /* INVARIANTS */
8111			dp->di_ib[i] = 0;
8112		}
8113		return;
8114	}
8115	/*
8116	 * If we have zero'ed out the last allocated block of the file,
8117	 * roll back the size to the last currently allocated block.
8118	 * We know that this last allocated block is a full-sized as
8119	 * we already checked for fragments in the loop above.
8120	 */
8121	if (lastadp != NULL &&
8122	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8123		for (i = lastadp->ad_offset; i >= 0; i--)
8124			if (dp->di_db[i] != 0)
8125				break;
8126		dp->di_size = (i + 1) * fs->fs_bsize;
8127	}
8128	/*
8129	 * The only dependencies are for indirect blocks.
8130	 *
8131	 * The file size for indirect block additions is not guaranteed.
8132	 * Such a guarantee would be non-trivial to achieve. The conventional
8133	 * synchronous write implementation also does not make this guarantee.
8134	 * Fsck should catch and fix discrepancies. Arguably, the file size
8135	 * can be over-estimated without destroying integrity when the file
8136	 * moves into the indirect blocks (i.e., is large). If we want to
8137	 * postpone fsck, we are stuck with this argument.
8138	 */
8139	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8140		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8141}
8142
8143/*
8144 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
8145 * Note that any bug fixes made to this routine must be done in the
8146 * version found above.
8147 *
8148 * Called from within the procedure above to deal with unsatisfied
8149 * allocation dependencies in an inodeblock. The buffer must be
8150 * locked, thus, no I/O completion operations can occur while we
8151 * are manipulating its associated dependencies.
8152 */
8153static void
8154initiate_write_inodeblock_ufs2(inodedep, bp)
8155	struct inodedep *inodedep;
8156	struct buf *bp;			/* The inode block */
8157{
8158	struct allocdirect *adp, *lastadp;
8159	struct ufs2_dinode *dp;
8160	struct ufs2_dinode *sip;
8161	struct inoref *inoref;
8162	struct fs *fs;
8163	ufs_lbn_t i;
8164#ifdef INVARIANTS
8165	ufs_lbn_t prevlbn = 0;
8166#endif
8167	int deplist;
8168
8169	if (inodedep->id_state & IOSTARTED)
8170		panic("initiate_write_inodeblock_ufs2: already started");
8171	inodedep->id_state |= IOSTARTED;
8172	fs = inodedep->id_fs;
8173	dp = (struct ufs2_dinode *)bp->b_data +
8174	    ino_to_fsbo(fs, inodedep->id_ino);
8175
8176	/*
8177	 * If we're on the unlinked list but have not yet written our
8178	 * next pointer initialize it here.
8179	 */
8180	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
8181		struct inodedep *inon;
8182
8183		inon = TAILQ_NEXT(inodedep, id_unlinked);
8184		dp->di_freelink = inon ? inon->id_ino : 0;
8185	}
8186	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
8187	    (UNLINKED | UNLINKNEXT)) {
8188		struct inodedep *inon;
8189		ino_t freelink;
8190
8191		inon = TAILQ_NEXT(inodedep, id_unlinked);
8192		freelink = inon ? inon->id_ino : 0;
8193		if (freelink != dp->di_freelink)
8194			panic("ino %p(0x%X) %d, %d != %d",
8195			    inodedep, inodedep->id_state, inodedep->id_ino,
8196			    freelink, dp->di_freelink);
8197	}
8198	/*
8199	 * If the bitmap is not yet written, then the allocated
8200	 * inode cannot be written to disk.
8201	 */
8202	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
8203		if (inodedep->id_savedino2 != NULL)
8204			panic("initiate_write_inodeblock_ufs2: I/O underway");
8205		FREE_LOCK(&lk);
8206		sip = malloc(sizeof(struct ufs2_dinode),
8207		    M_SAVEDINO, M_SOFTDEP_FLAGS);
8208		ACQUIRE_LOCK(&lk);
8209		inodedep->id_savedino2 = sip;
8210		*inodedep->id_savedino2 = *dp;
8211		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
8212		dp->di_gen = inodedep->id_savedino2->di_gen;
8213		dp->di_freelink = inodedep->id_savedino2->di_freelink;
8214		return;
8215	}
8216	/*
8217	 * If no dependencies, then there is nothing to roll back.
8218	 */
8219	inodedep->id_savedsize = dp->di_size;
8220	inodedep->id_savedextsize = dp->di_extsize;
8221	inodedep->id_savednlink = dp->di_nlink;
8222	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
8223	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
8224	    TAILQ_EMPTY(&inodedep->id_inoreflst))
8225		return;
8226	/*
8227	 * Revert the link count to that of the first unwritten journal entry.
8228	 */
8229	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
8230	if (inoref)
8231		dp->di_nlink = inoref->if_nlink;
8232
8233	/*
8234	 * Set the ext data dependencies to busy.
8235	 */
8236	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8237	     adp = TAILQ_NEXT(adp, ad_next)) {
8238#ifdef INVARIANTS
8239		if (deplist != 0 && prevlbn >= adp->ad_offset)
8240			panic("softdep_write_inodeblock: lbn order");
8241		prevlbn = adp->ad_offset;
8242		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
8243			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8244			    "softdep_write_inodeblock",
8245			    (intmax_t)adp->ad_offset,
8246			    (intmax_t)dp->di_extb[adp->ad_offset],
8247			    (intmax_t)adp->ad_newblkno);
8248		deplist |= 1 << adp->ad_offset;
8249		if ((adp->ad_state & ATTACHED) == 0)
8250			panic("softdep_write_inodeblock: Unknown state 0x%x",
8251			    adp->ad_state);
8252#endif /* INVARIANTS */
8253		adp->ad_state &= ~ATTACHED;
8254		adp->ad_state |= UNDONE;
8255	}
8256	/*
8257	 * The on-disk inode cannot claim to be any larger than the last
8258	 * fragment that has been written. Otherwise, the on-disk inode
8259	 * might have fragments that were not the last block in the ext
8260	 * data which would corrupt the filesystem.
8261	 */
8262	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
8263	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8264		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
8265		/* keep going until hitting a rollback to a frag */
8266		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8267			continue;
8268		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8269		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
8270#ifdef INVARIANTS
8271			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
8272				panic("softdep_write_inodeblock: lost dep1");
8273#endif /* INVARIANTS */
8274			dp->di_extb[i] = 0;
8275		}
8276		lastadp = NULL;
8277		break;
8278	}
8279	/*
8280	 * If we have zero'ed out the last allocated block of the ext
8281	 * data, roll back the size to the last currently allocated block.
8282	 * We know that this last allocated block is a full-sized as
8283	 * we already checked for fragments in the loop above.
8284	 */
8285	if (lastadp != NULL &&
8286	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8287		for (i = lastadp->ad_offset; i >= 0; i--)
8288			if (dp->di_extb[i] != 0)
8289				break;
8290		dp->di_extsize = (i + 1) * fs->fs_bsize;
8291	}
8292	/*
8293	 * Set the file data dependencies to busy.
8294	 */
8295	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8296	     adp = TAILQ_NEXT(adp, ad_next)) {
8297#ifdef INVARIANTS
8298		if (deplist != 0 && prevlbn >= adp->ad_offset)
8299			panic("softdep_write_inodeblock: lbn order");
8300		prevlbn = adp->ad_offset;
8301		if (adp->ad_offset < NDADDR &&
8302		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
8303			panic("%s: direct pointer #%jd mismatch %jd != %jd",
8304			    "softdep_write_inodeblock",
8305			    (intmax_t)adp->ad_offset,
8306			    (intmax_t)dp->di_db[adp->ad_offset],
8307			    (intmax_t)adp->ad_newblkno);
8308		if (adp->ad_offset >= NDADDR &&
8309		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
8310			panic("%s indirect pointer #%jd mismatch %jd != %jd",
8311			    "softdep_write_inodeblock:",
8312			    (intmax_t)adp->ad_offset - NDADDR,
8313			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
8314			    (intmax_t)adp->ad_newblkno);
8315		deplist |= 1 << adp->ad_offset;
8316		if ((adp->ad_state & ATTACHED) == 0)
8317			panic("softdep_write_inodeblock: Unknown state 0x%x",
8318			    adp->ad_state);
8319#endif /* INVARIANTS */
8320		adp->ad_state &= ~ATTACHED;
8321		adp->ad_state |= UNDONE;
8322	}
8323	/*
8324	 * The on-disk inode cannot claim to be any larger than the last
8325	 * fragment that has been written. Otherwise, the on-disk inode
8326	 * might have fragments that were not the last block in the file
8327	 * which would corrupt the filesystem.
8328	 */
8329	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
8330	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
8331		if (adp->ad_offset >= NDADDR)
8332			break;
8333		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
8334		/* keep going until hitting a rollback to a frag */
8335		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
8336			continue;
8337		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
8338		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
8339#ifdef INVARIANTS
8340			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
8341				panic("softdep_write_inodeblock: lost dep2");
8342#endif /* INVARIANTS */
8343			dp->di_db[i] = 0;
8344		}
8345		for (i = 0; i < NIADDR; i++) {
8346#ifdef INVARIANTS
8347			if (dp->di_ib[i] != 0 &&
8348			    (deplist & ((1 << NDADDR) << i)) == 0)
8349				panic("softdep_write_inodeblock: lost dep3");
8350#endif /* INVARIANTS */
8351			dp->di_ib[i] = 0;
8352		}
8353		return;
8354	}
8355	/*
8356	 * If we have zero'ed out the last allocated block of the file,
8357	 * roll back the size to the last currently allocated block.
8358	 * We know that this last allocated block is a full-sized as
8359	 * we already checked for fragments in the loop above.
8360	 */
8361	if (lastadp != NULL &&
8362	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
8363		for (i = lastadp->ad_offset; i >= 0; i--)
8364			if (dp->di_db[i] != 0)
8365				break;
8366		dp->di_size = (i + 1) * fs->fs_bsize;
8367	}
8368	/*
8369	 * The only dependencies are for indirect blocks.
8370	 *
8371	 * The file size for indirect block additions is not guaranteed.
8372	 * Such a guarantee would be non-trivial to achieve. The conventional
8373	 * synchronous write implementation also does not make this guarantee.
8374	 * Fsck should catch and fix discrepancies. Arguably, the file size
8375	 * can be over-estimated without destroying integrity when the file
8376	 * moves into the indirect blocks (i.e., is large). If we want to
8377	 * postpone fsck, we are stuck with this argument.
8378	 */
8379	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
8380		dp->di_ib[adp->ad_offset - NDADDR] = 0;
8381}
8382
8383/*
8384 * Cancel an indirdep as a result of truncation.  Release all of the
8385 * children allocindirs and place their journal work on the appropriate
8386 * list.
8387 */
8388static void
8389cancel_indirdep(indirdep, bp, inodedep, freeblks)
8390	struct indirdep *indirdep;
8391	struct buf *bp;
8392	struct inodedep *inodedep;
8393	struct freeblks *freeblks;
8394{
8395	struct allocindir *aip;
8396
8397	/*
8398	 * None of the indirect pointers will ever be visible,
8399	 * so they can simply be tossed. GOINGAWAY ensures
8400	 * that allocated pointers will be saved in the buffer
8401	 * cache until they are freed. Note that they will
8402	 * only be able to be found by their physical address
8403	 * since the inode mapping the logical address will
8404	 * be gone. The save buffer used for the safe copy
8405	 * was allocated in setup_allocindir_phase2 using
8406	 * the physical address so it could be used for this
8407	 * purpose. Hence we swap the safe copy with the real
8408	 * copy, allowing the safe copy to be freed and holding
8409	 * on to the real copy for later use in indir_trunc.
8410	 */
8411	if (indirdep->ir_state & GOINGAWAY)
8412		panic("cancel_indirdep: already gone");
8413	if (indirdep->ir_state & ONDEPLIST) {
8414		indirdep->ir_state &= ~ONDEPLIST;
8415		LIST_REMOVE(indirdep, ir_next);
8416	}
8417	indirdep->ir_state |= GOINGAWAY;
8418	VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1;
8419	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
8420		cancel_allocindir(aip, inodedep, freeblks);
8421	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
8422		cancel_allocindir(aip, inodedep, freeblks);
8423	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
8424		cancel_allocindir(aip, inodedep, freeblks);
8425	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
8426		cancel_allocindir(aip, inodedep, freeblks);
8427	bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
8428	WORKLIST_REMOVE(&indirdep->ir_list);
8429	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
8430	indirdep->ir_savebp = NULL;
8431}
8432
8433/*
8434 * Free an indirdep once it no longer has new pointers to track.
8435 */
8436static void
8437free_indirdep(indirdep)
8438	struct indirdep *indirdep;
8439{
8440
8441	KASSERT(LIST_EMPTY(&indirdep->ir_jwork),
8442	    ("free_indirdep: Journal work not empty."));
8443	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
8444	    ("free_indirdep: Complete head not empty."));
8445	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
8446	    ("free_indirdep: write head not empty."));
8447	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
8448	    ("free_indirdep: done head not empty."));
8449	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
8450	    ("free_indirdep: deplist head not empty."));
8451	KASSERT(indirdep->ir_savebp == NULL,
8452	    ("free_indirdep: %p ir_savebp != NULL", indirdep));
8453	KASSERT((indirdep->ir_state & ONDEPLIST) == 0,
8454	    ("free_indirdep: %p still on deplist.", indirdep));
8455	if (indirdep->ir_state & ONWORKLIST)
8456		WORKLIST_REMOVE(&indirdep->ir_list);
8457	WORKITEM_FREE(indirdep, D_INDIRDEP);
8458}
8459
8460/*
8461 * Called before a write to an indirdep.  This routine is responsible for
8462 * rolling back pointers to a safe state which includes only those
8463 * allocindirs which have been completed.
8464 */
8465static void
8466initiate_write_indirdep(indirdep, bp)
8467	struct indirdep *indirdep;
8468	struct buf *bp;
8469{
8470
8471	if (indirdep->ir_state & GOINGAWAY)
8472		panic("disk_io_initiation: indirdep gone");
8473
8474	/*
8475	 * If there are no remaining dependencies, this will be writing
8476	 * the real pointers.
8477	 */
8478	if (LIST_EMPTY(&indirdep->ir_deplisthd))
8479		return;
8480	/*
8481	 * Replace up-to-date version with safe version.
8482	 */
8483	FREE_LOCK(&lk);
8484	indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
8485	    M_SOFTDEP_FLAGS);
8486	ACQUIRE_LOCK(&lk);
8487	indirdep->ir_state &= ~ATTACHED;
8488	indirdep->ir_state |= UNDONE;
8489	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
8490	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
8491	    bp->b_bcount);
8492}
8493
8494/*
8495 * Called when an inode has been cleared in a cg bitmap.  This finally
8496 * eliminates any canceled jaddrefs
8497 */
8498void
8499softdep_setup_inofree(mp, bp, ino, wkhd)
8500	struct mount *mp;
8501	struct buf *bp;
8502	ino_t ino;
8503	struct workhead *wkhd;
8504{
8505	struct worklist *wk, *wkn;
8506	struct inodedep *inodedep;
8507	uint8_t *inosused;
8508	struct cg *cgp;
8509	struct fs *fs;
8510
8511	ACQUIRE_LOCK(&lk);
8512	fs = VFSTOUFS(mp)->um_fs;
8513	cgp = (struct cg *)bp->b_data;
8514	inosused = cg_inosused(cgp);
8515	if (isset(inosused, ino % fs->fs_ipg))
8516		panic("softdep_setup_inofree: inode %d not freed.", ino);
8517	if (inodedep_lookup(mp, ino, 0, &inodedep))
8518		panic("softdep_setup_inofree: ino %d has existing inodedep %p",
8519		    ino, inodedep);
8520	if (wkhd) {
8521		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
8522			if (wk->wk_type != D_JADDREF)
8523				continue;
8524			WORKLIST_REMOVE(wk);
8525			/*
8526			 * We can free immediately even if the jaddref
8527			 * isn't attached in a background write as now
8528			 * the bitmaps are reconciled.
8529		 	 */
8530			wk->wk_state |= COMPLETE | ATTACHED;
8531			free_jaddref(WK_JADDREF(wk));
8532		}
8533		jwork_move(&bp->b_dep, wkhd);
8534	}
8535	FREE_LOCK(&lk);
8536}
8537
8538
8539/*
8540 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
8541 * map.  Any dependencies waiting for the write to clear are added to the
8542 * buf's list and any jnewblks that are being canceled are discarded
8543 * immediately.
8544 */
8545void
8546softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
8547	struct mount *mp;
8548	struct buf *bp;
8549	ufs2_daddr_t blkno;
8550	int frags;
8551	struct workhead *wkhd;
8552{
8553	struct jnewblk *jnewblk;
8554	struct worklist *wk, *wkn;
8555#ifdef SUJ_DEBUG
8556	struct bmsafemap *bmsafemap;
8557	struct fs *fs;
8558	uint8_t *blksfree;
8559	struct cg *cgp;
8560	ufs2_daddr_t jstart;
8561	ufs2_daddr_t jend;
8562	ufs2_daddr_t end;
8563	long bno;
8564	int i;
8565#endif
8566
8567	ACQUIRE_LOCK(&lk);
8568	/*
8569	 * Detach any jnewblks which have been canceled.  They must linger
8570	 * until the bitmap is cleared again by ffs_blkfree() to prevent
8571	 * an unjournaled allocation from hitting the disk.
8572	 */
8573	if (wkhd) {
8574		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
8575			if (wk->wk_type != D_JNEWBLK)
8576				continue;
8577			jnewblk = WK_JNEWBLK(wk);
8578			KASSERT(jnewblk->jn_state & GOINGAWAY,
8579			    ("softdep_setup_blkfree: jnewblk not canceled."));
8580			WORKLIST_REMOVE(wk);
8581#ifdef SUJ_DEBUG
8582			/*
8583			 * Assert that this block is free in the bitmap
8584			 * before we discard the jnewblk.
8585			 */
8586			fs = VFSTOUFS(mp)->um_fs;
8587			cgp = (struct cg *)bp->b_data;
8588			blksfree = cg_blksfree(cgp);
8589			bno = dtogd(fs, jnewblk->jn_blkno);
8590			for (i = jnewblk->jn_oldfrags;
8591			    i < jnewblk->jn_frags; i++) {
8592				if (isset(blksfree, bno + i))
8593					continue;
8594				panic("softdep_setup_blkfree: not free");
8595			}
8596#endif
8597			/*
8598			 * Even if it's not attached we can free immediately
8599			 * as the new bitmap is correct.
8600			 */
8601			wk->wk_state |= COMPLETE | ATTACHED;
8602			free_jnewblk(jnewblk);
8603		}
8604		/*
8605		 * The buf must be locked by the caller otherwise these could
8606		 * be added while it's being written and the write would
8607		 * complete them before they made it to disk.
8608		 */
8609		jwork_move(&bp->b_dep, wkhd);
8610	}
8611
8612#ifdef SUJ_DEBUG
8613	/*
8614	 * Assert that we are not freeing a block which has an outstanding
8615	 * allocation dependency.
8616	 */
8617	fs = VFSTOUFS(mp)->um_fs;
8618	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno));
8619	end = blkno + frags;
8620	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
8621		/*
8622		 * Don't match against blocks that will be freed when the
8623		 * background write is done.
8624		 */
8625		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
8626		    (COMPLETE | DEPCOMPLETE))
8627			continue;
8628		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
8629		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
8630		if ((blkno >= jstart && blkno < jend) ||
8631		    (end > jstart && end <= jend)) {
8632			printf("state 0x%X %jd - %d %d dep %p\n",
8633			    jnewblk->jn_state, jnewblk->jn_blkno,
8634			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
8635			    jnewblk->jn_newblk);
8636			panic("softdep_setup_blkfree: "
8637			    "%jd-%jd(%d) overlaps with %jd-%jd",
8638			    blkno, end, frags, jstart, jend);
8639		}
8640	}
8641#endif
8642	FREE_LOCK(&lk);
8643}
8644
8645static void
8646initiate_write_bmsafemap(bmsafemap, bp)
8647	struct bmsafemap *bmsafemap;
8648	struct buf *bp;			/* The cg block. */
8649{
8650	struct jaddref *jaddref;
8651	struct jnewblk *jnewblk;
8652	uint8_t *inosused;
8653	uint8_t *blksfree;
8654	struct cg *cgp;
8655	struct fs *fs;
8656	int cleared;
8657	ino_t ino;
8658	long bno;
8659	int i;
8660
8661	if (bmsafemap->sm_state & IOSTARTED)
8662		panic("initiate_write_bmsafemap: Already started\n");
8663	bmsafemap->sm_state |= IOSTARTED;
8664	/*
8665	 * Clear any inode allocations which are pending journal writes.
8666	 */
8667	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
8668		cgp = (struct cg *)bp->b_data;
8669		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
8670		inosused = cg_inosused(cgp);
8671		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
8672			ino = jaddref->ja_ino % fs->fs_ipg;
8673			/*
8674			 * If this is a background copy the inode may not
8675			 * be marked used yet.
8676			 */
8677			if (isset(inosused, ino)) {
8678				if ((jaddref->ja_mode & IFMT) == IFDIR)
8679					cgp->cg_cs.cs_ndir--;
8680				cgp->cg_cs.cs_nifree++;
8681				clrbit(inosused, ino);
8682				jaddref->ja_state &= ~ATTACHED;
8683				jaddref->ja_state |= UNDONE;
8684				stat_jaddref++;
8685			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
8686				panic("initiate_write_bmsafemap: inode %d "
8687				    "marked free", jaddref->ja_ino);
8688		}
8689	}
8690	/*
8691	 * Clear any block allocations which are pending journal writes.
8692	 */
8693	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
8694		cgp = (struct cg *)bp->b_data;
8695		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
8696		blksfree = cg_blksfree(cgp);
8697		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
8698			bno = dtogd(fs, jnewblk->jn_blkno);
8699			cleared = 0;
8700			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
8701			    i++) {
8702				if (isclr(blksfree, bno + i)) {
8703					cleared = 1;
8704					setbit(blksfree, bno + i);
8705				}
8706			}
8707			/*
8708			 * We may not clear the block if it's a background
8709			 * copy.  In that case there is no reason to detach
8710			 * it.
8711			 */
8712			if (cleared) {
8713				stat_jnewblk++;
8714				jnewblk->jn_state &= ~ATTACHED;
8715				jnewblk->jn_state |= UNDONE;
8716			} else if ((bp->b_xflags & BX_BKGRDMARKER) == 0)
8717				panic("initiate_write_bmsafemap: block %jd "
8718				    "marked free", jnewblk->jn_blkno);
8719		}
8720	}
8721	/*
8722	 * Move allocation lists to the written lists so they can be
8723	 * cleared once the block write is complete.
8724	 */
8725	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
8726	    inodedep, id_deps);
8727	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
8728	    newblk, nb_deps);
8729}
8730
8731/*
8732 * This routine is called during the completion interrupt
8733 * service routine for a disk write (from the procedure called
8734 * by the device driver to inform the filesystem caches of
8735 * a request completion).  It should be called early in this
8736 * procedure, before the block is made available to other
8737 * processes or other routines are called.
8738 *
8739 */
8740static void
8741softdep_disk_write_complete(bp)
8742	struct buf *bp;		/* describes the completed disk write */
8743{
8744	struct worklist *wk;
8745	struct worklist *owk;
8746	struct workhead reattach;
8747	struct buf *sbp;
8748
8749	/*
8750	 * If an error occurred while doing the write, then the data
8751	 * has not hit the disk and the dependencies cannot be unrolled.
8752	 */
8753	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
8754		return;
8755	LIST_INIT(&reattach);
8756	/*
8757	 * This lock must not be released anywhere in this code segment.
8758	 */
8759	sbp = NULL;
8760	owk = NULL;
8761	ACQUIRE_LOCK(&lk);
8762	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
8763		WORKLIST_REMOVE(wk);
8764		if (wk == owk)
8765			panic("duplicate worklist: %p\n", wk);
8766		owk = wk;
8767		switch (wk->wk_type) {
8768
8769		case D_PAGEDEP:
8770			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
8771				WORKLIST_INSERT(&reattach, wk);
8772			continue;
8773
8774		case D_INODEDEP:
8775			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
8776				WORKLIST_INSERT(&reattach, wk);
8777			continue;
8778
8779		case D_BMSAFEMAP:
8780			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
8781				WORKLIST_INSERT(&reattach, wk);
8782			continue;
8783
8784		case D_MKDIR:
8785			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
8786			continue;
8787
8788		case D_ALLOCDIRECT:
8789			wk->wk_state |= COMPLETE;
8790			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
8791			continue;
8792
8793		case D_ALLOCINDIR:
8794			wk->wk_state |= COMPLETE;
8795			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
8796			continue;
8797
8798		case D_INDIRDEP:
8799			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
8800				WORKLIST_INSERT(&reattach, wk);
8801			continue;
8802
8803		case D_FREEBLKS:
8804			wk->wk_state |= COMPLETE;
8805			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
8806				add_to_worklist(wk, 1);
8807			continue;
8808
8809		case D_FREEWORK:
8810			handle_written_freework(WK_FREEWORK(wk));
8811			break;
8812
8813		case D_FREEDEP:
8814			free_freedep(WK_FREEDEP(wk));
8815			continue;
8816
8817		case D_JSEGDEP:
8818			free_jsegdep(WK_JSEGDEP(wk));
8819			continue;
8820
8821		case D_JSEG:
8822			handle_written_jseg(WK_JSEG(wk), bp);
8823			continue;
8824
8825		case D_SBDEP:
8826			if (handle_written_sbdep(WK_SBDEP(wk), bp))
8827				WORKLIST_INSERT(&reattach, wk);
8828			continue;
8829
8830		default:
8831			panic("handle_disk_write_complete: Unknown type %s",
8832			    TYPENAME(wk->wk_type));
8833			/* NOTREACHED */
8834		}
8835	}
8836	/*
8837	 * Reattach any requests that must be redone.
8838	 */
8839	while ((wk = LIST_FIRST(&reattach)) != NULL) {
8840		WORKLIST_REMOVE(wk);
8841		WORKLIST_INSERT(&bp->b_dep, wk);
8842	}
8843	FREE_LOCK(&lk);
8844	if (sbp)
8845		brelse(sbp);
8846}
8847
8848/*
8849 * Called from within softdep_disk_write_complete above. Note that
8850 * this routine is always called from interrupt level with further
8851 * splbio interrupts blocked.
8852 */
8853static void
8854handle_allocdirect_partdone(adp, wkhd)
8855	struct allocdirect *adp;	/* the completed allocdirect */
8856	struct workhead *wkhd;		/* Work to do when inode is writtne. */
8857{
8858	struct allocdirectlst *listhead;
8859	struct allocdirect *listadp;
8860	struct inodedep *inodedep;
8861	long bsize;
8862
8863	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
8864		return;
8865	/*
8866	 * The on-disk inode cannot claim to be any larger than the last
8867	 * fragment that has been written. Otherwise, the on-disk inode
8868	 * might have fragments that were not the last block in the file
8869	 * which would corrupt the filesystem. Thus, we cannot free any
8870	 * allocdirects after one whose ad_oldblkno claims a fragment as
8871	 * these blocks must be rolled back to zero before writing the inode.
8872	 * We check the currently active set of allocdirects in id_inoupdt
8873	 * or id_extupdt as appropriate.
8874	 */
8875	inodedep = adp->ad_inodedep;
8876	bsize = inodedep->id_fs->fs_bsize;
8877	if (adp->ad_state & EXTDATA)
8878		listhead = &inodedep->id_extupdt;
8879	else
8880		listhead = &inodedep->id_inoupdt;
8881	TAILQ_FOREACH(listadp, listhead, ad_next) {
8882		/* found our block */
8883		if (listadp == adp)
8884			break;
8885		/* continue if ad_oldlbn is not a fragment */
8886		if (listadp->ad_oldsize == 0 ||
8887		    listadp->ad_oldsize == bsize)
8888			continue;
8889		/* hit a fragment */
8890		return;
8891	}
8892	/*
8893	 * If we have reached the end of the current list without
8894	 * finding the just finished dependency, then it must be
8895	 * on the future dependency list. Future dependencies cannot
8896	 * be freed until they are moved to the current list.
8897	 */
8898	if (listadp == NULL) {
8899#ifdef DEBUG
8900		if (adp->ad_state & EXTDATA)
8901			listhead = &inodedep->id_newextupdt;
8902		else
8903			listhead = &inodedep->id_newinoupdt;
8904		TAILQ_FOREACH(listadp, listhead, ad_next)
8905			/* found our block */
8906			if (listadp == adp)
8907				break;
8908		if (listadp == NULL)
8909			panic("handle_allocdirect_partdone: lost dep");
8910#endif /* DEBUG */
8911		return;
8912	}
8913	/*
8914	 * If we have found the just finished dependency, then queue
8915	 * it along with anything that follows it that is complete.
8916	 * Since the pointer has not yet been written in the inode
8917	 * as the dependency prevents it, place the allocdirect on the
8918	 * bufwait list where it will be freed once the pointer is
8919	 * valid.
8920	 */
8921	if (wkhd == NULL)
8922		wkhd = &inodedep->id_bufwait;
8923	for (; adp; adp = listadp) {
8924		listadp = TAILQ_NEXT(adp, ad_next);
8925		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
8926			return;
8927		TAILQ_REMOVE(listhead, adp, ad_next);
8928		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
8929	}
8930}
8931
8932/*
8933 * Called from within softdep_disk_write_complete above.  This routine
8934 * completes successfully written allocindirs.
8935 */
8936static void
8937handle_allocindir_partdone(aip)
8938	struct allocindir *aip;		/* the completed allocindir */
8939{
8940	struct indirdep *indirdep;
8941
8942	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
8943		return;
8944	indirdep = aip->ai_indirdep;
8945	LIST_REMOVE(aip, ai_next);
8946	if (indirdep->ir_state & UNDONE) {
8947		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
8948		return;
8949	}
8950	if (indirdep->ir_state & UFS1FMT)
8951		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
8952		    aip->ai_newblkno;
8953	else
8954		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
8955		    aip->ai_newblkno;
8956	/*
8957	 * Await the pointer write before freeing the allocindir.
8958	 */
8959	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
8960}
8961
8962/*
8963 * Release segments held on a jwork list.
8964 */
8965static void
8966handle_jwork(wkhd)
8967	struct workhead *wkhd;
8968{
8969	struct worklist *wk;
8970
8971	while ((wk = LIST_FIRST(wkhd)) != NULL) {
8972		WORKLIST_REMOVE(wk);
8973		switch (wk->wk_type) {
8974		case D_JSEGDEP:
8975			free_jsegdep(WK_JSEGDEP(wk));
8976			continue;
8977		default:
8978			panic("handle_jwork: Unknown type %s\n",
8979			    TYPENAME(wk->wk_type));
8980		}
8981	}
8982}
8983
8984/*
8985 * Handle the bufwait list on an inode when it is safe to release items
8986 * held there.  This normally happens after an inode block is written but
8987 * may be delayed and handle later if there are pending journal items that
8988 * are not yet safe to be released.
8989 */
8990static struct freefile *
8991handle_bufwait(inodedep, refhd)
8992	struct inodedep *inodedep;
8993	struct workhead *refhd;
8994{
8995	struct jaddref *jaddref;
8996	struct freefile *freefile;
8997	struct worklist *wk;
8998
8999	freefile = NULL;
9000	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
9001		WORKLIST_REMOVE(wk);
9002		switch (wk->wk_type) {
9003		case D_FREEFILE:
9004			/*
9005			 * We defer adding freefile to the worklist
9006			 * until all other additions have been made to
9007			 * ensure that it will be done after all the
9008			 * old blocks have been freed.
9009			 */
9010			if (freefile != NULL)
9011				panic("handle_bufwait: freefile");
9012			freefile = WK_FREEFILE(wk);
9013			continue;
9014
9015		case D_MKDIR:
9016			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
9017			continue;
9018
9019		case D_DIRADD:
9020			diradd_inode_written(WK_DIRADD(wk), inodedep);
9021			continue;
9022
9023		case D_FREEFRAG:
9024			wk->wk_state |= COMPLETE;
9025			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
9026				add_to_worklist(wk, 0);
9027			continue;
9028
9029		case D_DIRREM:
9030			wk->wk_state |= COMPLETE;
9031			add_to_worklist(wk, 0);
9032			continue;
9033
9034		case D_ALLOCDIRECT:
9035		case D_ALLOCINDIR:
9036			free_newblk(WK_NEWBLK(wk));
9037			continue;
9038
9039		case D_JNEWBLK:
9040			wk->wk_state |= COMPLETE;
9041			free_jnewblk(WK_JNEWBLK(wk));
9042			continue;
9043
9044		/*
9045		 * Save freed journal segments and add references on
9046		 * the supplied list which will delay their release
9047		 * until the cg bitmap is cleared on disk.
9048		 */
9049		case D_JSEGDEP:
9050			if (refhd == NULL)
9051				free_jsegdep(WK_JSEGDEP(wk));
9052			else
9053				WORKLIST_INSERT(refhd, wk);
9054			continue;
9055
9056		case D_JADDREF:
9057			jaddref = WK_JADDREF(wk);
9058			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
9059			    if_deps);
9060			/*
9061			 * Transfer any jaddrefs to the list to be freed with
9062			 * the bitmap if we're handling a removed file.
9063			 */
9064			if (refhd == NULL) {
9065				wk->wk_state |= COMPLETE;
9066				free_jaddref(jaddref);
9067			} else
9068				WORKLIST_INSERT(refhd, wk);
9069			continue;
9070
9071		default:
9072			panic("handle_bufwait: Unknown type %p(%s)",
9073			    wk, TYPENAME(wk->wk_type));
9074			/* NOTREACHED */
9075		}
9076	}
9077	return (freefile);
9078}
9079/*
9080 * Called from within softdep_disk_write_complete above to restore
9081 * in-memory inode block contents to their most up-to-date state. Note
9082 * that this routine is always called from interrupt level with further
9083 * splbio interrupts blocked.
9084 */
9085static int
9086handle_written_inodeblock(inodedep, bp)
9087	struct inodedep *inodedep;
9088	struct buf *bp;		/* buffer containing the inode block */
9089{
9090	struct freefile *freefile;
9091	struct allocdirect *adp, *nextadp;
9092	struct ufs1_dinode *dp1 = NULL;
9093	struct ufs2_dinode *dp2 = NULL;
9094	struct workhead wkhd;
9095	int hadchanges, fstype;
9096	ino_t freelink;
9097
9098	LIST_INIT(&wkhd);
9099	hadchanges = 0;
9100	freefile = NULL;
9101	if ((inodedep->id_state & IOSTARTED) == 0)
9102		panic("handle_written_inodeblock: not started");
9103	inodedep->id_state &= ~IOSTARTED;
9104	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
9105		fstype = UFS1;
9106		dp1 = (struct ufs1_dinode *)bp->b_data +
9107		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9108		freelink = dp1->di_freelink;
9109	} else {
9110		fstype = UFS2;
9111		dp2 = (struct ufs2_dinode *)bp->b_data +
9112		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
9113		freelink = dp2->di_freelink;
9114	}
9115	/*
9116	 * If we wrote a valid freelink pointer during the last write
9117	 * record it here.
9118	 */
9119	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9120		struct inodedep *inon;
9121
9122		inon = TAILQ_NEXT(inodedep, id_unlinked);
9123		if ((inon == NULL && freelink == 0) ||
9124		    (inon && inon->id_ino == freelink)) {
9125			if (inon)
9126				inon->id_state |= UNLINKPREV;
9127			inodedep->id_state |= UNLINKNEXT;
9128		} else
9129			hadchanges = 1;
9130	}
9131	/* Leave this inodeblock dirty until it's in the list. */
9132	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED)
9133		hadchanges = 1;
9134	/*
9135	 * If we had to rollback the inode allocation because of
9136	 * bitmaps being incomplete, then simply restore it.
9137	 * Keep the block dirty so that it will not be reclaimed until
9138	 * all associated dependencies have been cleared and the
9139	 * corresponding updates written to disk.
9140	 */
9141	if (inodedep->id_savedino1 != NULL) {
9142		hadchanges = 1;
9143		if (fstype == UFS1)
9144			*dp1 = *inodedep->id_savedino1;
9145		else
9146			*dp2 = *inodedep->id_savedino2;
9147		free(inodedep->id_savedino1, M_SAVEDINO);
9148		inodedep->id_savedino1 = NULL;
9149		if ((bp->b_flags & B_DELWRI) == 0)
9150			stat_inode_bitmap++;
9151		bdirty(bp);
9152		/*
9153		 * If the inode is clear here and GOINGAWAY it will never
9154		 * be written.  Process the bufwait and clear any pending
9155		 * work which may include the freefile.
9156		 */
9157		if (inodedep->id_state & GOINGAWAY)
9158			goto bufwait;
9159		return (1);
9160	}
9161	inodedep->id_state |= COMPLETE;
9162	/*
9163	 * Roll forward anything that had to be rolled back before
9164	 * the inode could be updated.
9165	 */
9166	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
9167		nextadp = TAILQ_NEXT(adp, ad_next);
9168		if (adp->ad_state & ATTACHED)
9169			panic("handle_written_inodeblock: new entry");
9170		if (fstype == UFS1) {
9171			if (adp->ad_offset < NDADDR) {
9172				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9173					panic("%s %s #%jd mismatch %d != %jd",
9174					    "handle_written_inodeblock:",
9175					    "direct pointer",
9176					    (intmax_t)adp->ad_offset,
9177					    dp1->di_db[adp->ad_offset],
9178					    (intmax_t)adp->ad_oldblkno);
9179				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
9180			} else {
9181				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
9182					panic("%s: %s #%jd allocated as %d",
9183					    "handle_written_inodeblock",
9184					    "indirect pointer",
9185					    (intmax_t)adp->ad_offset - NDADDR,
9186					    dp1->di_ib[adp->ad_offset - NDADDR]);
9187				dp1->di_ib[adp->ad_offset - NDADDR] =
9188				    adp->ad_newblkno;
9189			}
9190		} else {
9191			if (adp->ad_offset < NDADDR) {
9192				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
9193					panic("%s: %s #%jd %s %jd != %jd",
9194					    "handle_written_inodeblock",
9195					    "direct pointer",
9196					    (intmax_t)adp->ad_offset, "mismatch",
9197					    (intmax_t)dp2->di_db[adp->ad_offset],
9198					    (intmax_t)adp->ad_oldblkno);
9199				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
9200			} else {
9201				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
9202					panic("%s: %s #%jd allocated as %jd",
9203					    "handle_written_inodeblock",
9204					    "indirect pointer",
9205					    (intmax_t)adp->ad_offset - NDADDR,
9206					    (intmax_t)
9207					    dp2->di_ib[adp->ad_offset - NDADDR]);
9208				dp2->di_ib[adp->ad_offset - NDADDR] =
9209				    adp->ad_newblkno;
9210			}
9211		}
9212		adp->ad_state &= ~UNDONE;
9213		adp->ad_state |= ATTACHED;
9214		hadchanges = 1;
9215	}
9216	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
9217		nextadp = TAILQ_NEXT(adp, ad_next);
9218		if (adp->ad_state & ATTACHED)
9219			panic("handle_written_inodeblock: new entry");
9220		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
9221			panic("%s: direct pointers #%jd %s %jd != %jd",
9222			    "handle_written_inodeblock",
9223			    (intmax_t)adp->ad_offset, "mismatch",
9224			    (intmax_t)dp2->di_extb[adp->ad_offset],
9225			    (intmax_t)adp->ad_oldblkno);
9226		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
9227		adp->ad_state &= ~UNDONE;
9228		adp->ad_state |= ATTACHED;
9229		hadchanges = 1;
9230	}
9231	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
9232		stat_direct_blk_ptrs++;
9233	/*
9234	 * Reset the file size to its most up-to-date value.
9235	 */
9236	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
9237		panic("handle_written_inodeblock: bad size");
9238	if (inodedep->id_savednlink > LINK_MAX)
9239		panic("handle_written_inodeblock: Invalid link count "
9240		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
9241	if (fstype == UFS1) {
9242		if (dp1->di_nlink != inodedep->id_savednlink) {
9243			dp1->di_nlink = inodedep->id_savednlink;
9244			hadchanges = 1;
9245		}
9246		if (dp1->di_size != inodedep->id_savedsize) {
9247			dp1->di_size = inodedep->id_savedsize;
9248			hadchanges = 1;
9249		}
9250	} else {
9251		if (dp2->di_nlink != inodedep->id_savednlink) {
9252			dp2->di_nlink = inodedep->id_savednlink;
9253			hadchanges = 1;
9254		}
9255		if (dp2->di_size != inodedep->id_savedsize) {
9256			dp2->di_size = inodedep->id_savedsize;
9257			hadchanges = 1;
9258		}
9259		if (dp2->di_extsize != inodedep->id_savedextsize) {
9260			dp2->di_extsize = inodedep->id_savedextsize;
9261			hadchanges = 1;
9262		}
9263	}
9264	inodedep->id_savedsize = -1;
9265	inodedep->id_savedextsize = -1;
9266	inodedep->id_savednlink = -1;
9267	/*
9268	 * If there were any rollbacks in the inode block, then it must be
9269	 * marked dirty so that its will eventually get written back in
9270	 * its correct form.
9271	 */
9272	if (hadchanges)
9273		bdirty(bp);
9274bufwait:
9275	/*
9276	 * Process any allocdirects that completed during the update.
9277	 */
9278	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
9279		handle_allocdirect_partdone(adp, &wkhd);
9280	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
9281		handle_allocdirect_partdone(adp, &wkhd);
9282	/*
9283	 * Process deallocations that were held pending until the
9284	 * inode had been written to disk. Freeing of the inode
9285	 * is delayed until after all blocks have been freed to
9286	 * avoid creation of new <vfsid, inum, lbn> triples
9287	 * before the old ones have been deleted.  Completely
9288	 * unlinked inodes are not processed until the unlinked
9289	 * inode list is written or the last reference is removed.
9290	 */
9291	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
9292		freefile = handle_bufwait(inodedep, NULL);
9293		if (freefile && !LIST_EMPTY(&wkhd)) {
9294			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
9295			freefile = NULL;
9296		}
9297	}
9298	/*
9299	 * Move rolled forward dependency completions to the bufwait list
9300	 * now that those that were already written have been processed.
9301	 */
9302	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
9303		panic("handle_written_inodeblock: bufwait but no changes");
9304	jwork_move(&inodedep->id_bufwait, &wkhd);
9305
9306	if (freefile != NULL) {
9307		/*
9308		 * If the inode is goingaway it was never written.  Fake up
9309		 * the state here so free_inodedep() can succeed.
9310		 */
9311		if (inodedep->id_state & GOINGAWAY)
9312			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
9313		if (free_inodedep(inodedep) == 0)
9314			panic("handle_written_inodeblock: live inodedep %p",
9315			    inodedep);
9316		add_to_worklist(&freefile->fx_list, 0);
9317		return (0);
9318	}
9319
9320	/*
9321	 * If no outstanding dependencies, free it.
9322	 */
9323	if (free_inodedep(inodedep) ||
9324	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
9325	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
9326	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
9327	     LIST_FIRST(&inodedep->id_bufwait) == 0))
9328		return (0);
9329	return (hadchanges);
9330}
9331
9332static int
9333handle_written_indirdep(indirdep, bp, bpp)
9334	struct indirdep *indirdep;
9335	struct buf *bp;
9336	struct buf **bpp;
9337{
9338	struct allocindir *aip;
9339	int chgs;
9340
9341	if (indirdep->ir_state & GOINGAWAY)
9342		panic("disk_write_complete: indirdep gone");
9343	chgs = 0;
9344	/*
9345	 * If there were rollbacks revert them here.
9346	 */
9347	if (indirdep->ir_saveddata) {
9348		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
9349		free(indirdep->ir_saveddata, M_INDIRDEP);
9350		indirdep->ir_saveddata = 0;
9351		chgs = 1;
9352	}
9353	indirdep->ir_state &= ~UNDONE;
9354	indirdep->ir_state |= ATTACHED;
9355	/*
9356	 * Move allocindirs with written pointers to the completehd if
9357	 * the the indirdep's pointer is not yet written.  Otherwise
9358	 * free them here.
9359	 */
9360	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
9361		LIST_REMOVE(aip, ai_next);
9362		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
9363			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
9364			    ai_next);
9365			continue;
9366		}
9367		free_newblk(&aip->ai_block);
9368	}
9369	/*
9370	 * Move allocindirs that have finished dependency processing from
9371	 * the done list to the write list after updating the pointers.
9372	 */
9373	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
9374		handle_allocindir_partdone(aip);
9375		if (aip == LIST_FIRST(&indirdep->ir_donehd))
9376			panic("disk_write_complete: not gone");
9377		chgs = 1;
9378	}
9379	/*
9380	 * If this indirdep has been detached from its newblk during
9381	 * I/O we need to keep this dep attached to the buffer so
9382	 * deallocate_dependencies can find it and properly resolve
9383	 * any outstanding dependencies.
9384	 */
9385	if ((indirdep->ir_state & (ONDEPLIST | DEPCOMPLETE)) == 0)
9386		chgs = 1;
9387	if ((bp->b_flags & B_DELWRI) == 0)
9388		stat_indir_blk_ptrs++;
9389	/*
9390	 * If there were no changes we can discard the savedbp and detach
9391	 * ourselves from the buf.  We are only carrying completed pointers
9392	 * in this case.
9393	 */
9394	if (chgs == 0) {
9395		struct buf *sbp;
9396
9397		sbp = indirdep->ir_savebp;
9398		sbp->b_flags |= B_INVAL | B_NOCACHE;
9399		indirdep->ir_savebp = NULL;
9400		if (*bpp != NULL)
9401			panic("handle_written_indirdep: bp already exists.");
9402		*bpp = sbp;
9403	} else
9404		bdirty(bp);
9405	/*
9406	 * If there are no fresh dependencies and none waiting on writes
9407	 * we can free the indirdep.
9408	 */
9409	if ((indirdep->ir_state & DEPCOMPLETE) && chgs == 0) {
9410		if (indirdep->ir_state & ONDEPLIST)
9411			LIST_REMOVE(indirdep, ir_next);
9412		free_indirdep(indirdep);
9413		return (0);
9414	}
9415
9416	return (chgs);
9417}
9418
9419/*
9420 * Process a diradd entry after its dependent inode has been written.
9421 * This routine must be called with splbio interrupts blocked.
9422 */
9423static void
9424diradd_inode_written(dap, inodedep)
9425	struct diradd *dap;
9426	struct inodedep *inodedep;
9427{
9428
9429	dap->da_state |= COMPLETE;
9430	complete_diradd(dap);
9431	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9432}
9433
9434/*
9435 * Returns true if the bmsafemap will have rollbacks when written.  Must
9436 * only be called with lk and the buf lock on the cg held.
9437 */
9438static int
9439bmsafemap_rollbacks(bmsafemap)
9440	struct bmsafemap *bmsafemap;
9441{
9442
9443	return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
9444	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd));
9445}
9446
9447/*
9448 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
9449 * changes if it's not a background write.  Set all written dependencies
9450 * to DEPCOMPLETE and free the structure if possible.
9451 */
9452static int
9453handle_written_bmsafemap(bmsafemap, bp)
9454	struct bmsafemap *bmsafemap;
9455	struct buf *bp;
9456{
9457	struct newblk *newblk;
9458	struct inodedep *inodedep;
9459	struct jaddref *jaddref, *jatmp;
9460	struct jnewblk *jnewblk, *jntmp;
9461	uint8_t *inosused;
9462	uint8_t *blksfree;
9463	struct cg *cgp;
9464	struct fs *fs;
9465	ino_t ino;
9466	long bno;
9467	int chgs;
9468	int i;
9469
9470	if ((bmsafemap->sm_state & IOSTARTED) == 0)
9471		panic("initiate_write_bmsafemap: Not started\n");
9472	chgs = 0;
9473	bmsafemap->sm_state &= ~IOSTARTED;
9474	/*
9475	 * Restore unwritten inode allocation pending jaddref writes.
9476	 */
9477	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
9478		cgp = (struct cg *)bp->b_data;
9479		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9480		inosused = cg_inosused(cgp);
9481		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
9482		    ja_bmdeps, jatmp) {
9483			if ((jaddref->ja_state & UNDONE) == 0)
9484				continue;
9485			ino = jaddref->ja_ino % fs->fs_ipg;
9486			if (isset(inosused, ino))
9487				panic("handle_written_bmsafemap: "
9488				    "re-allocated inode");
9489			if ((bp->b_xflags & BX_BKGRDMARKER) == 0) {
9490				if ((jaddref->ja_mode & IFMT) == IFDIR)
9491					cgp->cg_cs.cs_ndir++;
9492				cgp->cg_cs.cs_nifree--;
9493				setbit(inosused, ino);
9494				chgs = 1;
9495			}
9496			jaddref->ja_state &= ~UNDONE;
9497			jaddref->ja_state |= ATTACHED;
9498			free_jaddref(jaddref);
9499		}
9500	}
9501	/*
9502	 * Restore any block allocations which are pending journal writes.
9503	 */
9504	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
9505		cgp = (struct cg *)bp->b_data;
9506		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
9507		blksfree = cg_blksfree(cgp);
9508		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
9509		    jntmp) {
9510			if ((jnewblk->jn_state & UNDONE) == 0)
9511				continue;
9512			bno = dtogd(fs, jnewblk->jn_blkno);
9513			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
9514			    i++) {
9515				if (bp->b_xflags & BX_BKGRDMARKER)
9516					break;
9517				if ((jnewblk->jn_state & NEWBLOCK) == 0 &&
9518				    isclr(blksfree, bno + i))
9519					panic("handle_written_bmsafemap: "
9520					    "re-allocated fragment");
9521				clrbit(blksfree, bno + i);
9522				chgs = 1;
9523			}
9524			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
9525			jnewblk->jn_state |= ATTACHED;
9526			free_jnewblk(jnewblk);
9527		}
9528	}
9529	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
9530		newblk->nb_state |= DEPCOMPLETE;
9531		newblk->nb_state &= ~ONDEPLIST;
9532		newblk->nb_bmsafemap = NULL;
9533		LIST_REMOVE(newblk, nb_deps);
9534		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
9535			handle_allocdirect_partdone(
9536			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
9537		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
9538			handle_allocindir_partdone(
9539			    WK_ALLOCINDIR(&newblk->nb_list));
9540		else if (newblk->nb_list.wk_type != D_NEWBLK)
9541			panic("handle_written_bmsafemap: Unexpected type: %s",
9542			    TYPENAME(newblk->nb_list.wk_type));
9543	}
9544	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
9545		inodedep->id_state |= DEPCOMPLETE;
9546		inodedep->id_state &= ~ONDEPLIST;
9547		LIST_REMOVE(inodedep, id_deps);
9548		inodedep->id_bmsafemap = NULL;
9549	}
9550	if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
9551	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
9552	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
9553	    LIST_EMPTY(&bmsafemap->sm_inodedephd)) {
9554		if (chgs)
9555			bdirty(bp);
9556		LIST_REMOVE(bmsafemap, sm_hash);
9557		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
9558		return (0);
9559	}
9560	bdirty(bp);
9561	return (1);
9562}
9563
9564/*
9565 * Try to free a mkdir dependency.
9566 */
9567static void
9568complete_mkdir(mkdir)
9569	struct mkdir *mkdir;
9570{
9571	struct diradd *dap;
9572
9573	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
9574		return;
9575	LIST_REMOVE(mkdir, md_mkdirs);
9576	dap = mkdir->md_diradd;
9577	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
9578	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
9579		dap->da_state |= DEPCOMPLETE;
9580		complete_diradd(dap);
9581	}
9582	WORKITEM_FREE(mkdir, D_MKDIR);
9583}
9584
9585/*
9586 * Handle the completion of a mkdir dependency.
9587 */
9588static void
9589handle_written_mkdir(mkdir, type)
9590	struct mkdir *mkdir;
9591	int type;
9592{
9593
9594	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
9595		panic("handle_written_mkdir: bad type");
9596	mkdir->md_state |= COMPLETE;
9597	complete_mkdir(mkdir);
9598}
9599
9600static void
9601free_pagedep(pagedep)
9602	struct pagedep *pagedep;
9603{
9604	int i;
9605
9606	if (pagedep->pd_state & (NEWBLOCK | ONWORKLIST))
9607		return;
9608	for (i = 0; i < DAHASHSZ; i++)
9609		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
9610			return;
9611	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
9612		return;
9613	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
9614		return;
9615	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
9616		return;
9617	LIST_REMOVE(pagedep, pd_hash);
9618	WORKITEM_FREE(pagedep, D_PAGEDEP);
9619}
9620
9621/*
9622 * Called from within softdep_disk_write_complete above.
9623 * A write operation was just completed. Removed inodes can
9624 * now be freed and associated block pointers may be committed.
9625 * Note that this routine is always called from interrupt level
9626 * with further splbio interrupts blocked.
9627 */
9628static int
9629handle_written_filepage(pagedep, bp)
9630	struct pagedep *pagedep;
9631	struct buf *bp;		/* buffer containing the written page */
9632{
9633	struct dirrem *dirrem;
9634	struct diradd *dap, *nextdap;
9635	struct direct *ep;
9636	int i, chgs;
9637
9638	if ((pagedep->pd_state & IOSTARTED) == 0)
9639		panic("handle_written_filepage: not started");
9640	pagedep->pd_state &= ~IOSTARTED;
9641	/*
9642	 * Process any directory removals that have been committed.
9643	 */
9644	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
9645		LIST_REMOVE(dirrem, dm_next);
9646		dirrem->dm_state |= COMPLETE;
9647		dirrem->dm_dirinum = pagedep->pd_ino;
9648		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9649		    ("handle_written_filepage: Journal entries not written."));
9650		add_to_worklist(&dirrem->dm_list, 0);
9651	}
9652	/*
9653	 * Free any directory additions that have been committed.
9654	 * If it is a newly allocated block, we have to wait until
9655	 * the on-disk directory inode claims the new block.
9656	 */
9657	if ((pagedep->pd_state & NEWBLOCK) == 0)
9658		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
9659			free_diradd(dap, NULL);
9660	/*
9661	 * Uncommitted directory entries must be restored.
9662	 */
9663	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
9664		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
9665		     dap = nextdap) {
9666			nextdap = LIST_NEXT(dap, da_pdlist);
9667			if (dap->da_state & ATTACHED)
9668				panic("handle_written_filepage: attached");
9669			ep = (struct direct *)
9670			    ((char *)bp->b_data + dap->da_offset);
9671			ep->d_ino = dap->da_newinum;
9672			dap->da_state &= ~UNDONE;
9673			dap->da_state |= ATTACHED;
9674			chgs = 1;
9675			/*
9676			 * If the inode referenced by the directory has
9677			 * been written out, then the dependency can be
9678			 * moved to the pending list.
9679			 */
9680			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
9681				LIST_REMOVE(dap, da_pdlist);
9682				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
9683				    da_pdlist);
9684			}
9685		}
9686	}
9687	/*
9688	 * If there were any rollbacks in the directory, then it must be
9689	 * marked dirty so that its will eventually get written back in
9690	 * its correct form.
9691	 */
9692	if (chgs) {
9693		if ((bp->b_flags & B_DELWRI) == 0)
9694			stat_dir_entry++;
9695		bdirty(bp);
9696		return (1);
9697	}
9698	/*
9699	 * If we are not waiting for a new directory block to be
9700	 * claimed by its inode, then the pagedep will be freed.
9701	 * Otherwise it will remain to track any new entries on
9702	 * the page in case they are fsync'ed.
9703	 */
9704	if ((pagedep->pd_state & NEWBLOCK) == 0 &&
9705	    LIST_EMPTY(&pagedep->pd_jmvrefhd)) {
9706		LIST_REMOVE(pagedep, pd_hash);
9707		WORKITEM_FREE(pagedep, D_PAGEDEP);
9708	}
9709	return (0);
9710}
9711
9712/*
9713 * Writing back in-core inode structures.
9714 *
9715 * The filesystem only accesses an inode's contents when it occupies an
9716 * "in-core" inode structure.  These "in-core" structures are separate from
9717 * the page frames used to cache inode blocks.  Only the latter are
9718 * transferred to/from the disk.  So, when the updated contents of the
9719 * "in-core" inode structure are copied to the corresponding in-memory inode
9720 * block, the dependencies are also transferred.  The following procedure is
9721 * called when copying a dirty "in-core" inode to a cached inode block.
9722 */
9723
9724/*
9725 * Called when an inode is loaded from disk. If the effective link count
9726 * differed from the actual link count when it was last flushed, then we
9727 * need to ensure that the correct effective link count is put back.
9728 */
9729void
9730softdep_load_inodeblock(ip)
9731	struct inode *ip;	/* the "in_core" copy of the inode */
9732{
9733	struct inodedep *inodedep;
9734
9735	/*
9736	 * Check for alternate nlink count.
9737	 */
9738	ip->i_effnlink = ip->i_nlink;
9739	ACQUIRE_LOCK(&lk);
9740	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9741	    &inodedep) == 0) {
9742		FREE_LOCK(&lk);
9743		return;
9744	}
9745	ip->i_effnlink -= inodedep->id_nlinkdelta;
9746	if (inodedep->id_state & SPACECOUNTED)
9747		ip->i_flag |= IN_SPACECOUNTED;
9748	FREE_LOCK(&lk);
9749}
9750
9751/*
9752 * This routine is called just before the "in-core" inode
9753 * information is to be copied to the in-memory inode block.
9754 * Recall that an inode block contains several inodes. If
9755 * the force flag is set, then the dependencies will be
9756 * cleared so that the update can always be made. Note that
9757 * the buffer is locked when this routine is called, so we
9758 * will never be in the middle of writing the inode block
9759 * to disk.
9760 */
9761void
9762softdep_update_inodeblock(ip, bp, waitfor)
9763	struct inode *ip;	/* the "in_core" copy of the inode */
9764	struct buf *bp;		/* the buffer containing the inode block */
9765	int waitfor;		/* nonzero => update must be allowed */
9766{
9767	struct inodedep *inodedep;
9768	struct inoref *inoref;
9769	struct worklist *wk;
9770	struct mount *mp;
9771	struct buf *ibp;
9772	struct fs *fs;
9773	int error;
9774
9775	mp = UFSTOVFS(ip->i_ump);
9776	fs = ip->i_fs;
9777	/*
9778	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
9779	 * does not have access to the in-core ip so must write directly into
9780	 * the inode block buffer when setting freelink.
9781	 */
9782	if (fs->fs_magic == FS_UFS1_MAGIC)
9783		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
9784		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
9785	else
9786		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
9787		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
9788	/*
9789	 * If the effective link count is not equal to the actual link
9790	 * count, then we must track the difference in an inodedep while
9791	 * the inode is (potentially) tossed out of the cache. Otherwise,
9792	 * if there is no existing inodedep, then there are no dependencies
9793	 * to track.
9794	 */
9795	ACQUIRE_LOCK(&lk);
9796again:
9797	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
9798		FREE_LOCK(&lk);
9799		if (ip->i_effnlink != ip->i_nlink)
9800			panic("softdep_update_inodeblock: bad link count");
9801		return;
9802	}
9803	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
9804		panic("softdep_update_inodeblock: bad delta");
9805	/*
9806	 * If we're flushing all dependencies we must also move any waiting
9807	 * for journal writes onto the bufwait list prior to I/O.
9808	 */
9809	if (waitfor) {
9810		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
9811			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
9812			    == DEPCOMPLETE) {
9813				stat_jwait_inode++;
9814				jwait(&inoref->if_list);
9815				goto again;
9816			}
9817		}
9818	}
9819	/*
9820	 * Changes have been initiated. Anything depending on these
9821	 * changes cannot occur until this inode has been written.
9822	 */
9823	inodedep->id_state &= ~COMPLETE;
9824	if ((inodedep->id_state & ONWORKLIST) == 0)
9825		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
9826	/*
9827	 * Any new dependencies associated with the incore inode must
9828	 * now be moved to the list associated with the buffer holding
9829	 * the in-memory copy of the inode. Once merged process any
9830	 * allocdirects that are completed by the merger.
9831	 */
9832	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
9833	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
9834		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
9835		    NULL);
9836	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
9837	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
9838		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
9839		    NULL);
9840	/*
9841	 * Now that the inode has been pushed into the buffer, the
9842	 * operations dependent on the inode being written to disk
9843	 * can be moved to the id_bufwait so that they will be
9844	 * processed when the buffer I/O completes.
9845	 */
9846	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
9847		WORKLIST_REMOVE(wk);
9848		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
9849	}
9850	/*
9851	 * Newly allocated inodes cannot be written until the bitmap
9852	 * that allocates them have been written (indicated by
9853	 * DEPCOMPLETE being set in id_state). If we are doing a
9854	 * forced sync (e.g., an fsync on a file), we force the bitmap
9855	 * to be written so that the update can be done.
9856	 */
9857	if (waitfor == 0) {
9858		FREE_LOCK(&lk);
9859		return;
9860	}
9861retry:
9862	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
9863		FREE_LOCK(&lk);
9864		return;
9865	}
9866	ibp = inodedep->id_bmsafemap->sm_buf;
9867	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
9868	if (ibp == NULL) {
9869		/*
9870		 * If ibp came back as NULL, the dependency could have been
9871		 * freed while we slept.  Look it up again, and check to see
9872		 * that it has completed.
9873		 */
9874		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
9875			goto retry;
9876		FREE_LOCK(&lk);
9877		return;
9878	}
9879	FREE_LOCK(&lk);
9880	if ((error = bwrite(ibp)) != 0)
9881		softdep_error("softdep_update_inodeblock: bwrite", error);
9882}
9883
9884/*
9885 * Merge the a new inode dependency list (such as id_newinoupdt) into an
9886 * old inode dependency list (such as id_inoupdt). This routine must be
9887 * called with splbio interrupts blocked.
9888 */
9889static void
9890merge_inode_lists(newlisthead, oldlisthead)
9891	struct allocdirectlst *newlisthead;
9892	struct allocdirectlst *oldlisthead;
9893{
9894	struct allocdirect *listadp, *newadp;
9895
9896	newadp = TAILQ_FIRST(newlisthead);
9897	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
9898		if (listadp->ad_offset < newadp->ad_offset) {
9899			listadp = TAILQ_NEXT(listadp, ad_next);
9900			continue;
9901		}
9902		TAILQ_REMOVE(newlisthead, newadp, ad_next);
9903		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
9904		if (listadp->ad_offset == newadp->ad_offset) {
9905			allocdirect_merge(oldlisthead, newadp,
9906			    listadp);
9907			listadp = newadp;
9908		}
9909		newadp = TAILQ_FIRST(newlisthead);
9910	}
9911	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
9912		TAILQ_REMOVE(newlisthead, newadp, ad_next);
9913		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
9914	}
9915}
9916
9917/*
9918 * If we are doing an fsync, then we must ensure that any directory
9919 * entries for the inode have been written after the inode gets to disk.
9920 */
9921int
9922softdep_fsync(vp)
9923	struct vnode *vp;	/* the "in_core" copy of the inode */
9924{
9925	struct inodedep *inodedep;
9926	struct pagedep *pagedep;
9927	struct inoref *inoref;
9928	struct worklist *wk;
9929	struct diradd *dap;
9930	struct mount *mp;
9931	struct vnode *pvp;
9932	struct inode *ip;
9933	struct buf *bp;
9934	struct fs *fs;
9935	struct thread *td = curthread;
9936	int error, flushparent, pagedep_new_block;
9937	ino_t parentino;
9938	ufs_lbn_t lbn;
9939
9940	ip = VTOI(vp);
9941	fs = ip->i_fs;
9942	mp = vp->v_mount;
9943	ACQUIRE_LOCK(&lk);
9944restart:
9945	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
9946		FREE_LOCK(&lk);
9947		return (0);
9948	}
9949	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
9950		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
9951		    == DEPCOMPLETE) {
9952			stat_jwait_inode++;
9953			jwait(&inoref->if_list);
9954			goto restart;
9955		}
9956	}
9957	if (!LIST_EMPTY(&inodedep->id_inowait) ||
9958	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
9959	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
9960	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
9961	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
9962		panic("softdep_fsync: pending ops %p", inodedep);
9963	for (error = 0, flushparent = 0; ; ) {
9964		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
9965			break;
9966		if (wk->wk_type != D_DIRADD)
9967			panic("softdep_fsync: Unexpected type %s",
9968			    TYPENAME(wk->wk_type));
9969		dap = WK_DIRADD(wk);
9970		/*
9971		 * Flush our parent if this directory entry has a MKDIR_PARENT
9972		 * dependency or is contained in a newly allocated block.
9973		 */
9974		if (dap->da_state & DIRCHG)
9975			pagedep = dap->da_previous->dm_pagedep;
9976		else
9977			pagedep = dap->da_pagedep;
9978		parentino = pagedep->pd_ino;
9979		lbn = pagedep->pd_lbn;
9980		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
9981			panic("softdep_fsync: dirty");
9982		if ((dap->da_state & MKDIR_PARENT) ||
9983		    (pagedep->pd_state & NEWBLOCK))
9984			flushparent = 1;
9985		else
9986			flushparent = 0;
9987		/*
9988		 * If we are being fsync'ed as part of vgone'ing this vnode,
9989		 * then we will not be able to release and recover the
9990		 * vnode below, so we just have to give up on writing its
9991		 * directory entry out. It will eventually be written, just
9992		 * not now, but then the user was not asking to have it
9993		 * written, so we are not breaking any promises.
9994		 */
9995		if (vp->v_iflag & VI_DOOMED)
9996			break;
9997		/*
9998		 * We prevent deadlock by always fetching inodes from the
9999		 * root, moving down the directory tree. Thus, when fetching
10000		 * our parent directory, we first try to get the lock. If
10001		 * that fails, we must unlock ourselves before requesting
10002		 * the lock on our parent. See the comment in ufs_lookup
10003		 * for details on possible races.
10004		 */
10005		FREE_LOCK(&lk);
10006		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
10007		    FFSV_FORCEINSMQ)) {
10008			error = vfs_busy(mp, MBF_NOWAIT);
10009			if (error != 0) {
10010				vfs_ref(mp);
10011				VOP_UNLOCK(vp, 0);
10012				error = vfs_busy(mp, 0);
10013				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10014				vfs_rel(mp);
10015				if (error != 0)
10016					return (ENOENT);
10017				if (vp->v_iflag & VI_DOOMED) {
10018					vfs_unbusy(mp);
10019					return (ENOENT);
10020				}
10021			}
10022			VOP_UNLOCK(vp, 0);
10023			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
10024			    &pvp, FFSV_FORCEINSMQ);
10025			vfs_unbusy(mp);
10026			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
10027			if (vp->v_iflag & VI_DOOMED) {
10028				if (error == 0)
10029					vput(pvp);
10030				error = ENOENT;
10031			}
10032			if (error != 0)
10033				return (error);
10034		}
10035		/*
10036		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
10037		 * that are contained in direct blocks will be resolved by
10038		 * doing a ffs_update. Pagedeps contained in indirect blocks
10039		 * may require a complete sync'ing of the directory. So, we
10040		 * try the cheap and fast ffs_update first, and if that fails,
10041		 * then we do the slower ffs_syncvnode of the directory.
10042		 */
10043		if (flushparent) {
10044			int locked;
10045
10046			if ((error = ffs_update(pvp, 1)) != 0) {
10047				vput(pvp);
10048				return (error);
10049			}
10050			ACQUIRE_LOCK(&lk);
10051			locked = 1;
10052			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
10053				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
10054					if (wk->wk_type != D_DIRADD)
10055						panic("softdep_fsync: Unexpected type %s",
10056						      TYPENAME(wk->wk_type));
10057					dap = WK_DIRADD(wk);
10058					if (dap->da_state & DIRCHG)
10059						pagedep = dap->da_previous->dm_pagedep;
10060					else
10061						pagedep = dap->da_pagedep;
10062					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
10063					FREE_LOCK(&lk);
10064					locked = 0;
10065					if (pagedep_new_block &&
10066					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
10067						vput(pvp);
10068						return (error);
10069					}
10070				}
10071			}
10072			if (locked)
10073				FREE_LOCK(&lk);
10074		}
10075		/*
10076		 * Flush directory page containing the inode's name.
10077		 */
10078		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
10079		    &bp);
10080		if (error == 0)
10081			error = bwrite(bp);
10082		else
10083			brelse(bp);
10084		vput(pvp);
10085		if (error != 0)
10086			return (error);
10087		ACQUIRE_LOCK(&lk);
10088		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
10089			break;
10090	}
10091	FREE_LOCK(&lk);
10092	return (0);
10093}
10094
10095/*
10096 * Flush all the dirty bitmaps associated with the block device
10097 * before flushing the rest of the dirty blocks so as to reduce
10098 * the number of dependencies that will have to be rolled back.
10099 */
10100void
10101softdep_fsync_mountdev(vp)
10102	struct vnode *vp;
10103{
10104	struct buf *bp, *nbp;
10105	struct worklist *wk;
10106	struct bufobj *bo;
10107
10108	if (!vn_isdisk(vp, NULL))
10109		panic("softdep_fsync_mountdev: vnode not a disk");
10110	bo = &vp->v_bufobj;
10111restart:
10112	BO_LOCK(bo);
10113	ACQUIRE_LOCK(&lk);
10114	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
10115		/*
10116		 * If it is already scheduled, skip to the next buffer.
10117		 */
10118		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
10119			continue;
10120
10121		if ((bp->b_flags & B_DELWRI) == 0)
10122			panic("softdep_fsync_mountdev: not dirty");
10123		/*
10124		 * We are only interested in bitmaps with outstanding
10125		 * dependencies.
10126		 */
10127		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
10128		    wk->wk_type != D_BMSAFEMAP ||
10129		    (bp->b_vflags & BV_BKGRDINPROG)) {
10130			BUF_UNLOCK(bp);
10131			continue;
10132		}
10133		FREE_LOCK(&lk);
10134		BO_UNLOCK(bo);
10135		bremfree(bp);
10136		(void) bawrite(bp);
10137		goto restart;
10138	}
10139	FREE_LOCK(&lk);
10140	drain_output(vp);
10141	BO_UNLOCK(bo);
10142}
10143
10144/*
10145 * This routine is called when we are trying to synchronously flush a
10146 * file. This routine must eliminate any filesystem metadata dependencies
10147 * so that the syncing routine can succeed by pushing the dirty blocks
10148 * associated with the file. If any I/O errors occur, they are returned.
10149 */
10150int
10151softdep_sync_metadata(struct vnode *vp)
10152{
10153	struct pagedep *pagedep;
10154	struct allocindir *aip;
10155	struct newblk *newblk;
10156	struct buf *bp, *nbp;
10157	struct worklist *wk;
10158	struct bufobj *bo;
10159	int i, error, waitfor;
10160
10161	if (!DOINGSOFTDEP(vp))
10162		return (0);
10163	/*
10164	 * Ensure that any direct block dependencies have been cleared.
10165	 */
10166	ACQUIRE_LOCK(&lk);
10167	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
10168		FREE_LOCK(&lk);
10169		return (error);
10170	}
10171	FREE_LOCK(&lk);
10172	/*
10173	 * For most files, the only metadata dependencies are the
10174	 * cylinder group maps that allocate their inode or blocks.
10175	 * The block allocation dependencies can be found by traversing
10176	 * the dependency lists for any buffers that remain on their
10177	 * dirty buffer list. The inode allocation dependency will
10178	 * be resolved when the inode is updated with MNT_WAIT.
10179	 * This work is done in two passes. The first pass grabs most
10180	 * of the buffers and begins asynchronously writing them. The
10181	 * only way to wait for these asynchronous writes is to sleep
10182	 * on the filesystem vnode which may stay busy for a long time
10183	 * if the filesystem is active. So, instead, we make a second
10184	 * pass over the dependencies blocking on each write. In the
10185	 * usual case we will be blocking against a write that we
10186	 * initiated, so when it is done the dependency will have been
10187	 * resolved. Thus the second pass is expected to end quickly.
10188	 */
10189	waitfor = MNT_NOWAIT;
10190	bo = &vp->v_bufobj;
10191
10192top:
10193	/*
10194	 * We must wait for any I/O in progress to finish so that
10195	 * all potential buffers on the dirty list will be visible.
10196	 */
10197	BO_LOCK(bo);
10198	drain_output(vp);
10199	while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) {
10200		bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT);
10201		if (bp)
10202			break;
10203	}
10204	BO_UNLOCK(bo);
10205	if (bp == NULL)
10206		return (0);
10207loop:
10208	/* While syncing snapshots, we must allow recursive lookups */
10209	BUF_AREC(bp);
10210	ACQUIRE_LOCK(&lk);
10211	/*
10212	 * As we hold the buffer locked, none of its dependencies
10213	 * will disappear.
10214	 */
10215	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
10216		switch (wk->wk_type) {
10217
10218		case D_ALLOCDIRECT:
10219		case D_ALLOCINDIR:
10220			newblk = WK_NEWBLK(wk);
10221			if (newblk->nb_jnewblk != NULL) {
10222				stat_jwait_newblk++;
10223				jwait(&newblk->nb_jnewblk->jn_list);
10224				goto restart;
10225			}
10226			if (newblk->nb_state & DEPCOMPLETE)
10227				continue;
10228			nbp = newblk->nb_bmsafemap->sm_buf;
10229			nbp = getdirtybuf(nbp, &lk, waitfor);
10230			if (nbp == NULL)
10231				continue;
10232			FREE_LOCK(&lk);
10233			if (waitfor == MNT_NOWAIT) {
10234				bawrite(nbp);
10235			} else if ((error = bwrite(nbp)) != 0) {
10236				break;
10237			}
10238			ACQUIRE_LOCK(&lk);
10239			continue;
10240
10241		case D_INDIRDEP:
10242		restart:
10243
10244			LIST_FOREACH(aip,
10245			    &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
10246				newblk = (struct newblk *)aip;
10247				if (newblk->nb_jnewblk != NULL) {
10248					stat_jwait_newblk++;
10249					jwait(&newblk->nb_jnewblk->jn_list);
10250					goto restart;
10251				}
10252				if (newblk->nb_state & DEPCOMPLETE)
10253					continue;
10254				nbp = newblk->nb_bmsafemap->sm_buf;
10255				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
10256				if (nbp == NULL)
10257					goto restart;
10258				FREE_LOCK(&lk);
10259				if ((error = bwrite(nbp)) != 0) {
10260					goto loop_end;
10261				}
10262				ACQUIRE_LOCK(&lk);
10263				goto restart;
10264			}
10265			continue;
10266
10267		case D_PAGEDEP:
10268			/*
10269			 * We are trying to sync a directory that may
10270			 * have dependencies on both its own metadata
10271			 * and/or dependencies on the inodes of any
10272			 * recently allocated files. We walk its diradd
10273			 * lists pushing out the associated inode.
10274			 */
10275			pagedep = WK_PAGEDEP(wk);
10276			for (i = 0; i < DAHASHSZ; i++) {
10277				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
10278					continue;
10279				if ((error =
10280				    flush_pagedep_deps(vp, wk->wk_mp,
10281						&pagedep->pd_diraddhd[i]))) {
10282					FREE_LOCK(&lk);
10283					goto loop_end;
10284				}
10285			}
10286			continue;
10287
10288		default:
10289			panic("softdep_sync_metadata: Unknown type %s",
10290			    TYPENAME(wk->wk_type));
10291			/* NOTREACHED */
10292		}
10293	loop_end:
10294		/* We reach here only in error and unlocked */
10295		if (error == 0)
10296			panic("softdep_sync_metadata: zero error");
10297		BUF_NOREC(bp);
10298		bawrite(bp);
10299		return (error);
10300	}
10301	FREE_LOCK(&lk);
10302	BO_LOCK(bo);
10303	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
10304		nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT);
10305		if (nbp)
10306			break;
10307	}
10308	BO_UNLOCK(bo);
10309	BUF_NOREC(bp);
10310	bawrite(bp);
10311	if (nbp != NULL) {
10312		bp = nbp;
10313		goto loop;
10314	}
10315	/*
10316	 * The brief unlock is to allow any pent up dependency
10317	 * processing to be done. Then proceed with the second pass.
10318	 */
10319	if (waitfor == MNT_NOWAIT) {
10320		waitfor = MNT_WAIT;
10321		goto top;
10322	}
10323
10324	/*
10325	 * If we have managed to get rid of all the dirty buffers,
10326	 * then we are done. For certain directories and block
10327	 * devices, we may need to do further work.
10328	 *
10329	 * We must wait for any I/O in progress to finish so that
10330	 * all potential buffers on the dirty list will be visible.
10331	 */
10332	BO_LOCK(bo);
10333	drain_output(vp);
10334	BO_UNLOCK(bo);
10335	return ffs_update(vp, 1);
10336	/* return (0); */
10337}
10338
10339/*
10340 * Flush the dependencies associated with an inodedep.
10341 * Called with splbio blocked.
10342 */
10343static int
10344flush_inodedep_deps(mp, ino)
10345	struct mount *mp;
10346	ino_t ino;
10347{
10348	struct inodedep *inodedep;
10349	struct inoref *inoref;
10350	int error, waitfor;
10351
10352	/*
10353	 * This work is done in two passes. The first pass grabs most
10354	 * of the buffers and begins asynchronously writing them. The
10355	 * only way to wait for these asynchronous writes is to sleep
10356	 * on the filesystem vnode which may stay busy for a long time
10357	 * if the filesystem is active. So, instead, we make a second
10358	 * pass over the dependencies blocking on each write. In the
10359	 * usual case we will be blocking against a write that we
10360	 * initiated, so when it is done the dependency will have been
10361	 * resolved. Thus the second pass is expected to end quickly.
10362	 * We give a brief window at the top of the loop to allow
10363	 * any pending I/O to complete.
10364	 */
10365	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
10366		if (error)
10367			return (error);
10368		FREE_LOCK(&lk);
10369		ACQUIRE_LOCK(&lk);
10370restart:
10371		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
10372			return (0);
10373		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10374			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10375			    == DEPCOMPLETE) {
10376				stat_jwait_inode++;
10377				jwait(&inoref->if_list);
10378				goto restart;
10379			}
10380		}
10381		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
10382		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
10383		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
10384		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
10385			continue;
10386		/*
10387		 * If pass2, we are done, otherwise do pass 2.
10388		 */
10389		if (waitfor == MNT_WAIT)
10390			break;
10391		waitfor = MNT_WAIT;
10392	}
10393	/*
10394	 * Try freeing inodedep in case all dependencies have been removed.
10395	 */
10396	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
10397		(void) free_inodedep(inodedep);
10398	return (0);
10399}
10400
10401/*
10402 * Flush an inode dependency list.
10403 * Called with splbio blocked.
10404 */
10405static int
10406flush_deplist(listhead, waitfor, errorp)
10407	struct allocdirectlst *listhead;
10408	int waitfor;
10409	int *errorp;
10410{
10411	struct allocdirect *adp;
10412	struct newblk *newblk;
10413	struct buf *bp;
10414
10415	mtx_assert(&lk, MA_OWNED);
10416	TAILQ_FOREACH(adp, listhead, ad_next) {
10417		newblk = (struct newblk *)adp;
10418		if (newblk->nb_jnewblk != NULL) {
10419			stat_jwait_newblk++;
10420			jwait(&newblk->nb_jnewblk->jn_list);
10421			return (1);
10422		}
10423		if (newblk->nb_state & DEPCOMPLETE)
10424			continue;
10425		bp = newblk->nb_bmsafemap->sm_buf;
10426		bp = getdirtybuf(bp, &lk, waitfor);
10427		if (bp == NULL) {
10428			if (waitfor == MNT_NOWAIT)
10429				continue;
10430			return (1);
10431		}
10432		FREE_LOCK(&lk);
10433		if (waitfor == MNT_NOWAIT) {
10434			bawrite(bp);
10435		} else if ((*errorp = bwrite(bp)) != 0) {
10436			ACQUIRE_LOCK(&lk);
10437			return (1);
10438		}
10439		ACQUIRE_LOCK(&lk);
10440		return (1);
10441	}
10442	return (0);
10443}
10444
10445/*
10446 * Flush dependencies associated with an allocdirect block.
10447 */
10448static int
10449flush_newblk_dep(vp, mp, lbn)
10450	struct vnode *vp;
10451	struct mount *mp;
10452	ufs_lbn_t lbn;
10453{
10454	struct newblk *newblk;
10455	struct bufobj *bo;
10456	struct inode *ip;
10457	struct buf *bp;
10458	ufs2_daddr_t blkno;
10459	int error;
10460
10461	error = 0;
10462	bo = &vp->v_bufobj;
10463	ip = VTOI(vp);
10464	blkno = DIP(ip, i_db[lbn]);
10465	if (blkno == 0)
10466		panic("flush_newblk_dep: Missing block");
10467	ACQUIRE_LOCK(&lk);
10468	/*
10469	 * Loop until all dependencies related to this block are satisfied.
10470	 * We must be careful to restart after each sleep in case a write
10471	 * completes some part of this process for us.
10472	 */
10473	for (;;) {
10474		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
10475			FREE_LOCK(&lk);
10476			break;
10477		}
10478		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
10479			panic("flush_newblk_deps: Bad newblk %p", newblk);
10480		/*
10481		 * Flush the journal.
10482		 */
10483		if (newblk->nb_jnewblk != NULL) {
10484			stat_jwait_newblk++;
10485			jwait(&newblk->nb_jnewblk->jn_list);
10486			continue;
10487		}
10488		/*
10489		 * Write the bitmap dependency.
10490		 */
10491		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
10492			bp = newblk->nb_bmsafemap->sm_buf;
10493			bp = getdirtybuf(bp, &lk, MNT_WAIT);
10494			if (bp == NULL)
10495				continue;
10496			FREE_LOCK(&lk);
10497			error = bwrite(bp);
10498			if (error)
10499				break;
10500			ACQUIRE_LOCK(&lk);
10501			continue;
10502		}
10503		/*
10504		 * Write the buffer.
10505		 */
10506		FREE_LOCK(&lk);
10507		BO_LOCK(bo);
10508		bp = gbincore(bo, lbn);
10509		if (bp != NULL) {
10510			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
10511			    LK_INTERLOCK, BO_MTX(bo));
10512			if (error == ENOLCK) {
10513				ACQUIRE_LOCK(&lk);
10514				continue; /* Slept, retry */
10515			}
10516			if (error != 0)
10517				break;	/* Failed */
10518			if (bp->b_flags & B_DELWRI) {
10519				bremfree(bp);
10520				error = bwrite(bp);
10521				if (error)
10522					break;
10523			} else
10524				BUF_UNLOCK(bp);
10525		} else
10526			BO_UNLOCK(bo);
10527		/*
10528		 * We have to wait for the direct pointers to
10529		 * point at the newdirblk before the dependency
10530		 * will go away.
10531		 */
10532		error = ffs_update(vp, MNT_WAIT);
10533		if (error)
10534			break;
10535		ACQUIRE_LOCK(&lk);
10536	}
10537	return (error);
10538}
10539
10540/*
10541 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
10542 * Called with splbio blocked.
10543 */
10544static int
10545flush_pagedep_deps(pvp, mp, diraddhdp)
10546	struct vnode *pvp;
10547	struct mount *mp;
10548	struct diraddhd *diraddhdp;
10549{
10550	struct inodedep *inodedep;
10551	struct inoref *inoref;
10552	struct ufsmount *ump;
10553	struct diradd *dap;
10554	struct vnode *vp;
10555	int error = 0;
10556	struct buf *bp;
10557	ino_t inum;
10558
10559	ump = VFSTOUFS(mp);
10560restart:
10561	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
10562		/*
10563		 * Flush ourselves if this directory entry
10564		 * has a MKDIR_PARENT dependency.
10565		 */
10566		if (dap->da_state & MKDIR_PARENT) {
10567			FREE_LOCK(&lk);
10568			if ((error = ffs_update(pvp, MNT_WAIT)) != 0)
10569				break;
10570			ACQUIRE_LOCK(&lk);
10571			/*
10572			 * If that cleared dependencies, go on to next.
10573			 */
10574			if (dap != LIST_FIRST(diraddhdp))
10575				continue;
10576			if (dap->da_state & MKDIR_PARENT)
10577				panic("flush_pagedep_deps: MKDIR_PARENT");
10578		}
10579		/*
10580		 * A newly allocated directory must have its "." and
10581		 * ".." entries written out before its name can be
10582		 * committed in its parent.
10583		 */
10584		inum = dap->da_newinum;
10585		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
10586			panic("flush_pagedep_deps: lost inode1");
10587		/*
10588		 * Wait for any pending journal adds to complete so we don't
10589		 * cause rollbacks while syncing.
10590		 */
10591		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
10592			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
10593			    == DEPCOMPLETE) {
10594				stat_jwait_inode++;
10595				jwait(&inoref->if_list);
10596				goto restart;
10597			}
10598		}
10599		if (dap->da_state & MKDIR_BODY) {
10600			FREE_LOCK(&lk);
10601			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
10602			    FFSV_FORCEINSMQ)))
10603				break;
10604			error = flush_newblk_dep(vp, mp, 0);
10605			/*
10606			 * If we still have the dependency we might need to
10607			 * update the vnode to sync the new link count to
10608			 * disk.
10609			 */
10610			if (error == 0 && dap == LIST_FIRST(diraddhdp))
10611				error = ffs_update(vp, MNT_WAIT);
10612			vput(vp);
10613			if (error != 0)
10614				break;
10615			ACQUIRE_LOCK(&lk);
10616			/*
10617			 * If that cleared dependencies, go on to next.
10618			 */
10619			if (dap != LIST_FIRST(diraddhdp))
10620				continue;
10621			if (dap->da_state & MKDIR_BODY) {
10622				inodedep_lookup(UFSTOVFS(ump), inum, 0,
10623				    &inodedep);
10624				panic("flush_pagedep_deps: MKDIR_BODY "
10625				    "inodedep %p dap %p vp %p",
10626				    inodedep, dap, vp);
10627			}
10628		}
10629		/*
10630		 * Flush the inode on which the directory entry depends.
10631		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
10632		 * the only remaining dependency is that the updated inode
10633		 * count must get pushed to disk. The inode has already
10634		 * been pushed into its inode buffer (via VOP_UPDATE) at
10635		 * the time of the reference count change. So we need only
10636		 * locate that buffer, ensure that there will be no rollback
10637		 * caused by a bitmap dependency, then write the inode buffer.
10638		 */
10639retry:
10640		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
10641			panic("flush_pagedep_deps: lost inode");
10642		/*
10643		 * If the inode still has bitmap dependencies,
10644		 * push them to disk.
10645		 */
10646		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
10647			bp = inodedep->id_bmsafemap->sm_buf;
10648			bp = getdirtybuf(bp, &lk, MNT_WAIT);
10649			if (bp == NULL)
10650				goto retry;
10651			FREE_LOCK(&lk);
10652			if ((error = bwrite(bp)) != 0)
10653				break;
10654			ACQUIRE_LOCK(&lk);
10655			if (dap != LIST_FIRST(diraddhdp))
10656				continue;
10657		}
10658		/*
10659		 * If the inode is still sitting in a buffer waiting
10660		 * to be written or waiting for the link count to be
10661		 * adjusted update it here to flush it to disk.
10662		 */
10663		if (dap == LIST_FIRST(diraddhdp)) {
10664			FREE_LOCK(&lk);
10665			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
10666			    FFSV_FORCEINSMQ)))
10667				break;
10668			error = ffs_update(vp, MNT_WAIT);
10669			vput(vp);
10670			if (error)
10671				break;
10672			ACQUIRE_LOCK(&lk);
10673		}
10674		/*
10675		 * If we have failed to get rid of all the dependencies
10676		 * then something is seriously wrong.
10677		 */
10678		if (dap == LIST_FIRST(diraddhdp)) {
10679			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
10680			panic("flush_pagedep_deps: failed to flush "
10681			    "inodedep %p ino %d dap %p", inodedep, inum, dap);
10682		}
10683	}
10684	if (error)
10685		ACQUIRE_LOCK(&lk);
10686	return (error);
10687}
10688
10689/*
10690 * A large burst of file addition or deletion activity can drive the
10691 * memory load excessively high. First attempt to slow things down
10692 * using the techniques below. If that fails, this routine requests
10693 * the offending operations to fall back to running synchronously
10694 * until the memory load returns to a reasonable level.
10695 */
10696int
10697softdep_slowdown(vp)
10698	struct vnode *vp;
10699{
10700	int max_softdeps_hard;
10701
10702	ACQUIRE_LOCK(&lk);
10703	max_softdeps_hard = max_softdeps * 11 / 10;
10704	if (num_dirrem < max_softdeps_hard / 2 &&
10705	    num_inodedep < max_softdeps_hard &&
10706	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
10707	    num_freeblkdep < max_softdeps_hard) {
10708		FREE_LOCK(&lk);
10709  		return (0);
10710	}
10711	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
10712		softdep_speedup();
10713	stat_sync_limit_hit += 1;
10714	FREE_LOCK(&lk);
10715	return (1);
10716}
10717
10718/*
10719 * Called by the allocation routines when they are about to fail
10720 * in the hope that we can free up some disk space.
10721 *
10722 * First check to see if the work list has anything on it. If it has,
10723 * clean up entries until we successfully free some space. Because this
10724 * process holds inodes locked, we cannot handle any remove requests
10725 * that might block on a locked inode as that could lead to deadlock.
10726 * If the worklist yields no free space, encourage the syncer daemon
10727 * to help us. In no event will we try for longer than tickdelay seconds.
10728 */
10729int
10730softdep_request_cleanup(fs, vp)
10731	struct fs *fs;
10732	struct vnode *vp;
10733{
10734	struct ufsmount *ump;
10735	long starttime;
10736	ufs2_daddr_t needed;
10737	int error;
10738
10739	ump = VTOI(vp)->i_ump;
10740	mtx_assert(UFS_MTX(ump), MA_OWNED);
10741	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
10742	starttime = time_second + tickdelay;
10743	/*
10744	 * If we are being called because of a process doing a
10745	 * copy-on-write, then it is not safe to update the vnode
10746	 * as we may recurse into the copy-on-write routine.
10747	 */
10748	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
10749		UFS_UNLOCK(ump);
10750		error = ffs_update(vp, 1);
10751		UFS_LOCK(ump);
10752		if (error != 0)
10753			return (0);
10754	}
10755	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
10756		if (time_second > starttime)
10757			return (0);
10758		UFS_UNLOCK(ump);
10759		ACQUIRE_LOCK(&lk);
10760		process_removes(vp);
10761		if (ump->softdep_on_worklist > 0 &&
10762		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
10763			stat_worklist_push += 1;
10764			FREE_LOCK(&lk);
10765			UFS_LOCK(ump);
10766			continue;
10767		}
10768		request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
10769		FREE_LOCK(&lk);
10770		UFS_LOCK(ump);
10771	}
10772	return (1);
10773}
10774
10775/*
10776 * If memory utilization has gotten too high, deliberately slow things
10777 * down and speed up the I/O processing.
10778 */
10779extern struct thread *syncertd;
10780static int
10781request_cleanup(mp, resource)
10782	struct mount *mp;
10783	int resource;
10784{
10785	struct thread *td = curthread;
10786	struct ufsmount *ump;
10787
10788	mtx_assert(&lk, MA_OWNED);
10789	/*
10790	 * We never hold up the filesystem syncer or buf daemon.
10791	 */
10792	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
10793		return (0);
10794	ump = VFSTOUFS(mp);
10795	/*
10796	 * First check to see if the work list has gotten backlogged.
10797	 * If it has, co-opt this process to help clean up two entries.
10798	 * Because this process may hold inodes locked, we cannot
10799	 * handle any remove requests that might block on a locked
10800	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
10801	 * to avoid recursively processing the worklist.
10802	 */
10803	if (ump->softdep_on_worklist > max_softdeps / 10) {
10804		td->td_pflags |= TDP_SOFTDEP;
10805		process_worklist_item(mp, LK_NOWAIT);
10806		process_worklist_item(mp, LK_NOWAIT);
10807		td->td_pflags &= ~TDP_SOFTDEP;
10808		stat_worklist_push += 2;
10809		return(1);
10810	}
10811	/*
10812	 * Next, we attempt to speed up the syncer process. If that
10813	 * is successful, then we allow the process to continue.
10814	 */
10815	if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
10816		return(0);
10817	/*
10818	 * If we are resource constrained on inode dependencies, try
10819	 * flushing some dirty inodes. Otherwise, we are constrained
10820	 * by file deletions, so try accelerating flushes of directories
10821	 * with removal dependencies. We would like to do the cleanup
10822	 * here, but we probably hold an inode locked at this point and
10823	 * that might deadlock against one that we try to clean. So,
10824	 * the best that we can do is request the syncer daemon to do
10825	 * the cleanup for us.
10826	 */
10827	switch (resource) {
10828
10829	case FLUSH_INODES:
10830		stat_ino_limit_push += 1;
10831		req_clear_inodedeps += 1;
10832		stat_countp = &stat_ino_limit_hit;
10833		break;
10834
10835	case FLUSH_REMOVE:
10836	case FLUSH_REMOVE_WAIT:
10837		stat_blk_limit_push += 1;
10838		req_clear_remove += 1;
10839		stat_countp = &stat_blk_limit_hit;
10840		break;
10841
10842	default:
10843		panic("request_cleanup: unknown type");
10844	}
10845	/*
10846	 * Hopefully the syncer daemon will catch up and awaken us.
10847	 * We wait at most tickdelay before proceeding in any case.
10848	 */
10849	proc_waiting += 1;
10850	if (callout_pending(&softdep_callout) == FALSE)
10851		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
10852		    pause_timer, 0);
10853
10854	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
10855	proc_waiting -= 1;
10856	return (1);
10857}
10858
10859/*
10860 * Awaken processes pausing in request_cleanup and clear proc_waiting
10861 * to indicate that there is no longer a timer running.
10862 */
10863static void
10864pause_timer(arg)
10865	void *arg;
10866{
10867
10868	/*
10869	 * The callout_ API has acquired mtx and will hold it around this
10870	 * function call.
10871	 */
10872	*stat_countp += 1;
10873	wakeup_one(&proc_waiting);
10874	if (proc_waiting > 0)
10875		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
10876		    pause_timer, 0);
10877}
10878
10879/*
10880 * Flush out a directory with at least one removal dependency in an effort to
10881 * reduce the number of dirrem, freefile, and freeblks dependency structures.
10882 */
10883static void
10884clear_remove(td)
10885	struct thread *td;
10886{
10887	struct pagedep_hashhead *pagedephd;
10888	struct pagedep *pagedep;
10889	static int next = 0;
10890	struct mount *mp;
10891	struct vnode *vp;
10892	struct bufobj *bo;
10893	int error, cnt;
10894	ino_t ino;
10895
10896	mtx_assert(&lk, MA_OWNED);
10897
10898	for (cnt = 0; cnt < pagedep_hash; cnt++) {
10899		pagedephd = &pagedep_hashtbl[next++];
10900		if (next >= pagedep_hash)
10901			next = 0;
10902		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
10903			if (LIST_EMPTY(&pagedep->pd_dirremhd))
10904				continue;
10905			mp = pagedep->pd_list.wk_mp;
10906			ino = pagedep->pd_ino;
10907			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
10908				continue;
10909			FREE_LOCK(&lk);
10910
10911			/*
10912			 * Let unmount clear deps
10913			 */
10914			error = vfs_busy(mp, MBF_NOWAIT);
10915			if (error != 0)
10916				goto finish_write;
10917			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
10918			     FFSV_FORCEINSMQ);
10919			vfs_unbusy(mp);
10920			if (error != 0) {
10921				softdep_error("clear_remove: vget", error);
10922				goto finish_write;
10923			}
10924			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
10925				softdep_error("clear_remove: fsync", error);
10926			bo = &vp->v_bufobj;
10927			BO_LOCK(bo);
10928			drain_output(vp);
10929			BO_UNLOCK(bo);
10930			vput(vp);
10931		finish_write:
10932			vn_finished_write(mp);
10933			ACQUIRE_LOCK(&lk);
10934			return;
10935		}
10936	}
10937}
10938
10939/*
10940 * Clear out a block of dirty inodes in an effort to reduce
10941 * the number of inodedep dependency structures.
10942 */
10943static void
10944clear_inodedeps(td)
10945	struct thread *td;
10946{
10947	struct inodedep_hashhead *inodedephd;
10948	struct inodedep *inodedep;
10949	static int next = 0;
10950	struct mount *mp;
10951	struct vnode *vp;
10952	struct fs *fs;
10953	int error, cnt;
10954	ino_t firstino, lastino, ino;
10955
10956	mtx_assert(&lk, MA_OWNED);
10957	/*
10958	 * Pick a random inode dependency to be cleared.
10959	 * We will then gather up all the inodes in its block
10960	 * that have dependencies and flush them out.
10961	 */
10962	for (cnt = 0; cnt < inodedep_hash; cnt++) {
10963		inodedephd = &inodedep_hashtbl[next++];
10964		if (next >= inodedep_hash)
10965			next = 0;
10966		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
10967			break;
10968	}
10969	if (inodedep == NULL)
10970		return;
10971	fs = inodedep->id_fs;
10972	mp = inodedep->id_list.wk_mp;
10973	/*
10974	 * Find the last inode in the block with dependencies.
10975	 */
10976	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
10977	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
10978		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
10979			break;
10980	/*
10981	 * Asynchronously push all but the last inode with dependencies.
10982	 * Synchronously push the last inode with dependencies to ensure
10983	 * that the inode block gets written to free up the inodedeps.
10984	 */
10985	for (ino = firstino; ino <= lastino; ino++) {
10986		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
10987			continue;
10988		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
10989			continue;
10990		FREE_LOCK(&lk);
10991		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
10992		if (error != 0) {
10993			vn_finished_write(mp);
10994			ACQUIRE_LOCK(&lk);
10995			return;
10996		}
10997		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
10998		    FFSV_FORCEINSMQ)) != 0) {
10999			softdep_error("clear_inodedeps: vget", error);
11000			vfs_unbusy(mp);
11001			vn_finished_write(mp);
11002			ACQUIRE_LOCK(&lk);
11003			return;
11004		}
11005		vfs_unbusy(mp);
11006		if (ino == lastino) {
11007			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
11008				softdep_error("clear_inodedeps: fsync1", error);
11009		} else {
11010			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
11011				softdep_error("clear_inodedeps: fsync2", error);
11012			BO_LOCK(&vp->v_bufobj);
11013			drain_output(vp);
11014			BO_UNLOCK(&vp->v_bufobj);
11015		}
11016		vput(vp);
11017		vn_finished_write(mp);
11018		ACQUIRE_LOCK(&lk);
11019	}
11020}
11021
11022/*
11023 * Function to determine if the buffer has outstanding dependencies
11024 * that will cause a roll-back if the buffer is written. If wantcount
11025 * is set, return number of dependencies, otherwise just yes or no.
11026 */
11027static int
11028softdep_count_dependencies(bp, wantcount)
11029	struct buf *bp;
11030	int wantcount;
11031{
11032	struct worklist *wk;
11033	struct bmsafemap *bmsafemap;
11034	struct inodedep *inodedep;
11035	struct indirdep *indirdep;
11036	struct freeblks *freeblks;
11037	struct allocindir *aip;
11038	struct pagedep *pagedep;
11039	struct dirrem *dirrem;
11040	struct newblk *newblk;
11041	struct mkdir *mkdir;
11042	struct diradd *dap;
11043	int i, retval;
11044
11045	retval = 0;
11046	ACQUIRE_LOCK(&lk);
11047	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11048		switch (wk->wk_type) {
11049
11050		case D_INODEDEP:
11051			inodedep = WK_INODEDEP(wk);
11052			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
11053				/* bitmap allocation dependency */
11054				retval += 1;
11055				if (!wantcount)
11056					goto out;
11057			}
11058			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
11059				/* direct block pointer dependency */
11060				retval += 1;
11061				if (!wantcount)
11062					goto out;
11063			}
11064			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
11065				/* direct block pointer dependency */
11066				retval += 1;
11067				if (!wantcount)
11068					goto out;
11069			}
11070			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
11071				/* Add reference dependency. */
11072				retval += 1;
11073				if (!wantcount)
11074					goto out;
11075			}
11076			continue;
11077
11078		case D_INDIRDEP:
11079			indirdep = WK_INDIRDEP(wk);
11080
11081			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
11082				/* indirect block pointer dependency */
11083				retval += 1;
11084				if (!wantcount)
11085					goto out;
11086			}
11087			continue;
11088
11089		case D_PAGEDEP:
11090			pagedep = WK_PAGEDEP(wk);
11091			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
11092				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
11093					/* Journal remove ref dependency. */
11094					retval += 1;
11095					if (!wantcount)
11096						goto out;
11097				}
11098			}
11099			for (i = 0; i < DAHASHSZ; i++) {
11100
11101				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
11102					/* directory entry dependency */
11103					retval += 1;
11104					if (!wantcount)
11105						goto out;
11106				}
11107			}
11108			continue;
11109
11110		case D_BMSAFEMAP:
11111			bmsafemap = WK_BMSAFEMAP(wk);
11112			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
11113				/* Add reference dependency. */
11114				retval += 1;
11115				if (!wantcount)
11116					goto out;
11117			}
11118			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
11119				/* Allocate block dependency. */
11120				retval += 1;
11121				if (!wantcount)
11122					goto out;
11123			}
11124			continue;
11125
11126		case D_FREEBLKS:
11127			freeblks = WK_FREEBLKS(wk);
11128			if (LIST_FIRST(&freeblks->fb_jfreeblkhd)) {
11129				/* Freeblk journal dependency. */
11130				retval += 1;
11131				if (!wantcount)
11132					goto out;
11133			}
11134			continue;
11135
11136		case D_ALLOCDIRECT:
11137		case D_ALLOCINDIR:
11138			newblk = WK_NEWBLK(wk);
11139			if (newblk->nb_jnewblk) {
11140				/* Journal allocate dependency. */
11141				retval += 1;
11142				if (!wantcount)
11143					goto out;
11144			}
11145			continue;
11146
11147		case D_MKDIR:
11148			mkdir = WK_MKDIR(wk);
11149			if (mkdir->md_jaddref) {
11150				/* Journal reference dependency. */
11151				retval += 1;
11152				if (!wantcount)
11153					goto out;
11154			}
11155			continue;
11156
11157		case D_FREEWORK:
11158		case D_FREEDEP:
11159		case D_JSEGDEP:
11160		case D_JSEG:
11161		case D_SBDEP:
11162			/* never a dependency on these blocks */
11163			continue;
11164
11165		default:
11166			panic("softdep_count_dependencies: Unexpected type %s",
11167			    TYPENAME(wk->wk_type));
11168			/* NOTREACHED */
11169		}
11170	}
11171out:
11172	FREE_LOCK(&lk);
11173	return retval;
11174}
11175
11176/*
11177 * Acquire exclusive access to a buffer.
11178 * Must be called with a locked mtx parameter.
11179 * Return acquired buffer or NULL on failure.
11180 */
11181static struct buf *
11182getdirtybuf(bp, mtx, waitfor)
11183	struct buf *bp;
11184	struct mtx *mtx;
11185	int waitfor;
11186{
11187	int error;
11188
11189	mtx_assert(mtx, MA_OWNED);
11190	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
11191		if (waitfor != MNT_WAIT)
11192			return (NULL);
11193		error = BUF_LOCK(bp,
11194		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
11195		/*
11196		 * Even if we sucessfully acquire bp here, we have dropped
11197		 * mtx, which may violates our guarantee.
11198		 */
11199		if (error == 0)
11200			BUF_UNLOCK(bp);
11201		else if (error != ENOLCK)
11202			panic("getdirtybuf: inconsistent lock: %d", error);
11203		mtx_lock(mtx);
11204		return (NULL);
11205	}
11206	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11207		if (mtx == &lk && waitfor == MNT_WAIT) {
11208			mtx_unlock(mtx);
11209			BO_LOCK(bp->b_bufobj);
11210			BUF_UNLOCK(bp);
11211			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
11212				bp->b_vflags |= BV_BKGRDWAIT;
11213				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
11214				       PRIBIO | PDROP, "getbuf", 0);
11215			} else
11216				BO_UNLOCK(bp->b_bufobj);
11217			mtx_lock(mtx);
11218			return (NULL);
11219		}
11220		BUF_UNLOCK(bp);
11221		if (waitfor != MNT_WAIT)
11222			return (NULL);
11223		/*
11224		 * The mtx argument must be bp->b_vp's mutex in
11225		 * this case.
11226		 */
11227#ifdef	DEBUG_VFS_LOCKS
11228		if (bp->b_vp->v_type != VCHR)
11229			ASSERT_BO_LOCKED(bp->b_bufobj);
11230#endif
11231		bp->b_vflags |= BV_BKGRDWAIT;
11232		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
11233		return (NULL);
11234	}
11235	if ((bp->b_flags & B_DELWRI) == 0) {
11236		BUF_UNLOCK(bp);
11237		return (NULL);
11238	}
11239	bremfree(bp);
11240	return (bp);
11241}
11242
11243
11244/*
11245 * Check if it is safe to suspend the file system now.  On entry,
11246 * the vnode interlock for devvp should be held.  Return 0 with
11247 * the mount interlock held if the file system can be suspended now,
11248 * otherwise return EAGAIN with the mount interlock held.
11249 */
11250int
11251softdep_check_suspend(struct mount *mp,
11252		      struct vnode *devvp,
11253		      int softdep_deps,
11254		      int softdep_accdeps,
11255		      int secondary_writes,
11256		      int secondary_accwrites)
11257{
11258	struct bufobj *bo;
11259	struct ufsmount *ump;
11260	int error;
11261
11262	ump = VFSTOUFS(mp);
11263	bo = &devvp->v_bufobj;
11264	ASSERT_BO_LOCKED(bo);
11265
11266	for (;;) {
11267		if (!TRY_ACQUIRE_LOCK(&lk)) {
11268			BO_UNLOCK(bo);
11269			ACQUIRE_LOCK(&lk);
11270			FREE_LOCK(&lk);
11271			BO_LOCK(bo);
11272			continue;
11273		}
11274		MNT_ILOCK(mp);
11275		if (mp->mnt_secondary_writes != 0) {
11276			FREE_LOCK(&lk);
11277			BO_UNLOCK(bo);
11278			msleep(&mp->mnt_secondary_writes,
11279			       MNT_MTX(mp),
11280			       (PUSER - 1) | PDROP, "secwr", 0);
11281			BO_LOCK(bo);
11282			continue;
11283		}
11284		break;
11285	}
11286
11287	/*
11288	 * Reasons for needing more work before suspend:
11289	 * - Dirty buffers on devvp.
11290	 * - Softdep activity occurred after start of vnode sync loop
11291	 * - Secondary writes occurred after start of vnode sync loop
11292	 */
11293	error = 0;
11294	if (bo->bo_numoutput > 0 ||
11295	    bo->bo_dirty.bv_cnt > 0 ||
11296	    softdep_deps != 0 ||
11297	    ump->softdep_deps != 0 ||
11298	    softdep_accdeps != ump->softdep_accdeps ||
11299	    secondary_writes != 0 ||
11300	    mp->mnt_secondary_writes != 0 ||
11301	    secondary_accwrites != mp->mnt_secondary_accwrites)
11302		error = EAGAIN;
11303	FREE_LOCK(&lk);
11304	BO_UNLOCK(bo);
11305	return (error);
11306}
11307
11308
11309/*
11310 * Get the number of dependency structures for the file system, both
11311 * the current number and the total number allocated.  These will
11312 * later be used to detect that softdep processing has occurred.
11313 */
11314void
11315softdep_get_depcounts(struct mount *mp,
11316		      int *softdep_depsp,
11317		      int *softdep_accdepsp)
11318{
11319	struct ufsmount *ump;
11320
11321	ump = VFSTOUFS(mp);
11322	ACQUIRE_LOCK(&lk);
11323	*softdep_depsp = ump->softdep_deps;
11324	*softdep_accdepsp = ump->softdep_accdeps;
11325	FREE_LOCK(&lk);
11326}
11327
11328/*
11329 * Wait for pending output on a vnode to complete.
11330 * Must be called with vnode lock and interlock locked.
11331 *
11332 * XXX: Should just be a call to bufobj_wwait().
11333 */
11334static void
11335drain_output(vp)
11336	struct vnode *vp;
11337{
11338	struct bufobj *bo;
11339
11340	bo = &vp->v_bufobj;
11341	ASSERT_VOP_LOCKED(vp, "drain_output");
11342	ASSERT_BO_LOCKED(bo);
11343
11344	while (bo->bo_numoutput) {
11345		bo->bo_flag |= BO_WWAIT;
11346		msleep((caddr_t)&bo->bo_numoutput,
11347		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
11348	}
11349}
11350
11351/*
11352 * Called whenever a buffer that is being invalidated or reallocated
11353 * contains dependencies. This should only happen if an I/O error has
11354 * occurred. The routine is called with the buffer locked.
11355 */
11356static void
11357softdep_deallocate_dependencies(bp)
11358	struct buf *bp;
11359{
11360
11361	if ((bp->b_ioflags & BIO_ERROR) == 0)
11362		panic("softdep_deallocate_dependencies: dangling deps");
11363	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
11364	panic("softdep_deallocate_dependencies: unrecovered I/O error");
11365}
11366
11367/*
11368 * Function to handle asynchronous write errors in the filesystem.
11369 */
11370static void
11371softdep_error(func, error)
11372	char *func;
11373	int error;
11374{
11375
11376	/* XXX should do something better! */
11377	printf("%s: got error %d while accessing filesystem\n", func, error);
11378}
11379
11380#ifdef DDB
11381
11382static void
11383inodedep_print(struct inodedep *inodedep, int verbose)
11384{
11385	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
11386	    " saveino %p\n",
11387	    inodedep, inodedep->id_fs, inodedep->id_state,
11388	    (intmax_t)inodedep->id_ino,
11389	    (intmax_t)fsbtodb(inodedep->id_fs,
11390	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
11391	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
11392	    inodedep->id_savedino1);
11393
11394	if (verbose == 0)
11395		return;
11396
11397	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
11398	    "mkdiradd %p\n",
11399	    LIST_FIRST(&inodedep->id_pendinghd),
11400	    LIST_FIRST(&inodedep->id_bufwait),
11401	    LIST_FIRST(&inodedep->id_inowait),
11402	    TAILQ_FIRST(&inodedep->id_inoreflst),
11403	    inodedep->id_mkdiradd);
11404	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
11405	    TAILQ_FIRST(&inodedep->id_inoupdt),
11406	    TAILQ_FIRST(&inodedep->id_newinoupdt),
11407	    TAILQ_FIRST(&inodedep->id_extupdt),
11408	    TAILQ_FIRST(&inodedep->id_newextupdt));
11409}
11410
11411DB_SHOW_COMMAND(inodedep, db_show_inodedep)
11412{
11413
11414	if (have_addr == 0) {
11415		db_printf("Address required\n");
11416		return;
11417	}
11418	inodedep_print((struct inodedep*)addr, 1);
11419}
11420
11421DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
11422{
11423	struct inodedep_hashhead *inodedephd;
11424	struct inodedep *inodedep;
11425	struct fs *fs;
11426	int cnt;
11427
11428	fs = have_addr ? (struct fs *)addr : NULL;
11429	for (cnt = 0; cnt < inodedep_hash; cnt++) {
11430		inodedephd = &inodedep_hashtbl[cnt];
11431		LIST_FOREACH(inodedep, inodedephd, id_hash) {
11432			if (fs != NULL && fs != inodedep->id_fs)
11433				continue;
11434			inodedep_print(inodedep, 0);
11435		}
11436	}
11437}
11438
11439DB_SHOW_COMMAND(worklist, db_show_worklist)
11440{
11441	struct worklist *wk;
11442
11443	if (have_addr == 0) {
11444		db_printf("Address required\n");
11445		return;
11446	}
11447	wk = (struct worklist *)addr;
11448	printf("worklist: %p type %s state 0x%X\n",
11449	    wk, TYPENAME(wk->wk_type), wk->wk_state);
11450}
11451
11452DB_SHOW_COMMAND(workhead, db_show_workhead)
11453{
11454	struct workhead *wkhd;
11455	struct worklist *wk;
11456	int i;
11457
11458	if (have_addr == 0) {
11459		db_printf("Address required\n");
11460		return;
11461	}
11462	wkhd = (struct workhead *)addr;
11463	wk = LIST_FIRST(wkhd);
11464	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
11465		db_printf("worklist: %p type %s state 0x%X",
11466		    wk, TYPENAME(wk->wk_type), wk->wk_state);
11467	if (i == 100)
11468		db_printf("workhead overflow");
11469	printf("\n");
11470}
11471
11472
11473DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
11474{
11475	struct jaddref *jaddref;
11476	struct diradd *diradd;
11477	struct mkdir *mkdir;
11478
11479	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
11480		diradd = mkdir->md_diradd;
11481		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
11482		    mkdir, mkdir->md_state, diradd, diradd->da_state);
11483		if ((jaddref = mkdir->md_jaddref) != NULL)
11484			db_printf(" jaddref %p jaddref state 0x%X",
11485			    jaddref, jaddref->ja_state);
11486		db_printf("\n");
11487	}
11488}
11489
11490#endif /* DDB */
11491
11492#endif /* SOFTUPDATES */
11493