1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright 1998, 2000 Marshall Kirk McKusick.
5 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
6 * All rights reserved.
7 *
8 * The soft updates code is derived from the appendix of a University
9 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
10 * "Soft Updates: A Solution to the Metadata Update Problem in File
11 * Systems", CSE-TR-254-95, August 1995).
12 *
13 * Further information about soft updates can be obtained from:
14 *
15 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
16 *	1614 Oxford Street		mckusick@mckusick.com
17 *	Berkeley, CA 94709-1608		+1-510-843-9542
18 *	USA
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
22 * are met:
23 *
24 * 1. Redistributions of source code must retain the above copyright
25 *    notice, this list of conditions and the following disclaimer.
26 * 2. Redistributions in binary form must reproduce the above copyright
27 *    notice, this list of conditions and the following disclaimer in the
28 *    documentation and/or other materials provided with the distribution.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
31 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
33 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
36 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
37 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
38 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
39 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 *
41 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
42 */
43
44#include <sys/cdefs.h>
45__FBSDID("$FreeBSD$");
46
47#include "opt_ffs.h"
48#include "opt_quota.h"
49#include "opt_ddb.h"
50
51/*
52 * For now we want the safety net that the DEBUG flag provides.
53 */
54#ifndef DEBUG
55#define DEBUG
56#endif
57
58#include <sys/param.h>
59#include <sys/kernel.h>
60#include <sys/systm.h>
61#include <sys/bio.h>
62#include <sys/buf.h>
63#include <sys/kdb.h>
64#include <sys/kthread.h>
65#include <sys/ktr.h>
66#include <sys/limits.h>
67#include <sys/lock.h>
68#include <sys/malloc.h>
69#include <sys/mount.h>
70#include <sys/mutex.h>
71#include <sys/namei.h>
72#include <sys/priv.h>
73#include <sys/proc.h>
74#include <sys/racct.h>
75#include <sys/rwlock.h>
76#include <sys/stat.h>
77#include <sys/sysctl.h>
78#include <sys/syslog.h>
79#include <sys/vnode.h>
80#include <sys/conf.h>
81
82#include <ufs/ufs/dir.h>
83#include <ufs/ufs/extattr.h>
84#include <ufs/ufs/quota.h>
85#include <ufs/ufs/inode.h>
86#include <ufs/ufs/ufsmount.h>
87#include <ufs/ffs/fs.h>
88#include <ufs/ffs/softdep.h>
89#include <ufs/ffs/ffs_extern.h>
90#include <ufs/ufs/ufs_extern.h>
91
92#include <vm/vm.h>
93#include <vm/vm_extern.h>
94#include <vm/vm_object.h>
95
96#include <geom/geom.h>
97
98#include <ddb/ddb.h>
99
100#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
101
102#ifndef SOFTUPDATES
103
104int
105softdep_flushfiles(oldmnt, flags, td)
106	struct mount *oldmnt;
107	int flags;
108	struct thread *td;
109{
110
111	panic("softdep_flushfiles called");
112}
113
114int
115softdep_mount(devvp, mp, fs, cred)
116	struct vnode *devvp;
117	struct mount *mp;
118	struct fs *fs;
119	struct ucred *cred;
120{
121
122	return (0);
123}
124
125void
126softdep_initialize()
127{
128
129	return;
130}
131
132void
133softdep_uninitialize()
134{
135
136	return;
137}
138
139void
140softdep_unmount(mp)
141	struct mount *mp;
142{
143
144	panic("softdep_unmount called");
145}
146
147void
148softdep_setup_sbupdate(ump, fs, bp)
149	struct ufsmount *ump;
150	struct fs *fs;
151	struct buf *bp;
152{
153
154	panic("softdep_setup_sbupdate called");
155}
156
157void
158softdep_setup_inomapdep(bp, ip, newinum, mode)
159	struct buf *bp;
160	struct inode *ip;
161	ino_t newinum;
162	int mode;
163{
164
165	panic("softdep_setup_inomapdep called");
166}
167
168void
169softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
170	struct buf *bp;
171	struct mount *mp;
172	ufs2_daddr_t newblkno;
173	int frags;
174	int oldfrags;
175{
176
177	panic("softdep_setup_blkmapdep called");
178}
179
180void
181softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
182	struct inode *ip;
183	ufs_lbn_t lbn;
184	ufs2_daddr_t newblkno;
185	ufs2_daddr_t oldblkno;
186	long newsize;
187	long oldsize;
188	struct buf *bp;
189{
190
191	panic("softdep_setup_allocdirect called");
192}
193
194void
195softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
196	struct inode *ip;
197	ufs_lbn_t lbn;
198	ufs2_daddr_t newblkno;
199	ufs2_daddr_t oldblkno;
200	long newsize;
201	long oldsize;
202	struct buf *bp;
203{
204
205	panic("softdep_setup_allocext called");
206}
207
208void
209softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
210	struct inode *ip;
211	ufs_lbn_t lbn;
212	struct buf *bp;
213	int ptrno;
214	ufs2_daddr_t newblkno;
215	ufs2_daddr_t oldblkno;
216	struct buf *nbp;
217{
218
219	panic("softdep_setup_allocindir_page called");
220}
221
222void
223softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
224	struct buf *nbp;
225	struct inode *ip;
226	struct buf *bp;
227	int ptrno;
228	ufs2_daddr_t newblkno;
229{
230
231	panic("softdep_setup_allocindir_meta called");
232}
233
234void
235softdep_journal_freeblocks(ip, cred, length, flags)
236	struct inode *ip;
237	struct ucred *cred;
238	off_t length;
239	int flags;
240{
241
242	panic("softdep_journal_freeblocks called");
243}
244
245void
246softdep_journal_fsync(ip)
247	struct inode *ip;
248{
249
250	panic("softdep_journal_fsync called");
251}
252
253void
254softdep_setup_freeblocks(ip, length, flags)
255	struct inode *ip;
256	off_t length;
257	int flags;
258{
259
260	panic("softdep_setup_freeblocks called");
261}
262
263void
264softdep_freefile(pvp, ino, mode)
265		struct vnode *pvp;
266		ino_t ino;
267		int mode;
268{
269
270	panic("softdep_freefile called");
271}
272
273int
274softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
275	struct buf *bp;
276	struct inode *dp;
277	off_t diroffset;
278	ino_t newinum;
279	struct buf *newdirbp;
280	int isnewblk;
281{
282
283	panic("softdep_setup_directory_add called");
284}
285
286void
287softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
288	struct buf *bp;
289	struct inode *dp;
290	caddr_t base;
291	caddr_t oldloc;
292	caddr_t newloc;
293	int entrysize;
294{
295
296	panic("softdep_change_directoryentry_offset called");
297}
298
299void
300softdep_setup_remove(bp, dp, ip, isrmdir)
301	struct buf *bp;
302	struct inode *dp;
303	struct inode *ip;
304	int isrmdir;
305{
306
307	panic("softdep_setup_remove called");
308}
309
310void
311softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
312	struct buf *bp;
313	struct inode *dp;
314	struct inode *ip;
315	ino_t newinum;
316	int isrmdir;
317{
318
319	panic("softdep_setup_directory_change called");
320}
321
322void
323softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
324	struct mount *mp;
325	struct buf *bp;
326	ufs2_daddr_t blkno;
327	int frags;
328	struct workhead *wkhd;
329{
330
331	panic("%s called", __FUNCTION__);
332}
333
334void
335softdep_setup_inofree(mp, bp, ino, wkhd)
336	struct mount *mp;
337	struct buf *bp;
338	ino_t ino;
339	struct workhead *wkhd;
340{
341
342	panic("%s called", __FUNCTION__);
343}
344
345void
346softdep_setup_unlink(dp, ip)
347	struct inode *dp;
348	struct inode *ip;
349{
350
351	panic("%s called", __FUNCTION__);
352}
353
354void
355softdep_setup_link(dp, ip)
356	struct inode *dp;
357	struct inode *ip;
358{
359
360	panic("%s called", __FUNCTION__);
361}
362
363void
364softdep_revert_link(dp, ip)
365	struct inode *dp;
366	struct inode *ip;
367{
368
369	panic("%s called", __FUNCTION__);
370}
371
372void
373softdep_setup_rmdir(dp, ip)
374	struct inode *dp;
375	struct inode *ip;
376{
377
378	panic("%s called", __FUNCTION__);
379}
380
381void
382softdep_revert_rmdir(dp, ip)
383	struct inode *dp;
384	struct inode *ip;
385{
386
387	panic("%s called", __FUNCTION__);
388}
389
390void
391softdep_setup_create(dp, ip)
392	struct inode *dp;
393	struct inode *ip;
394{
395
396	panic("%s called", __FUNCTION__);
397}
398
399void
400softdep_revert_create(dp, ip)
401	struct inode *dp;
402	struct inode *ip;
403{
404
405	panic("%s called", __FUNCTION__);
406}
407
408void
409softdep_setup_mkdir(dp, ip)
410	struct inode *dp;
411	struct inode *ip;
412{
413
414	panic("%s called", __FUNCTION__);
415}
416
417void
418softdep_revert_mkdir(dp, ip)
419	struct inode *dp;
420	struct inode *ip;
421{
422
423	panic("%s called", __FUNCTION__);
424}
425
426void
427softdep_setup_dotdot_link(dp, ip)
428	struct inode *dp;
429	struct inode *ip;
430{
431
432	panic("%s called", __FUNCTION__);
433}
434
435int
436softdep_prealloc(vp, waitok)
437	struct vnode *vp;
438	int waitok;
439{
440
441	panic("%s called", __FUNCTION__);
442}
443
444int
445softdep_journal_lookup(mp, vpp)
446	struct mount *mp;
447	struct vnode **vpp;
448{
449
450	return (ENOENT);
451}
452
453void
454softdep_change_linkcnt(ip)
455	struct inode *ip;
456{
457
458	panic("softdep_change_linkcnt called");
459}
460
461void
462softdep_load_inodeblock(ip)
463	struct inode *ip;
464{
465
466	panic("softdep_load_inodeblock called");
467}
468
469void
470softdep_update_inodeblock(ip, bp, waitfor)
471	struct inode *ip;
472	struct buf *bp;
473	int waitfor;
474{
475
476	panic("softdep_update_inodeblock called");
477}
478
479int
480softdep_fsync(vp)
481	struct vnode *vp;	/* the "in_core" copy of the inode */
482{
483
484	return (0);
485}
486
487void
488softdep_fsync_mountdev(vp)
489	struct vnode *vp;
490{
491
492	return;
493}
494
495int
496softdep_flushworklist(oldmnt, countp, td)
497	struct mount *oldmnt;
498	int *countp;
499	struct thread *td;
500{
501
502	*countp = 0;
503	return (0);
504}
505
506int
507softdep_sync_metadata(struct vnode *vp)
508{
509
510	panic("softdep_sync_metadata called");
511}
512
513int
514softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
515{
516
517	panic("softdep_sync_buf called");
518}
519
520int
521softdep_slowdown(vp)
522	struct vnode *vp;
523{
524
525	panic("softdep_slowdown called");
526}
527
528int
529softdep_request_cleanup(fs, vp, cred, resource)
530	struct fs *fs;
531	struct vnode *vp;
532	struct ucred *cred;
533	int resource;
534{
535
536	return (0);
537}
538
539int
540softdep_check_suspend(struct mount *mp,
541		      struct vnode *devvp,
542		      int softdep_depcnt,
543		      int softdep_accdepcnt,
544		      int secondary_writes,
545		      int secondary_accwrites)
546{
547	struct bufobj *bo;
548	int error;
549
550	(void) softdep_depcnt,
551	(void) softdep_accdepcnt;
552
553	bo = &devvp->v_bufobj;
554	ASSERT_BO_WLOCKED(bo);
555
556	MNT_ILOCK(mp);
557	while (mp->mnt_secondary_writes != 0) {
558		BO_UNLOCK(bo);
559		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
560		    (PUSER - 1) | PDROP, "secwr", 0);
561		BO_LOCK(bo);
562		MNT_ILOCK(mp);
563	}
564
565	/*
566	 * Reasons for needing more work before suspend:
567	 * - Dirty buffers on devvp.
568	 * - Secondary writes occurred after start of vnode sync loop
569	 */
570	error = 0;
571	if (bo->bo_numoutput > 0 ||
572	    bo->bo_dirty.bv_cnt > 0 ||
573	    secondary_writes != 0 ||
574	    mp->mnt_secondary_writes != 0 ||
575	    secondary_accwrites != mp->mnt_secondary_accwrites)
576		error = EAGAIN;
577	BO_UNLOCK(bo);
578	return (error);
579}
580
581void
582softdep_get_depcounts(struct mount *mp,
583		      int *softdepactivep,
584		      int *softdepactiveaccp)
585{
586	(void) mp;
587	*softdepactivep = 0;
588	*softdepactiveaccp = 0;
589}
590
591void
592softdep_buf_append(bp, wkhd)
593	struct buf *bp;
594	struct workhead *wkhd;
595{
596
597	panic("softdep_buf_appendwork called");
598}
599
600void
601softdep_inode_append(ip, cred, wkhd)
602	struct inode *ip;
603	struct ucred *cred;
604	struct workhead *wkhd;
605{
606
607	panic("softdep_inode_appendwork called");
608}
609
610void
611softdep_freework(wkhd)
612	struct workhead *wkhd;
613{
614
615	panic("softdep_freework called");
616}
617
618#else
619
620FEATURE(softupdates, "FFS soft-updates support");
621
622static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
623    "soft updates stats");
624static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
625    "total dependencies allocated");
626static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
627    "high use dependencies allocated");
628static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
629    "current dependencies allocated");
630static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
631    "current dependencies written");
632
633unsigned long dep_current[D_LAST + 1];
634unsigned long dep_highuse[D_LAST + 1];
635unsigned long dep_total[D_LAST + 1];
636unsigned long dep_write[D_LAST + 1];
637
638#define	SOFTDEP_TYPE(type, str, long)					\
639    static MALLOC_DEFINE(M_ ## type, #str, long);			\
640    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
641	&dep_total[D_ ## type], 0, "");					\
642    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
643	&dep_current[D_ ## type], 0, "");				\
644    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
645	&dep_highuse[D_ ## type], 0, "");				\
646    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
647	&dep_write[D_ ## type], 0, "");
648
649SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
650SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
651SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
652    "Block or frag allocated from cyl group map");
653SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
654SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
655SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
656SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
657SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
658SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
659SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
660SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
661SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
662SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
663SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
664SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
665SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
666SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
667SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
668SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
669SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
670SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
671SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
672SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
673SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
674SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
675SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
676SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
677
678static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
679
680static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
681static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
682static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
683
684#define M_SOFTDEP_FLAGS	(M_WAITOK)
685
686/*
687 * translate from workitem type to memory type
688 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
689 */
690static struct malloc_type *memtype[] = {
691	NULL,
692	M_PAGEDEP,
693	M_INODEDEP,
694	M_BMSAFEMAP,
695	M_NEWBLK,
696	M_ALLOCDIRECT,
697	M_INDIRDEP,
698	M_ALLOCINDIR,
699	M_FREEFRAG,
700	M_FREEBLKS,
701	M_FREEFILE,
702	M_DIRADD,
703	M_MKDIR,
704	M_DIRREM,
705	M_NEWDIRBLK,
706	M_FREEWORK,
707	M_FREEDEP,
708	M_JADDREF,
709	M_JREMREF,
710	M_JMVREF,
711	M_JNEWBLK,
712	M_JFREEBLK,
713	M_JFREEFRAG,
714	M_JSEG,
715	M_JSEGDEP,
716	M_SBDEP,
717	M_JTRUNC,
718	M_JFSYNC,
719	M_SENTINEL
720};
721
722#define DtoM(type) (memtype[type])
723
724/*
725 * Names of malloc types.
726 */
727#define TYPENAME(type)  \
728	((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \
729	memtype[type]->ks_shortdesc : "???")
730/*
731 * End system adaptation definitions.
732 */
733
734#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
735#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
736
737/*
738 * Internal function prototypes.
739 */
740static	void check_clear_deps(struct mount *);
741static	void softdep_error(char *, int);
742static	int softdep_process_worklist(struct mount *, int);
743static	int softdep_waitidle(struct mount *, int);
744static	void drain_output(struct vnode *);
745static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
746static	int check_inodedep_free(struct inodedep *);
747static	void clear_remove(struct mount *);
748static	void clear_inodedeps(struct mount *);
749static	void unlinked_inodedep(struct mount *, struct inodedep *);
750static	void clear_unlinked_inodedep(struct inodedep *);
751static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
752static	int flush_pagedep_deps(struct vnode *, struct mount *,
753	    struct diraddhd *);
754static	int free_pagedep(struct pagedep *);
755static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
756static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
757static	int flush_deplist(struct allocdirectlst *, int, int *);
758static	int sync_cgs(struct mount *, int);
759static	int handle_written_filepage(struct pagedep *, struct buf *, int);
760static	int handle_written_sbdep(struct sbdep *, struct buf *);
761static	void initiate_write_sbdep(struct sbdep *);
762static	void diradd_inode_written(struct diradd *, struct inodedep *);
763static	int handle_written_indirdep(struct indirdep *, struct buf *,
764	    struct buf**, int);
765static	int handle_written_inodeblock(struct inodedep *, struct buf *, int);
766static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
767	    uint8_t *);
768static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
769static	void handle_written_jaddref(struct jaddref *);
770static	void handle_written_jremref(struct jremref *);
771static	void handle_written_jseg(struct jseg *, struct buf *);
772static	void handle_written_jnewblk(struct jnewblk *);
773static	void handle_written_jblkdep(struct jblkdep *);
774static	void handle_written_jfreefrag(struct jfreefrag *);
775static	void complete_jseg(struct jseg *);
776static	void complete_jsegs(struct jseg *);
777static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
778static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
779static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
780static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
781static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
782static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
783static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
784static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
785static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
786static	inline void inoref_write(struct inoref *, struct jseg *,
787	    struct jrefrec *);
788static	void handle_allocdirect_partdone(struct allocdirect *,
789	    struct workhead *);
790static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
791	    struct workhead *);
792static	void indirdep_complete(struct indirdep *);
793static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
794static	void indirblk_insert(struct freework *);
795static	void indirblk_remove(struct freework *);
796static	void handle_allocindir_partdone(struct allocindir *);
797static	void initiate_write_filepage(struct pagedep *, struct buf *);
798static	void initiate_write_indirdep(struct indirdep*, struct buf *);
799static	void handle_written_mkdir(struct mkdir *, int);
800static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
801	    uint8_t *);
802static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
803static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
804static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
805static	void handle_workitem_freefile(struct freefile *);
806static	int handle_workitem_remove(struct dirrem *, int);
807static	struct dirrem *newdirrem(struct buf *, struct inode *,
808	    struct inode *, int, struct dirrem **);
809static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
810	    struct buf *);
811static	void cancel_indirdep(struct indirdep *, struct buf *,
812	    struct freeblks *);
813static	void free_indirdep(struct indirdep *);
814static	void free_diradd(struct diradd *, struct workhead *);
815static	void merge_diradd(struct inodedep *, struct diradd *);
816static	void complete_diradd(struct diradd *);
817static	struct diradd *diradd_lookup(struct pagedep *, int);
818static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
819	    struct jremref *);
820static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
821	    struct jremref *);
822static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
823	    struct jremref *, struct jremref *);
824static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
825	    struct jremref *);
826static	void cancel_allocindir(struct allocindir *, struct buf *bp,
827	    struct freeblks *, int);
828static	int setup_trunc_indir(struct freeblks *, struct inode *,
829	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
830static	void complete_trunc_indir(struct freework *);
831static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
832	    int);
833static	void complete_mkdir(struct mkdir *);
834static	void free_newdirblk(struct newdirblk *);
835static	void free_jremref(struct jremref *);
836static	void free_jaddref(struct jaddref *);
837static	void free_jsegdep(struct jsegdep *);
838static	void free_jsegs(struct jblocks *);
839static	void rele_jseg(struct jseg *);
840static	void free_jseg(struct jseg *, struct jblocks *);
841static	void free_jnewblk(struct jnewblk *);
842static	void free_jblkdep(struct jblkdep *);
843static	void free_jfreefrag(struct jfreefrag *);
844static	void free_freedep(struct freedep *);
845static	void journal_jremref(struct dirrem *, struct jremref *,
846	    struct inodedep *);
847static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
848static	int cancel_jaddref(struct jaddref *, struct inodedep *,
849	    struct workhead *);
850static	void cancel_jfreefrag(struct jfreefrag *);
851static	inline void setup_freedirect(struct freeblks *, struct inode *,
852	    int, int);
853static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
854static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
855	    ufs_lbn_t, int);
856static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
857static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
858static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
859static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
860static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
861static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
862	    int, int);
863static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
864static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
865static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
866static	void newblk_freefrag(struct newblk*);
867static	void free_newblk(struct newblk *);
868static	void cancel_allocdirect(struct allocdirectlst *,
869	    struct allocdirect *, struct freeblks *);
870static	int check_inode_unwritten(struct inodedep *);
871static	int free_inodedep(struct inodedep *);
872static	void freework_freeblock(struct freework *, u_long);
873static	void freework_enqueue(struct freework *);
874static	int handle_workitem_freeblocks(struct freeblks *, int);
875static	int handle_complete_freeblocks(struct freeblks *, int);
876static	void handle_workitem_indirblk(struct freework *);
877static	void handle_written_freework(struct freework *);
878static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
879static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
880	    struct workhead *);
881static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
882	    struct inodedep *, struct allocindir *, ufs_lbn_t);
883static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
884	    ufs2_daddr_t, ufs_lbn_t);
885static	void handle_workitem_freefrag(struct freefrag *);
886static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
887	    ufs_lbn_t, u_long);
888static	void allocdirect_merge(struct allocdirectlst *,
889	    struct allocdirect *, struct allocdirect *);
890static	struct freefrag *allocindir_merge(struct allocindir *,
891	    struct allocindir *);
892static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
893	    struct bmsafemap **);
894static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
895	    int cg, struct bmsafemap *);
896static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
897	    struct newblk **);
898static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
899static	int inodedep_find(struct inodedep_hashhead *, ino_t,
900	    struct inodedep **);
901static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
902static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
903	    int, struct pagedep **);
904static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
905	    struct pagedep **);
906static	void pause_timer(void *);
907static	int request_cleanup(struct mount *, int);
908static	int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
909static	void schedule_cleanup(struct mount *);
910static void softdep_ast_cleanup_proc(struct thread *);
911static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
912static	int process_worklist_item(struct mount *, int, int);
913static	void process_removes(struct vnode *);
914static	void process_truncates(struct vnode *);
915static	void jwork_move(struct workhead *, struct workhead *);
916static	void jwork_insert(struct workhead *, struct jsegdep *);
917static	void add_to_worklist(struct worklist *, int);
918static	void wake_worklist(struct worklist *);
919static	void wait_worklist(struct worklist *, char *);
920static	void remove_from_worklist(struct worklist *);
921static	void softdep_flush(void *);
922static	void softdep_flushjournal(struct mount *);
923static	int softdep_speedup(struct ufsmount *);
924static	void worklist_speedup(struct mount *);
925static	int journal_mount(struct mount *, struct fs *, struct ucred *);
926static	void journal_unmount(struct ufsmount *);
927static	int journal_space(struct ufsmount *, int);
928static	void journal_suspend(struct ufsmount *);
929static	int journal_unsuspend(struct ufsmount *ump);
930static	void softdep_prelink(struct vnode *, struct vnode *);
931static	void add_to_journal(struct worklist *);
932static	void remove_from_journal(struct worklist *);
933static	bool softdep_excess_items(struct ufsmount *, int);
934static	void softdep_process_journal(struct mount *, struct worklist *, int);
935static	struct jremref *newjremref(struct dirrem *, struct inode *,
936	    struct inode *ip, off_t, nlink_t);
937static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
938	    uint16_t);
939static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
940	    uint16_t);
941static	inline struct jsegdep *inoref_jseg(struct inoref *);
942static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
943static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
944	    ufs2_daddr_t, int);
945static	void adjust_newfreework(struct freeblks *, int);
946static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
947static	void move_newblock_dep(struct jaddref *, struct inodedep *);
948static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
949static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
950	    ufs2_daddr_t, long, ufs_lbn_t);
951static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
952	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
953static	int jwait(struct worklist *, int);
954static	struct inodedep *inodedep_lookup_ip(struct inode *);
955static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
956static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
957static	void handle_jwork(struct workhead *);
958static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
959	    struct mkdir **);
960static	struct jblocks *jblocks_create(void);
961static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
962static	void jblocks_free(struct jblocks *, struct mount *, int);
963static	void jblocks_destroy(struct jblocks *);
964static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
965
966/*
967 * Exported softdep operations.
968 */
969static	void softdep_disk_io_initiation(struct buf *);
970static	void softdep_disk_write_complete(struct buf *);
971static	void softdep_deallocate_dependencies(struct buf *);
972static	int softdep_count_dependencies(struct buf *bp, int);
973
974/*
975 * Global lock over all of soft updates.
976 */
977static struct mtx lk;
978MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF);
979
980#define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
981#define FREE_GBLLOCK(lk)	mtx_unlock(lk)
982#define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
983
984/*
985 * Per-filesystem soft-updates locking.
986 */
987#define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
988#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
989#define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
990#define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
991#define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
992				    RA_WLOCKED)
993
994#define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
995#define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
996
997/*
998 * Worklist queue management.
999 * These routines require that the lock be held.
1000 */
1001#ifndef /* NOT */ DEBUG
1002#define WORKLIST_INSERT(head, item) do {	\
1003	(item)->wk_state |= ONWORKLIST;		\
1004	LIST_INSERT_HEAD(head, item, wk_list);	\
1005} while (0)
1006#define WORKLIST_REMOVE(item) do {		\
1007	(item)->wk_state &= ~ONWORKLIST;	\
1008	LIST_REMOVE(item, wk_list);		\
1009} while (0)
1010#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1011#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1012
1013#else /* DEBUG */
1014static	void worklist_insert(struct workhead *, struct worklist *, int);
1015static	void worklist_remove(struct worklist *, int);
1016
1017#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1018#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1019#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1020#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1021
1022static void
1023worklist_insert(head, item, locked)
1024	struct workhead *head;
1025	struct worklist *item;
1026	int locked;
1027{
1028
1029	if (locked)
1030		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1031	if (item->wk_state & ONWORKLIST)
1032		panic("worklist_insert: %p %s(0x%X) already on list",
1033		    item, TYPENAME(item->wk_type), item->wk_state);
1034	item->wk_state |= ONWORKLIST;
1035	LIST_INSERT_HEAD(head, item, wk_list);
1036}
1037
1038static void
1039worklist_remove(item, locked)
1040	struct worklist *item;
1041	int locked;
1042{
1043
1044	if (locked)
1045		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1046	if ((item->wk_state & ONWORKLIST) == 0)
1047		panic("worklist_remove: %p %s(0x%X) not on list",
1048		    item, TYPENAME(item->wk_type), item->wk_state);
1049	item->wk_state &= ~ONWORKLIST;
1050	LIST_REMOVE(item, wk_list);
1051}
1052#endif /* DEBUG */
1053
1054/*
1055 * Merge two jsegdeps keeping only the oldest one as newer references
1056 * can't be discarded until after older references.
1057 */
1058static inline struct jsegdep *
1059jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1060{
1061	struct jsegdep *swp;
1062
1063	if (two == NULL)
1064		return (one);
1065
1066	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1067		swp = one;
1068		one = two;
1069		two = swp;
1070	}
1071	WORKLIST_REMOVE(&two->jd_list);
1072	free_jsegdep(two);
1073
1074	return (one);
1075}
1076
1077/*
1078 * If two freedeps are compatible free one to reduce list size.
1079 */
1080static inline struct freedep *
1081freedep_merge(struct freedep *one, struct freedep *two)
1082{
1083	if (two == NULL)
1084		return (one);
1085
1086	if (one->fd_freework == two->fd_freework) {
1087		WORKLIST_REMOVE(&two->fd_list);
1088		free_freedep(two);
1089	}
1090	return (one);
1091}
1092
1093/*
1094 * Move journal work from one list to another.  Duplicate freedeps and
1095 * jsegdeps are coalesced to keep the lists as small as possible.
1096 */
1097static void
1098jwork_move(dst, src)
1099	struct workhead *dst;
1100	struct workhead *src;
1101{
1102	struct freedep *freedep;
1103	struct jsegdep *jsegdep;
1104	struct worklist *wkn;
1105	struct worklist *wk;
1106
1107	KASSERT(dst != src,
1108	    ("jwork_move: dst == src"));
1109	freedep = NULL;
1110	jsegdep = NULL;
1111	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1112		if (wk->wk_type == D_JSEGDEP)
1113			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1114		else if (wk->wk_type == D_FREEDEP)
1115			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1116	}
1117
1118	while ((wk = LIST_FIRST(src)) != NULL) {
1119		WORKLIST_REMOVE(wk);
1120		WORKLIST_INSERT(dst, wk);
1121		if (wk->wk_type == D_JSEGDEP) {
1122			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1123			continue;
1124		}
1125		if (wk->wk_type == D_FREEDEP)
1126			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1127	}
1128}
1129
1130static void
1131jwork_insert(dst, jsegdep)
1132	struct workhead *dst;
1133	struct jsegdep *jsegdep;
1134{
1135	struct jsegdep *jsegdepn;
1136	struct worklist *wk;
1137
1138	LIST_FOREACH(wk, dst, wk_list)
1139		if (wk->wk_type == D_JSEGDEP)
1140			break;
1141	if (wk == NULL) {
1142		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1143		return;
1144	}
1145	jsegdepn = WK_JSEGDEP(wk);
1146	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1147		WORKLIST_REMOVE(wk);
1148		free_jsegdep(jsegdepn);
1149		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1150	} else
1151		free_jsegdep(jsegdep);
1152}
1153
1154/*
1155 * Routines for tracking and managing workitems.
1156 */
1157static	void workitem_free(struct worklist *, int);
1158static	void workitem_alloc(struct worklist *, int, struct mount *);
1159static	void workitem_reassign(struct worklist *, int);
1160
1161#define	WORKITEM_FREE(item, type) \
1162	workitem_free((struct worklist *)(item), (type))
1163#define	WORKITEM_REASSIGN(item, type) \
1164	workitem_reassign((struct worklist *)(item), (type))
1165
1166static void
1167workitem_free(item, type)
1168	struct worklist *item;
1169	int type;
1170{
1171	struct ufsmount *ump;
1172
1173#ifdef DEBUG
1174	if (item->wk_state & ONWORKLIST)
1175		panic("workitem_free: %s(0x%X) still on list",
1176		    TYPENAME(item->wk_type), item->wk_state);
1177	if (item->wk_type != type && type != D_NEWBLK)
1178		panic("workitem_free: type mismatch %s != %s",
1179		    TYPENAME(item->wk_type), TYPENAME(type));
1180#endif
1181	if (item->wk_state & IOWAITING)
1182		wakeup(item);
1183	ump = VFSTOUFS(item->wk_mp);
1184	LOCK_OWNED(ump);
1185	KASSERT(ump->softdep_deps > 0,
1186	    ("workitem_free: %s: softdep_deps going negative",
1187	    ump->um_fs->fs_fsmnt));
1188	if (--ump->softdep_deps == 0 && ump->softdep_req)
1189		wakeup(&ump->softdep_deps);
1190	KASSERT(dep_current[item->wk_type] > 0,
1191	    ("workitem_free: %s: dep_current[%s] going negative",
1192	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1193	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1194	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
1195	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1196	atomic_subtract_long(&dep_current[item->wk_type], 1);
1197	ump->softdep_curdeps[item->wk_type] -= 1;
1198	free(item, DtoM(type));
1199}
1200
1201static void
1202workitem_alloc(item, type, mp)
1203	struct worklist *item;
1204	int type;
1205	struct mount *mp;
1206{
1207	struct ufsmount *ump;
1208
1209	item->wk_type = type;
1210	item->wk_mp = mp;
1211	item->wk_state = 0;
1212
1213	ump = VFSTOUFS(mp);
1214	ACQUIRE_GBLLOCK(&lk);
1215	dep_current[type]++;
1216	if (dep_current[type] > dep_highuse[type])
1217		dep_highuse[type] = dep_current[type];
1218	dep_total[type]++;
1219	FREE_GBLLOCK(&lk);
1220	ACQUIRE_LOCK(ump);
1221	ump->softdep_curdeps[type] += 1;
1222	ump->softdep_deps++;
1223	ump->softdep_accdeps++;
1224	FREE_LOCK(ump);
1225}
1226
1227static void
1228workitem_reassign(item, newtype)
1229	struct worklist *item;
1230	int newtype;
1231{
1232	struct ufsmount *ump;
1233
1234	ump = VFSTOUFS(item->wk_mp);
1235	LOCK_OWNED(ump);
1236	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1237	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1238	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1239	ump->softdep_curdeps[item->wk_type] -= 1;
1240	ump->softdep_curdeps[newtype] += 1;
1241	KASSERT(dep_current[item->wk_type] > 0,
1242	    ("workitem_reassign: %s: dep_current[%s] going negative",
1243	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1244	ACQUIRE_GBLLOCK(&lk);
1245	dep_current[newtype]++;
1246	dep_current[item->wk_type]--;
1247	if (dep_current[newtype] > dep_highuse[newtype])
1248		dep_highuse[newtype] = dep_current[newtype];
1249	dep_total[newtype]++;
1250	FREE_GBLLOCK(&lk);
1251	item->wk_type = newtype;
1252}
1253
1254/*
1255 * Workitem queue management
1256 */
1257static int max_softdeps;	/* maximum number of structs before slowdown */
1258static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1259static int proc_waiting;	/* tracks whether we have a timeout posted */
1260static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1261static struct callout softdep_callout;
1262static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1263static int req_clear_remove;	/* syncer process flush some freeblks */
1264static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1265
1266/*
1267 * runtime statistics
1268 */
1269static int stat_flush_threads;	/* number of softdep flushing threads */
1270static int stat_worklist_push;	/* number of worklist cleanups */
1271static int stat_blk_limit_push;	/* number of times block limit neared */
1272static int stat_ino_limit_push;	/* number of times inode limit neared */
1273static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1274static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1275static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1276static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1277static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1278static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1279static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1280static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1281static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1282static int stat_journal_min;	/* Times hit journal min threshold */
1283static int stat_journal_low;	/* Times hit journal low threshold */
1284static int stat_journal_wait;	/* Times blocked in jwait(). */
1285static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1286static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1287static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1288static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1289static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1290static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1291static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1292static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1293static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1294static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1295
1296SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1297    &max_softdeps, 0, "");
1298SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1299    &tickdelay, 0, "");
1300SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1301    &stat_flush_threads, 0, "");
1302SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1303    &stat_worklist_push, 0,"");
1304SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1305    &stat_blk_limit_push, 0,"");
1306SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1307    &stat_ino_limit_push, 0,"");
1308SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1309    &stat_blk_limit_hit, 0, "");
1310SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1311    &stat_ino_limit_hit, 0, "");
1312SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1313    &stat_sync_limit_hit, 0, "");
1314SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1315    &stat_indir_blk_ptrs, 0, "");
1316SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1317    &stat_inode_bitmap, 0, "");
1318SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1319    &stat_direct_blk_ptrs, 0, "");
1320SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1321    &stat_dir_entry, 0, "");
1322SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1323    &stat_jaddref, 0, "");
1324SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1325    &stat_jnewblk, 0, "");
1326SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1327    &stat_journal_low, 0, "");
1328SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1329    &stat_journal_min, 0, "");
1330SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1331    &stat_journal_wait, 0, "");
1332SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1333    &stat_jwait_filepage, 0, "");
1334SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1335    &stat_jwait_freeblks, 0, "");
1336SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1337    &stat_jwait_inode, 0, "");
1338SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1339    &stat_jwait_newblk, 0, "");
1340SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1341    &stat_cleanup_blkrequests, 0, "");
1342SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1343    &stat_cleanup_inorequests, 0, "");
1344SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1345    &stat_cleanup_high_delay, 0, "");
1346SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1347    &stat_cleanup_retries, 0, "");
1348SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1349    &stat_cleanup_failures, 0, "");
1350SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1351    &softdep_flushcache, 0, "");
1352SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1353    &stat_emptyjblocks, 0, "");
1354
1355SYSCTL_DECL(_vfs_ffs);
1356
1357/* Whether to recompute the summary at mount time */
1358static int compute_summary_at_mount = 0;
1359SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1360	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1361static int print_threads = 0;
1362SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1363    &print_threads, 0, "Notify flusher thread start/stop");
1364
1365/* List of all filesystems mounted with soft updates */
1366static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1367
1368/*
1369 * This function cleans the worklist for a filesystem.
1370 * Each filesystem running with soft dependencies gets its own
1371 * thread to run in this function. The thread is started up in
1372 * softdep_mount and shutdown in softdep_unmount. They show up
1373 * as part of the kernel "bufdaemon" process whose process
1374 * entry is available in bufdaemonproc.
1375 */
1376static int searchfailed;
1377extern struct proc *bufdaemonproc;
1378static void
1379softdep_flush(addr)
1380	void *addr;
1381{
1382	struct mount *mp;
1383	struct thread *td;
1384	struct ufsmount *ump;
1385
1386	td = curthread;
1387	td->td_pflags |= TDP_NORUNNINGBUF;
1388	mp = (struct mount *)addr;
1389	ump = VFSTOUFS(mp);
1390	atomic_add_int(&stat_flush_threads, 1);
1391	ACQUIRE_LOCK(ump);
1392	ump->softdep_flags &= ~FLUSH_STARTING;
1393	wakeup(&ump->softdep_flushtd);
1394	FREE_LOCK(ump);
1395	if (print_threads) {
1396		if (stat_flush_threads == 1)
1397			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1398			    bufdaemonproc->p_pid);
1399		printf("Start thread %s\n", td->td_name);
1400	}
1401	for (;;) {
1402		while (softdep_process_worklist(mp, 0) > 0 ||
1403		    (MOUNTEDSUJ(mp) &&
1404		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1405			kthread_suspend_check();
1406		ACQUIRE_LOCK(ump);
1407		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1408			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1409			    "sdflush", hz / 2);
1410		ump->softdep_flags &= ~FLUSH_CLEANUP;
1411		/*
1412		 * Check to see if we are done and need to exit.
1413		 */
1414		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1415			FREE_LOCK(ump);
1416			continue;
1417		}
1418		ump->softdep_flags &= ~FLUSH_EXIT;
1419		FREE_LOCK(ump);
1420		wakeup(&ump->softdep_flags);
1421		if (print_threads)
1422			printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1423		atomic_subtract_int(&stat_flush_threads, 1);
1424		kthread_exit();
1425		panic("kthread_exit failed\n");
1426	}
1427}
1428
1429static void
1430worklist_speedup(mp)
1431	struct mount *mp;
1432{
1433	struct ufsmount *ump;
1434
1435	ump = VFSTOUFS(mp);
1436	LOCK_OWNED(ump);
1437	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1438		ump->softdep_flags |= FLUSH_CLEANUP;
1439	wakeup(&ump->softdep_flushtd);
1440}
1441
1442static int
1443softdep_speedup(ump)
1444	struct ufsmount *ump;
1445{
1446	struct ufsmount *altump;
1447	struct mount_softdeps *sdp;
1448
1449	LOCK_OWNED(ump);
1450	worklist_speedup(ump->um_mountp);
1451	bd_speedup();
1452	/*
1453	 * If we have global shortages, then we need other
1454	 * filesystems to help with the cleanup. Here we wakeup a
1455	 * flusher thread for a filesystem that is over its fair
1456	 * share of resources.
1457	 */
1458	if (req_clear_inodedeps || req_clear_remove) {
1459		ACQUIRE_GBLLOCK(&lk);
1460		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1461			if ((altump = sdp->sd_ump) == ump)
1462				continue;
1463			if (((req_clear_inodedeps &&
1464			    altump->softdep_curdeps[D_INODEDEP] >
1465			    max_softdeps / stat_flush_threads) ||
1466			    (req_clear_remove &&
1467			    altump->softdep_curdeps[D_DIRREM] >
1468			    (max_softdeps / 2) / stat_flush_threads)) &&
1469			    TRY_ACQUIRE_LOCK(altump))
1470				break;
1471		}
1472		if (sdp == NULL) {
1473			searchfailed++;
1474			FREE_GBLLOCK(&lk);
1475		} else {
1476			/*
1477			 * Move to the end of the list so we pick a
1478			 * different one on out next try.
1479			 */
1480			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1481			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1482			FREE_GBLLOCK(&lk);
1483			if ((altump->softdep_flags &
1484			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1485				altump->softdep_flags |= FLUSH_CLEANUP;
1486			altump->um_softdep->sd_cleanups++;
1487			wakeup(&altump->softdep_flushtd);
1488			FREE_LOCK(altump);
1489		}
1490	}
1491	return (speedup_syncer());
1492}
1493
1494/*
1495 * Add an item to the end of the work queue.
1496 * This routine requires that the lock be held.
1497 * This is the only routine that adds items to the list.
1498 * The following routine is the only one that removes items
1499 * and does so in order from first to last.
1500 */
1501
1502#define	WK_HEAD		0x0001	/* Add to HEAD. */
1503#define	WK_NODELAY	0x0002	/* Process immediately. */
1504
1505static void
1506add_to_worklist(wk, flags)
1507	struct worklist *wk;
1508	int flags;
1509{
1510	struct ufsmount *ump;
1511
1512	ump = VFSTOUFS(wk->wk_mp);
1513	LOCK_OWNED(ump);
1514	if (wk->wk_state & ONWORKLIST)
1515		panic("add_to_worklist: %s(0x%X) already on list",
1516		    TYPENAME(wk->wk_type), wk->wk_state);
1517	wk->wk_state |= ONWORKLIST;
1518	if (ump->softdep_on_worklist == 0) {
1519		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1520		ump->softdep_worklist_tail = wk;
1521	} else if (flags & WK_HEAD) {
1522		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1523	} else {
1524		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1525		ump->softdep_worklist_tail = wk;
1526	}
1527	ump->softdep_on_worklist += 1;
1528	if (flags & WK_NODELAY)
1529		worklist_speedup(wk->wk_mp);
1530}
1531
1532/*
1533 * Remove the item to be processed. If we are removing the last
1534 * item on the list, we need to recalculate the tail pointer.
1535 */
1536static void
1537remove_from_worklist(wk)
1538	struct worklist *wk;
1539{
1540	struct ufsmount *ump;
1541
1542	ump = VFSTOUFS(wk->wk_mp);
1543	if (ump->softdep_worklist_tail == wk)
1544		ump->softdep_worklist_tail =
1545		    (struct worklist *)wk->wk_list.le_prev;
1546	WORKLIST_REMOVE(wk);
1547	ump->softdep_on_worklist -= 1;
1548}
1549
1550static void
1551wake_worklist(wk)
1552	struct worklist *wk;
1553{
1554	if (wk->wk_state & IOWAITING) {
1555		wk->wk_state &= ~IOWAITING;
1556		wakeup(wk);
1557	}
1558}
1559
1560static void
1561wait_worklist(wk, wmesg)
1562	struct worklist *wk;
1563	char *wmesg;
1564{
1565	struct ufsmount *ump;
1566
1567	ump = VFSTOUFS(wk->wk_mp);
1568	wk->wk_state |= IOWAITING;
1569	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1570}
1571
1572/*
1573 * Process that runs once per second to handle items in the background queue.
1574 *
1575 * Note that we ensure that everything is done in the order in which they
1576 * appear in the queue. The code below depends on this property to ensure
1577 * that blocks of a file are freed before the inode itself is freed. This
1578 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1579 * until all the old ones have been purged from the dependency lists.
1580 */
1581static int
1582softdep_process_worklist(mp, full)
1583	struct mount *mp;
1584	int full;
1585{
1586	int cnt, matchcnt;
1587	struct ufsmount *ump;
1588	long starttime;
1589
1590	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1591	if (MOUNTEDSOFTDEP(mp) == 0)
1592		return (0);
1593	matchcnt = 0;
1594	ump = VFSTOUFS(mp);
1595	ACQUIRE_LOCK(ump);
1596	starttime = time_second;
1597	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1598	check_clear_deps(mp);
1599	while (ump->softdep_on_worklist > 0) {
1600		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1601			break;
1602		else
1603			matchcnt += cnt;
1604		check_clear_deps(mp);
1605		/*
1606		 * We do not generally want to stop for buffer space, but if
1607		 * we are really being a buffer hog, we will stop and wait.
1608		 */
1609		if (should_yield()) {
1610			FREE_LOCK(ump);
1611			kern_yield(PRI_USER);
1612			bwillwrite();
1613			ACQUIRE_LOCK(ump);
1614		}
1615		/*
1616		 * Never allow processing to run for more than one
1617		 * second. This gives the syncer thread the opportunity
1618		 * to pause if appropriate.
1619		 */
1620		if (!full && starttime != time_second)
1621			break;
1622	}
1623	if (full == 0)
1624		journal_unsuspend(ump);
1625	FREE_LOCK(ump);
1626	return (matchcnt);
1627}
1628
1629/*
1630 * Process all removes associated with a vnode if we are running out of
1631 * journal space.  Any other process which attempts to flush these will
1632 * be unable as we have the vnodes locked.
1633 */
1634static void
1635process_removes(vp)
1636	struct vnode *vp;
1637{
1638	struct inodedep *inodedep;
1639	struct dirrem *dirrem;
1640	struct ufsmount *ump;
1641	struct mount *mp;
1642	ino_t inum;
1643
1644	mp = vp->v_mount;
1645	ump = VFSTOUFS(mp);
1646	LOCK_OWNED(ump);
1647	inum = VTOI(vp)->i_number;
1648	for (;;) {
1649top:
1650		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1651			return;
1652		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1653			/*
1654			 * If another thread is trying to lock this vnode
1655			 * it will fail but we must wait for it to do so
1656			 * before we can proceed.
1657			 */
1658			if (dirrem->dm_state & INPROGRESS) {
1659				wait_worklist(&dirrem->dm_list, "pwrwait");
1660				goto top;
1661			}
1662			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1663			    (COMPLETE | ONWORKLIST))
1664				break;
1665		}
1666		if (dirrem == NULL)
1667			return;
1668		remove_from_worklist(&dirrem->dm_list);
1669		FREE_LOCK(ump);
1670		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1671			panic("process_removes: suspended filesystem");
1672		handle_workitem_remove(dirrem, 0);
1673		vn_finished_secondary_write(mp);
1674		ACQUIRE_LOCK(ump);
1675	}
1676}
1677
1678/*
1679 * Process all truncations associated with a vnode if we are running out
1680 * of journal space.  This is called when the vnode lock is already held
1681 * and no other process can clear the truncation.  This function returns
1682 * a value greater than zero if it did any work.
1683 */
1684static void
1685process_truncates(vp)
1686	struct vnode *vp;
1687{
1688	struct inodedep *inodedep;
1689	struct freeblks *freeblks;
1690	struct ufsmount *ump;
1691	struct mount *mp;
1692	ino_t inum;
1693	int cgwait;
1694
1695	mp = vp->v_mount;
1696	ump = VFSTOUFS(mp);
1697	LOCK_OWNED(ump);
1698	inum = VTOI(vp)->i_number;
1699	for (;;) {
1700		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1701			return;
1702		cgwait = 0;
1703		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1704			/* Journal entries not yet written.  */
1705			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1706				jwait(&LIST_FIRST(
1707				    &freeblks->fb_jblkdephd)->jb_list,
1708				    MNT_WAIT);
1709				break;
1710			}
1711			/* Another thread is executing this item. */
1712			if (freeblks->fb_state & INPROGRESS) {
1713				wait_worklist(&freeblks->fb_list, "ptrwait");
1714				break;
1715			}
1716			/* Freeblks is waiting on a inode write. */
1717			if ((freeblks->fb_state & COMPLETE) == 0) {
1718				FREE_LOCK(ump);
1719				ffs_update(vp, 1);
1720				ACQUIRE_LOCK(ump);
1721				break;
1722			}
1723			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1724			    (ALLCOMPLETE | ONWORKLIST)) {
1725				remove_from_worklist(&freeblks->fb_list);
1726				freeblks->fb_state |= INPROGRESS;
1727				FREE_LOCK(ump);
1728				if (vn_start_secondary_write(NULL, &mp,
1729				    V_NOWAIT))
1730					panic("process_truncates: "
1731					    "suspended filesystem");
1732				handle_workitem_freeblocks(freeblks, 0);
1733				vn_finished_secondary_write(mp);
1734				ACQUIRE_LOCK(ump);
1735				break;
1736			}
1737			if (freeblks->fb_cgwait)
1738				cgwait++;
1739		}
1740		if (cgwait) {
1741			FREE_LOCK(ump);
1742			sync_cgs(mp, MNT_WAIT);
1743			ffs_sync_snap(mp, MNT_WAIT);
1744			ACQUIRE_LOCK(ump);
1745			continue;
1746		}
1747		if (freeblks == NULL)
1748			break;
1749	}
1750	return;
1751}
1752
1753/*
1754 * Process one item on the worklist.
1755 */
1756static int
1757process_worklist_item(mp, target, flags)
1758	struct mount *mp;
1759	int target;
1760	int flags;
1761{
1762	struct worklist sentinel;
1763	struct worklist *wk;
1764	struct ufsmount *ump;
1765	int matchcnt;
1766	int error;
1767
1768	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1769	/*
1770	 * If we are being called because of a process doing a
1771	 * copy-on-write, then it is not safe to write as we may
1772	 * recurse into the copy-on-write routine.
1773	 */
1774	if (curthread->td_pflags & TDP_COWINPROGRESS)
1775		return (-1);
1776	PHOLD(curproc);	/* Don't let the stack go away. */
1777	ump = VFSTOUFS(mp);
1778	LOCK_OWNED(ump);
1779	matchcnt = 0;
1780	sentinel.wk_mp = NULL;
1781	sentinel.wk_type = D_SENTINEL;
1782	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1783	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1784	    wk = LIST_NEXT(&sentinel, wk_list)) {
1785		if (wk->wk_type == D_SENTINEL) {
1786			LIST_REMOVE(&sentinel, wk_list);
1787			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1788			continue;
1789		}
1790		if (wk->wk_state & INPROGRESS)
1791			panic("process_worklist_item: %p already in progress.",
1792			    wk);
1793		wk->wk_state |= INPROGRESS;
1794		remove_from_worklist(wk);
1795		FREE_LOCK(ump);
1796		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1797			panic("process_worklist_item: suspended filesystem");
1798		switch (wk->wk_type) {
1799		case D_DIRREM:
1800			/* removal of a directory entry */
1801			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1802			break;
1803
1804		case D_FREEBLKS:
1805			/* releasing blocks and/or fragments from a file */
1806			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1807			    flags);
1808			break;
1809
1810		case D_FREEFRAG:
1811			/* releasing a fragment when replaced as a file grows */
1812			handle_workitem_freefrag(WK_FREEFRAG(wk));
1813			error = 0;
1814			break;
1815
1816		case D_FREEFILE:
1817			/* releasing an inode when its link count drops to 0 */
1818			handle_workitem_freefile(WK_FREEFILE(wk));
1819			error = 0;
1820			break;
1821
1822		default:
1823			panic("%s_process_worklist: Unknown type %s",
1824			    "softdep", TYPENAME(wk->wk_type));
1825			/* NOTREACHED */
1826		}
1827		vn_finished_secondary_write(mp);
1828		ACQUIRE_LOCK(ump);
1829		if (error == 0) {
1830			if (++matchcnt == target)
1831				break;
1832			continue;
1833		}
1834		/*
1835		 * We have to retry the worklist item later.  Wake up any
1836		 * waiters who may be able to complete it immediately and
1837		 * add the item back to the head so we don't try to execute
1838		 * it again.
1839		 */
1840		wk->wk_state &= ~INPROGRESS;
1841		wake_worklist(wk);
1842		add_to_worklist(wk, WK_HEAD);
1843	}
1844	/* Sentinal could've become the tail from remove_from_worklist. */
1845	if (ump->softdep_worklist_tail == &sentinel)
1846		ump->softdep_worklist_tail =
1847		    (struct worklist *)sentinel.wk_list.le_prev;
1848	LIST_REMOVE(&sentinel, wk_list);
1849	PRELE(curproc);
1850	return (matchcnt);
1851}
1852
1853/*
1854 * Move dependencies from one buffer to another.
1855 */
1856int
1857softdep_move_dependencies(oldbp, newbp)
1858	struct buf *oldbp;
1859	struct buf *newbp;
1860{
1861	struct worklist *wk, *wktail;
1862	struct ufsmount *ump;
1863	int dirty;
1864
1865	if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1866		return (0);
1867	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1868	    ("softdep_move_dependencies called on non-softdep filesystem"));
1869	dirty = 0;
1870	wktail = NULL;
1871	ump = VFSTOUFS(wk->wk_mp);
1872	ACQUIRE_LOCK(ump);
1873	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1874		LIST_REMOVE(wk, wk_list);
1875		if (wk->wk_type == D_BMSAFEMAP &&
1876		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1877			dirty = 1;
1878		if (wktail == NULL)
1879			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1880		else
1881			LIST_INSERT_AFTER(wktail, wk, wk_list);
1882		wktail = wk;
1883	}
1884	FREE_LOCK(ump);
1885
1886	return (dirty);
1887}
1888
1889/*
1890 * Purge the work list of all items associated with a particular mount point.
1891 */
1892int
1893softdep_flushworklist(oldmnt, countp, td)
1894	struct mount *oldmnt;
1895	int *countp;
1896	struct thread *td;
1897{
1898	struct vnode *devvp;
1899	struct ufsmount *ump;
1900	int count, error;
1901
1902	/*
1903	 * Alternately flush the block device associated with the mount
1904	 * point and process any dependencies that the flushing
1905	 * creates. We continue until no more worklist dependencies
1906	 * are found.
1907	 */
1908	*countp = 0;
1909	error = 0;
1910	ump = VFSTOUFS(oldmnt);
1911	devvp = ump->um_devvp;
1912	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1913		*countp += count;
1914		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1915		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1916		VOP_UNLOCK(devvp, 0);
1917		if (error != 0)
1918			break;
1919	}
1920	return (error);
1921}
1922
1923#define	SU_WAITIDLE_RETRIES	20
1924static int
1925softdep_waitidle(struct mount *mp, int flags __unused)
1926{
1927	struct ufsmount *ump;
1928	struct vnode *devvp;
1929	struct thread *td;
1930	int error, i;
1931
1932	ump = VFSTOUFS(mp);
1933	devvp = ump->um_devvp;
1934	td = curthread;
1935	error = 0;
1936	ACQUIRE_LOCK(ump);
1937	for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
1938		ump->softdep_req = 1;
1939		KASSERT((flags & FORCECLOSE) == 0 ||
1940		    ump->softdep_on_worklist == 0,
1941		    ("softdep_waitidle: work added after flush"));
1942		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
1943		    "softdeps", 10 * hz);
1944		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1945		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1946		VOP_UNLOCK(devvp, 0);
1947		ACQUIRE_LOCK(ump);
1948		if (error != 0)
1949			break;
1950	}
1951	ump->softdep_req = 0;
1952	if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
1953		error = EBUSY;
1954		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1955		    mp);
1956	}
1957	FREE_LOCK(ump);
1958	return (error);
1959}
1960
1961/*
1962 * Flush all vnodes and worklist items associated with a specified mount point.
1963 */
1964int
1965softdep_flushfiles(oldmnt, flags, td)
1966	struct mount *oldmnt;
1967	int flags;
1968	struct thread *td;
1969{
1970#ifdef QUOTA
1971	struct ufsmount *ump;
1972	int i;
1973#endif
1974	int error, early, depcount, loopcnt, retry_flush_count, retry;
1975	int morework;
1976
1977	KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1978	    ("softdep_flushfiles called on non-softdep filesystem"));
1979	loopcnt = 10;
1980	retry_flush_count = 3;
1981retry_flush:
1982	error = 0;
1983
1984	/*
1985	 * Alternately flush the vnodes associated with the mount
1986	 * point and process any dependencies that the flushing
1987	 * creates. In theory, this loop can happen at most twice,
1988	 * but we give it a few extra just to be sure.
1989	 */
1990	for (; loopcnt > 0; loopcnt--) {
1991		/*
1992		 * Do another flush in case any vnodes were brought in
1993		 * as part of the cleanup operations.
1994		 */
1995		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1996		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1997		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1998			break;
1999		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
2000		    depcount == 0)
2001			break;
2002	}
2003	/*
2004	 * If we are unmounting then it is an error to fail. If we
2005	 * are simply trying to downgrade to read-only, then filesystem
2006	 * activity can keep us busy forever, so we just fail with EBUSY.
2007	 */
2008	if (loopcnt == 0) {
2009		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2010			panic("softdep_flushfiles: looping");
2011		error = EBUSY;
2012	}
2013	if (!error)
2014		error = softdep_waitidle(oldmnt, flags);
2015	if (!error) {
2016		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2017			retry = 0;
2018			MNT_ILOCK(oldmnt);
2019			morework = oldmnt->mnt_nvnodelistsize > 0;
2020#ifdef QUOTA
2021			ump = VFSTOUFS(oldmnt);
2022			UFS_LOCK(ump);
2023			for (i = 0; i < MAXQUOTAS; i++) {
2024				if (ump->um_quotas[i] != NULLVP)
2025					morework = 1;
2026			}
2027			UFS_UNLOCK(ump);
2028#endif
2029			if (morework) {
2030				if (--retry_flush_count > 0) {
2031					retry = 1;
2032					loopcnt = 3;
2033				} else
2034					error = EBUSY;
2035			}
2036			MNT_IUNLOCK(oldmnt);
2037			if (retry)
2038				goto retry_flush;
2039		}
2040	}
2041	return (error);
2042}
2043
2044/*
2045 * Structure hashing.
2046 *
2047 * There are four types of structures that can be looked up:
2048 *	1) pagedep structures identified by mount point, inode number,
2049 *	   and logical block.
2050 *	2) inodedep structures identified by mount point and inode number.
2051 *	3) newblk structures identified by mount point and
2052 *	   physical block number.
2053 *	4) bmsafemap structures identified by mount point and
2054 *	   cylinder group number.
2055 *
2056 * The "pagedep" and "inodedep" dependency structures are hashed
2057 * separately from the file blocks and inodes to which they correspond.
2058 * This separation helps when the in-memory copy of an inode or
2059 * file block must be replaced. It also obviates the need to access
2060 * an inode or file page when simply updating (or de-allocating)
2061 * dependency structures. Lookup of newblk structures is needed to
2062 * find newly allocated blocks when trying to associate them with
2063 * their allocdirect or allocindir structure.
2064 *
2065 * The lookup routines optionally create and hash a new instance when
2066 * an existing entry is not found. The bmsafemap lookup routine always
2067 * allocates a new structure if an existing one is not found.
2068 */
2069#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2070
2071/*
2072 * Structures and routines associated with pagedep caching.
2073 */
2074#define	PAGEDEP_HASH(ump, inum, lbn) \
2075	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2076
2077static int
2078pagedep_find(pagedephd, ino, lbn, pagedeppp)
2079	struct pagedep_hashhead *pagedephd;
2080	ino_t ino;
2081	ufs_lbn_t lbn;
2082	struct pagedep **pagedeppp;
2083{
2084	struct pagedep *pagedep;
2085
2086	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2087		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2088			*pagedeppp = pagedep;
2089			return (1);
2090		}
2091	}
2092	*pagedeppp = NULL;
2093	return (0);
2094}
2095/*
2096 * Look up a pagedep. Return 1 if found, 0 otherwise.
2097 * If not found, allocate if DEPALLOC flag is passed.
2098 * Found or allocated entry is returned in pagedeppp.
2099 */
2100static int
2101pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2102	struct mount *mp;
2103	struct buf *bp;
2104	ino_t ino;
2105	ufs_lbn_t lbn;
2106	int flags;
2107	struct pagedep **pagedeppp;
2108{
2109	struct pagedep *pagedep;
2110	struct pagedep_hashhead *pagedephd;
2111	struct worklist *wk;
2112	struct ufsmount *ump;
2113	int ret;
2114	int i;
2115
2116	ump = VFSTOUFS(mp);
2117	LOCK_OWNED(ump);
2118	if (bp) {
2119		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2120			if (wk->wk_type == D_PAGEDEP) {
2121				*pagedeppp = WK_PAGEDEP(wk);
2122				return (1);
2123			}
2124		}
2125	}
2126	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2127	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2128	if (ret) {
2129		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2130			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2131		return (1);
2132	}
2133	if ((flags & DEPALLOC) == 0)
2134		return (0);
2135	FREE_LOCK(ump);
2136	pagedep = malloc(sizeof(struct pagedep),
2137	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2138	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2139	ACQUIRE_LOCK(ump);
2140	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2141	if (*pagedeppp) {
2142		/*
2143		 * This should never happen since we only create pagedeps
2144		 * with the vnode lock held.  Could be an assert.
2145		 */
2146		WORKITEM_FREE(pagedep, D_PAGEDEP);
2147		return (ret);
2148	}
2149	pagedep->pd_ino = ino;
2150	pagedep->pd_lbn = lbn;
2151	LIST_INIT(&pagedep->pd_dirremhd);
2152	LIST_INIT(&pagedep->pd_pendinghd);
2153	for (i = 0; i < DAHASHSZ; i++)
2154		LIST_INIT(&pagedep->pd_diraddhd[i]);
2155	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2156	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2157	*pagedeppp = pagedep;
2158	return (0);
2159}
2160
2161/*
2162 * Structures and routines associated with inodedep caching.
2163 */
2164#define	INODEDEP_HASH(ump, inum) \
2165      (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2166
2167static int
2168inodedep_find(inodedephd, inum, inodedeppp)
2169	struct inodedep_hashhead *inodedephd;
2170	ino_t inum;
2171	struct inodedep **inodedeppp;
2172{
2173	struct inodedep *inodedep;
2174
2175	LIST_FOREACH(inodedep, inodedephd, id_hash)
2176		if (inum == inodedep->id_ino)
2177			break;
2178	if (inodedep) {
2179		*inodedeppp = inodedep;
2180		return (1);
2181	}
2182	*inodedeppp = NULL;
2183
2184	return (0);
2185}
2186/*
2187 * Look up an inodedep. Return 1 if found, 0 if not found.
2188 * If not found, allocate if DEPALLOC flag is passed.
2189 * Found or allocated entry is returned in inodedeppp.
2190 */
2191static int
2192inodedep_lookup(mp, inum, flags, inodedeppp)
2193	struct mount *mp;
2194	ino_t inum;
2195	int flags;
2196	struct inodedep **inodedeppp;
2197{
2198	struct inodedep *inodedep;
2199	struct inodedep_hashhead *inodedephd;
2200	struct ufsmount *ump;
2201	struct fs *fs;
2202
2203	ump = VFSTOUFS(mp);
2204	LOCK_OWNED(ump);
2205	fs = ump->um_fs;
2206	inodedephd = INODEDEP_HASH(ump, inum);
2207
2208	if (inodedep_find(inodedephd, inum, inodedeppp))
2209		return (1);
2210	if ((flags & DEPALLOC) == 0)
2211		return (0);
2212	/*
2213	 * If the system is over its limit and our filesystem is
2214	 * responsible for more than our share of that usage and
2215	 * we are not in a rush, request some inodedep cleanup.
2216	 */
2217	if (softdep_excess_items(ump, D_INODEDEP))
2218		schedule_cleanup(mp);
2219	else
2220		FREE_LOCK(ump);
2221	inodedep = malloc(sizeof(struct inodedep),
2222		M_INODEDEP, M_SOFTDEP_FLAGS);
2223	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2224	ACQUIRE_LOCK(ump);
2225	if (inodedep_find(inodedephd, inum, inodedeppp)) {
2226		WORKITEM_FREE(inodedep, D_INODEDEP);
2227		return (1);
2228	}
2229	inodedep->id_fs = fs;
2230	inodedep->id_ino = inum;
2231	inodedep->id_state = ALLCOMPLETE;
2232	inodedep->id_nlinkdelta = 0;
2233	inodedep->id_savedino1 = NULL;
2234	inodedep->id_savedsize = -1;
2235	inodedep->id_savedextsize = -1;
2236	inodedep->id_savednlink = -1;
2237	inodedep->id_bmsafemap = NULL;
2238	inodedep->id_mkdiradd = NULL;
2239	LIST_INIT(&inodedep->id_dirremhd);
2240	LIST_INIT(&inodedep->id_pendinghd);
2241	LIST_INIT(&inodedep->id_inowait);
2242	LIST_INIT(&inodedep->id_bufwait);
2243	TAILQ_INIT(&inodedep->id_inoreflst);
2244	TAILQ_INIT(&inodedep->id_inoupdt);
2245	TAILQ_INIT(&inodedep->id_newinoupdt);
2246	TAILQ_INIT(&inodedep->id_extupdt);
2247	TAILQ_INIT(&inodedep->id_newextupdt);
2248	TAILQ_INIT(&inodedep->id_freeblklst);
2249	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2250	*inodedeppp = inodedep;
2251	return (0);
2252}
2253
2254/*
2255 * Structures and routines associated with newblk caching.
2256 */
2257#define	NEWBLK_HASH(ump, inum) \
2258	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2259
2260static int
2261newblk_find(newblkhd, newblkno, flags, newblkpp)
2262	struct newblk_hashhead *newblkhd;
2263	ufs2_daddr_t newblkno;
2264	int flags;
2265	struct newblk **newblkpp;
2266{
2267	struct newblk *newblk;
2268
2269	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2270		if (newblkno != newblk->nb_newblkno)
2271			continue;
2272		/*
2273		 * If we're creating a new dependency don't match those that
2274		 * have already been converted to allocdirects.  This is for
2275		 * a frag extend.
2276		 */
2277		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2278			continue;
2279		break;
2280	}
2281	if (newblk) {
2282		*newblkpp = newblk;
2283		return (1);
2284	}
2285	*newblkpp = NULL;
2286	return (0);
2287}
2288
2289/*
2290 * Look up a newblk. Return 1 if found, 0 if not found.
2291 * If not found, allocate if DEPALLOC flag is passed.
2292 * Found or allocated entry is returned in newblkpp.
2293 */
2294static int
2295newblk_lookup(mp, newblkno, flags, newblkpp)
2296	struct mount *mp;
2297	ufs2_daddr_t newblkno;
2298	int flags;
2299	struct newblk **newblkpp;
2300{
2301	struct newblk *newblk;
2302	struct newblk_hashhead *newblkhd;
2303	struct ufsmount *ump;
2304
2305	ump = VFSTOUFS(mp);
2306	LOCK_OWNED(ump);
2307	newblkhd = NEWBLK_HASH(ump, newblkno);
2308	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2309		return (1);
2310	if ((flags & DEPALLOC) == 0)
2311		return (0);
2312	if (softdep_excess_items(ump, D_NEWBLK) ||
2313	    softdep_excess_items(ump, D_ALLOCDIRECT) ||
2314	    softdep_excess_items(ump, D_ALLOCINDIR))
2315		schedule_cleanup(mp);
2316	else
2317		FREE_LOCK(ump);
2318	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2319	    M_SOFTDEP_FLAGS | M_ZERO);
2320	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2321	ACQUIRE_LOCK(ump);
2322	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2323		WORKITEM_FREE(newblk, D_NEWBLK);
2324		return (1);
2325	}
2326	newblk->nb_freefrag = NULL;
2327	LIST_INIT(&newblk->nb_indirdeps);
2328	LIST_INIT(&newblk->nb_newdirblk);
2329	LIST_INIT(&newblk->nb_jwork);
2330	newblk->nb_state = ATTACHED;
2331	newblk->nb_newblkno = newblkno;
2332	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2333	*newblkpp = newblk;
2334	return (0);
2335}
2336
2337/*
2338 * Structures and routines associated with freed indirect block caching.
2339 */
2340#define	INDIR_HASH(ump, blkno) \
2341	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2342
2343/*
2344 * Lookup an indirect block in the indir hash table.  The freework is
2345 * removed and potentially freed.  The caller must do a blocking journal
2346 * write before writing to the blkno.
2347 */
2348static int
2349indirblk_lookup(mp, blkno)
2350	struct mount *mp;
2351	ufs2_daddr_t blkno;
2352{
2353	struct freework *freework;
2354	struct indir_hashhead *wkhd;
2355	struct ufsmount *ump;
2356
2357	ump = VFSTOUFS(mp);
2358	wkhd = INDIR_HASH(ump, blkno);
2359	TAILQ_FOREACH(freework, wkhd, fw_next) {
2360		if (freework->fw_blkno != blkno)
2361			continue;
2362		indirblk_remove(freework);
2363		return (1);
2364	}
2365	return (0);
2366}
2367
2368/*
2369 * Insert an indirect block represented by freework into the indirblk
2370 * hash table so that it may prevent the block from being re-used prior
2371 * to the journal being written.
2372 */
2373static void
2374indirblk_insert(freework)
2375	struct freework *freework;
2376{
2377	struct jblocks *jblocks;
2378	struct jseg *jseg;
2379	struct ufsmount *ump;
2380
2381	ump = VFSTOUFS(freework->fw_list.wk_mp);
2382	jblocks = ump->softdep_jblocks;
2383	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2384	if (jseg == NULL)
2385		return;
2386
2387	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2388	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2389	    fw_next);
2390	freework->fw_state &= ~DEPCOMPLETE;
2391}
2392
2393static void
2394indirblk_remove(freework)
2395	struct freework *freework;
2396{
2397	struct ufsmount *ump;
2398
2399	ump = VFSTOUFS(freework->fw_list.wk_mp);
2400	LIST_REMOVE(freework, fw_segs);
2401	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2402	freework->fw_state |= DEPCOMPLETE;
2403	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2404		WORKITEM_FREE(freework, D_FREEWORK);
2405}
2406
2407/*
2408 * Executed during filesystem system initialization before
2409 * mounting any filesystems.
2410 */
2411void
2412softdep_initialize()
2413{
2414
2415	TAILQ_INIT(&softdepmounts);
2416#ifdef __LP64__
2417	max_softdeps = desiredvnodes * 4;
2418#else
2419	max_softdeps = desiredvnodes * 2;
2420#endif
2421
2422	/* initialise bioops hack */
2423	bioops.io_start = softdep_disk_io_initiation;
2424	bioops.io_complete = softdep_disk_write_complete;
2425	bioops.io_deallocate = softdep_deallocate_dependencies;
2426	bioops.io_countdeps = softdep_count_dependencies;
2427	softdep_ast_cleanup = softdep_ast_cleanup_proc;
2428
2429	/* Initialize the callout with an mtx. */
2430	callout_init_mtx(&softdep_callout, &lk, 0);
2431}
2432
2433/*
2434 * Executed after all filesystems have been unmounted during
2435 * filesystem module unload.
2436 */
2437void
2438softdep_uninitialize()
2439{
2440
2441	/* clear bioops hack */
2442	bioops.io_start = NULL;
2443	bioops.io_complete = NULL;
2444	bioops.io_deallocate = NULL;
2445	bioops.io_countdeps = NULL;
2446	softdep_ast_cleanup = NULL;
2447
2448	callout_drain(&softdep_callout);
2449}
2450
2451/*
2452 * Called at mount time to notify the dependency code that a
2453 * filesystem wishes to use it.
2454 */
2455int
2456softdep_mount(devvp, mp, fs, cred)
2457	struct vnode *devvp;
2458	struct mount *mp;
2459	struct fs *fs;
2460	struct ucred *cred;
2461{
2462	struct csum_total cstotal;
2463	struct mount_softdeps *sdp;
2464	struct ufsmount *ump;
2465	struct cg *cgp;
2466	struct buf *bp;
2467	u_int cyl, i;
2468	int error;
2469
2470	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2471	    M_WAITOK | M_ZERO);
2472	MNT_ILOCK(mp);
2473	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2474	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2475		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2476			MNTK_SOFTDEP | MNTK_NOASYNC;
2477	}
2478	ump = VFSTOUFS(mp);
2479	ump->um_softdep = sdp;
2480	MNT_IUNLOCK(mp);
2481	rw_init(LOCK_PTR(ump), "per-fs softdep");
2482	sdp->sd_ump = ump;
2483	LIST_INIT(&ump->softdep_workitem_pending);
2484	LIST_INIT(&ump->softdep_journal_pending);
2485	TAILQ_INIT(&ump->softdep_unlinked);
2486	LIST_INIT(&ump->softdep_dirtycg);
2487	ump->softdep_worklist_tail = NULL;
2488	ump->softdep_on_worklist = 0;
2489	ump->softdep_deps = 0;
2490	LIST_INIT(&ump->softdep_mkdirlisthd);
2491	ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2492	    &ump->pagedep_hash_size);
2493	ump->pagedep_nextclean = 0;
2494	ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2495	    &ump->inodedep_hash_size);
2496	ump->inodedep_nextclean = 0;
2497	ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2498	    &ump->newblk_hash_size);
2499	ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2500	    &ump->bmsafemap_hash_size);
2501	i = 1 << (ffs(desiredvnodes / 10) - 1);
2502	ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2503	    M_FREEWORK, M_WAITOK);
2504	ump->indir_hash_size = i - 1;
2505	for (i = 0; i <= ump->indir_hash_size; i++)
2506		TAILQ_INIT(&ump->indir_hashtbl[i]);
2507	ACQUIRE_GBLLOCK(&lk);
2508	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2509	FREE_GBLLOCK(&lk);
2510	if ((fs->fs_flags & FS_SUJ) &&
2511	    (error = journal_mount(mp, fs, cred)) != 0) {
2512		printf("Failed to start journal: %d\n", error);
2513		softdep_unmount(mp);
2514		return (error);
2515	}
2516	/*
2517	 * Start our flushing thread in the bufdaemon process.
2518	 */
2519	ACQUIRE_LOCK(ump);
2520	ump->softdep_flags |= FLUSH_STARTING;
2521	FREE_LOCK(ump);
2522	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2523	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2524	    mp->mnt_stat.f_mntonname);
2525	ACQUIRE_LOCK(ump);
2526	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2527		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2528		    hz / 2);
2529	}
2530	FREE_LOCK(ump);
2531	/*
2532	 * When doing soft updates, the counters in the
2533	 * superblock may have gotten out of sync. Recomputation
2534	 * can take a long time and can be deferred for background
2535	 * fsck.  However, the old behavior of scanning the cylinder
2536	 * groups and recalculating them at mount time is available
2537	 * by setting vfs.ffs.compute_summary_at_mount to one.
2538	 */
2539	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2540		return (0);
2541	bzero(&cstotal, sizeof cstotal);
2542	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2543		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2544		    fs->fs_cgsize, cred, &bp)) != 0) {
2545			brelse(bp);
2546			softdep_unmount(mp);
2547			return (error);
2548		}
2549		cgp = (struct cg *)bp->b_data;
2550		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2551		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2552		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2553		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2554		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2555		brelse(bp);
2556	}
2557#ifdef DEBUG
2558	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2559		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2560#endif
2561	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2562	return (0);
2563}
2564
2565void
2566softdep_unmount(mp)
2567	struct mount *mp;
2568{
2569	struct ufsmount *ump;
2570#ifdef INVARIANTS
2571	int i;
2572#endif
2573
2574	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2575	    ("softdep_unmount called on non-softdep filesystem"));
2576	ump = VFSTOUFS(mp);
2577	MNT_ILOCK(mp);
2578	mp->mnt_flag &= ~MNT_SOFTDEP;
2579	if (MOUNTEDSUJ(mp) == 0) {
2580		MNT_IUNLOCK(mp);
2581	} else {
2582		mp->mnt_flag &= ~MNT_SUJ;
2583		MNT_IUNLOCK(mp);
2584		journal_unmount(ump);
2585	}
2586	/*
2587	 * Shut down our flushing thread. Check for NULL is if
2588	 * softdep_mount errors out before the thread has been created.
2589	 */
2590	if (ump->softdep_flushtd != NULL) {
2591		ACQUIRE_LOCK(ump);
2592		ump->softdep_flags |= FLUSH_EXIT;
2593		wakeup(&ump->softdep_flushtd);
2594		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2595		    "sdwait", 0);
2596		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2597		    ("Thread shutdown failed"));
2598	}
2599	/*
2600	 * Free up our resources.
2601	 */
2602	ACQUIRE_GBLLOCK(&lk);
2603	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2604	FREE_GBLLOCK(&lk);
2605	rw_destroy(LOCK_PTR(ump));
2606	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2607	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2608	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2609	hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2610	    ump->bmsafemap_hash_size);
2611	free(ump->indir_hashtbl, M_FREEWORK);
2612#ifdef INVARIANTS
2613	for (i = 0; i <= D_LAST; i++)
2614		KASSERT(ump->softdep_curdeps[i] == 0,
2615		    ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2616		    TYPENAME(i), ump->softdep_curdeps[i]));
2617#endif
2618	free(ump->um_softdep, M_MOUNTDATA);
2619}
2620
2621static struct jblocks *
2622jblocks_create(void)
2623{
2624	struct jblocks *jblocks;
2625
2626	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2627	TAILQ_INIT(&jblocks->jb_segs);
2628	jblocks->jb_avail = 10;
2629	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2630	    M_JBLOCKS, M_WAITOK | M_ZERO);
2631
2632	return (jblocks);
2633}
2634
2635static ufs2_daddr_t
2636jblocks_alloc(jblocks, bytes, actual)
2637	struct jblocks *jblocks;
2638	int bytes;
2639	int *actual;
2640{
2641	ufs2_daddr_t daddr;
2642	struct jextent *jext;
2643	int freecnt;
2644	int blocks;
2645
2646	blocks = bytes / DEV_BSIZE;
2647	jext = &jblocks->jb_extent[jblocks->jb_head];
2648	freecnt = jext->je_blocks - jblocks->jb_off;
2649	if (freecnt == 0) {
2650		jblocks->jb_off = 0;
2651		if (++jblocks->jb_head > jblocks->jb_used)
2652			jblocks->jb_head = 0;
2653		jext = &jblocks->jb_extent[jblocks->jb_head];
2654		freecnt = jext->je_blocks;
2655	}
2656	if (freecnt > blocks)
2657		freecnt = blocks;
2658	*actual = freecnt * DEV_BSIZE;
2659	daddr = jext->je_daddr + jblocks->jb_off;
2660	jblocks->jb_off += freecnt;
2661	jblocks->jb_free -= freecnt;
2662
2663	return (daddr);
2664}
2665
2666static void
2667jblocks_free(jblocks, mp, bytes)
2668	struct jblocks *jblocks;
2669	struct mount *mp;
2670	int bytes;
2671{
2672
2673	LOCK_OWNED(VFSTOUFS(mp));
2674	jblocks->jb_free += bytes / DEV_BSIZE;
2675	if (jblocks->jb_suspended)
2676		worklist_speedup(mp);
2677	wakeup(jblocks);
2678}
2679
2680static void
2681jblocks_destroy(jblocks)
2682	struct jblocks *jblocks;
2683{
2684
2685	if (jblocks->jb_extent)
2686		free(jblocks->jb_extent, M_JBLOCKS);
2687	free(jblocks, M_JBLOCKS);
2688}
2689
2690static void
2691jblocks_add(jblocks, daddr, blocks)
2692	struct jblocks *jblocks;
2693	ufs2_daddr_t daddr;
2694	int blocks;
2695{
2696	struct jextent *jext;
2697
2698	jblocks->jb_blocks += blocks;
2699	jblocks->jb_free += blocks;
2700	jext = &jblocks->jb_extent[jblocks->jb_used];
2701	/* Adding the first block. */
2702	if (jext->je_daddr == 0) {
2703		jext->je_daddr = daddr;
2704		jext->je_blocks = blocks;
2705		return;
2706	}
2707	/* Extending the last extent. */
2708	if (jext->je_daddr + jext->je_blocks == daddr) {
2709		jext->je_blocks += blocks;
2710		return;
2711	}
2712	/* Adding a new extent. */
2713	if (++jblocks->jb_used == jblocks->jb_avail) {
2714		jblocks->jb_avail *= 2;
2715		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2716		    M_JBLOCKS, M_WAITOK | M_ZERO);
2717		memcpy(jext, jblocks->jb_extent,
2718		    sizeof(struct jextent) * jblocks->jb_used);
2719		free(jblocks->jb_extent, M_JBLOCKS);
2720		jblocks->jb_extent = jext;
2721	}
2722	jext = &jblocks->jb_extent[jblocks->jb_used];
2723	jext->je_daddr = daddr;
2724	jext->je_blocks = blocks;
2725	return;
2726}
2727
2728int
2729softdep_journal_lookup(mp, vpp)
2730	struct mount *mp;
2731	struct vnode **vpp;
2732{
2733	struct componentname cnp;
2734	struct vnode *dvp;
2735	ino_t sujournal;
2736	int error;
2737
2738	error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
2739	if (error)
2740		return (error);
2741	bzero(&cnp, sizeof(cnp));
2742	cnp.cn_nameiop = LOOKUP;
2743	cnp.cn_flags = ISLASTCN;
2744	cnp.cn_thread = curthread;
2745	cnp.cn_cred = curthread->td_ucred;
2746	cnp.cn_pnbuf = SUJ_FILE;
2747	cnp.cn_nameptr = SUJ_FILE;
2748	cnp.cn_namelen = strlen(SUJ_FILE);
2749	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2750	vput(dvp);
2751	if (error != 0)
2752		return (error);
2753	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2754	return (error);
2755}
2756
2757/*
2758 * Open and verify the journal file.
2759 */
2760static int
2761journal_mount(mp, fs, cred)
2762	struct mount *mp;
2763	struct fs *fs;
2764	struct ucred *cred;
2765{
2766	struct jblocks *jblocks;
2767	struct ufsmount *ump;
2768	struct vnode *vp;
2769	struct inode *ip;
2770	ufs2_daddr_t blkno;
2771	int bcount;
2772	int error;
2773	int i;
2774
2775	ump = VFSTOUFS(mp);
2776	ump->softdep_journal_tail = NULL;
2777	ump->softdep_on_journal = 0;
2778	ump->softdep_accdeps = 0;
2779	ump->softdep_req = 0;
2780	ump->softdep_jblocks = NULL;
2781	error = softdep_journal_lookup(mp, &vp);
2782	if (error != 0) {
2783		printf("Failed to find journal.  Use tunefs to create one\n");
2784		return (error);
2785	}
2786	ip = VTOI(vp);
2787	if (ip->i_size < SUJ_MIN) {
2788		error = ENOSPC;
2789		goto out;
2790	}
2791	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2792	jblocks = jblocks_create();
2793	for (i = 0; i < bcount; i++) {
2794		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2795		if (error)
2796			break;
2797		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2798	}
2799	if (error) {
2800		jblocks_destroy(jblocks);
2801		goto out;
2802	}
2803	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2804	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2805	ump->softdep_jblocks = jblocks;
2806out:
2807	if (error == 0) {
2808		MNT_ILOCK(mp);
2809		mp->mnt_flag |= MNT_SUJ;
2810		mp->mnt_flag &= ~MNT_SOFTDEP;
2811		MNT_IUNLOCK(mp);
2812		/*
2813		 * Only validate the journal contents if the
2814		 * filesystem is clean, otherwise we write the logs
2815		 * but they'll never be used.  If the filesystem was
2816		 * still dirty when we mounted it the journal is
2817		 * invalid and a new journal can only be valid if it
2818		 * starts from a clean mount.
2819		 */
2820		if (fs->fs_clean) {
2821			DIP_SET(ip, i_modrev, fs->fs_mtime);
2822			ip->i_flags |= IN_MODIFIED;
2823			ffs_update(vp, 1);
2824		}
2825	}
2826	vput(vp);
2827	return (error);
2828}
2829
2830static void
2831journal_unmount(ump)
2832	struct ufsmount *ump;
2833{
2834
2835	if (ump->softdep_jblocks)
2836		jblocks_destroy(ump->softdep_jblocks);
2837	ump->softdep_jblocks = NULL;
2838}
2839
2840/*
2841 * Called when a journal record is ready to be written.  Space is allocated
2842 * and the journal entry is created when the journal is flushed to stable
2843 * store.
2844 */
2845static void
2846add_to_journal(wk)
2847	struct worklist *wk;
2848{
2849	struct ufsmount *ump;
2850
2851	ump = VFSTOUFS(wk->wk_mp);
2852	LOCK_OWNED(ump);
2853	if (wk->wk_state & ONWORKLIST)
2854		panic("add_to_journal: %s(0x%X) already on list",
2855		    TYPENAME(wk->wk_type), wk->wk_state);
2856	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2857	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2858		ump->softdep_jblocks->jb_age = ticks;
2859		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2860	} else
2861		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2862	ump->softdep_journal_tail = wk;
2863	ump->softdep_on_journal += 1;
2864}
2865
2866/*
2867 * Remove an arbitrary item for the journal worklist maintain the tail
2868 * pointer.  This happens when a new operation obviates the need to
2869 * journal an old operation.
2870 */
2871static void
2872remove_from_journal(wk)
2873	struct worklist *wk;
2874{
2875	struct ufsmount *ump;
2876
2877	ump = VFSTOUFS(wk->wk_mp);
2878	LOCK_OWNED(ump);
2879#ifdef SUJ_DEBUG
2880	{
2881		struct worklist *wkn;
2882
2883		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2884			if (wkn == wk)
2885				break;
2886		if (wkn == NULL)
2887			panic("remove_from_journal: %p is not in journal", wk);
2888	}
2889#endif
2890	/*
2891	 * We emulate a TAILQ to save space in most structures which do not
2892	 * require TAILQ semantics.  Here we must update the tail position
2893	 * when removing the tail which is not the final entry. This works
2894	 * only if the worklist linkage are at the beginning of the structure.
2895	 */
2896	if (ump->softdep_journal_tail == wk)
2897		ump->softdep_journal_tail =
2898		    (struct worklist *)wk->wk_list.le_prev;
2899	WORKLIST_REMOVE(wk);
2900	ump->softdep_on_journal -= 1;
2901}
2902
2903/*
2904 * Check for journal space as well as dependency limits so the prelink
2905 * code can throttle both journaled and non-journaled filesystems.
2906 * Threshold is 0 for low and 1 for min.
2907 */
2908static int
2909journal_space(ump, thresh)
2910	struct ufsmount *ump;
2911	int thresh;
2912{
2913	struct jblocks *jblocks;
2914	int limit, avail;
2915
2916	jblocks = ump->softdep_jblocks;
2917	if (jblocks == NULL)
2918		return (1);
2919	/*
2920	 * We use a tighter restriction here to prevent request_cleanup()
2921	 * running in threads from running into locks we currently hold.
2922	 * We have to be over the limit and our filesystem has to be
2923	 * responsible for more than our share of that usage.
2924	 */
2925	limit = (max_softdeps / 10) * 9;
2926	if (dep_current[D_INODEDEP] > limit &&
2927	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2928		return (0);
2929	if (thresh)
2930		thresh = jblocks->jb_min;
2931	else
2932		thresh = jblocks->jb_low;
2933	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2934	avail = jblocks->jb_free - avail;
2935
2936	return (avail > thresh);
2937}
2938
2939static void
2940journal_suspend(ump)
2941	struct ufsmount *ump;
2942{
2943	struct jblocks *jblocks;
2944	struct mount *mp;
2945
2946	mp = UFSTOVFS(ump);
2947	jblocks = ump->softdep_jblocks;
2948	MNT_ILOCK(mp);
2949	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2950		stat_journal_min++;
2951		mp->mnt_kern_flag |= MNTK_SUSPEND;
2952		mp->mnt_susp_owner = ump->softdep_flushtd;
2953	}
2954	jblocks->jb_suspended = 1;
2955	MNT_IUNLOCK(mp);
2956}
2957
2958static int
2959journal_unsuspend(struct ufsmount *ump)
2960{
2961	struct jblocks *jblocks;
2962	struct mount *mp;
2963
2964	mp = UFSTOVFS(ump);
2965	jblocks = ump->softdep_jblocks;
2966
2967	if (jblocks != NULL && jblocks->jb_suspended &&
2968	    journal_space(ump, jblocks->jb_min)) {
2969		jblocks->jb_suspended = 0;
2970		FREE_LOCK(ump);
2971		mp->mnt_susp_owner = curthread;
2972		vfs_write_resume(mp, 0);
2973		ACQUIRE_LOCK(ump);
2974		return (1);
2975	}
2976	return (0);
2977}
2978
2979/*
2980 * Called before any allocation function to be certain that there is
2981 * sufficient space in the journal prior to creating any new records.
2982 * Since in the case of block allocation we may have multiple locked
2983 * buffers at the time of the actual allocation we can not block
2984 * when the journal records are created.  Doing so would create a deadlock
2985 * if any of these buffers needed to be flushed to reclaim space.  Instead
2986 * we require a sufficiently large amount of available space such that
2987 * each thread in the system could have passed this allocation check and
2988 * still have sufficient free space.  With 20% of a minimum journal size
2989 * of 1MB we have 6553 records available.
2990 */
2991int
2992softdep_prealloc(vp, waitok)
2993	struct vnode *vp;
2994	int waitok;
2995{
2996	struct ufsmount *ump;
2997
2998	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
2999	    ("softdep_prealloc called on non-softdep filesystem"));
3000	/*
3001	 * Nothing to do if we are not running journaled soft updates.
3002	 * If we currently hold the snapshot lock, we must avoid
3003	 * handling other resources that could cause deadlock.  Do not
3004	 * touch quotas vnode since it is typically recursed with
3005	 * other vnode locks held.
3006	 */
3007	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3008	    (vp->v_vflag & VV_SYSTEM) != 0)
3009		return (0);
3010	ump = VFSTOUFS(vp->v_mount);
3011	ACQUIRE_LOCK(ump);
3012	if (journal_space(ump, 0)) {
3013		FREE_LOCK(ump);
3014		return (0);
3015	}
3016	stat_journal_low++;
3017	FREE_LOCK(ump);
3018	if (waitok == MNT_NOWAIT)
3019		return (ENOSPC);
3020	/*
3021	 * Attempt to sync this vnode once to flush any journal
3022	 * work attached to it.
3023	 */
3024	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3025		ffs_syncvnode(vp, waitok, 0);
3026	ACQUIRE_LOCK(ump);
3027	process_removes(vp);
3028	process_truncates(vp);
3029	if (journal_space(ump, 0) == 0) {
3030		softdep_speedup(ump);
3031		if (journal_space(ump, 1) == 0)
3032			journal_suspend(ump);
3033	}
3034	FREE_LOCK(ump);
3035
3036	return (0);
3037}
3038
3039/*
3040 * Before adjusting a link count on a vnode verify that we have sufficient
3041 * journal space.  If not, process operations that depend on the currently
3042 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3043 * and softdep flush threads can not acquire these locks to reclaim space.
3044 */
3045static void
3046softdep_prelink(dvp, vp)
3047	struct vnode *dvp;
3048	struct vnode *vp;
3049{
3050	struct ufsmount *ump;
3051
3052	ump = VFSTOUFS(dvp->v_mount);
3053	LOCK_OWNED(ump);
3054	/*
3055	 * Nothing to do if we have sufficient journal space.
3056	 * If we currently hold the snapshot lock, we must avoid
3057	 * handling other resources that could cause deadlock.
3058	 */
3059	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3060		return;
3061	stat_journal_low++;
3062	FREE_LOCK(ump);
3063	if (vp)
3064		ffs_syncvnode(vp, MNT_NOWAIT, 0);
3065	ffs_syncvnode(dvp, MNT_WAIT, 0);
3066	ACQUIRE_LOCK(ump);
3067	/* Process vp before dvp as it may create .. removes. */
3068	if (vp) {
3069		process_removes(vp);
3070		process_truncates(vp);
3071	}
3072	process_removes(dvp);
3073	process_truncates(dvp);
3074	softdep_speedup(ump);
3075	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3076	if (journal_space(ump, 0) == 0) {
3077		softdep_speedup(ump);
3078		if (journal_space(ump, 1) == 0)
3079			journal_suspend(ump);
3080	}
3081}
3082
3083static void
3084jseg_write(ump, jseg, data)
3085	struct ufsmount *ump;
3086	struct jseg *jseg;
3087	uint8_t *data;
3088{
3089	struct jsegrec *rec;
3090
3091	rec = (struct jsegrec *)data;
3092	rec->jsr_seq = jseg->js_seq;
3093	rec->jsr_oldest = jseg->js_oldseq;
3094	rec->jsr_cnt = jseg->js_cnt;
3095	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3096	rec->jsr_crc = 0;
3097	rec->jsr_time = ump->um_fs->fs_mtime;
3098}
3099
3100static inline void
3101inoref_write(inoref, jseg, rec)
3102	struct inoref *inoref;
3103	struct jseg *jseg;
3104	struct jrefrec *rec;
3105{
3106
3107	inoref->if_jsegdep->jd_seg = jseg;
3108	rec->jr_ino = inoref->if_ino;
3109	rec->jr_parent = inoref->if_parent;
3110	rec->jr_nlink = inoref->if_nlink;
3111	rec->jr_mode = inoref->if_mode;
3112	rec->jr_diroff = inoref->if_diroff;
3113}
3114
3115static void
3116jaddref_write(jaddref, jseg, data)
3117	struct jaddref *jaddref;
3118	struct jseg *jseg;
3119	uint8_t *data;
3120{
3121	struct jrefrec *rec;
3122
3123	rec = (struct jrefrec *)data;
3124	rec->jr_op = JOP_ADDREF;
3125	inoref_write(&jaddref->ja_ref, jseg, rec);
3126}
3127
3128static void
3129jremref_write(jremref, jseg, data)
3130	struct jremref *jremref;
3131	struct jseg *jseg;
3132	uint8_t *data;
3133{
3134	struct jrefrec *rec;
3135
3136	rec = (struct jrefrec *)data;
3137	rec->jr_op = JOP_REMREF;
3138	inoref_write(&jremref->jr_ref, jseg, rec);
3139}
3140
3141static void
3142jmvref_write(jmvref, jseg, data)
3143	struct jmvref *jmvref;
3144	struct jseg *jseg;
3145	uint8_t *data;
3146{
3147	struct jmvrec *rec;
3148
3149	rec = (struct jmvrec *)data;
3150	rec->jm_op = JOP_MVREF;
3151	rec->jm_ino = jmvref->jm_ino;
3152	rec->jm_parent = jmvref->jm_parent;
3153	rec->jm_oldoff = jmvref->jm_oldoff;
3154	rec->jm_newoff = jmvref->jm_newoff;
3155}
3156
3157static void
3158jnewblk_write(jnewblk, jseg, data)
3159	struct jnewblk *jnewblk;
3160	struct jseg *jseg;
3161	uint8_t *data;
3162{
3163	struct jblkrec *rec;
3164
3165	jnewblk->jn_jsegdep->jd_seg = jseg;
3166	rec = (struct jblkrec *)data;
3167	rec->jb_op = JOP_NEWBLK;
3168	rec->jb_ino = jnewblk->jn_ino;
3169	rec->jb_blkno = jnewblk->jn_blkno;
3170	rec->jb_lbn = jnewblk->jn_lbn;
3171	rec->jb_frags = jnewblk->jn_frags;
3172	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3173}
3174
3175static void
3176jfreeblk_write(jfreeblk, jseg, data)
3177	struct jfreeblk *jfreeblk;
3178	struct jseg *jseg;
3179	uint8_t *data;
3180{
3181	struct jblkrec *rec;
3182
3183	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3184	rec = (struct jblkrec *)data;
3185	rec->jb_op = JOP_FREEBLK;
3186	rec->jb_ino = jfreeblk->jf_ino;
3187	rec->jb_blkno = jfreeblk->jf_blkno;
3188	rec->jb_lbn = jfreeblk->jf_lbn;
3189	rec->jb_frags = jfreeblk->jf_frags;
3190	rec->jb_oldfrags = 0;
3191}
3192
3193static void
3194jfreefrag_write(jfreefrag, jseg, data)
3195	struct jfreefrag *jfreefrag;
3196	struct jseg *jseg;
3197	uint8_t *data;
3198{
3199	struct jblkrec *rec;
3200
3201	jfreefrag->fr_jsegdep->jd_seg = jseg;
3202	rec = (struct jblkrec *)data;
3203	rec->jb_op = JOP_FREEBLK;
3204	rec->jb_ino = jfreefrag->fr_ino;
3205	rec->jb_blkno = jfreefrag->fr_blkno;
3206	rec->jb_lbn = jfreefrag->fr_lbn;
3207	rec->jb_frags = jfreefrag->fr_frags;
3208	rec->jb_oldfrags = 0;
3209}
3210
3211static void
3212jtrunc_write(jtrunc, jseg, data)
3213	struct jtrunc *jtrunc;
3214	struct jseg *jseg;
3215	uint8_t *data;
3216{
3217	struct jtrncrec *rec;
3218
3219	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3220	rec = (struct jtrncrec *)data;
3221	rec->jt_op = JOP_TRUNC;
3222	rec->jt_ino = jtrunc->jt_ino;
3223	rec->jt_size = jtrunc->jt_size;
3224	rec->jt_extsize = jtrunc->jt_extsize;
3225}
3226
3227static void
3228jfsync_write(jfsync, jseg, data)
3229	struct jfsync *jfsync;
3230	struct jseg *jseg;
3231	uint8_t *data;
3232{
3233	struct jtrncrec *rec;
3234
3235	rec = (struct jtrncrec *)data;
3236	rec->jt_op = JOP_SYNC;
3237	rec->jt_ino = jfsync->jfs_ino;
3238	rec->jt_size = jfsync->jfs_size;
3239	rec->jt_extsize = jfsync->jfs_extsize;
3240}
3241
3242static void
3243softdep_flushjournal(mp)
3244	struct mount *mp;
3245{
3246	struct jblocks *jblocks;
3247	struct ufsmount *ump;
3248
3249	if (MOUNTEDSUJ(mp) == 0)
3250		return;
3251	ump = VFSTOUFS(mp);
3252	jblocks = ump->softdep_jblocks;
3253	ACQUIRE_LOCK(ump);
3254	while (ump->softdep_on_journal) {
3255		jblocks->jb_needseg = 1;
3256		softdep_process_journal(mp, NULL, MNT_WAIT);
3257	}
3258	FREE_LOCK(ump);
3259}
3260
3261static void softdep_synchronize_completed(struct bio *);
3262static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3263
3264static void
3265softdep_synchronize_completed(bp)
3266        struct bio *bp;
3267{
3268	struct jseg *oldest;
3269	struct jseg *jseg;
3270	struct ufsmount *ump;
3271
3272	/*
3273	 * caller1 marks the last segment written before we issued the
3274	 * synchronize cache.
3275	 */
3276	jseg = bp->bio_caller1;
3277	if (jseg == NULL) {
3278		g_destroy_bio(bp);
3279		return;
3280	}
3281	ump = VFSTOUFS(jseg->js_list.wk_mp);
3282	ACQUIRE_LOCK(ump);
3283	oldest = NULL;
3284	/*
3285	 * Mark all the journal entries waiting on the synchronize cache
3286	 * as completed so they may continue on.
3287	 */
3288	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3289		jseg->js_state |= COMPLETE;
3290		oldest = jseg;
3291		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3292	}
3293	/*
3294	 * Restart deferred journal entry processing from the oldest
3295	 * completed jseg.
3296	 */
3297	if (oldest)
3298		complete_jsegs(oldest);
3299
3300	FREE_LOCK(ump);
3301	g_destroy_bio(bp);
3302}
3303
3304/*
3305 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3306 * barriers.  The journal must be written prior to any blocks that depend
3307 * on it and the journal can not be released until the blocks have be
3308 * written.  This code handles both barriers simultaneously.
3309 */
3310static void
3311softdep_synchronize(bp, ump, caller1)
3312	struct bio *bp;
3313	struct ufsmount *ump;
3314	void *caller1;
3315{
3316
3317	bp->bio_cmd = BIO_FLUSH;
3318	bp->bio_flags |= BIO_ORDERED;
3319	bp->bio_data = NULL;
3320	bp->bio_offset = ump->um_cp->provider->mediasize;
3321	bp->bio_length = 0;
3322	bp->bio_done = softdep_synchronize_completed;
3323	bp->bio_caller1 = caller1;
3324	g_io_request(bp,
3325	    (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3326}
3327
3328/*
3329 * Flush some journal records to disk.
3330 */
3331static void
3332softdep_process_journal(mp, needwk, flags)
3333	struct mount *mp;
3334	struct worklist *needwk;
3335	int flags;
3336{
3337	struct jblocks *jblocks;
3338	struct ufsmount *ump;
3339	struct worklist *wk;
3340	struct jseg *jseg;
3341	struct buf *bp;
3342	struct bio *bio;
3343	uint8_t *data;
3344	struct fs *fs;
3345	int shouldflush;
3346	int segwritten;
3347	int jrecmin;	/* Minimum records per block. */
3348	int jrecmax;	/* Maximum records per block. */
3349	int size;
3350	int cnt;
3351	int off;
3352	int devbsize;
3353
3354	if (MOUNTEDSUJ(mp) == 0)
3355		return;
3356	shouldflush = softdep_flushcache;
3357	bio = NULL;
3358	jseg = NULL;
3359	ump = VFSTOUFS(mp);
3360	LOCK_OWNED(ump);
3361	fs = ump->um_fs;
3362	jblocks = ump->softdep_jblocks;
3363	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3364	/*
3365	 * We write anywhere between a disk block and fs block.  The upper
3366	 * bound is picked to prevent buffer cache fragmentation and limit
3367	 * processing time per I/O.
3368	 */
3369	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3370	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3371	segwritten = 0;
3372	for (;;) {
3373		cnt = ump->softdep_on_journal;
3374		/*
3375		 * Criteria for writing a segment:
3376		 * 1) We have a full block.
3377		 * 2) We're called from jwait() and haven't found the
3378		 *    journal item yet.
3379		 * 3) Always write if needseg is set.
3380		 * 4) If we are called from process_worklist and have
3381		 *    not yet written anything we write a partial block
3382		 *    to enforce a 1 second maximum latency on journal
3383		 *    entries.
3384		 */
3385		if (cnt < (jrecmax - 1) && needwk == NULL &&
3386		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3387			break;
3388		cnt++;
3389		/*
3390		 * Verify some free journal space.  softdep_prealloc() should
3391		 * guarantee that we don't run out so this is indicative of
3392		 * a problem with the flow control.  Try to recover
3393		 * gracefully in any event.
3394		 */
3395		while (jblocks->jb_free == 0) {
3396			if (flags != MNT_WAIT)
3397				break;
3398			printf("softdep: Out of journal space!\n");
3399			softdep_speedup(ump);
3400			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3401		}
3402		FREE_LOCK(ump);
3403		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3404		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3405		LIST_INIT(&jseg->js_entries);
3406		LIST_INIT(&jseg->js_indirs);
3407		jseg->js_state = ATTACHED;
3408		if (shouldflush == 0)
3409			jseg->js_state |= COMPLETE;
3410		else if (bio == NULL)
3411			bio = g_alloc_bio();
3412		jseg->js_jblocks = jblocks;
3413		bp = geteblk(fs->fs_bsize, 0);
3414		ACQUIRE_LOCK(ump);
3415		/*
3416		 * If there was a race while we were allocating the block
3417		 * and jseg the entry we care about was likely written.
3418		 * We bail out in both the WAIT and NOWAIT case and assume
3419		 * the caller will loop if the entry it cares about is
3420		 * not written.
3421		 */
3422		cnt = ump->softdep_on_journal;
3423		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3424			bp->b_flags |= B_INVAL | B_NOCACHE;
3425			WORKITEM_FREE(jseg, D_JSEG);
3426			FREE_LOCK(ump);
3427			brelse(bp);
3428			ACQUIRE_LOCK(ump);
3429			break;
3430		}
3431		/*
3432		 * Calculate the disk block size required for the available
3433		 * records rounded to the min size.
3434		 */
3435		if (cnt == 0)
3436			size = devbsize;
3437		else if (cnt < jrecmax)
3438			size = howmany(cnt, jrecmin) * devbsize;
3439		else
3440			size = fs->fs_bsize;
3441		/*
3442		 * Allocate a disk block for this journal data and account
3443		 * for truncation of the requested size if enough contiguous
3444		 * space was not available.
3445		 */
3446		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3447		bp->b_lblkno = bp->b_blkno;
3448		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3449		bp->b_bcount = size;
3450		bp->b_flags &= ~B_INVAL;
3451		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3452		/*
3453		 * Initialize our jseg with cnt records.  Assign the next
3454		 * sequence number to it and link it in-order.
3455		 */
3456		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3457		jseg->js_buf = bp;
3458		jseg->js_cnt = cnt;
3459		jseg->js_refs = cnt + 1;	/* Self ref. */
3460		jseg->js_size = size;
3461		jseg->js_seq = jblocks->jb_nextseq++;
3462		if (jblocks->jb_oldestseg == NULL)
3463			jblocks->jb_oldestseg = jseg;
3464		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3465		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3466		if (jblocks->jb_writeseg == NULL)
3467			jblocks->jb_writeseg = jseg;
3468		/*
3469		 * Start filling in records from the pending list.
3470		 */
3471		data = bp->b_data;
3472		off = 0;
3473
3474		/*
3475		 * Always put a header on the first block.
3476		 * XXX As with below, there might not be a chance to get
3477		 * into the loop.  Ensure that something valid is written.
3478		 */
3479		jseg_write(ump, jseg, data);
3480		off += JREC_SIZE;
3481		data = bp->b_data + off;
3482
3483		/*
3484		 * XXX Something is wrong here.  There's no work to do,
3485		 * but we need to perform and I/O and allow it to complete
3486		 * anyways.
3487		 */
3488		if (LIST_EMPTY(&ump->softdep_journal_pending))
3489			stat_emptyjblocks++;
3490
3491		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3492		    != NULL) {
3493			if (cnt == 0)
3494				break;
3495			/* Place a segment header on every device block. */
3496			if ((off % devbsize) == 0) {
3497				jseg_write(ump, jseg, data);
3498				off += JREC_SIZE;
3499				data = bp->b_data + off;
3500			}
3501			if (wk == needwk)
3502				needwk = NULL;
3503			remove_from_journal(wk);
3504			wk->wk_state |= INPROGRESS;
3505			WORKLIST_INSERT(&jseg->js_entries, wk);
3506			switch (wk->wk_type) {
3507			case D_JADDREF:
3508				jaddref_write(WK_JADDREF(wk), jseg, data);
3509				break;
3510			case D_JREMREF:
3511				jremref_write(WK_JREMREF(wk), jseg, data);
3512				break;
3513			case D_JMVREF:
3514				jmvref_write(WK_JMVREF(wk), jseg, data);
3515				break;
3516			case D_JNEWBLK:
3517				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3518				break;
3519			case D_JFREEBLK:
3520				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3521				break;
3522			case D_JFREEFRAG:
3523				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3524				break;
3525			case D_JTRUNC:
3526				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3527				break;
3528			case D_JFSYNC:
3529				jfsync_write(WK_JFSYNC(wk), jseg, data);
3530				break;
3531			default:
3532				panic("process_journal: Unknown type %s",
3533				    TYPENAME(wk->wk_type));
3534				/* NOTREACHED */
3535			}
3536			off += JREC_SIZE;
3537			data = bp->b_data + off;
3538			cnt--;
3539		}
3540
3541		/* Clear any remaining space so we don't leak kernel data */
3542		if (size > off)
3543			bzero(data, size - off);
3544
3545		/*
3546		 * Write this one buffer and continue.
3547		 */
3548		segwritten = 1;
3549		jblocks->jb_needseg = 0;
3550		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3551		FREE_LOCK(ump);
3552		pbgetvp(ump->um_devvp, bp);
3553		/*
3554		 * We only do the blocking wait once we find the journal
3555		 * entry we're looking for.
3556		 */
3557		if (needwk == NULL && flags == MNT_WAIT)
3558			bwrite(bp);
3559		else
3560			bawrite(bp);
3561		ACQUIRE_LOCK(ump);
3562	}
3563	/*
3564	 * If we wrote a segment issue a synchronize cache so the journal
3565	 * is reflected on disk before the data is written.  Since reclaiming
3566	 * journal space also requires writing a journal record this
3567	 * process also enforces a barrier before reclamation.
3568	 */
3569	if (segwritten && shouldflush) {
3570		softdep_synchronize(bio, ump,
3571		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3572	} else if (bio)
3573		g_destroy_bio(bio);
3574	/*
3575	 * If we've suspended the filesystem because we ran out of journal
3576	 * space either try to sync it here to make some progress or
3577	 * unsuspend it if we already have.
3578	 */
3579	if (flags == 0 && jblocks->jb_suspended) {
3580		if (journal_unsuspend(ump))
3581			return;
3582		FREE_LOCK(ump);
3583		VFS_SYNC(mp, MNT_NOWAIT);
3584		ffs_sbupdate(ump, MNT_WAIT, 0);
3585		ACQUIRE_LOCK(ump);
3586	}
3587}
3588
3589/*
3590 * Complete a jseg, allowing all dependencies awaiting journal writes
3591 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3592 * structures so that the journal segment can be freed to reclaim space.
3593 */
3594static void
3595complete_jseg(jseg)
3596	struct jseg *jseg;
3597{
3598	struct worklist *wk;
3599	struct jmvref *jmvref;
3600#ifdef INVARIANTS
3601	int i = 0;
3602#endif
3603
3604	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3605		WORKLIST_REMOVE(wk);
3606		wk->wk_state &= ~INPROGRESS;
3607		wk->wk_state |= COMPLETE;
3608		KASSERT(i++ < jseg->js_cnt,
3609		    ("handle_written_jseg: overflow %d >= %d",
3610		    i - 1, jseg->js_cnt));
3611		switch (wk->wk_type) {
3612		case D_JADDREF:
3613			handle_written_jaddref(WK_JADDREF(wk));
3614			break;
3615		case D_JREMREF:
3616			handle_written_jremref(WK_JREMREF(wk));
3617			break;
3618		case D_JMVREF:
3619			rele_jseg(jseg);	/* No jsegdep. */
3620			jmvref = WK_JMVREF(wk);
3621			LIST_REMOVE(jmvref, jm_deps);
3622			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3623				free_pagedep(jmvref->jm_pagedep);
3624			WORKITEM_FREE(jmvref, D_JMVREF);
3625			break;
3626		case D_JNEWBLK:
3627			handle_written_jnewblk(WK_JNEWBLK(wk));
3628			break;
3629		case D_JFREEBLK:
3630			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3631			break;
3632		case D_JTRUNC:
3633			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3634			break;
3635		case D_JFSYNC:
3636			rele_jseg(jseg);	/* No jsegdep. */
3637			WORKITEM_FREE(wk, D_JFSYNC);
3638			break;
3639		case D_JFREEFRAG:
3640			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3641			break;
3642		default:
3643			panic("handle_written_jseg: Unknown type %s",
3644			    TYPENAME(wk->wk_type));
3645			/* NOTREACHED */
3646		}
3647	}
3648	/* Release the self reference so the structure may be freed. */
3649	rele_jseg(jseg);
3650}
3651
3652/*
3653 * Determine which jsegs are ready for completion processing.  Waits for
3654 * synchronize cache to complete as well as forcing in-order completion
3655 * of journal entries.
3656 */
3657static void
3658complete_jsegs(jseg)
3659	struct jseg *jseg;
3660{
3661	struct jblocks *jblocks;
3662	struct jseg *jsegn;
3663
3664	jblocks = jseg->js_jblocks;
3665	/*
3666	 * Don't allow out of order completions.  If this isn't the first
3667	 * block wait for it to write before we're done.
3668	 */
3669	if (jseg != jblocks->jb_writeseg)
3670		return;
3671	/* Iterate through available jsegs processing their entries. */
3672	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3673		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3674		jsegn = TAILQ_NEXT(jseg, js_next);
3675		complete_jseg(jseg);
3676		jseg = jsegn;
3677	}
3678	jblocks->jb_writeseg = jseg;
3679	/*
3680	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3681	 */
3682	free_jsegs(jblocks);
3683}
3684
3685/*
3686 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3687 * the final completions.
3688 */
3689static void
3690handle_written_jseg(jseg, bp)
3691	struct jseg *jseg;
3692	struct buf *bp;
3693{
3694
3695	if (jseg->js_refs == 0)
3696		panic("handle_written_jseg: No self-reference on %p", jseg);
3697	jseg->js_state |= DEPCOMPLETE;
3698	/*
3699	 * We'll never need this buffer again, set flags so it will be
3700	 * discarded.
3701	 */
3702	bp->b_flags |= B_INVAL | B_NOCACHE;
3703	pbrelvp(bp);
3704	complete_jsegs(jseg);
3705}
3706
3707static inline struct jsegdep *
3708inoref_jseg(inoref)
3709	struct inoref *inoref;
3710{
3711	struct jsegdep *jsegdep;
3712
3713	jsegdep = inoref->if_jsegdep;
3714	inoref->if_jsegdep = NULL;
3715
3716	return (jsegdep);
3717}
3718
3719/*
3720 * Called once a jremref has made it to stable store.  The jremref is marked
3721 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3722 * for the jremref to complete will be awoken by free_jremref.
3723 */
3724static void
3725handle_written_jremref(jremref)
3726	struct jremref *jremref;
3727{
3728	struct inodedep *inodedep;
3729	struct jsegdep *jsegdep;
3730	struct dirrem *dirrem;
3731
3732	/* Grab the jsegdep. */
3733	jsegdep = inoref_jseg(&jremref->jr_ref);
3734	/*
3735	 * Remove us from the inoref list.
3736	 */
3737	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3738	    0, &inodedep) == 0)
3739		panic("handle_written_jremref: Lost inodedep");
3740	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3741	/*
3742	 * Complete the dirrem.
3743	 */
3744	dirrem = jremref->jr_dirrem;
3745	jremref->jr_dirrem = NULL;
3746	LIST_REMOVE(jremref, jr_deps);
3747	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3748	jwork_insert(&dirrem->dm_jwork, jsegdep);
3749	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3750	    (dirrem->dm_state & COMPLETE) != 0)
3751		add_to_worklist(&dirrem->dm_list, 0);
3752	free_jremref(jremref);
3753}
3754
3755/*
3756 * Called once a jaddref has made it to stable store.  The dependency is
3757 * marked complete and any dependent structures are added to the inode
3758 * bufwait list to be completed as soon as it is written.  If a bitmap write
3759 * depends on this entry we move the inode into the inodedephd of the
3760 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3761 */
3762static void
3763handle_written_jaddref(jaddref)
3764	struct jaddref *jaddref;
3765{
3766	struct jsegdep *jsegdep;
3767	struct inodedep *inodedep;
3768	struct diradd *diradd;
3769	struct mkdir *mkdir;
3770
3771	/* Grab the jsegdep. */
3772	jsegdep = inoref_jseg(&jaddref->ja_ref);
3773	mkdir = NULL;
3774	diradd = NULL;
3775	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3776	    0, &inodedep) == 0)
3777		panic("handle_written_jaddref: Lost inodedep.");
3778	if (jaddref->ja_diradd == NULL)
3779		panic("handle_written_jaddref: No dependency");
3780	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3781		diradd = jaddref->ja_diradd;
3782		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3783	} else if (jaddref->ja_state & MKDIR_PARENT) {
3784		mkdir = jaddref->ja_mkdir;
3785		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3786	} else if (jaddref->ja_state & MKDIR_BODY)
3787		mkdir = jaddref->ja_mkdir;
3788	else
3789		panic("handle_written_jaddref: Unknown dependency %p",
3790		    jaddref->ja_diradd);
3791	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3792	/*
3793	 * Remove us from the inode list.
3794	 */
3795	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3796	/*
3797	 * The mkdir may be waiting on the jaddref to clear before freeing.
3798	 */
3799	if (mkdir) {
3800		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3801		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3802		    TYPENAME(mkdir->md_list.wk_type)));
3803		mkdir->md_jaddref = NULL;
3804		diradd = mkdir->md_diradd;
3805		mkdir->md_state |= DEPCOMPLETE;
3806		complete_mkdir(mkdir);
3807	}
3808	jwork_insert(&diradd->da_jwork, jsegdep);
3809	if (jaddref->ja_state & NEWBLOCK) {
3810		inodedep->id_state |= ONDEPLIST;
3811		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3812		    inodedep, id_deps);
3813	}
3814	free_jaddref(jaddref);
3815}
3816
3817/*
3818 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3819 * is placed in the bmsafemap to await notification of a written bitmap.  If
3820 * the operation was canceled we add the segdep to the appropriate
3821 * dependency to free the journal space once the canceling operation
3822 * completes.
3823 */
3824static void
3825handle_written_jnewblk(jnewblk)
3826	struct jnewblk *jnewblk;
3827{
3828	struct bmsafemap *bmsafemap;
3829	struct freefrag *freefrag;
3830	struct freework *freework;
3831	struct jsegdep *jsegdep;
3832	struct newblk *newblk;
3833
3834	/* Grab the jsegdep. */
3835	jsegdep = jnewblk->jn_jsegdep;
3836	jnewblk->jn_jsegdep = NULL;
3837	if (jnewblk->jn_dep == NULL)
3838		panic("handle_written_jnewblk: No dependency for the segdep.");
3839	switch (jnewblk->jn_dep->wk_type) {
3840	case D_NEWBLK:
3841	case D_ALLOCDIRECT:
3842	case D_ALLOCINDIR:
3843		/*
3844		 * Add the written block to the bmsafemap so it can
3845		 * be notified when the bitmap is on disk.
3846		 */
3847		newblk = WK_NEWBLK(jnewblk->jn_dep);
3848		newblk->nb_jnewblk = NULL;
3849		if ((newblk->nb_state & GOINGAWAY) == 0) {
3850			bmsafemap = newblk->nb_bmsafemap;
3851			newblk->nb_state |= ONDEPLIST;
3852			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3853			    nb_deps);
3854		}
3855		jwork_insert(&newblk->nb_jwork, jsegdep);
3856		break;
3857	case D_FREEFRAG:
3858		/*
3859		 * A newblock being removed by a freefrag when replaced by
3860		 * frag extension.
3861		 */
3862		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3863		freefrag->ff_jdep = NULL;
3864		jwork_insert(&freefrag->ff_jwork, jsegdep);
3865		break;
3866	case D_FREEWORK:
3867		/*
3868		 * A direct block was removed by truncate.
3869		 */
3870		freework = WK_FREEWORK(jnewblk->jn_dep);
3871		freework->fw_jnewblk = NULL;
3872		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3873		break;
3874	default:
3875		panic("handle_written_jnewblk: Unknown type %d.",
3876		    jnewblk->jn_dep->wk_type);
3877	}
3878	jnewblk->jn_dep = NULL;
3879	free_jnewblk(jnewblk);
3880}
3881
3882/*
3883 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3884 * an in-flight allocation that has not yet been committed.  Divorce us
3885 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3886 * to the worklist.
3887 */
3888static void
3889cancel_jfreefrag(jfreefrag)
3890	struct jfreefrag *jfreefrag;
3891{
3892	struct freefrag *freefrag;
3893
3894	if (jfreefrag->fr_jsegdep) {
3895		free_jsegdep(jfreefrag->fr_jsegdep);
3896		jfreefrag->fr_jsegdep = NULL;
3897	}
3898	freefrag = jfreefrag->fr_freefrag;
3899	jfreefrag->fr_freefrag = NULL;
3900	free_jfreefrag(jfreefrag);
3901	freefrag->ff_state |= DEPCOMPLETE;
3902	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3903}
3904
3905/*
3906 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3907 */
3908static void
3909free_jfreefrag(jfreefrag)
3910	struct jfreefrag *jfreefrag;
3911{
3912
3913	if (jfreefrag->fr_state & INPROGRESS)
3914		WORKLIST_REMOVE(&jfreefrag->fr_list);
3915	else if (jfreefrag->fr_state & ONWORKLIST)
3916		remove_from_journal(&jfreefrag->fr_list);
3917	if (jfreefrag->fr_freefrag != NULL)
3918		panic("free_jfreefrag:  Still attached to a freefrag.");
3919	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3920}
3921
3922/*
3923 * Called when the journal write for a jfreefrag completes.  The parent
3924 * freefrag is added to the worklist if this completes its dependencies.
3925 */
3926static void
3927handle_written_jfreefrag(jfreefrag)
3928	struct jfreefrag *jfreefrag;
3929{
3930	struct jsegdep *jsegdep;
3931	struct freefrag *freefrag;
3932
3933	/* Grab the jsegdep. */
3934	jsegdep = jfreefrag->fr_jsegdep;
3935	jfreefrag->fr_jsegdep = NULL;
3936	freefrag = jfreefrag->fr_freefrag;
3937	if (freefrag == NULL)
3938		panic("handle_written_jfreefrag: No freefrag.");
3939	freefrag->ff_state |= DEPCOMPLETE;
3940	freefrag->ff_jdep = NULL;
3941	jwork_insert(&freefrag->ff_jwork, jsegdep);
3942	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3943		add_to_worklist(&freefrag->ff_list, 0);
3944	jfreefrag->fr_freefrag = NULL;
3945	free_jfreefrag(jfreefrag);
3946}
3947
3948/*
3949 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3950 * is removed from the freeblks list of pending journal writes and the
3951 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3952 * have been reclaimed.
3953 */
3954static void
3955handle_written_jblkdep(jblkdep)
3956	struct jblkdep *jblkdep;
3957{
3958	struct freeblks *freeblks;
3959	struct jsegdep *jsegdep;
3960
3961	/* Grab the jsegdep. */
3962	jsegdep = jblkdep->jb_jsegdep;
3963	jblkdep->jb_jsegdep = NULL;
3964	freeblks = jblkdep->jb_freeblks;
3965	LIST_REMOVE(jblkdep, jb_deps);
3966	jwork_insert(&freeblks->fb_jwork, jsegdep);
3967	/*
3968	 * If the freeblks is all journaled, we can add it to the worklist.
3969	 */
3970	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3971	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3972		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3973
3974	free_jblkdep(jblkdep);
3975}
3976
3977static struct jsegdep *
3978newjsegdep(struct worklist *wk)
3979{
3980	struct jsegdep *jsegdep;
3981
3982	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3983	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3984	jsegdep->jd_seg = NULL;
3985
3986	return (jsegdep);
3987}
3988
3989static struct jmvref *
3990newjmvref(dp, ino, oldoff, newoff)
3991	struct inode *dp;
3992	ino_t ino;
3993	off_t oldoff;
3994	off_t newoff;
3995{
3996	struct jmvref *jmvref;
3997
3998	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3999	workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
4000	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
4001	jmvref->jm_parent = dp->i_number;
4002	jmvref->jm_ino = ino;
4003	jmvref->jm_oldoff = oldoff;
4004	jmvref->jm_newoff = newoff;
4005
4006	return (jmvref);
4007}
4008
4009/*
4010 * Allocate a new jremref that tracks the removal of ip from dp with the
4011 * directory entry offset of diroff.  Mark the entry as ATTACHED and
4012 * DEPCOMPLETE as we have all the information required for the journal write
4013 * and the directory has already been removed from the buffer.  The caller
4014 * is responsible for linking the jremref into the pagedep and adding it
4015 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4016 * a DOTDOT addition so handle_workitem_remove() can properly assign
4017 * the jsegdep when we're done.
4018 */
4019static struct jremref *
4020newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4021    off_t diroff, nlink_t nlink)
4022{
4023	struct jremref *jremref;
4024
4025	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4026	workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
4027	jremref->jr_state = ATTACHED;
4028	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4029	   nlink, ip->i_mode);
4030	jremref->jr_dirrem = dirrem;
4031
4032	return (jremref);
4033}
4034
4035static inline void
4036newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4037    nlink_t nlink, uint16_t mode)
4038{
4039
4040	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4041	inoref->if_diroff = diroff;
4042	inoref->if_ino = ino;
4043	inoref->if_parent = parent;
4044	inoref->if_nlink = nlink;
4045	inoref->if_mode = mode;
4046}
4047
4048/*
4049 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4050 * directory offset may not be known until later.  The caller is responsible
4051 * adding the entry to the journal when this information is available.  nlink
4052 * should be the link count prior to the addition and mode is only required
4053 * to have the correct FMT.
4054 */
4055static struct jaddref *
4056newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4057    uint16_t mode)
4058{
4059	struct jaddref *jaddref;
4060
4061	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4062	workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
4063	jaddref->ja_state = ATTACHED;
4064	jaddref->ja_mkdir = NULL;
4065	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4066
4067	return (jaddref);
4068}
4069
4070/*
4071 * Create a new free dependency for a freework.  The caller is responsible
4072 * for adjusting the reference count when it has the lock held.  The freedep
4073 * will track an outstanding bitmap write that will ultimately clear the
4074 * freework to continue.
4075 */
4076static struct freedep *
4077newfreedep(struct freework *freework)
4078{
4079	struct freedep *freedep;
4080
4081	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4082	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4083	freedep->fd_freework = freework;
4084
4085	return (freedep);
4086}
4087
4088/*
4089 * Free a freedep structure once the buffer it is linked to is written.  If
4090 * this is the last reference to the freework schedule it for completion.
4091 */
4092static void
4093free_freedep(freedep)
4094	struct freedep *freedep;
4095{
4096	struct freework *freework;
4097
4098	freework = freedep->fd_freework;
4099	freework->fw_freeblks->fb_cgwait--;
4100	if (--freework->fw_ref == 0)
4101		freework_enqueue(freework);
4102	WORKITEM_FREE(freedep, D_FREEDEP);
4103}
4104
4105/*
4106 * Allocate a new freework structure that may be a level in an indirect
4107 * when parent is not NULL or a top level block when it is.  The top level
4108 * freework structures are allocated without the per-filesystem lock held
4109 * and before the freeblks is visible outside of softdep_setup_freeblocks().
4110 */
4111static struct freework *
4112newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4113	struct ufsmount *ump;
4114	struct freeblks *freeblks;
4115	struct freework *parent;
4116	ufs_lbn_t lbn;
4117	ufs2_daddr_t nb;
4118	int frags;
4119	int off;
4120	int journal;
4121{
4122	struct freework *freework;
4123
4124	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4125	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4126	freework->fw_state = ATTACHED;
4127	freework->fw_jnewblk = NULL;
4128	freework->fw_freeblks = freeblks;
4129	freework->fw_parent = parent;
4130	freework->fw_lbn = lbn;
4131	freework->fw_blkno = nb;
4132	freework->fw_frags = frags;
4133	freework->fw_indir = NULL;
4134	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 ||
4135	    lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
4136	freework->fw_start = freework->fw_off = off;
4137	if (journal)
4138		newjfreeblk(freeblks, lbn, nb, frags);
4139	if (parent == NULL) {
4140		ACQUIRE_LOCK(ump);
4141		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4142		freeblks->fb_ref++;
4143		FREE_LOCK(ump);
4144	}
4145
4146	return (freework);
4147}
4148
4149/*
4150 * Eliminate a jfreeblk for a block that does not need journaling.
4151 */
4152static void
4153cancel_jfreeblk(freeblks, blkno)
4154	struct freeblks *freeblks;
4155	ufs2_daddr_t blkno;
4156{
4157	struct jfreeblk *jfreeblk;
4158	struct jblkdep *jblkdep;
4159
4160	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4161		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4162			continue;
4163		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4164		if (jfreeblk->jf_blkno == blkno)
4165			break;
4166	}
4167	if (jblkdep == NULL)
4168		return;
4169	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4170	free_jsegdep(jblkdep->jb_jsegdep);
4171	LIST_REMOVE(jblkdep, jb_deps);
4172	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4173}
4174
4175/*
4176 * Allocate a new jfreeblk to journal top level block pointer when truncating
4177 * a file.  The caller must add this to the worklist when the per-filesystem
4178 * lock is held.
4179 */
4180static struct jfreeblk *
4181newjfreeblk(freeblks, lbn, blkno, frags)
4182	struct freeblks *freeblks;
4183	ufs_lbn_t lbn;
4184	ufs2_daddr_t blkno;
4185	int frags;
4186{
4187	struct jfreeblk *jfreeblk;
4188
4189	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4190	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4191	    freeblks->fb_list.wk_mp);
4192	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4193	jfreeblk->jf_dep.jb_freeblks = freeblks;
4194	jfreeblk->jf_ino = freeblks->fb_inum;
4195	jfreeblk->jf_lbn = lbn;
4196	jfreeblk->jf_blkno = blkno;
4197	jfreeblk->jf_frags = frags;
4198	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4199
4200	return (jfreeblk);
4201}
4202
4203/*
4204 * The journal is only prepared to handle full-size block numbers, so we
4205 * have to adjust the record to reflect the change to a full-size block.
4206 * For example, suppose we have a block made up of fragments 8-15 and
4207 * want to free its last two fragments. We are given a request that says:
4208 *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4209 * where frags are the number of fragments to free and oldfrags are the
4210 * number of fragments to keep. To block align it, we have to change it to
4211 * have a valid full-size blkno, so it becomes:
4212 *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4213 */
4214static void
4215adjust_newfreework(freeblks, frag_offset)
4216	struct freeblks *freeblks;
4217	int frag_offset;
4218{
4219	struct jfreeblk *jfreeblk;
4220
4221	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4222	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4223	    ("adjust_newfreework: Missing freeblks dependency"));
4224
4225	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4226	jfreeblk->jf_blkno -= frag_offset;
4227	jfreeblk->jf_frags += frag_offset;
4228}
4229
4230/*
4231 * Allocate a new jtrunc to track a partial truncation.
4232 */
4233static struct jtrunc *
4234newjtrunc(freeblks, size, extsize)
4235	struct freeblks *freeblks;
4236	off_t size;
4237	int extsize;
4238{
4239	struct jtrunc *jtrunc;
4240
4241	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4242	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4243	    freeblks->fb_list.wk_mp);
4244	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4245	jtrunc->jt_dep.jb_freeblks = freeblks;
4246	jtrunc->jt_ino = freeblks->fb_inum;
4247	jtrunc->jt_size = size;
4248	jtrunc->jt_extsize = extsize;
4249	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4250
4251	return (jtrunc);
4252}
4253
4254/*
4255 * If we're canceling a new bitmap we have to search for another ref
4256 * to move into the bmsafemap dep.  This might be better expressed
4257 * with another structure.
4258 */
4259static void
4260move_newblock_dep(jaddref, inodedep)
4261	struct jaddref *jaddref;
4262	struct inodedep *inodedep;
4263{
4264	struct inoref *inoref;
4265	struct jaddref *jaddrefn;
4266
4267	jaddrefn = NULL;
4268	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4269	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4270		if ((jaddref->ja_state & NEWBLOCK) &&
4271		    inoref->if_list.wk_type == D_JADDREF) {
4272			jaddrefn = (struct jaddref *)inoref;
4273			break;
4274		}
4275	}
4276	if (jaddrefn == NULL)
4277		return;
4278	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4279	jaddrefn->ja_state |= jaddref->ja_state &
4280	    (ATTACHED | UNDONE | NEWBLOCK);
4281	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4282	jaddref->ja_state |= ATTACHED;
4283	LIST_REMOVE(jaddref, ja_bmdeps);
4284	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4285	    ja_bmdeps);
4286}
4287
4288/*
4289 * Cancel a jaddref either before it has been written or while it is being
4290 * written.  This happens when a link is removed before the add reaches
4291 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4292 * and inode to prevent the link count or bitmap from reaching the disk
4293 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4294 * required.
4295 *
4296 * Returns 1 if the canceled addref requires journaling of the remove and
4297 * 0 otherwise.
4298 */
4299static int
4300cancel_jaddref(jaddref, inodedep, wkhd)
4301	struct jaddref *jaddref;
4302	struct inodedep *inodedep;
4303	struct workhead *wkhd;
4304{
4305	struct inoref *inoref;
4306	struct jsegdep *jsegdep;
4307	int needsj;
4308
4309	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4310	    ("cancel_jaddref: Canceling complete jaddref"));
4311	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4312		needsj = 1;
4313	else
4314		needsj = 0;
4315	if (inodedep == NULL)
4316		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4317		    0, &inodedep) == 0)
4318			panic("cancel_jaddref: Lost inodedep");
4319	/*
4320	 * We must adjust the nlink of any reference operation that follows
4321	 * us so that it is consistent with the in-memory reference.  This
4322	 * ensures that inode nlink rollbacks always have the correct link.
4323	 */
4324	if (needsj == 0) {
4325		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4326		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4327			if (inoref->if_state & GOINGAWAY)
4328				break;
4329			inoref->if_nlink--;
4330		}
4331	}
4332	jsegdep = inoref_jseg(&jaddref->ja_ref);
4333	if (jaddref->ja_state & NEWBLOCK)
4334		move_newblock_dep(jaddref, inodedep);
4335	wake_worklist(&jaddref->ja_list);
4336	jaddref->ja_mkdir = NULL;
4337	if (jaddref->ja_state & INPROGRESS) {
4338		jaddref->ja_state &= ~INPROGRESS;
4339		WORKLIST_REMOVE(&jaddref->ja_list);
4340		jwork_insert(wkhd, jsegdep);
4341	} else {
4342		free_jsegdep(jsegdep);
4343		if (jaddref->ja_state & DEPCOMPLETE)
4344			remove_from_journal(&jaddref->ja_list);
4345	}
4346	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4347	/*
4348	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4349	 * can arrange for them to be freed with the bitmap.  Otherwise we
4350	 * no longer need this addref attached to the inoreflst and it
4351	 * will incorrectly adjust nlink if we leave it.
4352	 */
4353	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4354		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4355		    if_deps);
4356		jaddref->ja_state |= COMPLETE;
4357		free_jaddref(jaddref);
4358		return (needsj);
4359	}
4360	/*
4361	 * Leave the head of the list for jsegdeps for fast merging.
4362	 */
4363	if (LIST_FIRST(wkhd) != NULL) {
4364		jaddref->ja_state |= ONWORKLIST;
4365		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4366	} else
4367		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4368
4369	return (needsj);
4370}
4371
4372/*
4373 * Attempt to free a jaddref structure when some work completes.  This
4374 * should only succeed once the entry is written and all dependencies have
4375 * been notified.
4376 */
4377static void
4378free_jaddref(jaddref)
4379	struct jaddref *jaddref;
4380{
4381
4382	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4383		return;
4384	if (jaddref->ja_ref.if_jsegdep)
4385		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4386		    jaddref, jaddref->ja_state);
4387	if (jaddref->ja_state & NEWBLOCK)
4388		LIST_REMOVE(jaddref, ja_bmdeps);
4389	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4390		panic("free_jaddref: Bad state %p(0x%X)",
4391		    jaddref, jaddref->ja_state);
4392	if (jaddref->ja_mkdir != NULL)
4393		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4394	WORKITEM_FREE(jaddref, D_JADDREF);
4395}
4396
4397/*
4398 * Free a jremref structure once it has been written or discarded.
4399 */
4400static void
4401free_jremref(jremref)
4402	struct jremref *jremref;
4403{
4404
4405	if (jremref->jr_ref.if_jsegdep)
4406		free_jsegdep(jremref->jr_ref.if_jsegdep);
4407	if (jremref->jr_state & INPROGRESS)
4408		panic("free_jremref: IO still pending");
4409	WORKITEM_FREE(jremref, D_JREMREF);
4410}
4411
4412/*
4413 * Free a jnewblk structure.
4414 */
4415static void
4416free_jnewblk(jnewblk)
4417	struct jnewblk *jnewblk;
4418{
4419
4420	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4421		return;
4422	LIST_REMOVE(jnewblk, jn_deps);
4423	if (jnewblk->jn_dep != NULL)
4424		panic("free_jnewblk: Dependency still attached.");
4425	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4426}
4427
4428/*
4429 * Cancel a jnewblk which has been been made redundant by frag extension.
4430 */
4431static void
4432cancel_jnewblk(jnewblk, wkhd)
4433	struct jnewblk *jnewblk;
4434	struct workhead *wkhd;
4435{
4436	struct jsegdep *jsegdep;
4437
4438	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4439	jsegdep = jnewblk->jn_jsegdep;
4440	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4441		panic("cancel_jnewblk: Invalid state");
4442	jnewblk->jn_jsegdep  = NULL;
4443	jnewblk->jn_dep = NULL;
4444	jnewblk->jn_state |= GOINGAWAY;
4445	if (jnewblk->jn_state & INPROGRESS) {
4446		jnewblk->jn_state &= ~INPROGRESS;
4447		WORKLIST_REMOVE(&jnewblk->jn_list);
4448		jwork_insert(wkhd, jsegdep);
4449	} else {
4450		free_jsegdep(jsegdep);
4451		remove_from_journal(&jnewblk->jn_list);
4452	}
4453	wake_worklist(&jnewblk->jn_list);
4454	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4455}
4456
4457static void
4458free_jblkdep(jblkdep)
4459	struct jblkdep *jblkdep;
4460{
4461
4462	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4463		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4464	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4465		WORKITEM_FREE(jblkdep, D_JTRUNC);
4466	else
4467		panic("free_jblkdep: Unexpected type %s",
4468		    TYPENAME(jblkdep->jb_list.wk_type));
4469}
4470
4471/*
4472 * Free a single jseg once it is no longer referenced in memory or on
4473 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4474 * to disappear.
4475 */
4476static void
4477free_jseg(jseg, jblocks)
4478	struct jseg *jseg;
4479	struct jblocks *jblocks;
4480{
4481	struct freework *freework;
4482
4483	/*
4484	 * Free freework structures that were lingering to indicate freed
4485	 * indirect blocks that forced journal write ordering on reallocate.
4486	 */
4487	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4488		indirblk_remove(freework);
4489	if (jblocks->jb_oldestseg == jseg)
4490		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4491	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4492	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4493	KASSERT(LIST_EMPTY(&jseg->js_entries),
4494	    ("free_jseg: Freed jseg has valid entries."));
4495	WORKITEM_FREE(jseg, D_JSEG);
4496}
4497
4498/*
4499 * Free all jsegs that meet the criteria for being reclaimed and update
4500 * oldestseg.
4501 */
4502static void
4503free_jsegs(jblocks)
4504	struct jblocks *jblocks;
4505{
4506	struct jseg *jseg;
4507
4508	/*
4509	 * Free only those jsegs which have none allocated before them to
4510	 * preserve the journal space ordering.
4511	 */
4512	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4513		/*
4514		 * Only reclaim space when nothing depends on this journal
4515		 * set and another set has written that it is no longer
4516		 * valid.
4517		 */
4518		if (jseg->js_refs != 0) {
4519			jblocks->jb_oldestseg = jseg;
4520			return;
4521		}
4522		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4523			break;
4524		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4525			break;
4526		/*
4527		 * We can free jsegs that didn't write entries when
4528		 * oldestwrseq == js_seq.
4529		 */
4530		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4531		    jseg->js_cnt != 0)
4532			break;
4533		free_jseg(jseg, jblocks);
4534	}
4535	/*
4536	 * If we exited the loop above we still must discover the
4537	 * oldest valid segment.
4538	 */
4539	if (jseg)
4540		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4541		     jseg = TAILQ_NEXT(jseg, js_next))
4542			if (jseg->js_refs != 0)
4543				break;
4544	jblocks->jb_oldestseg = jseg;
4545	/*
4546	 * The journal has no valid records but some jsegs may still be
4547	 * waiting on oldestwrseq to advance.  We force a small record
4548	 * out to permit these lingering records to be reclaimed.
4549	 */
4550	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4551		jblocks->jb_needseg = 1;
4552}
4553
4554/*
4555 * Release one reference to a jseg and free it if the count reaches 0.  This
4556 * should eventually reclaim journal space as well.
4557 */
4558static void
4559rele_jseg(jseg)
4560	struct jseg *jseg;
4561{
4562
4563	KASSERT(jseg->js_refs > 0,
4564	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4565	if (--jseg->js_refs != 0)
4566		return;
4567	free_jsegs(jseg->js_jblocks);
4568}
4569
4570/*
4571 * Release a jsegdep and decrement the jseg count.
4572 */
4573static void
4574free_jsegdep(jsegdep)
4575	struct jsegdep *jsegdep;
4576{
4577
4578	if (jsegdep->jd_seg)
4579		rele_jseg(jsegdep->jd_seg);
4580	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4581}
4582
4583/*
4584 * Wait for a journal item to make it to disk.  Initiate journal processing
4585 * if required.
4586 */
4587static int
4588jwait(wk, waitfor)
4589	struct worklist *wk;
4590	int waitfor;
4591{
4592
4593	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4594	/*
4595	 * Blocking journal waits cause slow synchronous behavior.  Record
4596	 * stats on the frequency of these blocking operations.
4597	 */
4598	if (waitfor == MNT_WAIT) {
4599		stat_journal_wait++;
4600		switch (wk->wk_type) {
4601		case D_JREMREF:
4602		case D_JMVREF:
4603			stat_jwait_filepage++;
4604			break;
4605		case D_JTRUNC:
4606		case D_JFREEBLK:
4607			stat_jwait_freeblks++;
4608			break;
4609		case D_JNEWBLK:
4610			stat_jwait_newblk++;
4611			break;
4612		case D_JADDREF:
4613			stat_jwait_inode++;
4614			break;
4615		default:
4616			break;
4617		}
4618	}
4619	/*
4620	 * If IO has not started we process the journal.  We can't mark the
4621	 * worklist item as IOWAITING because we drop the lock while
4622	 * processing the journal and the worklist entry may be freed after
4623	 * this point.  The caller may call back in and re-issue the request.
4624	 */
4625	if ((wk->wk_state & INPROGRESS) == 0) {
4626		softdep_process_journal(wk->wk_mp, wk, waitfor);
4627		if (waitfor != MNT_WAIT)
4628			return (EBUSY);
4629		return (0);
4630	}
4631	if (waitfor != MNT_WAIT)
4632		return (EBUSY);
4633	wait_worklist(wk, "jwait");
4634	return (0);
4635}
4636
4637/*
4638 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4639 * appropriate.  This is a convenience function to reduce duplicate code
4640 * for the setup and revert functions below.
4641 */
4642static struct inodedep *
4643inodedep_lookup_ip(ip)
4644	struct inode *ip;
4645{
4646	struct inodedep *inodedep;
4647
4648	KASSERT(ip->i_nlink >= ip->i_effnlink,
4649	    ("inodedep_lookup_ip: bad delta"));
4650	(void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
4651	    &inodedep);
4652	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4653	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4654
4655	return (inodedep);
4656}
4657
4658/*
4659 * Called prior to creating a new inode and linking it to a directory.  The
4660 * jaddref structure must already be allocated by softdep_setup_inomapdep
4661 * and it is discovered here so we can initialize the mode and update
4662 * nlinkdelta.
4663 */
4664void
4665softdep_setup_create(dp, ip)
4666	struct inode *dp;
4667	struct inode *ip;
4668{
4669	struct inodedep *inodedep;
4670	struct jaddref *jaddref;
4671	struct vnode *dvp;
4672
4673	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4674	    ("softdep_setup_create called on non-softdep filesystem"));
4675	KASSERT(ip->i_nlink == 1,
4676	    ("softdep_setup_create: Invalid link count."));
4677	dvp = ITOV(dp);
4678	ACQUIRE_LOCK(ITOUMP(dp));
4679	inodedep = inodedep_lookup_ip(ip);
4680	if (DOINGSUJ(dvp)) {
4681		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4682		    inoreflst);
4683		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4684		    ("softdep_setup_create: No addref structure present."));
4685	}
4686	softdep_prelink(dvp, NULL);
4687	FREE_LOCK(ITOUMP(dp));
4688}
4689
4690/*
4691 * Create a jaddref structure to track the addition of a DOTDOT link when
4692 * we are reparenting an inode as part of a rename.  This jaddref will be
4693 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4694 * non-journaling softdep.
4695 */
4696void
4697softdep_setup_dotdot_link(dp, ip)
4698	struct inode *dp;
4699	struct inode *ip;
4700{
4701	struct inodedep *inodedep;
4702	struct jaddref *jaddref;
4703	struct vnode *dvp;
4704
4705	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4706	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4707	dvp = ITOV(dp);
4708	jaddref = NULL;
4709	/*
4710	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4711	 * is used as a normal link would be.
4712	 */
4713	if (DOINGSUJ(dvp))
4714		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4715		    dp->i_effnlink - 1, dp->i_mode);
4716	ACQUIRE_LOCK(ITOUMP(dp));
4717	inodedep = inodedep_lookup_ip(dp);
4718	if (jaddref)
4719		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4720		    if_deps);
4721	softdep_prelink(dvp, ITOV(ip));
4722	FREE_LOCK(ITOUMP(dp));
4723}
4724
4725/*
4726 * Create a jaddref structure to track a new link to an inode.  The directory
4727 * offset is not known until softdep_setup_directory_add or
4728 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4729 * softdep.
4730 */
4731void
4732softdep_setup_link(dp, ip)
4733	struct inode *dp;
4734	struct inode *ip;
4735{
4736	struct inodedep *inodedep;
4737	struct jaddref *jaddref;
4738	struct vnode *dvp;
4739
4740	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4741	    ("softdep_setup_link called on non-softdep filesystem"));
4742	dvp = ITOV(dp);
4743	jaddref = NULL;
4744	if (DOINGSUJ(dvp))
4745		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4746		    ip->i_mode);
4747	ACQUIRE_LOCK(ITOUMP(dp));
4748	inodedep = inodedep_lookup_ip(ip);
4749	if (jaddref)
4750		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4751		    if_deps);
4752	softdep_prelink(dvp, ITOV(ip));
4753	FREE_LOCK(ITOUMP(dp));
4754}
4755
4756/*
4757 * Called to create the jaddref structures to track . and .. references as
4758 * well as lookup and further initialize the incomplete jaddref created
4759 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4760 * nlinkdelta for non-journaling softdep.
4761 */
4762void
4763softdep_setup_mkdir(dp, ip)
4764	struct inode *dp;
4765	struct inode *ip;
4766{
4767	struct inodedep *inodedep;
4768	struct jaddref *dotdotaddref;
4769	struct jaddref *dotaddref;
4770	struct jaddref *jaddref;
4771	struct vnode *dvp;
4772
4773	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4774	    ("softdep_setup_mkdir called on non-softdep filesystem"));
4775	dvp = ITOV(dp);
4776	dotaddref = dotdotaddref = NULL;
4777	if (DOINGSUJ(dvp)) {
4778		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4779		    ip->i_mode);
4780		dotaddref->ja_state |= MKDIR_BODY;
4781		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4782		    dp->i_effnlink - 1, dp->i_mode);
4783		dotdotaddref->ja_state |= MKDIR_PARENT;
4784	}
4785	ACQUIRE_LOCK(ITOUMP(dp));
4786	inodedep = inodedep_lookup_ip(ip);
4787	if (DOINGSUJ(dvp)) {
4788		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4789		    inoreflst);
4790		KASSERT(jaddref != NULL,
4791		    ("softdep_setup_mkdir: No addref structure present."));
4792		KASSERT(jaddref->ja_parent == dp->i_number,
4793		    ("softdep_setup_mkdir: bad parent %ju",
4794		    (uintmax_t)jaddref->ja_parent));
4795		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4796		    if_deps);
4797	}
4798	inodedep = inodedep_lookup_ip(dp);
4799	if (DOINGSUJ(dvp))
4800		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4801		    &dotdotaddref->ja_ref, if_deps);
4802	softdep_prelink(ITOV(dp), NULL);
4803	FREE_LOCK(ITOUMP(dp));
4804}
4805
4806/*
4807 * Called to track nlinkdelta of the inode and parent directories prior to
4808 * unlinking a directory.
4809 */
4810void
4811softdep_setup_rmdir(dp, ip)
4812	struct inode *dp;
4813	struct inode *ip;
4814{
4815	struct vnode *dvp;
4816
4817	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4818	    ("softdep_setup_rmdir called on non-softdep filesystem"));
4819	dvp = ITOV(dp);
4820	ACQUIRE_LOCK(ITOUMP(dp));
4821	(void) inodedep_lookup_ip(ip);
4822	(void) inodedep_lookup_ip(dp);
4823	softdep_prelink(dvp, ITOV(ip));
4824	FREE_LOCK(ITOUMP(dp));
4825}
4826
4827/*
4828 * Called to track nlinkdelta of the inode and parent directories prior to
4829 * unlink.
4830 */
4831void
4832softdep_setup_unlink(dp, ip)
4833	struct inode *dp;
4834	struct inode *ip;
4835{
4836	struct vnode *dvp;
4837
4838	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4839	    ("softdep_setup_unlink called on non-softdep filesystem"));
4840	dvp = ITOV(dp);
4841	ACQUIRE_LOCK(ITOUMP(dp));
4842	(void) inodedep_lookup_ip(ip);
4843	(void) inodedep_lookup_ip(dp);
4844	softdep_prelink(dvp, ITOV(ip));
4845	FREE_LOCK(ITOUMP(dp));
4846}
4847
4848/*
4849 * Called to release the journal structures created by a failed non-directory
4850 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4851 */
4852void
4853softdep_revert_create(dp, ip)
4854	struct inode *dp;
4855	struct inode *ip;
4856{
4857	struct inodedep *inodedep;
4858	struct jaddref *jaddref;
4859	struct vnode *dvp;
4860
4861	KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
4862	    ("softdep_revert_create called on non-softdep filesystem"));
4863	dvp = ITOV(dp);
4864	ACQUIRE_LOCK(ITOUMP(dp));
4865	inodedep = inodedep_lookup_ip(ip);
4866	if (DOINGSUJ(dvp)) {
4867		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4868		    inoreflst);
4869		KASSERT(jaddref->ja_parent == dp->i_number,
4870		    ("softdep_revert_create: addref parent mismatch"));
4871		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4872	}
4873	FREE_LOCK(ITOUMP(dp));
4874}
4875
4876/*
4877 * Called to release the journal structures created by a failed link
4878 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4879 */
4880void
4881softdep_revert_link(dp, ip)
4882	struct inode *dp;
4883	struct inode *ip;
4884{
4885	struct inodedep *inodedep;
4886	struct jaddref *jaddref;
4887	struct vnode *dvp;
4888
4889	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4890	    ("softdep_revert_link called on non-softdep filesystem"));
4891	dvp = ITOV(dp);
4892	ACQUIRE_LOCK(ITOUMP(dp));
4893	inodedep = inodedep_lookup_ip(ip);
4894	if (DOINGSUJ(dvp)) {
4895		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4896		    inoreflst);
4897		KASSERT(jaddref->ja_parent == dp->i_number,
4898		    ("softdep_revert_link: addref parent mismatch"));
4899		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4900	}
4901	FREE_LOCK(ITOUMP(dp));
4902}
4903
4904/*
4905 * Called to release the journal structures created by a failed mkdir
4906 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4907 */
4908void
4909softdep_revert_mkdir(dp, ip)
4910	struct inode *dp;
4911	struct inode *ip;
4912{
4913	struct inodedep *inodedep;
4914	struct jaddref *jaddref;
4915	struct jaddref *dotaddref;
4916	struct vnode *dvp;
4917
4918	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4919	    ("softdep_revert_mkdir called on non-softdep filesystem"));
4920	dvp = ITOV(dp);
4921
4922	ACQUIRE_LOCK(ITOUMP(dp));
4923	inodedep = inodedep_lookup_ip(dp);
4924	if (DOINGSUJ(dvp)) {
4925		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4926		    inoreflst);
4927		KASSERT(jaddref->ja_parent == ip->i_number,
4928		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4929		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4930	}
4931	inodedep = inodedep_lookup_ip(ip);
4932	if (DOINGSUJ(dvp)) {
4933		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4934		    inoreflst);
4935		KASSERT(jaddref->ja_parent == dp->i_number,
4936		    ("softdep_revert_mkdir: addref parent mismatch"));
4937		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4938		    inoreflst, if_deps);
4939		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4940		KASSERT(dotaddref->ja_parent == ip->i_number,
4941		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4942		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4943	}
4944	FREE_LOCK(ITOUMP(dp));
4945}
4946
4947/*
4948 * Called to correct nlinkdelta after a failed rmdir.
4949 */
4950void
4951softdep_revert_rmdir(dp, ip)
4952	struct inode *dp;
4953	struct inode *ip;
4954{
4955
4956	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4957	    ("softdep_revert_rmdir called on non-softdep filesystem"));
4958	ACQUIRE_LOCK(ITOUMP(dp));
4959	(void) inodedep_lookup_ip(ip);
4960	(void) inodedep_lookup_ip(dp);
4961	FREE_LOCK(ITOUMP(dp));
4962}
4963
4964/*
4965 * Protecting the freemaps (or bitmaps).
4966 *
4967 * To eliminate the need to execute fsck before mounting a filesystem
4968 * after a power failure, one must (conservatively) guarantee that the
4969 * on-disk copy of the bitmaps never indicate that a live inode or block is
4970 * free.  So, when a block or inode is allocated, the bitmap should be
4971 * updated (on disk) before any new pointers.  When a block or inode is
4972 * freed, the bitmap should not be updated until all pointers have been
4973 * reset.  The latter dependency is handled by the delayed de-allocation
4974 * approach described below for block and inode de-allocation.  The former
4975 * dependency is handled by calling the following procedure when a block or
4976 * inode is allocated. When an inode is allocated an "inodedep" is created
4977 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4978 * Each "inodedep" is also inserted into the hash indexing structure so
4979 * that any additional link additions can be made dependent on the inode
4980 * allocation.
4981 *
4982 * The ufs filesystem maintains a number of free block counts (e.g., per
4983 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4984 * in addition to the bitmaps.  These counts are used to improve efficiency
4985 * during allocation and therefore must be consistent with the bitmaps.
4986 * There is no convenient way to guarantee post-crash consistency of these
4987 * counts with simple update ordering, for two main reasons: (1) The counts
4988 * and bitmaps for a single cylinder group block are not in the same disk
4989 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4990 * be written and the other not.  (2) Some of the counts are located in the
4991 * superblock rather than the cylinder group block. So, we focus our soft
4992 * updates implementation on protecting the bitmaps. When mounting a
4993 * filesystem, we recompute the auxiliary counts from the bitmaps.
4994 */
4995
4996/*
4997 * Called just after updating the cylinder group block to allocate an inode.
4998 */
4999void
5000softdep_setup_inomapdep(bp, ip, newinum, mode)
5001	struct buf *bp;		/* buffer for cylgroup block with inode map */
5002	struct inode *ip;	/* inode related to allocation */
5003	ino_t newinum;		/* new inode number being allocated */
5004	int mode;
5005{
5006	struct inodedep *inodedep;
5007	struct bmsafemap *bmsafemap;
5008	struct jaddref *jaddref;
5009	struct mount *mp;
5010	struct fs *fs;
5011
5012	mp = ITOVFS(ip);
5013	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5014	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
5015	fs = VFSTOUFS(mp)->um_fs;
5016	jaddref = NULL;
5017
5018	/*
5019	 * Allocate the journal reference add structure so that the bitmap
5020	 * can be dependent on it.
5021	 */
5022	if (MOUNTEDSUJ(mp)) {
5023		jaddref = newjaddref(ip, newinum, 0, 0, mode);
5024		jaddref->ja_state |= NEWBLOCK;
5025	}
5026
5027	/*
5028	 * Create a dependency for the newly allocated inode.
5029	 * Panic if it already exists as something is seriously wrong.
5030	 * Otherwise add it to the dependency list for the buffer holding
5031	 * the cylinder group map from which it was allocated.
5032	 *
5033	 * We have to preallocate a bmsafemap entry in case it is needed
5034	 * in bmsafemap_lookup since once we allocate the inodedep, we
5035	 * have to finish initializing it before we can FREE_LOCK().
5036	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5037	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5038	 * creating the inodedep as it can be freed during the time
5039	 * that we FREE_LOCK() while allocating the inodedep. We must
5040	 * call workitem_alloc() before entering the locked section as
5041	 * it also acquires the lock and we must avoid trying doing so
5042	 * recursively.
5043	 */
5044	bmsafemap = malloc(sizeof(struct bmsafemap),
5045	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5046	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5047	ACQUIRE_LOCK(ITOUMP(ip));
5048	if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5049		panic("softdep_setup_inomapdep: dependency %p for new"
5050		    "inode already exists", inodedep);
5051	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5052	if (jaddref) {
5053		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5054		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5055		    if_deps);
5056	} else {
5057		inodedep->id_state |= ONDEPLIST;
5058		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5059	}
5060	inodedep->id_bmsafemap = bmsafemap;
5061	inodedep->id_state &= ~DEPCOMPLETE;
5062	FREE_LOCK(ITOUMP(ip));
5063}
5064
5065/*
5066 * Called just after updating the cylinder group block to
5067 * allocate block or fragment.
5068 */
5069void
5070softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5071	struct buf *bp;		/* buffer for cylgroup block with block map */
5072	struct mount *mp;	/* filesystem doing allocation */
5073	ufs2_daddr_t newblkno;	/* number of newly allocated block */
5074	int frags;		/* Number of fragments. */
5075	int oldfrags;		/* Previous number of fragments for extend. */
5076{
5077	struct newblk *newblk;
5078	struct bmsafemap *bmsafemap;
5079	struct jnewblk *jnewblk;
5080	struct ufsmount *ump;
5081	struct fs *fs;
5082
5083	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5084	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5085	ump = VFSTOUFS(mp);
5086	fs = ump->um_fs;
5087	jnewblk = NULL;
5088	/*
5089	 * Create a dependency for the newly allocated block.
5090	 * Add it to the dependency list for the buffer holding
5091	 * the cylinder group map from which it was allocated.
5092	 */
5093	if (MOUNTEDSUJ(mp)) {
5094		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5095		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5096		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5097		jnewblk->jn_state = ATTACHED;
5098		jnewblk->jn_blkno = newblkno;
5099		jnewblk->jn_frags = frags;
5100		jnewblk->jn_oldfrags = oldfrags;
5101#ifdef SUJ_DEBUG
5102		{
5103			struct cg *cgp;
5104			uint8_t *blksfree;
5105			long bno;
5106			int i;
5107
5108			cgp = (struct cg *)bp->b_data;
5109			blksfree = cg_blksfree(cgp);
5110			bno = dtogd(fs, jnewblk->jn_blkno);
5111			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5112			    i++) {
5113				if (isset(blksfree, bno + i))
5114					panic("softdep_setup_blkmapdep: "
5115					    "free fragment %d from %d-%d "
5116					    "state 0x%X dep %p", i,
5117					    jnewblk->jn_oldfrags,
5118					    jnewblk->jn_frags,
5119					    jnewblk->jn_state,
5120					    jnewblk->jn_dep);
5121			}
5122		}
5123#endif
5124	}
5125
5126	CTR3(KTR_SUJ,
5127	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5128	    newblkno, frags, oldfrags);
5129	ACQUIRE_LOCK(ump);
5130	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5131		panic("softdep_setup_blkmapdep: found block");
5132	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5133	    dtog(fs, newblkno), NULL);
5134	if (jnewblk) {
5135		jnewblk->jn_dep = (struct worklist *)newblk;
5136		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5137	} else {
5138		newblk->nb_state |= ONDEPLIST;
5139		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5140	}
5141	newblk->nb_bmsafemap = bmsafemap;
5142	newblk->nb_jnewblk = jnewblk;
5143	FREE_LOCK(ump);
5144}
5145
5146#define	BMSAFEMAP_HASH(ump, cg) \
5147      (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5148
5149static int
5150bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5151	struct bmsafemap_hashhead *bmsafemaphd;
5152	int cg;
5153	struct bmsafemap **bmsafemapp;
5154{
5155	struct bmsafemap *bmsafemap;
5156
5157	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5158		if (bmsafemap->sm_cg == cg)
5159			break;
5160	if (bmsafemap) {
5161		*bmsafemapp = bmsafemap;
5162		return (1);
5163	}
5164	*bmsafemapp = NULL;
5165
5166	return (0);
5167}
5168
5169/*
5170 * Find the bmsafemap associated with a cylinder group buffer.
5171 * If none exists, create one. The buffer must be locked when
5172 * this routine is called and this routine must be called with
5173 * the softdep lock held. To avoid giving up the lock while
5174 * allocating a new bmsafemap, a preallocated bmsafemap may be
5175 * provided. If it is provided but not needed, it is freed.
5176 */
5177static struct bmsafemap *
5178bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5179	struct mount *mp;
5180	struct buf *bp;
5181	int cg;
5182	struct bmsafemap *newbmsafemap;
5183{
5184	struct bmsafemap_hashhead *bmsafemaphd;
5185	struct bmsafemap *bmsafemap, *collision;
5186	struct worklist *wk;
5187	struct ufsmount *ump;
5188
5189	ump = VFSTOUFS(mp);
5190	LOCK_OWNED(ump);
5191	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5192	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5193		if (wk->wk_type == D_BMSAFEMAP) {
5194			if (newbmsafemap)
5195				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5196			return (WK_BMSAFEMAP(wk));
5197		}
5198	}
5199	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5200	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5201		if (newbmsafemap)
5202			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5203		return (bmsafemap);
5204	}
5205	if (newbmsafemap) {
5206		bmsafemap = newbmsafemap;
5207	} else {
5208		FREE_LOCK(ump);
5209		bmsafemap = malloc(sizeof(struct bmsafemap),
5210			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5211		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5212		ACQUIRE_LOCK(ump);
5213	}
5214	bmsafemap->sm_buf = bp;
5215	LIST_INIT(&bmsafemap->sm_inodedephd);
5216	LIST_INIT(&bmsafemap->sm_inodedepwr);
5217	LIST_INIT(&bmsafemap->sm_newblkhd);
5218	LIST_INIT(&bmsafemap->sm_newblkwr);
5219	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5220	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5221	LIST_INIT(&bmsafemap->sm_freehd);
5222	LIST_INIT(&bmsafemap->sm_freewr);
5223	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5224		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5225		return (collision);
5226	}
5227	bmsafemap->sm_cg = cg;
5228	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5229	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5230	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5231	return (bmsafemap);
5232}
5233
5234/*
5235 * Direct block allocation dependencies.
5236 *
5237 * When a new block is allocated, the corresponding disk locations must be
5238 * initialized (with zeros or new data) before the on-disk inode points to
5239 * them.  Also, the freemap from which the block was allocated must be
5240 * updated (on disk) before the inode's pointer. These two dependencies are
5241 * independent of each other and are needed for all file blocks and indirect
5242 * blocks that are pointed to directly by the inode.  Just before the
5243 * "in-core" version of the inode is updated with a newly allocated block
5244 * number, a procedure (below) is called to setup allocation dependency
5245 * structures.  These structures are removed when the corresponding
5246 * dependencies are satisfied or when the block allocation becomes obsolete
5247 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5248 * fragment that gets upgraded).  All of these cases are handled in
5249 * procedures described later.
5250 *
5251 * When a file extension causes a fragment to be upgraded, either to a larger
5252 * fragment or to a full block, the on-disk location may change (if the
5253 * previous fragment could not simply be extended). In this case, the old
5254 * fragment must be de-allocated, but not until after the inode's pointer has
5255 * been updated. In most cases, this is handled by later procedures, which
5256 * will construct a "freefrag" structure to be added to the workitem queue
5257 * when the inode update is complete (or obsolete).  The main exception to
5258 * this is when an allocation occurs while a pending allocation dependency
5259 * (for the same block pointer) remains.  This case is handled in the main
5260 * allocation dependency setup procedure by immediately freeing the
5261 * unreferenced fragments.
5262 */
5263void
5264softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5265	struct inode *ip;	/* inode to which block is being added */
5266	ufs_lbn_t off;		/* block pointer within inode */
5267	ufs2_daddr_t newblkno;	/* disk block number being added */
5268	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5269	long newsize;		/* size of new block */
5270	long oldsize;		/* size of new block */
5271	struct buf *bp;		/* bp for allocated block */
5272{
5273	struct allocdirect *adp, *oldadp;
5274	struct allocdirectlst *adphead;
5275	struct freefrag *freefrag;
5276	struct inodedep *inodedep;
5277	struct pagedep *pagedep;
5278	struct jnewblk *jnewblk;
5279	struct newblk *newblk;
5280	struct mount *mp;
5281	ufs_lbn_t lbn;
5282
5283	lbn = bp->b_lblkno;
5284	mp = ITOVFS(ip);
5285	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5286	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
5287	if (oldblkno && oldblkno != newblkno)
5288		/*
5289		 * The usual case is that a smaller fragment that
5290		 * was just allocated has been replaced with a bigger
5291		 * fragment or a full-size block. If it is marked as
5292		 * B_DELWRI, the current contents have not been written
5293		 * to disk. It is possible that the block was written
5294		 * earlier, but very uncommon. If the block has never
5295		 * been written, there is no need to send a BIO_DELETE
5296		 * for it when it is freed. The gain from avoiding the
5297		 * TRIMs for the common case of unwritten blocks far
5298		 * exceeds the cost of the write amplification for the
5299		 * uncommon case of failing to send a TRIM for a block
5300		 * that had been written.
5301		 */
5302		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5303		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5304	else
5305		freefrag = NULL;
5306
5307	CTR6(KTR_SUJ,
5308	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5309	    "off %jd newsize %ld oldsize %d",
5310	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5311	ACQUIRE_LOCK(ITOUMP(ip));
5312	if (off >= UFS_NDADDR) {
5313		if (lbn > 0)
5314			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5315			    lbn, off);
5316		/* allocating an indirect block */
5317		if (oldblkno != 0)
5318			panic("softdep_setup_allocdirect: non-zero indir");
5319	} else {
5320		if (off != lbn)
5321			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5322			    lbn, off);
5323		/*
5324		 * Allocating a direct block.
5325		 *
5326		 * If we are allocating a directory block, then we must
5327		 * allocate an associated pagedep to track additions and
5328		 * deletions.
5329		 */
5330		if ((ip->i_mode & IFMT) == IFDIR)
5331			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5332			    &pagedep);
5333	}
5334	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5335		panic("softdep_setup_allocdirect: lost block");
5336	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5337	    ("softdep_setup_allocdirect: newblk already initialized"));
5338	/*
5339	 * Convert the newblk to an allocdirect.
5340	 */
5341	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5342	adp = (struct allocdirect *)newblk;
5343	newblk->nb_freefrag = freefrag;
5344	adp->ad_offset = off;
5345	adp->ad_oldblkno = oldblkno;
5346	adp->ad_newsize = newsize;
5347	adp->ad_oldsize = oldsize;
5348
5349	/*
5350	 * Finish initializing the journal.
5351	 */
5352	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5353		jnewblk->jn_ino = ip->i_number;
5354		jnewblk->jn_lbn = lbn;
5355		add_to_journal(&jnewblk->jn_list);
5356	}
5357	if (freefrag && freefrag->ff_jdep != NULL &&
5358	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5359		add_to_journal(freefrag->ff_jdep);
5360	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5361	adp->ad_inodedep = inodedep;
5362
5363	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5364	/*
5365	 * The list of allocdirects must be kept in sorted and ascending
5366	 * order so that the rollback routines can quickly determine the
5367	 * first uncommitted block (the size of the file stored on disk
5368	 * ends at the end of the lowest committed fragment, or if there
5369	 * are no fragments, at the end of the highest committed block).
5370	 * Since files generally grow, the typical case is that the new
5371	 * block is to be added at the end of the list. We speed this
5372	 * special case by checking against the last allocdirect in the
5373	 * list before laboriously traversing the list looking for the
5374	 * insertion point.
5375	 */
5376	adphead = &inodedep->id_newinoupdt;
5377	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5378	if (oldadp == NULL || oldadp->ad_offset <= off) {
5379		/* insert at end of list */
5380		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5381		if (oldadp != NULL && oldadp->ad_offset == off)
5382			allocdirect_merge(adphead, adp, oldadp);
5383		FREE_LOCK(ITOUMP(ip));
5384		return;
5385	}
5386	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5387		if (oldadp->ad_offset >= off)
5388			break;
5389	}
5390	if (oldadp == NULL)
5391		panic("softdep_setup_allocdirect: lost entry");
5392	/* insert in middle of list */
5393	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5394	if (oldadp->ad_offset == off)
5395		allocdirect_merge(adphead, adp, oldadp);
5396
5397	FREE_LOCK(ITOUMP(ip));
5398}
5399
5400/*
5401 * Merge a newer and older journal record to be stored either in a
5402 * newblock or freefrag.  This handles aggregating journal records for
5403 * fragment allocation into a second record as well as replacing a
5404 * journal free with an aborted journal allocation.  A segment for the
5405 * oldest record will be placed on wkhd if it has been written.  If not
5406 * the segment for the newer record will suffice.
5407 */
5408static struct worklist *
5409jnewblk_merge(new, old, wkhd)
5410	struct worklist *new;
5411	struct worklist *old;
5412	struct workhead *wkhd;
5413{
5414	struct jnewblk *njnewblk;
5415	struct jnewblk *jnewblk;
5416
5417	/* Handle NULLs to simplify callers. */
5418	if (new == NULL)
5419		return (old);
5420	if (old == NULL)
5421		return (new);
5422	/* Replace a jfreefrag with a jnewblk. */
5423	if (new->wk_type == D_JFREEFRAG) {
5424		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5425			panic("jnewblk_merge: blkno mismatch: %p, %p",
5426			    old, new);
5427		cancel_jfreefrag(WK_JFREEFRAG(new));
5428		return (old);
5429	}
5430	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5431		panic("jnewblk_merge: Bad type: old %d new %d\n",
5432		    old->wk_type, new->wk_type);
5433	/*
5434	 * Handle merging of two jnewblk records that describe
5435	 * different sets of fragments in the same block.
5436	 */
5437	jnewblk = WK_JNEWBLK(old);
5438	njnewblk = WK_JNEWBLK(new);
5439	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5440		panic("jnewblk_merge: Merging disparate blocks.");
5441	/*
5442	 * The record may be rolled back in the cg.
5443	 */
5444	if (jnewblk->jn_state & UNDONE) {
5445		jnewblk->jn_state &= ~UNDONE;
5446		njnewblk->jn_state |= UNDONE;
5447		njnewblk->jn_state &= ~ATTACHED;
5448	}
5449	/*
5450	 * We modify the newer addref and free the older so that if neither
5451	 * has been written the most up-to-date copy will be on disk.  If
5452	 * both have been written but rolled back we only temporarily need
5453	 * one of them to fix the bits when the cg write completes.
5454	 */
5455	jnewblk->jn_state |= ATTACHED | COMPLETE;
5456	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5457	cancel_jnewblk(jnewblk, wkhd);
5458	WORKLIST_REMOVE(&jnewblk->jn_list);
5459	free_jnewblk(jnewblk);
5460	return (new);
5461}
5462
5463/*
5464 * Replace an old allocdirect dependency with a newer one.
5465 */
5466static void
5467allocdirect_merge(adphead, newadp, oldadp)
5468	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5469	struct allocdirect *newadp;	/* allocdirect being added */
5470	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5471{
5472	struct worklist *wk;
5473	struct freefrag *freefrag;
5474
5475	freefrag = NULL;
5476	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5477	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5478	    newadp->ad_oldsize != oldadp->ad_newsize ||
5479	    newadp->ad_offset >= UFS_NDADDR)
5480		panic("%s %jd != new %jd || old size %ld != new %ld",
5481		    "allocdirect_merge: old blkno",
5482		    (intmax_t)newadp->ad_oldblkno,
5483		    (intmax_t)oldadp->ad_newblkno,
5484		    newadp->ad_oldsize, oldadp->ad_newsize);
5485	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5486	newadp->ad_oldsize = oldadp->ad_oldsize;
5487	/*
5488	 * If the old dependency had a fragment to free or had never
5489	 * previously had a block allocated, then the new dependency
5490	 * can immediately post its freefrag and adopt the old freefrag.
5491	 * This action is done by swapping the freefrag dependencies.
5492	 * The new dependency gains the old one's freefrag, and the
5493	 * old one gets the new one and then immediately puts it on
5494	 * the worklist when it is freed by free_newblk. It is
5495	 * not possible to do this swap when the old dependency had a
5496	 * non-zero size but no previous fragment to free. This condition
5497	 * arises when the new block is an extension of the old block.
5498	 * Here, the first part of the fragment allocated to the new
5499	 * dependency is part of the block currently claimed on disk by
5500	 * the old dependency, so cannot legitimately be freed until the
5501	 * conditions for the new dependency are fulfilled.
5502	 */
5503	freefrag = newadp->ad_freefrag;
5504	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5505		newadp->ad_freefrag = oldadp->ad_freefrag;
5506		oldadp->ad_freefrag = freefrag;
5507	}
5508	/*
5509	 * If we are tracking a new directory-block allocation,
5510	 * move it from the old allocdirect to the new allocdirect.
5511	 */
5512	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5513		WORKLIST_REMOVE(wk);
5514		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5515			panic("allocdirect_merge: extra newdirblk");
5516		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5517	}
5518	TAILQ_REMOVE(adphead, oldadp, ad_next);
5519	/*
5520	 * We need to move any journal dependencies over to the freefrag
5521	 * that releases this block if it exists.  Otherwise we are
5522	 * extending an existing block and we'll wait until that is
5523	 * complete to release the journal space and extend the
5524	 * new journal to cover this old space as well.
5525	 */
5526	if (freefrag == NULL) {
5527		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5528			panic("allocdirect_merge: %jd != %jd",
5529			    oldadp->ad_newblkno, newadp->ad_newblkno);
5530		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5531		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5532		    &oldadp->ad_block.nb_jnewblk->jn_list,
5533		    &newadp->ad_block.nb_jwork);
5534		oldadp->ad_block.nb_jnewblk = NULL;
5535		cancel_newblk(&oldadp->ad_block, NULL,
5536		    &newadp->ad_block.nb_jwork);
5537	} else {
5538		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5539		    &freefrag->ff_list, &freefrag->ff_jwork);
5540		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5541		    &freefrag->ff_jwork);
5542	}
5543	free_newblk(&oldadp->ad_block);
5544}
5545
5546/*
5547 * Allocate a jfreefrag structure to journal a single block free.
5548 */
5549static struct jfreefrag *
5550newjfreefrag(freefrag, ip, blkno, size, lbn)
5551	struct freefrag *freefrag;
5552	struct inode *ip;
5553	ufs2_daddr_t blkno;
5554	long size;
5555	ufs_lbn_t lbn;
5556{
5557	struct jfreefrag *jfreefrag;
5558	struct fs *fs;
5559
5560	fs = ITOFS(ip);
5561	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5562	    M_SOFTDEP_FLAGS);
5563	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
5564	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5565	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5566	jfreefrag->fr_ino = ip->i_number;
5567	jfreefrag->fr_lbn = lbn;
5568	jfreefrag->fr_blkno = blkno;
5569	jfreefrag->fr_frags = numfrags(fs, size);
5570	jfreefrag->fr_freefrag = freefrag;
5571
5572	return (jfreefrag);
5573}
5574
5575/*
5576 * Allocate a new freefrag structure.
5577 */
5578static struct freefrag *
5579newfreefrag(ip, blkno, size, lbn, key)
5580	struct inode *ip;
5581	ufs2_daddr_t blkno;
5582	long size;
5583	ufs_lbn_t lbn;
5584	u_long key;
5585{
5586	struct freefrag *freefrag;
5587	struct ufsmount *ump;
5588	struct fs *fs;
5589
5590	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5591	    ip->i_number, blkno, size, lbn);
5592	ump = ITOUMP(ip);
5593	fs = ump->um_fs;
5594	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5595		panic("newfreefrag: frag size");
5596	freefrag = malloc(sizeof(struct freefrag),
5597	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5598	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
5599	freefrag->ff_state = ATTACHED;
5600	LIST_INIT(&freefrag->ff_jwork);
5601	freefrag->ff_inum = ip->i_number;
5602	freefrag->ff_vtype = ITOV(ip)->v_type;
5603	freefrag->ff_blkno = blkno;
5604	freefrag->ff_fragsize = size;
5605	freefrag->ff_key = key;
5606
5607	if (MOUNTEDSUJ(UFSTOVFS(ump))) {
5608		freefrag->ff_jdep = (struct worklist *)
5609		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5610	} else {
5611		freefrag->ff_state |= DEPCOMPLETE;
5612		freefrag->ff_jdep = NULL;
5613	}
5614
5615	return (freefrag);
5616}
5617
5618/*
5619 * This workitem de-allocates fragments that were replaced during
5620 * file block allocation.
5621 */
5622static void
5623handle_workitem_freefrag(freefrag)
5624	struct freefrag *freefrag;
5625{
5626	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5627	struct workhead wkhd;
5628
5629	CTR3(KTR_SUJ,
5630	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5631	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5632	/*
5633	 * It would be illegal to add new completion items to the
5634	 * freefrag after it was schedule to be done so it must be
5635	 * safe to modify the list head here.
5636	 */
5637	LIST_INIT(&wkhd);
5638	ACQUIRE_LOCK(ump);
5639	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5640	/*
5641	 * If the journal has not been written we must cancel it here.
5642	 */
5643	if (freefrag->ff_jdep) {
5644		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5645			panic("handle_workitem_freefrag: Unexpected type %d\n",
5646			    freefrag->ff_jdep->wk_type);
5647		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5648	}
5649	FREE_LOCK(ump);
5650	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5651	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
5652	   &wkhd, freefrag->ff_key);
5653	ACQUIRE_LOCK(ump);
5654	WORKITEM_FREE(freefrag, D_FREEFRAG);
5655	FREE_LOCK(ump);
5656}
5657
5658/*
5659 * Set up a dependency structure for an external attributes data block.
5660 * This routine follows much of the structure of softdep_setup_allocdirect.
5661 * See the description of softdep_setup_allocdirect above for details.
5662 */
5663void
5664softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5665	struct inode *ip;
5666	ufs_lbn_t off;
5667	ufs2_daddr_t newblkno;
5668	ufs2_daddr_t oldblkno;
5669	long newsize;
5670	long oldsize;
5671	struct buf *bp;
5672{
5673	struct allocdirect *adp, *oldadp;
5674	struct allocdirectlst *adphead;
5675	struct freefrag *freefrag;
5676	struct inodedep *inodedep;
5677	struct jnewblk *jnewblk;
5678	struct newblk *newblk;
5679	struct mount *mp;
5680	struct ufsmount *ump;
5681	ufs_lbn_t lbn;
5682
5683	mp = ITOVFS(ip);
5684	ump = VFSTOUFS(mp);
5685	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5686	    ("softdep_setup_allocext called on non-softdep filesystem"));
5687	KASSERT(off < UFS_NXADDR,
5688	    ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off));
5689
5690	lbn = bp->b_lblkno;
5691	if (oldblkno && oldblkno != newblkno)
5692		/*
5693		 * The usual case is that a smaller fragment that
5694		 * was just allocated has been replaced with a bigger
5695		 * fragment or a full-size block. If it is marked as
5696		 * B_DELWRI, the current contents have not been written
5697		 * to disk. It is possible that the block was written
5698		 * earlier, but very uncommon. If the block has never
5699		 * been written, there is no need to send a BIO_DELETE
5700		 * for it when it is freed. The gain from avoiding the
5701		 * TRIMs for the common case of unwritten blocks far
5702		 * exceeds the cost of the write amplification for the
5703		 * uncommon case of failing to send a TRIM for a block
5704		 * that had been written.
5705		 */
5706		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5707		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5708	else
5709		freefrag = NULL;
5710
5711	ACQUIRE_LOCK(ump);
5712	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5713		panic("softdep_setup_allocext: lost block");
5714	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5715	    ("softdep_setup_allocext: newblk already initialized"));
5716	/*
5717	 * Convert the newblk to an allocdirect.
5718	 */
5719	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5720	adp = (struct allocdirect *)newblk;
5721	newblk->nb_freefrag = freefrag;
5722	adp->ad_offset = off;
5723	adp->ad_oldblkno = oldblkno;
5724	adp->ad_newsize = newsize;
5725	adp->ad_oldsize = oldsize;
5726	adp->ad_state |=  EXTDATA;
5727
5728	/*
5729	 * Finish initializing the journal.
5730	 */
5731	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5732		jnewblk->jn_ino = ip->i_number;
5733		jnewblk->jn_lbn = lbn;
5734		add_to_journal(&jnewblk->jn_list);
5735	}
5736	if (freefrag && freefrag->ff_jdep != NULL &&
5737	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5738		add_to_journal(freefrag->ff_jdep);
5739	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5740	adp->ad_inodedep = inodedep;
5741
5742	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5743	/*
5744	 * The list of allocdirects must be kept in sorted and ascending
5745	 * order so that the rollback routines can quickly determine the
5746	 * first uncommitted block (the size of the file stored on disk
5747	 * ends at the end of the lowest committed fragment, or if there
5748	 * are no fragments, at the end of the highest committed block).
5749	 * Since files generally grow, the typical case is that the new
5750	 * block is to be added at the end of the list. We speed this
5751	 * special case by checking against the last allocdirect in the
5752	 * list before laboriously traversing the list looking for the
5753	 * insertion point.
5754	 */
5755	adphead = &inodedep->id_newextupdt;
5756	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5757	if (oldadp == NULL || oldadp->ad_offset <= off) {
5758		/* insert at end of list */
5759		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5760		if (oldadp != NULL && oldadp->ad_offset == off)
5761			allocdirect_merge(adphead, adp, oldadp);
5762		FREE_LOCK(ump);
5763		return;
5764	}
5765	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5766		if (oldadp->ad_offset >= off)
5767			break;
5768	}
5769	if (oldadp == NULL)
5770		panic("softdep_setup_allocext: lost entry");
5771	/* insert in middle of list */
5772	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5773	if (oldadp->ad_offset == off)
5774		allocdirect_merge(adphead, adp, oldadp);
5775	FREE_LOCK(ump);
5776}
5777
5778/*
5779 * Indirect block allocation dependencies.
5780 *
5781 * The same dependencies that exist for a direct block also exist when
5782 * a new block is allocated and pointed to by an entry in a block of
5783 * indirect pointers. The undo/redo states described above are also
5784 * used here. Because an indirect block contains many pointers that
5785 * may have dependencies, a second copy of the entire in-memory indirect
5786 * block is kept. The buffer cache copy is always completely up-to-date.
5787 * The second copy, which is used only as a source for disk writes,
5788 * contains only the safe pointers (i.e., those that have no remaining
5789 * update dependencies). The second copy is freed when all pointers
5790 * are safe. The cache is not allowed to replace indirect blocks with
5791 * pending update dependencies. If a buffer containing an indirect
5792 * block with dependencies is written, these routines will mark it
5793 * dirty again. It can only be successfully written once all the
5794 * dependencies are removed. The ffs_fsync routine in conjunction with
5795 * softdep_sync_metadata work together to get all the dependencies
5796 * removed so that a file can be successfully written to disk. Three
5797 * procedures are used when setting up indirect block pointer
5798 * dependencies. The division is necessary because of the organization
5799 * of the "balloc" routine and because of the distinction between file
5800 * pages and file metadata blocks.
5801 */
5802
5803/*
5804 * Allocate a new allocindir structure.
5805 */
5806static struct allocindir *
5807newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5808	struct inode *ip;	/* inode for file being extended */
5809	int ptrno;		/* offset of pointer in indirect block */
5810	ufs2_daddr_t newblkno;	/* disk block number being added */
5811	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5812	ufs_lbn_t lbn;
5813{
5814	struct newblk *newblk;
5815	struct allocindir *aip;
5816	struct freefrag *freefrag;
5817	struct jnewblk *jnewblk;
5818
5819	if (oldblkno)
5820		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
5821		    SINGLETON_KEY);
5822	else
5823		freefrag = NULL;
5824	ACQUIRE_LOCK(ITOUMP(ip));
5825	if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
5826		panic("new_allocindir: lost block");
5827	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5828	    ("newallocindir: newblk already initialized"));
5829	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5830	newblk->nb_freefrag = freefrag;
5831	aip = (struct allocindir *)newblk;
5832	aip->ai_offset = ptrno;
5833	aip->ai_oldblkno = oldblkno;
5834	aip->ai_lbn = lbn;
5835	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5836		jnewblk->jn_ino = ip->i_number;
5837		jnewblk->jn_lbn = lbn;
5838		add_to_journal(&jnewblk->jn_list);
5839	}
5840	if (freefrag && freefrag->ff_jdep != NULL &&
5841	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5842		add_to_journal(freefrag->ff_jdep);
5843	return (aip);
5844}
5845
5846/*
5847 * Called just before setting an indirect block pointer
5848 * to a newly allocated file page.
5849 */
5850void
5851softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5852	struct inode *ip;	/* inode for file being extended */
5853	ufs_lbn_t lbn;		/* allocated block number within file */
5854	struct buf *bp;		/* buffer with indirect blk referencing page */
5855	int ptrno;		/* offset of pointer in indirect block */
5856	ufs2_daddr_t newblkno;	/* disk block number being added */
5857	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5858	struct buf *nbp;	/* buffer holding allocated page */
5859{
5860	struct inodedep *inodedep;
5861	struct freefrag *freefrag;
5862	struct allocindir *aip;
5863	struct pagedep *pagedep;
5864	struct mount *mp;
5865	struct ufsmount *ump;
5866
5867	mp = ITOVFS(ip);
5868	ump = VFSTOUFS(mp);
5869	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5870	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
5871	KASSERT(lbn == nbp->b_lblkno,
5872	    ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5873	    lbn, bp->b_lblkno));
5874	CTR4(KTR_SUJ,
5875	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5876	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5877	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5878	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5879	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5880	/*
5881	 * If we are allocating a directory page, then we must
5882	 * allocate an associated pagedep to track additions and
5883	 * deletions.
5884	 */
5885	if ((ip->i_mode & IFMT) == IFDIR)
5886		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5887	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5888	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5889	FREE_LOCK(ump);
5890	if (freefrag)
5891		handle_workitem_freefrag(freefrag);
5892}
5893
5894/*
5895 * Called just before setting an indirect block pointer to a
5896 * newly allocated indirect block.
5897 */
5898void
5899softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5900	struct buf *nbp;	/* newly allocated indirect block */
5901	struct inode *ip;	/* inode for file being extended */
5902	struct buf *bp;		/* indirect block referencing allocated block */
5903	int ptrno;		/* offset of pointer in indirect block */
5904	ufs2_daddr_t newblkno;	/* disk block number being added */
5905{
5906	struct inodedep *inodedep;
5907	struct allocindir *aip;
5908	struct ufsmount *ump;
5909	ufs_lbn_t lbn;
5910
5911	ump = ITOUMP(ip);
5912	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
5913	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
5914	CTR3(KTR_SUJ,
5915	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5916	    ip->i_number, newblkno, ptrno);
5917	lbn = nbp->b_lblkno;
5918	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5919	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5920	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
5921	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5922	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5923		panic("softdep_setup_allocindir_meta: Block already existed");
5924	FREE_LOCK(ump);
5925}
5926
5927static void
5928indirdep_complete(indirdep)
5929	struct indirdep *indirdep;
5930{
5931	struct allocindir *aip;
5932
5933	LIST_REMOVE(indirdep, ir_next);
5934	indirdep->ir_state |= DEPCOMPLETE;
5935
5936	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5937		LIST_REMOVE(aip, ai_next);
5938		free_newblk(&aip->ai_block);
5939	}
5940	/*
5941	 * If this indirdep is not attached to a buf it was simply waiting
5942	 * on completion to clear completehd.  free_indirdep() asserts
5943	 * that nothing is dangling.
5944	 */
5945	if ((indirdep->ir_state & ONWORKLIST) == 0)
5946		free_indirdep(indirdep);
5947}
5948
5949static struct indirdep *
5950indirdep_lookup(mp, ip, bp)
5951	struct mount *mp;
5952	struct inode *ip;
5953	struct buf *bp;
5954{
5955	struct indirdep *indirdep, *newindirdep;
5956	struct newblk *newblk;
5957	struct ufsmount *ump;
5958	struct worklist *wk;
5959	struct fs *fs;
5960	ufs2_daddr_t blkno;
5961
5962	ump = VFSTOUFS(mp);
5963	LOCK_OWNED(ump);
5964	indirdep = NULL;
5965	newindirdep = NULL;
5966	fs = ump->um_fs;
5967	for (;;) {
5968		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5969			if (wk->wk_type != D_INDIRDEP)
5970				continue;
5971			indirdep = WK_INDIRDEP(wk);
5972			break;
5973		}
5974		/* Found on the buffer worklist, no new structure to free. */
5975		if (indirdep != NULL && newindirdep == NULL)
5976			return (indirdep);
5977		if (indirdep != NULL && newindirdep != NULL)
5978			panic("indirdep_lookup: simultaneous create");
5979		/* None found on the buffer and a new structure is ready. */
5980		if (indirdep == NULL && newindirdep != NULL)
5981			break;
5982		/* None found and no new structure available. */
5983		FREE_LOCK(ump);
5984		newindirdep = malloc(sizeof(struct indirdep),
5985		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5986		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5987		newindirdep->ir_state = ATTACHED;
5988		if (I_IS_UFS1(ip))
5989			newindirdep->ir_state |= UFS1FMT;
5990		TAILQ_INIT(&newindirdep->ir_trunc);
5991		newindirdep->ir_saveddata = NULL;
5992		LIST_INIT(&newindirdep->ir_deplisthd);
5993		LIST_INIT(&newindirdep->ir_donehd);
5994		LIST_INIT(&newindirdep->ir_writehd);
5995		LIST_INIT(&newindirdep->ir_completehd);
5996		if (bp->b_blkno == bp->b_lblkno) {
5997			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5998			    NULL, NULL);
5999			bp->b_blkno = blkno;
6000		}
6001		newindirdep->ir_freeblks = NULL;
6002		newindirdep->ir_savebp =
6003		    getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
6004		newindirdep->ir_bp = bp;
6005		BUF_KERNPROC(newindirdep->ir_savebp);
6006		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
6007		ACQUIRE_LOCK(ump);
6008	}
6009	indirdep = newindirdep;
6010	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
6011	/*
6012	 * If the block is not yet allocated we don't set DEPCOMPLETE so
6013	 * that we don't free dependencies until the pointers are valid.
6014	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
6015	 * than using the hash.
6016	 */
6017	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
6018		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
6019	else
6020		indirdep->ir_state |= DEPCOMPLETE;
6021	return (indirdep);
6022}
6023
6024/*
6025 * Called to finish the allocation of the "aip" allocated
6026 * by one of the two routines above.
6027 */
6028static struct freefrag *
6029setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
6030	struct buf *bp;		/* in-memory copy of the indirect block */
6031	struct inode *ip;	/* inode for file being extended */
6032	struct inodedep *inodedep; /* Inodedep for ip */
6033	struct allocindir *aip;	/* allocindir allocated by the above routines */
6034	ufs_lbn_t lbn;		/* Logical block number for this block. */
6035{
6036	struct fs *fs;
6037	struct indirdep *indirdep;
6038	struct allocindir *oldaip;
6039	struct freefrag *freefrag;
6040	struct mount *mp;
6041	struct ufsmount *ump;
6042
6043	mp = ITOVFS(ip);
6044	ump = VFSTOUFS(mp);
6045	LOCK_OWNED(ump);
6046	fs = ump->um_fs;
6047	if (bp->b_lblkno >= 0)
6048		panic("setup_allocindir_phase2: not indir blk");
6049	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6050	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6051	indirdep = indirdep_lookup(mp, ip, bp);
6052	KASSERT(indirdep->ir_savebp != NULL,
6053	    ("setup_allocindir_phase2 NULL ir_savebp"));
6054	aip->ai_indirdep = indirdep;
6055	/*
6056	 * Check for an unwritten dependency for this indirect offset.  If
6057	 * there is, merge the old dependency into the new one.  This happens
6058	 * as a result of reallocblk only.
6059	 */
6060	freefrag = NULL;
6061	if (aip->ai_oldblkno != 0) {
6062		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6063			if (oldaip->ai_offset == aip->ai_offset) {
6064				freefrag = allocindir_merge(aip, oldaip);
6065				goto done;
6066			}
6067		}
6068		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6069			if (oldaip->ai_offset == aip->ai_offset) {
6070				freefrag = allocindir_merge(aip, oldaip);
6071				goto done;
6072			}
6073		}
6074	}
6075done:
6076	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6077	return (freefrag);
6078}
6079
6080/*
6081 * Merge two allocindirs which refer to the same block.  Move newblock
6082 * dependencies and setup the freefrags appropriately.
6083 */
6084static struct freefrag *
6085allocindir_merge(aip, oldaip)
6086	struct allocindir *aip;
6087	struct allocindir *oldaip;
6088{
6089	struct freefrag *freefrag;
6090	struct worklist *wk;
6091
6092	if (oldaip->ai_newblkno != aip->ai_oldblkno)
6093		panic("allocindir_merge: blkno");
6094	aip->ai_oldblkno = oldaip->ai_oldblkno;
6095	freefrag = aip->ai_freefrag;
6096	aip->ai_freefrag = oldaip->ai_freefrag;
6097	oldaip->ai_freefrag = NULL;
6098	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6099	/*
6100	 * If we are tracking a new directory-block allocation,
6101	 * move it from the old allocindir to the new allocindir.
6102	 */
6103	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6104		WORKLIST_REMOVE(wk);
6105		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6106			panic("allocindir_merge: extra newdirblk");
6107		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6108	}
6109	/*
6110	 * We can skip journaling for this freefrag and just complete
6111	 * any pending journal work for the allocindir that is being
6112	 * removed after the freefrag completes.
6113	 */
6114	if (freefrag->ff_jdep)
6115		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6116	LIST_REMOVE(oldaip, ai_next);
6117	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6118	    &freefrag->ff_list, &freefrag->ff_jwork);
6119	free_newblk(&oldaip->ai_block);
6120
6121	return (freefrag);
6122}
6123
6124static inline void
6125setup_freedirect(freeblks, ip, i, needj)
6126	struct freeblks *freeblks;
6127	struct inode *ip;
6128	int i;
6129	int needj;
6130{
6131	struct ufsmount *ump;
6132	ufs2_daddr_t blkno;
6133	int frags;
6134
6135	blkno = DIP(ip, i_db[i]);
6136	if (blkno == 0)
6137		return;
6138	DIP_SET(ip, i_db[i], 0);
6139	ump = ITOUMP(ip);
6140	frags = sblksize(ump->um_fs, ip->i_size, i);
6141	frags = numfrags(ump->um_fs, frags);
6142	newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj);
6143}
6144
6145static inline void
6146setup_freeext(freeblks, ip, i, needj)
6147	struct freeblks *freeblks;
6148	struct inode *ip;
6149	int i;
6150	int needj;
6151{
6152	struct ufsmount *ump;
6153	ufs2_daddr_t blkno;
6154	int frags;
6155
6156	blkno = ip->i_din2->di_extb[i];
6157	if (blkno == 0)
6158		return;
6159	ip->i_din2->di_extb[i] = 0;
6160	ump = ITOUMP(ip);
6161	frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
6162	frags = numfrags(ump->um_fs, frags);
6163	newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6164}
6165
6166static inline void
6167setup_freeindir(freeblks, ip, i, lbn, needj)
6168	struct freeblks *freeblks;
6169	struct inode *ip;
6170	int i;
6171	ufs_lbn_t lbn;
6172	int needj;
6173{
6174	struct ufsmount *ump;
6175	ufs2_daddr_t blkno;
6176
6177	blkno = DIP(ip, i_ib[i]);
6178	if (blkno == 0)
6179		return;
6180	DIP_SET(ip, i_ib[i], 0);
6181	ump = ITOUMP(ip);
6182	newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag,
6183	    0, needj);
6184}
6185
6186static inline struct freeblks *
6187newfreeblks(mp, ip)
6188	struct mount *mp;
6189	struct inode *ip;
6190{
6191	struct freeblks *freeblks;
6192
6193	freeblks = malloc(sizeof(struct freeblks),
6194		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6195	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6196	LIST_INIT(&freeblks->fb_jblkdephd);
6197	LIST_INIT(&freeblks->fb_jwork);
6198	freeblks->fb_ref = 0;
6199	freeblks->fb_cgwait = 0;
6200	freeblks->fb_state = ATTACHED;
6201	freeblks->fb_uid = ip->i_uid;
6202	freeblks->fb_inum = ip->i_number;
6203	freeblks->fb_vtype = ITOV(ip)->v_type;
6204	freeblks->fb_modrev = DIP(ip, i_modrev);
6205	freeblks->fb_devvp = ITODEVVP(ip);
6206	freeblks->fb_chkcnt = 0;
6207	freeblks->fb_len = 0;
6208
6209	return (freeblks);
6210}
6211
6212static void
6213trunc_indirdep(indirdep, freeblks, bp, off)
6214	struct indirdep *indirdep;
6215	struct freeblks *freeblks;
6216	struct buf *bp;
6217	int off;
6218{
6219	struct allocindir *aip, *aipn;
6220
6221	/*
6222	 * The first set of allocindirs won't be in savedbp.
6223	 */
6224	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6225		if (aip->ai_offset > off)
6226			cancel_allocindir(aip, bp, freeblks, 1);
6227	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6228		if (aip->ai_offset > off)
6229			cancel_allocindir(aip, bp, freeblks, 1);
6230	/*
6231	 * These will exist in savedbp.
6232	 */
6233	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6234		if (aip->ai_offset > off)
6235			cancel_allocindir(aip, NULL, freeblks, 0);
6236	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6237		if (aip->ai_offset > off)
6238			cancel_allocindir(aip, NULL, freeblks, 0);
6239}
6240
6241/*
6242 * Follow the chain of indirects down to lastlbn creating a freework
6243 * structure for each.  This will be used to start indir_trunc() at
6244 * the right offset and create the journal records for the parrtial
6245 * truncation.  A second step will handle the truncated dependencies.
6246 */
6247static int
6248setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6249	struct freeblks *freeblks;
6250	struct inode *ip;
6251	ufs_lbn_t lbn;
6252	ufs_lbn_t lastlbn;
6253	ufs2_daddr_t blkno;
6254{
6255	struct indirdep *indirdep;
6256	struct indirdep *indirn;
6257	struct freework *freework;
6258	struct newblk *newblk;
6259	struct mount *mp;
6260	struct ufsmount *ump;
6261	struct buf *bp;
6262	uint8_t *start;
6263	uint8_t *end;
6264	ufs_lbn_t lbnadd;
6265	int level;
6266	int error;
6267	int off;
6268
6269
6270	freework = NULL;
6271	if (blkno == 0)
6272		return (0);
6273	mp = freeblks->fb_list.wk_mp;
6274	ump = VFSTOUFS(mp);
6275	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6276	if ((bp->b_flags & B_CACHE) == 0) {
6277		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6278		bp->b_iocmd = BIO_READ;
6279		bp->b_flags &= ~B_INVAL;
6280		bp->b_ioflags &= ~BIO_ERROR;
6281		vfs_busy_pages(bp, 0);
6282		bp->b_iooffset = dbtob(bp->b_blkno);
6283		bstrategy(bp);
6284#ifdef RACCT
6285		if (racct_enable) {
6286			PROC_LOCK(curproc);
6287			racct_add_buf(curproc, bp, 0);
6288			PROC_UNLOCK(curproc);
6289		}
6290#endif /* RACCT */
6291		curthread->td_ru.ru_inblock++;
6292		error = bufwait(bp);
6293		if (error) {
6294			brelse(bp);
6295			return (error);
6296		}
6297	}
6298	level = lbn_level(lbn);
6299	lbnadd = lbn_offset(ump->um_fs, level);
6300	/*
6301	 * Compute the offset of the last block we want to keep.  Store
6302	 * in the freework the first block we want to completely free.
6303	 */
6304	off = (lastlbn - -(lbn + level)) / lbnadd;
6305	if (off + 1 == NINDIR(ump->um_fs))
6306		goto nowork;
6307	freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
6308	/*
6309	 * Link the freework into the indirdep.  This will prevent any new
6310	 * allocations from proceeding until we are finished with the
6311	 * truncate and the block is written.
6312	 */
6313	ACQUIRE_LOCK(ump);
6314	indirdep = indirdep_lookup(mp, ip, bp);
6315	if (indirdep->ir_freeblks)
6316		panic("setup_trunc_indir: indirdep already truncated.");
6317	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6318	freework->fw_indir = indirdep;
6319	/*
6320	 * Cancel any allocindirs that will not make it to disk.
6321	 * We have to do this for all copies of the indirdep that
6322	 * live on this newblk.
6323	 */
6324	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6325		if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0,
6326		    &newblk) == 0)
6327			panic("setup_trunc_indir: lost block");
6328		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6329			trunc_indirdep(indirn, freeblks, bp, off);
6330	} else
6331		trunc_indirdep(indirdep, freeblks, bp, off);
6332	FREE_LOCK(ump);
6333	/*
6334	 * Creation is protected by the buf lock. The saveddata is only
6335	 * needed if a full truncation follows a partial truncation but it
6336	 * is difficult to allocate in that case so we fetch it anyway.
6337	 */
6338	if (indirdep->ir_saveddata == NULL)
6339		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6340		    M_SOFTDEP_FLAGS);
6341nowork:
6342	/* Fetch the blkno of the child and the zero start offset. */
6343	if (I_IS_UFS1(ip)) {
6344		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6345		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6346	} else {
6347		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6348		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6349	}
6350	if (freework) {
6351		/* Zero the truncated pointers. */
6352		end = bp->b_data + bp->b_bcount;
6353		bzero(start, end - start);
6354		bdwrite(bp);
6355	} else
6356		bqrelse(bp);
6357	if (level == 0)
6358		return (0);
6359	lbn++; /* adjust level */
6360	lbn -= (off * lbnadd);
6361	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6362}
6363
6364/*
6365 * Complete the partial truncation of an indirect block setup by
6366 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6367 * copy and writes them to disk before the freeblks is allowed to complete.
6368 */
6369static void
6370complete_trunc_indir(freework)
6371	struct freework *freework;
6372{
6373	struct freework *fwn;
6374	struct indirdep *indirdep;
6375	struct ufsmount *ump;
6376	struct buf *bp;
6377	uintptr_t start;
6378	int count;
6379
6380	ump = VFSTOUFS(freework->fw_list.wk_mp);
6381	LOCK_OWNED(ump);
6382	indirdep = freework->fw_indir;
6383	for (;;) {
6384		bp = indirdep->ir_bp;
6385		/* See if the block was discarded. */
6386		if (bp == NULL)
6387			break;
6388		/* Inline part of getdirtybuf().  We dont want bremfree. */
6389		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6390			break;
6391		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6392		    LOCK_PTR(ump)) == 0)
6393			BUF_UNLOCK(bp);
6394		ACQUIRE_LOCK(ump);
6395	}
6396	freework->fw_state |= DEPCOMPLETE;
6397	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6398	/*
6399	 * Zero the pointers in the saved copy.
6400	 */
6401	if (indirdep->ir_state & UFS1FMT)
6402		start = sizeof(ufs1_daddr_t);
6403	else
6404		start = sizeof(ufs2_daddr_t);
6405	start *= freework->fw_start;
6406	count = indirdep->ir_savebp->b_bcount - start;
6407	start += (uintptr_t)indirdep->ir_savebp->b_data;
6408	bzero((char *)start, count);
6409	/*
6410	 * We need to start the next truncation in the list if it has not
6411	 * been started yet.
6412	 */
6413	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6414	if (fwn != NULL) {
6415		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6416			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6417		if ((fwn->fw_state & ONWORKLIST) == 0)
6418			freework_enqueue(fwn);
6419	}
6420	/*
6421	 * If bp is NULL the block was fully truncated, restore
6422	 * the saved block list otherwise free it if it is no
6423	 * longer needed.
6424	 */
6425	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6426		if (bp == NULL)
6427			bcopy(indirdep->ir_saveddata,
6428			    indirdep->ir_savebp->b_data,
6429			    indirdep->ir_savebp->b_bcount);
6430		free(indirdep->ir_saveddata, M_INDIRDEP);
6431		indirdep->ir_saveddata = NULL;
6432	}
6433	/*
6434	 * When bp is NULL there is a full truncation pending.  We
6435	 * must wait for this full truncation to be journaled before
6436	 * we can release this freework because the disk pointers will
6437	 * never be written as zero.
6438	 */
6439	if (bp == NULL)  {
6440		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6441			handle_written_freework(freework);
6442		else
6443			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6444			   &freework->fw_list);
6445	} else {
6446		/* Complete when the real copy is written. */
6447		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6448		BUF_UNLOCK(bp);
6449	}
6450}
6451
6452/*
6453 * Calculate the number of blocks we are going to release where datablocks
6454 * is the current total and length is the new file size.
6455 */
6456static ufs2_daddr_t
6457blkcount(fs, datablocks, length)
6458	struct fs *fs;
6459	ufs2_daddr_t datablocks;
6460	off_t length;
6461{
6462	off_t totblks, numblks;
6463
6464	totblks = 0;
6465	numblks = howmany(length, fs->fs_bsize);
6466	if (numblks <= UFS_NDADDR) {
6467		totblks = howmany(length, fs->fs_fsize);
6468		goto out;
6469	}
6470        totblks = blkstofrags(fs, numblks);
6471	numblks -= UFS_NDADDR;
6472	/*
6473	 * Count all single, then double, then triple indirects required.
6474	 * Subtracting one indirects worth of blocks for each pass
6475	 * acknowledges one of each pointed to by the inode.
6476	 */
6477	for (;;) {
6478		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6479		numblks -= NINDIR(fs);
6480		if (numblks <= 0)
6481			break;
6482		numblks = howmany(numblks, NINDIR(fs));
6483	}
6484out:
6485	totblks = fsbtodb(fs, totblks);
6486	/*
6487	 * Handle sparse files.  We can't reclaim more blocks than the inode
6488	 * references.  We will correct it later in handle_complete_freeblks()
6489	 * when we know the real count.
6490	 */
6491	if (totblks > datablocks)
6492		return (0);
6493	return (datablocks - totblks);
6494}
6495
6496/*
6497 * Handle freeblocks for journaled softupdate filesystems.
6498 *
6499 * Contrary to normal softupdates, we must preserve the block pointers in
6500 * indirects until their subordinates are free.  This is to avoid journaling
6501 * every block that is freed which may consume more space than the journal
6502 * itself.  The recovery program will see the free block journals at the
6503 * base of the truncated area and traverse them to reclaim space.  The
6504 * pointers in the inode may be cleared immediately after the journal
6505 * records are written because each direct and indirect pointer in the
6506 * inode is recorded in a journal.  This permits full truncation to proceed
6507 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6508 *
6509 * The algorithm is as follows:
6510 * 1) Traverse the in-memory state and create journal entries to release
6511 *    the relevant blocks and full indirect trees.
6512 * 2) Traverse the indirect block chain adding partial truncation freework
6513 *    records to indirects in the path to lastlbn.  The freework will
6514 *    prevent new allocation dependencies from being satisfied in this
6515 *    indirect until the truncation completes.
6516 * 3) Read and lock the inode block, performing an update with the new size
6517 *    and pointers.  This prevents truncated data from becoming valid on
6518 *    disk through step 4.
6519 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6520 *    eliminate journal work for those records that do not require it.
6521 * 5) Schedule the journal records to be written followed by the inode block.
6522 * 6) Allocate any necessary frags for the end of file.
6523 * 7) Zero any partially truncated blocks.
6524 *
6525 * From this truncation proceeds asynchronously using the freework and
6526 * indir_trunc machinery.  The file will not be extended again into a
6527 * partially truncated indirect block until all work is completed but
6528 * the normal dependency mechanism ensures that it is rolled back/forward
6529 * as appropriate.  Further truncation may occur without delay and is
6530 * serialized in indir_trunc().
6531 */
6532void
6533softdep_journal_freeblocks(ip, cred, length, flags)
6534	struct inode *ip;	/* The inode whose length is to be reduced */
6535	struct ucred *cred;
6536	off_t length;		/* The new length for the file */
6537	int flags;		/* IO_EXT and/or IO_NORMAL */
6538{
6539	struct freeblks *freeblks, *fbn;
6540	struct worklist *wk, *wkn;
6541	struct inodedep *inodedep;
6542	struct jblkdep *jblkdep;
6543	struct allocdirect *adp, *adpn;
6544	struct ufsmount *ump;
6545	struct fs *fs;
6546	struct buf *bp;
6547	struct vnode *vp;
6548	struct mount *mp;
6549	ufs2_daddr_t extblocks, datablocks;
6550	ufs_lbn_t tmpval, lbn, lastlbn;
6551	int frags, lastoff, iboff, allocblock, needj, error, i;
6552
6553	ump = ITOUMP(ip);
6554	mp = UFSTOVFS(ump);
6555	fs = ump->um_fs;
6556	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6557	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
6558	vp = ITOV(ip);
6559	needj = 1;
6560	iboff = -1;
6561	allocblock = 0;
6562	extblocks = 0;
6563	datablocks = 0;
6564	frags = 0;
6565	freeblks = newfreeblks(mp, ip);
6566	ACQUIRE_LOCK(ump);
6567	/*
6568	 * If we're truncating a removed file that will never be written
6569	 * we don't need to journal the block frees.  The canceled journals
6570	 * for the allocations will suffice.
6571	 */
6572	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6573	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6574	    length == 0)
6575		needj = 0;
6576	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6577	    ip->i_number, length, needj);
6578	FREE_LOCK(ump);
6579	/*
6580	 * Calculate the lbn that we are truncating to.  This results in -1
6581	 * if we're truncating the 0 bytes.  So it is the last lbn we want
6582	 * to keep, not the first lbn we want to truncate.
6583	 */
6584	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6585	lastoff = blkoff(fs, length);
6586	/*
6587	 * Compute frags we are keeping in lastlbn.  0 means all.
6588	 */
6589	if (lastlbn >= 0 && lastlbn < UFS_NDADDR) {
6590		frags = fragroundup(fs, lastoff);
6591		/* adp offset of last valid allocdirect. */
6592		iboff = lastlbn;
6593	} else if (lastlbn > 0)
6594		iboff = UFS_NDADDR;
6595	if (fs->fs_magic == FS_UFS2_MAGIC)
6596		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6597	/*
6598	 * Handle normal data blocks and indirects.  This section saves
6599	 * values used after the inode update to complete frag and indirect
6600	 * truncation.
6601	 */
6602	if ((flags & IO_NORMAL) != 0) {
6603		/*
6604		 * Handle truncation of whole direct and indirect blocks.
6605		 */
6606		for (i = iboff + 1; i < UFS_NDADDR; i++)
6607			setup_freedirect(freeblks, ip, i, needj);
6608		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
6609		    i < UFS_NIADDR;
6610		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6611			/* Release a whole indirect tree. */
6612			if (lbn > lastlbn) {
6613				setup_freeindir(freeblks, ip, i, -lbn -i,
6614				    needj);
6615				continue;
6616			}
6617			iboff = i + UFS_NDADDR;
6618			/*
6619			 * Traverse partially truncated indirect tree.
6620			 */
6621			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6622				setup_trunc_indir(freeblks, ip, -lbn - i,
6623				    lastlbn, DIP(ip, i_ib[i]));
6624		}
6625		/*
6626		 * Handle partial truncation to a frag boundary.
6627		 */
6628		if (frags) {
6629			ufs2_daddr_t blkno;
6630			long oldfrags;
6631
6632			oldfrags = blksize(fs, ip, lastlbn);
6633			blkno = DIP(ip, i_db[lastlbn]);
6634			if (blkno && oldfrags != frags) {
6635				oldfrags -= frags;
6636				oldfrags = numfrags(fs, oldfrags);
6637				blkno += numfrags(fs, frags);
6638				newfreework(ump, freeblks, NULL, lastlbn,
6639				    blkno, oldfrags, 0, needj);
6640				if (needj)
6641					adjust_newfreework(freeblks,
6642					    numfrags(fs, frags));
6643			} else if (blkno == 0)
6644				allocblock = 1;
6645		}
6646		/*
6647		 * Add a journal record for partial truncate if we are
6648		 * handling indirect blocks.  Non-indirects need no extra
6649		 * journaling.
6650		 */
6651		if (length != 0 && lastlbn >= UFS_NDADDR) {
6652			ip->i_flag |= IN_TRUNCATED;
6653			newjtrunc(freeblks, length, 0);
6654		}
6655		ip->i_size = length;
6656		DIP_SET(ip, i_size, ip->i_size);
6657		ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
6658		datablocks = DIP(ip, i_blocks) - extblocks;
6659		if (length != 0)
6660			datablocks = blkcount(fs, datablocks, length);
6661		freeblks->fb_len = length;
6662	}
6663	if ((flags & IO_EXT) != 0) {
6664		for (i = 0; i < UFS_NXADDR; i++)
6665			setup_freeext(freeblks, ip, i, needj);
6666		ip->i_din2->di_extsize = 0;
6667		datablocks += extblocks;
6668		ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
6669	}
6670#ifdef QUOTA
6671	/* Reference the quotas in case the block count is wrong in the end. */
6672	quotaref(vp, freeblks->fb_quota);
6673	(void) chkdq(ip, -datablocks, NOCRED, FORCE);
6674#endif
6675	freeblks->fb_chkcnt = -datablocks;
6676	UFS_LOCK(ump);
6677	fs->fs_pendingblocks += datablocks;
6678	UFS_UNLOCK(ump);
6679	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6680	/*
6681	 * Handle truncation of incomplete alloc direct dependencies.  We
6682	 * hold the inode block locked to prevent incomplete dependencies
6683	 * from reaching the disk while we are eliminating those that
6684	 * have been truncated.  This is a partially inlined ffs_update().
6685	 */
6686	ufs_itimes(vp);
6687	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6688	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6689	    (int)fs->fs_bsize, cred, &bp);
6690	if (error) {
6691		brelse(bp);
6692		softdep_error("softdep_journal_freeblocks", error);
6693		return;
6694	}
6695	if (bp->b_bufsize == fs->fs_bsize)
6696		bp->b_flags |= B_CLUSTEROK;
6697	softdep_update_inodeblock(ip, bp, 0);
6698	if (ump->um_fstype == UFS1)
6699		*((struct ufs1_dinode *)bp->b_data +
6700		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6701	else
6702		*((struct ufs2_dinode *)bp->b_data +
6703		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6704	ACQUIRE_LOCK(ump);
6705	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6706	if ((inodedep->id_state & IOSTARTED) != 0)
6707		panic("softdep_setup_freeblocks: inode busy");
6708	/*
6709	 * Add the freeblks structure to the list of operations that
6710	 * must await the zero'ed inode being written to disk. If we
6711	 * still have a bitmap dependency (needj), then the inode
6712	 * has never been written to disk, so we can process the
6713	 * freeblks below once we have deleted the dependencies.
6714	 */
6715	if (needj)
6716		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6717	else
6718		freeblks->fb_state |= COMPLETE;
6719	if ((flags & IO_NORMAL) != 0) {
6720		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6721			if (adp->ad_offset > iboff)
6722				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6723				    freeblks);
6724			/*
6725			 * Truncate the allocdirect.  We could eliminate
6726			 * or modify journal records as well.
6727			 */
6728			else if (adp->ad_offset == iboff && frags)
6729				adp->ad_newsize = frags;
6730		}
6731	}
6732	if ((flags & IO_EXT) != 0)
6733		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6734			cancel_allocdirect(&inodedep->id_extupdt, adp,
6735			    freeblks);
6736	/*
6737	 * Scan the bufwait list for newblock dependencies that will never
6738	 * make it to disk.
6739	 */
6740	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6741		if (wk->wk_type != D_ALLOCDIRECT)
6742			continue;
6743		adp = WK_ALLOCDIRECT(wk);
6744		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6745		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6746			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6747			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6748			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6749		}
6750	}
6751	/*
6752	 * Add journal work.
6753	 */
6754	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6755		add_to_journal(&jblkdep->jb_list);
6756	FREE_LOCK(ump);
6757	bdwrite(bp);
6758	/*
6759	 * Truncate dependency structures beyond length.
6760	 */
6761	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6762	/*
6763	 * This is only set when we need to allocate a fragment because
6764	 * none existed at the end of a frag-sized file.  It handles only
6765	 * allocating a new, zero filled block.
6766	 */
6767	if (allocblock) {
6768		ip->i_size = length - lastoff;
6769		DIP_SET(ip, i_size, ip->i_size);
6770		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6771		if (error != 0) {
6772			softdep_error("softdep_journal_freeblks", error);
6773			return;
6774		}
6775		ip->i_size = length;
6776		DIP_SET(ip, i_size, length);
6777		ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_UPDATE;
6778		allocbuf(bp, frags);
6779		ffs_update(vp, 0);
6780		bawrite(bp);
6781	} else if (lastoff != 0 && vp->v_type != VDIR) {
6782		int size;
6783
6784		/*
6785		 * Zero the end of a truncated frag or block.
6786		 */
6787		size = sblksize(fs, length, lastlbn);
6788		error = bread(vp, lastlbn, size, cred, &bp);
6789		if (error) {
6790			softdep_error("softdep_journal_freeblks", error);
6791			return;
6792		}
6793		bzero((char *)bp->b_data + lastoff, size - lastoff);
6794		bawrite(bp);
6795
6796	}
6797	ACQUIRE_LOCK(ump);
6798	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6799	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6800	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6801	/*
6802	 * We zero earlier truncations so they don't erroneously
6803	 * update i_blocks.
6804	 */
6805	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6806		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6807			fbn->fb_len = 0;
6808	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6809	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6810		freeblks->fb_state |= INPROGRESS;
6811	else
6812		freeblks = NULL;
6813	FREE_LOCK(ump);
6814	if (freeblks)
6815		handle_workitem_freeblocks(freeblks, 0);
6816	trunc_pages(ip, length, extblocks, flags);
6817
6818}
6819
6820/*
6821 * Flush a JOP_SYNC to the journal.
6822 */
6823void
6824softdep_journal_fsync(ip)
6825	struct inode *ip;
6826{
6827	struct jfsync *jfsync;
6828	struct ufsmount *ump;
6829
6830	ump = ITOUMP(ip);
6831	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
6832	    ("softdep_journal_fsync called on non-softdep filesystem"));
6833	if ((ip->i_flag & IN_TRUNCATED) == 0)
6834		return;
6835	ip->i_flag &= ~IN_TRUNCATED;
6836	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6837	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
6838	jfsync->jfs_size = ip->i_size;
6839	jfsync->jfs_ino = ip->i_number;
6840	ACQUIRE_LOCK(ump);
6841	add_to_journal(&jfsync->jfs_list);
6842	jwait(&jfsync->jfs_list, MNT_WAIT);
6843	FREE_LOCK(ump);
6844}
6845
6846/*
6847 * Block de-allocation dependencies.
6848 *
6849 * When blocks are de-allocated, the on-disk pointers must be nullified before
6850 * the blocks are made available for use by other files.  (The true
6851 * requirement is that old pointers must be nullified before new on-disk
6852 * pointers are set.  We chose this slightly more stringent requirement to
6853 * reduce complexity.) Our implementation handles this dependency by updating
6854 * the inode (or indirect block) appropriately but delaying the actual block
6855 * de-allocation (i.e., freemap and free space count manipulation) until
6856 * after the updated versions reach stable storage.  After the disk is
6857 * updated, the blocks can be safely de-allocated whenever it is convenient.
6858 * This implementation handles only the common case of reducing a file's
6859 * length to zero. Other cases are handled by the conventional synchronous
6860 * write approach.
6861 *
6862 * The ffs implementation with which we worked double-checks
6863 * the state of the block pointers and file size as it reduces
6864 * a file's length.  Some of this code is replicated here in our
6865 * soft updates implementation.  The freeblks->fb_chkcnt field is
6866 * used to transfer a part of this information to the procedure
6867 * that eventually de-allocates the blocks.
6868 *
6869 * This routine should be called from the routine that shortens
6870 * a file's length, before the inode's size or block pointers
6871 * are modified. It will save the block pointer information for
6872 * later release and zero the inode so that the calling routine
6873 * can release it.
6874 */
6875void
6876softdep_setup_freeblocks(ip, length, flags)
6877	struct inode *ip;	/* The inode whose length is to be reduced */
6878	off_t length;		/* The new length for the file */
6879	int flags;		/* IO_EXT and/or IO_NORMAL */
6880{
6881	struct ufs1_dinode *dp1;
6882	struct ufs2_dinode *dp2;
6883	struct freeblks *freeblks;
6884	struct inodedep *inodedep;
6885	struct allocdirect *adp;
6886	struct ufsmount *ump;
6887	struct buf *bp;
6888	struct fs *fs;
6889	ufs2_daddr_t extblocks, datablocks;
6890	struct mount *mp;
6891	int i, delay, error;
6892	ufs_lbn_t tmpval;
6893	ufs_lbn_t lbn;
6894
6895	ump = ITOUMP(ip);
6896	mp = UFSTOVFS(ump);
6897	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6898	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
6899	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6900	    ip->i_number, length);
6901	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
6902	fs = ump->um_fs;
6903	if ((error = bread(ump->um_devvp,
6904	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6905	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6906		brelse(bp);
6907		softdep_error("softdep_setup_freeblocks", error);
6908		return;
6909	}
6910	freeblks = newfreeblks(mp, ip);
6911	extblocks = 0;
6912	datablocks = 0;
6913	if (fs->fs_magic == FS_UFS2_MAGIC)
6914		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6915	if ((flags & IO_NORMAL) != 0) {
6916		for (i = 0; i < UFS_NDADDR; i++)
6917			setup_freedirect(freeblks, ip, i, 0);
6918		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
6919		    i < UFS_NIADDR;
6920		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6921			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6922		ip->i_size = 0;
6923		DIP_SET(ip, i_size, 0);
6924		ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
6925		datablocks = DIP(ip, i_blocks) - extblocks;
6926	}
6927	if ((flags & IO_EXT) != 0) {
6928		for (i = 0; i < UFS_NXADDR; i++)
6929			setup_freeext(freeblks, ip, i, 0);
6930		ip->i_din2->di_extsize = 0;
6931		datablocks += extblocks;
6932		ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
6933	}
6934#ifdef QUOTA
6935	/* Reference the quotas in case the block count is wrong in the end. */
6936	quotaref(ITOV(ip), freeblks->fb_quota);
6937	(void) chkdq(ip, -datablocks, NOCRED, FORCE);
6938#endif
6939	freeblks->fb_chkcnt = -datablocks;
6940	UFS_LOCK(ump);
6941	fs->fs_pendingblocks += datablocks;
6942	UFS_UNLOCK(ump);
6943	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6944	/*
6945	 * Push the zero'ed inode to its disk buffer so that we are free
6946	 * to delete its dependencies below. Once the dependencies are gone
6947	 * the buffer can be safely released.
6948	 */
6949	if (ump->um_fstype == UFS1) {
6950		dp1 = ((struct ufs1_dinode *)bp->b_data +
6951		    ino_to_fsbo(fs, ip->i_number));
6952		ip->i_din1->di_freelink = dp1->di_freelink;
6953		*dp1 = *ip->i_din1;
6954	} else {
6955		dp2 = ((struct ufs2_dinode *)bp->b_data +
6956		    ino_to_fsbo(fs, ip->i_number));
6957		ip->i_din2->di_freelink = dp2->di_freelink;
6958		*dp2 = *ip->i_din2;
6959	}
6960	/*
6961	 * Find and eliminate any inode dependencies.
6962	 */
6963	ACQUIRE_LOCK(ump);
6964	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6965	if ((inodedep->id_state & IOSTARTED) != 0)
6966		panic("softdep_setup_freeblocks: inode busy");
6967	/*
6968	 * Add the freeblks structure to the list of operations that
6969	 * must await the zero'ed inode being written to disk. If we
6970	 * still have a bitmap dependency (delay == 0), then the inode
6971	 * has never been written to disk, so we can process the
6972	 * freeblks below once we have deleted the dependencies.
6973	 */
6974	delay = (inodedep->id_state & DEPCOMPLETE);
6975	if (delay)
6976		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6977	else
6978		freeblks->fb_state |= COMPLETE;
6979	/*
6980	 * Because the file length has been truncated to zero, any
6981	 * pending block allocation dependency structures associated
6982	 * with this inode are obsolete and can simply be de-allocated.
6983	 * We must first merge the two dependency lists to get rid of
6984	 * any duplicate freefrag structures, then purge the merged list.
6985	 * If we still have a bitmap dependency, then the inode has never
6986	 * been written to disk, so we can free any fragments without delay.
6987	 */
6988	if (flags & IO_NORMAL) {
6989		merge_inode_lists(&inodedep->id_newinoupdt,
6990		    &inodedep->id_inoupdt);
6991		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
6992			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6993			    freeblks);
6994	}
6995	if (flags & IO_EXT) {
6996		merge_inode_lists(&inodedep->id_newextupdt,
6997		    &inodedep->id_extupdt);
6998		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6999			cancel_allocdirect(&inodedep->id_extupdt, adp,
7000			    freeblks);
7001	}
7002	FREE_LOCK(ump);
7003	bdwrite(bp);
7004	trunc_dependencies(ip, freeblks, -1, 0, flags);
7005	ACQUIRE_LOCK(ump);
7006	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
7007		(void) free_inodedep(inodedep);
7008	freeblks->fb_state |= DEPCOMPLETE;
7009	/*
7010	 * If the inode with zeroed block pointers is now on disk
7011	 * we can start freeing blocks.
7012	 */
7013	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
7014		freeblks->fb_state |= INPROGRESS;
7015	else
7016		freeblks = NULL;
7017	FREE_LOCK(ump);
7018	if (freeblks)
7019		handle_workitem_freeblocks(freeblks, 0);
7020	trunc_pages(ip, length, extblocks, flags);
7021}
7022
7023/*
7024 * Eliminate pages from the page cache that back parts of this inode and
7025 * adjust the vnode pager's idea of our size.  This prevents stale data
7026 * from hanging around in the page cache.
7027 */
7028static void
7029trunc_pages(ip, length, extblocks, flags)
7030	struct inode *ip;
7031	off_t length;
7032	ufs2_daddr_t extblocks;
7033	int flags;
7034{
7035	struct vnode *vp;
7036	struct fs *fs;
7037	ufs_lbn_t lbn;
7038	off_t end, extend;
7039
7040	vp = ITOV(ip);
7041	fs = ITOFS(ip);
7042	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
7043	if ((flags & IO_EXT) != 0)
7044		vn_pages_remove(vp, extend, 0);
7045	if ((flags & IO_NORMAL) == 0)
7046		return;
7047	BO_LOCK(&vp->v_bufobj);
7048	drain_output(vp);
7049	BO_UNLOCK(&vp->v_bufobj);
7050	/*
7051	 * The vnode pager eliminates file pages we eliminate indirects
7052	 * below.
7053	 */
7054	vnode_pager_setsize(vp, length);
7055	/*
7056	 * Calculate the end based on the last indirect we want to keep.  If
7057	 * the block extends into indirects we can just use the negative of
7058	 * its lbn.  Doubles and triples exist at lower numbers so we must
7059	 * be careful not to remove those, if they exist.  double and triple
7060	 * indirect lbns do not overlap with others so it is not important
7061	 * to verify how many levels are required.
7062	 */
7063	lbn = lblkno(fs, length);
7064	if (lbn >= UFS_NDADDR) {
7065		/* Calculate the virtual lbn of the triple indirect. */
7066		lbn = -lbn - (UFS_NIADDR - 1);
7067		end = OFF_TO_IDX(lblktosize(fs, lbn));
7068	} else
7069		end = extend;
7070	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7071}
7072
7073/*
7074 * See if the buf bp is in the range eliminated by truncation.
7075 */
7076static int
7077trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
7078	struct buf *bp;
7079	int *blkoffp;
7080	ufs_lbn_t lastlbn;
7081	int lastoff;
7082	int flags;
7083{
7084	ufs_lbn_t lbn;
7085
7086	*blkoffp = 0;
7087	/* Only match ext/normal blocks as appropriate. */
7088	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7089	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7090		return (0);
7091	/* ALTDATA is always a full truncation. */
7092	if ((bp->b_xflags & BX_ALTDATA) != 0)
7093		return (1);
7094	/* -1 is full truncation. */
7095	if (lastlbn == -1)
7096		return (1);
7097	/*
7098	 * If this is a partial truncate we only want those
7099	 * blocks and indirect blocks that cover the range
7100	 * we're after.
7101	 */
7102	lbn = bp->b_lblkno;
7103	if (lbn < 0)
7104		lbn = -(lbn + lbn_level(lbn));
7105	if (lbn < lastlbn)
7106		return (0);
7107	/* Here we only truncate lblkno if it's partial. */
7108	if (lbn == lastlbn) {
7109		if (lastoff == 0)
7110			return (0);
7111		*blkoffp = lastoff;
7112	}
7113	return (1);
7114}
7115
7116/*
7117 * Eliminate any dependencies that exist in memory beyond lblkno:off
7118 */
7119static void
7120trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
7121	struct inode *ip;
7122	struct freeblks *freeblks;
7123	ufs_lbn_t lastlbn;
7124	int lastoff;
7125	int flags;
7126{
7127	struct bufobj *bo;
7128	struct vnode *vp;
7129	struct buf *bp;
7130	int blkoff;
7131
7132	/*
7133	 * We must wait for any I/O in progress to finish so that
7134	 * all potential buffers on the dirty list will be visible.
7135	 * Once they are all there, walk the list and get rid of
7136	 * any dependencies.
7137	 */
7138	vp = ITOV(ip);
7139	bo = &vp->v_bufobj;
7140	BO_LOCK(bo);
7141	drain_output(vp);
7142	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
7143		bp->b_vflags &= ~BV_SCANNED;
7144restart:
7145	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
7146		if (bp->b_vflags & BV_SCANNED)
7147			continue;
7148		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7149			bp->b_vflags |= BV_SCANNED;
7150			continue;
7151		}
7152		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
7153		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
7154			goto restart;
7155		BO_UNLOCK(bo);
7156		if (deallocate_dependencies(bp, freeblks, blkoff))
7157			bqrelse(bp);
7158		else
7159			brelse(bp);
7160		BO_LOCK(bo);
7161		goto restart;
7162	}
7163	/*
7164	 * Now do the work of vtruncbuf while also matching indirect blocks.
7165	 */
7166	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
7167		bp->b_vflags &= ~BV_SCANNED;
7168cleanrestart:
7169	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
7170		if (bp->b_vflags & BV_SCANNED)
7171			continue;
7172		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7173			bp->b_vflags |= BV_SCANNED;
7174			continue;
7175		}
7176		if (BUF_LOCK(bp,
7177		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
7178		    BO_LOCKPTR(bo)) == ENOLCK) {
7179			BO_LOCK(bo);
7180			goto cleanrestart;
7181		}
7182		BO_LOCK(bo);
7183		bp->b_vflags |= BV_SCANNED;
7184		BO_UNLOCK(bo);
7185		bremfree(bp);
7186		if (blkoff != 0) {
7187			allocbuf(bp, blkoff);
7188			bqrelse(bp);
7189		} else {
7190			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
7191			brelse(bp);
7192		}
7193		BO_LOCK(bo);
7194		goto cleanrestart;
7195	}
7196	drain_output(vp);
7197	BO_UNLOCK(bo);
7198}
7199
7200static int
7201cancel_pagedep(pagedep, freeblks, blkoff)
7202	struct pagedep *pagedep;
7203	struct freeblks *freeblks;
7204	int blkoff;
7205{
7206	struct jremref *jremref;
7207	struct jmvref *jmvref;
7208	struct dirrem *dirrem, *tmp;
7209	int i;
7210
7211	/*
7212	 * Copy any directory remove dependencies to the list
7213	 * to be processed after the freeblks proceeds.  If
7214	 * directory entry never made it to disk they
7215	 * can be dumped directly onto the work list.
7216	 */
7217	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7218		/* Skip this directory removal if it is intended to remain. */
7219		if (dirrem->dm_offset < blkoff)
7220			continue;
7221		/*
7222		 * If there are any dirrems we wait for the journal write
7223		 * to complete and then restart the buf scan as the lock
7224		 * has been dropped.
7225		 */
7226		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7227			jwait(&jremref->jr_list, MNT_WAIT);
7228			return (ERESTART);
7229		}
7230		LIST_REMOVE(dirrem, dm_next);
7231		dirrem->dm_dirinum = pagedep->pd_ino;
7232		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7233	}
7234	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7235		jwait(&jmvref->jm_list, MNT_WAIT);
7236		return (ERESTART);
7237	}
7238	/*
7239	 * When we're partially truncating a pagedep we just want to flush
7240	 * journal entries and return.  There can not be any adds in the
7241	 * truncated portion of the directory and newblk must remain if
7242	 * part of the block remains.
7243	 */
7244	if (blkoff != 0) {
7245		struct diradd *dap;
7246
7247		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7248			if (dap->da_offset > blkoff)
7249				panic("cancel_pagedep: diradd %p off %d > %d",
7250				    dap, dap->da_offset, blkoff);
7251		for (i = 0; i < DAHASHSZ; i++)
7252			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7253				if (dap->da_offset > blkoff)
7254					panic("cancel_pagedep: diradd %p off %d > %d",
7255					    dap, dap->da_offset, blkoff);
7256		return (0);
7257	}
7258	/*
7259	 * There should be no directory add dependencies present
7260	 * as the directory could not be truncated until all
7261	 * children were removed.
7262	 */
7263	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7264	    ("deallocate_dependencies: pendinghd != NULL"));
7265	for (i = 0; i < DAHASHSZ; i++)
7266		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7267		    ("deallocate_dependencies: diraddhd != NULL"));
7268	if ((pagedep->pd_state & NEWBLOCK) != 0)
7269		free_newdirblk(pagedep->pd_newdirblk);
7270	if (free_pagedep(pagedep) == 0)
7271		panic("Failed to free pagedep %p", pagedep);
7272	return (0);
7273}
7274
7275/*
7276 * Reclaim any dependency structures from a buffer that is about to
7277 * be reallocated to a new vnode. The buffer must be locked, thus,
7278 * no I/O completion operations can occur while we are manipulating
7279 * its associated dependencies. The mutex is held so that other I/O's
7280 * associated with related dependencies do not occur.
7281 */
7282static int
7283deallocate_dependencies(bp, freeblks, off)
7284	struct buf *bp;
7285	struct freeblks *freeblks;
7286	int off;
7287{
7288	struct indirdep *indirdep;
7289	struct pagedep *pagedep;
7290	struct worklist *wk, *wkn;
7291	struct ufsmount *ump;
7292
7293	ump = softdep_bp_to_mp(bp);
7294	if (ump == NULL)
7295		goto done;
7296	ACQUIRE_LOCK(ump);
7297	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7298		switch (wk->wk_type) {
7299		case D_INDIRDEP:
7300			indirdep = WK_INDIRDEP(wk);
7301			if (bp->b_lblkno >= 0 ||
7302			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7303				panic("deallocate_dependencies: not indir");
7304			cancel_indirdep(indirdep, bp, freeblks);
7305			continue;
7306
7307		case D_PAGEDEP:
7308			pagedep = WK_PAGEDEP(wk);
7309			if (cancel_pagedep(pagedep, freeblks, off)) {
7310				FREE_LOCK(ump);
7311				return (ERESTART);
7312			}
7313			continue;
7314
7315		case D_ALLOCINDIR:
7316			/*
7317			 * Simply remove the allocindir, we'll find it via
7318			 * the indirdep where we can clear pointers if
7319			 * needed.
7320			 */
7321			WORKLIST_REMOVE(wk);
7322			continue;
7323
7324		case D_FREEWORK:
7325			/*
7326			 * A truncation is waiting for the zero'd pointers
7327			 * to be written.  It can be freed when the freeblks
7328			 * is journaled.
7329			 */
7330			WORKLIST_REMOVE(wk);
7331			wk->wk_state |= ONDEPLIST;
7332			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7333			break;
7334
7335		case D_ALLOCDIRECT:
7336			if (off != 0)
7337				continue;
7338			/* FALLTHROUGH */
7339		default:
7340			panic("deallocate_dependencies: Unexpected type %s",
7341			    TYPENAME(wk->wk_type));
7342			/* NOTREACHED */
7343		}
7344	}
7345	FREE_LOCK(ump);
7346done:
7347	/*
7348	 * Don't throw away this buf, we were partially truncating and
7349	 * some deps may always remain.
7350	 */
7351	if (off) {
7352		allocbuf(bp, off);
7353		bp->b_vflags |= BV_SCANNED;
7354		return (EBUSY);
7355	}
7356	bp->b_flags |= B_INVAL | B_NOCACHE;
7357
7358	return (0);
7359}
7360
7361/*
7362 * An allocdirect is being canceled due to a truncate.  We must make sure
7363 * the journal entry is released in concert with the blkfree that releases
7364 * the storage.  Completed journal entries must not be released until the
7365 * space is no longer pointed to by the inode or in the bitmap.
7366 */
7367static void
7368cancel_allocdirect(adphead, adp, freeblks)
7369	struct allocdirectlst *adphead;
7370	struct allocdirect *adp;
7371	struct freeblks *freeblks;
7372{
7373	struct freework *freework;
7374	struct newblk *newblk;
7375	struct worklist *wk;
7376
7377	TAILQ_REMOVE(adphead, adp, ad_next);
7378	newblk = (struct newblk *)adp;
7379	freework = NULL;
7380	/*
7381	 * Find the correct freework structure.
7382	 */
7383	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7384		if (wk->wk_type != D_FREEWORK)
7385			continue;
7386		freework = WK_FREEWORK(wk);
7387		if (freework->fw_blkno == newblk->nb_newblkno)
7388			break;
7389	}
7390	if (freework == NULL)
7391		panic("cancel_allocdirect: Freework not found");
7392	/*
7393	 * If a newblk exists at all we still have the journal entry that
7394	 * initiated the allocation so we do not need to journal the free.
7395	 */
7396	cancel_jfreeblk(freeblks, freework->fw_blkno);
7397	/*
7398	 * If the journal hasn't been written the jnewblk must be passed
7399	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7400	 * this by linking the journal dependency into the freework to be
7401	 * freed when freework_freeblock() is called.  If the journal has
7402	 * been written we can simply reclaim the journal space when the
7403	 * freeblks work is complete.
7404	 */
7405	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7406	    &freeblks->fb_jwork);
7407	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7408}
7409
7410
7411/*
7412 * Cancel a new block allocation.  May be an indirect or direct block.  We
7413 * remove it from various lists and return any journal record that needs to
7414 * be resolved by the caller.
7415 *
7416 * A special consideration is made for indirects which were never pointed
7417 * at on disk and will never be found once this block is released.
7418 */
7419static struct jnewblk *
7420cancel_newblk(newblk, wk, wkhd)
7421	struct newblk *newblk;
7422	struct worklist *wk;
7423	struct workhead *wkhd;
7424{
7425	struct jnewblk *jnewblk;
7426
7427	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7428
7429	newblk->nb_state |= GOINGAWAY;
7430	/*
7431	 * Previously we traversed the completedhd on each indirdep
7432	 * attached to this newblk to cancel them and gather journal
7433	 * work.  Since we need only the oldest journal segment and
7434	 * the lowest point on the tree will always have the oldest
7435	 * journal segment we are free to release the segments
7436	 * of any subordinates and may leave the indirdep list to
7437	 * indirdep_complete() when this newblk is freed.
7438	 */
7439	if (newblk->nb_state & ONDEPLIST) {
7440		newblk->nb_state &= ~ONDEPLIST;
7441		LIST_REMOVE(newblk, nb_deps);
7442	}
7443	if (newblk->nb_state & ONWORKLIST)
7444		WORKLIST_REMOVE(&newblk->nb_list);
7445	/*
7446	 * If the journal entry hasn't been written we save a pointer to
7447	 * the dependency that frees it until it is written or the
7448	 * superseding operation completes.
7449	 */
7450	jnewblk = newblk->nb_jnewblk;
7451	if (jnewblk != NULL && wk != NULL) {
7452		newblk->nb_jnewblk = NULL;
7453		jnewblk->jn_dep = wk;
7454	}
7455	if (!LIST_EMPTY(&newblk->nb_jwork))
7456		jwork_move(wkhd, &newblk->nb_jwork);
7457	/*
7458	 * When truncating we must free the newdirblk early to remove
7459	 * the pagedep from the hash before returning.
7460	 */
7461	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7462		free_newdirblk(WK_NEWDIRBLK(wk));
7463	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7464		panic("cancel_newblk: extra newdirblk");
7465
7466	return (jnewblk);
7467}
7468
7469/*
7470 * Schedule the freefrag associated with a newblk to be released once
7471 * the pointers are written and the previous block is no longer needed.
7472 */
7473static void
7474newblk_freefrag(newblk)
7475	struct newblk *newblk;
7476{
7477	struct freefrag *freefrag;
7478
7479	if (newblk->nb_freefrag == NULL)
7480		return;
7481	freefrag = newblk->nb_freefrag;
7482	newblk->nb_freefrag = NULL;
7483	freefrag->ff_state |= COMPLETE;
7484	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7485		add_to_worklist(&freefrag->ff_list, 0);
7486}
7487
7488/*
7489 * Free a newblk. Generate a new freefrag work request if appropriate.
7490 * This must be called after the inode pointer and any direct block pointers
7491 * are valid or fully removed via truncate or frag extension.
7492 */
7493static void
7494free_newblk(newblk)
7495	struct newblk *newblk;
7496{
7497	struct indirdep *indirdep;
7498	struct worklist *wk;
7499
7500	KASSERT(newblk->nb_jnewblk == NULL,
7501	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7502	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7503	    ("free_newblk: unclaimed newblk"));
7504	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7505	newblk_freefrag(newblk);
7506	if (newblk->nb_state & ONDEPLIST)
7507		LIST_REMOVE(newblk, nb_deps);
7508	if (newblk->nb_state & ONWORKLIST)
7509		WORKLIST_REMOVE(&newblk->nb_list);
7510	LIST_REMOVE(newblk, nb_hash);
7511	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7512		free_newdirblk(WK_NEWDIRBLK(wk));
7513	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7514		panic("free_newblk: extra newdirblk");
7515	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7516		indirdep_complete(indirdep);
7517	handle_jwork(&newblk->nb_jwork);
7518	WORKITEM_FREE(newblk, D_NEWBLK);
7519}
7520
7521/*
7522 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7523 */
7524static void
7525free_newdirblk(newdirblk)
7526	struct newdirblk *newdirblk;
7527{
7528	struct pagedep *pagedep;
7529	struct diradd *dap;
7530	struct worklist *wk;
7531
7532	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7533	WORKLIST_REMOVE(&newdirblk->db_list);
7534	/*
7535	 * If the pagedep is still linked onto the directory buffer
7536	 * dependency chain, then some of the entries on the
7537	 * pd_pendinghd list may not be committed to disk yet. In
7538	 * this case, we will simply clear the NEWBLOCK flag and
7539	 * let the pd_pendinghd list be processed when the pagedep
7540	 * is next written. If the pagedep is no longer on the buffer
7541	 * dependency chain, then all the entries on the pd_pending
7542	 * list are committed to disk and we can free them here.
7543	 */
7544	pagedep = newdirblk->db_pagedep;
7545	pagedep->pd_state &= ~NEWBLOCK;
7546	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7547		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7548			free_diradd(dap, NULL);
7549		/*
7550		 * If no dependencies remain, the pagedep will be freed.
7551		 */
7552		free_pagedep(pagedep);
7553	}
7554	/* Should only ever be one item in the list. */
7555	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7556		WORKLIST_REMOVE(wk);
7557		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7558	}
7559	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7560}
7561
7562/*
7563 * Prepare an inode to be freed. The actual free operation is not
7564 * done until the zero'ed inode has been written to disk.
7565 */
7566void
7567softdep_freefile(pvp, ino, mode)
7568	struct vnode *pvp;
7569	ino_t ino;
7570	int mode;
7571{
7572	struct inode *ip = VTOI(pvp);
7573	struct inodedep *inodedep;
7574	struct freefile *freefile;
7575	struct freeblks *freeblks;
7576	struct ufsmount *ump;
7577
7578	ump = ITOUMP(ip);
7579	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7580	    ("softdep_freefile called on non-softdep filesystem"));
7581	/*
7582	 * This sets up the inode de-allocation dependency.
7583	 */
7584	freefile = malloc(sizeof(struct freefile),
7585		M_FREEFILE, M_SOFTDEP_FLAGS);
7586	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7587	freefile->fx_mode = mode;
7588	freefile->fx_oldinum = ino;
7589	freefile->fx_devvp = ump->um_devvp;
7590	LIST_INIT(&freefile->fx_jwork);
7591	UFS_LOCK(ump);
7592	ump->um_fs->fs_pendinginodes += 1;
7593	UFS_UNLOCK(ump);
7594
7595	/*
7596	 * If the inodedep does not exist, then the zero'ed inode has
7597	 * been written to disk. If the allocated inode has never been
7598	 * written to disk, then the on-disk inode is zero'ed. In either
7599	 * case we can free the file immediately.  If the journal was
7600	 * canceled before being written the inode will never make it to
7601	 * disk and we must send the canceled journal entrys to
7602	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7603	 * Any blocks waiting on the inode to write can be safely freed
7604	 * here as it will never been written.
7605	 */
7606	ACQUIRE_LOCK(ump);
7607	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7608	if (inodedep) {
7609		/*
7610		 * Clear out freeblks that no longer need to reference
7611		 * this inode.
7612		 */
7613		while ((freeblks =
7614		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7615			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7616			    fb_next);
7617			freeblks->fb_state &= ~ONDEPLIST;
7618		}
7619		/*
7620		 * Remove this inode from the unlinked list.
7621		 */
7622		if (inodedep->id_state & UNLINKED) {
7623			/*
7624			 * Save the journal work to be freed with the bitmap
7625			 * before we clear UNLINKED.  Otherwise it can be lost
7626			 * if the inode block is written.
7627			 */
7628			handle_bufwait(inodedep, &freefile->fx_jwork);
7629			clear_unlinked_inodedep(inodedep);
7630			/*
7631			 * Re-acquire inodedep as we've dropped the
7632			 * per-filesystem lock in clear_unlinked_inodedep().
7633			 */
7634			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7635		}
7636	}
7637	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7638		FREE_LOCK(ump);
7639		handle_workitem_freefile(freefile);
7640		return;
7641	}
7642	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7643		inodedep->id_state |= GOINGAWAY;
7644	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7645	FREE_LOCK(ump);
7646	if (ip->i_number == ino)
7647		ip->i_flag |= IN_MODIFIED;
7648}
7649
7650/*
7651 * Check to see if an inode has never been written to disk. If
7652 * so free the inodedep and return success, otherwise return failure.
7653 *
7654 * If we still have a bitmap dependency, then the inode has never
7655 * been written to disk. Drop the dependency as it is no longer
7656 * necessary since the inode is being deallocated. We set the
7657 * ALLCOMPLETE flags since the bitmap now properly shows that the
7658 * inode is not allocated. Even if the inode is actively being
7659 * written, it has been rolled back to its zero'ed state, so we
7660 * are ensured that a zero inode is what is on the disk. For short
7661 * lived files, this change will usually result in removing all the
7662 * dependencies from the inode so that it can be freed immediately.
7663 */
7664static int
7665check_inode_unwritten(inodedep)
7666	struct inodedep *inodedep;
7667{
7668
7669	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7670
7671	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7672	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7673	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7674	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7675	    !LIST_EMPTY(&inodedep->id_inowait) ||
7676	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7677	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7678	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7679	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7680	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7681	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7682	    inodedep->id_mkdiradd != NULL ||
7683	    inodedep->id_nlinkdelta != 0)
7684		return (0);
7685	/*
7686	 * Another process might be in initiate_write_inodeblock_ufs[12]
7687	 * trying to allocate memory without holding "Softdep Lock".
7688	 */
7689	if ((inodedep->id_state & IOSTARTED) != 0 &&
7690	    inodedep->id_savedino1 == NULL)
7691		return (0);
7692
7693	if (inodedep->id_state & ONDEPLIST)
7694		LIST_REMOVE(inodedep, id_deps);
7695	inodedep->id_state &= ~ONDEPLIST;
7696	inodedep->id_state |= ALLCOMPLETE;
7697	inodedep->id_bmsafemap = NULL;
7698	if (inodedep->id_state & ONWORKLIST)
7699		WORKLIST_REMOVE(&inodedep->id_list);
7700	if (inodedep->id_savedino1 != NULL) {
7701		free(inodedep->id_savedino1, M_SAVEDINO);
7702		inodedep->id_savedino1 = NULL;
7703	}
7704	if (free_inodedep(inodedep) == 0)
7705		panic("check_inode_unwritten: busy inode");
7706	return (1);
7707}
7708
7709static int
7710check_inodedep_free(inodedep)
7711	struct inodedep *inodedep;
7712{
7713
7714	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7715	if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7716	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7717	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7718	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7719	    !LIST_EMPTY(&inodedep->id_inowait) ||
7720	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7721	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7722	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7723	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7724	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7725	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7726	    inodedep->id_mkdiradd != NULL ||
7727	    inodedep->id_nlinkdelta != 0 ||
7728	    inodedep->id_savedino1 != NULL)
7729		return (0);
7730	return (1);
7731}
7732
7733/*
7734 * Try to free an inodedep structure. Return 1 if it could be freed.
7735 */
7736static int
7737free_inodedep(inodedep)
7738	struct inodedep *inodedep;
7739{
7740
7741	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7742	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7743	    !check_inodedep_free(inodedep))
7744		return (0);
7745	if (inodedep->id_state & ONDEPLIST)
7746		LIST_REMOVE(inodedep, id_deps);
7747	LIST_REMOVE(inodedep, id_hash);
7748	WORKITEM_FREE(inodedep, D_INODEDEP);
7749	return (1);
7750}
7751
7752/*
7753 * Free the block referenced by a freework structure.  The parent freeblks
7754 * structure is released and completed when the final cg bitmap reaches
7755 * the disk.  This routine may be freeing a jnewblk which never made it to
7756 * disk in which case we do not have to wait as the operation is undone
7757 * in memory immediately.
7758 */
7759static void
7760freework_freeblock(freework, key)
7761	struct freework *freework;
7762	u_long key;
7763{
7764	struct freeblks *freeblks;
7765	struct jnewblk *jnewblk;
7766	struct ufsmount *ump;
7767	struct workhead wkhd;
7768	struct fs *fs;
7769	int bsize;
7770	int needj;
7771
7772	ump = VFSTOUFS(freework->fw_list.wk_mp);
7773	LOCK_OWNED(ump);
7774	/*
7775	 * Handle partial truncate separately.
7776	 */
7777	if (freework->fw_indir) {
7778		complete_trunc_indir(freework);
7779		return;
7780	}
7781	freeblks = freework->fw_freeblks;
7782	fs = ump->um_fs;
7783	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7784	bsize = lfragtosize(fs, freework->fw_frags);
7785	LIST_INIT(&wkhd);
7786	/*
7787	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7788	 * on the indirblk hashtable and prevents premature freeing.
7789	 */
7790	freework->fw_state |= DEPCOMPLETE;
7791	/*
7792	 * SUJ needs to wait for the segment referencing freed indirect
7793	 * blocks to expire so that we know the checker will not confuse
7794	 * a re-allocated indirect block with its old contents.
7795	 */
7796	if (needj && freework->fw_lbn <= -UFS_NDADDR)
7797		indirblk_insert(freework);
7798	/*
7799	 * If we are canceling an existing jnewblk pass it to the free
7800	 * routine, otherwise pass the freeblk which will ultimately
7801	 * release the freeblks.  If we're not journaling, we can just
7802	 * free the freeblks immediately.
7803	 */
7804	jnewblk = freework->fw_jnewblk;
7805	if (jnewblk != NULL) {
7806		cancel_jnewblk(jnewblk, &wkhd);
7807		needj = 0;
7808	} else if (needj) {
7809		freework->fw_state |= DELAYEDFREE;
7810		freeblks->fb_cgwait++;
7811		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7812	}
7813	FREE_LOCK(ump);
7814	freeblks_free(ump, freeblks, btodb(bsize));
7815	CTR4(KTR_SUJ,
7816	    "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
7817	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7818	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7819	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
7820	ACQUIRE_LOCK(ump);
7821	/*
7822	 * The jnewblk will be discarded and the bits in the map never
7823	 * made it to disk.  We can immediately free the freeblk.
7824	 */
7825	if (needj == 0)
7826		handle_written_freework(freework);
7827}
7828
7829/*
7830 * We enqueue freework items that need processing back on the freeblks and
7831 * add the freeblks to the worklist.  This makes it easier to find all work
7832 * required to flush a truncation in process_truncates().
7833 */
7834static void
7835freework_enqueue(freework)
7836	struct freework *freework;
7837{
7838	struct freeblks *freeblks;
7839
7840	freeblks = freework->fw_freeblks;
7841	if ((freework->fw_state & INPROGRESS) == 0)
7842		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7843	if ((freeblks->fb_state &
7844	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7845	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7846		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7847}
7848
7849/*
7850 * Start, continue, or finish the process of freeing an indirect block tree.
7851 * The free operation may be paused at any point with fw_off containing the
7852 * offset to restart from.  This enables us to implement some flow control
7853 * for large truncates which may fan out and generate a huge number of
7854 * dependencies.
7855 */
7856static void
7857handle_workitem_indirblk(freework)
7858	struct freework *freework;
7859{
7860	struct freeblks *freeblks;
7861	struct ufsmount *ump;
7862	struct fs *fs;
7863
7864	freeblks = freework->fw_freeblks;
7865	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7866	fs = ump->um_fs;
7867	if (freework->fw_state & DEPCOMPLETE) {
7868		handle_written_freework(freework);
7869		return;
7870	}
7871	if (freework->fw_off == NINDIR(fs)) {
7872		freework_freeblock(freework, SINGLETON_KEY);
7873		return;
7874	}
7875	freework->fw_state |= INPROGRESS;
7876	FREE_LOCK(ump);
7877	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7878	    freework->fw_lbn);
7879	ACQUIRE_LOCK(ump);
7880}
7881
7882/*
7883 * Called when a freework structure attached to a cg buf is written.  The
7884 * ref on either the parent or the freeblks structure is released and
7885 * the freeblks is added back to the worklist if there is more work to do.
7886 */
7887static void
7888handle_written_freework(freework)
7889	struct freework *freework;
7890{
7891	struct freeblks *freeblks;
7892	struct freework *parent;
7893
7894	freeblks = freework->fw_freeblks;
7895	parent = freework->fw_parent;
7896	if (freework->fw_state & DELAYEDFREE)
7897		freeblks->fb_cgwait--;
7898	freework->fw_state |= COMPLETE;
7899	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7900		WORKITEM_FREE(freework, D_FREEWORK);
7901	if (parent) {
7902		if (--parent->fw_ref == 0)
7903			freework_enqueue(parent);
7904		return;
7905	}
7906	if (--freeblks->fb_ref != 0)
7907		return;
7908	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7909	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7910		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7911}
7912
7913/*
7914 * This workitem routine performs the block de-allocation.
7915 * The workitem is added to the pending list after the updated
7916 * inode block has been written to disk.  As mentioned above,
7917 * checks regarding the number of blocks de-allocated (compared
7918 * to the number of blocks allocated for the file) are also
7919 * performed in this function.
7920 */
7921static int
7922handle_workitem_freeblocks(freeblks, flags)
7923	struct freeblks *freeblks;
7924	int flags;
7925{
7926	struct freework *freework;
7927	struct newblk *newblk;
7928	struct allocindir *aip;
7929	struct ufsmount *ump;
7930	struct worklist *wk;
7931	u_long key;
7932
7933	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7934	    ("handle_workitem_freeblocks: Journal entries not written."));
7935	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7936	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
7937	ACQUIRE_LOCK(ump);
7938	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7939		WORKLIST_REMOVE(wk);
7940		switch (wk->wk_type) {
7941		case D_DIRREM:
7942			wk->wk_state |= COMPLETE;
7943			add_to_worklist(wk, 0);
7944			continue;
7945
7946		case D_ALLOCDIRECT:
7947			free_newblk(WK_NEWBLK(wk));
7948			continue;
7949
7950		case D_ALLOCINDIR:
7951			aip = WK_ALLOCINDIR(wk);
7952			freework = NULL;
7953			if (aip->ai_state & DELAYEDFREE) {
7954				FREE_LOCK(ump);
7955				freework = newfreework(ump, freeblks, NULL,
7956				    aip->ai_lbn, aip->ai_newblkno,
7957				    ump->um_fs->fs_frag, 0, 0);
7958				ACQUIRE_LOCK(ump);
7959			}
7960			newblk = WK_NEWBLK(wk);
7961			if (newblk->nb_jnewblk) {
7962				freework->fw_jnewblk = newblk->nb_jnewblk;
7963				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7964				newblk->nb_jnewblk = NULL;
7965			}
7966			free_newblk(newblk);
7967			continue;
7968
7969		case D_FREEWORK:
7970			freework = WK_FREEWORK(wk);
7971			if (freework->fw_lbn <= -UFS_NDADDR)
7972				handle_workitem_indirblk(freework);
7973			else
7974				freework_freeblock(freework, key);
7975			continue;
7976		default:
7977			panic("handle_workitem_freeblocks: Unknown type %s",
7978			    TYPENAME(wk->wk_type));
7979		}
7980	}
7981	if (freeblks->fb_ref != 0) {
7982		freeblks->fb_state &= ~INPROGRESS;
7983		wake_worklist(&freeblks->fb_list);
7984		freeblks = NULL;
7985	}
7986	FREE_LOCK(ump);
7987	ffs_blkrelease_finish(ump, key);
7988	if (freeblks)
7989		return handle_complete_freeblocks(freeblks, flags);
7990	return (0);
7991}
7992
7993/*
7994 * Handle completion of block free via truncate.  This allows fs_pending
7995 * to track the actual free block count more closely than if we only updated
7996 * it at the end.  We must be careful to handle cases where the block count
7997 * on free was incorrect.
7998 */
7999static void
8000freeblks_free(ump, freeblks, blocks)
8001	struct ufsmount *ump;
8002	struct freeblks *freeblks;
8003	int blocks;
8004{
8005	struct fs *fs;
8006	ufs2_daddr_t remain;
8007
8008	UFS_LOCK(ump);
8009	remain = -freeblks->fb_chkcnt;
8010	freeblks->fb_chkcnt += blocks;
8011	if (remain > 0) {
8012		if (remain < blocks)
8013			blocks = remain;
8014		fs = ump->um_fs;
8015		fs->fs_pendingblocks -= blocks;
8016	}
8017	UFS_UNLOCK(ump);
8018}
8019
8020/*
8021 * Once all of the freework workitems are complete we can retire the
8022 * freeblocks dependency and any journal work awaiting completion.  This
8023 * can not be called until all other dependencies are stable on disk.
8024 */
8025static int
8026handle_complete_freeblocks(freeblks, flags)
8027	struct freeblks *freeblks;
8028	int flags;
8029{
8030	struct inodedep *inodedep;
8031	struct inode *ip;
8032	struct vnode *vp;
8033	struct fs *fs;
8034	struct ufsmount *ump;
8035	ufs2_daddr_t spare;
8036
8037	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8038	fs = ump->um_fs;
8039	flags = LK_EXCLUSIVE | flags;
8040	spare = freeblks->fb_chkcnt;
8041
8042	/*
8043	 * If we did not release the expected number of blocks we may have
8044	 * to adjust the inode block count here.  Only do so if it wasn't
8045	 * a truncation to zero and the modrev still matches.
8046	 */
8047	if (spare && freeblks->fb_len != 0) {
8048		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8049		    flags, &vp, FFSV_FORCEINSMQ) != 0)
8050			return (EBUSY);
8051		ip = VTOI(vp);
8052		if (ip->i_mode == 0) {
8053			vgone(vp);
8054		} else if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
8055			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
8056			ip->i_flag |= IN_CHANGE;
8057			/*
8058			 * We must wait so this happens before the
8059			 * journal is reclaimed.
8060			 */
8061			ffs_update(vp, 1);
8062		}
8063		vput(vp);
8064	}
8065	if (spare < 0) {
8066		UFS_LOCK(ump);
8067		fs->fs_pendingblocks += spare;
8068		UFS_UNLOCK(ump);
8069	}
8070#ifdef QUOTA
8071	/* Handle spare. */
8072	if (spare)
8073		quotaadj(freeblks->fb_quota, ump, -spare);
8074	quotarele(freeblks->fb_quota);
8075#endif
8076	ACQUIRE_LOCK(ump);
8077	if (freeblks->fb_state & ONDEPLIST) {
8078		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8079		    0, &inodedep);
8080		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8081		freeblks->fb_state &= ~ONDEPLIST;
8082		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8083			free_inodedep(inodedep);
8084	}
8085	/*
8086	 * All of the freeblock deps must be complete prior to this call
8087	 * so it's now safe to complete earlier outstanding journal entries.
8088	 */
8089	handle_jwork(&freeblks->fb_jwork);
8090	WORKITEM_FREE(freeblks, D_FREEBLKS);
8091	FREE_LOCK(ump);
8092	return (0);
8093}
8094
8095/*
8096 * Release blocks associated with the freeblks and stored in the indirect
8097 * block dbn. If level is greater than SINGLE, the block is an indirect block
8098 * and recursive calls to indirtrunc must be used to cleanse other indirect
8099 * blocks.
8100 *
8101 * This handles partial and complete truncation of blocks.  Partial is noted
8102 * with goingaway == 0.  In this case the freework is completed after the
8103 * zero'd indirects are written to disk.  For full truncation the freework
8104 * is completed after the block is freed.
8105 */
8106static void
8107indir_trunc(freework, dbn, lbn)
8108	struct freework *freework;
8109	ufs2_daddr_t dbn;
8110	ufs_lbn_t lbn;
8111{
8112	struct freework *nfreework;
8113	struct workhead wkhd;
8114	struct freeblks *freeblks;
8115	struct buf *bp;
8116	struct fs *fs;
8117	struct indirdep *indirdep;
8118	struct mount *mp;
8119	struct ufsmount *ump;
8120	ufs1_daddr_t *bap1;
8121	ufs2_daddr_t nb, nnb, *bap2;
8122	ufs_lbn_t lbnadd, nlbn;
8123	u_long key;
8124	int nblocks, ufs1fmt, freedblocks;
8125	int goingaway, freedeps, needj, level, cnt, i;
8126
8127	freeblks = freework->fw_freeblks;
8128	mp = freeblks->fb_list.wk_mp;
8129	ump = VFSTOUFS(mp);
8130	fs = ump->um_fs;
8131	/*
8132	 * Get buffer of block pointers to be freed.  There are three cases:
8133	 *
8134	 * 1) Partial truncate caches the indirdep pointer in the freework
8135	 *    which provides us a back copy to the save bp which holds the
8136	 *    pointers we want to clear.  When this completes the zero
8137	 *    pointers are written to the real copy.
8138	 * 2) The indirect is being completely truncated, cancel_indirdep()
8139	 *    eliminated the real copy and placed the indirdep on the saved
8140	 *    copy.  The indirdep and buf are discarded when this completes.
8141	 * 3) The indirect was not in memory, we read a copy off of the disk
8142	 *    using the devvp and drop and invalidate the buffer when we're
8143	 *    done.
8144	 */
8145	goingaway = 1;
8146	indirdep = NULL;
8147	if (freework->fw_indir != NULL) {
8148		goingaway = 0;
8149		indirdep = freework->fw_indir;
8150		bp = indirdep->ir_savebp;
8151		if (bp == NULL || bp->b_blkno != dbn)
8152			panic("indir_trunc: Bad saved buf %p blkno %jd",
8153			    bp, (intmax_t)dbn);
8154	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8155		/*
8156		 * The lock prevents the buf dep list from changing and
8157	 	 * indirects on devvp should only ever have one dependency.
8158		 */
8159		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8160		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8161			panic("indir_trunc: Bad indirdep %p from buf %p",
8162			    indirdep, bp);
8163	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
8164	    NOCRED, &bp) != 0) {
8165		brelse(bp);
8166		return;
8167	}
8168	ACQUIRE_LOCK(ump);
8169	/* Protects against a race with complete_trunc_indir(). */
8170	freework->fw_state &= ~INPROGRESS;
8171	/*
8172	 * If we have an indirdep we need to enforce the truncation order
8173	 * and discard it when it is complete.
8174	 */
8175	if (indirdep) {
8176		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8177		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8178			/*
8179			 * Add the complete truncate to the list on the
8180			 * indirdep to enforce in-order processing.
8181			 */
8182			if (freework->fw_indir == NULL)
8183				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8184				    freework, fw_next);
8185			FREE_LOCK(ump);
8186			return;
8187		}
8188		/*
8189		 * If we're goingaway, free the indirdep.  Otherwise it will
8190		 * linger until the write completes.
8191		 */
8192		if (goingaway)
8193			free_indirdep(indirdep);
8194	}
8195	FREE_LOCK(ump);
8196	/* Initialize pointers depending on block size. */
8197	if (ump->um_fstype == UFS1) {
8198		bap1 = (ufs1_daddr_t *)bp->b_data;
8199		nb = bap1[freework->fw_off];
8200		ufs1fmt = 1;
8201		bap2 = NULL;
8202	} else {
8203		bap2 = (ufs2_daddr_t *)bp->b_data;
8204		nb = bap2[freework->fw_off];
8205		ufs1fmt = 0;
8206		bap1 = NULL;
8207	}
8208	level = lbn_level(lbn);
8209	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8210	lbnadd = lbn_offset(fs, level);
8211	nblocks = btodb(fs->fs_bsize);
8212	nfreework = freework;
8213	freedeps = 0;
8214	cnt = 0;
8215	/*
8216	 * Reclaim blocks.  Traverses into nested indirect levels and
8217	 * arranges for the current level to be freed when subordinates
8218	 * are free when journaling.
8219	 */
8220	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
8221	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8222		if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb,
8223		    fs->fs_bsize) != 0)
8224			nb = 0;
8225		if (i != NINDIR(fs) - 1) {
8226			if (ufs1fmt)
8227				nnb = bap1[i+1];
8228			else
8229				nnb = bap2[i+1];
8230		} else
8231			nnb = 0;
8232		if (nb == 0)
8233			continue;
8234		cnt++;
8235		if (level != 0) {
8236			nlbn = (lbn + 1) - (i * lbnadd);
8237			if (needj != 0) {
8238				nfreework = newfreework(ump, freeblks, freework,
8239				    nlbn, nb, fs->fs_frag, 0, 0);
8240				freedeps++;
8241			}
8242			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8243		} else {
8244			struct freedep *freedep;
8245
8246			/*
8247			 * Attempt to aggregate freedep dependencies for
8248			 * all blocks being released to the same CG.
8249			 */
8250			LIST_INIT(&wkhd);
8251			if (needj != 0 &&
8252			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8253				freedep = newfreedep(freework);
8254				WORKLIST_INSERT_UNLOCKED(&wkhd,
8255				    &freedep->fd_list);
8256				freedeps++;
8257			}
8258			CTR3(KTR_SUJ,
8259			    "indir_trunc: ino %jd blkno %jd size %d",
8260			    freeblks->fb_inum, nb, fs->fs_bsize);
8261			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8262			    fs->fs_bsize, freeblks->fb_inum,
8263			    freeblks->fb_vtype, &wkhd, key);
8264		}
8265	}
8266	ffs_blkrelease_finish(ump, key);
8267	if (goingaway) {
8268		bp->b_flags |= B_INVAL | B_NOCACHE;
8269		brelse(bp);
8270	}
8271	freedblocks = 0;
8272	if (level == 0)
8273		freedblocks = (nblocks * cnt);
8274	if (needj == 0)
8275		freedblocks += nblocks;
8276	freeblks_free(ump, freeblks, freedblocks);
8277	/*
8278	 * If we are journaling set up the ref counts and offset so this
8279	 * indirect can be completed when its children are free.
8280	 */
8281	if (needj) {
8282		ACQUIRE_LOCK(ump);
8283		freework->fw_off = i;
8284		freework->fw_ref += freedeps;
8285		freework->fw_ref -= NINDIR(fs) + 1;
8286		if (level == 0)
8287			freeblks->fb_cgwait += freedeps;
8288		if (freework->fw_ref == 0)
8289			freework_freeblock(freework, SINGLETON_KEY);
8290		FREE_LOCK(ump);
8291		return;
8292	}
8293	/*
8294	 * If we're not journaling we can free the indirect now.
8295	 */
8296	dbn = dbtofsb(fs, dbn);
8297	CTR3(KTR_SUJ,
8298	    "indir_trunc 2: ino %jd blkno %jd size %d",
8299	    freeblks->fb_inum, dbn, fs->fs_bsize);
8300	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8301	    freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
8302	/* Non SUJ softdep does single-threaded truncations. */
8303	if (freework->fw_blkno == dbn) {
8304		freework->fw_state |= ALLCOMPLETE;
8305		ACQUIRE_LOCK(ump);
8306		handle_written_freework(freework);
8307		FREE_LOCK(ump);
8308	}
8309	return;
8310}
8311
8312/*
8313 * Cancel an allocindir when it is removed via truncation.  When bp is not
8314 * NULL the indirect never appeared on disk and is scheduled to be freed
8315 * independently of the indir so we can more easily track journal work.
8316 */
8317static void
8318cancel_allocindir(aip, bp, freeblks, trunc)
8319	struct allocindir *aip;
8320	struct buf *bp;
8321	struct freeblks *freeblks;
8322	int trunc;
8323{
8324	struct indirdep *indirdep;
8325	struct freefrag *freefrag;
8326	struct newblk *newblk;
8327
8328	newblk = (struct newblk *)aip;
8329	LIST_REMOVE(aip, ai_next);
8330	/*
8331	 * We must eliminate the pointer in bp if it must be freed on its
8332	 * own due to partial truncate or pending journal work.
8333	 */
8334	if (bp && (trunc || newblk->nb_jnewblk)) {
8335		/*
8336		 * Clear the pointer and mark the aip to be freed
8337		 * directly if it never existed on disk.
8338		 */
8339		aip->ai_state |= DELAYEDFREE;
8340		indirdep = aip->ai_indirdep;
8341		if (indirdep->ir_state & UFS1FMT)
8342			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8343		else
8344			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8345	}
8346	/*
8347	 * When truncating the previous pointer will be freed via
8348	 * savedbp.  Eliminate the freefrag which would dup free.
8349	 */
8350	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8351		newblk->nb_freefrag = NULL;
8352		if (freefrag->ff_jdep)
8353			cancel_jfreefrag(
8354			    WK_JFREEFRAG(freefrag->ff_jdep));
8355		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8356		WORKITEM_FREE(freefrag, D_FREEFRAG);
8357	}
8358	/*
8359	 * If the journal hasn't been written the jnewblk must be passed
8360	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8361	 * this by leaving the journal dependency on the newblk to be freed
8362	 * when a freework is created in handle_workitem_freeblocks().
8363	 */
8364	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8365	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8366}
8367
8368/*
8369 * Create the mkdir dependencies for . and .. in a new directory.  Link them
8370 * in to a newdirblk so any subsequent additions are tracked properly.  The
8371 * caller is responsible for adding the mkdir1 dependency to the journal
8372 * and updating id_mkdiradd.  This function returns with the per-filesystem
8373 * lock held.
8374 */
8375static struct mkdir *
8376setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8377	struct diradd *dap;
8378	ino_t newinum;
8379	ino_t dinum;
8380	struct buf *newdirbp;
8381	struct mkdir **mkdirp;
8382{
8383	struct newblk *newblk;
8384	struct pagedep *pagedep;
8385	struct inodedep *inodedep;
8386	struct newdirblk *newdirblk;
8387	struct mkdir *mkdir1, *mkdir2;
8388	struct worklist *wk;
8389	struct jaddref *jaddref;
8390	struct ufsmount *ump;
8391	struct mount *mp;
8392
8393	mp = dap->da_list.wk_mp;
8394	ump = VFSTOUFS(mp);
8395	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8396	    M_SOFTDEP_FLAGS);
8397	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8398	LIST_INIT(&newdirblk->db_mkdir);
8399	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8400	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8401	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8402	mkdir1->md_diradd = dap;
8403	mkdir1->md_jaddref = NULL;
8404	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8405	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8406	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8407	mkdir2->md_diradd = dap;
8408	mkdir2->md_jaddref = NULL;
8409	if (MOUNTEDSUJ(mp) == 0) {
8410		mkdir1->md_state |= DEPCOMPLETE;
8411		mkdir2->md_state |= DEPCOMPLETE;
8412	}
8413	/*
8414	 * Dependency on "." and ".." being written to disk.
8415	 */
8416	mkdir1->md_buf = newdirbp;
8417	ACQUIRE_LOCK(VFSTOUFS(mp));
8418	LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8419	/*
8420	 * We must link the pagedep, allocdirect, and newdirblk for
8421	 * the initial file page so the pointer to the new directory
8422	 * is not written until the directory contents are live and
8423	 * any subsequent additions are not marked live until the
8424	 * block is reachable via the inode.
8425	 */
8426	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8427		panic("setup_newdir: lost pagedep");
8428	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8429		if (wk->wk_type == D_ALLOCDIRECT)
8430			break;
8431	if (wk == NULL)
8432		panic("setup_newdir: lost allocdirect");
8433	if (pagedep->pd_state & NEWBLOCK)
8434		panic("setup_newdir: NEWBLOCK already set");
8435	newblk = WK_NEWBLK(wk);
8436	pagedep->pd_state |= NEWBLOCK;
8437	pagedep->pd_newdirblk = newdirblk;
8438	newdirblk->db_pagedep = pagedep;
8439	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8440	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8441	/*
8442	 * Look up the inodedep for the parent directory so that we
8443	 * can link mkdir2 into the pending dotdot jaddref or
8444	 * the inode write if there is none.  If the inode is
8445	 * ALLCOMPLETE and no jaddref is present all dependencies have
8446	 * been satisfied and mkdir2 can be freed.
8447	 */
8448	inodedep_lookup(mp, dinum, 0, &inodedep);
8449	if (MOUNTEDSUJ(mp)) {
8450		if (inodedep == NULL)
8451			panic("setup_newdir: Lost parent.");
8452		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8453		    inoreflst);
8454		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8455		    (jaddref->ja_state & MKDIR_PARENT),
8456		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8457		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8458		mkdir2->md_jaddref = jaddref;
8459		jaddref->ja_mkdir = mkdir2;
8460	} else if (inodedep == NULL ||
8461	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8462		dap->da_state &= ~MKDIR_PARENT;
8463		WORKITEM_FREE(mkdir2, D_MKDIR);
8464		mkdir2 = NULL;
8465	} else {
8466		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8467		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8468	}
8469	*mkdirp = mkdir2;
8470
8471	return (mkdir1);
8472}
8473
8474/*
8475 * Directory entry addition dependencies.
8476 *
8477 * When adding a new directory entry, the inode (with its incremented link
8478 * count) must be written to disk before the directory entry's pointer to it.
8479 * Also, if the inode is newly allocated, the corresponding freemap must be
8480 * updated (on disk) before the directory entry's pointer. These requirements
8481 * are met via undo/redo on the directory entry's pointer, which consists
8482 * simply of the inode number.
8483 *
8484 * As directory entries are added and deleted, the free space within a
8485 * directory block can become fragmented.  The ufs filesystem will compact
8486 * a fragmented directory block to make space for a new entry. When this
8487 * occurs, the offsets of previously added entries change. Any "diradd"
8488 * dependency structures corresponding to these entries must be updated with
8489 * the new offsets.
8490 */
8491
8492/*
8493 * This routine is called after the in-memory inode's link
8494 * count has been incremented, but before the directory entry's
8495 * pointer to the inode has been set.
8496 */
8497int
8498softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8499	struct buf *bp;		/* buffer containing directory block */
8500	struct inode *dp;	/* inode for directory */
8501	off_t diroffset;	/* offset of new entry in directory */
8502	ino_t newinum;		/* inode referenced by new directory entry */
8503	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8504	int isnewblk;		/* entry is in a newly allocated block */
8505{
8506	int offset;		/* offset of new entry within directory block */
8507	ufs_lbn_t lbn;		/* block in directory containing new entry */
8508	struct fs *fs;
8509	struct diradd *dap;
8510	struct newblk *newblk;
8511	struct pagedep *pagedep;
8512	struct inodedep *inodedep;
8513	struct newdirblk *newdirblk;
8514	struct mkdir *mkdir1, *mkdir2;
8515	struct jaddref *jaddref;
8516	struct ufsmount *ump;
8517	struct mount *mp;
8518	int isindir;
8519
8520	mp = ITOVFS(dp);
8521	ump = VFSTOUFS(mp);
8522	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8523	    ("softdep_setup_directory_add called on non-softdep filesystem"));
8524	/*
8525	 * Whiteouts have no dependencies.
8526	 */
8527	if (newinum == UFS_WINO) {
8528		if (newdirbp != NULL)
8529			bdwrite(newdirbp);
8530		return (0);
8531	}
8532	jaddref = NULL;
8533	mkdir1 = mkdir2 = NULL;
8534	fs = ump->um_fs;
8535	lbn = lblkno(fs, diroffset);
8536	offset = blkoff(fs, diroffset);
8537	dap = malloc(sizeof(struct diradd), M_DIRADD,
8538		M_SOFTDEP_FLAGS|M_ZERO);
8539	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8540	dap->da_offset = offset;
8541	dap->da_newinum = newinum;
8542	dap->da_state = ATTACHED;
8543	LIST_INIT(&dap->da_jwork);
8544	isindir = bp->b_lblkno >= UFS_NDADDR;
8545	newdirblk = NULL;
8546	if (isnewblk &&
8547	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8548		newdirblk = malloc(sizeof(struct newdirblk),
8549		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8550		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8551		LIST_INIT(&newdirblk->db_mkdir);
8552	}
8553	/*
8554	 * If we're creating a new directory setup the dependencies and set
8555	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8556	 * we can move on.
8557	 */
8558	if (newdirbp == NULL) {
8559		dap->da_state |= DEPCOMPLETE;
8560		ACQUIRE_LOCK(ump);
8561	} else {
8562		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8563		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8564		    &mkdir2);
8565	}
8566	/*
8567	 * Link into parent directory pagedep to await its being written.
8568	 */
8569	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8570#ifdef DEBUG
8571	if (diradd_lookup(pagedep, offset) != NULL)
8572		panic("softdep_setup_directory_add: %p already at off %d\n",
8573		    diradd_lookup(pagedep, offset), offset);
8574#endif
8575	dap->da_pagedep = pagedep;
8576	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8577	    da_pdlist);
8578	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
8579	/*
8580	 * If we're journaling, link the diradd into the jaddref so it
8581	 * may be completed after the journal entry is written.  Otherwise,
8582	 * link the diradd into its inodedep.  If the inode is not yet
8583	 * written place it on the bufwait list, otherwise do the post-inode
8584	 * write processing to put it on the id_pendinghd list.
8585	 */
8586	if (MOUNTEDSUJ(mp)) {
8587		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8588		    inoreflst);
8589		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8590		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8591		jaddref->ja_diroff = diroffset;
8592		jaddref->ja_diradd = dap;
8593		add_to_journal(&jaddref->ja_list);
8594	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8595		diradd_inode_written(dap, inodedep);
8596	else
8597		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8598	/*
8599	 * Add the journal entries for . and .. links now that the primary
8600	 * link is written.
8601	 */
8602	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8603		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8604		    inoreflst, if_deps);
8605		KASSERT(jaddref != NULL &&
8606		    jaddref->ja_ino == jaddref->ja_parent &&
8607		    (jaddref->ja_state & MKDIR_BODY),
8608		    ("softdep_setup_directory_add: bad dot jaddref %p",
8609		    jaddref));
8610		mkdir1->md_jaddref = jaddref;
8611		jaddref->ja_mkdir = mkdir1;
8612		/*
8613		 * It is important that the dotdot journal entry
8614		 * is added prior to the dot entry since dot writes
8615		 * both the dot and dotdot links.  These both must
8616		 * be added after the primary link for the journal
8617		 * to remain consistent.
8618		 */
8619		add_to_journal(&mkdir2->md_jaddref->ja_list);
8620		add_to_journal(&jaddref->ja_list);
8621	}
8622	/*
8623	 * If we are adding a new directory remember this diradd so that if
8624	 * we rename it we can keep the dot and dotdot dependencies.  If
8625	 * we are adding a new name for an inode that has a mkdiradd we
8626	 * must be in rename and we have to move the dot and dotdot
8627	 * dependencies to this new name.  The old name is being orphaned
8628	 * soon.
8629	 */
8630	if (mkdir1 != NULL) {
8631		if (inodedep->id_mkdiradd != NULL)
8632			panic("softdep_setup_directory_add: Existing mkdir");
8633		inodedep->id_mkdiradd = dap;
8634	} else if (inodedep->id_mkdiradd)
8635		merge_diradd(inodedep, dap);
8636	if (newdirblk != NULL) {
8637		/*
8638		 * There is nothing to do if we are already tracking
8639		 * this block.
8640		 */
8641		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8642			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8643			FREE_LOCK(ump);
8644			return (0);
8645		}
8646		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8647		    == 0)
8648			panic("softdep_setup_directory_add: lost entry");
8649		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8650		pagedep->pd_state |= NEWBLOCK;
8651		pagedep->pd_newdirblk = newdirblk;
8652		newdirblk->db_pagedep = pagedep;
8653		FREE_LOCK(ump);
8654		/*
8655		 * If we extended into an indirect signal direnter to sync.
8656		 */
8657		if (isindir)
8658			return (1);
8659		return (0);
8660	}
8661	FREE_LOCK(ump);
8662	return (0);
8663}
8664
8665/*
8666 * This procedure is called to change the offset of a directory
8667 * entry when compacting a directory block which must be owned
8668 * exclusively by the caller. Note that the actual entry movement
8669 * must be done in this procedure to ensure that no I/O completions
8670 * occur while the move is in progress.
8671 */
8672void
8673softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8674	struct buf *bp;		/* Buffer holding directory block. */
8675	struct inode *dp;	/* inode for directory */
8676	caddr_t base;		/* address of dp->i_offset */
8677	caddr_t oldloc;		/* address of old directory location */
8678	caddr_t newloc;		/* address of new directory location */
8679	int entrysize;		/* size of directory entry */
8680{
8681	int offset, oldoffset, newoffset;
8682	struct pagedep *pagedep;
8683	struct jmvref *jmvref;
8684	struct diradd *dap;
8685	struct direct *de;
8686	struct mount *mp;
8687	struct ufsmount *ump;
8688	ufs_lbn_t lbn;
8689	int flags;
8690
8691	mp = ITOVFS(dp);
8692	ump = VFSTOUFS(mp);
8693	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8694	    ("softdep_change_directoryentry_offset called on "
8695	     "non-softdep filesystem"));
8696	de = (struct direct *)oldloc;
8697	jmvref = NULL;
8698	flags = 0;
8699	/*
8700	 * Moves are always journaled as it would be too complex to
8701	 * determine if any affected adds or removes are present in the
8702	 * journal.
8703	 */
8704	if (MOUNTEDSUJ(mp)) {
8705		flags = DEPALLOC;
8706		jmvref = newjmvref(dp, de->d_ino,
8707		    dp->i_offset + (oldloc - base),
8708		    dp->i_offset + (newloc - base));
8709	}
8710	lbn = lblkno(ump->um_fs, dp->i_offset);
8711	offset = blkoff(ump->um_fs, dp->i_offset);
8712	oldoffset = offset + (oldloc - base);
8713	newoffset = offset + (newloc - base);
8714	ACQUIRE_LOCK(ump);
8715	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8716		goto done;
8717	dap = diradd_lookup(pagedep, oldoffset);
8718	if (dap) {
8719		dap->da_offset = newoffset;
8720		newoffset = DIRADDHASH(newoffset);
8721		oldoffset = DIRADDHASH(oldoffset);
8722		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8723		    newoffset != oldoffset) {
8724			LIST_REMOVE(dap, da_pdlist);
8725			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8726			    dap, da_pdlist);
8727		}
8728	}
8729done:
8730	if (jmvref) {
8731		jmvref->jm_pagedep = pagedep;
8732		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8733		add_to_journal(&jmvref->jm_list);
8734	}
8735	bcopy(oldloc, newloc, entrysize);
8736	FREE_LOCK(ump);
8737}
8738
8739/*
8740 * Move the mkdir dependencies and journal work from one diradd to another
8741 * when renaming a directory.  The new name must depend on the mkdir deps
8742 * completing as the old name did.  Directories can only have one valid link
8743 * at a time so one must be canonical.
8744 */
8745static void
8746merge_diradd(inodedep, newdap)
8747	struct inodedep *inodedep;
8748	struct diradd *newdap;
8749{
8750	struct diradd *olddap;
8751	struct mkdir *mkdir, *nextmd;
8752	struct ufsmount *ump;
8753	short state;
8754
8755	olddap = inodedep->id_mkdiradd;
8756	inodedep->id_mkdiradd = newdap;
8757	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8758		newdap->da_state &= ~DEPCOMPLETE;
8759		ump = VFSTOUFS(inodedep->id_list.wk_mp);
8760		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8761		     mkdir = nextmd) {
8762			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8763			if (mkdir->md_diradd != olddap)
8764				continue;
8765			mkdir->md_diradd = newdap;
8766			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8767			newdap->da_state |= state;
8768			olddap->da_state &= ~state;
8769			if ((olddap->da_state &
8770			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8771				break;
8772		}
8773		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8774			panic("merge_diradd: unfound ref");
8775	}
8776	/*
8777	 * Any mkdir related journal items are not safe to be freed until
8778	 * the new name is stable.
8779	 */
8780	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8781	olddap->da_state |= DEPCOMPLETE;
8782	complete_diradd(olddap);
8783}
8784
8785/*
8786 * Move the diradd to the pending list when all diradd dependencies are
8787 * complete.
8788 */
8789static void
8790complete_diradd(dap)
8791	struct diradd *dap;
8792{
8793	struct pagedep *pagedep;
8794
8795	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8796		if (dap->da_state & DIRCHG)
8797			pagedep = dap->da_previous->dm_pagedep;
8798		else
8799			pagedep = dap->da_pagedep;
8800		LIST_REMOVE(dap, da_pdlist);
8801		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8802	}
8803}
8804
8805/*
8806 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8807 * add entries and conditonally journal the remove.
8808 */
8809static void
8810cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8811	struct diradd *dap;
8812	struct dirrem *dirrem;
8813	struct jremref *jremref;
8814	struct jremref *dotremref;
8815	struct jremref *dotdotremref;
8816{
8817	struct inodedep *inodedep;
8818	struct jaddref *jaddref;
8819	struct inoref *inoref;
8820	struct ufsmount *ump;
8821	struct mkdir *mkdir;
8822
8823	/*
8824	 * If no remove references were allocated we're on a non-journaled
8825	 * filesystem and can skip the cancel step.
8826	 */
8827	if (jremref == NULL) {
8828		free_diradd(dap, NULL);
8829		return;
8830	}
8831	/*
8832	 * Cancel the primary name an free it if it does not require
8833	 * journaling.
8834	 */
8835	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8836	    0, &inodedep) != 0) {
8837		/* Abort the addref that reference this diradd.  */
8838		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8839			if (inoref->if_list.wk_type != D_JADDREF)
8840				continue;
8841			jaddref = (struct jaddref *)inoref;
8842			if (jaddref->ja_diradd != dap)
8843				continue;
8844			if (cancel_jaddref(jaddref, inodedep,
8845			    &dirrem->dm_jwork) == 0) {
8846				free_jremref(jremref);
8847				jremref = NULL;
8848			}
8849			break;
8850		}
8851	}
8852	/*
8853	 * Cancel subordinate names and free them if they do not require
8854	 * journaling.
8855	 */
8856	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8857		ump = VFSTOUFS(dap->da_list.wk_mp);
8858		LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
8859			if (mkdir->md_diradd != dap)
8860				continue;
8861			if ((jaddref = mkdir->md_jaddref) == NULL)
8862				continue;
8863			mkdir->md_jaddref = NULL;
8864			if (mkdir->md_state & MKDIR_PARENT) {
8865				if (cancel_jaddref(jaddref, NULL,
8866				    &dirrem->dm_jwork) == 0) {
8867					free_jremref(dotdotremref);
8868					dotdotremref = NULL;
8869				}
8870			} else {
8871				if (cancel_jaddref(jaddref, inodedep,
8872				    &dirrem->dm_jwork) == 0) {
8873					free_jremref(dotremref);
8874					dotremref = NULL;
8875				}
8876			}
8877		}
8878	}
8879
8880	if (jremref)
8881		journal_jremref(dirrem, jremref, inodedep);
8882	if (dotremref)
8883		journal_jremref(dirrem, dotremref, inodedep);
8884	if (dotdotremref)
8885		journal_jremref(dirrem, dotdotremref, NULL);
8886	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8887	free_diradd(dap, &dirrem->dm_jwork);
8888}
8889
8890/*
8891 * Free a diradd dependency structure.
8892 */
8893static void
8894free_diradd(dap, wkhd)
8895	struct diradd *dap;
8896	struct workhead *wkhd;
8897{
8898	struct dirrem *dirrem;
8899	struct pagedep *pagedep;
8900	struct inodedep *inodedep;
8901	struct mkdir *mkdir, *nextmd;
8902	struct ufsmount *ump;
8903
8904	ump = VFSTOUFS(dap->da_list.wk_mp);
8905	LOCK_OWNED(ump);
8906	LIST_REMOVE(dap, da_pdlist);
8907	if (dap->da_state & ONWORKLIST)
8908		WORKLIST_REMOVE(&dap->da_list);
8909	if ((dap->da_state & DIRCHG) == 0) {
8910		pagedep = dap->da_pagedep;
8911	} else {
8912		dirrem = dap->da_previous;
8913		pagedep = dirrem->dm_pagedep;
8914		dirrem->dm_dirinum = pagedep->pd_ino;
8915		dirrem->dm_state |= COMPLETE;
8916		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8917			add_to_worklist(&dirrem->dm_list, 0);
8918	}
8919	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8920	    0, &inodedep) != 0)
8921		if (inodedep->id_mkdiradd == dap)
8922			inodedep->id_mkdiradd = NULL;
8923	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8924		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8925		     mkdir = nextmd) {
8926			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8927			if (mkdir->md_diradd != dap)
8928				continue;
8929			dap->da_state &=
8930			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8931			LIST_REMOVE(mkdir, md_mkdirs);
8932			if (mkdir->md_state & ONWORKLIST)
8933				WORKLIST_REMOVE(&mkdir->md_list);
8934			if (mkdir->md_jaddref != NULL)
8935				panic("free_diradd: Unexpected jaddref");
8936			WORKITEM_FREE(mkdir, D_MKDIR);
8937			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8938				break;
8939		}
8940		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8941			panic("free_diradd: unfound ref");
8942	}
8943	if (inodedep)
8944		free_inodedep(inodedep);
8945	/*
8946	 * Free any journal segments waiting for the directory write.
8947	 */
8948	handle_jwork(&dap->da_jwork);
8949	WORKITEM_FREE(dap, D_DIRADD);
8950}
8951
8952/*
8953 * Directory entry removal dependencies.
8954 *
8955 * When removing a directory entry, the entry's inode pointer must be
8956 * zero'ed on disk before the corresponding inode's link count is decremented
8957 * (possibly freeing the inode for re-use). This dependency is handled by
8958 * updating the directory entry but delaying the inode count reduction until
8959 * after the directory block has been written to disk. After this point, the
8960 * inode count can be decremented whenever it is convenient.
8961 */
8962
8963/*
8964 * This routine should be called immediately after removing
8965 * a directory entry.  The inode's link count should not be
8966 * decremented by the calling procedure -- the soft updates
8967 * code will do this task when it is safe.
8968 */
8969void
8970softdep_setup_remove(bp, dp, ip, isrmdir)
8971	struct buf *bp;		/* buffer containing directory block */
8972	struct inode *dp;	/* inode for the directory being modified */
8973	struct inode *ip;	/* inode for directory entry being removed */
8974	int isrmdir;		/* indicates if doing RMDIR */
8975{
8976	struct dirrem *dirrem, *prevdirrem;
8977	struct inodedep *inodedep;
8978	struct ufsmount *ump;
8979	int direct;
8980
8981	ump = ITOUMP(ip);
8982	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
8983	    ("softdep_setup_remove called on non-softdep filesystem"));
8984	/*
8985	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8986	 * newdirrem() to setup the full directory remove which requires
8987	 * isrmdir > 1.
8988	 */
8989	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8990	/*
8991	 * Add the dirrem to the inodedep's pending remove list for quick
8992	 * discovery later.
8993	 */
8994	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
8995		panic("softdep_setup_remove: Lost inodedep.");
8996	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8997	dirrem->dm_state |= ONDEPLIST;
8998	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8999
9000	/*
9001	 * If the COMPLETE flag is clear, then there were no active
9002	 * entries and we want to roll back to a zeroed entry until
9003	 * the new inode is committed to disk. If the COMPLETE flag is
9004	 * set then we have deleted an entry that never made it to
9005	 * disk. If the entry we deleted resulted from a name change,
9006	 * then the old name still resides on disk. We cannot delete
9007	 * its inode (returned to us in prevdirrem) until the zeroed
9008	 * directory entry gets to disk. The new inode has never been
9009	 * referenced on the disk, so can be deleted immediately.
9010	 */
9011	if ((dirrem->dm_state & COMPLETE) == 0) {
9012		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
9013		    dm_next);
9014		FREE_LOCK(ump);
9015	} else {
9016		if (prevdirrem != NULL)
9017			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
9018			    prevdirrem, dm_next);
9019		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
9020		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
9021		FREE_LOCK(ump);
9022		if (direct)
9023			handle_workitem_remove(dirrem, 0);
9024	}
9025}
9026
9027/*
9028 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
9029 * pd_pendinghd list of a pagedep.
9030 */
9031static struct diradd *
9032diradd_lookup(pagedep, offset)
9033	struct pagedep *pagedep;
9034	int offset;
9035{
9036	struct diradd *dap;
9037
9038	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
9039		if (dap->da_offset == offset)
9040			return (dap);
9041	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
9042		if (dap->da_offset == offset)
9043			return (dap);
9044	return (NULL);
9045}
9046
9047/*
9048 * Search for a .. diradd dependency in a directory that is being removed.
9049 * If the directory was renamed to a new parent we have a diradd rather
9050 * than a mkdir for the .. entry.  We need to cancel it now before
9051 * it is found in truncate().
9052 */
9053static struct jremref *
9054cancel_diradd_dotdot(ip, dirrem, jremref)
9055	struct inode *ip;
9056	struct dirrem *dirrem;
9057	struct jremref *jremref;
9058{
9059	struct pagedep *pagedep;
9060	struct diradd *dap;
9061	struct worklist *wk;
9062
9063	if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0)
9064		return (jremref);
9065	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
9066	if (dap == NULL)
9067		return (jremref);
9068	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
9069	/*
9070	 * Mark any journal work as belonging to the parent so it is freed
9071	 * with the .. reference.
9072	 */
9073	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9074		wk->wk_state |= MKDIR_PARENT;
9075	return (NULL);
9076}
9077
9078/*
9079 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9080 * replace it with a dirrem/diradd pair as a result of re-parenting a
9081 * directory.  This ensures that we don't simultaneously have a mkdir and
9082 * a diradd for the same .. entry.
9083 */
9084static struct jremref *
9085cancel_mkdir_dotdot(ip, dirrem, jremref)
9086	struct inode *ip;
9087	struct dirrem *dirrem;
9088	struct jremref *jremref;
9089{
9090	struct inodedep *inodedep;
9091	struct jaddref *jaddref;
9092	struct ufsmount *ump;
9093	struct mkdir *mkdir;
9094	struct diradd *dap;
9095	struct mount *mp;
9096
9097	mp = ITOVFS(ip);
9098	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9099		return (jremref);
9100	dap = inodedep->id_mkdiradd;
9101	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9102		return (jremref);
9103	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9104	for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9105	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
9106		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9107			break;
9108	if (mkdir == NULL)
9109		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9110	if ((jaddref = mkdir->md_jaddref) != NULL) {
9111		mkdir->md_jaddref = NULL;
9112		jaddref->ja_state &= ~MKDIR_PARENT;
9113		if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
9114			panic("cancel_mkdir_dotdot: Lost parent inodedep");
9115		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9116			journal_jremref(dirrem, jremref, inodedep);
9117			jremref = NULL;
9118		}
9119	}
9120	if (mkdir->md_state & ONWORKLIST)
9121		WORKLIST_REMOVE(&mkdir->md_list);
9122	mkdir->md_state |= ALLCOMPLETE;
9123	complete_mkdir(mkdir);
9124	return (jremref);
9125}
9126
9127static void
9128journal_jremref(dirrem, jremref, inodedep)
9129	struct dirrem *dirrem;
9130	struct jremref *jremref;
9131	struct inodedep *inodedep;
9132{
9133
9134	if (inodedep == NULL)
9135		if (inodedep_lookup(jremref->jr_list.wk_mp,
9136		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9137			panic("journal_jremref: Lost inodedep");
9138	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9139	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9140	add_to_journal(&jremref->jr_list);
9141}
9142
9143static void
9144dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
9145	struct dirrem *dirrem;
9146	struct jremref *jremref;
9147	struct jremref *dotremref;
9148	struct jremref *dotdotremref;
9149{
9150	struct inodedep *inodedep;
9151
9152
9153	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9154	    &inodedep) == 0)
9155		panic("dirrem_journal: Lost inodedep");
9156	journal_jremref(dirrem, jremref, inodedep);
9157	if (dotremref)
9158		journal_jremref(dirrem, dotremref, inodedep);
9159	if (dotdotremref)
9160		journal_jremref(dirrem, dotdotremref, NULL);
9161}
9162
9163/*
9164 * Allocate a new dirrem if appropriate and return it along with
9165 * its associated pagedep. Called without a lock, returns with lock.
9166 */
9167static struct dirrem *
9168newdirrem(bp, dp, ip, isrmdir, prevdirremp)
9169	struct buf *bp;		/* buffer containing directory block */
9170	struct inode *dp;	/* inode for the directory being modified */
9171	struct inode *ip;	/* inode for directory entry being removed */
9172	int isrmdir;		/* indicates if doing RMDIR */
9173	struct dirrem **prevdirremp; /* previously referenced inode, if any */
9174{
9175	int offset;
9176	ufs_lbn_t lbn;
9177	struct diradd *dap;
9178	struct dirrem *dirrem;
9179	struct pagedep *pagedep;
9180	struct jremref *jremref;
9181	struct jremref *dotremref;
9182	struct jremref *dotdotremref;
9183	struct vnode *dvp;
9184	struct ufsmount *ump;
9185
9186	/*
9187	 * Whiteouts have no deletion dependencies.
9188	 */
9189	if (ip == NULL)
9190		panic("newdirrem: whiteout");
9191	dvp = ITOV(dp);
9192	ump = ITOUMP(dp);
9193
9194	/*
9195	 * If the system is over its limit and our filesystem is
9196	 * responsible for more than our share of that usage and
9197	 * we are not a snapshot, request some inodedep cleanup.
9198	 * Limiting the number of dirrem structures will also limit
9199	 * the number of freefile and freeblks structures.
9200	 */
9201	ACQUIRE_LOCK(ump);
9202	if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
9203		schedule_cleanup(UFSTOVFS(ump));
9204	else
9205		FREE_LOCK(ump);
9206	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9207	    M_ZERO);
9208	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9209	LIST_INIT(&dirrem->dm_jremrefhd);
9210	LIST_INIT(&dirrem->dm_jwork);
9211	dirrem->dm_state = isrmdir ? RMDIR : 0;
9212	dirrem->dm_oldinum = ip->i_number;
9213	*prevdirremp = NULL;
9214	/*
9215	 * Allocate remove reference structures to track journal write
9216	 * dependencies.  We will always have one for the link and
9217	 * when doing directories we will always have one more for dot.
9218	 * When renaming a directory we skip the dotdot link change so
9219	 * this is not needed.
9220	 */
9221	jremref = dotremref = dotdotremref = NULL;
9222	if (DOINGSUJ(dvp)) {
9223		if (isrmdir) {
9224			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9225			    ip->i_effnlink + 2);
9226			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9227			    ip->i_effnlink + 1);
9228			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9229			    dp->i_effnlink + 1);
9230			dotdotremref->jr_state |= MKDIR_PARENT;
9231		} else
9232			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9233			    ip->i_effnlink + 1);
9234	}
9235	ACQUIRE_LOCK(ump);
9236	lbn = lblkno(ump->um_fs, dp->i_offset);
9237	offset = blkoff(ump->um_fs, dp->i_offset);
9238	pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
9239	    &pagedep);
9240	dirrem->dm_pagedep = pagedep;
9241	dirrem->dm_offset = offset;
9242	/*
9243	 * If we're renaming a .. link to a new directory, cancel any
9244	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
9245	 * the jremref is preserved for any potential diradd in this
9246	 * location.  This can not coincide with a rmdir.
9247	 */
9248	if (dp->i_offset == DOTDOT_OFFSET) {
9249		if (isrmdir)
9250			panic("newdirrem: .. directory change during remove?");
9251		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9252	}
9253	/*
9254	 * If we're removing a directory search for the .. dependency now and
9255	 * cancel it.  Any pending journal work will be added to the dirrem
9256	 * to be completed when the workitem remove completes.
9257	 */
9258	if (isrmdir)
9259		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9260	/*
9261	 * Check for a diradd dependency for the same directory entry.
9262	 * If present, then both dependencies become obsolete and can
9263	 * be de-allocated.
9264	 */
9265	dap = diradd_lookup(pagedep, offset);
9266	if (dap == NULL) {
9267		/*
9268		 * Link the jremref structures into the dirrem so they are
9269		 * written prior to the pagedep.
9270		 */
9271		if (jremref)
9272			dirrem_journal(dirrem, jremref, dotremref,
9273			    dotdotremref);
9274		return (dirrem);
9275	}
9276	/*
9277	 * Must be ATTACHED at this point.
9278	 */
9279	if ((dap->da_state & ATTACHED) == 0)
9280		panic("newdirrem: not ATTACHED");
9281	if (dap->da_newinum != ip->i_number)
9282		panic("newdirrem: inum %ju should be %ju",
9283		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9284	/*
9285	 * If we are deleting a changed name that never made it to disk,
9286	 * then return the dirrem describing the previous inode (which
9287	 * represents the inode currently referenced from this entry on disk).
9288	 */
9289	if ((dap->da_state & DIRCHG) != 0) {
9290		*prevdirremp = dap->da_previous;
9291		dap->da_state &= ~DIRCHG;
9292		dap->da_pagedep = pagedep;
9293	}
9294	/*
9295	 * We are deleting an entry that never made it to disk.
9296	 * Mark it COMPLETE so we can delete its inode immediately.
9297	 */
9298	dirrem->dm_state |= COMPLETE;
9299	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9300#ifdef SUJ_DEBUG
9301	if (isrmdir == 0) {
9302		struct worklist *wk;
9303
9304		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9305			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9306				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9307	}
9308#endif
9309
9310	return (dirrem);
9311}
9312
9313/*
9314 * Directory entry change dependencies.
9315 *
9316 * Changing an existing directory entry requires that an add operation
9317 * be completed first followed by a deletion. The semantics for the addition
9318 * are identical to the description of adding a new entry above except
9319 * that the rollback is to the old inode number rather than zero. Once
9320 * the addition dependency is completed, the removal is done as described
9321 * in the removal routine above.
9322 */
9323
9324/*
9325 * This routine should be called immediately after changing
9326 * a directory entry.  The inode's link count should not be
9327 * decremented by the calling procedure -- the soft updates
9328 * code will perform this task when it is safe.
9329 */
9330void
9331softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9332	struct buf *bp;		/* buffer containing directory block */
9333	struct inode *dp;	/* inode for the directory being modified */
9334	struct inode *ip;	/* inode for directory entry being removed */
9335	ino_t newinum;		/* new inode number for changed entry */
9336	int isrmdir;		/* indicates if doing RMDIR */
9337{
9338	int offset;
9339	struct diradd *dap = NULL;
9340	struct dirrem *dirrem, *prevdirrem;
9341	struct pagedep *pagedep;
9342	struct inodedep *inodedep;
9343	struct jaddref *jaddref;
9344	struct mount *mp;
9345	struct ufsmount *ump;
9346
9347	mp = ITOVFS(dp);
9348	ump = VFSTOUFS(mp);
9349	offset = blkoff(ump->um_fs, dp->i_offset);
9350	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9351	   ("softdep_setup_directory_change called on non-softdep filesystem"));
9352
9353	/*
9354	 * Whiteouts do not need diradd dependencies.
9355	 */
9356	if (newinum != UFS_WINO) {
9357		dap = malloc(sizeof(struct diradd),
9358		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9359		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9360		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9361		dap->da_offset = offset;
9362		dap->da_newinum = newinum;
9363		LIST_INIT(&dap->da_jwork);
9364	}
9365
9366	/*
9367	 * Allocate a new dirrem and ACQUIRE_LOCK.
9368	 */
9369	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9370	pagedep = dirrem->dm_pagedep;
9371	/*
9372	 * The possible values for isrmdir:
9373	 *	0 - non-directory file rename
9374	 *	1 - directory rename within same directory
9375	 *   inum - directory rename to new directory of given inode number
9376	 * When renaming to a new directory, we are both deleting and
9377	 * creating a new directory entry, so the link count on the new
9378	 * directory should not change. Thus we do not need the followup
9379	 * dirrem which is usually done in handle_workitem_remove. We set
9380	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9381	 * followup dirrem.
9382	 */
9383	if (isrmdir > 1)
9384		dirrem->dm_state |= DIRCHG;
9385
9386	/*
9387	 * Whiteouts have no additional dependencies,
9388	 * so just put the dirrem on the correct list.
9389	 */
9390	if (newinum == UFS_WINO) {
9391		if ((dirrem->dm_state & COMPLETE) == 0) {
9392			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9393			    dm_next);
9394		} else {
9395			dirrem->dm_dirinum = pagedep->pd_ino;
9396			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9397				add_to_worklist(&dirrem->dm_list, 0);
9398		}
9399		FREE_LOCK(ump);
9400		return;
9401	}
9402	/*
9403	 * Add the dirrem to the inodedep's pending remove list for quick
9404	 * discovery later.  A valid nlinkdelta ensures that this lookup
9405	 * will not fail.
9406	 */
9407	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9408		panic("softdep_setup_directory_change: Lost inodedep.");
9409	dirrem->dm_state |= ONDEPLIST;
9410	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9411
9412	/*
9413	 * If the COMPLETE flag is clear, then there were no active
9414	 * entries and we want to roll back to the previous inode until
9415	 * the new inode is committed to disk. If the COMPLETE flag is
9416	 * set, then we have deleted an entry that never made it to disk.
9417	 * If the entry we deleted resulted from a name change, then the old
9418	 * inode reference still resides on disk. Any rollback that we do
9419	 * needs to be to that old inode (returned to us in prevdirrem). If
9420	 * the entry we deleted resulted from a create, then there is
9421	 * no entry on the disk, so we want to roll back to zero rather
9422	 * than the uncommitted inode. In either of the COMPLETE cases we
9423	 * want to immediately free the unwritten and unreferenced inode.
9424	 */
9425	if ((dirrem->dm_state & COMPLETE) == 0) {
9426		dap->da_previous = dirrem;
9427	} else {
9428		if (prevdirrem != NULL) {
9429			dap->da_previous = prevdirrem;
9430		} else {
9431			dap->da_state &= ~DIRCHG;
9432			dap->da_pagedep = pagedep;
9433		}
9434		dirrem->dm_dirinum = pagedep->pd_ino;
9435		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9436			add_to_worklist(&dirrem->dm_list, 0);
9437	}
9438	/*
9439	 * Lookup the jaddref for this journal entry.  We must finish
9440	 * initializing it and make the diradd write dependent on it.
9441	 * If we're not journaling, put it on the id_bufwait list if the
9442	 * inode is not yet written. If it is written, do the post-inode
9443	 * write processing to put it on the id_pendinghd list.
9444	 */
9445	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
9446	if (MOUNTEDSUJ(mp)) {
9447		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9448		    inoreflst);
9449		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9450		    ("softdep_setup_directory_change: bad jaddref %p",
9451		    jaddref));
9452		jaddref->ja_diroff = dp->i_offset;
9453		jaddref->ja_diradd = dap;
9454		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9455		    dap, da_pdlist);
9456		add_to_journal(&jaddref->ja_list);
9457	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9458		dap->da_state |= COMPLETE;
9459		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9460		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9461	} else {
9462		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9463		    dap, da_pdlist);
9464		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9465	}
9466	/*
9467	 * If we're making a new name for a directory that has not been
9468	 * committed when need to move the dot and dotdot references to
9469	 * this new name.
9470	 */
9471	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9472		merge_diradd(inodedep, dap);
9473	FREE_LOCK(ump);
9474}
9475
9476/*
9477 * Called whenever the link count on an inode is changed.
9478 * It creates an inode dependency so that the new reference(s)
9479 * to the inode cannot be committed to disk until the updated
9480 * inode has been written.
9481 */
9482void
9483softdep_change_linkcnt(ip)
9484	struct inode *ip;	/* the inode with the increased link count */
9485{
9486	struct inodedep *inodedep;
9487	struct ufsmount *ump;
9488
9489	ump = ITOUMP(ip);
9490	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9491	    ("softdep_change_linkcnt called on non-softdep filesystem"));
9492	ACQUIRE_LOCK(ump);
9493	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
9494	if (ip->i_nlink < ip->i_effnlink)
9495		panic("softdep_change_linkcnt: bad delta");
9496	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9497	FREE_LOCK(ump);
9498}
9499
9500/*
9501 * Attach a sbdep dependency to the superblock buf so that we can keep
9502 * track of the head of the linked list of referenced but unlinked inodes.
9503 */
9504void
9505softdep_setup_sbupdate(ump, fs, bp)
9506	struct ufsmount *ump;
9507	struct fs *fs;
9508	struct buf *bp;
9509{
9510	struct sbdep *sbdep;
9511	struct worklist *wk;
9512
9513	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9514	    ("softdep_setup_sbupdate called on non-softdep filesystem"));
9515	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9516		if (wk->wk_type == D_SBDEP)
9517			break;
9518	if (wk != NULL)
9519		return;
9520	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9521	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9522	sbdep->sb_fs = fs;
9523	sbdep->sb_ump = ump;
9524	ACQUIRE_LOCK(ump);
9525	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9526	FREE_LOCK(ump);
9527}
9528
9529/*
9530 * Return the first unlinked inodedep which is ready to be the head of the
9531 * list.  The inodedep and all those after it must have valid next pointers.
9532 */
9533static struct inodedep *
9534first_unlinked_inodedep(ump)
9535	struct ufsmount *ump;
9536{
9537	struct inodedep *inodedep;
9538	struct inodedep *idp;
9539
9540	LOCK_OWNED(ump);
9541	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9542	    inodedep; inodedep = idp) {
9543		if ((inodedep->id_state & UNLINKNEXT) == 0)
9544			return (NULL);
9545		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9546		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9547			break;
9548		if ((inodedep->id_state & UNLINKPREV) == 0)
9549			break;
9550	}
9551	return (inodedep);
9552}
9553
9554/*
9555 * Set the sujfree unlinked head pointer prior to writing a superblock.
9556 */
9557static void
9558initiate_write_sbdep(sbdep)
9559	struct sbdep *sbdep;
9560{
9561	struct inodedep *inodedep;
9562	struct fs *bpfs;
9563	struct fs *fs;
9564
9565	bpfs = sbdep->sb_fs;
9566	fs = sbdep->sb_ump->um_fs;
9567	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9568	if (inodedep) {
9569		fs->fs_sujfree = inodedep->id_ino;
9570		inodedep->id_state |= UNLINKPREV;
9571	} else
9572		fs->fs_sujfree = 0;
9573	bpfs->fs_sujfree = fs->fs_sujfree;
9574}
9575
9576/*
9577 * After a superblock is written determine whether it must be written again
9578 * due to a changing unlinked list head.
9579 */
9580static int
9581handle_written_sbdep(sbdep, bp)
9582	struct sbdep *sbdep;
9583	struct buf *bp;
9584{
9585	struct inodedep *inodedep;
9586	struct fs *fs;
9587
9588	LOCK_OWNED(sbdep->sb_ump);
9589	fs = sbdep->sb_fs;
9590	/*
9591	 * If the superblock doesn't match the in-memory list start over.
9592	 */
9593	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9594	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9595	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9596		bdirty(bp);
9597		return (1);
9598	}
9599	WORKITEM_FREE(sbdep, D_SBDEP);
9600	if (fs->fs_sujfree == 0)
9601		return (0);
9602	/*
9603	 * Now that we have a record of this inode in stable store allow it
9604	 * to be written to free up pending work.  Inodes may see a lot of
9605	 * write activity after they are unlinked which we must not hold up.
9606	 */
9607	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9608		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9609			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9610			    inodedep, inodedep->id_state);
9611		if (inodedep->id_state & UNLINKONLIST)
9612			break;
9613		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9614	}
9615
9616	return (0);
9617}
9618
9619/*
9620 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9621 */
9622static void
9623unlinked_inodedep(mp, inodedep)
9624	struct mount *mp;
9625	struct inodedep *inodedep;
9626{
9627	struct ufsmount *ump;
9628
9629	ump = VFSTOUFS(mp);
9630	LOCK_OWNED(ump);
9631	if (MOUNTEDSUJ(mp) == 0)
9632		return;
9633	ump->um_fs->fs_fmod = 1;
9634	if (inodedep->id_state & UNLINKED)
9635		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9636	inodedep->id_state |= UNLINKED;
9637	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9638}
9639
9640/*
9641 * Remove an inodedep from the unlinked inodedep list.  This may require
9642 * disk writes if the inode has made it that far.
9643 */
9644static void
9645clear_unlinked_inodedep(inodedep)
9646	struct inodedep *inodedep;
9647{
9648	struct ufsmount *ump;
9649	struct inodedep *idp;
9650	struct inodedep *idn;
9651	struct fs *fs;
9652	struct buf *bp;
9653	ino_t ino;
9654	ino_t nino;
9655	ino_t pino;
9656	int error;
9657
9658	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9659	fs = ump->um_fs;
9660	ino = inodedep->id_ino;
9661	error = 0;
9662	for (;;) {
9663		LOCK_OWNED(ump);
9664		KASSERT((inodedep->id_state & UNLINKED) != 0,
9665		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9666		    inodedep));
9667		/*
9668		 * If nothing has yet been written simply remove us from
9669		 * the in memory list and return.  This is the most common
9670		 * case where handle_workitem_remove() loses the final
9671		 * reference.
9672		 */
9673		if ((inodedep->id_state & UNLINKLINKS) == 0)
9674			break;
9675		/*
9676		 * If we have a NEXT pointer and no PREV pointer we can simply
9677		 * clear NEXT's PREV and remove ourselves from the list.  Be
9678		 * careful not to clear PREV if the superblock points at
9679		 * next as well.
9680		 */
9681		idn = TAILQ_NEXT(inodedep, id_unlinked);
9682		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9683			if (idn && fs->fs_sujfree != idn->id_ino)
9684				idn->id_state &= ~UNLINKPREV;
9685			break;
9686		}
9687		/*
9688		 * Here we have an inodedep which is actually linked into
9689		 * the list.  We must remove it by forcing a write to the
9690		 * link before us, whether it be the superblock or an inode.
9691		 * Unfortunately the list may change while we're waiting
9692		 * on the buf lock for either resource so we must loop until
9693		 * we lock the right one.  If both the superblock and an
9694		 * inode point to this inode we must clear the inode first
9695		 * followed by the superblock.
9696		 */
9697		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9698		pino = 0;
9699		if (idp && (idp->id_state & UNLINKNEXT))
9700			pino = idp->id_ino;
9701		FREE_LOCK(ump);
9702		if (pino == 0) {
9703			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9704			    (int)fs->fs_sbsize, 0, 0, 0);
9705		} else {
9706			error = bread(ump->um_devvp,
9707			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9708			    (int)fs->fs_bsize, NOCRED, &bp);
9709			if (error)
9710				brelse(bp);
9711		}
9712		ACQUIRE_LOCK(ump);
9713		if (error)
9714			break;
9715		/* If the list has changed restart the loop. */
9716		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9717		nino = 0;
9718		if (idp && (idp->id_state & UNLINKNEXT))
9719			nino = idp->id_ino;
9720		if (nino != pino ||
9721		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9722			FREE_LOCK(ump);
9723			brelse(bp);
9724			ACQUIRE_LOCK(ump);
9725			continue;
9726		}
9727		nino = 0;
9728		idn = TAILQ_NEXT(inodedep, id_unlinked);
9729		if (idn)
9730			nino = idn->id_ino;
9731		/*
9732		 * Remove us from the in memory list.  After this we cannot
9733		 * access the inodedep.
9734		 */
9735		KASSERT((inodedep->id_state & UNLINKED) != 0,
9736		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9737		    inodedep));
9738		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9739		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9740		FREE_LOCK(ump);
9741		/*
9742		 * The predecessor's next pointer is manually updated here
9743		 * so that the NEXT flag is never cleared for an element
9744		 * that is in the list.
9745		 */
9746		if (pino == 0) {
9747			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9748			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9749			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9750			    bp);
9751		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9752			((struct ufs1_dinode *)bp->b_data +
9753			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9754		else
9755			((struct ufs2_dinode *)bp->b_data +
9756			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9757		/*
9758		 * If the bwrite fails we have no recourse to recover.  The
9759		 * filesystem is corrupted already.
9760		 */
9761		bwrite(bp);
9762		ACQUIRE_LOCK(ump);
9763		/*
9764		 * If the superblock pointer still needs to be cleared force
9765		 * a write here.
9766		 */
9767		if (fs->fs_sujfree == ino) {
9768			FREE_LOCK(ump);
9769			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9770			    (int)fs->fs_sbsize, 0, 0, 0);
9771			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9772			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9773			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9774			    bp);
9775			bwrite(bp);
9776			ACQUIRE_LOCK(ump);
9777		}
9778
9779		if (fs->fs_sujfree != ino)
9780			return;
9781		panic("clear_unlinked_inodedep: Failed to clear free head");
9782	}
9783	if (inodedep->id_ino == fs->fs_sujfree)
9784		panic("clear_unlinked_inodedep: Freeing head of free list");
9785	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9786	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9787	return;
9788}
9789
9790/*
9791 * This workitem decrements the inode's link count.
9792 * If the link count reaches zero, the file is removed.
9793 */
9794static int
9795handle_workitem_remove(dirrem, flags)
9796	struct dirrem *dirrem;
9797	int flags;
9798{
9799	struct inodedep *inodedep;
9800	struct workhead dotdotwk;
9801	struct worklist *wk;
9802	struct ufsmount *ump;
9803	struct mount *mp;
9804	struct vnode *vp;
9805	struct inode *ip;
9806	ino_t oldinum;
9807
9808	if (dirrem->dm_state & ONWORKLIST)
9809		panic("handle_workitem_remove: dirrem %p still on worklist",
9810		    dirrem);
9811	oldinum = dirrem->dm_oldinum;
9812	mp = dirrem->dm_list.wk_mp;
9813	ump = VFSTOUFS(mp);
9814	flags |= LK_EXCLUSIVE;
9815	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9816		return (EBUSY);
9817	ip = VTOI(vp);
9818	MPASS(ip->i_mode != 0);
9819	ACQUIRE_LOCK(ump);
9820	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9821		panic("handle_workitem_remove: lost inodedep");
9822	if (dirrem->dm_state & ONDEPLIST)
9823		LIST_REMOVE(dirrem, dm_inonext);
9824	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9825	    ("handle_workitem_remove:  Journal entries not written."));
9826
9827	/*
9828	 * Move all dependencies waiting on the remove to complete
9829	 * from the dirrem to the inode inowait list to be completed
9830	 * after the inode has been updated and written to disk.
9831	 *
9832	 * Any marked MKDIR_PARENT are saved to be completed when the
9833	 * dotdot ref is removed unless DIRCHG is specified.  For
9834	 * directory change operations there will be no further
9835	 * directory writes and the jsegdeps need to be moved along
9836	 * with the rest to be completed when the inode is free or
9837	 * stable in the inode free list.
9838	 */
9839	LIST_INIT(&dotdotwk);
9840	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9841		WORKLIST_REMOVE(wk);
9842		if ((dirrem->dm_state & DIRCHG) == 0 &&
9843		    wk->wk_state & MKDIR_PARENT) {
9844			wk->wk_state &= ~MKDIR_PARENT;
9845			WORKLIST_INSERT(&dotdotwk, wk);
9846			continue;
9847		}
9848		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9849	}
9850	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9851	/*
9852	 * Normal file deletion.
9853	 */
9854	if ((dirrem->dm_state & RMDIR) == 0) {
9855		ip->i_nlink--;
9856		DIP_SET(ip, i_nlink, ip->i_nlink);
9857		ip->i_flag |= IN_CHANGE;
9858		if (ip->i_nlink < ip->i_effnlink)
9859			panic("handle_workitem_remove: bad file delta");
9860		if (ip->i_nlink == 0)
9861			unlinked_inodedep(mp, inodedep);
9862		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9863		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9864		    ("handle_workitem_remove: worklist not empty. %s",
9865		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9866		WORKITEM_FREE(dirrem, D_DIRREM);
9867		FREE_LOCK(ump);
9868		goto out;
9869	}
9870	/*
9871	 * Directory deletion. Decrement reference count for both the
9872	 * just deleted parent directory entry and the reference for ".".
9873	 * Arrange to have the reference count on the parent decremented
9874	 * to account for the loss of "..".
9875	 */
9876	ip->i_nlink -= 2;
9877	DIP_SET(ip, i_nlink, ip->i_nlink);
9878	ip->i_flag |= IN_CHANGE;
9879	if (ip->i_nlink < ip->i_effnlink)
9880		panic("handle_workitem_remove: bad dir delta");
9881	if (ip->i_nlink == 0)
9882		unlinked_inodedep(mp, inodedep);
9883	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9884	/*
9885	 * Rename a directory to a new parent. Since, we are both deleting
9886	 * and creating a new directory entry, the link count on the new
9887	 * directory should not change. Thus we skip the followup dirrem.
9888	 */
9889	if (dirrem->dm_state & DIRCHG) {
9890		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9891		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9892		WORKITEM_FREE(dirrem, D_DIRREM);
9893		FREE_LOCK(ump);
9894		goto out;
9895	}
9896	dirrem->dm_state = ONDEPLIST;
9897	dirrem->dm_oldinum = dirrem->dm_dirinum;
9898	/*
9899	 * Place the dirrem on the parent's diremhd list.
9900	 */
9901	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9902		panic("handle_workitem_remove: lost dir inodedep");
9903	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9904	/*
9905	 * If the allocated inode has never been written to disk, then
9906	 * the on-disk inode is zero'ed and we can remove the file
9907	 * immediately.  When journaling if the inode has been marked
9908	 * unlinked and not DEPCOMPLETE we know it can never be written.
9909	 */
9910	inodedep_lookup(mp, oldinum, 0, &inodedep);
9911	if (inodedep == NULL ||
9912	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9913	    check_inode_unwritten(inodedep)) {
9914		FREE_LOCK(ump);
9915		vput(vp);
9916		return handle_workitem_remove(dirrem, flags);
9917	}
9918	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9919	FREE_LOCK(ump);
9920	ip->i_flag |= IN_CHANGE;
9921out:
9922	ffs_update(vp, 0);
9923	vput(vp);
9924	return (0);
9925}
9926
9927/*
9928 * Inode de-allocation dependencies.
9929 *
9930 * When an inode's link count is reduced to zero, it can be de-allocated. We
9931 * found it convenient to postpone de-allocation until after the inode is
9932 * written to disk with its new link count (zero).  At this point, all of the
9933 * on-disk inode's block pointers are nullified and, with careful dependency
9934 * list ordering, all dependencies related to the inode will be satisfied and
9935 * the corresponding dependency structures de-allocated.  So, if/when the
9936 * inode is reused, there will be no mixing of old dependencies with new
9937 * ones.  This artificial dependency is set up by the block de-allocation
9938 * procedure above (softdep_setup_freeblocks) and completed by the
9939 * following procedure.
9940 */
9941static void
9942handle_workitem_freefile(freefile)
9943	struct freefile *freefile;
9944{
9945	struct workhead wkhd;
9946	struct fs *fs;
9947	struct inodedep *idp;
9948	struct ufsmount *ump;
9949	int error;
9950
9951	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9952	fs = ump->um_fs;
9953#ifdef DEBUG
9954	ACQUIRE_LOCK(ump);
9955	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9956	FREE_LOCK(ump);
9957	if (error)
9958		panic("handle_workitem_freefile: inodedep %p survived", idp);
9959#endif
9960	UFS_LOCK(ump);
9961	fs->fs_pendinginodes -= 1;
9962	UFS_UNLOCK(ump);
9963	LIST_INIT(&wkhd);
9964	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9965	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9966	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9967		softdep_error("handle_workitem_freefile", error);
9968	ACQUIRE_LOCK(ump);
9969	WORKITEM_FREE(freefile, D_FREEFILE);
9970	FREE_LOCK(ump);
9971}
9972
9973
9974/*
9975 * Helper function which unlinks marker element from work list and returns
9976 * the next element on the list.
9977 */
9978static __inline struct worklist *
9979markernext(struct worklist *marker)
9980{
9981	struct worklist *next;
9982
9983	next = LIST_NEXT(marker, wk_list);
9984	LIST_REMOVE(marker, wk_list);
9985	return next;
9986}
9987
9988/*
9989 * Disk writes.
9990 *
9991 * The dependency structures constructed above are most actively used when file
9992 * system blocks are written to disk.  No constraints are placed on when a
9993 * block can be written, but unsatisfied update dependencies are made safe by
9994 * modifying (or replacing) the source memory for the duration of the disk
9995 * write.  When the disk write completes, the memory block is again brought
9996 * up-to-date.
9997 *
9998 * In-core inode structure reclamation.
9999 *
10000 * Because there are a finite number of "in-core" inode structures, they are
10001 * reused regularly.  By transferring all inode-related dependencies to the
10002 * in-memory inode block and indexing them separately (via "inodedep"s), we
10003 * can allow "in-core" inode structures to be reused at any time and avoid
10004 * any increase in contention.
10005 *
10006 * Called just before entering the device driver to initiate a new disk I/O.
10007 * The buffer must be locked, thus, no I/O completion operations can occur
10008 * while we are manipulating its associated dependencies.
10009 */
10010static void
10011softdep_disk_io_initiation(bp)
10012	struct buf *bp;		/* structure describing disk write to occur */
10013{
10014	struct worklist *wk;
10015	struct worklist marker;
10016	struct inodedep *inodedep;
10017	struct freeblks *freeblks;
10018	struct jblkdep *jblkdep;
10019	struct newblk *newblk;
10020	struct ufsmount *ump;
10021
10022	/*
10023	 * We only care about write operations. There should never
10024	 * be dependencies for reads.
10025	 */
10026	if (bp->b_iocmd != BIO_WRITE)
10027		panic("softdep_disk_io_initiation: not write");
10028
10029	if (bp->b_vflags & BV_BKGRDINPROG)
10030		panic("softdep_disk_io_initiation: Writing buffer with "
10031		    "background write in progress: %p", bp);
10032
10033	ump = softdep_bp_to_mp(bp);
10034	if (ump == NULL)
10035		return;
10036
10037	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
10038	PHOLD(curproc);			/* Don't swap out kernel stack */
10039	ACQUIRE_LOCK(ump);
10040	/*
10041	 * Do any necessary pre-I/O processing.
10042	 */
10043	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
10044	     wk = markernext(&marker)) {
10045		LIST_INSERT_AFTER(wk, &marker, wk_list);
10046		switch (wk->wk_type) {
10047
10048		case D_PAGEDEP:
10049			initiate_write_filepage(WK_PAGEDEP(wk), bp);
10050			continue;
10051
10052		case D_INODEDEP:
10053			inodedep = WK_INODEDEP(wk);
10054			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
10055				initiate_write_inodeblock_ufs1(inodedep, bp);
10056			else
10057				initiate_write_inodeblock_ufs2(inodedep, bp);
10058			continue;
10059
10060		case D_INDIRDEP:
10061			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
10062			continue;
10063
10064		case D_BMSAFEMAP:
10065			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
10066			continue;
10067
10068		case D_JSEG:
10069			WK_JSEG(wk)->js_buf = NULL;
10070			continue;
10071
10072		case D_FREEBLKS:
10073			freeblks = WK_FREEBLKS(wk);
10074			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
10075			/*
10076			 * We have to wait for the freeblks to be journaled
10077			 * before we can write an inodeblock with updated
10078			 * pointers.  Be careful to arrange the marker so
10079			 * we revisit the freeblks if it's not removed by
10080			 * the first jwait().
10081			 */
10082			if (jblkdep != NULL) {
10083				LIST_REMOVE(&marker, wk_list);
10084				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10085				jwait(&jblkdep->jb_list, MNT_WAIT);
10086			}
10087			continue;
10088		case D_ALLOCDIRECT:
10089		case D_ALLOCINDIR:
10090			/*
10091			 * We have to wait for the jnewblk to be journaled
10092			 * before we can write to a block if the contents
10093			 * may be confused with an earlier file's indirect
10094			 * at recovery time.  Handle the marker as described
10095			 * above.
10096			 */
10097			newblk = WK_NEWBLK(wk);
10098			if (newblk->nb_jnewblk != NULL &&
10099			    indirblk_lookup(newblk->nb_list.wk_mp,
10100			    newblk->nb_newblkno)) {
10101				LIST_REMOVE(&marker, wk_list);
10102				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10103				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10104			}
10105			continue;
10106
10107		case D_SBDEP:
10108			initiate_write_sbdep(WK_SBDEP(wk));
10109			continue;
10110
10111		case D_MKDIR:
10112		case D_FREEWORK:
10113		case D_FREEDEP:
10114		case D_JSEGDEP:
10115			continue;
10116
10117		default:
10118			panic("handle_disk_io_initiation: Unexpected type %s",
10119			    TYPENAME(wk->wk_type));
10120			/* NOTREACHED */
10121		}
10122	}
10123	FREE_LOCK(ump);
10124	PRELE(curproc);			/* Allow swapout of kernel stack */
10125}
10126
10127/*
10128 * Called from within the procedure above to deal with unsatisfied
10129 * allocation dependencies in a directory. The buffer must be locked,
10130 * thus, no I/O completion operations can occur while we are
10131 * manipulating its associated dependencies.
10132 */
10133static void
10134initiate_write_filepage(pagedep, bp)
10135	struct pagedep *pagedep;
10136	struct buf *bp;
10137{
10138	struct jremref *jremref;
10139	struct jmvref *jmvref;
10140	struct dirrem *dirrem;
10141	struct diradd *dap;
10142	struct direct *ep;
10143	int i;
10144
10145	if (pagedep->pd_state & IOSTARTED) {
10146		/*
10147		 * This can only happen if there is a driver that does not
10148		 * understand chaining. Here biodone will reissue the call
10149		 * to strategy for the incomplete buffers.
10150		 */
10151		printf("initiate_write_filepage: already started\n");
10152		return;
10153	}
10154	pagedep->pd_state |= IOSTARTED;
10155	/*
10156	 * Wait for all journal remove dependencies to hit the disk.
10157	 * We can not allow any potentially conflicting directory adds
10158	 * to be visible before removes and rollback is too difficult.
10159	 * The per-filesystem lock may be dropped and re-acquired, however
10160	 * we hold the buf locked so the dependency can not go away.
10161	 */
10162	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10163		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10164			jwait(&jremref->jr_list, MNT_WAIT);
10165	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10166		jwait(&jmvref->jm_list, MNT_WAIT);
10167	for (i = 0; i < DAHASHSZ; i++) {
10168		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10169			ep = (struct direct *)
10170			    ((char *)bp->b_data + dap->da_offset);
10171			if (ep->d_ino != dap->da_newinum)
10172				panic("%s: dir inum %ju != new %ju",
10173				    "initiate_write_filepage",
10174				    (uintmax_t)ep->d_ino,
10175				    (uintmax_t)dap->da_newinum);
10176			if (dap->da_state & DIRCHG)
10177				ep->d_ino = dap->da_previous->dm_oldinum;
10178			else
10179				ep->d_ino = 0;
10180			dap->da_state &= ~ATTACHED;
10181			dap->da_state |= UNDONE;
10182		}
10183	}
10184}
10185
10186/*
10187 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10188 * Note that any bug fixes made to this routine must be done in the
10189 * version found below.
10190 *
10191 * Called from within the procedure above to deal with unsatisfied
10192 * allocation dependencies in an inodeblock. The buffer must be
10193 * locked, thus, no I/O completion operations can occur while we
10194 * are manipulating its associated dependencies.
10195 */
10196static void
10197initiate_write_inodeblock_ufs1(inodedep, bp)
10198	struct inodedep *inodedep;
10199	struct buf *bp;			/* The inode block */
10200{
10201	struct allocdirect *adp, *lastadp;
10202	struct ufs1_dinode *dp;
10203	struct ufs1_dinode *sip;
10204	struct inoref *inoref;
10205	struct ufsmount *ump;
10206	struct fs *fs;
10207	ufs_lbn_t i;
10208#ifdef INVARIANTS
10209	ufs_lbn_t prevlbn = 0;
10210#endif
10211	int deplist;
10212
10213	if (inodedep->id_state & IOSTARTED)
10214		panic("initiate_write_inodeblock_ufs1: already started");
10215	inodedep->id_state |= IOSTARTED;
10216	fs = inodedep->id_fs;
10217	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10218	LOCK_OWNED(ump);
10219	dp = (struct ufs1_dinode *)bp->b_data +
10220	    ino_to_fsbo(fs, inodedep->id_ino);
10221
10222	/*
10223	 * If we're on the unlinked list but have not yet written our
10224	 * next pointer initialize it here.
10225	 */
10226	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10227		struct inodedep *inon;
10228
10229		inon = TAILQ_NEXT(inodedep, id_unlinked);
10230		dp->di_freelink = inon ? inon->id_ino : 0;
10231	}
10232	/*
10233	 * If the bitmap is not yet written, then the allocated
10234	 * inode cannot be written to disk.
10235	 */
10236	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10237		if (inodedep->id_savedino1 != NULL)
10238			panic("initiate_write_inodeblock_ufs1: I/O underway");
10239		FREE_LOCK(ump);
10240		sip = malloc(sizeof(struct ufs1_dinode),
10241		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10242		ACQUIRE_LOCK(ump);
10243		inodedep->id_savedino1 = sip;
10244		*inodedep->id_savedino1 = *dp;
10245		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10246		dp->di_gen = inodedep->id_savedino1->di_gen;
10247		dp->di_freelink = inodedep->id_savedino1->di_freelink;
10248		return;
10249	}
10250	/*
10251	 * If no dependencies, then there is nothing to roll back.
10252	 */
10253	inodedep->id_savedsize = dp->di_size;
10254	inodedep->id_savedextsize = 0;
10255	inodedep->id_savednlink = dp->di_nlink;
10256	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10257	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10258		return;
10259	/*
10260	 * Revert the link count to that of the first unwritten journal entry.
10261	 */
10262	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10263	if (inoref)
10264		dp->di_nlink = inoref->if_nlink;
10265	/*
10266	 * Set the dependencies to busy.
10267	 */
10268	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10269	     adp = TAILQ_NEXT(adp, ad_next)) {
10270#ifdef INVARIANTS
10271		if (deplist != 0 && prevlbn >= adp->ad_offset)
10272			panic("softdep_write_inodeblock: lbn order");
10273		prevlbn = adp->ad_offset;
10274		if (adp->ad_offset < UFS_NDADDR &&
10275		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10276			panic("initiate_write_inodeblock_ufs1: "
10277			    "direct pointer #%jd mismatch %d != %jd",
10278			    (intmax_t)adp->ad_offset,
10279			    dp->di_db[adp->ad_offset],
10280			    (intmax_t)adp->ad_newblkno);
10281		if (adp->ad_offset >= UFS_NDADDR &&
10282		    dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10283			panic("initiate_write_inodeblock_ufs1: "
10284			    "indirect pointer #%jd mismatch %d != %jd",
10285			    (intmax_t)adp->ad_offset - UFS_NDADDR,
10286			    dp->di_ib[adp->ad_offset - UFS_NDADDR],
10287			    (intmax_t)adp->ad_newblkno);
10288		deplist |= 1 << adp->ad_offset;
10289		if ((adp->ad_state & ATTACHED) == 0)
10290			panic("initiate_write_inodeblock_ufs1: "
10291			    "Unknown state 0x%x", adp->ad_state);
10292#endif /* INVARIANTS */
10293		adp->ad_state &= ~ATTACHED;
10294		adp->ad_state |= UNDONE;
10295	}
10296	/*
10297	 * The on-disk inode cannot claim to be any larger than the last
10298	 * fragment that has been written. Otherwise, the on-disk inode
10299	 * might have fragments that were not the last block in the file
10300	 * which would corrupt the filesystem.
10301	 */
10302	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10303	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10304		if (adp->ad_offset >= UFS_NDADDR)
10305			break;
10306		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10307		/* keep going until hitting a rollback to a frag */
10308		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10309			continue;
10310		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10311		for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10312#ifdef INVARIANTS
10313			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10314				panic("initiate_write_inodeblock_ufs1: "
10315				    "lost dep1");
10316#endif /* INVARIANTS */
10317			dp->di_db[i] = 0;
10318		}
10319		for (i = 0; i < UFS_NIADDR; i++) {
10320#ifdef INVARIANTS
10321			if (dp->di_ib[i] != 0 &&
10322			    (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10323				panic("initiate_write_inodeblock_ufs1: "
10324				    "lost dep2");
10325#endif /* INVARIANTS */
10326			dp->di_ib[i] = 0;
10327		}
10328		return;
10329	}
10330	/*
10331	 * If we have zero'ed out the last allocated block of the file,
10332	 * roll back the size to the last currently allocated block.
10333	 * We know that this last allocated block is a full-sized as
10334	 * we already checked for fragments in the loop above.
10335	 */
10336	if (lastadp != NULL &&
10337	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10338		for (i = lastadp->ad_offset; i >= 0; i--)
10339			if (dp->di_db[i] != 0)
10340				break;
10341		dp->di_size = (i + 1) * fs->fs_bsize;
10342	}
10343	/*
10344	 * The only dependencies are for indirect blocks.
10345	 *
10346	 * The file size for indirect block additions is not guaranteed.
10347	 * Such a guarantee would be non-trivial to achieve. The conventional
10348	 * synchronous write implementation also does not make this guarantee.
10349	 * Fsck should catch and fix discrepancies. Arguably, the file size
10350	 * can be over-estimated without destroying integrity when the file
10351	 * moves into the indirect blocks (i.e., is large). If we want to
10352	 * postpone fsck, we are stuck with this argument.
10353	 */
10354	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10355		dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10356}
10357
10358/*
10359 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10360 * Note that any bug fixes made to this routine must be done in the
10361 * version found above.
10362 *
10363 * Called from within the procedure above to deal with unsatisfied
10364 * allocation dependencies in an inodeblock. The buffer must be
10365 * locked, thus, no I/O completion operations can occur while we
10366 * are manipulating its associated dependencies.
10367 */
10368static void
10369initiate_write_inodeblock_ufs2(inodedep, bp)
10370	struct inodedep *inodedep;
10371	struct buf *bp;			/* The inode block */
10372{
10373	struct allocdirect *adp, *lastadp;
10374	struct ufs2_dinode *dp;
10375	struct ufs2_dinode *sip;
10376	struct inoref *inoref;
10377	struct ufsmount *ump;
10378	struct fs *fs;
10379	ufs_lbn_t i;
10380#ifdef INVARIANTS
10381	ufs_lbn_t prevlbn = 0;
10382#endif
10383	int deplist;
10384
10385	if (inodedep->id_state & IOSTARTED)
10386		panic("initiate_write_inodeblock_ufs2: already started");
10387	inodedep->id_state |= IOSTARTED;
10388	fs = inodedep->id_fs;
10389	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10390	LOCK_OWNED(ump);
10391	dp = (struct ufs2_dinode *)bp->b_data +
10392	    ino_to_fsbo(fs, inodedep->id_ino);
10393
10394	/*
10395	 * If we're on the unlinked list but have not yet written our
10396	 * next pointer initialize it here.
10397	 */
10398	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10399		struct inodedep *inon;
10400
10401		inon = TAILQ_NEXT(inodedep, id_unlinked);
10402		dp->di_freelink = inon ? inon->id_ino : 0;
10403	}
10404	/*
10405	 * If the bitmap is not yet written, then the allocated
10406	 * inode cannot be written to disk.
10407	 */
10408	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10409		if (inodedep->id_savedino2 != NULL)
10410			panic("initiate_write_inodeblock_ufs2: I/O underway");
10411		FREE_LOCK(ump);
10412		sip = malloc(sizeof(struct ufs2_dinode),
10413		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10414		ACQUIRE_LOCK(ump);
10415		inodedep->id_savedino2 = sip;
10416		*inodedep->id_savedino2 = *dp;
10417		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10418		dp->di_gen = inodedep->id_savedino2->di_gen;
10419		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10420		return;
10421	}
10422	/*
10423	 * If no dependencies, then there is nothing to roll back.
10424	 */
10425	inodedep->id_savedsize = dp->di_size;
10426	inodedep->id_savedextsize = dp->di_extsize;
10427	inodedep->id_savednlink = dp->di_nlink;
10428	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10429	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10430	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10431		return;
10432	/*
10433	 * Revert the link count to that of the first unwritten journal entry.
10434	 */
10435	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10436	if (inoref)
10437		dp->di_nlink = inoref->if_nlink;
10438
10439	/*
10440	 * Set the ext data dependencies to busy.
10441	 */
10442	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10443	     adp = TAILQ_NEXT(adp, ad_next)) {
10444#ifdef INVARIANTS
10445		if (deplist != 0 && prevlbn >= adp->ad_offset)
10446			panic("initiate_write_inodeblock_ufs2: lbn order");
10447		prevlbn = adp->ad_offset;
10448		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10449			panic("initiate_write_inodeblock_ufs2: "
10450			    "ext pointer #%jd mismatch %jd != %jd",
10451			    (intmax_t)adp->ad_offset,
10452			    (intmax_t)dp->di_extb[adp->ad_offset],
10453			    (intmax_t)adp->ad_newblkno);
10454		deplist |= 1 << adp->ad_offset;
10455		if ((adp->ad_state & ATTACHED) == 0)
10456			panic("initiate_write_inodeblock_ufs2: Unknown "
10457			    "state 0x%x", adp->ad_state);
10458#endif /* INVARIANTS */
10459		adp->ad_state &= ~ATTACHED;
10460		adp->ad_state |= UNDONE;
10461	}
10462	/*
10463	 * The on-disk inode cannot claim to be any larger than the last
10464	 * fragment that has been written. Otherwise, the on-disk inode
10465	 * might have fragments that were not the last block in the ext
10466	 * data which would corrupt the filesystem.
10467	 */
10468	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10469	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10470		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10471		/* keep going until hitting a rollback to a frag */
10472		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10473			continue;
10474		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10475		for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) {
10476#ifdef INVARIANTS
10477			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10478				panic("initiate_write_inodeblock_ufs2: "
10479				    "lost dep1");
10480#endif /* INVARIANTS */
10481			dp->di_extb[i] = 0;
10482		}
10483		lastadp = NULL;
10484		break;
10485	}
10486	/*
10487	 * If we have zero'ed out the last allocated block of the ext
10488	 * data, roll back the size to the last currently allocated block.
10489	 * We know that this last allocated block is a full-sized as
10490	 * we already checked for fragments in the loop above.
10491	 */
10492	if (lastadp != NULL &&
10493	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10494		for (i = lastadp->ad_offset; i >= 0; i--)
10495			if (dp->di_extb[i] != 0)
10496				break;
10497		dp->di_extsize = (i + 1) * fs->fs_bsize;
10498	}
10499	/*
10500	 * Set the file data dependencies to busy.
10501	 */
10502	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10503	     adp = TAILQ_NEXT(adp, ad_next)) {
10504#ifdef INVARIANTS
10505		if (deplist != 0 && prevlbn >= adp->ad_offset)
10506			panic("softdep_write_inodeblock: lbn order");
10507		if ((adp->ad_state & ATTACHED) == 0)
10508			panic("inodedep %p and adp %p not attached", inodedep, adp);
10509		prevlbn = adp->ad_offset;
10510		if (adp->ad_offset < UFS_NDADDR &&
10511		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10512			panic("initiate_write_inodeblock_ufs2: "
10513			    "direct pointer #%jd mismatch %jd != %jd",
10514			    (intmax_t)adp->ad_offset,
10515			    (intmax_t)dp->di_db[adp->ad_offset],
10516			    (intmax_t)adp->ad_newblkno);
10517		if (adp->ad_offset >= UFS_NDADDR &&
10518		    dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10519			panic("initiate_write_inodeblock_ufs2: "
10520			    "indirect pointer #%jd mismatch %jd != %jd",
10521			    (intmax_t)adp->ad_offset - UFS_NDADDR,
10522			    (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR],
10523			    (intmax_t)adp->ad_newblkno);
10524		deplist |= 1 << adp->ad_offset;
10525		if ((adp->ad_state & ATTACHED) == 0)
10526			panic("initiate_write_inodeblock_ufs2: Unknown "
10527			     "state 0x%x", adp->ad_state);
10528#endif /* INVARIANTS */
10529		adp->ad_state &= ~ATTACHED;
10530		adp->ad_state |= UNDONE;
10531	}
10532	/*
10533	 * The on-disk inode cannot claim to be any larger than the last
10534	 * fragment that has been written. Otherwise, the on-disk inode
10535	 * might have fragments that were not the last block in the file
10536	 * which would corrupt the filesystem.
10537	 */
10538	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10539	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10540		if (adp->ad_offset >= UFS_NDADDR)
10541			break;
10542		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10543		/* keep going until hitting a rollback to a frag */
10544		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10545			continue;
10546		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10547		for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10548#ifdef INVARIANTS
10549			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10550				panic("initiate_write_inodeblock_ufs2: "
10551				    "lost dep2");
10552#endif /* INVARIANTS */
10553			dp->di_db[i] = 0;
10554		}
10555		for (i = 0; i < UFS_NIADDR; i++) {
10556#ifdef INVARIANTS
10557			if (dp->di_ib[i] != 0 &&
10558			    (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10559				panic("initiate_write_inodeblock_ufs2: "
10560				    "lost dep3");
10561#endif /* INVARIANTS */
10562			dp->di_ib[i] = 0;
10563		}
10564		return;
10565	}
10566	/*
10567	 * If we have zero'ed out the last allocated block of the file,
10568	 * roll back the size to the last currently allocated block.
10569	 * We know that this last allocated block is a full-sized as
10570	 * we already checked for fragments in the loop above.
10571	 */
10572	if (lastadp != NULL &&
10573	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10574		for (i = lastadp->ad_offset; i >= 0; i--)
10575			if (dp->di_db[i] != 0)
10576				break;
10577		dp->di_size = (i + 1) * fs->fs_bsize;
10578	}
10579	/*
10580	 * The only dependencies are for indirect blocks.
10581	 *
10582	 * The file size for indirect block additions is not guaranteed.
10583	 * Such a guarantee would be non-trivial to achieve. The conventional
10584	 * synchronous write implementation also does not make this guarantee.
10585	 * Fsck should catch and fix discrepancies. Arguably, the file size
10586	 * can be over-estimated without destroying integrity when the file
10587	 * moves into the indirect blocks (i.e., is large). If we want to
10588	 * postpone fsck, we are stuck with this argument.
10589	 */
10590	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10591		dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10592}
10593
10594/*
10595 * Cancel an indirdep as a result of truncation.  Release all of the
10596 * children allocindirs and place their journal work on the appropriate
10597 * list.
10598 */
10599static void
10600cancel_indirdep(indirdep, bp, freeblks)
10601	struct indirdep *indirdep;
10602	struct buf *bp;
10603	struct freeblks *freeblks;
10604{
10605	struct allocindir *aip;
10606
10607	/*
10608	 * None of the indirect pointers will ever be visible,
10609	 * so they can simply be tossed. GOINGAWAY ensures
10610	 * that allocated pointers will be saved in the buffer
10611	 * cache until they are freed. Note that they will
10612	 * only be able to be found by their physical address
10613	 * since the inode mapping the logical address will
10614	 * be gone. The save buffer used for the safe copy
10615	 * was allocated in setup_allocindir_phase2 using
10616	 * the physical address so it could be used for this
10617	 * purpose. Hence we swap the safe copy with the real
10618	 * copy, allowing the safe copy to be freed and holding
10619	 * on to the real copy for later use in indir_trunc.
10620	 */
10621	if (indirdep->ir_state & GOINGAWAY)
10622		panic("cancel_indirdep: already gone");
10623	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10624		indirdep->ir_state |= DEPCOMPLETE;
10625		LIST_REMOVE(indirdep, ir_next);
10626	}
10627	indirdep->ir_state |= GOINGAWAY;
10628	/*
10629	 * Pass in bp for blocks still have journal writes
10630	 * pending so we can cancel them on their own.
10631	 */
10632	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
10633		cancel_allocindir(aip, bp, freeblks, 0);
10634	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
10635		cancel_allocindir(aip, NULL, freeblks, 0);
10636	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
10637		cancel_allocindir(aip, NULL, freeblks, 0);
10638	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
10639		cancel_allocindir(aip, NULL, freeblks, 0);
10640	/*
10641	 * If there are pending partial truncations we need to keep the
10642	 * old block copy around until they complete.  This is because
10643	 * the current b_data is not a perfect superset of the available
10644	 * blocks.
10645	 */
10646	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10647		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10648	else
10649		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10650	WORKLIST_REMOVE(&indirdep->ir_list);
10651	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10652	indirdep->ir_bp = NULL;
10653	indirdep->ir_freeblks = freeblks;
10654}
10655
10656/*
10657 * Free an indirdep once it no longer has new pointers to track.
10658 */
10659static void
10660free_indirdep(indirdep)
10661	struct indirdep *indirdep;
10662{
10663
10664	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10665	    ("free_indirdep: Indir trunc list not empty."));
10666	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10667	    ("free_indirdep: Complete head not empty."));
10668	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10669	    ("free_indirdep: write head not empty."));
10670	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10671	    ("free_indirdep: done head not empty."));
10672	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10673	    ("free_indirdep: deplist head not empty."));
10674	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10675	    ("free_indirdep: %p still on newblk list.", indirdep));
10676	KASSERT(indirdep->ir_saveddata == NULL,
10677	    ("free_indirdep: %p still has saved data.", indirdep));
10678	if (indirdep->ir_state & ONWORKLIST)
10679		WORKLIST_REMOVE(&indirdep->ir_list);
10680	WORKITEM_FREE(indirdep, D_INDIRDEP);
10681}
10682
10683/*
10684 * Called before a write to an indirdep.  This routine is responsible for
10685 * rolling back pointers to a safe state which includes only those
10686 * allocindirs which have been completed.
10687 */
10688static void
10689initiate_write_indirdep(indirdep, bp)
10690	struct indirdep *indirdep;
10691	struct buf *bp;
10692{
10693	struct ufsmount *ump;
10694
10695	indirdep->ir_state |= IOSTARTED;
10696	if (indirdep->ir_state & GOINGAWAY)
10697		panic("disk_io_initiation: indirdep gone");
10698	/*
10699	 * If there are no remaining dependencies, this will be writing
10700	 * the real pointers.
10701	 */
10702	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10703	    TAILQ_EMPTY(&indirdep->ir_trunc))
10704		return;
10705	/*
10706	 * Replace up-to-date version with safe version.
10707	 */
10708	if (indirdep->ir_saveddata == NULL) {
10709		ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10710		LOCK_OWNED(ump);
10711		FREE_LOCK(ump);
10712		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10713		    M_SOFTDEP_FLAGS);
10714		ACQUIRE_LOCK(ump);
10715	}
10716	indirdep->ir_state &= ~ATTACHED;
10717	indirdep->ir_state |= UNDONE;
10718	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10719	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10720	    bp->b_bcount);
10721}
10722
10723/*
10724 * Called when an inode has been cleared in a cg bitmap.  This finally
10725 * eliminates any canceled jaddrefs
10726 */
10727void
10728softdep_setup_inofree(mp, bp, ino, wkhd)
10729	struct mount *mp;
10730	struct buf *bp;
10731	ino_t ino;
10732	struct workhead *wkhd;
10733{
10734	struct worklist *wk, *wkn;
10735	struct inodedep *inodedep;
10736	struct ufsmount *ump;
10737	uint8_t *inosused;
10738	struct cg *cgp;
10739	struct fs *fs;
10740
10741	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10742	    ("softdep_setup_inofree called on non-softdep filesystem"));
10743	ump = VFSTOUFS(mp);
10744	ACQUIRE_LOCK(ump);
10745	fs = ump->um_fs;
10746	cgp = (struct cg *)bp->b_data;
10747	inosused = cg_inosused(cgp);
10748	if (isset(inosused, ino % fs->fs_ipg))
10749		panic("softdep_setup_inofree: inode %ju not freed.",
10750		    (uintmax_t)ino);
10751	if (inodedep_lookup(mp, ino, 0, &inodedep))
10752		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10753		    (uintmax_t)ino, inodedep);
10754	if (wkhd) {
10755		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10756			if (wk->wk_type != D_JADDREF)
10757				continue;
10758			WORKLIST_REMOVE(wk);
10759			/*
10760			 * We can free immediately even if the jaddref
10761			 * isn't attached in a background write as now
10762			 * the bitmaps are reconciled.
10763			 */
10764			wk->wk_state |= COMPLETE | ATTACHED;
10765			free_jaddref(WK_JADDREF(wk));
10766		}
10767		jwork_move(&bp->b_dep, wkhd);
10768	}
10769	FREE_LOCK(ump);
10770}
10771
10772
10773/*
10774 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10775 * map.  Any dependencies waiting for the write to clear are added to the
10776 * buf's list and any jnewblks that are being canceled are discarded
10777 * immediately.
10778 */
10779void
10780softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10781	struct mount *mp;
10782	struct buf *bp;
10783	ufs2_daddr_t blkno;
10784	int frags;
10785	struct workhead *wkhd;
10786{
10787	struct bmsafemap *bmsafemap;
10788	struct jnewblk *jnewblk;
10789	struct ufsmount *ump;
10790	struct worklist *wk;
10791	struct fs *fs;
10792#ifdef SUJ_DEBUG
10793	uint8_t *blksfree;
10794	struct cg *cgp;
10795	ufs2_daddr_t jstart;
10796	ufs2_daddr_t jend;
10797	ufs2_daddr_t end;
10798	long bno;
10799	int i;
10800#endif
10801
10802	CTR3(KTR_SUJ,
10803	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10804	    blkno, frags, wkhd);
10805
10806	ump = VFSTOUFS(mp);
10807	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10808	    ("softdep_setup_blkfree called on non-softdep filesystem"));
10809	ACQUIRE_LOCK(ump);
10810	/* Lookup the bmsafemap so we track when it is dirty. */
10811	fs = ump->um_fs;
10812	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10813	/*
10814	 * Detach any jnewblks which have been canceled.  They must linger
10815	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10816	 * an unjournaled allocation from hitting the disk.
10817	 */
10818	if (wkhd) {
10819		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10820			CTR2(KTR_SUJ,
10821			    "softdep_setup_blkfree: blkno %jd wk type %d",
10822			    blkno, wk->wk_type);
10823			WORKLIST_REMOVE(wk);
10824			if (wk->wk_type != D_JNEWBLK) {
10825				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10826				continue;
10827			}
10828			jnewblk = WK_JNEWBLK(wk);
10829			KASSERT(jnewblk->jn_state & GOINGAWAY,
10830			    ("softdep_setup_blkfree: jnewblk not canceled."));
10831#ifdef SUJ_DEBUG
10832			/*
10833			 * Assert that this block is free in the bitmap
10834			 * before we discard the jnewblk.
10835			 */
10836			cgp = (struct cg *)bp->b_data;
10837			blksfree = cg_blksfree(cgp);
10838			bno = dtogd(fs, jnewblk->jn_blkno);
10839			for (i = jnewblk->jn_oldfrags;
10840			    i < jnewblk->jn_frags; i++) {
10841				if (isset(blksfree, bno + i))
10842					continue;
10843				panic("softdep_setup_blkfree: not free");
10844			}
10845#endif
10846			/*
10847			 * Even if it's not attached we can free immediately
10848			 * as the new bitmap is correct.
10849			 */
10850			wk->wk_state |= COMPLETE | ATTACHED;
10851			free_jnewblk(jnewblk);
10852		}
10853	}
10854
10855#ifdef SUJ_DEBUG
10856	/*
10857	 * Assert that we are not freeing a block which has an outstanding
10858	 * allocation dependency.
10859	 */
10860	fs = VFSTOUFS(mp)->um_fs;
10861	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10862	end = blkno + frags;
10863	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10864		/*
10865		 * Don't match against blocks that will be freed when the
10866		 * background write is done.
10867		 */
10868		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10869		    (COMPLETE | DEPCOMPLETE))
10870			continue;
10871		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10872		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10873		if ((blkno >= jstart && blkno < jend) ||
10874		    (end > jstart && end <= jend)) {
10875			printf("state 0x%X %jd - %d %d dep %p\n",
10876			    jnewblk->jn_state, jnewblk->jn_blkno,
10877			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10878			    jnewblk->jn_dep);
10879			panic("softdep_setup_blkfree: "
10880			    "%jd-%jd(%d) overlaps with %jd-%jd",
10881			    blkno, end, frags, jstart, jend);
10882		}
10883	}
10884#endif
10885	FREE_LOCK(ump);
10886}
10887
10888/*
10889 * Revert a block allocation when the journal record that describes it
10890 * is not yet written.
10891 */
10892static int
10893jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10894	struct jnewblk *jnewblk;
10895	struct fs *fs;
10896	struct cg *cgp;
10897	uint8_t *blksfree;
10898{
10899	ufs1_daddr_t fragno;
10900	long cgbno, bbase;
10901	int frags, blk;
10902	int i;
10903
10904	frags = 0;
10905	cgbno = dtogd(fs, jnewblk->jn_blkno);
10906	/*
10907	 * We have to test which frags need to be rolled back.  We may
10908	 * be operating on a stale copy when doing background writes.
10909	 */
10910	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10911		if (isclr(blksfree, cgbno + i))
10912			frags++;
10913	if (frags == 0)
10914		return (0);
10915	/*
10916	 * This is mostly ffs_blkfree() sans some validation and
10917	 * superblock updates.
10918	 */
10919	if (frags == fs->fs_frag) {
10920		fragno = fragstoblks(fs, cgbno);
10921		ffs_setblock(fs, blksfree, fragno);
10922		ffs_clusteracct(fs, cgp, fragno, 1);
10923		cgp->cg_cs.cs_nbfree++;
10924	} else {
10925		cgbno += jnewblk->jn_oldfrags;
10926		bbase = cgbno - fragnum(fs, cgbno);
10927		/* Decrement the old frags.  */
10928		blk = blkmap(fs, blksfree, bbase);
10929		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10930		/* Deallocate the fragment */
10931		for (i = 0; i < frags; i++)
10932			setbit(blksfree, cgbno + i);
10933		cgp->cg_cs.cs_nffree += frags;
10934		/* Add back in counts associated with the new frags */
10935		blk = blkmap(fs, blksfree, bbase);
10936		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10937		/* If a complete block has been reassembled, account for it. */
10938		fragno = fragstoblks(fs, bbase);
10939		if (ffs_isblock(fs, blksfree, fragno)) {
10940			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10941			ffs_clusteracct(fs, cgp, fragno, 1);
10942			cgp->cg_cs.cs_nbfree++;
10943		}
10944	}
10945	stat_jnewblk++;
10946	jnewblk->jn_state &= ~ATTACHED;
10947	jnewblk->jn_state |= UNDONE;
10948
10949	return (frags);
10950}
10951
10952static void
10953initiate_write_bmsafemap(bmsafemap, bp)
10954	struct bmsafemap *bmsafemap;
10955	struct buf *bp;			/* The cg block. */
10956{
10957	struct jaddref *jaddref;
10958	struct jnewblk *jnewblk;
10959	uint8_t *inosused;
10960	uint8_t *blksfree;
10961	struct cg *cgp;
10962	struct fs *fs;
10963	ino_t ino;
10964
10965	/*
10966	 * If this is a background write, we did this at the time that
10967	 * the copy was made, so do not need to do it again.
10968	 */
10969	if (bmsafemap->sm_state & IOSTARTED)
10970		return;
10971	bmsafemap->sm_state |= IOSTARTED;
10972	/*
10973	 * Clear any inode allocations which are pending journal writes.
10974	 */
10975	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10976		cgp = (struct cg *)bp->b_data;
10977		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10978		inosused = cg_inosused(cgp);
10979		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10980			ino = jaddref->ja_ino % fs->fs_ipg;
10981			if (isset(inosused, ino)) {
10982				if ((jaddref->ja_mode & IFMT) == IFDIR)
10983					cgp->cg_cs.cs_ndir--;
10984				cgp->cg_cs.cs_nifree++;
10985				clrbit(inosused, ino);
10986				jaddref->ja_state &= ~ATTACHED;
10987				jaddref->ja_state |= UNDONE;
10988				stat_jaddref++;
10989			} else
10990				panic("initiate_write_bmsafemap: inode %ju "
10991				    "marked free", (uintmax_t)jaddref->ja_ino);
10992		}
10993	}
10994	/*
10995	 * Clear any block allocations which are pending journal writes.
10996	 */
10997	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10998		cgp = (struct cg *)bp->b_data;
10999		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11000		blksfree = cg_blksfree(cgp);
11001		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
11002			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
11003				continue;
11004			panic("initiate_write_bmsafemap: block %jd "
11005			    "marked free", jnewblk->jn_blkno);
11006		}
11007	}
11008	/*
11009	 * Move allocation lists to the written lists so they can be
11010	 * cleared once the block write is complete.
11011	 */
11012	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
11013	    inodedep, id_deps);
11014	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
11015	    newblk, nb_deps);
11016	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
11017	    wk_list);
11018}
11019
11020/*
11021 * This routine is called during the completion interrupt
11022 * service routine for a disk write (from the procedure called
11023 * by the device driver to inform the filesystem caches of
11024 * a request completion).  It should be called early in this
11025 * procedure, before the block is made available to other
11026 * processes or other routines are called.
11027 *
11028 */
11029static void
11030softdep_disk_write_complete(bp)
11031	struct buf *bp;		/* describes the completed disk write */
11032{
11033	struct worklist *wk;
11034	struct worklist *owk;
11035	struct ufsmount *ump;
11036	struct workhead reattach;
11037	struct freeblks *freeblks;
11038	struct buf *sbp;
11039
11040	ump = softdep_bp_to_mp(bp);
11041	if (ump == NULL)
11042		return;
11043
11044	sbp = NULL;
11045
11046	/*
11047	 * If an error occurred while doing the write, then the data
11048	 * has not hit the disk and the dependencies cannot be processed.
11049	 * But we do have to go through and roll forward any dependencies
11050	 * that were rolled back before the disk write.
11051	 */
11052	ACQUIRE_LOCK(ump);
11053	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
11054		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11055			switch (wk->wk_type) {
11056
11057			case D_PAGEDEP:
11058				handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
11059				continue;
11060
11061			case D_INODEDEP:
11062				handle_written_inodeblock(WK_INODEDEP(wk),
11063				    bp, 0);
11064				continue;
11065
11066			case D_BMSAFEMAP:
11067				handle_written_bmsafemap(WK_BMSAFEMAP(wk),
11068				    bp, 0);
11069				continue;
11070
11071			case D_INDIRDEP:
11072				handle_written_indirdep(WK_INDIRDEP(wk),
11073				    bp, &sbp, 0);
11074				continue;
11075			default:
11076				/* nothing to roll forward */
11077				continue;
11078			}
11079		}
11080		FREE_LOCK(ump);
11081		return;
11082	}
11083	LIST_INIT(&reattach);
11084
11085	/*
11086	 * Ump SU lock must not be released anywhere in this code segment.
11087	 */
11088	owk = NULL;
11089	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
11090		WORKLIST_REMOVE(wk);
11091		atomic_add_long(&dep_write[wk->wk_type], 1);
11092		if (wk == owk)
11093			panic("duplicate worklist: %p\n", wk);
11094		owk = wk;
11095		switch (wk->wk_type) {
11096
11097		case D_PAGEDEP:
11098			if (handle_written_filepage(WK_PAGEDEP(wk), bp,
11099			    WRITESUCCEEDED))
11100				WORKLIST_INSERT(&reattach, wk);
11101			continue;
11102
11103		case D_INODEDEP:
11104			if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
11105			    WRITESUCCEEDED))
11106				WORKLIST_INSERT(&reattach, wk);
11107			continue;
11108
11109		case D_BMSAFEMAP:
11110			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
11111			    WRITESUCCEEDED))
11112				WORKLIST_INSERT(&reattach, wk);
11113			continue;
11114
11115		case D_MKDIR:
11116			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
11117			continue;
11118
11119		case D_ALLOCDIRECT:
11120			wk->wk_state |= COMPLETE;
11121			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
11122			continue;
11123
11124		case D_ALLOCINDIR:
11125			wk->wk_state |= COMPLETE;
11126			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11127			continue;
11128
11129		case D_INDIRDEP:
11130			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
11131			    WRITESUCCEEDED))
11132				WORKLIST_INSERT(&reattach, wk);
11133			continue;
11134
11135		case D_FREEBLKS:
11136			wk->wk_state |= COMPLETE;
11137			freeblks = WK_FREEBLKS(wk);
11138			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11139			    LIST_EMPTY(&freeblks->fb_jblkdephd))
11140				add_to_worklist(wk, WK_NODELAY);
11141			continue;
11142
11143		case D_FREEWORK:
11144			handle_written_freework(WK_FREEWORK(wk));
11145			break;
11146
11147		case D_JSEGDEP:
11148			free_jsegdep(WK_JSEGDEP(wk));
11149			continue;
11150
11151		case D_JSEG:
11152			handle_written_jseg(WK_JSEG(wk), bp);
11153			continue;
11154
11155		case D_SBDEP:
11156			if (handle_written_sbdep(WK_SBDEP(wk), bp))
11157				WORKLIST_INSERT(&reattach, wk);
11158			continue;
11159
11160		case D_FREEDEP:
11161			free_freedep(WK_FREEDEP(wk));
11162			continue;
11163
11164		default:
11165			panic("handle_disk_write_complete: Unknown type %s",
11166			    TYPENAME(wk->wk_type));
11167			/* NOTREACHED */
11168		}
11169	}
11170	/*
11171	 * Reattach any requests that must be redone.
11172	 */
11173	while ((wk = LIST_FIRST(&reattach)) != NULL) {
11174		WORKLIST_REMOVE(wk);
11175		WORKLIST_INSERT(&bp->b_dep, wk);
11176	}
11177	FREE_LOCK(ump);
11178	if (sbp)
11179		brelse(sbp);
11180}
11181
11182/*
11183 * Called from within softdep_disk_write_complete above.
11184 */
11185static void
11186handle_allocdirect_partdone(adp, wkhd)
11187	struct allocdirect *adp;	/* the completed allocdirect */
11188	struct workhead *wkhd;		/* Work to do when inode is writtne. */
11189{
11190	struct allocdirectlst *listhead;
11191	struct allocdirect *listadp;
11192	struct inodedep *inodedep;
11193	long bsize;
11194
11195	LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp));
11196	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11197		return;
11198	/*
11199	 * The on-disk inode cannot claim to be any larger than the last
11200	 * fragment that has been written. Otherwise, the on-disk inode
11201	 * might have fragments that were not the last block in the file
11202	 * which would corrupt the filesystem. Thus, we cannot free any
11203	 * allocdirects after one whose ad_oldblkno claims a fragment as
11204	 * these blocks must be rolled back to zero before writing the inode.
11205	 * We check the currently active set of allocdirects in id_inoupdt
11206	 * or id_extupdt as appropriate.
11207	 */
11208	inodedep = adp->ad_inodedep;
11209	bsize = inodedep->id_fs->fs_bsize;
11210	if (adp->ad_state & EXTDATA)
11211		listhead = &inodedep->id_extupdt;
11212	else
11213		listhead = &inodedep->id_inoupdt;
11214	TAILQ_FOREACH(listadp, listhead, ad_next) {
11215		/* found our block */
11216		if (listadp == adp)
11217			break;
11218		/* continue if ad_oldlbn is not a fragment */
11219		if (listadp->ad_oldsize == 0 ||
11220		    listadp->ad_oldsize == bsize)
11221			continue;
11222		/* hit a fragment */
11223		return;
11224	}
11225	/*
11226	 * If we have reached the end of the current list without
11227	 * finding the just finished dependency, then it must be
11228	 * on the future dependency list. Future dependencies cannot
11229	 * be freed until they are moved to the current list.
11230	 */
11231	if (listadp == NULL) {
11232#ifdef DEBUG
11233		if (adp->ad_state & EXTDATA)
11234			listhead = &inodedep->id_newextupdt;
11235		else
11236			listhead = &inodedep->id_newinoupdt;
11237		TAILQ_FOREACH(listadp, listhead, ad_next)
11238			/* found our block */
11239			if (listadp == adp)
11240				break;
11241		if (listadp == NULL)
11242			panic("handle_allocdirect_partdone: lost dep");
11243#endif /* DEBUG */
11244		return;
11245	}
11246	/*
11247	 * If we have found the just finished dependency, then queue
11248	 * it along with anything that follows it that is complete.
11249	 * Since the pointer has not yet been written in the inode
11250	 * as the dependency prevents it, place the allocdirect on the
11251	 * bufwait list where it will be freed once the pointer is
11252	 * valid.
11253	 */
11254	if (wkhd == NULL)
11255		wkhd = &inodedep->id_bufwait;
11256	for (; adp; adp = listadp) {
11257		listadp = TAILQ_NEXT(adp, ad_next);
11258		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11259			return;
11260		TAILQ_REMOVE(listhead, adp, ad_next);
11261		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11262	}
11263}
11264
11265/*
11266 * Called from within softdep_disk_write_complete above.  This routine
11267 * completes successfully written allocindirs.
11268 */
11269static void
11270handle_allocindir_partdone(aip)
11271	struct allocindir *aip;		/* the completed allocindir */
11272{
11273	struct indirdep *indirdep;
11274
11275	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11276		return;
11277	indirdep = aip->ai_indirdep;
11278	LIST_REMOVE(aip, ai_next);
11279	/*
11280	 * Don't set a pointer while the buffer is undergoing IO or while
11281	 * we have active truncations.
11282	 */
11283	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11284		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11285		return;
11286	}
11287	if (indirdep->ir_state & UFS1FMT)
11288		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11289		    aip->ai_newblkno;
11290	else
11291		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11292		    aip->ai_newblkno;
11293	/*
11294	 * Await the pointer write before freeing the allocindir.
11295	 */
11296	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11297}
11298
11299/*
11300 * Release segments held on a jwork list.
11301 */
11302static void
11303handle_jwork(wkhd)
11304	struct workhead *wkhd;
11305{
11306	struct worklist *wk;
11307
11308	while ((wk = LIST_FIRST(wkhd)) != NULL) {
11309		WORKLIST_REMOVE(wk);
11310		switch (wk->wk_type) {
11311		case D_JSEGDEP:
11312			free_jsegdep(WK_JSEGDEP(wk));
11313			continue;
11314		case D_FREEDEP:
11315			free_freedep(WK_FREEDEP(wk));
11316			continue;
11317		case D_FREEFRAG:
11318			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11319			WORKITEM_FREE(wk, D_FREEFRAG);
11320			continue;
11321		case D_FREEWORK:
11322			handle_written_freework(WK_FREEWORK(wk));
11323			continue;
11324		default:
11325			panic("handle_jwork: Unknown type %s\n",
11326			    TYPENAME(wk->wk_type));
11327		}
11328	}
11329}
11330
11331/*
11332 * Handle the bufwait list on an inode when it is safe to release items
11333 * held there.  This normally happens after an inode block is written but
11334 * may be delayed and handled later if there are pending journal items that
11335 * are not yet safe to be released.
11336 */
11337static struct freefile *
11338handle_bufwait(inodedep, refhd)
11339	struct inodedep *inodedep;
11340	struct workhead *refhd;
11341{
11342	struct jaddref *jaddref;
11343	struct freefile *freefile;
11344	struct worklist *wk;
11345
11346	freefile = NULL;
11347	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11348		WORKLIST_REMOVE(wk);
11349		switch (wk->wk_type) {
11350		case D_FREEFILE:
11351			/*
11352			 * We defer adding freefile to the worklist
11353			 * until all other additions have been made to
11354			 * ensure that it will be done after all the
11355			 * old blocks have been freed.
11356			 */
11357			if (freefile != NULL)
11358				panic("handle_bufwait: freefile");
11359			freefile = WK_FREEFILE(wk);
11360			continue;
11361
11362		case D_MKDIR:
11363			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11364			continue;
11365
11366		case D_DIRADD:
11367			diradd_inode_written(WK_DIRADD(wk), inodedep);
11368			continue;
11369
11370		case D_FREEFRAG:
11371			wk->wk_state |= COMPLETE;
11372			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11373				add_to_worklist(wk, 0);
11374			continue;
11375
11376		case D_DIRREM:
11377			wk->wk_state |= COMPLETE;
11378			add_to_worklist(wk, 0);
11379			continue;
11380
11381		case D_ALLOCDIRECT:
11382		case D_ALLOCINDIR:
11383			free_newblk(WK_NEWBLK(wk));
11384			continue;
11385
11386		case D_JNEWBLK:
11387			wk->wk_state |= COMPLETE;
11388			free_jnewblk(WK_JNEWBLK(wk));
11389			continue;
11390
11391		/*
11392		 * Save freed journal segments and add references on
11393		 * the supplied list which will delay their release
11394		 * until the cg bitmap is cleared on disk.
11395		 */
11396		case D_JSEGDEP:
11397			if (refhd == NULL)
11398				free_jsegdep(WK_JSEGDEP(wk));
11399			else
11400				WORKLIST_INSERT(refhd, wk);
11401			continue;
11402
11403		case D_JADDREF:
11404			jaddref = WK_JADDREF(wk);
11405			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11406			    if_deps);
11407			/*
11408			 * Transfer any jaddrefs to the list to be freed with
11409			 * the bitmap if we're handling a removed file.
11410			 */
11411			if (refhd == NULL) {
11412				wk->wk_state |= COMPLETE;
11413				free_jaddref(jaddref);
11414			} else
11415				WORKLIST_INSERT(refhd, wk);
11416			continue;
11417
11418		default:
11419			panic("handle_bufwait: Unknown type %p(%s)",
11420			    wk, TYPENAME(wk->wk_type));
11421			/* NOTREACHED */
11422		}
11423	}
11424	return (freefile);
11425}
11426/*
11427 * Called from within softdep_disk_write_complete above to restore
11428 * in-memory inode block contents to their most up-to-date state. Note
11429 * that this routine is always called from interrupt level with further
11430 * interrupts from this device blocked.
11431 *
11432 * If the write did not succeed, we will do all the roll-forward
11433 * operations, but we will not take the actions that will allow its
11434 * dependencies to be processed.
11435 */
11436static int
11437handle_written_inodeblock(inodedep, bp, flags)
11438	struct inodedep *inodedep;
11439	struct buf *bp;		/* buffer containing the inode block */
11440	int flags;
11441{
11442	struct freefile *freefile;
11443	struct allocdirect *adp, *nextadp;
11444	struct ufs1_dinode *dp1 = NULL;
11445	struct ufs2_dinode *dp2 = NULL;
11446	struct workhead wkhd;
11447	int hadchanges, fstype;
11448	ino_t freelink;
11449
11450	LIST_INIT(&wkhd);
11451	hadchanges = 0;
11452	freefile = NULL;
11453	if ((inodedep->id_state & IOSTARTED) == 0)
11454		panic("handle_written_inodeblock: not started");
11455	inodedep->id_state &= ~IOSTARTED;
11456	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11457		fstype = UFS1;
11458		dp1 = (struct ufs1_dinode *)bp->b_data +
11459		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11460		freelink = dp1->di_freelink;
11461	} else {
11462		fstype = UFS2;
11463		dp2 = (struct ufs2_dinode *)bp->b_data +
11464		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11465		freelink = dp2->di_freelink;
11466	}
11467	/*
11468	 * Leave this inodeblock dirty until it's in the list.
11469	 */
11470	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
11471	    (flags & WRITESUCCEEDED)) {
11472		struct inodedep *inon;
11473
11474		inon = TAILQ_NEXT(inodedep, id_unlinked);
11475		if ((inon == NULL && freelink == 0) ||
11476		    (inon && inon->id_ino == freelink)) {
11477			if (inon)
11478				inon->id_state |= UNLINKPREV;
11479			inodedep->id_state |= UNLINKNEXT;
11480		}
11481		hadchanges = 1;
11482	}
11483	/*
11484	 * If we had to rollback the inode allocation because of
11485	 * bitmaps being incomplete, then simply restore it.
11486	 * Keep the block dirty so that it will not be reclaimed until
11487	 * all associated dependencies have been cleared and the
11488	 * corresponding updates written to disk.
11489	 */
11490	if (inodedep->id_savedino1 != NULL) {
11491		hadchanges = 1;
11492		if (fstype == UFS1)
11493			*dp1 = *inodedep->id_savedino1;
11494		else
11495			*dp2 = *inodedep->id_savedino2;
11496		free(inodedep->id_savedino1, M_SAVEDINO);
11497		inodedep->id_savedino1 = NULL;
11498		if ((bp->b_flags & B_DELWRI) == 0)
11499			stat_inode_bitmap++;
11500		bdirty(bp);
11501		/*
11502		 * If the inode is clear here and GOINGAWAY it will never
11503		 * be written.  Process the bufwait and clear any pending
11504		 * work which may include the freefile.
11505		 */
11506		if (inodedep->id_state & GOINGAWAY)
11507			goto bufwait;
11508		return (1);
11509	}
11510	if (flags & WRITESUCCEEDED)
11511		inodedep->id_state |= COMPLETE;
11512	/*
11513	 * Roll forward anything that had to be rolled back before
11514	 * the inode could be updated.
11515	 */
11516	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11517		nextadp = TAILQ_NEXT(adp, ad_next);
11518		if (adp->ad_state & ATTACHED)
11519			panic("handle_written_inodeblock: new entry");
11520		if (fstype == UFS1) {
11521			if (adp->ad_offset < UFS_NDADDR) {
11522				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11523					panic("%s %s #%jd mismatch %d != %jd",
11524					    "handle_written_inodeblock:",
11525					    "direct pointer",
11526					    (intmax_t)adp->ad_offset,
11527					    dp1->di_db[adp->ad_offset],
11528					    (intmax_t)adp->ad_oldblkno);
11529				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11530			} else {
11531				if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] !=
11532				    0)
11533					panic("%s: %s #%jd allocated as %d",
11534					    "handle_written_inodeblock",
11535					    "indirect pointer",
11536					    (intmax_t)adp->ad_offset -
11537					    UFS_NDADDR,
11538					    dp1->di_ib[adp->ad_offset -
11539					    UFS_NDADDR]);
11540				dp1->di_ib[adp->ad_offset - UFS_NDADDR] =
11541				    adp->ad_newblkno;
11542			}
11543		} else {
11544			if (adp->ad_offset < UFS_NDADDR) {
11545				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11546					panic("%s: %s #%jd %s %jd != %jd",
11547					    "handle_written_inodeblock",
11548					    "direct pointer",
11549					    (intmax_t)adp->ad_offset, "mismatch",
11550					    (intmax_t)dp2->di_db[adp->ad_offset],
11551					    (intmax_t)adp->ad_oldblkno);
11552				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11553			} else {
11554				if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] !=
11555				    0)
11556					panic("%s: %s #%jd allocated as %jd",
11557					    "handle_written_inodeblock",
11558					    "indirect pointer",
11559					    (intmax_t)adp->ad_offset -
11560					    UFS_NDADDR,
11561					    (intmax_t)
11562					    dp2->di_ib[adp->ad_offset -
11563					    UFS_NDADDR]);
11564				dp2->di_ib[adp->ad_offset - UFS_NDADDR] =
11565				    adp->ad_newblkno;
11566			}
11567		}
11568		adp->ad_state &= ~UNDONE;
11569		adp->ad_state |= ATTACHED;
11570		hadchanges = 1;
11571	}
11572	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11573		nextadp = TAILQ_NEXT(adp, ad_next);
11574		if (adp->ad_state & ATTACHED)
11575			panic("handle_written_inodeblock: new entry");
11576		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11577			panic("%s: direct pointers #%jd %s %jd != %jd",
11578			    "handle_written_inodeblock",
11579			    (intmax_t)adp->ad_offset, "mismatch",
11580			    (intmax_t)dp2->di_extb[adp->ad_offset],
11581			    (intmax_t)adp->ad_oldblkno);
11582		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11583		adp->ad_state &= ~UNDONE;
11584		adp->ad_state |= ATTACHED;
11585		hadchanges = 1;
11586	}
11587	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11588		stat_direct_blk_ptrs++;
11589	/*
11590	 * Reset the file size to its most up-to-date value.
11591	 */
11592	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11593		panic("handle_written_inodeblock: bad size");
11594	if (inodedep->id_savednlink > UFS_LINK_MAX)
11595		panic("handle_written_inodeblock: Invalid link count "
11596		    "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
11597		    inodedep);
11598	if (fstype == UFS1) {
11599		if (dp1->di_nlink != inodedep->id_savednlink) {
11600			dp1->di_nlink = inodedep->id_savednlink;
11601			hadchanges = 1;
11602		}
11603		if (dp1->di_size != inodedep->id_savedsize) {
11604			dp1->di_size = inodedep->id_savedsize;
11605			hadchanges = 1;
11606		}
11607	} else {
11608		if (dp2->di_nlink != inodedep->id_savednlink) {
11609			dp2->di_nlink = inodedep->id_savednlink;
11610			hadchanges = 1;
11611		}
11612		if (dp2->di_size != inodedep->id_savedsize) {
11613			dp2->di_size = inodedep->id_savedsize;
11614			hadchanges = 1;
11615		}
11616		if (dp2->di_extsize != inodedep->id_savedextsize) {
11617			dp2->di_extsize = inodedep->id_savedextsize;
11618			hadchanges = 1;
11619		}
11620	}
11621	inodedep->id_savedsize = -1;
11622	inodedep->id_savedextsize = -1;
11623	inodedep->id_savednlink = -1;
11624	/*
11625	 * If there were any rollbacks in the inode block, then it must be
11626	 * marked dirty so that its will eventually get written back in
11627	 * its correct form.
11628	 */
11629	if (hadchanges)
11630		bdirty(bp);
11631bufwait:
11632	/*
11633	 * If the write did not succeed, we have done all the roll-forward
11634	 * operations, but we cannot take the actions that will allow its
11635	 * dependencies to be processed.
11636	 */
11637	if ((flags & WRITESUCCEEDED) == 0)
11638		return (hadchanges);
11639	/*
11640	 * Process any allocdirects that completed during the update.
11641	 */
11642	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11643		handle_allocdirect_partdone(adp, &wkhd);
11644	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11645		handle_allocdirect_partdone(adp, &wkhd);
11646	/*
11647	 * Process deallocations that were held pending until the
11648	 * inode had been written to disk. Freeing of the inode
11649	 * is delayed until after all blocks have been freed to
11650	 * avoid creation of new <vfsid, inum, lbn> triples
11651	 * before the old ones have been deleted.  Completely
11652	 * unlinked inodes are not processed until the unlinked
11653	 * inode list is written or the last reference is removed.
11654	 */
11655	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11656		freefile = handle_bufwait(inodedep, NULL);
11657		if (freefile && !LIST_EMPTY(&wkhd)) {
11658			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11659			freefile = NULL;
11660		}
11661	}
11662	/*
11663	 * Move rolled forward dependency completions to the bufwait list
11664	 * now that those that were already written have been processed.
11665	 */
11666	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11667		panic("handle_written_inodeblock: bufwait but no changes");
11668	jwork_move(&inodedep->id_bufwait, &wkhd);
11669
11670	if (freefile != NULL) {
11671		/*
11672		 * If the inode is goingaway it was never written.  Fake up
11673		 * the state here so free_inodedep() can succeed.
11674		 */
11675		if (inodedep->id_state & GOINGAWAY)
11676			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11677		if (free_inodedep(inodedep) == 0)
11678			panic("handle_written_inodeblock: live inodedep %p",
11679			    inodedep);
11680		add_to_worklist(&freefile->fx_list, 0);
11681		return (0);
11682	}
11683
11684	/*
11685	 * If no outstanding dependencies, free it.
11686	 */
11687	if (free_inodedep(inodedep) ||
11688	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11689	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11690	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11691	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11692		return (0);
11693	return (hadchanges);
11694}
11695
11696/*
11697 * Perform needed roll-forwards and kick off any dependencies that
11698 * can now be processed.
11699 *
11700 * If the write did not succeed, we will do all the roll-forward
11701 * operations, but we will not take the actions that will allow its
11702 * dependencies to be processed.
11703 */
11704static int
11705handle_written_indirdep(indirdep, bp, bpp, flags)
11706	struct indirdep *indirdep;
11707	struct buf *bp;
11708	struct buf **bpp;
11709	int flags;
11710{
11711	struct allocindir *aip;
11712	struct buf *sbp;
11713	int chgs;
11714
11715	if (indirdep->ir_state & GOINGAWAY)
11716		panic("handle_written_indirdep: indirdep gone");
11717	if ((indirdep->ir_state & IOSTARTED) == 0)
11718		panic("handle_written_indirdep: IO not started");
11719	chgs = 0;
11720	/*
11721	 * If there were rollbacks revert them here.
11722	 */
11723	if (indirdep->ir_saveddata) {
11724		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11725		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11726			free(indirdep->ir_saveddata, M_INDIRDEP);
11727			indirdep->ir_saveddata = NULL;
11728		}
11729		chgs = 1;
11730	}
11731	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11732	indirdep->ir_state |= ATTACHED;
11733	/*
11734	 * If the write did not succeed, we have done all the roll-forward
11735	 * operations, but we cannot take the actions that will allow its
11736	 * dependencies to be processed.
11737	 */
11738	if ((flags & WRITESUCCEEDED) == 0) {
11739		stat_indir_blk_ptrs++;
11740		bdirty(bp);
11741		return (1);
11742	}
11743	/*
11744	 * Move allocindirs with written pointers to the completehd if
11745	 * the indirdep's pointer is not yet written.  Otherwise
11746	 * free them here.
11747	 */
11748	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
11749		LIST_REMOVE(aip, ai_next);
11750		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11751			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11752			    ai_next);
11753			newblk_freefrag(&aip->ai_block);
11754			continue;
11755		}
11756		free_newblk(&aip->ai_block);
11757	}
11758	/*
11759	 * Move allocindirs that have finished dependency processing from
11760	 * the done list to the write list after updating the pointers.
11761	 */
11762	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11763		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
11764			handle_allocindir_partdone(aip);
11765			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11766				panic("disk_write_complete: not gone");
11767			chgs = 1;
11768		}
11769	}
11770	/*
11771	 * Preserve the indirdep if there were any changes or if it is not
11772	 * yet valid on disk.
11773	 */
11774	if (chgs) {
11775		stat_indir_blk_ptrs++;
11776		bdirty(bp);
11777		return (1);
11778	}
11779	/*
11780	 * If there were no changes we can discard the savedbp and detach
11781	 * ourselves from the buf.  We are only carrying completed pointers
11782	 * in this case.
11783	 */
11784	sbp = indirdep->ir_savebp;
11785	sbp->b_flags |= B_INVAL | B_NOCACHE;
11786	indirdep->ir_savebp = NULL;
11787	indirdep->ir_bp = NULL;
11788	if (*bpp != NULL)
11789		panic("handle_written_indirdep: bp already exists.");
11790	*bpp = sbp;
11791	/*
11792	 * The indirdep may not be freed until its parent points at it.
11793	 */
11794	if (indirdep->ir_state & DEPCOMPLETE)
11795		free_indirdep(indirdep);
11796
11797	return (0);
11798}
11799
11800/*
11801 * Process a diradd entry after its dependent inode has been written.
11802 */
11803static void
11804diradd_inode_written(dap, inodedep)
11805	struct diradd *dap;
11806	struct inodedep *inodedep;
11807{
11808
11809	LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp));
11810	dap->da_state |= COMPLETE;
11811	complete_diradd(dap);
11812	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11813}
11814
11815/*
11816 * Returns true if the bmsafemap will have rollbacks when written.  Must only
11817 * be called with the per-filesystem lock and the buf lock on the cg held.
11818 */
11819static int
11820bmsafemap_backgroundwrite(bmsafemap, bp)
11821	struct bmsafemap *bmsafemap;
11822	struct buf *bp;
11823{
11824	int dirty;
11825
11826	LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11827	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11828	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11829	/*
11830	 * If we're initiating a background write we need to process the
11831	 * rollbacks as they exist now, not as they exist when IO starts.
11832	 * No other consumers will look at the contents of the shadowed
11833	 * buf so this is safe to do here.
11834	 */
11835	if (bp->b_xflags & BX_BKGRDMARKER)
11836		initiate_write_bmsafemap(bmsafemap, bp);
11837
11838	return (dirty);
11839}
11840
11841/*
11842 * Re-apply an allocation when a cg write is complete.
11843 */
11844static int
11845jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11846	struct jnewblk *jnewblk;
11847	struct fs *fs;
11848	struct cg *cgp;
11849	uint8_t *blksfree;
11850{
11851	ufs1_daddr_t fragno;
11852	ufs2_daddr_t blkno;
11853	long cgbno, bbase;
11854	int frags, blk;
11855	int i;
11856
11857	frags = 0;
11858	cgbno = dtogd(fs, jnewblk->jn_blkno);
11859	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11860		if (isclr(blksfree, cgbno + i))
11861			panic("jnewblk_rollforward: re-allocated fragment");
11862		frags++;
11863	}
11864	if (frags == fs->fs_frag) {
11865		blkno = fragstoblks(fs, cgbno);
11866		ffs_clrblock(fs, blksfree, (long)blkno);
11867		ffs_clusteracct(fs, cgp, blkno, -1);
11868		cgp->cg_cs.cs_nbfree--;
11869	} else {
11870		bbase = cgbno - fragnum(fs, cgbno);
11871		cgbno += jnewblk->jn_oldfrags;
11872                /* If a complete block had been reassembled, account for it. */
11873		fragno = fragstoblks(fs, bbase);
11874		if (ffs_isblock(fs, blksfree, fragno)) {
11875			cgp->cg_cs.cs_nffree += fs->fs_frag;
11876			ffs_clusteracct(fs, cgp, fragno, -1);
11877			cgp->cg_cs.cs_nbfree--;
11878		}
11879		/* Decrement the old frags.  */
11880		blk = blkmap(fs, blksfree, bbase);
11881		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11882		/* Allocate the fragment */
11883		for (i = 0; i < frags; i++)
11884			clrbit(blksfree, cgbno + i);
11885		cgp->cg_cs.cs_nffree -= frags;
11886		/* Add back in counts associated with the new frags */
11887		blk = blkmap(fs, blksfree, bbase);
11888		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11889	}
11890	return (frags);
11891}
11892
11893/*
11894 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11895 * changes if it's not a background write.  Set all written dependencies
11896 * to DEPCOMPLETE and free the structure if possible.
11897 *
11898 * If the write did not succeed, we will do all the roll-forward
11899 * operations, but we will not take the actions that will allow its
11900 * dependencies to be processed.
11901 */
11902static int
11903handle_written_bmsafemap(bmsafemap, bp, flags)
11904	struct bmsafemap *bmsafemap;
11905	struct buf *bp;
11906	int flags;
11907{
11908	struct newblk *newblk;
11909	struct inodedep *inodedep;
11910	struct jaddref *jaddref, *jatmp;
11911	struct jnewblk *jnewblk, *jntmp;
11912	struct ufsmount *ump;
11913	uint8_t *inosused;
11914	uint8_t *blksfree;
11915	struct cg *cgp;
11916	struct fs *fs;
11917	ino_t ino;
11918	int foreground;
11919	int chgs;
11920
11921	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11922		panic("handle_written_bmsafemap: Not started\n");
11923	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11924	chgs = 0;
11925	bmsafemap->sm_state &= ~IOSTARTED;
11926	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11927	/*
11928	 * If write was successful, release journal work that was waiting
11929	 * on the write. Otherwise move the work back.
11930	 */
11931	if (flags & WRITESUCCEEDED)
11932		handle_jwork(&bmsafemap->sm_freewr);
11933	else
11934		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
11935		    worklist, wk_list);
11936
11937	/*
11938	 * Restore unwritten inode allocation pending jaddref writes.
11939	 */
11940	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11941		cgp = (struct cg *)bp->b_data;
11942		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11943		inosused = cg_inosused(cgp);
11944		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11945		    ja_bmdeps, jatmp) {
11946			if ((jaddref->ja_state & UNDONE) == 0)
11947				continue;
11948			ino = jaddref->ja_ino % fs->fs_ipg;
11949			if (isset(inosused, ino))
11950				panic("handle_written_bmsafemap: "
11951				    "re-allocated inode");
11952			/* Do the roll-forward only if it's a real copy. */
11953			if (foreground) {
11954				if ((jaddref->ja_mode & IFMT) == IFDIR)
11955					cgp->cg_cs.cs_ndir++;
11956				cgp->cg_cs.cs_nifree--;
11957				setbit(inosused, ino);
11958				chgs = 1;
11959			}
11960			jaddref->ja_state &= ~UNDONE;
11961			jaddref->ja_state |= ATTACHED;
11962			free_jaddref(jaddref);
11963		}
11964	}
11965	/*
11966	 * Restore any block allocations which are pending journal writes.
11967	 */
11968	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11969		cgp = (struct cg *)bp->b_data;
11970		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11971		blksfree = cg_blksfree(cgp);
11972		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11973		    jntmp) {
11974			if ((jnewblk->jn_state & UNDONE) == 0)
11975				continue;
11976			/* Do the roll-forward only if it's a real copy. */
11977			if (foreground &&
11978			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11979				chgs = 1;
11980			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11981			jnewblk->jn_state |= ATTACHED;
11982			free_jnewblk(jnewblk);
11983		}
11984	}
11985	/*
11986	 * If the write did not succeed, we have done all the roll-forward
11987	 * operations, but we cannot take the actions that will allow its
11988	 * dependencies to be processed.
11989	 */
11990	if ((flags & WRITESUCCEEDED) == 0) {
11991		LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
11992		    newblk, nb_deps);
11993		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
11994		    worklist, wk_list);
11995		if (foreground)
11996			bdirty(bp);
11997		return (1);
11998	}
11999	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
12000		newblk->nb_state |= DEPCOMPLETE;
12001		newblk->nb_state &= ~ONDEPLIST;
12002		newblk->nb_bmsafemap = NULL;
12003		LIST_REMOVE(newblk, nb_deps);
12004		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
12005			handle_allocdirect_partdone(
12006			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
12007		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
12008			handle_allocindir_partdone(
12009			    WK_ALLOCINDIR(&newblk->nb_list));
12010		else if (newblk->nb_list.wk_type != D_NEWBLK)
12011			panic("handle_written_bmsafemap: Unexpected type: %s",
12012			    TYPENAME(newblk->nb_list.wk_type));
12013	}
12014	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
12015		inodedep->id_state |= DEPCOMPLETE;
12016		inodedep->id_state &= ~ONDEPLIST;
12017		LIST_REMOVE(inodedep, id_deps);
12018		inodedep->id_bmsafemap = NULL;
12019	}
12020	LIST_REMOVE(bmsafemap, sm_next);
12021	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
12022	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
12023	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
12024	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
12025	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
12026		LIST_REMOVE(bmsafemap, sm_hash);
12027		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
12028		return (0);
12029	}
12030	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
12031	if (foreground)
12032		bdirty(bp);
12033	return (1);
12034}
12035
12036/*
12037 * Try to free a mkdir dependency.
12038 */
12039static void
12040complete_mkdir(mkdir)
12041	struct mkdir *mkdir;
12042{
12043	struct diradd *dap;
12044
12045	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
12046		return;
12047	LIST_REMOVE(mkdir, md_mkdirs);
12048	dap = mkdir->md_diradd;
12049	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
12050	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
12051		dap->da_state |= DEPCOMPLETE;
12052		complete_diradd(dap);
12053	}
12054	WORKITEM_FREE(mkdir, D_MKDIR);
12055}
12056
12057/*
12058 * Handle the completion of a mkdir dependency.
12059 */
12060static void
12061handle_written_mkdir(mkdir, type)
12062	struct mkdir *mkdir;
12063	int type;
12064{
12065
12066	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
12067		panic("handle_written_mkdir: bad type");
12068	mkdir->md_state |= COMPLETE;
12069	complete_mkdir(mkdir);
12070}
12071
12072static int
12073free_pagedep(pagedep)
12074	struct pagedep *pagedep;
12075{
12076	int i;
12077
12078	if (pagedep->pd_state & NEWBLOCK)
12079		return (0);
12080	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
12081		return (0);
12082	for (i = 0; i < DAHASHSZ; i++)
12083		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
12084			return (0);
12085	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
12086		return (0);
12087	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
12088		return (0);
12089	if (pagedep->pd_state & ONWORKLIST)
12090		WORKLIST_REMOVE(&pagedep->pd_list);
12091	LIST_REMOVE(pagedep, pd_hash);
12092	WORKITEM_FREE(pagedep, D_PAGEDEP);
12093
12094	return (1);
12095}
12096
12097/*
12098 * Called from within softdep_disk_write_complete above.
12099 * A write operation was just completed. Removed inodes can
12100 * now be freed and associated block pointers may be committed.
12101 * Note that this routine is always called from interrupt level
12102 * with further interrupts from this device blocked.
12103 *
12104 * If the write did not succeed, we will do all the roll-forward
12105 * operations, but we will not take the actions that will allow its
12106 * dependencies to be processed.
12107 */
12108static int
12109handle_written_filepage(pagedep, bp, flags)
12110	struct pagedep *pagedep;
12111	struct buf *bp;		/* buffer containing the written page */
12112	int flags;
12113{
12114	struct dirrem *dirrem;
12115	struct diradd *dap, *nextdap;
12116	struct direct *ep;
12117	int i, chgs;
12118
12119	if ((pagedep->pd_state & IOSTARTED) == 0)
12120		panic("handle_written_filepage: not started");
12121	pagedep->pd_state &= ~IOSTARTED;
12122	if ((flags & WRITESUCCEEDED) == 0)
12123		goto rollforward;
12124	/*
12125	 * Process any directory removals that have been committed.
12126	 */
12127	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
12128		LIST_REMOVE(dirrem, dm_next);
12129		dirrem->dm_state |= COMPLETE;
12130		dirrem->dm_dirinum = pagedep->pd_ino;
12131		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
12132		    ("handle_written_filepage: Journal entries not written."));
12133		add_to_worklist(&dirrem->dm_list, 0);
12134	}
12135	/*
12136	 * Free any directory additions that have been committed.
12137	 * If it is a newly allocated block, we have to wait until
12138	 * the on-disk directory inode claims the new block.
12139	 */
12140	if ((pagedep->pd_state & NEWBLOCK) == 0)
12141		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
12142			free_diradd(dap, NULL);
12143rollforward:
12144	/*
12145	 * Uncommitted directory entries must be restored.
12146	 */
12147	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
12148		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
12149		     dap = nextdap) {
12150			nextdap = LIST_NEXT(dap, da_pdlist);
12151			if (dap->da_state & ATTACHED)
12152				panic("handle_written_filepage: attached");
12153			ep = (struct direct *)
12154			    ((char *)bp->b_data + dap->da_offset);
12155			ep->d_ino = dap->da_newinum;
12156			dap->da_state &= ~UNDONE;
12157			dap->da_state |= ATTACHED;
12158			chgs = 1;
12159			/*
12160			 * If the inode referenced by the directory has
12161			 * been written out, then the dependency can be
12162			 * moved to the pending list.
12163			 */
12164			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
12165				LIST_REMOVE(dap, da_pdlist);
12166				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
12167				    da_pdlist);
12168			}
12169		}
12170	}
12171	/*
12172	 * If there were any rollbacks in the directory, then it must be
12173	 * marked dirty so that its will eventually get written back in
12174	 * its correct form.
12175	 */
12176	if (chgs || (flags & WRITESUCCEEDED) == 0) {
12177		if ((bp->b_flags & B_DELWRI) == 0)
12178			stat_dir_entry++;
12179		bdirty(bp);
12180		return (1);
12181	}
12182	/*
12183	 * If we are not waiting for a new directory block to be
12184	 * claimed by its inode, then the pagedep will be freed.
12185	 * Otherwise it will remain to track any new entries on
12186	 * the page in case they are fsync'ed.
12187	 */
12188	free_pagedep(pagedep);
12189	return (0);
12190}
12191
12192/*
12193 * Writing back in-core inode structures.
12194 *
12195 * The filesystem only accesses an inode's contents when it occupies an
12196 * "in-core" inode structure.  These "in-core" structures are separate from
12197 * the page frames used to cache inode blocks.  Only the latter are
12198 * transferred to/from the disk.  So, when the updated contents of the
12199 * "in-core" inode structure are copied to the corresponding in-memory inode
12200 * block, the dependencies are also transferred.  The following procedure is
12201 * called when copying a dirty "in-core" inode to a cached inode block.
12202 */
12203
12204/*
12205 * Called when an inode is loaded from disk. If the effective link count
12206 * differed from the actual link count when it was last flushed, then we
12207 * need to ensure that the correct effective link count is put back.
12208 */
12209void
12210softdep_load_inodeblock(ip)
12211	struct inode *ip;	/* the "in_core" copy of the inode */
12212{
12213	struct inodedep *inodedep;
12214	struct ufsmount *ump;
12215
12216	ump = ITOUMP(ip);
12217	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
12218	    ("softdep_load_inodeblock called on non-softdep filesystem"));
12219	/*
12220	 * Check for alternate nlink count.
12221	 */
12222	ip->i_effnlink = ip->i_nlink;
12223	ACQUIRE_LOCK(ump);
12224	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
12225		FREE_LOCK(ump);
12226		return;
12227	}
12228	ip->i_effnlink -= inodedep->id_nlinkdelta;
12229	FREE_LOCK(ump);
12230}
12231
12232/*
12233 * This routine is called just before the "in-core" inode
12234 * information is to be copied to the in-memory inode block.
12235 * Recall that an inode block contains several inodes. If
12236 * the force flag is set, then the dependencies will be
12237 * cleared so that the update can always be made. Note that
12238 * the buffer is locked when this routine is called, so we
12239 * will never be in the middle of writing the inode block
12240 * to disk.
12241 */
12242void
12243softdep_update_inodeblock(ip, bp, waitfor)
12244	struct inode *ip;	/* the "in_core" copy of the inode */
12245	struct buf *bp;		/* the buffer containing the inode block */
12246	int waitfor;		/* nonzero => update must be allowed */
12247{
12248	struct inodedep *inodedep;
12249	struct inoref *inoref;
12250	struct ufsmount *ump;
12251	struct worklist *wk;
12252	struct mount *mp;
12253	struct buf *ibp;
12254	struct fs *fs;
12255	int error;
12256
12257	ump = ITOUMP(ip);
12258	mp = UFSTOVFS(ump);
12259	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12260	    ("softdep_update_inodeblock called on non-softdep filesystem"));
12261	fs = ump->um_fs;
12262	/*
12263	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12264	 * does not have access to the in-core ip so must write directly into
12265	 * the inode block buffer when setting freelink.
12266	 */
12267	if (fs->fs_magic == FS_UFS1_MAGIC)
12268		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12269		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12270	else
12271		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12272		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12273	/*
12274	 * If the effective link count is not equal to the actual link
12275	 * count, then we must track the difference in an inodedep while
12276	 * the inode is (potentially) tossed out of the cache. Otherwise,
12277	 * if there is no existing inodedep, then there are no dependencies
12278	 * to track.
12279	 */
12280	ACQUIRE_LOCK(ump);
12281again:
12282	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12283		FREE_LOCK(ump);
12284		if (ip->i_effnlink != ip->i_nlink)
12285			panic("softdep_update_inodeblock: bad link count");
12286		return;
12287	}
12288	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12289		panic("softdep_update_inodeblock: bad delta");
12290	/*
12291	 * If we're flushing all dependencies we must also move any waiting
12292	 * for journal writes onto the bufwait list prior to I/O.
12293	 */
12294	if (waitfor) {
12295		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12296			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12297			    == DEPCOMPLETE) {
12298				jwait(&inoref->if_list, MNT_WAIT);
12299				goto again;
12300			}
12301		}
12302	}
12303	/*
12304	 * Changes have been initiated. Anything depending on these
12305	 * changes cannot occur until this inode has been written.
12306	 */
12307	inodedep->id_state &= ~COMPLETE;
12308	if ((inodedep->id_state & ONWORKLIST) == 0)
12309		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12310	/*
12311	 * Any new dependencies associated with the incore inode must
12312	 * now be moved to the list associated with the buffer holding
12313	 * the in-memory copy of the inode. Once merged process any
12314	 * allocdirects that are completed by the merger.
12315	 */
12316	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12317	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12318		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12319		    NULL);
12320	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12321	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12322		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12323		    NULL);
12324	/*
12325	 * Now that the inode has been pushed into the buffer, the
12326	 * operations dependent on the inode being written to disk
12327	 * can be moved to the id_bufwait so that they will be
12328	 * processed when the buffer I/O completes.
12329	 */
12330	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12331		WORKLIST_REMOVE(wk);
12332		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12333	}
12334	/*
12335	 * Newly allocated inodes cannot be written until the bitmap
12336	 * that allocates them have been written (indicated by
12337	 * DEPCOMPLETE being set in id_state). If we are doing a
12338	 * forced sync (e.g., an fsync on a file), we force the bitmap
12339	 * to be written so that the update can be done.
12340	 */
12341	if (waitfor == 0) {
12342		FREE_LOCK(ump);
12343		return;
12344	}
12345retry:
12346	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12347		FREE_LOCK(ump);
12348		return;
12349	}
12350	ibp = inodedep->id_bmsafemap->sm_buf;
12351	ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12352	if (ibp == NULL) {
12353		/*
12354		 * If ibp came back as NULL, the dependency could have been
12355		 * freed while we slept.  Look it up again, and check to see
12356		 * that it has completed.
12357		 */
12358		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12359			goto retry;
12360		FREE_LOCK(ump);
12361		return;
12362	}
12363	FREE_LOCK(ump);
12364	if ((error = bwrite(ibp)) != 0)
12365		softdep_error("softdep_update_inodeblock: bwrite", error);
12366}
12367
12368/*
12369 * Merge the a new inode dependency list (such as id_newinoupdt) into an
12370 * old inode dependency list (such as id_inoupdt).
12371 */
12372static void
12373merge_inode_lists(newlisthead, oldlisthead)
12374	struct allocdirectlst *newlisthead;
12375	struct allocdirectlst *oldlisthead;
12376{
12377	struct allocdirect *listadp, *newadp;
12378
12379	newadp = TAILQ_FIRST(newlisthead);
12380	if (newadp != NULL)
12381		LOCK_OWNED(VFSTOUFS(newadp->ad_block.nb_list.wk_mp));
12382	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12383		if (listadp->ad_offset < newadp->ad_offset) {
12384			listadp = TAILQ_NEXT(listadp, ad_next);
12385			continue;
12386		}
12387		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12388		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12389		if (listadp->ad_offset == newadp->ad_offset) {
12390			allocdirect_merge(oldlisthead, newadp,
12391			    listadp);
12392			listadp = newadp;
12393		}
12394		newadp = TAILQ_FIRST(newlisthead);
12395	}
12396	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12397		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12398		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12399	}
12400}
12401
12402/*
12403 * If we are doing an fsync, then we must ensure that any directory
12404 * entries for the inode have been written after the inode gets to disk.
12405 */
12406int
12407softdep_fsync(vp)
12408	struct vnode *vp;	/* the "in_core" copy of the inode */
12409{
12410	struct inodedep *inodedep;
12411	struct pagedep *pagedep;
12412	struct inoref *inoref;
12413	struct ufsmount *ump;
12414	struct worklist *wk;
12415	struct diradd *dap;
12416	struct mount *mp;
12417	struct vnode *pvp;
12418	struct inode *ip;
12419	struct buf *bp;
12420	struct fs *fs;
12421	struct thread *td = curthread;
12422	int error, flushparent, pagedep_new_block;
12423	ino_t parentino;
12424	ufs_lbn_t lbn;
12425
12426	ip = VTOI(vp);
12427	mp = vp->v_mount;
12428	ump = VFSTOUFS(mp);
12429	fs = ump->um_fs;
12430	if (MOUNTEDSOFTDEP(mp) == 0)
12431		return (0);
12432	ACQUIRE_LOCK(ump);
12433restart:
12434	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12435		FREE_LOCK(ump);
12436		return (0);
12437	}
12438	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12439		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12440		    == DEPCOMPLETE) {
12441			jwait(&inoref->if_list, MNT_WAIT);
12442			goto restart;
12443		}
12444	}
12445	if (!LIST_EMPTY(&inodedep->id_inowait) ||
12446	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12447	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12448	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12449	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12450		panic("softdep_fsync: pending ops %p", inodedep);
12451	for (error = 0, flushparent = 0; ; ) {
12452		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12453			break;
12454		if (wk->wk_type != D_DIRADD)
12455			panic("softdep_fsync: Unexpected type %s",
12456			    TYPENAME(wk->wk_type));
12457		dap = WK_DIRADD(wk);
12458		/*
12459		 * Flush our parent if this directory entry has a MKDIR_PARENT
12460		 * dependency or is contained in a newly allocated block.
12461		 */
12462		if (dap->da_state & DIRCHG)
12463			pagedep = dap->da_previous->dm_pagedep;
12464		else
12465			pagedep = dap->da_pagedep;
12466		parentino = pagedep->pd_ino;
12467		lbn = pagedep->pd_lbn;
12468		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12469			panic("softdep_fsync: dirty");
12470		if ((dap->da_state & MKDIR_PARENT) ||
12471		    (pagedep->pd_state & NEWBLOCK))
12472			flushparent = 1;
12473		else
12474			flushparent = 0;
12475		/*
12476		 * If we are being fsync'ed as part of vgone'ing this vnode,
12477		 * then we will not be able to release and recover the
12478		 * vnode below, so we just have to give up on writing its
12479		 * directory entry out. It will eventually be written, just
12480		 * not now, but then the user was not asking to have it
12481		 * written, so we are not breaking any promises.
12482		 */
12483		if (vp->v_iflag & VI_DOOMED)
12484			break;
12485		/*
12486		 * We prevent deadlock by always fetching inodes from the
12487		 * root, moving down the directory tree. Thus, when fetching
12488		 * our parent directory, we first try to get the lock. If
12489		 * that fails, we must unlock ourselves before requesting
12490		 * the lock on our parent. See the comment in ufs_lookup
12491		 * for details on possible races.
12492		 */
12493		FREE_LOCK(ump);
12494		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12495		    FFSV_FORCEINSMQ)) {
12496			/*
12497			 * Unmount cannot proceed after unlock because
12498			 * caller must have called vn_start_write().
12499			 */
12500			VOP_UNLOCK(vp, 0);
12501			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12502			    &pvp, FFSV_FORCEINSMQ);
12503			MPASS(VTOI(pvp)->i_mode != 0);
12504			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12505			if (vp->v_iflag & VI_DOOMED) {
12506				if (error == 0)
12507					vput(pvp);
12508				error = ENOENT;
12509			}
12510			if (error != 0)
12511				return (error);
12512		}
12513		/*
12514		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12515		 * that are contained in direct blocks will be resolved by
12516		 * doing a ffs_update. Pagedeps contained in indirect blocks
12517		 * may require a complete sync'ing of the directory. So, we
12518		 * try the cheap and fast ffs_update first, and if that fails,
12519		 * then we do the slower ffs_syncvnode of the directory.
12520		 */
12521		if (flushparent) {
12522			int locked;
12523
12524			if ((error = ffs_update(pvp, 1)) != 0) {
12525				vput(pvp);
12526				return (error);
12527			}
12528			ACQUIRE_LOCK(ump);
12529			locked = 1;
12530			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12531				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12532					if (wk->wk_type != D_DIRADD)
12533						panic("softdep_fsync: Unexpected type %s",
12534						      TYPENAME(wk->wk_type));
12535					dap = WK_DIRADD(wk);
12536					if (dap->da_state & DIRCHG)
12537						pagedep = dap->da_previous->dm_pagedep;
12538					else
12539						pagedep = dap->da_pagedep;
12540					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12541					FREE_LOCK(ump);
12542					locked = 0;
12543					if (pagedep_new_block && (error =
12544					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12545						vput(pvp);
12546						return (error);
12547					}
12548				}
12549			}
12550			if (locked)
12551				FREE_LOCK(ump);
12552		}
12553		/*
12554		 * Flush directory page containing the inode's name.
12555		 */
12556		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12557		    &bp);
12558		if (error == 0)
12559			error = bwrite(bp);
12560		else
12561			brelse(bp);
12562		vput(pvp);
12563		if (error != 0)
12564			return (error);
12565		ACQUIRE_LOCK(ump);
12566		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12567			break;
12568	}
12569	FREE_LOCK(ump);
12570	return (0);
12571}
12572
12573/*
12574 * Flush all the dirty bitmaps associated with the block device
12575 * before flushing the rest of the dirty blocks so as to reduce
12576 * the number of dependencies that will have to be rolled back.
12577 *
12578 * XXX Unused?
12579 */
12580void
12581softdep_fsync_mountdev(vp)
12582	struct vnode *vp;
12583{
12584	struct buf *bp, *nbp;
12585	struct worklist *wk;
12586	struct bufobj *bo;
12587
12588	if (!vn_isdisk(vp, NULL))
12589		panic("softdep_fsync_mountdev: vnode not a disk");
12590	bo = &vp->v_bufobj;
12591restart:
12592	BO_LOCK(bo);
12593	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12594		/*
12595		 * If it is already scheduled, skip to the next buffer.
12596		 */
12597		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12598			continue;
12599
12600		if ((bp->b_flags & B_DELWRI) == 0)
12601			panic("softdep_fsync_mountdev: not dirty");
12602		/*
12603		 * We are only interested in bitmaps with outstanding
12604		 * dependencies.
12605		 */
12606		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12607		    wk->wk_type != D_BMSAFEMAP ||
12608		    (bp->b_vflags & BV_BKGRDINPROG)) {
12609			BUF_UNLOCK(bp);
12610			continue;
12611		}
12612		BO_UNLOCK(bo);
12613		bremfree(bp);
12614		(void) bawrite(bp);
12615		goto restart;
12616	}
12617	drain_output(vp);
12618	BO_UNLOCK(bo);
12619}
12620
12621/*
12622 * Sync all cylinder groups that were dirty at the time this function is
12623 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12624 * is used to flush freedep activity that may be holding up writes to a
12625 * indirect block.
12626 */
12627static int
12628sync_cgs(mp, waitfor)
12629	struct mount *mp;
12630	int waitfor;
12631{
12632	struct bmsafemap *bmsafemap;
12633	struct bmsafemap *sentinel;
12634	struct ufsmount *ump;
12635	struct buf *bp;
12636	int error;
12637
12638	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12639	sentinel->sm_cg = -1;
12640	ump = VFSTOUFS(mp);
12641	error = 0;
12642	ACQUIRE_LOCK(ump);
12643	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12644	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12645	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12646		/* Skip sentinels and cgs with no work to release. */
12647		if (bmsafemap->sm_cg == -1 ||
12648		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12649		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12650			LIST_REMOVE(sentinel, sm_next);
12651			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12652			continue;
12653		}
12654		/*
12655		 * If we don't get the lock and we're waiting try again, if
12656		 * not move on to the next buf and try to sync it.
12657		 */
12658		bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12659		if (bp == NULL && waitfor == MNT_WAIT)
12660			continue;
12661		LIST_REMOVE(sentinel, sm_next);
12662		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12663		if (bp == NULL)
12664			continue;
12665		FREE_LOCK(ump);
12666		if (waitfor == MNT_NOWAIT)
12667			bawrite(bp);
12668		else
12669			error = bwrite(bp);
12670		ACQUIRE_LOCK(ump);
12671		if (error)
12672			break;
12673	}
12674	LIST_REMOVE(sentinel, sm_next);
12675	FREE_LOCK(ump);
12676	free(sentinel, M_BMSAFEMAP);
12677	return (error);
12678}
12679
12680/*
12681 * This routine is called when we are trying to synchronously flush a
12682 * file. This routine must eliminate any filesystem metadata dependencies
12683 * so that the syncing routine can succeed.
12684 */
12685int
12686softdep_sync_metadata(struct vnode *vp)
12687{
12688	struct inode *ip;
12689	int error;
12690
12691	ip = VTOI(vp);
12692	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12693	    ("softdep_sync_metadata called on non-softdep filesystem"));
12694	/*
12695	 * Ensure that any direct block dependencies have been cleared,
12696	 * truncations are started, and inode references are journaled.
12697	 */
12698	ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
12699	/*
12700	 * Write all journal records to prevent rollbacks on devvp.
12701	 */
12702	if (vp->v_type == VCHR)
12703		softdep_flushjournal(vp->v_mount);
12704	error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12705	/*
12706	 * Ensure that all truncates are written so we won't find deps on
12707	 * indirect blocks.
12708	 */
12709	process_truncates(vp);
12710	FREE_LOCK(VFSTOUFS(vp->v_mount));
12711
12712	return (error);
12713}
12714
12715/*
12716 * This routine is called when we are attempting to sync a buf with
12717 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12718 * other IO it can but returns EBUSY if the buffer is not yet able to
12719 * be written.  Dependencies which will not cause rollbacks will always
12720 * return 0.
12721 */
12722int
12723softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12724{
12725	struct indirdep *indirdep;
12726	struct pagedep *pagedep;
12727	struct allocindir *aip;
12728	struct newblk *newblk;
12729	struct ufsmount *ump;
12730	struct buf *nbp;
12731	struct worklist *wk;
12732	int i, error;
12733
12734	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12735	    ("softdep_sync_buf called on non-softdep filesystem"));
12736	/*
12737	 * For VCHR we just don't want to force flush any dependencies that
12738	 * will cause rollbacks.
12739	 */
12740	if (vp->v_type == VCHR) {
12741		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12742			return (EBUSY);
12743		return (0);
12744	}
12745	ump = VFSTOUFS(vp->v_mount);
12746	ACQUIRE_LOCK(ump);
12747	/*
12748	 * As we hold the buffer locked, none of its dependencies
12749	 * will disappear.
12750	 */
12751	error = 0;
12752top:
12753	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12754		switch (wk->wk_type) {
12755
12756		case D_ALLOCDIRECT:
12757		case D_ALLOCINDIR:
12758			newblk = WK_NEWBLK(wk);
12759			if (newblk->nb_jnewblk != NULL) {
12760				if (waitfor == MNT_NOWAIT) {
12761					error = EBUSY;
12762					goto out_unlock;
12763				}
12764				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12765				goto top;
12766			}
12767			if (newblk->nb_state & DEPCOMPLETE ||
12768			    waitfor == MNT_NOWAIT)
12769				continue;
12770			nbp = newblk->nb_bmsafemap->sm_buf;
12771			nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12772			if (nbp == NULL)
12773				goto top;
12774			FREE_LOCK(ump);
12775			if ((error = bwrite(nbp)) != 0)
12776				goto out;
12777			ACQUIRE_LOCK(ump);
12778			continue;
12779
12780		case D_INDIRDEP:
12781			indirdep = WK_INDIRDEP(wk);
12782			if (waitfor == MNT_NOWAIT) {
12783				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12784				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12785					error = EBUSY;
12786					goto out_unlock;
12787				}
12788			}
12789			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12790				panic("softdep_sync_buf: truncation pending.");
12791		restart:
12792			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12793				newblk = (struct newblk *)aip;
12794				if (newblk->nb_jnewblk != NULL) {
12795					jwait(&newblk->nb_jnewblk->jn_list,
12796					    waitfor);
12797					goto restart;
12798				}
12799				if (newblk->nb_state & DEPCOMPLETE)
12800					continue;
12801				nbp = newblk->nb_bmsafemap->sm_buf;
12802				nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12803				if (nbp == NULL)
12804					goto restart;
12805				FREE_LOCK(ump);
12806				if ((error = bwrite(nbp)) != 0)
12807					goto out;
12808				ACQUIRE_LOCK(ump);
12809				goto restart;
12810			}
12811			continue;
12812
12813		case D_PAGEDEP:
12814			/*
12815			 * Only flush directory entries in synchronous passes.
12816			 */
12817			if (waitfor != MNT_WAIT) {
12818				error = EBUSY;
12819				goto out_unlock;
12820			}
12821			/*
12822			 * While syncing snapshots, we must allow recursive
12823			 * lookups.
12824			 */
12825			BUF_AREC(bp);
12826			/*
12827			 * We are trying to sync a directory that may
12828			 * have dependencies on both its own metadata
12829			 * and/or dependencies on the inodes of any
12830			 * recently allocated files. We walk its diradd
12831			 * lists pushing out the associated inode.
12832			 */
12833			pagedep = WK_PAGEDEP(wk);
12834			for (i = 0; i < DAHASHSZ; i++) {
12835				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12836					continue;
12837				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12838				    &pagedep->pd_diraddhd[i]))) {
12839					BUF_NOREC(bp);
12840					goto out_unlock;
12841				}
12842			}
12843			BUF_NOREC(bp);
12844			continue;
12845
12846		case D_FREEWORK:
12847		case D_FREEDEP:
12848		case D_JSEGDEP:
12849		case D_JNEWBLK:
12850			continue;
12851
12852		default:
12853			panic("softdep_sync_buf: Unknown type %s",
12854			    TYPENAME(wk->wk_type));
12855			/* NOTREACHED */
12856		}
12857	}
12858out_unlock:
12859	FREE_LOCK(ump);
12860out:
12861	return (error);
12862}
12863
12864/*
12865 * Flush the dependencies associated with an inodedep.
12866 */
12867static int
12868flush_inodedep_deps(vp, mp, ino)
12869	struct vnode *vp;
12870	struct mount *mp;
12871	ino_t ino;
12872{
12873	struct inodedep *inodedep;
12874	struct inoref *inoref;
12875	struct ufsmount *ump;
12876	int error, waitfor;
12877
12878	/*
12879	 * This work is done in two passes. The first pass grabs most
12880	 * of the buffers and begins asynchronously writing them. The
12881	 * only way to wait for these asynchronous writes is to sleep
12882	 * on the filesystem vnode which may stay busy for a long time
12883	 * if the filesystem is active. So, instead, we make a second
12884	 * pass over the dependencies blocking on each write. In the
12885	 * usual case we will be blocking against a write that we
12886	 * initiated, so when it is done the dependency will have been
12887	 * resolved. Thus the second pass is expected to end quickly.
12888	 * We give a brief window at the top of the loop to allow
12889	 * any pending I/O to complete.
12890	 */
12891	ump = VFSTOUFS(mp);
12892	LOCK_OWNED(ump);
12893	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12894		if (error)
12895			return (error);
12896		FREE_LOCK(ump);
12897		ACQUIRE_LOCK(ump);
12898restart:
12899		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12900			return (0);
12901		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12902			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12903			    == DEPCOMPLETE) {
12904				jwait(&inoref->if_list, MNT_WAIT);
12905				goto restart;
12906			}
12907		}
12908		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12909		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12910		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12911		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12912			continue;
12913		/*
12914		 * If pass2, we are done, otherwise do pass 2.
12915		 */
12916		if (waitfor == MNT_WAIT)
12917			break;
12918		waitfor = MNT_WAIT;
12919	}
12920	/*
12921	 * Try freeing inodedep in case all dependencies have been removed.
12922	 */
12923	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12924		(void) free_inodedep(inodedep);
12925	return (0);
12926}
12927
12928/*
12929 * Flush an inode dependency list.
12930 */
12931static int
12932flush_deplist(listhead, waitfor, errorp)
12933	struct allocdirectlst *listhead;
12934	int waitfor;
12935	int *errorp;
12936{
12937	struct allocdirect *adp;
12938	struct newblk *newblk;
12939	struct ufsmount *ump;
12940	struct buf *bp;
12941
12942	if ((adp = TAILQ_FIRST(listhead)) == NULL)
12943		return (0);
12944	ump = VFSTOUFS(adp->ad_list.wk_mp);
12945	LOCK_OWNED(ump);
12946	TAILQ_FOREACH(adp, listhead, ad_next) {
12947		newblk = (struct newblk *)adp;
12948		if (newblk->nb_jnewblk != NULL) {
12949			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12950			return (1);
12951		}
12952		if (newblk->nb_state & DEPCOMPLETE)
12953			continue;
12954		bp = newblk->nb_bmsafemap->sm_buf;
12955		bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12956		if (bp == NULL) {
12957			if (waitfor == MNT_NOWAIT)
12958				continue;
12959			return (1);
12960		}
12961		FREE_LOCK(ump);
12962		if (waitfor == MNT_NOWAIT)
12963			bawrite(bp);
12964		else
12965			*errorp = bwrite(bp);
12966		ACQUIRE_LOCK(ump);
12967		return (1);
12968	}
12969	return (0);
12970}
12971
12972/*
12973 * Flush dependencies associated with an allocdirect block.
12974 */
12975static int
12976flush_newblk_dep(vp, mp, lbn)
12977	struct vnode *vp;
12978	struct mount *mp;
12979	ufs_lbn_t lbn;
12980{
12981	struct newblk *newblk;
12982	struct ufsmount *ump;
12983	struct bufobj *bo;
12984	struct inode *ip;
12985	struct buf *bp;
12986	ufs2_daddr_t blkno;
12987	int error;
12988
12989	error = 0;
12990	bo = &vp->v_bufobj;
12991	ip = VTOI(vp);
12992	blkno = DIP(ip, i_db[lbn]);
12993	if (blkno == 0)
12994		panic("flush_newblk_dep: Missing block");
12995	ump = VFSTOUFS(mp);
12996	ACQUIRE_LOCK(ump);
12997	/*
12998	 * Loop until all dependencies related to this block are satisfied.
12999	 * We must be careful to restart after each sleep in case a write
13000	 * completes some part of this process for us.
13001	 */
13002	for (;;) {
13003		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
13004			FREE_LOCK(ump);
13005			break;
13006		}
13007		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
13008			panic("flush_newblk_dep: Bad newblk %p", newblk);
13009		/*
13010		 * Flush the journal.
13011		 */
13012		if (newblk->nb_jnewblk != NULL) {
13013			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
13014			continue;
13015		}
13016		/*
13017		 * Write the bitmap dependency.
13018		 */
13019		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
13020			bp = newblk->nb_bmsafemap->sm_buf;
13021			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13022			if (bp == NULL)
13023				continue;
13024			FREE_LOCK(ump);
13025			error = bwrite(bp);
13026			if (error)
13027				break;
13028			ACQUIRE_LOCK(ump);
13029			continue;
13030		}
13031		/*
13032		 * Write the buffer.
13033		 */
13034		FREE_LOCK(ump);
13035		BO_LOCK(bo);
13036		bp = gbincore(bo, lbn);
13037		if (bp != NULL) {
13038			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
13039			    LK_INTERLOCK, BO_LOCKPTR(bo));
13040			if (error == ENOLCK) {
13041				ACQUIRE_LOCK(ump);
13042				error = 0;
13043				continue; /* Slept, retry */
13044			}
13045			if (error != 0)
13046				break;	/* Failed */
13047			if (bp->b_flags & B_DELWRI) {
13048				bremfree(bp);
13049				error = bwrite(bp);
13050				if (error)
13051					break;
13052			} else
13053				BUF_UNLOCK(bp);
13054		} else
13055			BO_UNLOCK(bo);
13056		/*
13057		 * We have to wait for the direct pointers to
13058		 * point at the newdirblk before the dependency
13059		 * will go away.
13060		 */
13061		error = ffs_update(vp, 1);
13062		if (error)
13063			break;
13064		ACQUIRE_LOCK(ump);
13065	}
13066	return (error);
13067}
13068
13069/*
13070 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
13071 */
13072static int
13073flush_pagedep_deps(pvp, mp, diraddhdp)
13074	struct vnode *pvp;
13075	struct mount *mp;
13076	struct diraddhd *diraddhdp;
13077{
13078	struct inodedep *inodedep;
13079	struct inoref *inoref;
13080	struct ufsmount *ump;
13081	struct diradd *dap;
13082	struct vnode *vp;
13083	int error = 0;
13084	struct buf *bp;
13085	ino_t inum;
13086	struct diraddhd unfinished;
13087
13088	LIST_INIT(&unfinished);
13089	ump = VFSTOUFS(mp);
13090	LOCK_OWNED(ump);
13091restart:
13092	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
13093		/*
13094		 * Flush ourselves if this directory entry
13095		 * has a MKDIR_PARENT dependency.
13096		 */
13097		if (dap->da_state & MKDIR_PARENT) {
13098			FREE_LOCK(ump);
13099			if ((error = ffs_update(pvp, 1)) != 0)
13100				break;
13101			ACQUIRE_LOCK(ump);
13102			/*
13103			 * If that cleared dependencies, go on to next.
13104			 */
13105			if (dap != LIST_FIRST(diraddhdp))
13106				continue;
13107			/*
13108			 * All MKDIR_PARENT dependencies and all the
13109			 * NEWBLOCK pagedeps that are contained in direct
13110			 * blocks were resolved by doing above ffs_update.
13111			 * Pagedeps contained in indirect blocks may
13112			 * require a complete sync'ing of the directory.
13113			 * We are in the midst of doing a complete sync,
13114			 * so if they are not resolved in this pass we
13115			 * defer them for now as they will be sync'ed by
13116			 * our caller shortly.
13117			 */
13118			LIST_REMOVE(dap, da_pdlist);
13119			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
13120			continue;
13121		}
13122		/*
13123		 * A newly allocated directory must have its "." and
13124		 * ".." entries written out before its name can be
13125		 * committed in its parent.
13126		 */
13127		inum = dap->da_newinum;
13128		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13129			panic("flush_pagedep_deps: lost inode1");
13130		/*
13131		 * Wait for any pending journal adds to complete so we don't
13132		 * cause rollbacks while syncing.
13133		 */
13134		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13135			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13136			    == DEPCOMPLETE) {
13137				jwait(&inoref->if_list, MNT_WAIT);
13138				goto restart;
13139			}
13140		}
13141		if (dap->da_state & MKDIR_BODY) {
13142			FREE_LOCK(ump);
13143			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13144			    FFSV_FORCEINSMQ)))
13145				break;
13146			MPASS(VTOI(vp)->i_mode != 0);
13147			error = flush_newblk_dep(vp, mp, 0);
13148			/*
13149			 * If we still have the dependency we might need to
13150			 * update the vnode to sync the new link count to
13151			 * disk.
13152			 */
13153			if (error == 0 && dap == LIST_FIRST(diraddhdp))
13154				error = ffs_update(vp, 1);
13155			vput(vp);
13156			if (error != 0)
13157				break;
13158			ACQUIRE_LOCK(ump);
13159			/*
13160			 * If that cleared dependencies, go on to next.
13161			 */
13162			if (dap != LIST_FIRST(diraddhdp))
13163				continue;
13164			if (dap->da_state & MKDIR_BODY) {
13165				inodedep_lookup(UFSTOVFS(ump), inum, 0,
13166				    &inodedep);
13167				panic("flush_pagedep_deps: MKDIR_BODY "
13168				    "inodedep %p dap %p vp %p",
13169				    inodedep, dap, vp);
13170			}
13171		}
13172		/*
13173		 * Flush the inode on which the directory entry depends.
13174		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
13175		 * the only remaining dependency is that the updated inode
13176		 * count must get pushed to disk. The inode has already
13177		 * been pushed into its inode buffer (via VOP_UPDATE) at
13178		 * the time of the reference count change. So we need only
13179		 * locate that buffer, ensure that there will be no rollback
13180		 * caused by a bitmap dependency, then write the inode buffer.
13181		 */
13182retry:
13183		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13184			panic("flush_pagedep_deps: lost inode");
13185		/*
13186		 * If the inode still has bitmap dependencies,
13187		 * push them to disk.
13188		 */
13189		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
13190			bp = inodedep->id_bmsafemap->sm_buf;
13191			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13192			if (bp == NULL)
13193				goto retry;
13194			FREE_LOCK(ump);
13195			if ((error = bwrite(bp)) != 0)
13196				break;
13197			ACQUIRE_LOCK(ump);
13198			if (dap != LIST_FIRST(diraddhdp))
13199				continue;
13200		}
13201		/*
13202		 * If the inode is still sitting in a buffer waiting
13203		 * to be written or waiting for the link count to be
13204		 * adjusted update it here to flush it to disk.
13205		 */
13206		if (dap == LIST_FIRST(diraddhdp)) {
13207			FREE_LOCK(ump);
13208			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13209			    FFSV_FORCEINSMQ)))
13210				break;
13211			MPASS(VTOI(vp)->i_mode != 0);
13212			error = ffs_update(vp, 1);
13213			vput(vp);
13214			if (error)
13215				break;
13216			ACQUIRE_LOCK(ump);
13217		}
13218		/*
13219		 * If we have failed to get rid of all the dependencies
13220		 * then something is seriously wrong.
13221		 */
13222		if (dap == LIST_FIRST(diraddhdp)) {
13223			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13224			panic("flush_pagedep_deps: failed to flush "
13225			    "inodedep %p ino %ju dap %p",
13226			    inodedep, (uintmax_t)inum, dap);
13227		}
13228	}
13229	if (error)
13230		ACQUIRE_LOCK(ump);
13231	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13232		LIST_REMOVE(dap, da_pdlist);
13233		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13234	}
13235	return (error);
13236}
13237
13238/*
13239 * A large burst of file addition or deletion activity can drive the
13240 * memory load excessively high. First attempt to slow things down
13241 * using the techniques below. If that fails, this routine requests
13242 * the offending operations to fall back to running synchronously
13243 * until the memory load returns to a reasonable level.
13244 */
13245int
13246softdep_slowdown(vp)
13247	struct vnode *vp;
13248{
13249	struct ufsmount *ump;
13250	int jlow;
13251	int max_softdeps_hard;
13252
13253	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13254	    ("softdep_slowdown called on non-softdep filesystem"));
13255	ump = VFSTOUFS(vp->v_mount);
13256	ACQUIRE_LOCK(ump);
13257	jlow = 0;
13258	/*
13259	 * Check for journal space if needed.
13260	 */
13261	if (DOINGSUJ(vp)) {
13262		if (journal_space(ump, 0) == 0)
13263			jlow = 1;
13264	}
13265	/*
13266	 * If the system is under its limits and our filesystem is
13267	 * not responsible for more than our share of the usage and
13268	 * we are not low on journal space, then no need to slow down.
13269	 */
13270	max_softdeps_hard = max_softdeps * 11 / 10;
13271	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13272	    dep_current[D_INODEDEP] < max_softdeps_hard &&
13273	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13274	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13275	    ump->softdep_curdeps[D_DIRREM] <
13276	    (max_softdeps_hard / 2) / stat_flush_threads &&
13277	    ump->softdep_curdeps[D_INODEDEP] <
13278	    max_softdeps_hard / stat_flush_threads &&
13279	    ump->softdep_curdeps[D_INDIRDEP] <
13280	    (max_softdeps_hard / 1000) / stat_flush_threads &&
13281	    ump->softdep_curdeps[D_FREEBLKS] <
13282	    max_softdeps_hard / stat_flush_threads) {
13283		FREE_LOCK(ump);
13284  		return (0);
13285	}
13286	/*
13287	 * If the journal is low or our filesystem is over its limit
13288	 * then speedup the cleanup.
13289	 */
13290	if (ump->softdep_curdeps[D_INDIRDEP] <
13291	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13292		softdep_speedup(ump);
13293	stat_sync_limit_hit += 1;
13294	FREE_LOCK(ump);
13295	/*
13296	 * We only slow down the rate at which new dependencies are
13297	 * generated if we are not using journaling. With journaling,
13298	 * the cleanup should always be sufficient to keep things
13299	 * under control.
13300	 */
13301	if (DOINGSUJ(vp))
13302		return (0);
13303	return (1);
13304}
13305
13306/*
13307 * Called by the allocation routines when they are about to fail
13308 * in the hope that we can free up the requested resource (inodes
13309 * or disk space).
13310 *
13311 * First check to see if the work list has anything on it. If it has,
13312 * clean up entries until we successfully free the requested resource.
13313 * Because this process holds inodes locked, we cannot handle any remove
13314 * requests that might block on a locked inode as that could lead to
13315 * deadlock. If the worklist yields none of the requested resource,
13316 * start syncing out vnodes to free up the needed space.
13317 */
13318int
13319softdep_request_cleanup(fs, vp, cred, resource)
13320	struct fs *fs;
13321	struct vnode *vp;
13322	struct ucred *cred;
13323	int resource;
13324{
13325	struct ufsmount *ump;
13326	struct mount *mp;
13327	long starttime;
13328	ufs2_daddr_t needed;
13329	int error, failed_vnode;
13330
13331	/*
13332	 * If we are being called because of a process doing a
13333	 * copy-on-write, then it is not safe to process any
13334	 * worklist items as we will recurse into the copyonwrite
13335	 * routine.  This will result in an incoherent snapshot.
13336	 * If the vnode that we hold is a snapshot, we must avoid
13337	 * handling other resources that could cause deadlock.
13338	 */
13339	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13340		return (0);
13341
13342	if (resource == FLUSH_BLOCKS_WAIT)
13343		stat_cleanup_blkrequests += 1;
13344	else
13345		stat_cleanup_inorequests += 1;
13346
13347	mp = vp->v_mount;
13348	ump = VFSTOUFS(mp);
13349	mtx_assert(UFS_MTX(ump), MA_OWNED);
13350	UFS_UNLOCK(ump);
13351	error = ffs_update(vp, 1);
13352	if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13353		UFS_LOCK(ump);
13354		return (0);
13355	}
13356	/*
13357	 * If we are in need of resources, start by cleaning up
13358	 * any block removals associated with our inode.
13359	 */
13360	ACQUIRE_LOCK(ump);
13361	process_removes(vp);
13362	process_truncates(vp);
13363	FREE_LOCK(ump);
13364	/*
13365	 * Now clean up at least as many resources as we will need.
13366	 *
13367	 * When requested to clean up inodes, the number that are needed
13368	 * is set by the number of simultaneous writers (mnt_writeopcount)
13369	 * plus a bit of slop (2) in case some more writers show up while
13370	 * we are cleaning.
13371	 *
13372	 * When requested to free up space, the amount of space that
13373	 * we need is enough blocks to allocate a full-sized segment
13374	 * (fs_contigsumsize). The number of such segments that will
13375	 * be needed is set by the number of simultaneous writers
13376	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
13377	 * writers show up while we are cleaning.
13378	 *
13379	 * Additionally, if we are unpriviledged and allocating space,
13380	 * we need to ensure that we clean up enough blocks to get the
13381	 * needed number of blocks over the threshold of the minimum
13382	 * number of blocks required to be kept free by the filesystem
13383	 * (fs_minfree).
13384	 */
13385	if (resource == FLUSH_INODES_WAIT) {
13386		needed = vp->v_mount->mnt_writeopcount + 2;
13387	} else if (resource == FLUSH_BLOCKS_WAIT) {
13388		needed = (vp->v_mount->mnt_writeopcount + 2) *
13389		    fs->fs_contigsumsize;
13390		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
13391			needed += fragstoblks(fs,
13392			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13393			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
13394	} else {
13395		UFS_LOCK(ump);
13396		printf("softdep_request_cleanup: Unknown resource type %d\n",
13397		    resource);
13398		return (0);
13399	}
13400	starttime = time_second;
13401retry:
13402	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13403	    fs->fs_cstotal.cs_nbfree <= needed) ||
13404	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13405	    fs->fs_cstotal.cs_nifree <= needed)) {
13406		ACQUIRE_LOCK(ump);
13407		if (ump->softdep_on_worklist > 0 &&
13408		    process_worklist_item(UFSTOVFS(ump),
13409		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
13410			stat_worklist_push += 1;
13411		FREE_LOCK(ump);
13412	}
13413	/*
13414	 * If we still need resources and there are no more worklist
13415	 * entries to process to obtain them, we have to start flushing
13416	 * the dirty vnodes to force the release of additional requests
13417	 * to the worklist that we can then process to reap addition
13418	 * resources. We walk the vnodes associated with the mount point
13419	 * until we get the needed worklist requests that we can reap.
13420	 *
13421	 * If there are several threads all needing to clean the same
13422	 * mount point, only one is allowed to walk the mount list.
13423	 * When several threads all try to walk the same mount list,
13424	 * they end up competing with each other and often end up in
13425	 * livelock. This approach ensures that forward progress is
13426	 * made at the cost of occational ENOSPC errors being returned
13427	 * that might otherwise have been avoided.
13428	 */
13429	error = 1;
13430	if ((resource == FLUSH_BLOCKS_WAIT &&
13431	     fs->fs_cstotal.cs_nbfree <= needed) ||
13432	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13433	     fs->fs_cstotal.cs_nifree <= needed)) {
13434		ACQUIRE_LOCK(ump);
13435		if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
13436			ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
13437			FREE_LOCK(ump);
13438			failed_vnode = softdep_request_cleanup_flush(mp, ump);
13439			ACQUIRE_LOCK(ump);
13440			ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
13441			FREE_LOCK(ump);
13442			if (ump->softdep_on_worklist > 0) {
13443				stat_cleanup_retries += 1;
13444				if (!failed_vnode)
13445					goto retry;
13446			}
13447		} else {
13448			FREE_LOCK(ump);
13449			error = 0;
13450		}
13451		stat_cleanup_failures += 1;
13452	}
13453	if (time_second - starttime > stat_cleanup_high_delay)
13454		stat_cleanup_high_delay = time_second - starttime;
13455	UFS_LOCK(ump);
13456	return (error);
13457}
13458
13459/*
13460 * Scan the vnodes for the specified mount point flushing out any
13461 * vnodes that can be locked without waiting. Finally, try to flush
13462 * the device associated with the mount point if it can be locked
13463 * without waiting.
13464 *
13465 * We return 0 if we were able to lock every vnode in our scan.
13466 * If we had to skip one or more vnodes, we return 1.
13467 */
13468static int
13469softdep_request_cleanup_flush(mp, ump)
13470	struct mount *mp;
13471	struct ufsmount *ump;
13472{
13473	struct thread *td;
13474	struct vnode *lvp, *mvp;
13475	int failed_vnode;
13476
13477	failed_vnode = 0;
13478	td = curthread;
13479	MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13480		if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13481			VI_UNLOCK(lvp);
13482			continue;
13483		}
13484		if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13485		    td) != 0) {
13486			failed_vnode = 1;
13487			continue;
13488		}
13489		if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
13490			vput(lvp);
13491			continue;
13492		}
13493		(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13494		vput(lvp);
13495	}
13496	lvp = ump->um_devvp;
13497	if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13498		VOP_FSYNC(lvp, MNT_NOWAIT, td);
13499		VOP_UNLOCK(lvp, 0);
13500	}
13501	return (failed_vnode);
13502}
13503
13504static bool
13505softdep_excess_items(struct ufsmount *ump, int item)
13506{
13507
13508	KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
13509	return (dep_current[item] > max_softdeps &&
13510	    ump->softdep_curdeps[item] > max_softdeps /
13511	    stat_flush_threads);
13512}
13513
13514static void
13515schedule_cleanup(struct mount *mp)
13516{
13517	struct ufsmount *ump;
13518	struct thread *td;
13519
13520	ump = VFSTOUFS(mp);
13521	LOCK_OWNED(ump);
13522	FREE_LOCK(ump);
13523	td = curthread;
13524	if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13525	    (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13526		/*
13527		 * No ast is delivered to kernel threads, so nobody
13528		 * would deref the mp.  Some kernel threads
13529		 * explicitely check for AST, e.g. NFS daemon does
13530		 * this in the serving loop.
13531		 */
13532		return;
13533	}
13534	if (td->td_su != NULL)
13535		vfs_rel(td->td_su);
13536	vfs_ref(mp);
13537	td->td_su = mp;
13538	thread_lock(td);
13539	td->td_flags |= TDF_ASTPENDING;
13540	thread_unlock(td);
13541}
13542
13543static void
13544softdep_ast_cleanup_proc(struct thread *td)
13545{
13546	struct mount *mp;
13547	struct ufsmount *ump;
13548	int error;
13549	bool req;
13550
13551	while ((mp = td->td_su) != NULL) {
13552		td->td_su = NULL;
13553		error = vfs_busy(mp, MBF_NOWAIT);
13554		vfs_rel(mp);
13555		if (error != 0)
13556			return;
13557		if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13558			ump = VFSTOUFS(mp);
13559			for (;;) {
13560				req = false;
13561				ACQUIRE_LOCK(ump);
13562				if (softdep_excess_items(ump, D_INODEDEP)) {
13563					req = true;
13564					request_cleanup(mp, FLUSH_INODES);
13565				}
13566				if (softdep_excess_items(ump, D_DIRREM)) {
13567					req = true;
13568					request_cleanup(mp, FLUSH_BLOCKS);
13569				}
13570				FREE_LOCK(ump);
13571				if (softdep_excess_items(ump, D_NEWBLK) ||
13572				    softdep_excess_items(ump, D_ALLOCDIRECT) ||
13573				    softdep_excess_items(ump, D_ALLOCINDIR)) {
13574					error = vn_start_write(NULL, &mp,
13575					    V_WAIT);
13576					if (error == 0) {
13577						req = true;
13578						VFS_SYNC(mp, MNT_WAIT);
13579						vn_finished_write(mp);
13580					}
13581				}
13582				if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13583					break;
13584			}
13585		}
13586		vfs_unbusy(mp);
13587	}
13588	if ((mp = td->td_su) != NULL) {
13589		td->td_su = NULL;
13590		vfs_rel(mp);
13591	}
13592}
13593
13594/*
13595 * If memory utilization has gotten too high, deliberately slow things
13596 * down and speed up the I/O processing.
13597 */
13598static int
13599request_cleanup(mp, resource)
13600	struct mount *mp;
13601	int resource;
13602{
13603	struct thread *td = curthread;
13604	struct ufsmount *ump;
13605
13606	ump = VFSTOUFS(mp);
13607	LOCK_OWNED(ump);
13608	/*
13609	 * We never hold up the filesystem syncer or buf daemon.
13610	 */
13611	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13612		return (0);
13613	/*
13614	 * First check to see if the work list has gotten backlogged.
13615	 * If it has, co-opt this process to help clean up two entries.
13616	 * Because this process may hold inodes locked, we cannot
13617	 * handle any remove requests that might block on a locked
13618	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13619	 * to avoid recursively processing the worklist.
13620	 */
13621	if (ump->softdep_on_worklist > max_softdeps / 10) {
13622		td->td_pflags |= TDP_SOFTDEP;
13623		process_worklist_item(mp, 2, LK_NOWAIT);
13624		td->td_pflags &= ~TDP_SOFTDEP;
13625		stat_worklist_push += 2;
13626		return(1);
13627	}
13628	/*
13629	 * Next, we attempt to speed up the syncer process. If that
13630	 * is successful, then we allow the process to continue.
13631	 */
13632	if (softdep_speedup(ump) &&
13633	    resource != FLUSH_BLOCKS_WAIT &&
13634	    resource != FLUSH_INODES_WAIT)
13635		return(0);
13636	/*
13637	 * If we are resource constrained on inode dependencies, try
13638	 * flushing some dirty inodes. Otherwise, we are constrained
13639	 * by file deletions, so try accelerating flushes of directories
13640	 * with removal dependencies. We would like to do the cleanup
13641	 * here, but we probably hold an inode locked at this point and
13642	 * that might deadlock against one that we try to clean. So,
13643	 * the best that we can do is request the syncer daemon to do
13644	 * the cleanup for us.
13645	 */
13646	switch (resource) {
13647
13648	case FLUSH_INODES:
13649	case FLUSH_INODES_WAIT:
13650		ACQUIRE_GBLLOCK(&lk);
13651		stat_ino_limit_push += 1;
13652		req_clear_inodedeps += 1;
13653		FREE_GBLLOCK(&lk);
13654		stat_countp = &stat_ino_limit_hit;
13655		break;
13656
13657	case FLUSH_BLOCKS:
13658	case FLUSH_BLOCKS_WAIT:
13659		ACQUIRE_GBLLOCK(&lk);
13660		stat_blk_limit_push += 1;
13661		req_clear_remove += 1;
13662		FREE_GBLLOCK(&lk);
13663		stat_countp = &stat_blk_limit_hit;
13664		break;
13665
13666	default:
13667		panic("request_cleanup: unknown type");
13668	}
13669	/*
13670	 * Hopefully the syncer daemon will catch up and awaken us.
13671	 * We wait at most tickdelay before proceeding in any case.
13672	 */
13673	ACQUIRE_GBLLOCK(&lk);
13674	FREE_LOCK(ump);
13675	proc_waiting += 1;
13676	if (callout_pending(&softdep_callout) == FALSE)
13677		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13678		    pause_timer, 0);
13679
13680	if ((td->td_pflags & TDP_KTHREAD) == 0)
13681		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13682	proc_waiting -= 1;
13683	FREE_GBLLOCK(&lk);
13684	ACQUIRE_LOCK(ump);
13685	return (1);
13686}
13687
13688/*
13689 * Awaken processes pausing in request_cleanup and clear proc_waiting
13690 * to indicate that there is no longer a timer running. Pause_timer
13691 * will be called with the global softdep mutex (&lk) locked.
13692 */
13693static void
13694pause_timer(arg)
13695	void *arg;
13696{
13697
13698	GBLLOCK_OWNED(&lk);
13699	/*
13700	 * The callout_ API has acquired mtx and will hold it around this
13701	 * function call.
13702	 */
13703	*stat_countp += proc_waiting;
13704	wakeup(&proc_waiting);
13705}
13706
13707/*
13708 * If requested, try removing inode or removal dependencies.
13709 */
13710static void
13711check_clear_deps(mp)
13712	struct mount *mp;
13713{
13714
13715	/*
13716	 * If we are suspended, it may be because of our using
13717	 * too many inodedeps, so help clear them out.
13718	 */
13719	if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13720		clear_inodedeps(mp);
13721	/*
13722	 * General requests for cleanup of backed up dependencies
13723	 */
13724	ACQUIRE_GBLLOCK(&lk);
13725	if (req_clear_inodedeps) {
13726		req_clear_inodedeps -= 1;
13727		FREE_GBLLOCK(&lk);
13728		clear_inodedeps(mp);
13729		ACQUIRE_GBLLOCK(&lk);
13730		wakeup(&proc_waiting);
13731	}
13732	if (req_clear_remove) {
13733		req_clear_remove -= 1;
13734		FREE_GBLLOCK(&lk);
13735		clear_remove(mp);
13736		ACQUIRE_GBLLOCK(&lk);
13737		wakeup(&proc_waiting);
13738	}
13739	FREE_GBLLOCK(&lk);
13740}
13741
13742/*
13743 * Flush out a directory with at least one removal dependency in an effort to
13744 * reduce the number of dirrem, freefile, and freeblks dependency structures.
13745 */
13746static void
13747clear_remove(mp)
13748	struct mount *mp;
13749{
13750	struct pagedep_hashhead *pagedephd;
13751	struct pagedep *pagedep;
13752	struct ufsmount *ump;
13753	struct vnode *vp;
13754	struct bufobj *bo;
13755	int error, cnt;
13756	ino_t ino;
13757
13758	ump = VFSTOUFS(mp);
13759	LOCK_OWNED(ump);
13760
13761	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13762		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13763		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13764			ump->pagedep_nextclean = 0;
13765		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13766			if (LIST_EMPTY(&pagedep->pd_dirremhd))
13767				continue;
13768			ino = pagedep->pd_ino;
13769			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13770				continue;
13771			FREE_LOCK(ump);
13772
13773			/*
13774			 * Let unmount clear deps
13775			 */
13776			error = vfs_busy(mp, MBF_NOWAIT);
13777			if (error != 0)
13778				goto finish_write;
13779			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13780			     FFSV_FORCEINSMQ);
13781			vfs_unbusy(mp);
13782			if (error != 0) {
13783				softdep_error("clear_remove: vget", error);
13784				goto finish_write;
13785			}
13786			MPASS(VTOI(vp)->i_mode != 0);
13787			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13788				softdep_error("clear_remove: fsync", error);
13789			bo = &vp->v_bufobj;
13790			BO_LOCK(bo);
13791			drain_output(vp);
13792			BO_UNLOCK(bo);
13793			vput(vp);
13794		finish_write:
13795			vn_finished_write(mp);
13796			ACQUIRE_LOCK(ump);
13797			return;
13798		}
13799	}
13800}
13801
13802/*
13803 * Clear out a block of dirty inodes in an effort to reduce
13804 * the number of inodedep dependency structures.
13805 */
13806static void
13807clear_inodedeps(mp)
13808	struct mount *mp;
13809{
13810	struct inodedep_hashhead *inodedephd;
13811	struct inodedep *inodedep;
13812	struct ufsmount *ump;
13813	struct vnode *vp;
13814	struct fs *fs;
13815	int error, cnt;
13816	ino_t firstino, lastino, ino;
13817
13818	ump = VFSTOUFS(mp);
13819	fs = ump->um_fs;
13820	LOCK_OWNED(ump);
13821	/*
13822	 * Pick a random inode dependency to be cleared.
13823	 * We will then gather up all the inodes in its block
13824	 * that have dependencies and flush them out.
13825	 */
13826	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
13827		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
13828		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
13829			ump->inodedep_nextclean = 0;
13830		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13831			break;
13832	}
13833	if (inodedep == NULL)
13834		return;
13835	/*
13836	 * Find the last inode in the block with dependencies.
13837	 */
13838	firstino = rounddown2(inodedep->id_ino, INOPB(fs));
13839	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13840		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13841			break;
13842	/*
13843	 * Asynchronously push all but the last inode with dependencies.
13844	 * Synchronously push the last inode with dependencies to ensure
13845	 * that the inode block gets written to free up the inodedeps.
13846	 */
13847	for (ino = firstino; ino <= lastino; ino++) {
13848		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13849			continue;
13850		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13851			continue;
13852		FREE_LOCK(ump);
13853		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13854		if (error != 0) {
13855			vn_finished_write(mp);
13856			ACQUIRE_LOCK(ump);
13857			return;
13858		}
13859		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13860		    FFSV_FORCEINSMQ)) != 0) {
13861			softdep_error("clear_inodedeps: vget", error);
13862			vfs_unbusy(mp);
13863			vn_finished_write(mp);
13864			ACQUIRE_LOCK(ump);
13865			return;
13866		}
13867		vfs_unbusy(mp);
13868		if (VTOI(vp)->i_mode == 0) {
13869			vgone(vp);
13870		} else if (ino == lastino) {
13871			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13872				softdep_error("clear_inodedeps: fsync1", error);
13873		} else {
13874			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13875				softdep_error("clear_inodedeps: fsync2", error);
13876			BO_LOCK(&vp->v_bufobj);
13877			drain_output(vp);
13878			BO_UNLOCK(&vp->v_bufobj);
13879		}
13880		vput(vp);
13881		vn_finished_write(mp);
13882		ACQUIRE_LOCK(ump);
13883	}
13884}
13885
13886void
13887softdep_buf_append(bp, wkhd)
13888	struct buf *bp;
13889	struct workhead *wkhd;
13890{
13891	struct worklist *wk;
13892	struct ufsmount *ump;
13893
13894	if ((wk = LIST_FIRST(wkhd)) == NULL)
13895		return;
13896	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13897	    ("softdep_buf_append called on non-softdep filesystem"));
13898	ump = VFSTOUFS(wk->wk_mp);
13899	ACQUIRE_LOCK(ump);
13900	while ((wk = LIST_FIRST(wkhd)) != NULL) {
13901		WORKLIST_REMOVE(wk);
13902		WORKLIST_INSERT(&bp->b_dep, wk);
13903	}
13904	FREE_LOCK(ump);
13905
13906}
13907
13908void
13909softdep_inode_append(ip, cred, wkhd)
13910	struct inode *ip;
13911	struct ucred *cred;
13912	struct workhead *wkhd;
13913{
13914	struct buf *bp;
13915	struct fs *fs;
13916	struct ufsmount *ump;
13917	int error;
13918
13919	ump = ITOUMP(ip);
13920	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
13921	    ("softdep_inode_append called on non-softdep filesystem"));
13922	fs = ump->um_fs;
13923	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13924	    (int)fs->fs_bsize, cred, &bp);
13925	if (error) {
13926		bqrelse(bp);
13927		softdep_freework(wkhd);
13928		return;
13929	}
13930	softdep_buf_append(bp, wkhd);
13931	bqrelse(bp);
13932}
13933
13934void
13935softdep_freework(wkhd)
13936	struct workhead *wkhd;
13937{
13938	struct worklist *wk;
13939	struct ufsmount *ump;
13940
13941	if ((wk = LIST_FIRST(wkhd)) == NULL)
13942		return;
13943	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13944	    ("softdep_freework called on non-softdep filesystem"));
13945	ump = VFSTOUFS(wk->wk_mp);
13946	ACQUIRE_LOCK(ump);
13947	handle_jwork(wkhd);
13948	FREE_LOCK(ump);
13949}
13950
13951static struct ufsmount *
13952softdep_bp_to_mp(bp)
13953	struct buf *bp;
13954{
13955	struct mount *mp;
13956	struct vnode *vp;
13957
13958	if (LIST_EMPTY(&bp->b_dep))
13959		return (NULL);
13960	vp = bp->b_vp;
13961	KASSERT(vp != NULL,
13962	    ("%s, buffer with dependencies lacks vnode", __func__));
13963
13964	/*
13965	 * The ump mount point is stable after we get a correct
13966	 * pointer, since bp is locked and this prevents unmount from
13967	 * proceeding.  But to get to it, we cannot dereference bp->b_dep
13968	 * head wk_mp, because we do not yet own SU ump lock and
13969	 * workitem might be freed while dereferenced.
13970	 */
13971retry:
13972	switch (vp->v_type) {
13973	case VCHR:
13974		VI_LOCK(vp);
13975		mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
13976		VI_UNLOCK(vp);
13977		if (mp == NULL)
13978			goto retry;
13979		break;
13980	case VREG:
13981	case VDIR:
13982	case VLNK:
13983	case VFIFO:
13984	case VSOCK:
13985		mp = vp->v_mount;
13986		break;
13987	case VBLK:
13988		vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
13989		/* FALLTHROUGH */
13990	case VNON:
13991	case VBAD:
13992	case VMARKER:
13993		mp = NULL;
13994		break;
13995	default:
13996		vn_printf(vp, "unknown vnode type");
13997		mp = NULL;
13998		break;
13999	}
14000	return (VFSTOUFS(mp));
14001}
14002
14003/*
14004 * Function to determine if the buffer has outstanding dependencies
14005 * that will cause a roll-back if the buffer is written. If wantcount
14006 * is set, return number of dependencies, otherwise just yes or no.
14007 */
14008static int
14009softdep_count_dependencies(bp, wantcount)
14010	struct buf *bp;
14011	int wantcount;
14012{
14013	struct worklist *wk;
14014	struct ufsmount *ump;
14015	struct bmsafemap *bmsafemap;
14016	struct freework *freework;
14017	struct inodedep *inodedep;
14018	struct indirdep *indirdep;
14019	struct freeblks *freeblks;
14020	struct allocindir *aip;
14021	struct pagedep *pagedep;
14022	struct dirrem *dirrem;
14023	struct newblk *newblk;
14024	struct mkdir *mkdir;
14025	struct diradd *dap;
14026	int i, retval;
14027
14028	ump = softdep_bp_to_mp(bp);
14029	if (ump == NULL)
14030		return (0);
14031	retval = 0;
14032	ACQUIRE_LOCK(ump);
14033	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
14034		switch (wk->wk_type) {
14035
14036		case D_INODEDEP:
14037			inodedep = WK_INODEDEP(wk);
14038			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
14039				/* bitmap allocation dependency */
14040				retval += 1;
14041				if (!wantcount)
14042					goto out;
14043			}
14044			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
14045				/* direct block pointer dependency */
14046				retval += 1;
14047				if (!wantcount)
14048					goto out;
14049			}
14050			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
14051				/* direct block pointer dependency */
14052				retval += 1;
14053				if (!wantcount)
14054					goto out;
14055			}
14056			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
14057				/* Add reference dependency. */
14058				retval += 1;
14059				if (!wantcount)
14060					goto out;
14061			}
14062			continue;
14063
14064		case D_INDIRDEP:
14065			indirdep = WK_INDIRDEP(wk);
14066
14067			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
14068				/* indirect truncation dependency */
14069				retval += 1;
14070				if (!wantcount)
14071					goto out;
14072			}
14073
14074			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
14075				/* indirect block pointer dependency */
14076				retval += 1;
14077				if (!wantcount)
14078					goto out;
14079			}
14080			continue;
14081
14082		case D_PAGEDEP:
14083			pagedep = WK_PAGEDEP(wk);
14084			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
14085				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
14086					/* Journal remove ref dependency. */
14087					retval += 1;
14088					if (!wantcount)
14089						goto out;
14090				}
14091			}
14092			for (i = 0; i < DAHASHSZ; i++) {
14093
14094				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
14095					/* directory entry dependency */
14096					retval += 1;
14097					if (!wantcount)
14098						goto out;
14099				}
14100			}
14101			continue;
14102
14103		case D_BMSAFEMAP:
14104			bmsafemap = WK_BMSAFEMAP(wk);
14105			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
14106				/* Add reference dependency. */
14107				retval += 1;
14108				if (!wantcount)
14109					goto out;
14110			}
14111			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
14112				/* Allocate block dependency. */
14113				retval += 1;
14114				if (!wantcount)
14115					goto out;
14116			}
14117			continue;
14118
14119		case D_FREEBLKS:
14120			freeblks = WK_FREEBLKS(wk);
14121			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
14122				/* Freeblk journal dependency. */
14123				retval += 1;
14124				if (!wantcount)
14125					goto out;
14126			}
14127			continue;
14128
14129		case D_ALLOCDIRECT:
14130		case D_ALLOCINDIR:
14131			newblk = WK_NEWBLK(wk);
14132			if (newblk->nb_jnewblk) {
14133				/* Journal allocate dependency. */
14134				retval += 1;
14135				if (!wantcount)
14136					goto out;
14137			}
14138			continue;
14139
14140		case D_MKDIR:
14141			mkdir = WK_MKDIR(wk);
14142			if (mkdir->md_jaddref) {
14143				/* Journal reference dependency. */
14144				retval += 1;
14145				if (!wantcount)
14146					goto out;
14147			}
14148			continue;
14149
14150		case D_FREEWORK:
14151		case D_FREEDEP:
14152		case D_JSEGDEP:
14153		case D_JSEG:
14154		case D_SBDEP:
14155			/* never a dependency on these blocks */
14156			continue;
14157
14158		default:
14159			panic("softdep_count_dependencies: Unexpected type %s",
14160			    TYPENAME(wk->wk_type));
14161			/* NOTREACHED */
14162		}
14163	}
14164out:
14165	FREE_LOCK(ump);
14166	return (retval);
14167}
14168
14169/*
14170 * Acquire exclusive access to a buffer.
14171 * Must be called with a locked mtx parameter.
14172 * Return acquired buffer or NULL on failure.
14173 */
14174static struct buf *
14175getdirtybuf(bp, lock, waitfor)
14176	struct buf *bp;
14177	struct rwlock *lock;
14178	int waitfor;
14179{
14180	int error;
14181
14182	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
14183		if (waitfor != MNT_WAIT)
14184			return (NULL);
14185		error = BUF_LOCK(bp,
14186		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
14187		/*
14188		 * Even if we successfully acquire bp here, we have dropped
14189		 * lock, which may violates our guarantee.
14190		 */
14191		if (error == 0)
14192			BUF_UNLOCK(bp);
14193		else if (error != ENOLCK)
14194			panic("getdirtybuf: inconsistent lock: %d", error);
14195		rw_wlock(lock);
14196		return (NULL);
14197	}
14198	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14199		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
14200			rw_wunlock(lock);
14201			BO_LOCK(bp->b_bufobj);
14202			BUF_UNLOCK(bp);
14203			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14204				bp->b_vflags |= BV_BKGRDWAIT;
14205				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
14206				       PRIBIO | PDROP, "getbuf", 0);
14207			} else
14208				BO_UNLOCK(bp->b_bufobj);
14209			rw_wlock(lock);
14210			return (NULL);
14211		}
14212		BUF_UNLOCK(bp);
14213		if (waitfor != MNT_WAIT)
14214			return (NULL);
14215#ifdef DEBUG_VFS_LOCKS
14216		if (bp->b_vp->v_type != VCHR)
14217			ASSERT_BO_WLOCKED(bp->b_bufobj);
14218#endif
14219		bp->b_vflags |= BV_BKGRDWAIT;
14220		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
14221		return (NULL);
14222	}
14223	if ((bp->b_flags & B_DELWRI) == 0) {
14224		BUF_UNLOCK(bp);
14225		return (NULL);
14226	}
14227	bremfree(bp);
14228	return (bp);
14229}
14230
14231
14232/*
14233 * Check if it is safe to suspend the file system now.  On entry,
14234 * the vnode interlock for devvp should be held.  Return 0 with
14235 * the mount interlock held if the file system can be suspended now,
14236 * otherwise return EAGAIN with the mount interlock held.
14237 */
14238int
14239softdep_check_suspend(struct mount *mp,
14240		      struct vnode *devvp,
14241		      int softdep_depcnt,
14242		      int softdep_accdepcnt,
14243		      int secondary_writes,
14244		      int secondary_accwrites)
14245{
14246	struct bufobj *bo;
14247	struct ufsmount *ump;
14248	struct inodedep *inodedep;
14249	int error, unlinked;
14250
14251	bo = &devvp->v_bufobj;
14252	ASSERT_BO_WLOCKED(bo);
14253
14254	/*
14255	 * If we are not running with soft updates, then we need only
14256	 * deal with secondary writes as we try to suspend.
14257	 */
14258	if (MOUNTEDSOFTDEP(mp) == 0) {
14259		MNT_ILOCK(mp);
14260		while (mp->mnt_secondary_writes != 0) {
14261			BO_UNLOCK(bo);
14262			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
14263			    (PUSER - 1) | PDROP, "secwr", 0);
14264			BO_LOCK(bo);
14265			MNT_ILOCK(mp);
14266		}
14267
14268		/*
14269		 * Reasons for needing more work before suspend:
14270		 * - Dirty buffers on devvp.
14271		 * - Secondary writes occurred after start of vnode sync loop
14272		 */
14273		error = 0;
14274		if (bo->bo_numoutput > 0 ||
14275		    bo->bo_dirty.bv_cnt > 0 ||
14276		    secondary_writes != 0 ||
14277		    mp->mnt_secondary_writes != 0 ||
14278		    secondary_accwrites != mp->mnt_secondary_accwrites)
14279			error = EAGAIN;
14280		BO_UNLOCK(bo);
14281		return (error);
14282	}
14283
14284	/*
14285	 * If we are running with soft updates, then we need to coordinate
14286	 * with them as we try to suspend.
14287	 */
14288	ump = VFSTOUFS(mp);
14289	for (;;) {
14290		if (!TRY_ACQUIRE_LOCK(ump)) {
14291			BO_UNLOCK(bo);
14292			ACQUIRE_LOCK(ump);
14293			FREE_LOCK(ump);
14294			BO_LOCK(bo);
14295			continue;
14296		}
14297		MNT_ILOCK(mp);
14298		if (mp->mnt_secondary_writes != 0) {
14299			FREE_LOCK(ump);
14300			BO_UNLOCK(bo);
14301			msleep(&mp->mnt_secondary_writes,
14302			       MNT_MTX(mp),
14303			       (PUSER - 1) | PDROP, "secwr", 0);
14304			BO_LOCK(bo);
14305			continue;
14306		}
14307		break;
14308	}
14309
14310	unlinked = 0;
14311	if (MOUNTEDSUJ(mp)) {
14312		for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14313		    inodedep != NULL;
14314		    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14315			if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14316			    UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14317			    UNLINKONLIST) ||
14318			    !check_inodedep_free(inodedep))
14319				continue;
14320			unlinked++;
14321		}
14322	}
14323
14324	/*
14325	 * Reasons for needing more work before suspend:
14326	 * - Dirty buffers on devvp.
14327	 * - Softdep activity occurred after start of vnode sync loop
14328	 * - Secondary writes occurred after start of vnode sync loop
14329	 */
14330	error = 0;
14331	if (bo->bo_numoutput > 0 ||
14332	    bo->bo_dirty.bv_cnt > 0 ||
14333	    softdep_depcnt != unlinked ||
14334	    ump->softdep_deps != unlinked ||
14335	    softdep_accdepcnt != ump->softdep_accdeps ||
14336	    secondary_writes != 0 ||
14337	    mp->mnt_secondary_writes != 0 ||
14338	    secondary_accwrites != mp->mnt_secondary_accwrites)
14339		error = EAGAIN;
14340	FREE_LOCK(ump);
14341	BO_UNLOCK(bo);
14342	return (error);
14343}
14344
14345
14346/*
14347 * Get the number of dependency structures for the file system, both
14348 * the current number and the total number allocated.  These will
14349 * later be used to detect that softdep processing has occurred.
14350 */
14351void
14352softdep_get_depcounts(struct mount *mp,
14353		      int *softdep_depsp,
14354		      int *softdep_accdepsp)
14355{
14356	struct ufsmount *ump;
14357
14358	if (MOUNTEDSOFTDEP(mp) == 0) {
14359		*softdep_depsp = 0;
14360		*softdep_accdepsp = 0;
14361		return;
14362	}
14363	ump = VFSTOUFS(mp);
14364	ACQUIRE_LOCK(ump);
14365	*softdep_depsp = ump->softdep_deps;
14366	*softdep_accdepsp = ump->softdep_accdeps;
14367	FREE_LOCK(ump);
14368}
14369
14370/*
14371 * Wait for pending output on a vnode to complete.
14372 */
14373static void
14374drain_output(vp)
14375	struct vnode *vp;
14376{
14377
14378	ASSERT_VOP_LOCKED(vp, "drain_output");
14379	(void)bufobj_wwait(&vp->v_bufobj, 0, 0);
14380}
14381
14382/*
14383 * Called whenever a buffer that is being invalidated or reallocated
14384 * contains dependencies. This should only happen if an I/O error has
14385 * occurred. The routine is called with the buffer locked.
14386 */
14387static void
14388softdep_deallocate_dependencies(bp)
14389	struct buf *bp;
14390{
14391
14392	if ((bp->b_ioflags & BIO_ERROR) == 0)
14393		panic("softdep_deallocate_dependencies: dangling deps");
14394	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14395		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14396	else
14397		printf("softdep_deallocate_dependencies: "
14398		    "got error %d while accessing filesystem\n", bp->b_error);
14399	if (bp->b_error != ENXIO)
14400		panic("softdep_deallocate_dependencies: unrecovered I/O error");
14401}
14402
14403/*
14404 * Function to handle asynchronous write errors in the filesystem.
14405 */
14406static void
14407softdep_error(func, error)
14408	char *func;
14409	int error;
14410{
14411
14412	/* XXX should do something better! */
14413	printf("%s: got error %d while accessing filesystem\n", func, error);
14414}
14415
14416#ifdef DDB
14417
14418static void
14419inodedep_print(struct inodedep *inodedep, int verbose)
14420{
14421	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %jd nlink %jd"
14422	    " saveino %p\n",
14423	    inodedep, inodedep->id_fs, inodedep->id_state,
14424	    (intmax_t)inodedep->id_ino,
14425	    (intmax_t)fsbtodb(inodedep->id_fs,
14426	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14427	    (intmax_t)inodedep->id_nlinkdelta,
14428	    (intmax_t)inodedep->id_savednlink,
14429	    inodedep->id_savedino1);
14430
14431	if (verbose == 0)
14432		return;
14433
14434	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
14435	    "mkdiradd %p\n",
14436	    LIST_FIRST(&inodedep->id_pendinghd),
14437	    LIST_FIRST(&inodedep->id_bufwait),
14438	    LIST_FIRST(&inodedep->id_inowait),
14439	    TAILQ_FIRST(&inodedep->id_inoreflst),
14440	    inodedep->id_mkdiradd);
14441	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
14442	    TAILQ_FIRST(&inodedep->id_inoupdt),
14443	    TAILQ_FIRST(&inodedep->id_newinoupdt),
14444	    TAILQ_FIRST(&inodedep->id_extupdt),
14445	    TAILQ_FIRST(&inodedep->id_newextupdt));
14446}
14447
14448DB_SHOW_COMMAND(inodedep, db_show_inodedep)
14449{
14450
14451	if (have_addr == 0) {
14452		db_printf("Address required\n");
14453		return;
14454	}
14455	inodedep_print((struct inodedep*)addr, 1);
14456}
14457
14458DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
14459{
14460	struct inodedep_hashhead *inodedephd;
14461	struct inodedep *inodedep;
14462	struct ufsmount *ump;
14463	int cnt;
14464
14465	if (have_addr == 0) {
14466		db_printf("Address required\n");
14467		return;
14468	}
14469	ump = (struct ufsmount *)addr;
14470	for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14471		inodedephd = &ump->inodedep_hashtbl[cnt];
14472		LIST_FOREACH(inodedep, inodedephd, id_hash) {
14473			inodedep_print(inodedep, 0);
14474		}
14475	}
14476}
14477
14478DB_SHOW_COMMAND(worklist, db_show_worklist)
14479{
14480	struct worklist *wk;
14481
14482	if (have_addr == 0) {
14483		db_printf("Address required\n");
14484		return;
14485	}
14486	wk = (struct worklist *)addr;
14487	printf("worklist: %p type %s state 0x%X\n",
14488	    wk, TYPENAME(wk->wk_type), wk->wk_state);
14489}
14490
14491DB_SHOW_COMMAND(workhead, db_show_workhead)
14492{
14493	struct workhead *wkhd;
14494	struct worklist *wk;
14495	int i;
14496
14497	if (have_addr == 0) {
14498		db_printf("Address required\n");
14499		return;
14500	}
14501	wkhd = (struct workhead *)addr;
14502	wk = LIST_FIRST(wkhd);
14503	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
14504		db_printf("worklist: %p type %s state 0x%X",
14505		    wk, TYPENAME(wk->wk_type), wk->wk_state);
14506	if (i == 100)
14507		db_printf("workhead overflow");
14508	printf("\n");
14509}
14510
14511
14512DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
14513{
14514	struct mkdirlist *mkdirlisthd;
14515	struct jaddref *jaddref;
14516	struct diradd *diradd;
14517	struct mkdir *mkdir;
14518
14519	if (have_addr == 0) {
14520		db_printf("Address required\n");
14521		return;
14522	}
14523	mkdirlisthd = (struct mkdirlist *)addr;
14524	LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14525		diradd = mkdir->md_diradd;
14526		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
14527		    mkdir, mkdir->md_state, diradd, diradd->da_state);
14528		if ((jaddref = mkdir->md_jaddref) != NULL)
14529			db_printf(" jaddref %p jaddref state 0x%X",
14530			    jaddref, jaddref->ja_state);
14531		db_printf("\n");
14532	}
14533}
14534
14535/* exported to ffs_vfsops.c */
14536extern void db_print_ffs(struct ufsmount *ump);
14537void
14538db_print_ffs(struct ufsmount *ump)
14539{
14540	db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
14541	    ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
14542	    ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
14543	    ump->softdep_deps, ump->softdep_req);
14544}
14545
14546#endif /* DDB */
14547
14548#endif /* SOFTUPDATES */
14549