/*-
 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * Further information about soft updates can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 144585 2005-04-03 10:29:55Z jeff $");

/*
 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
 */
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kdb.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

#include "opt_ffs.h"

#ifndef SOFTUPDATES

int
softdep_flushfiles(oldmnt, flags, td)
	struct mount *oldmnt;
	int flags;
	struct thread *td;
{

	panic("softdep_flushfiles called");
}

int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{

	return (0);
}

void
softdep_initialize()
{

	return;
}

void
softdep_uninitialize()
{

	return;
}

void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;
	struct inode *ip;
	ino_t newinum;
{

	panic("softdep_setup_inomapdep called");
}

void
softdep_setup_blkmapdep(bp, fs, newblkno)
	struct buf *bp;
	struct fs *fs;
	ufs2_daddr_t newblkno;
{

	panic("softdep_setup_blkmapdep called");
}

void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{

	panic("softdep_setup_allocdirect called");
}

void
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{

	panic("softdep_setup_allocext called");
}

void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;
	ufs_lbn_t lbn;
	struct buf *bp;
	int ptrno;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	struct buf *nbp;
{

	panic("softdep_setup_allocindir_page called");
}

void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;
	struct inode *ip;
	struct buf *bp;
	int ptrno;
	ufs2_daddr_t newblkno;
{

	panic("softdep_setup_allocindir_meta called");
}

void
softdep_setup_freeblocks(ip, length, flags)
	struct inode *ip;
	off_t length;
	int flags;
{

	panic("softdep_setup_freeblocks called");
}

void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;
	ino_t ino;
	int mode;
{

	panic("softdep_freefile called");
}

int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
	struct buf *bp;
	struct inode *dp;
	off_t diroffset;
	ino_t newinum;
	struct buf *newdirbp;
	int isnewblk;
{

	panic("softdep_setup_directory_add called");
}

void
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
	struct inode *dp;
	caddr_t base;
	caddr_t oldloc;
	caddr_t newloc;
	int entrysize;
{

	panic("softdep_change_directoryentry_offset called");
}

void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;
	struct inode *dp;
	struct inode *ip;
	int isrmdir;
{

	panic("softdep_setup_remove called");
}

void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;
	struct inode *dp;
	struct inode *ip;
	ino_t newinum;
	int isrmdir;
{

	panic("softdep_setup_directory_change called");
}

void
softdep_change_linkcnt(ip)
	struct inode *ip;
{

	panic("softdep_change_linkcnt called");
}

void
softdep_load_inodeblock(ip)
	struct inode *ip;
{

	panic("softdep_load_inodeblock called");
}

void
softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;
	struct buf *bp;
	int waitfor;
{

	panic("softdep_update_inodeblock called");
}

int
softdep_fsync(vp)
	struct vnode *vp;	/* the "in_core" copy of the inode */
{

	return (0);
}

void
softdep_fsync_mountdev(vp)
	struct vnode *vp;
{

	return;
}

int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{

	*countp = 0;
	return (0);
}

int
softdep_sync_metadata(struct vnode *vp)
{

	return (0);
}

int
softdep_slowdown(vp)
	struct vnode *vp;
{

	panic("softdep_slowdown called");
}

void
softdep_releasefile(ip)
	struct inode *ip;	/* inode with the zero effective link count */
{

	panic("softdep_releasefile called");
}

int
softdep_request_cleanup(fs, vp)
	struct fs *fs;
	struct vnode *vp;
{

	return (0);
}

#else
/*
 * These definitions need to be adapted to the system to which
 * this file is being ported.
 */
/*
 * malloc types defined for the softdep system.
 */
static MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
static MALLOC_DEFINE(M_INODEDEP, "inodedep", "Inode dependencies");
static MALLOC_DEFINE(M_NEWBLK, "newblk", "New block allocation");
static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap", "Block or frag allocated from cyl group map");
static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect", "Block or frag dependency for an inode");
static MALLOC_DEFINE(M_INDIRDEP, "indirdep", "Indirect block dependencies");
static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir", "Block dependency for an indirect block");
static MALLOC_DEFINE(M_FREEFRAG, "freefrag", "Previously used frag for an inode");
static MALLOC_DEFINE(M_FREEBLKS, "freeblks", "Blocks freed from an inode");
static MALLOC_DEFINE(M_FREEFILE, "freefile", "Inode deallocated");
static MALLOC_DEFINE(M_DIRADD, "diradd", "New directory entry");
static MALLOC_DEFINE(M_MKDIR, "mkdir", "New directory");
static MALLOC_DEFINE(M_DIRREM, "dirrem", "Directory entry deleted");
static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk", "Unclaimed new directory block");
static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");

#define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)

#define	D_PAGEDEP	0
#define	D_INODEDEP	1
#define	D_NEWBLK	2
#define	D_BMSAFEMAP	3
#define	D_ALLOCDIRECT	4
#define	D_INDIRDEP	5
#define	D_ALLOCINDIR	6
#define	D_FREEFRAG	7
#define	D_FREEBLKS	8
#define	D_FREEFILE	9
#define	D_DIRADD	10
#define	D_MKDIR		11
#define	D_DIRREM	12
#define	D_NEWDIRBLK	13
#define	D_LAST		D_NEWDIRBLK

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_NEWBLK,
	M_BMSAFEMAP,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM,
	M_NEWDIRBLK
};

#define DtoM(type) (memtype[type])
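
/*
 * A compile-time check along the following lines (illustrative sketch,
 * not part of the original source) would enforce the required
 * correspondence between the D_XXX constants and the memtype[] table:
 *
 *	CTASSERT(sizeof(memtype) / sizeof(memtype[0]) == D_LAST + 1);
 */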

/*
 * Names of malloc types.
 */
#define TYPENAME(type)  \
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
/*
 * End system adaptation definitions.
 */

/*
 * Forward declarations.
 */
struct inodedep_hashhead;
struct newblk_hashhead;
struct pagedep_hashhead;

/*
 * Internal function prototypes.
 */
static	void softdep_error(char *, int);
static	void drain_output(struct vnode *);
static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
static	void clear_remove(struct thread *);
static	void clear_inodedeps(struct thread *);
static	int flush_pagedep_deps(struct vnode *, struct mount *,
	    struct diraddhd *);
static	int flush_inodedep_deps(struct fs *, ino_t);
static	int flush_deplist(struct allocdirectlst *, int, int *);
static	int handle_written_filepage(struct pagedep *, struct buf *);
static	void diradd_inode_written(struct diradd *, struct inodedep *);
static	int handle_written_inodeblock(struct inodedep *, struct buf *);
static	void handle_allocdirect_partdone(struct allocdirect *);
static	void handle_allocindir_partdone(struct allocindir *);
static	void initiate_write_filepage(struct pagedep *, struct buf *);
static	void handle_written_mkdir(struct mkdir *, int);
static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static	void handle_workitem_freefile(struct freefile *);
static	void handle_workitem_remove(struct dirrem *, struct vnode *);
static	struct dirrem *newdirrem(struct buf *, struct inode *,
	    struct inode *, int, struct dirrem **);
static	void free_diradd(struct diradd *);
static	void free_allocindir(struct allocindir *, struct inodedep *);
static	void free_newdirblk(struct newdirblk *);
static	int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
	    ufs2_daddr_t *);
static	void deallocate_dependencies(struct buf *, struct inodedep *);
static	void free_allocdirect(struct allocdirectlst *,
	    struct allocdirect *, int);
static	int check_inode_unwritten(struct inodedep *);
static	int free_inodedep(struct inodedep *);
static	void handle_workitem_freeblocks(struct freeblks *, int);
static	void merge_inode_lists(struct allocdirectlst *,
	    struct allocdirectlst *);
static	void setup_allocindir_phase2(struct buf *, struct inode *,
	    struct allocindir *);
static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
	    ufs2_daddr_t);
static	void handle_workitem_freefrag(struct freefrag *);
static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
static	void allocdirect_merge(struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *);
static	struct bmsafemap *bmsafemap_lookup(struct buf *);
static	int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
	    struct newblk **);
static	int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
	    struct inodedep **);
static	int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
static	int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
	    struct mount *, int, struct pagedep **);
static	void pause_timer(void *);
static	int request_cleanup(int);
static	int process_worklist_item(struct mount *, int);
static	void add_to_worklist(struct worklist *);

/*
 * Exported softdep operations.
 */
static	void softdep_disk_io_initiation(struct buf *);
static	void softdep_disk_write_complete(struct buf *);
static	void softdep_deallocate_dependencies(struct buf *);
static	int softdep_count_dependencies(struct buf *, int);

static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);

#define ACQUIRE_LOCK(lk)		mtx_lock(lk)
#define FREE_LOCK(lk)			mtx_unlock(lk)
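
/*
 * A sketch of the blocking-allocation idiom used with these macros
 * throughout this file (see pagedep_lookup et al. below): the lock may
 * not be held across a sleeping MALLOC, so it is dropped around the
 * allocation and the lookup is retried under the re-acquired lock in
 * case another thread won the race:
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (xxx_find(...) succeeds)
 *		return existing entry;
 *	FREE_LOCK(&lk);
 *	MALLOC(item, ..., M_SOFTDEP_FLAGS);
 *	ACQUIRE_LOCK(&lk);
 *	if (xxx_find(...) succeeds now)
 *		FREE(item, ...);	(lost the race; discard ours)
 *	else
 *		initialize and hash the new item;
 */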

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))

#else /* DEBUG */
static	void worklist_insert(struct workhead *, struct worklist *);
static	void worklist_remove(struct worklist *);
static	void workitem_free(struct worklist *, int);

#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)

static void
worklist_insert(head, item)
	struct workhead *head;
	struct worklist *item;
{

	mtx_assert(&lk, MA_OWNED);
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: already on list");
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item)
	struct worklist *item;
{

	mtx_assert(&lk, MA_OWNED);
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: not on list");
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{

	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: still on list");
	if (item->wk_type != type)
		panic("workitem_free: type mismatch");
	FREE(item, DtoM(type));
}
#endif /* DEBUG */
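
/*
 * Typical use of the worklist macros (illustrative sketch): callers
 * hold the softdep lock and attach a dependency structure to a
 * buffer's dependency list, as softdep_setup_allocdirect does below:
 *
 *	ACQUIRE_LOCK(&lk);
 *	...
 *	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 *	...
 *	FREE_LOCK(&lk);
 */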

/*
 * Workitem queue management
 */
static struct workhead softdep_workitem_pending;
static struct worklist *worklist_tail;
static int num_on_worklist;	/* number of worklist items to be processed */
static int softdep_worklist_busy; /* 1 => trying to do unmount */
static int softdep_worklist_req; /* serialized waiters */
static int max_softdeps;	/* maximum number of structs before slowdown */
static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static int *stat_countp;	/* statistic to count in proc_waiting timeout */
static struct callout_handle handle; /* handle on posted proc_waiting timeout */
static struct thread *filesys_syncer; /* proc of filesystem syncer process */
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
#define FLUSH_INODES		1
static int req_clear_remove;	/* syncer process flush some freeblks */
#define FLUSH_REMOVE		2
#define FLUSH_REMOVE_WAIT	3
/*
 * runtime statistics
 */
static int stat_worklist_push;	/* number of worklist cleanups */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0, "");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0, "");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
#endif /* DEBUG */

SYSCTL_DECL(_vfs_ffs);

static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
	   &compute_summary_at_mount, 0, "Recompute summary at mount");

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{

	mtx_assert(&lk, MA_OWNED);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	worklist_tail = wk;
	num_on_worklist += 1;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that items are processed in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 */
int
softdep_process_worklist(matchmnt)
	struct mount *matchmnt;
{
	struct thread *td = curthread;
	int cnt, matchcnt, loopcount;
	long starttime;

	/*
	 * Record the process identifier of our caller so that we can give
	 * this process preferential treatment in request_cleanup below.
	 */
	filesys_syncer = td;
	matchcnt = 0;

	/*
	 * There is no danger of having multiple processes run this
	 * code, but we have to single-thread it when softdep_flushfiles()
	 * is in operation to get an accurate count of the number of items
	 * related to its mount point that are in the list.
	 */
	ACQUIRE_LOCK(&lk);
	if (matchmnt == NULL) {
		if (softdep_worklist_busy < 0) {
			FREE_LOCK(&lk);
			return (-1);
		}
		softdep_worklist_busy += 1;
	}

	/*
	 * If requested, try removing inode or removal dependencies.
	 */
	if (req_clear_inodedeps) {
		clear_inodedeps(td);
		req_clear_inodedeps -= 1;
		wakeup_one(&proc_waiting);
	}
	if (req_clear_remove) {
		clear_remove(td);
		req_clear_remove -= 1;
		wakeup_one(&proc_waiting);
	}
	loopcount = 1;
	starttime = time_second;
	while (num_on_worklist > 0) {
		if ((cnt = process_worklist_item(matchmnt, 0)) == -1)
			break;
		else
			matchcnt += cnt;

		/*
		 * If a umount operation wants to run the worklist
		 * accurately, abort.
		 */
		if (softdep_worklist_req && matchmnt == NULL) {
			matchcnt = -1;
			break;
		}

		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(td);
			req_clear_inodedeps -= 1;
			wakeup_one(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(td);
			req_clear_remove -= 1;
			wakeup_one(&proc_waiting);
		}
		/*
		 * We do not generally want to stop for buffer space, but if
		 * we are really being a buffer hog, we will stop and wait.
		 */
		if (loopcount++ % 128 == 0) {
			FREE_LOCK(&lk);
			bwillwrite();
			ACQUIRE_LOCK(&lk);
		}
		/*
		 * Never allow processing to run for more than one
		 * second. Otherwise the other syncer tasks may get
		 * excessively backlogged.
		 */
		if (starttime != time_second && matchmnt == NULL) {
			matchcnt = -1;
			break;
		}
	}
	if (matchmnt == NULL) {
		softdep_worklist_busy -= 1;
		if (softdep_worklist_req && softdep_worklist_busy == 0)
			wakeup(&softdep_worklist_req);
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Process one item on the worklist.
 */
static int
process_worklist_item(matchmnt, flags)
	struct mount *matchmnt;
	int flags;
{
	struct worklist *wk, *wkend;
	struct mount *mp;
	struct vnode *vp;
	int matchcnt = 0;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to write as we may
	 * recurse into the copy-on-write routine.
	 */
	if (curthread->td_pflags & TDP_COWINPROGRESS)
		return (-1);
	/*
	 * Normally we just process each item on the worklist in order.
	 * However, if we are in a situation where we cannot lock any
	 * inodes, we have to skip over any dirrem requests whose
	 * vnodes are resident and locked.
	 */
	vp = NULL;
	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
		if (wk->wk_state & INPROGRESS)
			continue;
		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
			break;
		wk->wk_state |= INPROGRESS;
		FREE_LOCK(&lk);
		ffs_vget(WK_DIRREM(wk)->dm_mnt, WK_DIRREM(wk)->dm_oldinum,
		    LK_NOWAIT | LK_EXCLUSIVE, &vp);
		ACQUIRE_LOCK(&lk);
		wk->wk_state &= ~INPROGRESS;
		if (vp != NULL)
			break;
	}
	if (wk == NULL)
		return (-1);
	/*
	 * Remove the item to be processed. If we are removing the last
	 * item on the list, we need to recalculate the tail pointer.
	 * As this happens rarely and usually when the list is short,
	 * we just run down the list to find it rather than tracking it
	 * in the above loop.
	 */
	WORKLIST_REMOVE(wk);
	if (wk == worklist_tail) {
		LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
			if (LIST_NEXT(wkend, wk_list) == NULL)
				break;
		worklist_tail = wkend;
	}
	num_on_worklist -= 1;
	FREE_LOCK(&lk);
	switch (wk->wk_type) {

	case D_DIRREM:
		/* removal of a directory entry */
		mp = WK_DIRREM(wk)->dm_mnt;
		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
			panic("%s: dirrem on suspended filesystem",
				"process_worklist_item");
		if (mp == matchmnt)
			matchcnt += 1;
		handle_workitem_remove(WK_DIRREM(wk), vp);
		break;

	case D_FREEBLKS:
		/* releasing blocks and/or fragments from a file */
		mp = WK_FREEBLKS(wk)->fb_mnt;
		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
			panic("%s: freeblks on suspended filesystem",
				"process_worklist_item");
		if (mp == matchmnt)
			matchcnt += 1;
		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
		break;

	case D_FREEFRAG:
		/* releasing a fragment when replaced as a file grows */
		mp = WK_FREEFRAG(wk)->ff_mnt;
		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
			panic("%s: freefrag on suspended filesystem",
				"process_worklist_item");
		if (mp == matchmnt)
			matchcnt += 1;
		handle_workitem_freefrag(WK_FREEFRAG(wk));
		break;

	case D_FREEFILE:
		/* releasing an inode when its link count drops to 0 */
		mp = WK_FREEFILE(wk)->fx_mnt;
		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
			panic("%s: freefile on suspended filesystem",
				"process_worklist_item");
		if (mp == matchmnt)
			matchcnt += 1;
		handle_workitem_freefile(WK_FREEFILE(wk));
		break;

	default:
		panic("%s_process_worklist: Unknown type %s",
		    "softdep", TYPENAME(wk->wk_type));
		/* NOTREACHED */
	}
	ACQUIRE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Move dependencies from one buffer to another.
 */
void
softdep_move_dependencies(oldbp, newbp)
	struct buf *oldbp;
	struct buf *newbp;
{
	struct worklist *wk, *wktail;

	if (LIST_FIRST(&newbp->b_dep) != NULL)
		panic("softdep_move_dependencies: need merge code");
	wktail = NULL;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
		LIST_REMOVE(wk, wk_list);
		if (wktail == NULL)
			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
		else
			LIST_INSERT_AFTER(wktail, wk, wk_list);
		wktail = wk;
	}
	FREE_LOCK(&lk);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushworklist(oldmnt, countp, td)
	struct mount *oldmnt;
	int *countp;
	struct thread *td;
{
	struct vnode *devvp;
	int count, error = 0;

	/*
	 * Await our turn to clear out the queue, then serialize access.
	 */
	ACQUIRE_LOCK(&lk);
	while (softdep_worklist_busy) {
		softdep_worklist_req += 1;
		msleep(&softdep_worklist_req, &lk, PRIBIO, "softflush", 0);
		softdep_worklist_req -= 1;
	}
	softdep_worklist_busy = -1;
	FREE_LOCK(&lk);
	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. We continue until no more worklist dependencies
	 * are found.
	 */
	*countp = 0;
	devvp = VFSTOUFS(oldmnt)->um_devvp;
	while ((count = softdep_process_worklist(oldmnt)) > 0) {
		*countp += count;
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
		error = VOP_FSYNC(devvp, MNT_WAIT, td);
		VOP_UNLOCK(devvp, 0, td);
		if (error)
			break;
	}
	ACQUIRE_LOCK(&lk);
	softdep_worklist_busy = 0;
	if (softdep_worklist_req)
		wakeup(&softdep_worklist_req);
	FREE_LOCK(&lk);
	return (error);
}

/*
 * Flush all vnodes and worklist items associated with a specified mount point.
 */
int
softdep_flushfiles(oldmnt, flags, td)
	struct mount *oldmnt;
	int flags;
	struct thread *td;
{
	int error, count, loopcnt;

	error = 0;

	/*
	 * Alternately flush the vnodes associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop should iterate at most twice,
	 * but we give it a few extra passes just to be sure.
	 */
	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
		/*
		 * Do another flush in case any vnodes were brought in
		 * as part of the cleanup operations.
		 */
		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
			break;
		if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
		    count == 0)
			break;
	}
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
#define NODELAY		0x0002	/* cannot do background work */
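
/*
 * Example (illustrative sketch) of the lookup-or-allocate convention
 * used by the routines below: with DEPALLOC set, a zero return means
 * the entry was just created (or, for pagedeps, is not yet associated
 * with any buffer), so the caller must finish any setup it requires:
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(fs, inum, DEPALLOC, &inodedep) == 0)
 *		... inodedep was allocated and hashed just now ...
 *	FREE_LOCK(&lk);
 */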

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])
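
/*
 * The bucket is chosen by summing a shifted-down copy of the mount
 * point pointer with the inode and logical block numbers, then masking
 * the sum with the table size minus one (pagedep_hash is one less than
 * a power of two, as set up by hashinit() in softdep_initialize()).
 */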

static int
pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
	struct pagedep_hashhead *pagedephd;
	ino_t ino;
	ufs_lbn_t lbn;
	struct mount *mp;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;

	LIST_FOREACH(pagedep, pagedephd, pd_hash)
		if (ino == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_mnt)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		if ((flags & DEPALLOC) != 0 &&
		    (pagedep->pd_state & ONWORKLIST) == 0)
			return (0);
		return (1);
	}
	*pagedeppp = NULL;
	return (0);
}
/*
 * Look up a pagedep. Return 1 if found, 0 otherwise. An entry that is
 * found while allocation is requested, but that is not yet associated
 * with any buffer (i.e., not on a worklist), is also reported as 0.
 * If not found, allocate if the DEPALLOC flag is passed.
 * The found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(ip, lbn, flags, pagedeppp)
	struct inode *ip;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	struct mount *mp;
	int ret;
	int i;

	mtx_assert(&lk, MA_OWNED);
	mp = ITOV(ip)->v_mount;
	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);

	ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
	if (*pagedeppp || (flags & DEPALLOC) == 0)
		return (ret);
	FREE_LOCK(&lk);
	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
	ACQUIRE_LOCK(&lk);
	ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
	if (*pagedeppp) {
		FREE(pagedep, M_PAGEDEP);
		return (ret);
	}
	pagedep->pd_list.wk_type = D_PAGEDEP;
	pagedep->pd_mnt = mp;
	pagedep->pd_ino = ip->i_number;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
static long	num_inodedep;	/* number of inodedep allocated */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])

static int
inodedep_find(inodedephd, fs, inum, inodedeppp)
	struct inodedep_hashhead *inodedephd;
	struct fs *fs;
	ino_t inum;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;

	LIST_FOREACH(inodedep, inodedephd, id_hash)
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	*inodedeppp = NULL;

	return (0);
}
/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(fs, inum, flags, inodedeppp)
	struct fs *fs;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;

	mtx_assert(&lk, MA_OWNED);
	inodedephd = INODEDEP_HASH(fs, inum);

	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	/*
	 * If we are over our limit, try to improve the situation.
	 */
	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
		request_cleanup(FLUSH_INODES);
	FREE_LOCK(&lk);
	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
		M_INODEDEP, M_SOFTDEP_FLAGS);
	ACQUIRE_LOCK(&lk);
	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
		FREE(inodedep, M_INODEDEP);
		return (1);
	}
	num_inodedep += 1;
	inodedep->id_list.wk_type = D_INODEDEP;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino1 = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_savedextsize = -1;
	inodedep->id_buf = NULL;
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	TAILQ_INIT(&inodedep->id_extupdt);
	TAILQ_INIT(&inodedep->id_newextupdt);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])

static int
newblk_find(newblkhd, fs, newblkno, newblkpp)
	struct newblk_hashhead *newblkhd;
	struct fs *fs;
	ufs2_daddr_t newblkno;
	struct newblk **newblkpp;
{
	struct newblk *newblk;

	LIST_FOREACH(newblk, newblkhd, nb_hash)
		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
			break;
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	*newblkpp = NULL;
	return (0);
}

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(fs, newblkno, flags, newblkpp)
	struct fs *fs;
	ufs2_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(fs, newblkno);
	if (newblk_find(newblkhd, fs, newblkno, newblkpp))
		return (1);
	if ((flags & DEPALLOC) == 0)
		return (0);
	FREE_LOCK(&lk);
	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
		M_NEWBLK, M_SOFTDEP_FLAGS);
	ACQUIRE_LOCK(&lk);
	if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
		FREE(newblk, M_NEWBLK);
		return (1);
	}
	newblk->nb_state = 0;
	newblk->nb_fs = fs;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during filesystem subsystem initialization, before
 * any filesystems are mounted.
 */
void
softdep_initialize()
{

	LIST_INIT(&mkdirlisthd);
	LIST_INIT(&softdep_workitem_pending);
	max_softdeps = desiredvnodes * 4;
	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	    &pagedep_hash);
	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);

	/* hooks through which the main kernel code calls us */
	softdep_process_worklist_hook = softdep_process_worklist;

	/* initialise bioops hack */
	bioops.io_start = softdep_disk_io_initiation;
	bioops.io_complete = softdep_disk_write_complete;
	bioops.io_deallocate = softdep_deallocate_dependencies;
	bioops.io_countdeps = softdep_count_dependencies;
}

/*
 * Executed after all filesystems have been unmounted during
 * filesystem module unload.
 */
void
softdep_uninitialize()
{

	softdep_process_worklist_hook = NULL;
	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
}

/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum_total cstotal;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	mp->mnt_flag &= ~MNT_ASYNC;
	mp->mnt_flag |= MNT_SOFTDEP;
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync. Recomputation
	 * can take a long time and can be deferred for background
	 * fsck.  However, the old behavior of scanning the cylinder
	 * groups and recalculating them at mount time is available
	 * by setting vfs.ffs.compute_summary_at_mount to one.
	 */
	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a filesystem
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs filesystem maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not.  (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */
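
/*
 * In summary (a sketch of the required write ordering):
 *
 *	allocation:	the bitmap block marking a block or inode in use
 *			is written before any pointer to it.
 *	freeing:	all on-disk pointers are reset before the bitmap
 *			block marking the block or inode free is written.
 *
 * The inodedep and newblk dependencies set up below enforce the first
 * rule; the freeblks/freefile machinery described later enforces the
 * second.
 */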

/*
 * Called just after updating the cylinder group block to allocate an inode.
 */
void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;		/* buffer for cylgroup block with inode map */
	struct inode *ip;	/* inode related to allocation */
	ino_t newinum;		/* new inode number being allocated */
{
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated inode.
	 * Panic if it already exists as something is seriously wrong.
	 * Otherwise add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep)))
		panic("softdep_setup_inomapdep: found inode");
	inodedep->id_buf = bp;
	inodedep->id_state &= ~DEPCOMPLETE;
	bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

/*
 * Called just after updating the cylinder group block to
 * allocate block or fragment.
 */
void
softdep_setup_blkmapdep(bp, fs, newblkno)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct fs *fs;		/* filesystem doing allocation */
	ufs2_daddr_t newblkno;	/* number of newly allocated block */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
		panic("softdep_setup_blkmapdep: found block");
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	FREE_LOCK(&lk);
}

/*
 * Find the bmsafemap associated with a cylinder group buffer.
 * If none exists, create one. The buffer must be locked when
 * this routine is called and this routine must be called with
 * splbio interrupts blocked.
 */
static struct bmsafemap *
bmsafemap_lookup(bp)
	struct buf *bp;
{
	struct bmsafemap *bmsafemap;
	struct worklist *wk;

	mtx_assert(&lk, MA_OWNED);
	LIST_FOREACH(wk, &bp->b_dep, wk_list)
		if (wk->wk_type == D_BMSAFEMAP)
			return (WK_BMSAFEMAP(wk));
	FREE_LOCK(&lk);
	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	bmsafemap->sm_list.wk_state = 0;
	bmsafemap->sm_buf = bp;
	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	LIST_INIT(&bmsafemap->sm_allocindirhd);
	LIST_INIT(&bmsafemap->sm_inodedephd);
	LIST_INIT(&bmsafemap->sm_newblkhd);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	return (bmsafemap);
}

/*
 * Direct block allocation dependencies.
 *
 * When a new block is allocated, the corresponding disk locations must be
 * initialized (with zeros or new data) before the on-disk inode points to
 * them.  Also, the freemap from which the block was allocated must be
 * updated (on disk) before the inode's pointer. These two dependencies are
 * independent of each other and are needed for all file blocks and indirect
 * blocks that are pointed to directly by the inode.  Just before the
 * "in-core" version of the inode is updated with a newly allocated block
 * number, a procedure (below) is called to setup allocation dependency
 * structures.  These structures are removed when the corresponding
 * dependencies are satisfied or when the block allocation becomes obsolete
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
 * fragment that gets upgraded).  All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete).  The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains.  This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
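
/*
 * An illustrative call site (hypothetical, modeled on ffs_balloc): once
 * ffs_alloc() has returned newblkno for logical block lbn, and just
 * before the in-core inode's block pointer is updated:
 *
 *	softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno,
 *	    nsize, osize, bp);
 *	dp->di_db[lbn] = newblkno;
 */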
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs2_daddr_t newblkno;	/* disk block number being added */
	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
	adp->ad_list.wk_type = D_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	LIST_INIT(&adp->ad_newdirblk);
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	ACQUIRE_LOCK(&lk);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");
	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	adp->ad_inodedep = inodedep;
	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct worklist *wk;
	struct freefrag *freefrag;
	struct newdirblk *newdirblk;

	mtx_assert(&lk, MA_OWNED);
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("%s %jd != new %jd || old size %ld != new %ld",
		    "allocdirect_merge: old blkno",
		    (intmax_t)newadp->ad_oldblkno,
		    (intmax_t)oldadp->ad_newblkno,
		    newadp->ad_oldsize, oldadp->ad_newsize);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_allocdirect. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		freefrag = newadp->ad_freefrag;
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	/*
	 * If we are tracking a new directory-block allocation,
	 * move it from the old allocdirect to the new allocdirect.
	 */
	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
		newdirblk = WK_NEWDIRBLK(wk);
		WORKLIST_REMOVE(&newdirblk->db_list);
		if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
			panic("allocdirect_merge: extra newdirblk");
		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
	}
	free_allocdirect(adphead, oldadp, 0);
}
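
/*
 * Worked example of the freefrag swap above (an illustrative sketch):
 * suppose a file's last fragment moves from B1 (its committed on-disk
 * location) to B2, giving oldadp { old B1, new B2, freefrag(B1) }.
 * Before that commits, the block moves again to B3, giving newadp
 * { old B2, new B3, freefrag(B2) }. The merge leaves newadp as
 * { old B1, new B3, freefrag(B1) }, and free_allocdirect() posts
 * freefrag(B2) at once: B2 never reached the disk, so it is safe to
 * free immediately, while B1 must survive until newadp's dependencies
 * are satisfied.
 */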

/*
 * Allocate a new freefrag structure if needed.
 */
static struct freefrag *
newfreefrag(ip, blkno, size)
	struct inode *ip;
	ufs2_daddr_t blkno;
	long size;
{
	struct freefrag *freefrag;
	struct fs *fs;

	if (blkno == 0)
		return (NULL);
	fs = ip->i_fs;
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
		M_FREEFRAG, M_SOFTDEP_FLAGS);
	freefrag->ff_list.wk_type = D_FREEFRAG;
	freefrag->ff_state = 0;
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_mnt = ITOV(ip)->v_mount;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;
	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct ufsmount *ump = VFSTOUFS(freefrag->ff_mnt);

	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
	    freefrag->ff_fragsize, freefrag->ff_inum);
	FREE(freefrag, M_FREEFRAG);
}

/*
 * Set up a dependency structure for an external attributes data block.
 * This routine follows much of the structure of softdep_setup_allocdirect.
 * See the description of softdep_setup_allocdirect above for details.
 */
void
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;
	ufs_lbn_t lbn;
	ufs2_daddr_t newblkno;
	ufs2_daddr_t oldblkno;
	long newsize;
	long oldsize;
	struct buf *bp;
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
	adp->ad_list.wk_type = D_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED | EXTDATA;
	LIST_INIT(&adp->ad_newdirblk);
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	ACQUIRE_LOCK(&lk);
	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocext: lost block");

	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
	adp->ad_inodedep = inodedep;

	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1742	if (lbn >= NXADDR)
1743		panic("softdep_setup_allocext: lbn %lld >= NXADDR",
1744		    (long long)lbn);
1745	/*
1746	 * The list of allocdirects must be kept sorted in ascending
1747	 * order so that the rollback routines can quickly determine the
1748	 * first uncommitted block (the size of the file stored on disk
1749	 * ends at the end of the lowest committed fragment, or if there
1750	 * are no fragments, at the end of the highest committed block).
1751	 * Since files generally grow, the typical case is that the new
1752	 * block is to be added at the end of the list. We speed this
1753	 * special case by checking against the last allocdirect in the
1754	 * list before laboriously traversing the list looking for the
1755	 * insertion point.
1756	 */
1757	adphead = &inodedep->id_newextupdt;
1758	oldadp = TAILQ_LAST(adphead, allocdirectlst);
1759	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1760		/* insert at end of list */
1761		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1762		if (oldadp != NULL && oldadp->ad_lbn == lbn)
1763			allocdirect_merge(adphead, adp, oldadp);
1764		FREE_LOCK(&lk);
1765		return;
1766	}
1767	TAILQ_FOREACH(oldadp, adphead, ad_next) {
1768		if (oldadp->ad_lbn >= lbn)
1769			break;
1770	}
1771	if (oldadp == NULL)
1772		panic("softdep_setup_allocext: lost entry");
1773	/* insert in middle of list */
1774	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1775	if (oldadp->ad_lbn == lbn)
1776		allocdirect_merge(adphead, adp, oldadp);
1777	FREE_LOCK(&lk);
1778}
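
/*
 * Illustrative sketch (not compiled): the sorted-insertion scheme used
 * by softdep_setup_allocext() above.  New entries usually carry the
 * highest logical block number, so the tail of the queue is checked
 * first and the full walk is done only for out-of-order inserts.  The
 * "entry" structure is an assumption standing in for an allocdirect.
 */
#if 0
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	long lbn;			/* logical block number (sort key) */
	TAILQ_ENTRY(entry) next;
};
TAILQ_HEAD(entrylist, entry);

static void
insert_sorted(struct entrylist *head, struct entry *ep)
{
	struct entry *old;

	/* Fast path: files usually grow, so new lbns go at the tail. */
	old = TAILQ_LAST(head, entrylist);
	if (old == NULL || old->lbn <= ep->lbn) {
		TAILQ_INSERT_TAIL(head, ep, next);
		return;
	}
	/* Slow path: walk forward to the first entry with lbn >= ours. */
	TAILQ_FOREACH(old, head, next)
		if (old->lbn >= ep->lbn)
			break;
	TAILQ_INSERT_BEFORE(old, ep, next);
}

int
main(void)
{
	struct entrylist head = TAILQ_HEAD_INITIALIZER(head);
	long lbns[] = { 0, 1, 2, 5, 3 };	/* 3 exercises the slow path */
	struct entry *ep;
	size_t i;

	for (i = 0; i < sizeof(lbns) / sizeof(lbns[0]); i++) {
		ep = malloc(sizeof(*ep));
		ep->lbn = lbns[i];
		insert_sorted(&head, ep);
	}
	TAILQ_FOREACH(ep, &head, next)
		printf("%ld ", ep->lbn);	/* prints "0 1 2 3 5" */
	printf("\n");
	return (0);
}
#endif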
1779
1780/*
1781 * Indirect block allocation dependencies.
1782 *
1783 * The same dependencies that exist for a direct block also exist when
1784 * a new block is allocated and pointed to by an entry in a block of
1785 * indirect pointers. The undo/redo states described above are also
1786 * used here. Because an indirect block contains many pointers that
1787 * may have dependencies, a second copy of the entire in-memory indirect
1788 * block is kept. The buffer cache copy is always completely up-to-date.
1789 * The second copy, which is used only as a source for disk writes,
1790 * contains only the safe pointers (i.e., those that have no remaining
1791 * update dependencies). The second copy is freed when all pointers
1792 * are safe. The cache is not allowed to replace indirect blocks with
1793 * pending update dependencies. If a buffer containing an indirect
1794 * block with dependencies is written, these routines will mark it
1795 * dirty again. It can only be successfully written once all the
1796 * dependencies are removed. The ffs_fsync routine in conjunction with
1797 * softdep_sync_metadata work together to get all the dependencies
1798 * removed so that a file can be successfully written to disk. Three
1799 * procedures are used when setting up indirect block pointer
1800 * dependencies. The division is necessary because of the organization
1801 * of the "balloc" routine and because of the distinction between file
1802 * pages and file metadata blocks.
1803 */
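
/*
 * Illustrative sketch (not compiled) of the two-copy scheme described
 * above: the buffer-cache copy always holds the new pointers, while
 * the save copy written to disk holds the rolled-back (old) value for
 * any slot whose dependency is not yet complete.  All names here are
 * assumptions for the example, not kernel interfaces.
 */
#if 0
#include <stdio.h>

#define NPTRS	4

int
main(void)
{
	long cachecopy[NPTRS] = { 100, 200, 300, 400 };	/* new pointers */
	long oldptr[NPTRS]    = {   0,  20,   0,  40 };	/* prior contents */
	int  complete[NPTRS]  = {   1,   0,   1,   0 };	/* dep complete? */
	long savecopy[NPTRS];				/* what goes to disk */
	int i;

	for (i = 0; i < NPTRS; i++)
		savecopy[i] = complete[i] ? cachecopy[i] : oldptr[i];
	for (i = 0; i < NPTRS; i++)
		printf("slot %d: cache %ld, disk %ld\n",
		    i, cachecopy[i], savecopy[i]);
	/* Slots 1 and 3 are written as 20 and 40 until their deps clear. */
	return (0);
}
#endif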
1804
1805/*
1806 * Allocate a new allocindir structure.
1807 */
1808static struct allocindir *
1809newallocindir(ip, ptrno, newblkno, oldblkno)
1810	struct inode *ip;	/* inode for file being extended */
1811	int ptrno;		/* offset of pointer in indirect block */
1812	ufs2_daddr_t newblkno;	/* disk block number being added */
1813	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
1814{
1815	struct allocindir *aip;
1816
1817	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1818		M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1819	aip->ai_list.wk_type = D_ALLOCINDIR;
1820	aip->ai_state = ATTACHED;
1821	aip->ai_offset = ptrno;
1822	aip->ai_newblkno = newblkno;
1823	aip->ai_oldblkno = oldblkno;
1824	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1825	return (aip);
1826}
1827
1828/*
1829 * Called just before setting an indirect block pointer
1830 * to a newly allocated file page.
1831 */
1832void
1833softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1834	struct inode *ip;	/* inode for file being extended */
1835	ufs_lbn_t lbn;		/* allocated block number within file */
1836	struct buf *bp;		/* buffer with indirect blk referencing page */
1837	int ptrno;		/* offset of pointer in indirect block */
1838	ufs2_daddr_t newblkno;	/* disk block number being added */
1839	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
1840	struct buf *nbp;	/* buffer holding allocated page */
1841{
1842	struct allocindir *aip;
1843	struct pagedep *pagedep;
1844
1845	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
1846	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1847	ACQUIRE_LOCK(&lk);
1848	/*
1849	 * If we are allocating a directory page, then we must
1850	 * allocate an associated pagedep to track additions and
1851	 * deletions.
1852	 */
1853	if ((ip->i_mode & IFMT) == IFDIR &&
1854	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1855		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1856	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1857	setup_allocindir_phase2(bp, ip, aip);
1858	FREE_LOCK(&lk);
1859}
1860
1861/*
1862 * Called just before setting an indirect block pointer to a
1863 * newly allocated indirect block.
1864 */
1865void
1866softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1867	struct buf *nbp;	/* newly allocated indirect block */
1868	struct inode *ip;	/* inode for file being extended */
1869	struct buf *bp;		/* indirect block referencing allocated block */
1870	int ptrno;		/* offset of pointer in indirect block */
1871	ufs2_daddr_t newblkno;	/* disk block number being added */
1872{
1873	struct allocindir *aip;
1874
1875	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
1876	aip = newallocindir(ip, ptrno, newblkno, 0);
1877	ACQUIRE_LOCK(&lk);
1878	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1879	setup_allocindir_phase2(bp, ip, aip);
1880	FREE_LOCK(&lk);
1881}
1882
1883/*
1884 * Called to finish the allocation of the "aip" allocated
1885 * by one of the two routines above.
1886 */
1887static void
1888setup_allocindir_phase2(bp, ip, aip)
1889	struct buf *bp;		/* in-memory copy of the indirect block */
1890	struct inode *ip;	/* inode for file being extended */
1891	struct allocindir *aip;	/* allocindir allocated by the above routines */
1892{
1893	struct worklist *wk;
1894	struct indirdep *indirdep, *newindirdep;
1895	struct bmsafemap *bmsafemap;
1896	struct allocindir *oldaip;
1897	struct freefrag *freefrag;
1898	struct newblk *newblk;
1899	ufs2_daddr_t blkno;
1900
1901	mtx_assert(&lk, MA_OWNED);
1902	if (bp->b_lblkno >= 0)
1903		panic("setup_allocindir_phase2: not indir blk");
1904	for (indirdep = NULL, newindirdep = NULL; ; ) {
1905		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1906			if (wk->wk_type != D_INDIRDEP)
1907				continue;
1908			indirdep = WK_INDIRDEP(wk);
1909			break;
1910		}
1911		if (indirdep == NULL && newindirdep) {
1912			indirdep = newindirdep;
1913			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1914			newindirdep = NULL;
1915		}
1916		if (indirdep) {
1917			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1918			    &newblk) == 0)
1919				panic("setup_allocindir: lost block");
1920			if (newblk->nb_state == DEPCOMPLETE) {
1921				aip->ai_state |= DEPCOMPLETE;
1922				aip->ai_buf = NULL;
1923			} else {
1924				bmsafemap = newblk->nb_bmsafemap;
1925				aip->ai_buf = bmsafemap->sm_buf;
1926				LIST_REMOVE(newblk, nb_deps);
1927				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1928				    aip, ai_deps);
1929			}
1930			LIST_REMOVE(newblk, nb_hash);
1931			FREE(newblk, M_NEWBLK);
1932			aip->ai_indirdep = indirdep;
1933			/*
1934			 * Check to see if there is an existing dependency
1935			 * for this block. If there is, merge the old
1936			 * dependency into the new one.
1937			 */
1938			if (aip->ai_oldblkno == 0)
1939				oldaip = NULL;
1940			else
1942				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1943					if (oldaip->ai_offset == aip->ai_offset)
1944						break;
1945			freefrag = NULL;
1946			if (oldaip != NULL) {
1947				if (oldaip->ai_newblkno != aip->ai_oldblkno)
1948					panic("setup_allocindir_phase2: blkno");
1949				aip->ai_oldblkno = oldaip->ai_oldblkno;
1950				freefrag = aip->ai_freefrag;
1951				aip->ai_freefrag = oldaip->ai_freefrag;
1952				oldaip->ai_freefrag = NULL;
1953				free_allocindir(oldaip, NULL);
1954			}
1955			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1956			if (ip->i_ump->um_fstype == UFS1)
1957				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
1958				    [aip->ai_offset] = aip->ai_oldblkno;
1959			else
1960				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
1961				    [aip->ai_offset] = aip->ai_oldblkno;
1962			FREE_LOCK(&lk);
1963			if (freefrag != NULL)
1964				handle_workitem_freefrag(freefrag);
1965		} else
1966			FREE_LOCK(&lk);
1967		if (newindirdep) {
1968			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
1969			brelse(newindirdep->ir_savebp);
1970			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1971		}
1972		if (indirdep) {
1973			ACQUIRE_LOCK(&lk);
1974			break;
1975		}
1976		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1977			M_INDIRDEP, M_SOFTDEP_FLAGS);
1978		newindirdep->ir_list.wk_type = D_INDIRDEP;
1979		newindirdep->ir_state = ATTACHED;
1980		if (ip->i_ump->um_fstype == UFS1)
1981			newindirdep->ir_state |= UFS1FMT;
1982		LIST_INIT(&newindirdep->ir_deplisthd);
1983		LIST_INIT(&newindirdep->ir_donehd);
1984		if (bp->b_blkno == bp->b_lblkno) {
1985			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
1986			    NULL, NULL);
1987			bp->b_blkno = blkno;
1988		}
1989		newindirdep->ir_savebp =
1990		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
1991		BUF_KERNPROC(newindirdep->ir_savebp);
1992		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1993		ACQUIRE_LOCK(&lk);
1994	}
1995}
1996
1997/*
1998 * Block de-allocation dependencies.
1999 *
2000 * When blocks are de-allocated, the on-disk pointers must be nullified before
2001 * the blocks are made available for use by other files.  (The true
2002 * requirement is that old pointers must be nullified before new on-disk
2003 * pointers are set.  We chose this slightly more stringent requirement to
2004 * reduce complexity.) Our implementation handles this dependency by updating
2005 * the inode (or indirect block) appropriately but delaying the actual block
2006 * de-allocation (i.e., freemap and free space count manipulation) until
2007 * after the updated versions reach stable storage.  After the disk is
2008 * updated, the blocks can be safely de-allocated whenever it is convenient.
2009 * This implementation handles only the common case of reducing a file's
2010 * length to zero. Other cases are handled by the conventional synchronous
2011 * write approach.
2012 *
2013 * The ffs implementation with which we worked double-checks
2014 * the state of the block pointers and file size as it reduces
2015 * a file's length.  Some of this code is replicated here in our
2016 * soft updates implementation.  The freeblks->fb_chkcnt field is
2017 * used to transfer a part of this information to the procedure
2018 * that eventually de-allocates the blocks.
2019 *
2020 * This routine should be called from the routine that shortens
2021 * a file's length, before the inode's size or block pointers
2022 * are modified. It will save the block pointer information for
2023 * later release and zero the inode so that the calling routine
2024 * can release it.
2025 */
2026void
2027softdep_setup_freeblocks(ip, length, flags)
2028	struct inode *ip;	/* The inode whose length is to be reduced */
2029	off_t length;		/* The new length for the file */
2030	int flags;		/* IO_EXT and/or IO_NORMAL */
2031{
2032	struct freeblks *freeblks;
2033	struct inodedep *inodedep;
2034	struct allocdirect *adp;
2035	struct vnode *vp;
2036	struct buf *bp;
2037	struct fs *fs;
2038	ufs2_daddr_t extblocks, datablocks;
2039	int i, delay, error;
2040
2041	fs = ip->i_fs;
2042	if (length != 0)
2043		panic("softdep_setup_freeblocks: non-zero length");
2044	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
2045		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
2046	freeblks->fb_list.wk_type = D_FREEBLKS;
2047	freeblks->fb_uid = ip->i_uid;
2048	freeblks->fb_previousinum = ip->i_number;
2049	freeblks->fb_devvp = ip->i_devvp;
2050	freeblks->fb_mnt = ITOV(ip)->v_mount;
2051	extblocks = 0;
2052	if (fs->fs_magic == FS_UFS2_MAGIC)
2053		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
2054	datablocks = DIP(ip, i_blocks) - extblocks;
2055	if ((flags & IO_NORMAL) == 0) {
2056		freeblks->fb_oldsize = 0;
2057		freeblks->fb_chkcnt = 0;
2058	} else {
2059		freeblks->fb_oldsize = ip->i_size;
2060		ip->i_size = 0;
2061		DIP_SET(ip, i_size, 0);
2062		freeblks->fb_chkcnt = datablocks;
2063		for (i = 0; i < NDADDR; i++) {
2064			freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
2065			DIP_SET(ip, i_db[i], 0);
2066		}
2067		for (i = 0; i < NIADDR; i++) {
2068			freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
2069			DIP_SET(ip, i_ib[i], 0);
2070		}
2071		/*
2072		 * If the file was removed, then the space being freed was
2073		 * accounted for then (see softdep_releasefile()). If the
2074		 * file is merely being truncated, then we account for it now.
2075		 */
2076		if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2077			UFS_LOCK(ip->i_ump);
2078			fs->fs_pendingblocks += datablocks;
2079			UFS_UNLOCK(ip->i_ump);
2080		}
2081	}
2082	if ((flags & IO_EXT) == 0) {
2083		freeblks->fb_oldextsize = 0;
2084	} else {
2085		freeblks->fb_oldextsize = ip->i_din2->di_extsize;
2086		ip->i_din2->di_extsize = 0;
2087		freeblks->fb_chkcnt += extblocks;
2088		for (i = 0; i < NXADDR; i++) {
2089			freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
2090			ip->i_din2->di_extb[i] = 0;
2091		}
2092	}
2093	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
2094	/*
2095	 * Push the zero'ed inode to its disk buffer so that we are free
2096	 * to delete its dependencies below. Once the dependencies are gone
2097	 * the buffer can be safely released.
2098	 */
2099	if ((error = bread(ip->i_devvp,
2100	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
2101	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
2102		brelse(bp);
2103		softdep_error("softdep_setup_freeblocks", error);
2104	}
2105	if (ip->i_ump->um_fstype == UFS1)
2106		*((struct ufs1_dinode *)bp->b_data +
2107		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
2108	else
2109		*((struct ufs2_dinode *)bp->b_data +
2110		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
2111	/*
2112	 * Find and eliminate any inode dependencies.
2113	 */
2114	ACQUIRE_LOCK(&lk);
2115	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
2116	if ((inodedep->id_state & IOSTARTED) != 0)
2117		panic("softdep_setup_freeblocks: inode busy");
2118	/*
2119	 * Add the freeblks structure to the list of operations that
2120	 * must await the zero'ed inode being written to disk. If we
2121	 * still have a bitmap dependency (delay == 0), then the inode
2122	 * has never been written to disk, so we can process the
2123	 * freeblks below once we have deleted the dependencies.
2124	 */
2125	delay = (inodedep->id_state & DEPCOMPLETE);
2126	if (delay)
2127		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
2128	/*
2129	 * Because the file length has been truncated to zero, any
2130	 * pending block allocation dependency structures associated
2131	 * with this inode are obsolete and can simply be de-allocated.
2132	 * We must first merge the two dependency lists to get rid of
2133	 * any duplicate freefrag structures, then purge the merged list.
2134	 * If we still have a bitmap dependency, then the inode has never
2135	 * been written to disk, so we can free any fragments without delay.
2136	 */
2137	if (flags & IO_NORMAL) {
2138		merge_inode_lists(&inodedep->id_newinoupdt,
2139		    &inodedep->id_inoupdt);
2140		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2141			free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2142	}
2143	if (flags & IO_EXT) {
2144		merge_inode_lists(&inodedep->id_newextupdt,
2145		    &inodedep->id_extupdt);
2146		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
2147			free_allocdirect(&inodedep->id_extupdt, adp, delay);
2148	}
2149	FREE_LOCK(&lk);
2150	bdwrite(bp);
2151	/*
2152	 * We must wait for any I/O in progress to finish so that
2153	 * all potential buffers on the dirty list will be visible.
2154	 * Once they are all there, walk the list and get rid of
2155	 * any dependencies.
2156	 */
2157	vp = ITOV(ip);
2158	VI_LOCK(vp);
2159	drain_output(vp);
2160restart:
2161	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
2162		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
2163		    ((flags & IO_NORMAL) == 0 &&
2164		      (bp->b_xflags & BX_ALTDATA) == 0))
2165			continue;
2166		if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL)
2167			goto restart;
2168		VI_UNLOCK(vp);
2169		ACQUIRE_LOCK(&lk);
2170		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
2171		deallocate_dependencies(bp, inodedep);
2172		FREE_LOCK(&lk);
2173		bp->b_flags |= B_INVAL | B_NOCACHE;
2174		brelse(bp);
2175		VI_LOCK(vp);
2176		goto restart;
2177	}
2178	VI_UNLOCK(vp);
2179	ACQUIRE_LOCK(&lk);
2180	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
2181		(void) free_inodedep(inodedep);
2182	FREE_LOCK(&lk);
2183	/*
2184	 * If the inode has never been written to disk (delay == 0),
2185	 * then we can process the freeblks now that we have deleted
2186	 * the dependencies.
2187	 */
2188	if (!delay)
2189		handle_workitem_freeblocks(freeblks, 0);
2190}
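
/*
 * Illustrative sketch (not compiled) of the ordering enforced above:
 * freed block numbers are queued on a work list and handed back to the
 * free map only after the zero'ed inode has reached stable storage.
 * The structures and the inode_written() callback are assumptions for
 * the example.
 */
#if 0
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct freework {
	long blkno;
	LIST_ENTRY(freework) next;
};
static LIST_HEAD(, freework) pending = LIST_HEAD_INITIALIZER(pending);

static void
defer_free(long blkno)		/* called while truncating the inode */
{
	struct freework *fw = malloc(sizeof(*fw));

	fw->blkno = blkno;
	LIST_INSERT_HEAD(&pending, fw, next);
}

static void
inode_written(void)		/* called after the inode write completes */
{
	struct freework *fw;

	while ((fw = LIST_FIRST(&pending)) != NULL) {
		LIST_REMOVE(fw, next);
		printf("block %ld may now be marked free\n", fw->blkno);
		free(fw);
	}
}

int
main(void)
{
	defer_free(128);
	defer_free(136);
	/* Nothing is freed until the on-disk pointers are nullified. */
	inode_written();
	return (0);
}
#endif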
2191
2192/*
2193 * Reclaim any dependency structures from a buffer that is about to
2194 * be reallocated to a new vnode. The buffer must be locked; thus,
2195 * no I/O completion operations can occur while we are manipulating
2196 * its associated dependencies. The mutex is held so that other I/Os
2197 * associated with related dependencies do not occur.
2198 */
2199static void
2200deallocate_dependencies(bp, inodedep)
2201	struct buf *bp;
2202	struct inodedep *inodedep;
2203{
2204	struct worklist *wk;
2205	struct indirdep *indirdep;
2206	struct allocindir *aip;
2207	struct pagedep *pagedep;
2208	struct dirrem *dirrem;
2209	struct diradd *dap;
2210	int i;
2211
2212	mtx_assert(&lk, MA_OWNED);
2213	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2214		switch (wk->wk_type) {
2215
2216		case D_INDIRDEP:
2217			indirdep = WK_INDIRDEP(wk);
2218			/*
2219			 * None of the indirect pointers will ever be visible,
2220			 * so they can simply be tossed. GOINGAWAY ensures
2221			 * that allocated pointers will be saved in the buffer
2222			 * cache until they are freed. Note that they will
2223			 * only be able to be found by their physical address
2224			 * since the inode mapping the logical address will
2225			 * be gone. The save buffer used for the safe copy
2226			 * was allocated in setup_allocindir_phase2 using
2227			 * the physical address so it could be used for this
2228			 * purpose. Hence we swap the safe copy with the real
2229			 * copy, allowing the safe copy to be freed and holding
2230			 * on to the real copy for later use in indir_trunc.
2231			 */
2232			if (indirdep->ir_state & GOINGAWAY)
2233				panic("deallocate_dependencies: already gone");
2234			indirdep->ir_state |= GOINGAWAY;
2235			VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
2236			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2237				free_allocindir(aip, inodedep);
2238			if (bp->b_lblkno >= 0 ||
2239			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
2240				panic("deallocate_dependencies: not indir");
2241			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2242			    bp->b_bcount);
2243			WORKLIST_REMOVE(wk);
2244			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2245			continue;
2246
2247		case D_PAGEDEP:
2248			pagedep = WK_PAGEDEP(wk);
2249			/*
2250			 * None of the directory additions will ever be
2251			 * visible, so they can simply be tossed.
2252			 */
2253			for (i = 0; i < DAHASHSZ; i++)
2254				while ((dap =
2255				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
2256					free_diradd(dap);
2257			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2258				free_diradd(dap);
2259			/*
2260			 * Copy any directory remove dependencies to the list
2261			 * to be processed after the zero'ed inode is written.
2262			 * If the inode has already been written, then they
2263			 * can be dumped directly onto the work list.
2264			 */
2265			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
2266				LIST_REMOVE(dirrem, dm_next);
2267				dirrem->dm_dirinum = pagedep->pd_ino;
2268				if (inodedep == NULL ||
2269				    (inodedep->id_state & ALLCOMPLETE) ==
2270				     ALLCOMPLETE)
2271					add_to_worklist(&dirrem->dm_list);
2272				else
2273					WORKLIST_INSERT(&inodedep->id_bufwait,
2274					    &dirrem->dm_list);
2275			}
2276			if ((pagedep->pd_state & NEWBLOCK) != 0) {
2277				LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2278					if (wk->wk_type == D_NEWDIRBLK &&
2279					    WK_NEWDIRBLK(wk)->db_pagedep ==
2280					      pagedep)
2281						break;
2282				if (wk != NULL) {
2283					WORKLIST_REMOVE(wk);
2284					free_newdirblk(WK_NEWDIRBLK(wk));
2285				} else
2286					panic("deallocate_dependencies: "
2287					      "lost pagedep");
2288			}
2289			WORKLIST_REMOVE(&pagedep->pd_list);
2290			LIST_REMOVE(pagedep, pd_hash);
2291			WORKITEM_FREE(pagedep, D_PAGEDEP);
2292			continue;
2293
2294		case D_ALLOCINDIR:
2295			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2296			continue;
2297
2298		case D_ALLOCDIRECT:
2299		case D_INODEDEP:
2300			panic("deallocate_dependencies: Unexpected type %s",
2301			    TYPENAME(wk->wk_type));
2302			/* NOTREACHED */
2303
2304		default:
2305			panic("deallocate_dependencies: Unknown type %s",
2306			    TYPENAME(wk->wk_type));
2307			/* NOTREACHED */
2308		}
2309	}
2310}
2311
2312/*
2313 * Free an allocdirect. Generate a new freefrag work request if appropriate.
2314 * This routine must be called with splbio interrupts blocked.
2315 */
2316static void
2317free_allocdirect(adphead, adp, delay)
2318	struct allocdirectlst *adphead;
2319	struct allocdirect *adp;
2320	int delay;
2321{
2322	struct newdirblk *newdirblk;
2323	struct worklist *wk;
2324
2325	mtx_assert(&lk, MA_OWNED);
2326	if ((adp->ad_state & DEPCOMPLETE) == 0)
2327		LIST_REMOVE(adp, ad_deps);
2328	TAILQ_REMOVE(adphead, adp, ad_next);
2329	if ((adp->ad_state & COMPLETE) == 0)
2330		WORKLIST_REMOVE(&adp->ad_list);
2331	if (adp->ad_freefrag != NULL) {
2332		if (delay)
2333			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2334			    &adp->ad_freefrag->ff_list);
2335		else
2336			add_to_worklist(&adp->ad_freefrag->ff_list);
2337	}
2338	if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2339		newdirblk = WK_NEWDIRBLK(wk);
2340		WORKLIST_REMOVE(&newdirblk->db_list);
2341		if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
2342			panic("free_allocdirect: extra newdirblk");
2343		if (delay)
2344			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2345			    &newdirblk->db_list);
2346		else
2347			free_newdirblk(newdirblk);
2348	}
2349	WORKITEM_FREE(adp, D_ALLOCDIRECT);
2350}
2351
2352/*
2353 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2354 * This routine must be called with splbio interrupts blocked.
2355 */
2356static void
2357free_newdirblk(newdirblk)
2358	struct newdirblk *newdirblk;
2359{
2360	struct pagedep *pagedep;
2361	struct diradd *dap;
2362	int i;
2363
2364	mtx_assert(&lk, MA_OWNED);
2365	/*
2366	 * If the pagedep is still linked onto the directory buffer
2367	 * dependency chain, then some of the entries on the
2368	 * pd_pendinghd list may not be committed to disk yet. In
2369	 * this case, we will simply clear the NEWBLOCK flag and
2370	 * let the pd_pendinghd list be processed when the pagedep
2371	 * is next written. If the pagedep is no longer on the buffer
2372	 * dependency chain, then all the entries on the pd_pendinghd
2373	 * list are committed to disk and we can free them here.
2374	 */
2375	pagedep = newdirblk->db_pagedep;
2376	pagedep->pd_state &= ~NEWBLOCK;
2377	if ((pagedep->pd_state & ONWORKLIST) == 0)
2378		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2379			free_diradd(dap);
2380	/*
2381	 * If no dependencies remain, the pagedep will be freed.
2382	 */
2383	for (i = 0; i < DAHASHSZ; i++)
2384		if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
2385			break;
2386	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2387		LIST_REMOVE(pagedep, pd_hash);
2388		WORKITEM_FREE(pagedep, D_PAGEDEP);
2389	}
2390	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2391}
2392
2393/*
2394 * Prepare an inode to be freed. The actual free operation is not
2395 * done until the zero'ed inode has been written to disk.
2396 */
2397void
2398softdep_freefile(pvp, ino, mode)
2399	struct vnode *pvp;
2400	ino_t ino;
2401	int mode;
2402{
2403	struct inode *ip = VTOI(pvp);
2404	struct inodedep *inodedep;
2405	struct freefile *freefile;
2406
2407	/*
2408	 * This sets up the inode de-allocation dependency.
2409	 */
2410	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
2411		M_FREEFILE, M_SOFTDEP_FLAGS);
2412	freefile->fx_list.wk_type = D_FREEFILE;
2413	freefile->fx_list.wk_state = 0;
2414	freefile->fx_mode = mode;
2415	freefile->fx_oldinum = ino;
2416	freefile->fx_devvp = ip->i_devvp;
2417	freefile->fx_mnt = ITOV(ip)->v_mount;
2418	if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2419		UFS_LOCK(ip->i_ump);
2420		ip->i_fs->fs_pendinginodes += 1;
2421		UFS_UNLOCK(ip->i_ump);
2422	}
2423
2424	/*
2425	 * If the inodedep does not exist, then the zero'ed inode has
2426	 * been written to disk. If the allocated inode has never been
2427	 * written to disk, then the on-disk inode is zero'ed. In either
2428	 * case we can free the file immediately.
2429	 */
2430	ACQUIRE_LOCK(&lk);
2431	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2432	    check_inode_unwritten(inodedep)) {
2433		FREE_LOCK(&lk);
2434		handle_workitem_freefile(freefile);
2435		return;
2436	}
2437	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2438	FREE_LOCK(&lk);
2439}
2440
2441/*
2442 * Check to see if an inode has never been written to disk. If
2443 * so, free the inodedep and return success; otherwise return failure.
2444 * This routine must be called with splbio interrupts blocked.
2445 *
2446 * If we still have a bitmap dependency, then the inode has never
2447 * been written to disk. Drop the dependency as it is no longer
2448 * necessary since the inode is being deallocated. We set the
2449 * ALLCOMPLETE flags since the bitmap now properly shows that the
2450 * inode is not allocated. Even if the inode is actively being
2451 * written, it has been rolled back to its zero'ed state, so we
2452 * are ensured that a zero inode is what is on the disk. For short
2453 * are assured that a zero inode is what is on the disk. For
2454 * short-lived files, this change will usually result in removing all the
2455 */
2456static int
2457check_inode_unwritten(inodedep)
2458	struct inodedep *inodedep;
2459{
2460
2461	mtx_assert(&lk, MA_OWNED);
2462	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2463	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2464	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2465	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
2466	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2467	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2468	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
2469	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
2470	    inodedep->id_nlinkdelta != 0)
2471		return (0);
2472	inodedep->id_state |= ALLCOMPLETE;
2473	LIST_REMOVE(inodedep, id_deps);
2474	inodedep->id_buf = NULL;
2475	if (inodedep->id_state & ONWORKLIST)
2476		WORKLIST_REMOVE(&inodedep->id_list);
2477	if (inodedep->id_savedino1 != NULL) {
2478		FREE(inodedep->id_savedino1, M_SAVEDINO);
2479		inodedep->id_savedino1 = NULL;
2480	}
2481	if (free_inodedep(inodedep) == 0)
2482		panic("check_inode_unwritten: busy inode");
2483	return (1);
2484}
2485
2486/*
2487 * Try to free an inodedep structure. Return 1 if it could be freed.
2488 */
2489static int
2490free_inodedep(inodedep)
2491	struct inodedep *inodedep;
2492{
2493
2494	mtx_assert(&lk, MA_OWNED);
2495	if ((inodedep->id_state & ONWORKLIST) != 0 ||
2496	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2497	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2498	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2499	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
2500	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2501	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2502	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
2503	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
2504	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2505		return (0);
2506	LIST_REMOVE(inodedep, id_hash);
2507	WORKITEM_FREE(inodedep, D_INODEDEP);
2508	num_inodedep -= 1;
2509	return (1);
2510}
2511
2512/*
2513 * This workitem routine performs the block de-allocation.
2514 * The workitem is added to the pending list after the updated
2515 * inode block has been written to disk.  As mentioned above,
2516 * checks regarding the number of blocks de-allocated (compared
2517 * to the number of blocks allocated for the file) are also
2518 * performed in this function.
2519 */
2520static void
2521handle_workitem_freeblocks(freeblks, flags)
2522	struct freeblks *freeblks;
2523	int flags;
2524{
2525	struct inode *ip;
2526	struct vnode *vp;
2527	struct fs *fs;
2528	struct ufsmount *ump;
2529	int i, nblocks, level, bsize;
2530	ufs2_daddr_t bn, blocksreleased = 0;
2531	int error, allerror = 0;
2532	ufs_lbn_t baselbns[NIADDR], tmpval;
2533	int fs_pendingblocks;
2534
2535	ump = VFSTOUFS(freeblks->fb_mnt);
2536	fs = ump->um_fs;
2537	fs_pendingblocks = 0;
2538	tmpval = 1;
2539	baselbns[0] = NDADDR;
2540	for (i = 1; i < NIADDR; i++) {
2541		tmpval *= NINDIR(fs);
2542		baselbns[i] = baselbns[i - 1] + tmpval;
2543	}
2544	nblocks = btodb(fs->fs_bsize);
2545	blocksreleased = 0;
2546	/*
2547	 * Release all extended attribute blocks or frags.
2548	 */
2549	if (freeblks->fb_oldextsize > 0) {
2550		for (i = (NXADDR - 1); i >= 0; i--) {
2551			if ((bn = freeblks->fb_eblks[i]) == 0)
2552				continue;
2553			bsize = sblksize(fs, freeblks->fb_oldextsize, i);
2554			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2555			    freeblks->fb_previousinum);
2556			blocksreleased += btodb(bsize);
2557		}
2558	}
2559	/*
2560	 * Release all data blocks or frags.
2561	 */
2562	if (freeblks->fb_oldsize > 0) {
2563		/*
2564		 * Indirect blocks first.
2565		 */
2566		for (level = (NIADDR - 1); level >= 0; level--) {
2567			if ((bn = freeblks->fb_iblks[level]) == 0)
2568				continue;
2569			if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
2570			    level, baselbns[level], &blocksreleased)) != 0)
2571				allerror = error;
2572			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
2573			    fs->fs_bsize, freeblks->fb_previousinum);
2574			fs_pendingblocks += nblocks;
2575			blocksreleased += nblocks;
2576		}
2577		/*
2578		 * All direct blocks or frags.
2579		 */
2580		for (i = (NDADDR - 1); i >= 0; i--) {
2581			if ((bn = freeblks->fb_dblks[i]) == 0)
2582				continue;
2583			bsize = sblksize(fs, freeblks->fb_oldsize, i);
2584			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2585			    freeblks->fb_previousinum);
2586			fs_pendingblocks += btodb(bsize);
2587			blocksreleased += btodb(bsize);
2588		}
2589	}
2590	UFS_LOCK(ump);
2591	fs->fs_pendingblocks -= fs_pendingblocks;
2592	UFS_UNLOCK(ump);
2593	/*
2594	 * If we still have not finished background cleanup, then check
2595	 * to see if the block count needs to be adjusted.
2596	 */
2597	if (freeblks->fb_chkcnt != blocksreleased &&
2598	    (fs->fs_flags & FS_UNCLEAN) != 0 &&
2599	    ffs_vget(freeblks->fb_mnt, freeblks->fb_previousinum,
2600	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
2601		ip = VTOI(vp);
2602		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) +
2603		    freeblks->fb_chkcnt - blocksreleased);
2604		ip->i_flag |= IN_CHANGE;
2605		vput(vp);
2606	}
2607
2608#ifdef DIAGNOSTIC
2609	if (freeblks->fb_chkcnt != blocksreleased &&
2610	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2611		printf("handle_workitem_freeblocks: block count\n");
2612	if (allerror)
2613		softdep_error("handle_workitem_freeblocks", allerror);
2614#endif /* DIAGNOSTIC */
2615
2616	WORKITEM_FREE(freeblks, D_FREEBLKS);
2617}
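
/*
 * Illustrative sketch (not compiled) of the baselbns[] computation in
 * handle_workitem_freeblocks() above: each indirect level i begins at
 * the first logical block not reachable by level i-1.  NDADDR, NIADDR,
 * and the per-block pointer count are assumptions chosen for the
 * example.
 */
#if 0
#include <stdio.h>

#define NDADDR	12	/* direct pointers in the inode */
#define NIADDR	3	/* indirect pointer levels */
#define NINDIR	2048	/* pointers per indirect block (assumed) */

int
main(void)
{
	long baselbns[NIADDR], tmpval;
	int i;

	tmpval = 1;
	baselbns[0] = NDADDR;
	for (i = 1; i < NIADDR; i++) {
		tmpval *= NINDIR;
		baselbns[i] = baselbns[i - 1] + tmpval;
	}
	for (i = 0; i < NIADDR; i++)
		printf("level %d starts at lbn %ld\n", i, baselbns[i]);
	/* 12, 12 + 2048, 12 + 2048 + 2048*2048 for these constants. */
	return (0);
}
#endif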
2618
2619/*
2620 * Release blocks associated with the inode ip and stored in the indirect
2621 * block dbn. If level is greater than SINGLE, the block is an indirect block
2622 * and recursive calls to indir_trunc must be used to cleanse other indirect
2623 * blocks.
2624 */
2625static int
2626indir_trunc(freeblks, dbn, level, lbn, countp)
2627	struct freeblks *freeblks;
2628	ufs2_daddr_t dbn;
2629	int level;
2630	ufs_lbn_t lbn;
2631	ufs2_daddr_t *countp;
2632{
2633	struct buf *bp;
2634	struct fs *fs;
2635	struct worklist *wk;
2636	struct indirdep *indirdep;
2637	struct ufsmount *ump;
2638	ufs1_daddr_t *bap1 = 0;
2639	ufs2_daddr_t nb, *bap2 = 0;
2640	ufs_lbn_t lbnadd;
2641	int i, nblocks, ufs1fmt;
2642	int error, allerror = 0;
2643	int fs_pendingblocks;
2644
2645	ump = VFSTOUFS(freeblks->fb_mnt);
2646	fs = ump->um_fs;
2647	fs_pendingblocks = 0;
2648	lbnadd = 1;
2649	for (i = level; i > 0; i--)
2650		lbnadd *= NINDIR(fs);
2651	/*
2652	 * Get buffer of block pointers to be freed. This routine is not
2653	 * called until the zero'ed inode has been written, so it is safe
2654	 * to free blocks as they are encountered. Because the inode has
2655	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2656	 * have to use the on-disk address and the block device for the
2657	 * filesystem to look them up. If the file was deleted before its
2658	 * indirect blocks were all written to disk, the routine that set
2659	 * us up (deallocate_dependencies) will have arranged to leave
2660	 * a complete copy of the indirect block in memory for our use.
2661	 * Otherwise we have to read the blocks in from the disk.
2662	 */
2663#ifdef notyet
2664	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
2665	    GB_NOCREAT);
2666#else
2667	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
2668#endif
2669	ACQUIRE_LOCK(&lk);
2670	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2671		if (wk->wk_type != D_INDIRDEP ||
2672		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2673		    (indirdep->ir_state & GOINGAWAY) == 0)
2674			panic("indir_trunc: lost indirdep");
2675		WORKLIST_REMOVE(wk);
2676		WORKITEM_FREE(indirdep, D_INDIRDEP);
2677		if (LIST_FIRST(&bp->b_dep) != NULL)
2678			panic("indir_trunc: dangling dep");
2679		VFSTOUFS(freeblks->fb_mnt)->um_numindirdeps -= 1;
2680		FREE_LOCK(&lk);
2681	} else {
2682#ifdef notyet
2683		if (bp)
2684			brelse(bp);
2685#endif
2686		FREE_LOCK(&lk);
2687		error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
2688		    NOCRED, &bp);
2689		if (error) {
2690			brelse(bp);
2691			return (error);
2692		}
2693	}
2694	/*
2695	 * Recursively free indirect blocks.
2696	 */
2697	if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UFS1) {
2698		ufs1fmt = 1;
2699		bap1 = (ufs1_daddr_t *)bp->b_data;
2700	} else {
2701		ufs1fmt = 0;
2702		bap2 = (ufs2_daddr_t *)bp->b_data;
2703	}
2704	nblocks = btodb(fs->fs_bsize);
2705	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2706		if (ufs1fmt)
2707			nb = bap1[i];
2708		else
2709			nb = bap2[i];
2710		if (nb == 0)
2711			continue;
2712		if (level != 0) {
2713			if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
2714			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2715				allerror = error;
2716		}
2717		ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
2718		    freeblks->fb_previousinum);
2719		fs_pendingblocks += nblocks;
2720		*countp += nblocks;
2721	}
2722	UFS_LOCK(ump);
2723	fs->fs_pendingblocks -= fs_pendingblocks;
2724	UFS_UNLOCK(ump);
2725	bp->b_flags |= B_INVAL | B_NOCACHE;
2726	brelse(bp);
2727	return (allerror);
2728}
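
/*
 * Illustrative sketch (not compiled) of the recursion in indir_trunc()
 * above: the logical block covered by slot i of an indirect block at a
 * given level is lbn + i * NINDIR^level, and each level recurses before
 * counting the child block itself.  A NINDIR of 2 is an assumption
 * that keeps the example small.
 */
#if 0
#include <stdio.h>

#define NINDIR	2	/* pointers per indirect block (assumed) */

/* Count the blocks beneath an indirect block, deepest level first. */
static long
trunc_level(long lbn, int level)
{
	long lbnadd = 1, count = 0;
	int i;

	for (i = level; i > 0; i--)
		lbnadd *= NINDIR;
	for (i = NINDIR - 1; i >= 0; i--) {
		if (level != 0)
			count += trunc_level(lbn + i * lbnadd, level - 1);
		else
			printf("freeing data block at lbn %ld\n",
			    lbn + i * lbnadd);
		count++;	/* the child block itself, data or indirect */
	}
	return (count);
}

int
main(void)
{
	/* A double-indirect block (level 1) covering 4 data lbns. */
	long n = trunc_level(100, 1);

	/* 4 data blocks plus 2 single-indirect blocks: prints 6. */
	printf("released %ld blocks in all\n", n);
	return (0);
}
#endif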
2729
2730/*
2731 * Free an allocindir.
2732 * This routine must be called with splbio interrupts blocked.
2733 */
2734static void
2735free_allocindir(aip, inodedep)
2736	struct allocindir *aip;
2737	struct inodedep *inodedep;
2738{
2739	struct freefrag *freefrag;
2740
2741	mtx_assert(&lk, MA_OWNED);
2742	if ((aip->ai_state & DEPCOMPLETE) == 0)
2743		LIST_REMOVE(aip, ai_deps);
2744	if (aip->ai_state & ONWORKLIST)
2745		WORKLIST_REMOVE(&aip->ai_list);
2746	LIST_REMOVE(aip, ai_next);
2747	if ((freefrag = aip->ai_freefrag) != NULL) {
2748		if (inodedep == NULL)
2749			add_to_worklist(&freefrag->ff_list);
2750		else
2751			WORKLIST_INSERT(&inodedep->id_bufwait,
2752			    &freefrag->ff_list);
2753	}
2754	WORKITEM_FREE(aip, D_ALLOCINDIR);
2755}
2756
2757/*
2758 * Directory entry addition dependencies.
2759 *
2760 * When adding a new directory entry, the inode (with its incremented link
2761 * count) must be written to disk before the directory entry's pointer to it.
2762 * Also, if the inode is newly allocated, the corresponding freemap must be
2763 * updated (on disk) before the directory entry's pointer. These requirements
2764 * are met via undo/redo on the directory entry's pointer, which consists
2765 * simply of the inode number.
2766 *
2767 * As directory entries are added and deleted, the free space within a
2768 * directory block can become fragmented.  The ufs filesystem will compact
2769 * a fragmented directory block to make space for a new entry. When this
2770 * occurs, the offsets of previously added entries change. Any "diradd"
2771 * dependency structures corresponding to these entries must be updated with
2772 * the new offsets.
2773 */
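
/*
 * Illustrative sketch (not compiled) of the undo/redo applied to a
 * directory entry's inode number: until the new inode (and, for a new
 * allocation, its freemap bit) is safely on disk, the on-disk image of
 * the entry is rolled back to the prior inode number (zero for a
 * create).  The "entry" layout here is an assumption for the example.
 */
#if 0
#include <stdio.h>

struct entry {
	unsigned ino;		/* inode number in the in-core block */
	unsigned old_ino;	/* value the entry held before the add */
	int inode_written;	/* has the new inode reached the disk? */
};

/* Value to place in the entry when its block is written to disk. */
static unsigned
ondisk_ino(const struct entry *ep)
{
	return (ep->inode_written ? ep->ino : ep->old_ino);
}

int
main(void)
{
	struct entry ep = { 1234, 0, 0 };	/* freshly created name */

	printf("disk sees ino %u\n", ondisk_ino(&ep));	/* 0: rolled back */
	ep.inode_written = 1;
	printf("disk sees ino %u\n", ondisk_ino(&ep));	/* 1234: redo */
	return (0);
}
#endif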
2774
2775/*
2776 * This routine is called after the in-memory inode's link
2777 * count has been incremented, but before the directory entry's
2778 * pointer to the inode has been set.
2779 */
2780int
2781softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2782	struct buf *bp;		/* buffer containing directory block */
2783	struct inode *dp;	/* inode for directory */
2784	off_t diroffset;	/* offset of new entry in directory */
2785	ino_t newinum;		/* inode referenced by new directory entry */
2786	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2787	int isnewblk;		/* entry is in a newly allocated block */
2788{
2789	int offset;		/* offset of new entry within directory block */
2790	ufs_lbn_t lbn;		/* block in directory containing new entry */
2791	struct fs *fs;
2792	struct diradd *dap;
2793	struct allocdirect *adp;
2794	struct pagedep *pagedep;
2795	struct inodedep *inodedep;
2796	struct newdirblk *newdirblk = 0;
2797	struct mkdir *mkdir1, *mkdir2;
2798
2799	/*
2800	 * Whiteouts have no dependencies.
2801	 */
2802	if (newinum == WINO) {
2803		if (newdirbp != NULL)
2804			bdwrite(newdirbp);
2805		return (0);
2806	}
2807
2808	fs = dp->i_fs;
2809	lbn = lblkno(fs, diroffset);
2810	offset = blkoff(fs, diroffset);
2811	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2812		M_SOFTDEP_FLAGS|M_ZERO);
2813	dap->da_list.wk_type = D_DIRADD;
2814	dap->da_offset = offset;
2815	dap->da_newinum = newinum;
2816	dap->da_state = ATTACHED;
2817	if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2818		MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
2819		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
2820		newdirblk->db_list.wk_type = D_NEWDIRBLK;
2821		newdirblk->db_state = 0;
2822	}
2823	if (newdirbp == NULL) {
2824		dap->da_state |= DEPCOMPLETE;
2825		ACQUIRE_LOCK(&lk);
2826	} else {
2827		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2828		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2829		    M_SOFTDEP_FLAGS);
2830		mkdir1->md_list.wk_type = D_MKDIR;
2831		mkdir1->md_state = MKDIR_BODY;
2832		mkdir1->md_diradd = dap;
2833		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2834		    M_SOFTDEP_FLAGS);
2835		mkdir2->md_list.wk_type = D_MKDIR;
2836		mkdir2->md_state = MKDIR_PARENT;
2837		mkdir2->md_diradd = dap;
2838		/*
2839		 * Dependency on "." and ".." being written to disk.
2840		 */
2841		mkdir1->md_buf = newdirbp;
2842		ACQUIRE_LOCK(&lk);
2843		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2844		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2845		FREE_LOCK(&lk);
2846		bdwrite(newdirbp);
2847		/*
2848		 * Dependency on link count increase for parent directory
2849		 */
2850		ACQUIRE_LOCK(&lk);
2851		if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
2852		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2853			dap->da_state &= ~MKDIR_PARENT;
2854			WORKITEM_FREE(mkdir2, D_MKDIR);
2855		} else {
2856			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2857			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2858		}
2859	}
2860	/*
2861	 * Link into parent directory pagedep to await its being written.
2862	 */
2863	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2864		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2865	dap->da_pagedep = pagedep;
2866	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2867	    da_pdlist);
2868	/*
2869	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2870	 * is not yet written. If it is written, do the post-inode write
2871	 * processing to put it on the id_pendinghd list.
2872	 */
2873	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2874	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2875		diradd_inode_written(dap, inodedep);
2876	else
2877		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2878	if (isnewblk) {
2879		/*
2880		 * Directories growing into indirect blocks are rare
2881		 * enough, and new block allocation in those cases rarer
2882		 * still, that we choose not to bother tracking them.
2883		 * Rather, we simply force the new directory entry to
2884		 * disk.
2885		 */
2886		if (lbn >= NDADDR) {
2887			FREE_LOCK(&lk);
2888			/*
2889			 * We only have a new allocation when at the
2890			 * beginning of a new block, not when we are
2891			 * expanding into an existing block.
2892			 */
2893			if (blkoff(fs, diroffset) == 0)
2894				return (1);
2895			return (0);
2896		}
2897		/*
2898		 * We only have a new allocation when at the beginning
2899		 * of a new fragment, not when we are expanding into an
2900		 * existing fragment. Also, there is nothing to do if we
2901		 * are already tracking this block.
2902		 */
2903		if (fragoff(fs, diroffset) != 0) {
2904			FREE_LOCK(&lk);
2905			return (0);
2906		}
2907		if ((pagedep->pd_state & NEWBLOCK) != 0) {
2908			FREE_LOCK(&lk);
2909			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2910			return (0);
2911		}
2912		/*
2913		 * Find our associated allocdirect and have it track us.
2914		 */
2915		if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
2916			panic("softdep_setup_directory_add: lost inodedep");
2917		adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
2918		if (adp == NULL || adp->ad_lbn != lbn)
2919			panic("softdep_setup_directory_add: lost entry");
2920		pagedep->pd_state |= NEWBLOCK;
2921		newdirblk->db_pagedep = pagedep;
2922		WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
2923	}
2924	FREE_LOCK(&lk);
2925	return (0);
2926}
2927
2928/*
2929 * This procedure is called to change the offset of a directory
2930 * entry when compacting a directory block, which must be owned
2931 * exclusively by the caller. Note that the actual entry movement
2932 * must be done in this procedure to ensure that no I/O completions
2933 * occur while the move is in progress.
2934 */
2935void
2936softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2937	struct inode *dp;	/* inode for directory */
2938	caddr_t base;		/* address of dp->i_offset */
2939	caddr_t oldloc;		/* address of old directory location */
2940	caddr_t newloc;		/* address of new directory location */
2941	int entrysize;		/* size of directory entry */
2942{
2943	int offset, oldoffset, newoffset;
2944	struct pagedep *pagedep;
2945	struct diradd *dap;
2946	ufs_lbn_t lbn;
2947
2948	ACQUIRE_LOCK(&lk);
2949	lbn = lblkno(dp->i_fs, dp->i_offset);
2950	offset = blkoff(dp->i_fs, dp->i_offset);
2951	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2952		goto done;
2953	oldoffset = offset + (oldloc - base);
2954	newoffset = offset + (newloc - base);
2955
2956	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2957		if (dap->da_offset != oldoffset)
2958			continue;
2959		dap->da_offset = newoffset;
2960		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2961			break;
2962		LIST_REMOVE(dap, da_pdlist);
2963		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2964		    dap, da_pdlist);
2965		break;
2966	}
2967	if (dap == NULL) {
2969		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2970			if (dap->da_offset == oldoffset) {
2971				dap->da_offset = newoffset;
2972				break;
2973			}
2974		}
2975	}
2976done:
2977	bcopy(oldloc, newloc, entrysize);
2978	FREE_LOCK(&lk);
2979}
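
/*
 * Illustrative sketch (not compiled) of the rehash performed by
 * softdep_change_directoryentry_offset() above: diradd structures are
 * kept in buckets hashed by entry offset, so when compaction moves an
 * entry the structure may have to migrate to a new bucket.  The hash
 * and structures are assumptions standing in for DIRADDHASH() and the
 * pagedep bucket array.
 */
#if 0
#include <sys/queue.h>
#include <stdio.h>

#define NBUCKETS	6
#define HASH(offset)	((offset) % NBUCKETS)

struct dadd {
	int offset;
	LIST_ENTRY(dadd) next;
};
static LIST_HEAD(, dadd) bucket[NBUCKETS];

static void
change_offset(struct dadd *dap, int newoffset)
{
	int oldoffset = dap->offset;

	dap->offset = newoffset;
	if (HASH(newoffset) == HASH(oldoffset))
		return;			/* same bucket, nothing to move */
	LIST_REMOVE(dap, next);
	LIST_INSERT_HEAD(&bucket[HASH(newoffset)], dap, next);
}

int
main(void)
{
	struct dadd dap = { .offset = 512 };

	LIST_INSERT_HEAD(&bucket[HASH(512)], &dap, next);
	change_offset(&dap, 64);	/* compaction slid the entry down */
	printf("entry now in bucket %d\n", HASH(dap.offset));
	return (0);
}
#endif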
2980
2981/*
2982 * Free a diradd dependency structure. This routine must be called
2983 * with splbio interrupts blocked.
2984 */
2985static void
2986free_diradd(dap)
2987	struct diradd *dap;
2988{
2989	struct dirrem *dirrem;
2990	struct pagedep *pagedep;
2991	struct inodedep *inodedep;
2992	struct mkdir *mkdir, *nextmd;
2993
2994	mtx_assert(&lk, MA_OWNED);
2995	WORKLIST_REMOVE(&dap->da_list);
2996	LIST_REMOVE(dap, da_pdlist);
2997	if ((dap->da_state & DIRCHG) == 0) {
2998		pagedep = dap->da_pagedep;
2999	} else {
3000		dirrem = dap->da_previous;
3001		pagedep = dirrem->dm_pagedep;
3002		dirrem->dm_dirinum = pagedep->pd_ino;
3003		add_to_worklist(&dirrem->dm_list);
3004	}
3005	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
3006	    0, &inodedep) != 0)
3007		(void) free_inodedep(inodedep);
3008	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
3009		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
3010			nextmd = LIST_NEXT(mkdir, md_mkdirs);
3011			if (mkdir->md_diradd != dap)
3012				continue;
3013			dap->da_state &= ~mkdir->md_state;
3014			WORKLIST_REMOVE(&mkdir->md_list);
3015			LIST_REMOVE(mkdir, md_mkdirs);
3016			WORKITEM_FREE(mkdir, D_MKDIR);
3017		}
3018		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
3019			panic("free_diradd: unfound ref");
3020	}
3021	WORKITEM_FREE(dap, D_DIRADD);
3022}
3023
3024/*
3025 * Directory entry removal dependencies.
3026 *
3027 * When removing a directory entry, the entry's inode pointer must be
3028 * zero'ed on disk before the corresponding inode's link count is decremented
3029 * (possibly freeing the inode for re-use). This dependency is handled by
3030 * updating the directory entry but delaying the inode count reduction until
3031 * after the directory block has been written to disk. After this point, the
3032 * inode count can be decremented whenever it is convenient.
3033 */
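
/*
 * Illustrative sketch (not compiled) of the removal ordering described
 * above: the on-disk directory entry must read zero before the inode's
 * link count may be decremented on disk.  The names below are
 * assumptions for the example.
 */
#if 0
#include <assert.h>

struct state {
	unsigned disk_dirent_ino;	/* entry as it reads on disk */
	int disk_nlink;			/* link count as it reads on disk */
};

static void
remove_name(struct state *sp)
{
	/* Step 1: the directory block with the zeroed entry is written. */
	sp->disk_dirent_ino = 0;
	/*
	 * Step 2: only now is it safe to write the decremented link
	 * count; a crash between the steps leaves an inflated link
	 * count, which fsck can reclaim, rather than a name pointing
	 * at a freed inode.
	 */
	assert(sp->disk_dirent_ino == 0);
	sp->disk_nlink--;
}

int
main(void)
{
	struct state st = { 1234, 1 };

	remove_name(&st);
	assert(st.disk_nlink == 0);
	return (0);
}
#endif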
3034
3035/*
3036 * This routine should be called immediately after removing
3037 * a directory entry.  The inode's link count should not be
3038 * decremented by the calling procedure -- the soft updates
3039 * code will do this task when it is safe.
3040 */
3041void
3042softdep_setup_remove(bp, dp, ip, isrmdir)
3043	struct buf *bp;		/* buffer containing directory block */
3044	struct inode *dp;	/* inode for the directory being modified */
3045	struct inode *ip;	/* inode for directory entry being removed */
3046	int isrmdir;		/* indicates if doing RMDIR */
3047{
3048	struct dirrem *dirrem, *prevdirrem;
3049
3050	/*
3051	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
3052	 */
3053	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3054
3055	/*
3056	 * If the COMPLETE flag is clear, then there were no active
3057	 * entries and we want to roll back to a zeroed entry until
3058	 * the new inode is committed to disk. If the COMPLETE flag is
3059	 * set then we have deleted an entry that never made it to
3060	 * disk. If the entry we deleted resulted from a name change,
3061	 * then the old name still resides on disk. We cannot delete
3062	 * its inode (returned to us in prevdirrem) until the zeroed
3063	 * directory entry gets to disk. The new inode has never been
3064	 * referenced on the disk, so it can be deleted immediately.
3065	 */
3066	if ((dirrem->dm_state & COMPLETE) == 0) {
3067		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
3068		    dm_next);
3069		FREE_LOCK(&lk);
3070	} else {
3071		if (prevdirrem != NULL)
3072			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
3073			    prevdirrem, dm_next);
3074		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
3075		FREE_LOCK(&lk);
3076		handle_workitem_remove(dirrem, NULL);
3077	}
3078}
3079
3080/*
3081 * Allocate a new dirrem if appropriate and return it along with
3082 * its associated pagedep. Called without a lock, returns with lock.
3083 */
3084static long num_dirrem;		/* number of dirrem allocated */
3085static struct dirrem *
3086newdirrem(bp, dp, ip, isrmdir, prevdirremp)
3087	struct buf *bp;		/* buffer containing directory block */
3088	struct inode *dp;	/* inode for the directory being modified */
3089	struct inode *ip;	/* inode for directory entry being removed */
3090	int isrmdir;		/* indicates if doing RMDIR */
3091	struct dirrem **prevdirremp; /* previously referenced inode, if any */
3092{
3093	int offset;
3094	ufs_lbn_t lbn;
3095	struct diradd *dap;
3096	struct dirrem *dirrem;
3097	struct pagedep *pagedep;
3098
3099	/*
3100	 * Whiteouts have no deletion dependencies.
3101	 */
3102	if (ip == NULL)
3103		panic("newdirrem: whiteout");
3104	/*
3105	 * If we are over our limit, try to improve the situation.
3106	 * Limiting the number of dirrem structures will also limit
3107	 * the number of freefile and freeblks structures.
3108	 */
3109	ACQUIRE_LOCK(&lk);
3110	if (num_dirrem > max_softdeps / 2)
3111		(void) request_cleanup(FLUSH_REMOVE);
3112	num_dirrem += 1;
3113	FREE_LOCK(&lk);
3114	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
3115		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
3116	dirrem->dm_list.wk_type = D_DIRREM;
3117	dirrem->dm_state = isrmdir ? RMDIR : 0;
3118	dirrem->dm_mnt = ITOV(ip)->v_mount;
3119	dirrem->dm_oldinum = ip->i_number;
3120	*prevdirremp = NULL;
3121
3122	ACQUIRE_LOCK(&lk);
3123	lbn = lblkno(dp->i_fs, dp->i_offset);
3124	offset = blkoff(dp->i_fs, dp->i_offset);
3125	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3126		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3127	dirrem->dm_pagedep = pagedep;
3128	/*
3129	 * Check for a diradd dependency for the same directory entry.
3130	 * If present, then both dependencies become obsolete and can
3131	 * be de-allocated. Check for an entry on both the pd_diraddhd
3132	 * list and the pd_pendinghd list.
3133	 */
3135	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
3136		if (dap->da_offset == offset)
3137			break;
3138	if (dap == NULL) {
3140		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
3141			if (dap->da_offset == offset)
3142				break;
3143		if (dap == NULL)
3144			return (dirrem);
3145	}
3146	/*
3147	 * Must be ATTACHED at this point.
3148	 */
3149	if ((dap->da_state & ATTACHED) == 0)
3150		panic("newdirrem: not ATTACHED");
3151	if (dap->da_newinum != ip->i_number)
3152		panic("newdirrem: inum %d should be %d",
3153		    ip->i_number, dap->da_newinum);
3154	/*
3155	 * If we are deleting a changed name that never made it to disk,
3156	 * then return the dirrem describing the previous inode (which
3157	 * represents the inode currently referenced from this entry on disk).
3158	 */
3159	if ((dap->da_state & DIRCHG) != 0) {
3160		*prevdirremp = dap->da_previous;
3161		dap->da_state &= ~DIRCHG;
3162		dap->da_pagedep = pagedep;
3163	}
3164	/*
3165	 * We are deleting an entry that never made it to disk.
3166	 * Mark it COMPLETE so we can delete its inode immediately.
3167	 */
3168	dirrem->dm_state |= COMPLETE;
3169	free_diradd(dap);
3170	return (dirrem);
3171}
3172
3173/*
3174 * Directory entry change dependencies.
3175 *
3176 * Changing an existing directory entry requires that an add operation
3177 * be completed first followed by a deletion. The semantics for the addition
3178 * are identical to the description of adding a new entry above except
3179 * that the rollback is to the old inode number rather than zero. Once
3180 * the addition dependency is completed, the removal is done as described
3181 * in the removal routine above.
3182 */
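
/*
 * Illustrative sketch (not compiled) of the rollback value chosen when
 * a directory entry is changed: if the old name ever reached the disk
 * the entry rolls back to the old inode number, otherwise (a change on
 * the heels of an uncommitted create) it rolls back to zero.  Names
 * are assumptions for the example.
 */
#if 0
#include <stdio.h>

static unsigned
rollback_ino(unsigned old_ino, int old_on_disk)
{
	/* DIRCHG case: the disk still references the old inode. */
	if (old_on_disk)
		return (old_ino);
	/* Create-then-change case: nothing ever reached the disk. */
	return (0);
}

int
main(void)
{
	printf("rename over committed entry rolls back to %u\n",
	    rollback_ino(77, 1));		/* 77 */
	printf("rename over uncommitted entry rolls back to %u\n",
	    rollback_ino(77, 0));		/* 0 */
	return (0);
}
#endif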
3183
3184/*
3185 * This routine should be called immediately after changing
3186 * a directory entry.  The inode's link count should not be
3187 * decremented by the calling procedure -- the soft updates
3188 * code will perform this task when it is safe.
3189 */
3190void
3191softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3192	struct buf *bp;		/* buffer containing directory block */
3193	struct inode *dp;	/* inode for the directory being modified */
3194	struct inode *ip;	/* inode for directory entry being removed */
3195	ino_t newinum;		/* new inode number for changed entry */
3196	int isrmdir;		/* indicates if doing RMDIR */
3197{
3198	int offset;
3199	struct diradd *dap = NULL;
3200	struct dirrem *dirrem, *prevdirrem;
3201	struct pagedep *pagedep;
3202	struct inodedep *inodedep;
3203
3204	offset = blkoff(dp->i_fs, dp->i_offset);
3205
3206	/*
3207	 * Whiteouts do not need diradd dependencies.
3208	 */
3209	if (newinum != WINO) {
3210		MALLOC(dap, struct diradd *, sizeof(struct diradd),
3211		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
3212		dap->da_list.wk_type = D_DIRADD;
3213		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3214		dap->da_offset = offset;
3215		dap->da_newinum = newinum;
3216	}
3217
3218	/*
3219	 * Allocate a new dirrem and ACQUIRE_LOCK.
3220	 */
3221	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3222	pagedep = dirrem->dm_pagedep;
3223	/*
3224	 * The possible values for isrmdir:
3225	 *	0 - non-directory file rename
3226	 *	1 - directory rename within same directory
3227	 *   inum - directory rename to new directory of given inode number
3228	 * When renaming to a new directory, we are both deleting and
3229	 * creating a new directory entry, so the link count on the new
3230	 * directory should not change. Thus we do not need the followup
3231	 * dirrem which is usually done in handle_workitem_remove. We set
3232	 * the DIRCHG flag to tell handle_workitem_remove to skip the
3233	 * followup dirrem.
3234	 */
3235	if (isrmdir > 1)
3236		dirrem->dm_state |= DIRCHG;
3237
3238	/*
3239	 * Whiteouts have no additional dependencies,
3240	 * so just put the dirrem on the correct list.
3241	 */
3242	if (newinum == WINO) {
3243		if ((dirrem->dm_state & COMPLETE) == 0) {
3244			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
3245			    dm_next);
3246		} else {
3247			dirrem->dm_dirinum = pagedep->pd_ino;
3248			add_to_worklist(&dirrem->dm_list);
3249		}
3250		FREE_LOCK(&lk);
3251		return;
3252	}
3253
3254	/*
3255	 * If the COMPLETE flag is clear, then there were no active
3256	 * entries and we want to roll back to the previous inode until
3257	 * the new inode is committed to disk. If the COMPLETE flag is
3258	 * set, then we have deleted an entry that never made it to disk.
3259	 * If the entry we deleted resulted from a name change, then the old
3260	 * inode reference still resides on disk. Any rollback that we do
3261	 * needs to be to that old inode (returned to us in prevdirrem). If
3262	 * the entry we deleted resulted from a create, then there is
3263	 * no entry on the disk, so we want to roll back to zero rather
3264	 * than the uncommitted inode. In either of the COMPLETE cases we
3265	 * want to immediately free the unwritten and unreferenced inode.
3266	 */
3267	if ((dirrem->dm_state & COMPLETE) == 0) {
3268		dap->da_previous = dirrem;
3269	} else {
3270		if (prevdirrem != NULL) {
3271			dap->da_previous = prevdirrem;
3272		} else {
3273			dap->da_state &= ~DIRCHG;
3274			dap->da_pagedep = pagedep;
3275		}
3276		dirrem->dm_dirinum = pagedep->pd_ino;
3277		add_to_worklist(&dirrem->dm_list);
3278	}
3279	/*
3280	 * Link into its inodedep. Put it on the id_bufwait list if the inode
3281	 * is not yet written. If it is written, do the post-inode write
3282	 * processing to put it on the id_pendinghd list.
3283	 */
3284	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
3285	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3286		dap->da_state |= COMPLETE;
3287		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3288		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3289	} else {
3290		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3291		    dap, da_pdlist);
3292		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3293	}
3294	FREE_LOCK(&lk);
3295}
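
/*
 * Editorial sketch (not part of the original source): the rollback
 * choice made above can be read as a small decision table.
 *
 *	COMPLETE clear on dirrem:
 *		dap->da_previous = dirrem (the old entry is still on
 *		disk, so roll back to the previous inode number)
 *	COMPLETE set, prevdirrem != NULL:
 *		dap->da_previous = prevdirrem (the entry came from an
 *		earlier name change; roll back to that old inode)
 *	COMPLETE set, prevdirrem == NULL:
 *		clear DIRCHG and roll back to zero, since the deleted
 *		entry never reached the disk at all
 */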
3296
3297/*
3298 * Called whenever the link count on an inode is changed.
3299 * It creates an inode dependency so that the new reference(s)
3300 * to the inode cannot be committed to disk until the updated
3301 * inode has been written.
3302 */
3303void
3304softdep_change_linkcnt(ip)
3305	struct inode *ip;	/* the inode with the increased link count */
3306{
3307	struct inodedep *inodedep;
3308
3309	ACQUIRE_LOCK(&lk);
3310	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
3311	if (ip->i_nlink < ip->i_effnlink)
3312		panic("softdep_change_linkcnt: bad delta");
3313	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3314	FREE_LOCK(&lk);
3315}
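
/*
 * Editorial sketch (hypothetical values): after an unlink(2) has
 * dropped the effective count but before the directory update has
 * been committed, the delta recorded above is the gap between the
 * two counts:
 *
 *	ip->i_effnlink = 1;		count after all queued work
 *	ip->i_nlink = 2;		count safe to write to disk
 *	inodedep->id_nlinkdelta = 2 - 1;	i.e. 1
 *
 * softdep_load_inodeblock() below recomputes the effective count as
 * i_nlink - id_nlinkdelta when the inode is re-read from disk.
 */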
3316
3317/*
3318 * Called when the effective link count and the reference count
3319 * on an inode drops to zero. At this point there are no names
3320 * referencing the file in the filesystem and no active file
3321 * references. The space associated with the file will be freed
3322 * as soon as the necessary soft dependencies are cleared.
3323 */
3324void
3325softdep_releasefile(ip)
3326	struct inode *ip;	/* inode with the zero effective link count */
3327{
3328	struct inodedep *inodedep;
3329	struct fs *fs;
3330	int extblocks;
3331
3332	if (ip->i_effnlink > 0)
3333		panic("softdep_filerelease: file still referenced");
3334	/*
3335	 * We may be called several times as the real reference count
3336	 * drops to zero. We only want to account for the space once.
3337	 */
3338	if (ip->i_flag & IN_SPACECOUNTED)
3339		return;
3340	/*
3341	 * We have to deactivate a snapshot, otherwise copy-on-writes may
3342	 * add blocks and the cleanup may remove blocks after we have
3343	 * tried to account for them.
3344	 */
3345	if ((ip->i_flags & SF_SNAPSHOT) != 0)
3346		ffs_snapremove(ITOV(ip));
3347	/*
3348	 * If we are tracking an nlinkdelta, we have to also remember
3349	 * whether we have accounted for the freed space yet.
3350	 */
3351	ACQUIRE_LOCK(&lk);
3352	if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep)))
3353		inodedep->id_state |= SPACECOUNTED;
3354	FREE_LOCK(&lk);
3355	fs = ip->i_fs;
3356	extblocks = 0;
3357	if (fs->fs_magic == FS_UFS2_MAGIC)
3358		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
3359	UFS_LOCK(ip->i_ump);
3360	ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
3361	ip->i_fs->fs_pendinginodes += 1;
3362	UFS_UNLOCK(ip->i_ump);
3363	ip->i_flag |= IN_SPACECOUNTED;
3364}
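
/*
 * Editorial sketch (hypothetical numbers): for a UFS2 inode with
 * di_blocks == 144 of which 16 disk blocks hold extended attribute
 * data, the accounting above works out to:
 *
 *	extblocks = btodb(fragroundup(fs, di_extsize));	= 16
 *	fs_pendingblocks += 144 - 16;			= +128
 *	fs_pendinginodes += 1;
 *
 * The IN_SPACECOUNTED flag then keeps a second call from charging
 * the same inode again.
 */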
3365
3366/*
3367 * This workitem decrements the inode's link count.
3368 * If the link count reaches zero, the file is removed.
3369 */
3370static void
3371handle_workitem_remove(dirrem, xp)
3372	struct dirrem *dirrem;
3373	struct vnode *xp;
3374{
3375	struct thread *td = curthread;
3376	struct inodedep *inodedep;
3377	struct vnode *vp;
3378	struct inode *ip;
3379	ino_t oldinum;
3380	int error;
3381
3382	if ((vp = xp) == NULL &&
3383	    (error = ffs_vget(dirrem->dm_mnt, dirrem->dm_oldinum, LK_EXCLUSIVE,
3384	     &vp)) != 0) {
3385		softdep_error("handle_workitem_remove: vget", error);
3386		return;
3387	}
3388	ip = VTOI(vp);
3389	ACQUIRE_LOCK(&lk);
3390	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
3391		panic("handle_workitem_remove: lost inodedep");
3392	/*
3393	 * Normal file deletion.
3394	 */
3395	if ((dirrem->dm_state & RMDIR) == 0) {
3396		ip->i_nlink--;
3397		DIP_SET(ip, i_nlink, ip->i_nlink);
3398		ip->i_flag |= IN_CHANGE;
3399		if (ip->i_nlink < ip->i_effnlink)
3400			panic("handle_workitem_remove: bad file delta");
3401		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3402		num_dirrem -= 1;
3403		FREE_LOCK(&lk);
3404		vput(vp);
3405		WORKITEM_FREE(dirrem, D_DIRREM);
3406		return;
3407	}
3408	/*
3409	 * Directory deletion. Decrement reference count for both the
3410	 * just deleted parent directory entry and the reference for ".".
3411	 * Next truncate the directory to length zero. When the
3412	 * truncation completes, arrange to have the reference count on
3413	 * the parent decremented to account for the loss of "..".
3414	 */
3415	ip->i_nlink -= 2;
3416	DIP_SET(ip, i_nlink, ip->i_nlink);
3417	ip->i_flag |= IN_CHANGE;
3418	if (ip->i_nlink < ip->i_effnlink)
3419		panic("handle_workitem_remove: bad dir delta");
3420	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3421	FREE_LOCK(&lk);
3422	if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
3423		softdep_error("handle_workitem_remove: truncate", error);
3424	ACQUIRE_LOCK(&lk);
3425	/*
3426	 * Rename a directory to a new parent. Since we are both deleting
3427	 * and creating a new directory entry, the link count on the new
3428	 * directory should not change. Thus we skip the followup dirrem.
3429	 */
3430	if (dirrem->dm_state & DIRCHG) {
3431		num_dirrem -= 1;
3432		FREE_LOCK(&lk);
3433		vput(vp);
3434		WORKITEM_FREE(dirrem, D_DIRREM);
3435		return;
3436	}
3437	/*
3438	 * If the inodedep does not exist, then the zero'ed inode has
3439	 * been written to disk. If the allocated inode has never been
3440	 * written to disk, then the on-disk inode is zero'ed. In either
3441	 * case we can remove the file immediately.
3442	 */
3443	dirrem->dm_state = 0;
3444	oldinum = dirrem->dm_oldinum;
3445	dirrem->dm_oldinum = dirrem->dm_dirinum;
3446	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
3447	    check_inode_unwritten(inodedep)) {
3448		FREE_LOCK(&lk);
3449		vput(vp);
3450		handle_workitem_remove(dirrem, NULL);
3451		return;
3452	}
3453	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3454	FREE_LOCK(&lk);
3455	vput(vp);
3456}
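
/*
 * Editorial sketch (hypothetical counts): for "rmdir a/b" on an
 * empty directory b, the arithmetic above works out as follows.
 * b starts with i_nlink == 2 (its entry in a plus "."), so the
 * "ip->i_nlink -= 2" removes both at once.  The reference that
 * b's ".." holds on a is not dropped here; the dirrem is requeued
 * with dm_oldinum set to the parent so that a's link count is only
 * decremented after the truncation of b is safely on disk.
 */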
3457
3458/*
3459 * Inode de-allocation dependencies.
3460 *
3461 * When an inode's link count is reduced to zero, it can be de-allocated. We
3462 * found it convenient to postpone de-allocation until after the inode is
3463 * written to disk with its new link count (zero).  At this point, all of the
3464 * on-disk inode's block pointers are nullified and, with careful dependency
3465 * list ordering, all dependencies related to the inode will be satisfied and
3466 * the corresponding dependency structures de-allocated.  So, if/when the
3467 * inode is reused, there will be no mixing of old dependencies with new
3468 * ones.  This artificial dependency is set up by the block de-allocation
3469 * procedure above (softdep_setup_freeblocks) and completed by the
3470 * following procedure.
3471 */
3472static void
3473handle_workitem_freefile(freefile)
3474	struct freefile *freefile;
3475{
3476	struct fs *fs;
3477	struct inodedep *idp;
3478	struct ufsmount *ump;
3479	int error;
3480
3481	ump = VFSTOUFS(freefile->fx_mnt);
3482	fs = ump->um_fs;
3483#ifdef DEBUG
3484	ACQUIRE_LOCK(&lk);
3485	error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
3486	FREE_LOCK(&lk);
3487	if (error)
3488		panic("handle_workitem_freefile: inodedep survived");
3489#endif
3490	UFS_LOCK(ump);
3491	fs->fs_pendinginodes -= 1;
3492	UFS_UNLOCK(ump);
3493	if ((error = ffs_freefile(VFSTOUFS(freefile->fx_mnt), fs,
3494	    freefile->fx_devvp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
3495		softdep_error("handle_workitem_freefile", error);
3496	WORKITEM_FREE(freefile, D_FREEFILE);
3497}
3498
3499/*
3500 * Disk writes.
3501 *
3502 * The dependency structures constructed above are most actively used when file
3503 * system blocks are written to disk.  No constraints are placed on when a
3504 * block can be written, but unsatisfied update dependencies are made safe by
3505 * modifying (or replacing) the source memory for the duration of the disk
3506 * write.  When the disk write completes, the memory block is again brought
3507 * up-to-date.
3508 *
3509 * In-core inode structure reclamation.
3510 *
3511 * Because there are a finite number of "in-core" inode structures, they are
3512 * reused regularly.  By transferring all inode-related dependencies to the
3513 * in-memory inode block and indexing them separately (via "inodedep"s), we
3514 * can allow "in-core" inode structures to be reused at any time and avoid
3515 * any increase in contention.
3516 *
3517 * Called just before entering the device driver to initiate a new disk I/O.
3518 * The buffer must be locked, thus, no I/O completion operations can occur
3519 * while we are manipulating its associated dependencies.
3520 */
3521static void
3522softdep_disk_io_initiation(bp)
3523	struct buf *bp;		/* structure describing disk write to occur */
3524{
3525	struct worklist *wk, *nextwk;
3526	struct indirdep *indirdep;
3527	struct inodedep *inodedep;
3528
3529	/*
3530	 * We only care about write operations. There should never
3531	 * be dependencies for reads.
3532	 */
3533	if (bp->b_iocmd != BIO_WRITE)
3534		panic("softdep_disk_io_initiation: not write");
3535	ACQUIRE_LOCK(&lk);
3536	/*
3537	 * Do any necessary pre-I/O processing.
3538	 */
3539	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, nextwk) {
3540		switch (wk->wk_type) {
3541
3542		case D_PAGEDEP:
3543			initiate_write_filepage(WK_PAGEDEP(wk), bp);
3544			continue;
3545
3546		case D_INODEDEP:
3547			inodedep = WK_INODEDEP(wk);
3548			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3549				initiate_write_inodeblock_ufs1(inodedep, bp);
3550			else
3551				initiate_write_inodeblock_ufs2(inodedep, bp);
3552			continue;
3553
3554		case D_INDIRDEP:
3555			indirdep = WK_INDIRDEP(wk);
3556			if (indirdep->ir_state & GOINGAWAY)
3557				panic("disk_io_initiation: indirdep gone");
3558			/*
3559			 * If there are no remaining dependencies, this
3560			 * will be writing the real pointers, so the
3561			 * dependency can be freed.
3562			 */
3563			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3564				struct buf *bp;
3565
3566				bp = indirdep->ir_savebp;
3567				bp->b_flags |= B_INVAL | B_NOCACHE;
3568				/* inline expand WORKLIST_REMOVE(wk); */
3569				wk->wk_state &= ~ONWORKLIST;
3570				LIST_REMOVE(wk, wk_list);
3571				WORKITEM_FREE(indirdep, D_INDIRDEP);
3572				FREE_LOCK(&lk);
3573				brelse(bp);
3574				ACQUIRE_LOCK(&lk);
3575				continue;
3576			}
3577			/*
3578			 * Replace up-to-date version with safe version.
3579			 */
3580			FREE_LOCK(&lk);
3581			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3582			    M_INDIRDEP, M_SOFTDEP_FLAGS);
3583			ACQUIRE_LOCK(&lk);
3584			indirdep->ir_state &= ~ATTACHED;
3585			indirdep->ir_state |= UNDONE;
3586			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3587			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3588			    bp->b_bcount);
3589			continue;
3590
3591		case D_MKDIR:
3592		case D_BMSAFEMAP:
3593		case D_ALLOCDIRECT:
3594		case D_ALLOCINDIR:
3595			continue;
3596
3597		default:
3598			panic("handle_disk_io_initiation: Unexpected type %s",
3599			    TYPENAME(wk->wk_type));
3600			/* NOTREACHED */
3601		}
3602	}
3603	FREE_LOCK(&lk);
3604}
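
/*
 * Editorial sketch: the D_INDIRDEP case above is a save-and-rollback
 * in miniature.  With bp the indirect block being written:
 *
 *	bcopy(bp->b_data, ir_saveddata, bp->b_bcount);	stash new ptrs
 *	bcopy(ir_savebp->b_data, bp->b_data, ...);	write safe copy
 *
 * softdep_disk_write_complete() performs the inverse bcopy from
 * ir_saveddata once the write finishes, so readers of the buffer
 * always see the up-to-date pointers between writes.
 */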
3605
3606/*
3607 * Called from within the procedure above to deal with unsatisfied
3608 * allocation dependencies in a directory. The buffer must be locked,
3609 * thus, no I/O completion operations can occur while we are
3610 * manipulating its associated dependencies.
3611 */
3612static void
3613initiate_write_filepage(pagedep, bp)
3614	struct pagedep *pagedep;
3615	struct buf *bp;
3616{
3617	struct diradd *dap;
3618	struct direct *ep;
3619	int i;
3620
3621	if (pagedep->pd_state & IOSTARTED) {
3622		/*
3623		 * This can only happen if there is a driver that does not
3624		 * understand chaining. Here biodone will reissue the call
3625		 * to strategy for the incomplete buffers.
3626		 */
3627		printf("initiate_write_filepage: already started\n");
3628		return;
3629	}
3630	pagedep->pd_state |= IOSTARTED;
3631	for (i = 0; i < DAHASHSZ; i++) {
3632		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3633			ep = (struct direct *)
3634			    ((char *)bp->b_data + dap->da_offset);
3635			if (ep->d_ino != dap->da_newinum)
3636				panic("%s: dir inum %d != new %d",
3637				    "initiate_write_filepage",
3638				    ep->d_ino, dap->da_newinum);
3639			if (dap->da_state & DIRCHG)
3640				ep->d_ino = dap->da_previous->dm_oldinum;
3641			else
3642				ep->d_ino = 0;
3643			dap->da_state &= ~ATTACHED;
3644			dap->da_state |= UNDONE;
3645		}
3646	}
3647}
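
/*
 * Editorial sketch (hypothetical inode numbers): for an uncommitted
 * entry whose new inode is 1234, the rollback above changes only the
 * image that goes to disk:
 *
 *	in memory:	ep->d_ino == 1234
 *	on disk:	ep->d_ino == 0 for a create, or
 *			ep->d_ino == da_previous->dm_oldinum for a
 *			DIRCHG-style rename
 *
 * handle_written_filepage() rolls d_ino forward to da_newinum again
 * once the write has completed.
 */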
3648
3649/*
3650 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
3651 * Note that any bug fixes made to this routine must be done in the
3652 * version found below.
3653 *
3654 * Called from within the procedure above to deal with unsatisfied
3655 * allocation dependencies in an inodeblock. The buffer must be
3656 * locked, thus, no I/O completion operations can occur while we
3657 * are manipulating its associated dependencies.
3658 */
3659static void
3660initiate_write_inodeblock_ufs1(inodedep, bp)
3661	struct inodedep *inodedep;
3662	struct buf *bp;			/* The inode block */
3663{
3664	struct allocdirect *adp, *lastadp;
3665	struct ufs1_dinode *dp;
3666	struct fs *fs;
3667	ufs_lbn_t i, prevlbn = 0;
3668	int deplist;
3669
3670	if (inodedep->id_state & IOSTARTED)
3671		panic("initiate_write_inodeblock_ufs1: already started");
3672	inodedep->id_state |= IOSTARTED;
3673	fs = inodedep->id_fs;
3674	dp = (struct ufs1_dinode *)bp->b_data +
3675	    ino_to_fsbo(fs, inodedep->id_ino);
3676	/*
3677	 * If the bitmap is not yet written, then the allocated
3678	 * inode cannot be written to disk.
3679	 */
3680	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3681		if (inodedep->id_savedino1 != NULL)
3682			panic("initiate_write_inodeblock_ufs1: I/O underway");
3683		FREE_LOCK(&lk);
3684		MALLOC(inodedep->id_savedino1, struct ufs1_dinode *,
3685		    sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
3686		ACQUIRE_LOCK(&lk);
3687		*inodedep->id_savedino1 = *dp;
3688		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3689		return;
3690	}
3691	/*
3692	 * If no dependencies, then there is nothing to roll back.
3693	 */
3694	inodedep->id_savedsize = dp->di_size;
3695	inodedep->id_savedextsize = 0;
3696	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3697		return;
3698	/*
3699	 * Set the dependencies to busy.
3700	 */
3701	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3702	     adp = TAILQ_NEXT(adp, ad_next)) {
3703#ifdef DIAGNOSTIC
3704		if (deplist != 0 && prevlbn >= adp->ad_lbn)
3705			panic("softdep_write_inodeblock: lbn order");
3706		prevlbn = adp->ad_lbn;
3707		if (adp->ad_lbn < NDADDR &&
3708		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
3709			panic("%s: direct pointer #%jd mismatch %d != %jd",
3710			    "softdep_write_inodeblock",
3711			    (intmax_t)adp->ad_lbn,
3712			    dp->di_db[adp->ad_lbn],
3713			    (intmax_t)adp->ad_newblkno);
3714		if (adp->ad_lbn >= NDADDR &&
3715		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
3716			panic("%s: indirect pointer #%jd mismatch %d != %jd",
3717			    "softdep_write_inodeblock",
3718			    (intmax_t)adp->ad_lbn - NDADDR,
3719			    dp->di_ib[adp->ad_lbn - NDADDR],
3720			    (intmax_t)adp->ad_newblkno);
3721		deplist |= 1 << adp->ad_lbn;
3722		if ((adp->ad_state & ATTACHED) == 0)
3723			panic("softdep_write_inodeblock: Unknown state 0x%x",
3724			    adp->ad_state);
3725#endif /* DIAGNOSTIC */
3726		adp->ad_state &= ~ATTACHED;
3727		adp->ad_state |= UNDONE;
3728	}
3729	/*
3730	 * The on-disk inode cannot claim to be any larger than the last
3731	 * fragment that has been written. Otherwise, the on-disk inode
3732	 * might have fragments that were not the last block in the file
3733	 * which would corrupt the filesystem.
3734	 */
3735	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3736	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3737		if (adp->ad_lbn >= NDADDR)
3738			break;
3739		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3740		/* keep going until hitting a rollback to a frag */
3741		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3742			continue;
3743		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3744		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3745#ifdef DIAGNOSTIC
3746			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
3747				panic("softdep_write_inodeblock: lost dep1");
3748#endif /* DIAGNOSTIC */
3749			dp->di_db[i] = 0;
3750		}
3751		for (i = 0; i < NIADDR; i++) {
3752#ifdef DIAGNOSTIC
3753			if (dp->di_ib[i] != 0 &&
3754			    (deplist & ((1 << NDADDR) << i)) == 0)
3755				panic("softdep_write_inodeblock: lost dep2");
3756#endif /* DIAGNOSTIC */
3757			dp->di_ib[i] = 0;
3758		}
3759		return;
3760	}
3761	/*
3762	 * If we have zero'ed out the last allocated block of the file,
3763	 * roll back the size to the last currently allocated block.
3764	 * We know that this last allocated block is full-sized, as
3765	 * we already checked for fragments in the loop above.
3766	 */
3767	if (lastadp != NULL &&
3768	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3769		for (i = lastadp->ad_lbn; i >= 0; i--)
3770			if (dp->di_db[i] != 0)
3771				break;
3772		dp->di_size = (i + 1) * fs->fs_bsize;
3773	}
3774	/*
3775	 * The only dependencies are for indirect blocks.
3776	 *
3777	 * The file size for indirect block additions is not guaranteed.
3778	 * Such a guarantee would be non-trivial to achieve. The conventional
3779	 * synchronous write implementation also does not make this guarantee.
3780	 * Fsck should catch and fix discrepancies. Arguably, the file size
3781	 * can be over-estimated without destroying integrity when the file
3782	 * moves into the indirect blocks (i.e., is large). If we want to
3783	 * postpone fsck, we are stuck with this argument.
3784	 */
3785	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3786		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3787}
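
/*
 * Editorial note: the DIAGNOSTIC "deplist" above is a bitmask keyed
 * by logical block number, with direct blocks in the low NDADDR bits
 * and indirect blocks above them:
 *
 *	deplist |= 1 << adp->ad_lbn;		direct block lbn
 *	deplist & ((1 << NDADDR) << i)		indirect block i
 *
 * With NDADDR == 12 and NIADDR == 3 this fits comfortably in the
 * 32 bits of an int.
 */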
3788
3789/*
3790 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
3791 * Note that any bug fixes made to this routine must be done in the
3792 * version found above.
3793 *
3794 * Called from within the procedure above to deal with unsatisfied
3795 * allocation dependencies in an inodeblock. The buffer must be
3796 * locked, thus, no I/O completion operations can occur while we
3797 * are manipulating its associated dependencies.
3798 */
3799static void
3800initiate_write_inodeblock_ufs2(inodedep, bp)
3801	struct inodedep *inodedep;
3802	struct buf *bp;			/* The inode block */
3803{
3804	struct allocdirect *adp, *lastadp;
3805	struct ufs2_dinode *dp;
3806	struct fs *fs;
3807	ufs_lbn_t i, prevlbn = 0;
3808	int deplist;
3809
3810	if (inodedep->id_state & IOSTARTED)
3811		panic("initiate_write_inodeblock_ufs2: already started");
3812	inodedep->id_state |= IOSTARTED;
3813	fs = inodedep->id_fs;
3814	dp = (struct ufs2_dinode *)bp->b_data +
3815	    ino_to_fsbo(fs, inodedep->id_ino);
3816	/*
3817	 * If the bitmap is not yet written, then the allocated
3818	 * inode cannot be written to disk.
3819	 */
3820	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3821		if (inodedep->id_savedino2 != NULL)
3822			panic("initiate_write_inodeblock_ufs2: I/O underway");
3823		FREE_LOCK(&lk);
3824		MALLOC(inodedep->id_savedino2, struct ufs2_dinode *,
3825		    sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
3826		ACQUIRE_LOCK(&lk);
3827		*inodedep->id_savedino2 = *dp;
3828		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
3829		return;
3830	}
3831	/*
3832	 * If no dependencies, then there is nothing to roll back.
3833	 */
3834	inodedep->id_savedsize = dp->di_size;
3835	inodedep->id_savedextsize = dp->di_extsize;
3836	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
3837	    TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
3838		return;
3839	/*
3840	 * Set the ext data dependencies to busy.
3841	 */
3842	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3843	     adp = TAILQ_NEXT(adp, ad_next)) {
3844#ifdef DIAGNOSTIC
3845		if (deplist != 0 && prevlbn >= adp->ad_lbn)
3846			panic("softdep_write_inodeblock: lbn order");
3847		prevlbn = adp->ad_lbn;
3848		if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
3849			panic("%s: direct pointer #%jd mismatch %jd != %jd",
3850			    "softdep_write_inodeblock",
3851			    (intmax_t)adp->ad_lbn,
3852			    (intmax_t)dp->di_extb[adp->ad_lbn],
3853			    (intmax_t)adp->ad_newblkno);
3854		deplist |= 1 << adp->ad_lbn;
3855		if ((adp->ad_state & ATTACHED) == 0)
3856			panic("softdep_write_inodeblock: Unknown state 0x%x",
3857			    adp->ad_state);
3858#endif /* DIAGNOSTIC */
3859		adp->ad_state &= ~ATTACHED;
3860		adp->ad_state |= UNDONE;
3861	}
3862	/*
3863	 * The on-disk inode cannot claim to be any larger than the last
3864	 * fragment that has been written. Otherwise, the on-disk inode
3865	 * might have fragments that were not the last block in the ext
3866	 * data which would corrupt the filesystem.
3867	 */
3868	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3869	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3870		dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
3871		/* keep going until hitting a rollback to a frag */
3872		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3873			continue;
3874		dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3875		for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
3876#ifdef DIAGNOSTIC
3877			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
3878				panic("softdep_write_inodeblock: lost dep1");
3879#endif /* DIAGNOSTIC */
3880			dp->di_extb[i] = 0;
3881		}
3882		lastadp = NULL;
3883		break;
3884	}
3885	/*
3886	 * If we have zero'ed out the last allocated block of the ext
3887	 * data, roll back the size to the last currently allocated block.
3888	 * We know that this last allocated block is full-sized, as
3889	 * we already checked for fragments in the loop above.
3890	 */
3891	if (lastadp != NULL &&
3892	    dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3893		for (i = lastadp->ad_lbn; i >= 0; i--)
3894			if (dp->di_extb[i] != 0)
3895				break;
3896		dp->di_extsize = (i + 1) * fs->fs_bsize;
3897	}
3898	/*
3899	 * Set the file data dependencies to busy.
3900	 */
3901	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3902	     adp = TAILQ_NEXT(adp, ad_next)) {
3903#ifdef DIAGNOSTIC
3904		if (deplist != 0 && prevlbn >= adp->ad_lbn)
3905			panic("softdep_write_inodeblock: lbn order");
3906		prevlbn = adp->ad_lbn;
3907		if (adp->ad_lbn < NDADDR &&
3908		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
3909			panic("%s: direct pointer #%jd mismatch %jd != %jd",
3910			    "softdep_write_inodeblock",
3911			    (intmax_t)adp->ad_lbn,
3912			    (intmax_t)dp->di_db[adp->ad_lbn],
3913			    (intmax_t)adp->ad_newblkno);
3914		if (adp->ad_lbn >= NDADDR &&
3915		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
3916			panic("%s indirect pointer #%jd mismatch %jd != %jd",
3917			    "softdep_write_inodeblock:",
3918			    (intmax_t)adp->ad_lbn - NDADDR,
3919			    (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
3920			    (intmax_t)adp->ad_newblkno);
3921		deplist |= 1 << adp->ad_lbn;
3922		if ((adp->ad_state & ATTACHED) == 0)
3923			panic("softdep_write_inodeblock: Unknown state 0x%x",
3924			    adp->ad_state);
3925#endif /* DIAGNOSTIC */
3926		adp->ad_state &= ~ATTACHED;
3927		adp->ad_state |= UNDONE;
3928	}
3929	/*
3930	 * The on-disk inode cannot claim to be any larger than the last
3931	 * fragment that has been written. Otherwise, the on-disk inode
3932	 * might have fragments that were not the last block in the file
3933	 * which would corrupt the filesystem.
3934	 */
3935	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3936	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3937		if (adp->ad_lbn >= NDADDR)
3938			break;
3939		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3940		/* keep going until hitting a rollback to a frag */
3941		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3942			continue;
3943		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3944		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3945#ifdef DIAGNOSTIC
3946			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
3947				panic("softdep_write_inodeblock: lost dep2");
3948#endif /* DIAGNOSTIC */
3949			dp->di_db[i] = 0;
3950		}
3951		for (i = 0; i < NIADDR; i++) {
3952#ifdef DIAGNOSTIC
3953			if (dp->di_ib[i] != 0 &&
3954			    (deplist & ((1 << NDADDR) << i)) == 0)
3955				panic("softdep_write_inodeblock: lost dep3");
3956#endif /* DIAGNOSTIC */
3957			dp->di_ib[i] = 0;
3958		}
3959		return;
3960	}
3961	/*
3962	 * If we have zero'ed out the last allocated block of the file,
3963	 * roll back the size to the last currently allocated block.
3964	 * We know that this last allocated block is full-sized, as
3965	 * we already checked for fragments in the loop above.
3966	 */
3967	if (lastadp != NULL &&
3968	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3969		for (i = lastadp->ad_lbn; i >= 0; i--)
3970			if (dp->di_db[i] != 0)
3971				break;
3972		dp->di_size = (i + 1) * fs->fs_bsize;
3973	}
3974	/*
3975	 * The only dependencies are for indirect blocks.
3976	 *
3977	 * The file size for indirect block additions is not guaranteed.
3978	 * Such a guarantee would be non-trivial to achieve. The conventional
3979	 * synchronous write implementation also does not make this guarantee.
3980	 * Fsck should catch and fix discrepancies. Arguably, the file size
3981	 * can be over-estimated without destroying integrity when the file
3982	 * moves into the indirect blocks (i.e., is large). If we want to
3983	 * postpone fsck, we are stuck with this argument.
3984	 */
3985	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3986		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3987}
3988
3989/*
3990 * This routine is called during the completion interrupt
3991 * service routine for a disk write (from the procedure called
3992 * by the device driver to inform the filesystem caches of
3993 * a request completion).  It should be called early in this
3994 * procedure, before the block is made available to other
3995 * processes or other routines are called.
3996 */
3997static void
3998softdep_disk_write_complete(bp)
3999	struct buf *bp;		/* describes the completed disk write */
4000{
4001	struct worklist *wk;
4002	struct worklist *owk;
4003	struct workhead reattach;
4004	struct newblk *newblk;
4005	struct allocindir *aip;
4006	struct allocdirect *adp;
4007	struct indirdep *indirdep;
4008	struct inodedep *inodedep;
4009	struct bmsafemap *bmsafemap;
4010
4011	/*
4012	 * If an error occurred while doing the write, then the data
4013	 * has not hit the disk and the dependencies cannot be unrolled.
4014	 */
4015	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
4016		return;
4017	LIST_INIT(&reattach);
4018	/*
4019	 * This lock must not be released anywhere in this code segment.
4020	 */
4021	ACQUIRE_LOCK(&lk);
4022	owk = NULL;
4023	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
4024		WORKLIST_REMOVE(wk);
4025		if (wk == owk)
4026			panic("duplicate worklist: %p\n", wk);
4027		owk = wk;
4028		switch (wk->wk_type) {
4029
4030		case D_PAGEDEP:
4031			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
4032				WORKLIST_INSERT(&reattach, wk);
4033			continue;
4034
4035		case D_INODEDEP:
4036			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
4037				WORKLIST_INSERT(&reattach, wk);
4038			continue;
4039
4040		case D_BMSAFEMAP:
4041			bmsafemap = WK_BMSAFEMAP(wk);
4042			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
4043				newblk->nb_state |= DEPCOMPLETE;
4044				newblk->nb_bmsafemap = NULL;
4045				LIST_REMOVE(newblk, nb_deps);
4046			}
4047			while ((adp =
4048			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
4049				adp->ad_state |= DEPCOMPLETE;
4050				adp->ad_buf = NULL;
4051				LIST_REMOVE(adp, ad_deps);
4052				handle_allocdirect_partdone(adp);
4053			}
4054			while ((aip =
4055			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
4056				aip->ai_state |= DEPCOMPLETE;
4057				aip->ai_buf = NULL;
4058				LIST_REMOVE(aip, ai_deps);
4059				handle_allocindir_partdone(aip);
4060			}
4061			while ((inodedep =
4062			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
4063				inodedep->id_state |= DEPCOMPLETE;
4064				LIST_REMOVE(inodedep, id_deps);
4065				inodedep->id_buf = NULL;
4066			}
4067			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4068			continue;
4069
4070		case D_MKDIR:
4071			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
4072			continue;
4073
4074		case D_ALLOCDIRECT:
4075			adp = WK_ALLOCDIRECT(wk);
4076			adp->ad_state |= COMPLETE;
4077			handle_allocdirect_partdone(adp);
4078			continue;
4079
4080		case D_ALLOCINDIR:
4081			aip = WK_ALLOCINDIR(wk);
4082			aip->ai_state |= COMPLETE;
4083			handle_allocindir_partdone(aip);
4084			continue;
4085
4086		case D_INDIRDEP:
4087			indirdep = WK_INDIRDEP(wk);
4088			if (indirdep->ir_state & GOINGAWAY)
4089				panic("disk_write_complete: indirdep gone");
4090			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
4091			FREE(indirdep->ir_saveddata, M_INDIRDEP);
4092			indirdep->ir_saveddata = 0;
4093			indirdep->ir_state &= ~UNDONE;
4094			indirdep->ir_state |= ATTACHED;
4095			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
4096				handle_allocindir_partdone(aip);
4097				if (aip == LIST_FIRST(&indirdep->ir_donehd))
4098					panic("disk_write_complete: not gone");
4099			}
4100			WORKLIST_INSERT(&reattach, wk);
4101			if ((bp->b_flags & B_DELWRI) == 0)
4102				stat_indir_blk_ptrs++;
4103			bdirty(bp);
4104			continue;
4105
4106		default:
4107			panic("handle_disk_write_complete: Unknown type %s",
4108			    TYPENAME(wk->wk_type));
4109			/* NOTREACHED */
4110		}
4111	}
4112	/*
4113	 * Reattach any requests that must be redone.
4114	 */
4115	while ((wk = LIST_FIRST(&reattach)) != NULL) {
4116		WORKLIST_REMOVE(wk);
4117		WORKLIST_INSERT(&bp->b_dep, wk);
4118	}
4119	FREE_LOCK(&lk);
4120}
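
/*
 * Editorial note: the "reattach" list above is a retry protocol.  A
 * handler that still has unresolved dependencies leaves its rollback
 * in place, marks the buffer dirty again (bdirty), and asks for its
 * work item to be put back on bp->b_dep, so the same processing is
 * redone when the buffer is next written.
 */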
4121
4122/*
4123 * Called from within softdep_disk_write_complete above. Note that
4124 * this routine is always called from interrupt level with further
4125 * splbio interrupts blocked.
4126 */
4127static void
4128handle_allocdirect_partdone(adp)
4129	struct allocdirect *adp;	/* the completed allocdirect */
4130{
4131	struct allocdirectlst *listhead;
4132	struct allocdirect *listadp;
4133	struct inodedep *inodedep;
4134	long bsize, delay;
4135
4136	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4137		return;
4138	if (adp->ad_buf != NULL)
4139		panic("handle_allocdirect_partdone: dangling dep");
4140	/*
4141	 * The on-disk inode cannot claim to be any larger than the last
4142	 * fragment that has been written. Otherwise, the on-disk inode
4143	 * might have fragments that were not the last block in the file
4144	 * which would corrupt the filesystem. Thus, we cannot free any
4145	 * allocdirects after one whose ad_oldblkno claims a fragment as
4146	 * these blocks must be rolled back to zero before writing the inode.
4147	 * We check the currently active set of allocdirects in id_inoupdt
4148	 * or id_extupdt as appropriate.
4149	 */
4150	inodedep = adp->ad_inodedep;
4151	bsize = inodedep->id_fs->fs_bsize;
4152	if (adp->ad_state & EXTDATA)
4153		listhead = &inodedep->id_extupdt;
4154	else
4155		listhead = &inodedep->id_inoupdt;
4156	TAILQ_FOREACH(listadp, listhead, ad_next) {
4157		/* found our block */
4158		if (listadp == adp)
4159			break;
4160		/* continue if the old block is not a fragment */
4161		if (listadp->ad_oldsize == 0 ||
4162		    listadp->ad_oldsize == bsize)
4163			continue;
4164		/* hit a fragment */
4165		return;
4166	}
4167	/*
4168	 * If we have reached the end of the current list without
4169	 * finding the just finished dependency, then it must be
4170	 * on the future dependency list. Future dependencies cannot
4171	 * be freed until they are moved to the current list.
4172	 */
4173	if (listadp == NULL) {
4174#ifdef DEBUG
4175		if (adp->ad_state & EXTDATA)
4176			listhead = &inodedep->id_newextupdt;
4177		else
4178			listhead = &inodedep->id_newinoupdt;
4179		TAILQ_FOREACH(listadp, listhead, ad_next)
4180			/* found our block */
4181			if (listadp == adp)
4182				break;
4183		if (listadp == NULL)
4184			panic("handle_allocdirect_partdone: lost dep");
4185#endif /* DEBUG */
4186		return;
4187	}
4188	/*
4189	 * If we have found the just finished dependency, then free
4190	 * it along with anything that follows it that is complete.
4191	 * If the inode still has a bitmap dependency, then it has
4192	 * never been written to disk, hence the on-disk inode cannot
4193	 * reference the old fragment so we can free it without delay.
4194	 */
4195	delay = (inodedep->id_state & DEPCOMPLETE);
4196	for (; adp; adp = listadp) {
4197		listadp = TAILQ_NEXT(adp, ad_next);
4198		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4199			return;
4200		free_allocdirect(listhead, adp, delay);
4201	}
4202}
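
/*
 * Editorial sketch (hypothetical sizes, fs_bsize == 8192): applied
 * to an id_inoupdt list sorted by lbn, the fragment rule above
 * behaves like this:
 *
 *	lbn 0	ad_oldsize 8192 (full block)	scan continues
 *	lbn 1	ad_oldsize 2048 (fragment)	scan stops here
 *	lbn 2	ad_oldsize 8192 (full block)	cannot be freed yet
 *
 * A completed allocdirect at lbn 2 must wait until the fragment
 * rollback at lbn 1 is resolved, even though its own write is done.
 */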
4203
4204/*
4205 * Called from within softdep_disk_write_complete above. Note that
4206 * this routine is always called from interrupt level with further
4207 * splbio interrupts blocked.
4208 */
4209static void
4210handle_allocindir_partdone(aip)
4211	struct allocindir *aip;		/* the completed allocindir */
4212{
4213	struct indirdep *indirdep;
4214
4215	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4216		return;
4217	if (aip->ai_buf != NULL)
4218		panic("handle_allocindir_partdone: dangling dependency");
4219	indirdep = aip->ai_indirdep;
4220	if (indirdep->ir_state & UNDONE) {
4221		LIST_REMOVE(aip, ai_next);
4222		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4223		return;
4224	}
4225	if (indirdep->ir_state & UFS1FMT)
4226		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4227		    aip->ai_newblkno;
4228	else
4229		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4230		    aip->ai_newblkno;
4231	LIST_REMOVE(aip, ai_next);
4232	if (aip->ai_freefrag != NULL)
4233		add_to_worklist(&aip->ai_freefrag->ff_list);
4234	WORKITEM_FREE(aip, D_ALLOCINDIR);
4235}
4236
4237/*
4238 * Called from within softdep_disk_write_complete above to restore
4239 * in-memory inode block contents to their most up-to-date state. Note
4240 * that this routine is always called from interrupt level with further
4241 * splbio interrupts blocked.
4242 */
4243static int
4244handle_written_inodeblock(inodedep, bp)
4245	struct inodedep *inodedep;
4246	struct buf *bp;		/* buffer containing the inode block */
4247{
4248	struct worklist *wk, *filefree;
4249	struct allocdirect *adp, *nextadp;
4250	struct ufs1_dinode *dp1 = NULL;
4251	struct ufs2_dinode *dp2 = NULL;
4252	int hadchanges, fstype;
4253
4254	if ((inodedep->id_state & IOSTARTED) == 0)
4255		panic("handle_written_inodeblock: not started");
4256	inodedep->id_state &= ~IOSTARTED;
4257	inodedep->id_state |= COMPLETE;
4258	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4259		fstype = UFS1;
4260		dp1 = (struct ufs1_dinode *)bp->b_data +
4261		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4262	} else {
4263		fstype = UFS2;
4264		dp2 = (struct ufs2_dinode *)bp->b_data +
4265		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4266	}
4267	/*
4268	 * If we had to rollback the inode allocation because of
4269	 * bitmaps being incomplete, then simply restore it.
4270	 * Keep the block dirty so that it will not be reclaimed until
4271	 * all associated dependencies have been cleared and the
4272	 * corresponding updates written to disk.
4273	 */
4274	if (inodedep->id_savedino1 != NULL) {
4275		if (fstype == UFS1)
4276			*dp1 = *inodedep->id_savedino1;
4277		else
4278			*dp2 = *inodedep->id_savedino2;
4279		FREE(inodedep->id_savedino1, M_SAVEDINO);
4280		inodedep->id_savedino1 = NULL;
4281		if ((bp->b_flags & B_DELWRI) == 0)
4282			stat_inode_bitmap++;
4283		bdirty(bp);
4284		return (1);
4285	}
4286	/*
4287	 * Roll forward anything that had to be rolled back before
4288	 * the inode could be updated.
4289	 */
4290	hadchanges = 0;
4291	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4292		nextadp = TAILQ_NEXT(adp, ad_next);
4293		if (adp->ad_state & ATTACHED)
4294			panic("handle_written_inodeblock: new entry");
4295		if (fstype == UFS1) {
4296			if (adp->ad_lbn < NDADDR) {
4297				if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4298					panic("%s %s #%jd mismatch %d != %jd",
4299					    "handle_written_inodeblock:",
4300					    "direct pointer",
4301					    (intmax_t)adp->ad_lbn,
4302					    dp1->di_db[adp->ad_lbn],
4303					    (intmax_t)adp->ad_oldblkno);
4304				dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4305			} else {
4306				if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4307					panic("%s: %s #%jd allocated as %d",
4308					    "handle_written_inodeblock",
4309					    "indirect pointer",
4310					    (intmax_t)adp->ad_lbn - NDADDR,
4311					    dp1->di_ib[adp->ad_lbn - NDADDR]);
4312				dp1->di_ib[adp->ad_lbn - NDADDR] =
4313				    adp->ad_newblkno;
4314			}
4315		} else {
4316			if (adp->ad_lbn < NDADDR) {
4317				if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4318					panic("%s: %s #%jd %s %jd != %jd",
4319					    "handle_written_inodeblock",
4320					    "direct pointer",
4321					    (intmax_t)adp->ad_lbn, "mismatch",
4322					    (intmax_t)dp2->di_db[adp->ad_lbn],
4323					    (intmax_t)adp->ad_oldblkno);
4324				dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4325			} else {
4326				if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4327					panic("%s: %s #%jd allocated as %jd",
4328					    "handle_written_inodeblock",
4329					    "indirect pointer",
4330					    (intmax_t)adp->ad_lbn - NDADDR,
4331					    (intmax_t)
4332					    dp2->di_ib[adp->ad_lbn - NDADDR]);
4333				dp2->di_ib[adp->ad_lbn - NDADDR] =
4334				    adp->ad_newblkno;
4335			}
4336		}
4337		adp->ad_state &= ~UNDONE;
4338		adp->ad_state |= ATTACHED;
4339		hadchanges = 1;
4340	}
4341	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
4342		nextadp = TAILQ_NEXT(adp, ad_next);
4343		if (adp->ad_state & ATTACHED)
4344			panic("handle_written_inodeblock: new entry");
4345		if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
4346			panic("%s: direct pointers #%jd %s %jd != %jd",
4347			    "handle_written_inodeblock",
4348			    (intmax_t)adp->ad_lbn, "mismatch",
4349			    (intmax_t)dp2->di_extb[adp->ad_lbn],
4350			    (intmax_t)adp->ad_oldblkno);
4351		dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
4352		adp->ad_state &= ~UNDONE;
4353		adp->ad_state |= ATTACHED;
4354		hadchanges = 1;
4355	}
4356	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4357		stat_direct_blk_ptrs++;
4358	/*
4359	 * Reset the file size to its most up-to-date value.
4360	 */
4361	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
4362		panic("handle_written_inodeblock: bad size");
4363	if (fstype == UFS1) {
4364		if (dp1->di_size != inodedep->id_savedsize) {
4365			dp1->di_size = inodedep->id_savedsize;
4366			hadchanges = 1;
4367		}
4368	} else {
4369		if (dp2->di_size != inodedep->id_savedsize) {
4370			dp2->di_size = inodedep->id_savedsize;
4371			hadchanges = 1;
4372		}
4373		if (dp2->di_extsize != inodedep->id_savedextsize) {
4374			dp2->di_extsize = inodedep->id_savedextsize;
4375			hadchanges = 1;
4376		}
4377	}
4378	inodedep->id_savedsize = -1;
4379	inodedep->id_savedextsize = -1;
4380	/*
4381	 * If there were any rollbacks in the inode block, then it must be
4382	 * marked dirty so that it will eventually get written back in
4383	 * its correct form.
4384	 */
4385	if (hadchanges)
4386		bdirty(bp);
4387	/*
4388	 * Process any allocdirects that completed during the update.
4389	 */
4390	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4391		handle_allocdirect_partdone(adp);
4392	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
4393		handle_allocdirect_partdone(adp);
4394	/*
4395	 * Process deallocations that were held pending until the
4396	 * inode had been written to disk. Freeing of the inode
4397	 * is delayed until after all blocks have been freed to
4398	 * avoid creation of new <vfsid, inum, lbn> triples
4399	 * before the old ones have been deleted.
4400	 */
4401	filefree = NULL;
4402	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4403		WORKLIST_REMOVE(wk);
4404		switch (wk->wk_type) {
4405
4406		case D_FREEFILE:
4407			/*
4408			 * We defer adding filefree to the worklist until
4409			 * all other additions have been made to ensure
4410			 * that it will be done after all the old blocks
4411			 * have been freed.
4412			 */
4413			if (filefree != NULL)
4414				panic("handle_written_inodeblock: filefree");
4415			filefree = wk;
4416			continue;
4417
4418		case D_MKDIR:
4419			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4420			continue;
4421
4422		case D_DIRADD:
4423			diradd_inode_written(WK_DIRADD(wk), inodedep);
4424			continue;
4425
4426		case D_FREEBLKS:
4427		case D_FREEFRAG:
4428		case D_DIRREM:
4429			add_to_worklist(wk);
4430			continue;
4431
4432		case D_NEWDIRBLK:
4433			free_newdirblk(WK_NEWDIRBLK(wk));
4434			continue;
4435
4436		default:
4437			panic("handle_written_inodeblock: Unknown type %s",
4438			    TYPENAME(wk->wk_type));
4439			/* NOTREACHED */
4440		}
4441	}
4442	if (filefree != NULL) {
4443		if (free_inodedep(inodedep) == 0)
4444			panic("handle_written_inodeblock: live inodedep");
4445		add_to_worklist(filefree);
4446		return (0);
4447	}
4448
4449	/*
4450	 * If no outstanding dependencies, free it.
4451	 */
4452	if (free_inodedep(inodedep) ||
4453	    (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
4454	     TAILQ_FIRST(&inodedep->id_extupdt) == 0))
4455		return (0);
4456	return (hadchanges);
4457}
4458
4459/*
4460 * Process a diradd entry after its dependent inode has been written.
4461 * This routine must be called with splbio interrupts blocked.
4462 */
4463static void
4464diradd_inode_written(dap, inodedep)
4465	struct diradd *dap;
4466	struct inodedep *inodedep;
4467{
4468	struct pagedep *pagedep;
4469
4470	dap->da_state |= COMPLETE;
4471	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4472		if (dap->da_state & DIRCHG)
4473			pagedep = dap->da_previous->dm_pagedep;
4474		else
4475			pagedep = dap->da_pagedep;
4476		LIST_REMOVE(dap, da_pdlist);
4477		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4478	}
4479	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4480}
4481
4482/*
4483 * Handle the completion of a mkdir dependency.
4484 */
4485static void
4486handle_written_mkdir(mkdir, type)
4487	struct mkdir *mkdir;
4488	int type;
4489{
4490	struct diradd *dap;
4491	struct pagedep *pagedep;
4492
4493	if (mkdir->md_state != type)
4494		panic("handle_written_mkdir: bad type");
4495	dap = mkdir->md_diradd;
4496	dap->da_state &= ~type;
4497	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4498		dap->da_state |= DEPCOMPLETE;
4499	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4500		if (dap->da_state & DIRCHG)
4501			pagedep = dap->da_previous->dm_pagedep;
4502		else
4503			pagedep = dap->da_pagedep;
4504		LIST_REMOVE(dap, da_pdlist);
4505		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4506	}
4507	LIST_REMOVE(mkdir, md_mkdirs);
4508	WORKITEM_FREE(mkdir, D_MKDIR);
4509}
4510
4511/*
4512 * Called from within softdep_disk_write_complete above.
4513 * A write operation was just completed. Removed inodes can
4514 * now be freed and associated block pointers may be committed.
4515 * Note that this routine is always called from interrupt level
4516 * with further splbio interrupts blocked.
4517 */
4518static int
4519handle_written_filepage(pagedep, bp)
4520	struct pagedep *pagedep;
4521	struct buf *bp;		/* buffer containing the written page */
4522{
4523	struct dirrem *dirrem;
4524	struct diradd *dap, *nextdap;
4525	struct direct *ep;
4526	int i, chgs;
4527
4528	if ((pagedep->pd_state & IOSTARTED) == 0)
4529		panic("handle_written_filepage: not started");
4530	pagedep->pd_state &= ~IOSTARTED;
4531	/*
4532	 * Process any directory removals that have been committed.
4533	 */
4534	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4535		LIST_REMOVE(dirrem, dm_next);
4536		dirrem->dm_dirinum = pagedep->pd_ino;
4537		add_to_worklist(&dirrem->dm_list);
4538	}
4539	/*
4540	 * Free any directory additions that have been committed.
4541	 * If it is a newly allocated block, we have to wait until
4542	 * the on-disk directory inode claims the new block.
4543	 */
4544	if ((pagedep->pd_state & NEWBLOCK) == 0)
4545		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4546			free_diradd(dap);
4547	/*
4548	 * Uncommitted directory entries must be restored.
4549	 */
4550	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4551		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4552		     dap = nextdap) {
4553			nextdap = LIST_NEXT(dap, da_pdlist);
4554			if (dap->da_state & ATTACHED)
4555				panic("handle_written_filepage: attached");
4556			ep = (struct direct *)
4557			    ((char *)bp->b_data + dap->da_offset);
4558			ep->d_ino = dap->da_newinum;
4559			dap->da_state &= ~UNDONE;
4560			dap->da_state |= ATTACHED;
4561			chgs = 1;
4562			/*
4563			 * If the inode referenced by the directory has
4564			 * been written out, then the dependency can be
4565			 * moved to the pending list.
4566			 */
4567			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4568				LIST_REMOVE(dap, da_pdlist);
4569				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4570				    da_pdlist);
4571			}
4572		}
4573	}
4574	/*
4575	 * If there were any rollbacks in the directory, then it must be
4576	 * marked dirty so that it will eventually get written back in
4577	 * its correct form.
4578	 */
4579	if (chgs) {
4580		if ((bp->b_flags & B_DELWRI) == 0)
4581			stat_dir_entry++;
4582		bdirty(bp);
4583		return (1);
4584	}
4585	/*
4586	 * If we are not waiting for a new directory block to be
4587	 * claimed by its inode, then the pagedep will be freed.
4588	 * Otherwise it will remain to track any new entries on
4589	 * the page in case they are fsync'ed.
4590	 */
4591	if ((pagedep->pd_state & NEWBLOCK) == 0) {
4592		LIST_REMOVE(pagedep, pd_hash);
4593		WORKITEM_FREE(pagedep, D_PAGEDEP);
4594	}
4595	return (0);
4596}
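
/*
 * Editorial note: handle_written_filepage() is the roll-forward mate
 * of initiate_write_filepage().  For a single uncommitted entry the
 * pairing is:
 *
 *	initiate:	ep->d_ino = 0 (or the old inum); UNDONE set
 *	complete:	ep->d_ino = dap->da_newinum; ATTACHED set
 *
 * Setting chgs forces bdirty(bp), so the restored entry goes back
 * to disk once its inode dependency finally clears.
 */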
4597
4598/*
4599 * Writing back in-core inode structures.
4600 *
4601 * The filesystem only accesses an inode's contents when it occupies an
4602 * "in-core" inode structure.  These "in-core" structures are separate from
4603 * the page frames used to cache inode blocks.  Only the latter are
4604 * transferred to/from the disk.  So, when the updated contents of the
4605 * "in-core" inode structure are copied to the corresponding in-memory inode
4606 * block, the dependencies are also transferred.  The following procedure is
4607 * called when copying a dirty "in-core" inode to a cached inode block.
4608 */
4609
4610/*
4611 * Called when an inode is loaded from disk. If the effective link count
4612 * differed from the actual link count when it was last flushed, then we
4613 * need to ensure that the correct effective link count is put back.
4614 */
4615void
4616softdep_load_inodeblock(ip)
4617	struct inode *ip;	/* the "in_core" copy of the inode */
4618{
4619	struct inodedep *inodedep;
4620
4621	/*
4622	 * Check for alternate nlink count.
4623	 */
4624	ip->i_effnlink = ip->i_nlink;
4625	ACQUIRE_LOCK(&lk);
4626	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4627		FREE_LOCK(&lk);
4628		return;
4629	}
4630	ip->i_effnlink -= inodedep->id_nlinkdelta;
4631	if (inodedep->id_state & SPACECOUNTED)
4632		ip->i_flag |= IN_SPACECOUNTED;
4633	FREE_LOCK(&lk);
4634}
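
/*
 * Editorial sketch (hypothetical values): continuing the example at
 * softdep_change_linkcnt() above, re-reading the inode restores the
 * effective count from the recorded delta:
 *
 *	on-disk i_nlink == 2, id_nlinkdelta == 1
 *	ip->i_effnlink = 2 - 1;			i.e. 1
 *
 * so the unlink that is committed in memory but not yet on disk is
 * still reflected in the in-core inode.
 */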
4635
4636/*
4637 * This routine is called just before the "in-core" inode
4638 * information is to be copied to the in-memory inode block.
4639 * Recall that an inode block contains several inodes. If
4640 * the force flag is set, then the dependencies will be
4641 * cleared so that the update can always be made. Note that
4642 * the buffer is locked when this routine is called, so we
4643 * will never be in the middle of writing the inode block
4644 * to disk.
4645 */
4646void
4647softdep_update_inodeblock(ip, bp, waitfor)
4648	struct inode *ip;	/* the "in_core" copy of the inode */
4649	struct buf *bp;		/* the buffer containing the inode block */
4650	int waitfor;		/* nonzero => update must be allowed */
4651{
4652	struct inodedep *inodedep;
4653	struct worklist *wk;
4654	struct buf *ibp;
4655	int error;
4656
4657	/*
4658	 * If the effective link count is not equal to the actual link
4659	 * count, then we must track the difference in an inodedep while
4660	 * the inode is (potentially) tossed out of the cache. Otherwise,
4661	 * if there is no existing inodedep, then there are no dependencies
4662	 * to track.
4663	 */
4664	ACQUIRE_LOCK(&lk);
4665	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4666		FREE_LOCK(&lk);
4667		if (ip->i_effnlink != ip->i_nlink)
4668			panic("softdep_update_inodeblock: bad link count");
4669		return;
4670	}
4671	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
4672		panic("softdep_update_inodeblock: bad delta");
4673	/*
4674	 * Changes have been initiated. Anything depending on these
4675	 * changes cannot occur until this inode has been written.
4676	 */
4677	inodedep->id_state &= ~COMPLETE;
4678	if ((inodedep->id_state & ONWORKLIST) == 0)
4679		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4680	/*
4681	 * Any new dependencies associated with the incore inode must
4682	 * now be moved to the list associated with the buffer holding
4683	 * the in-memory copy of the inode. Once merged process any
4684	 * allocdirects that are completed by the merger.
4685	 */
4686	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
4687	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
4688		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4689	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
4690	if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL)
4691		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
4692	/*
4693	 * Now that the inode has been pushed into the buffer, the
4694	 * operations dependent on the inode being written to disk
4695	 * can be moved to the id_bufwait so that they will be
4696	 * processed when the buffer I/O completes.
4697	 */
4698	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4699		WORKLIST_REMOVE(wk);
4700		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4701	}
4702	/*
4703	 * Newly allocated inodes cannot be written until the bitmap
4704	 * that allocates them has been written (indicated by
4705	 * DEPCOMPLETE being set in id_state). If we are doing a
4706	 * forced sync (e.g., an fsync on a file), we force the bitmap
4707	 * to be written so that the update can be done.
4708	 */
4709	if (waitfor == 0) {
4710		FREE_LOCK(&lk);
4711		return;
4712	}
4713retry:
4714	if ((inodedep->id_state & DEPCOMPLETE) != 0) {
4715		FREE_LOCK(&lk);
4716		return;
4717	}
4718	ibp = inodedep->id_buf;
4719	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
4720	if (ibp == NULL) {
4721		/*
4722		 * If ibp came back as NULL, the dependency could have been
4723		 * freed while we slept.  Look it up again, and check to see
4724		 * that it has completed.
4725		 */
4726		if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) != 0)
4727			goto retry;
4728		FREE_LOCK(&lk);
4729		return;
4730	}
4731	FREE_LOCK(&lk);
4732	if ((error = bwrite(ibp)) != 0)
4733		softdep_error("softdep_update_inodeblock: bwrite", error);
4734}
4735
4736/*
4737 * Merge the a new inode dependency list (such as id_newinoupdt) into an
4738 * old inode dependency list (such as id_inoupdt). This routine must be
4739 * called with splbio interrupts blocked.
4740 */
4741static void
4742merge_inode_lists(newlisthead, oldlisthead)
4743	struct allocdirectlst *newlisthead;
4744	struct allocdirectlst *oldlisthead;
4745{
4746	struct allocdirect *listadp, *newadp;
4747
4748	newadp = TAILQ_FIRST(newlisthead);
4749	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
4750		if (listadp->ad_lbn < newadp->ad_lbn) {
4751			listadp = TAILQ_NEXT(listadp, ad_next);
4752			continue;
4753		}
4754		TAILQ_REMOVE(newlisthead, newadp, ad_next);
4755		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4756		if (listadp->ad_lbn == newadp->ad_lbn) {
4757			allocdirect_merge(oldlisthead, newadp,
4758			    listadp);
4759			listadp = newadp;
4760		}
4761		newadp = TAILQ_FIRST(newlisthead);
4762	}
4763	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
4764		TAILQ_REMOVE(newlisthead, newadp, ad_next);
4765		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
4766	}
4767}
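
/*
 * Editorial sketch (hypothetical lbns): merging a new list into an
 * old one, both kept sorted by ad_lbn:
 *
 *	old: 1 3 5		new: 2 3 7
 *
 * 2 is inserted before 3; the two entries for lbn 3 are combined by
 * allocdirect_merge(); 7 survives the main loop and is appended by
 * the trailing TAILQ_INSERT_TAIL() loop.  Result: 1 2 3 5 7.
 */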
4768
4769/*
4770 * If we are doing an fsync, then we must ensure that any directory
4771 * entries for the inode have been written after the inode gets to disk.
4772 */
4773int
4774softdep_fsync(vp)
4775	struct vnode *vp;	/* the "in_core" copy of the inode */
4776{
4777	struct inodedep *inodedep;
4778	struct pagedep *pagedep;
4779	struct worklist *wk;
4780	struct diradd *dap;
4781	struct mount *mnt;
4782	struct vnode *pvp;
4783	struct inode *ip;
4784	struct buf *bp;
4785	struct fs *fs;
4786	struct thread *td = curthread;
4787	int error, flushparent;
4788	ino_t parentino;
4789	ufs_lbn_t lbn;
4790
4791	ip = VTOI(vp);
4792	fs = ip->i_fs;
4793	ACQUIRE_LOCK(&lk);
4794	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
4795		FREE_LOCK(&lk);
4796		return (0);
4797	}
4798	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
4799	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
4800	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
4801	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
4802	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
4803	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
4804		panic("softdep_fsync: pending ops");
4805	for (error = 0, flushparent = 0; ; ) {
4806		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
4807			break;
4808		if (wk->wk_type != D_DIRADD)
4809			panic("softdep_fsync: Unexpected type %s",
4810			    TYPENAME(wk->wk_type));
4811		dap = WK_DIRADD(wk);
4812		/*
4813		 * Flush our parent if this directory entry has a MKDIR_PARENT
4814		 * dependency or is contained in a newly allocated block.
4815		 */
4816		if (dap->da_state & DIRCHG)
4817			pagedep = dap->da_previous->dm_pagedep;
4818		else
4819			pagedep = dap->da_pagedep;
4820		mnt = pagedep->pd_mnt;
4821		parentino = pagedep->pd_ino;
4822		lbn = pagedep->pd_lbn;
4823		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
4824			panic("softdep_fsync: dirty");
4825		if ((dap->da_state & MKDIR_PARENT) ||
4826		    (pagedep->pd_state & NEWBLOCK))
4827			flushparent = 1;
4828		else
4829			flushparent = 0;
4830		/*
4831		 * If we are being fsync'ed as part of vgone'ing this vnode,
4832		 * then we will not be able to release and recover the
4833		 * vnode below, so we just have to give up on writing its
4834		 * directory entry out. It will eventually be written, just
4835		 * not now, but then the user was not asking to have it
4836		 * written, so we are not breaking any promises.
4837		 */
4838		if (vp->v_iflag & VI_DOOMED)
4839			break;
4840		/*
4841		 * We prevent deadlock by always fetching inodes from the
4842		 * root, moving down the directory tree. Thus, when fetching
4843		 * our parent directory, we first try to get the lock. If
4844		 * that fails, we must unlock ourselves before requesting
4845		 * the lock on our parent. See the comment in ufs_lookup
4846		 * for details on possible races.
4847		 */
4848		FREE_LOCK(&lk);
4849		if (ffs_vget(mnt, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
4850			VOP_UNLOCK(vp, 0, td);
4851			error = ffs_vget(mnt, parentino, LK_EXCLUSIVE, &pvp);
4852			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
4853			if (error != 0)
4854				return (error);
4855		}
4856		/*
4857		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
4858		 * that are contained in direct blocks will be resolved by
4859		 * doing a ffs_update. Pagedeps contained in indirect blocks
4860		 * may require a complete sync'ing of the directory. So, we
4861		 * try the cheap and fast ffs_update first, and if that fails,
4862		 * then we do the slower ffs_syncvnode of the directory.
4863		 */
4864		if (flushparent) {
4865			if ((error = ffs_update(pvp, 1)) != 0) {
4866				vput(pvp);
4867				return (error);
4868			}
4869			if ((pagedep->pd_state & NEWBLOCK) &&
4870			    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
4871				vput(pvp);
4872				return (error);
4873			}
4874		}
4875		/*
4876		 * Flush directory page containing the inode's name.
4877		 */
4878		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn),
4879		    td->td_ucred, &bp);
4880		if (error == 0)
4881			error = bwrite(bp);
4882		else
4883			brelse(bp);
4884		vput(pvp);
4885		if (error != 0)
4886			return (error);
4887		ACQUIRE_LOCK(&lk);
4888		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4889			break;
4890	}
4891	FREE_LOCK(&lk);
4892	return (0);
4893}
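
/*
 * The trylock-then-back-off dance in softdep_fsync above is a generic
 * deadlock-avoidance idiom. Below is a minimal userland sketch of it,
 * assuming POSIX threads; the function and argument names are
 * illustrative stand-ins, not kernel interfaces.
 */
#if 0	/* illustrative sketch, not compiled */
#include <pthread.h>

/*
 * Acquire "parent" while already holding "child" without deadlocking
 * against threads that lock parent before child (the canonical
 * root-to-leaf order). Returns nonzero if the child lock was dropped,
 * in which case the caller must revalidate any state it protected.
 */
static int
lock_parent_holding_child(pthread_mutex_t *parent, pthread_mutex_t *child)
{

	/* Fast path: the parent lock was uncontended. */
	if (pthread_mutex_trylock(parent) == 0)
		return (0);
	/* Slow path: back out and reacquire in the safe order. */
	pthread_mutex_unlock(child);
	pthread_mutex_lock(parent);
	pthread_mutex_lock(child);
	return (1);
}
#endif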
4894
4895/*
4896 * Flush all the dirty bitmaps associated with the block device
4897 * before flushing the rest of the dirty blocks so as to reduce
4898 * the number of dependencies that will have to be rolled back.
4899 */
4900void
4901softdep_fsync_mountdev(vp)
4902	struct vnode *vp;
4903{
4904	struct buf *bp, *nbp;
4905	struct worklist *wk;
4906
4907	if (!vn_isdisk(vp, NULL))
4908		panic("softdep_fsync_mountdev: vnode not a disk");
4909	ACQUIRE_LOCK(&lk);
4910	VI_LOCK(vp);
4911	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
4912		/*
4913		 * If it is already scheduled, skip to the next buffer.
4914		 */
4915		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
4916			continue;
4917
4918		if ((bp->b_flags & B_DELWRI) == 0)
4919			panic("softdep_fsync_mountdev: not dirty");
4920		/*
4921		 * We are only interested in bitmaps with outstanding
4922		 * dependencies.
4923		 */
4924		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4925		    wk->wk_type != D_BMSAFEMAP ||
4926		    (bp->b_vflags & BV_BKGRDINPROG)) {
4927			BUF_UNLOCK(bp);
4928			continue;
4929		}
4930		VI_UNLOCK(vp);
4931		FREE_LOCK(&lk);
4932		bremfree(bp);
4933		(void) bawrite(bp);
4934		ACQUIRE_LOCK(&lk);
4935		/*
4936		 * Since we may have slept during the I/O, we need
4937		 * to start from a known point.
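		 * (A userland sketch of this restart-after-unlock scan
		 * follows this function.)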
4938		 */
4939		VI_LOCK(vp);
4940		nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd);
4941	}
4942	FREE_LOCK(&lk);
4943	drain_output(vp);
4944	VI_UNLOCK(vp);
4945}
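
/*
 * A minimal userland sketch of the restart-after-unlock scan used by
 * softdep_fsync_mountdev above, assuming POSIX threads and a BSD
 * <sys/queue.h>; struct item and both function names are illustrative
 * stand-ins, not kernel interfaces. For the scan to terminate,
 * write_item must clear the item's dirty flag (or unlink the item),
 * just as the buffer write does in the kernel.
 */
#if 0	/* illustrative sketch, not compiled */
#include <pthread.h>
#include <sys/queue.h>

struct item {
	TAILQ_ENTRY(item)	link;
	int			dirty;
};
TAILQ_HEAD(itemhead, item);

static void
flush_all(struct itemhead *head, pthread_mutex_t *mtx,
	void (*write_item)(struct item *))
{
	struct item *ip, *nip;

	pthread_mutex_lock(mtx);
	TAILQ_FOREACH_SAFE(ip, head, link, nip) {
		if (ip->dirty == 0)
			continue;
		/*
		 * The write may sleep, so drop the lock around it.
		 * Sleeping invalidates the saved "nip", so restart
		 * the scan from a known point: the head of the list.
		 */
		pthread_mutex_unlock(mtx);
		(*write_item)(ip);
		pthread_mutex_lock(mtx);
		nip = TAILQ_FIRST(head);
	}
	pthread_mutex_unlock(mtx);
}
#endif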
4946
4947/*
4948 * This routine is called when we are trying to synchronously flush a
4949 * file. This routine must eliminate any filesystem metadata dependencies
4950 * so that the syncing routine can succeed by pushing the dirty blocks
4951 * associated with the file. If any I/O errors occur, they are returned.
4952 */
4953int
4954softdep_sync_metadata(struct vnode *vp)
4955{
4956	struct pagedep *pagedep;
4957	struct allocdirect *adp;
4958	struct allocindir *aip;
4959	struct buf *bp, *nbp;
4960	struct worklist *wk;
4961	int i, error, waitfor;
4962
4963	if (!DOINGSOFTDEP(vp))
4964		return (0);
4965	/*
4966	 * Ensure that any direct block dependencies have been cleared.
4967	 */
4968	ACQUIRE_LOCK(&lk);
4969	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4970		FREE_LOCK(&lk);
4971		return (error);
4972	}
4973	FREE_LOCK(&lk);
4974	/*
4975	 * For most files, the only metadata dependencies are the
4976	 * cylinder group maps that allocate their inode or blocks.
4977	 * The block allocation dependencies can be found by traversing
4978	 * the dependency lists for any buffers that remain on their
4979	 * dirty buffer list. The inode allocation dependency will
4980	 * be resolved when the inode is updated with MNT_WAIT.
4981	 * This work is done in two passes. The first pass grabs most
4982	 * of the buffers and begins asynchronously writing them. The
4983	 * only way to wait for these asynchronous writes is to sleep
4984	 * on the filesystem vnode which may stay busy for a long time
4985	 * if the filesystem is active. So, instead, we make a second
4986	 * pass over the dependencies blocking on each write. In the
4987	 * usual case we will be blocking against a write that we
4988	 * initiated, so when it is done the dependency will have been
4989	 * resolved. Thus the second pass is expected to end quickly.
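	 * (A self-contained sketch of this two-pass scheme follows this
	 * function.)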
4990	 */
4991	waitfor = MNT_NOWAIT;
4992
4993top:
4994	/*
4995	 * We must wait for any I/O in progress to finish so that
4996	 * all potential buffers on the dirty list will be visible.
4997	 */
4998	VI_LOCK(vp);
4999	drain_output(vp);
5000	while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
5001		bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
5002		if (bp)
5003			break;
5004	}
5005	VI_UNLOCK(vp);
5006	if (bp == NULL)
5007		return (0);
5008loop:
5009	/* While syncing snapshots, we must allow recursive lookups */
5010	bp->b_lock.lk_flags |= LK_CANRECURSE;
5011	ACQUIRE_LOCK(&lk);
5012	/*
5013	 * As we hold the buffer locked, none of its dependencies
5014	 * will disappear.
5015	 */
5016	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5017		switch (wk->wk_type) {
5018
5019		case D_ALLOCDIRECT:
5020			adp = WK_ALLOCDIRECT(wk);
5021			if (adp->ad_state & DEPCOMPLETE)
5022				continue;
5023			nbp = adp->ad_buf;
5024			nbp = getdirtybuf(nbp, &lk, waitfor);
5025			if (nbp == NULL)
5026				continue;
5027			FREE_LOCK(&lk);
5028			if (waitfor == MNT_NOWAIT) {
5029				bawrite(nbp);
5030			} else if ((error = bwrite(nbp)) != 0) {
5031				break;
5032			}
5033			ACQUIRE_LOCK(&lk);
5034			continue;
5035
5036		case D_ALLOCINDIR:
5037			aip = WK_ALLOCINDIR(wk);
5038			if (aip->ai_state & DEPCOMPLETE)
5039				continue;
5040			nbp = aip->ai_buf;
5041			nbp = getdirtybuf(nbp, &lk, waitfor);
5042			if (nbp == NULL)
5043				continue;
5044			FREE_LOCK(&lk);
5045			if (waitfor == MNT_NOWAIT) {
5046				bawrite(nbp);
5047			} else if ((error = bwrite(nbp)) != 0) {
5048				break;
5049			}
5050			ACQUIRE_LOCK(&lk);
5051			continue;
5052
5053		case D_INDIRDEP:
5054		restart:
5055
5056			LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
5057				if (aip->ai_state & DEPCOMPLETE)
5058					continue;
5059				nbp = aip->ai_buf;
5060				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
5061				if (nbp == NULL)
5062					goto restart;
5063				FREE_LOCK(&lk);
5064				if ((error = bwrite(nbp)) != 0) {
5065					break;
5066				}
5067				ACQUIRE_LOCK(&lk);
5068				goto restart;
5069			}
5070			continue;
5071
5072		case D_INODEDEP:
5073			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
5074			    WK_INODEDEP(wk)->id_ino)) != 0) {
5075				FREE_LOCK(&lk);
5076				break;
5077			}
5078			continue;
5079
5080		case D_PAGEDEP:
5081			/*
5082			 * We are trying to sync a directory that may
5083			 * have dependencies on both its own metadata
5084			 * and/or dependencies on the inodes of any
5085			 * recently allocated files. We walk its diradd
5086			 * lists pushing out the associated inode.
5087			 */
5088			pagedep = WK_PAGEDEP(wk);
5089			for (i = 0; i < DAHASHSZ; i++) {
5090				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL)
5091					continue;
5092				if ((error =
5093				    flush_pagedep_deps(vp, pagedep->pd_mnt,
5094						&pagedep->pd_diraddhd[i]))) {
5095					FREE_LOCK(&lk);
5096					break;
5097				}
5098			}
5099			continue;
5100
5101		case D_MKDIR:
5102			/*
5103			 * This case should never happen if the vnode has
5104			 * been properly sync'ed. However, if this function
5105			 * is used at a place where the vnode has not yet
5106			 * been sync'ed, this dependency can show up. So,
5107			 * rather than panic, just flush it.
5108			 */
5109			nbp = WK_MKDIR(wk)->md_buf;
5110			nbp = getdirtybuf(nbp, &lk, waitfor);
5111			if (nbp == NULL)
5112				continue;
5113			FREE_LOCK(&lk);
5114			if (waitfor == MNT_NOWAIT) {
5115				bawrite(nbp);
5116			} else if ((error = bwrite(nbp)) != 0) {
5117				break;
5118			}
5119			ACQUIRE_LOCK(&lk);
5120			continue;
5121
5122		case D_BMSAFEMAP:
5123			/*
5124			 * This case should never happen if the vnode has
5125			 * been properly sync'ed. However, if this function
5126			 * is used at a place where the vnode has not yet
5127			 * been sync'ed, this dependency can show up. So,
5128			 * rather than panic, just flush it.
5129			 */
5130			nbp = WK_BMSAFEMAP(wk)->sm_buf;
5131			nbp = getdirtybuf(nbp, &lk, waitfor);
5132			if (nbp == NULL)
5133				continue;
5134			FREE_LOCK(&lk);
5135			if (waitfor == MNT_NOWAIT) {
5136				bawrite(nbp);
5137			} else if ((error = bwrite(nbp)) != 0) {
5138				break;
5139			}
5140			ACQUIRE_LOCK(&lk);
5141			continue;
5142
5143		default:
5144			panic("softdep_sync_metadata: Unknown type %s",
5145			    TYPENAME(wk->wk_type));
5146			/* NOTREACHED */
5147		}
5148		/* We reach here only on error, with the softdep lock released. */
5149		if (error == 0)
5150			panic("softdep_sync_metadata: zero error");
5151		bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5152		bawrite(bp);
5153		return (error);
5154	}
5155	FREE_LOCK(&lk);
5156	VI_LOCK(vp);
5157	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
5158		nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
5159		if (nbp)
5160			break;
5161	}
5162	VI_UNLOCK(vp);
5163	bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5164	bawrite(bp);
5165	if (nbp != NULL) {
5166		bp = nbp;
5167		goto loop;
5168	}
5169	/*
5170	 * The brief unlock is to allow any pent up dependency
5171	 * processing to be done. Then proceed with the second pass.
5172	 */
5173	if (waitfor == MNT_NOWAIT) {
5174		waitfor = MNT_WAIT;
5175		goto top;
5176	}
5177
5178	/*
5179	 * If we have managed to get rid of all the dirty buffers,
5180	 * then we are done. For certain directories and block
5181	 * devices, we may need to do further work.
5182	 *
5183	 * We must wait for any I/O in progress to finish so that
5184	 * all potential buffers on the dirty list will be visible.
5185	 */
5186	VI_LOCK(vp);
5187	drain_output(vp);
5188	VI_UNLOCK(vp);
5189	return (0);
5190}
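
/*
 * A self-contained sketch of the two-pass flushing scheme described at
 * the top of softdep_sync_metadata: pass 1 starts asynchronous writes;
 * pass 2 repeats the walk waiting on each buffer, which is normally
 * quick because pass 1 already put the I/O in flight. Everything here
 * (struct mybuf, start_write, write_and_wait) is an illustrative
 * stand-in, not a kernel interface.
 */
#if 0	/* illustrative sketch, not compiled */
#define	MY_NOWAIT	0		/* start the write, do not wait */
#define	MY_WAIT		1		/* wait until it is on disk */
#define	NBUF		8

struct mybuf {
	int	dirty;			/* needs to be written */
	int	inflight;		/* write has been started */
};

static struct mybuf bufs[NBUF];

static void
start_write(struct mybuf *bp)		/* asynchronous write */
{

	bp->inflight = 1;
}

static int
write_and_wait(struct mybuf *bp)	/* synchronous write */
{

	bp->inflight = 0;
	bp->dirty = 0;
	return (0);
}

static int
two_pass_flush(void)
{
	int error, i, waitfor;

	for (waitfor = MY_NOWAIT;; waitfor = MY_WAIT) {
		for (i = 0; i < NBUF; i++) {
			if (bufs[i].dirty == 0)
				continue;
			if (waitfor == MY_NOWAIT)
				start_write(&bufs[i]);
			else if ((error = write_and_wait(&bufs[i])) != 0)
				return (error);
		}
		/*
		 * Pass 1 only queued the writes; pass 2 blocks on
		 * each one. After the blocking pass we are done.
		 */
		if (waitfor == MY_WAIT)
			return (0);
	}
}
#endif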
5191
5192/*
5193 * Flush the dependencies associated with an inodedep.
5194 * Called with splbio blocked.
5195 */
5196static int
5197flush_inodedep_deps(fs, ino)
5198	struct fs *fs;
5199	ino_t ino;
5200{
5201	struct inodedep *inodedep;
5202	int error, waitfor;
5203
5204	/*
5205	 * This work is done in two passes. The first pass grabs most
5206	 * of the buffers and begins asynchronously writing them. The
5207	 * only way to wait for these asynchronous writes is to sleep
5208	 * on the filesystem vnode which may stay busy for a long time
5209	 * if the filesystem is active. So, instead, we make a second
5210	 * pass over the dependencies blocking on each write. In the
5211	 * usual case we will be blocking against a write that we
5212	 * initiated, so when it is done the dependency will have been
5213	 * resolved. Thus the second pass is expected to end quickly.
5214	 * We give a brief window at the top of the loop to allow
5215	 * any pending I/O to complete.
5216	 */
5217	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
5218		if (error)
5219			return (error);
5220		FREE_LOCK(&lk);
5221		ACQUIRE_LOCK(&lk);
5222		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5223			return (0);
5224		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
5225		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
5226		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
5227		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
5228			continue;
5229		/*
5230		 * If this was the second pass, we are done; otherwise begin pass 2.
5231		 */
5232		if (waitfor == MNT_WAIT)
5233			break;
5234		waitfor = MNT_WAIT;
5235	}
5236	/*
5237	 * Try freeing inodedep in case all dependencies have been removed.
5238	 */
5239	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
5240		(void) free_inodedep(inodedep);
5241	return (0);
5242}
5243
5244/*
5245 * Flush an inode dependency list.
5246 * Called with splbio blocked.
5247 */
5248static int
5249flush_deplist(listhead, waitfor, errorp)
5250	struct allocdirectlst *listhead;
5251	int waitfor;
5252	int *errorp;
5253{
5254	struct allocdirect *adp;
5255	struct buf *bp;
5256
5257	mtx_assert(&lk, MA_OWNED);
5258	TAILQ_FOREACH(adp, listhead, ad_next) {
5259		if (adp->ad_state & DEPCOMPLETE)
5260			continue;
5261		bp = adp->ad_buf;
5262		bp = getdirtybuf(bp, &lk, waitfor);
5263		if (bp == NULL) {
5264			if (waitfor == MNT_NOWAIT)
5265				continue;
5266			return (1);
5267		}
5268		FREE_LOCK(&lk);
5269		if (waitfor == MNT_NOWAIT) {
5270			bawrite(bp);
5271		} else if ((*errorp = bwrite(bp)) != 0) {
5272			ACQUIRE_LOCK(&lk);
5273			return (1);
5274		}
5275		ACQUIRE_LOCK(&lk);
5276		return (1);
5277	}
5278	return (0);
5279}
5280
5281/*
5282 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5283 * Called with splbio blocked.
5284 */
5285static int
5286flush_pagedep_deps(pvp, mp, diraddhdp)
5287	struct vnode *pvp;
5288	struct mount *mp;
5289	struct diraddhd *diraddhdp;
5290{
5291	struct inodedep *inodedep;
5292	struct ufsmount *ump;
5293	struct diradd *dap;
5294	struct vnode *vp;
5295	int error = 0;
5296	struct buf *bp;
5297	ino_t inum;
5298
5299	ump = VFSTOUFS(mp);
5300	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5301		/*
5302		 * Flush ourselves if this directory entry
5303		 * has a MKDIR_PARENT dependency.
5304		 */
5305		if (dap->da_state & MKDIR_PARENT) {
5306			FREE_LOCK(&lk);
5307			if ((error = ffs_update(pvp, 1)) != 0)
5308				break;
5309			ACQUIRE_LOCK(&lk);
5310			/*
5311			 * If that cleared dependencies, go on to next.
5312			 */
5313			if (dap != LIST_FIRST(diraddhdp))
5314				continue;
5315			if (dap->da_state & MKDIR_PARENT)
5316				panic("flush_pagedep_deps: MKDIR_PARENT");
5317		}
5318		/*
5319		 * A newly allocated directory must have its "." and
5320		 * ".." entries written out before its name can be
5321		 * committed in its parent. We do not want or need
5322		 * the full semantics of a synchronous ffs_syncvnode as
5323		 * that may end up here again, once for each directory
5324		 * level in the filesystem. Instead, we push the blocks
5325		 * and wait for them to clear. We have to fsync twice
5326		 * because the first call may choose to defer blocks
5327		 * that still have dependencies, but deferral will
5328		 * happen at most once.
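		 * (A sketch of this sync-twice idiom follows this
		 * function.)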
5329		 */
5330		inum = dap->da_newinum;
5331		if (dap->da_state & MKDIR_BODY) {
5332			FREE_LOCK(&lk);
5333			if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
5334				break;
5335			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)) ||
5336			    (error = ffs_syncvnode(vp, MNT_NOWAIT))) {
5337				vput(vp);
5338				break;
5339			}
5340			VI_LOCK(vp);
5341			drain_output(vp);
5342			VI_UNLOCK(vp);
5343			vput(vp);
5344			ACQUIRE_LOCK(&lk);
5345			/*
5346			 * If that cleared dependencies, go on to next.
5347			 */
5348			if (dap != LIST_FIRST(diraddhdp))
5349				continue;
5350			if (dap->da_state & MKDIR_BODY)
5351				panic("flush_pagedep_deps: MKDIR_BODY");
5352		}
5353		/*
5354		 * Flush the inode on which the directory entry depends.
5355		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5356		 * the only remaining dependency is that the updated inode
5357		 * count must get pushed to disk. The inode has already
5358		 * been pushed into its inode buffer (via VOP_UPDATE) at
5359		 * the time of the reference count change. So we need only
5360		 * locate that buffer, ensure that there will be no rollback
5361		 * caused by a bitmap dependency, then write the inode buffer.
5362		 */
5363retry:
5364		if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0)
5365			panic("flush_pagedep_deps: lost inode");
5366		/*
5367		 * If the inode still has bitmap dependencies,
5368		 * push them to disk.
5369		 */
5370		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5371			bp = inodedep->id_buf;
5372			bp = getdirtybuf(bp, &lk, MNT_WAIT);
5373			if (bp == NULL)
5374				goto retry;
5375			FREE_LOCK(&lk);
5376			if ((error = bwrite(bp)) != 0)
5377				break;
5378			ACQUIRE_LOCK(&lk);
5379			if (dap != LIST_FIRST(diraddhdp))
5380				continue;
5381		}
5382		/*
5383		 * If the inode is still sitting in a buffer waiting
5384		 * to be written, push it to disk.
5385		 */
5386		FREE_LOCK(&lk);
5387		if ((error = bread(ump->um_devvp,
5388		    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5389		    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5390			brelse(bp);
5391			break;
5392		}
5393		if ((error = bwrite(bp)) != 0)
5394			break;
5395		ACQUIRE_LOCK(&lk);
5396		/*
5397		 * If we have failed to get rid of all the dependencies
5398		 * then something is seriously wrong.
5399		 */
5400		if (dap == LIST_FIRST(diraddhdp))
5401			panic("flush_pagedep_deps: flush failed");
5402	}
5403	if (error)
5404		ACQUIRE_LOCK(&lk);
5405	return (error);
5406}
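
/*
 * A minimal sketch of the sync-twice idiom used for MKDIR_BODY above:
 * when one flushing pass may defer work, and deferral can happen at
 * most once, exactly two passes guarantee completion without looping.
 * sync_pass and its argument are illustrative stand-ins.
 */
#if 0	/* illustrative sketch, not compiled */
static int
sync_twice(int (*sync_pass)(void *), void *arg)
{
	int error;

	if ((error = (*sync_pass)(arg)) != 0)
		return (error);
	/*
	 * The first pass may have deferred blocks that still had
	 * dependencies; the second pass picks them up.
	 */
	return ((*sync_pass)(arg));
}
#endif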
5407
5408/*
5409 * A large burst of file addition or deletion activity can drive the
5410 * memory load excessively high. First attempt to slow things down
5411 * using the techniques below. If that fails, this routine requests
5412 * the offending operations to fall back to running synchronously
5413 * until the memory load returns to a reasonable level.
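 * (A sketch of this thresholding follows the function below.)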
5414 */
5415int
5416softdep_slowdown(vp)
5417	struct vnode *vp;
5418{
5419	int max_softdeps_hard;
5420
5421	max_softdeps_hard = max_softdeps * 11 / 10;
5422	if (num_dirrem < max_softdeps_hard / 2 &&
5423	    num_inodedep < max_softdeps_hard &&
5424	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps)
5425		return (0);
5426	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
5427		speedup_syncer();
5428	stat_sync_limit_hit += 1;
5429	return (1);
5430}
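
/*
 * A minimal sketch of the load-shedding test in softdep_slowdown,
 * using the same integer-arithmetic style: allow a 10% overshoot of
 * the configured soft limit before telling callers to fall back to
 * synchronous operation. max_items and num_items are illustrative
 * stand-ins for max_softdeps and the dependency counters.
 */
#if 0	/* illustrative sketch, not compiled */
static int max_items = 1000;		/* tunable soft limit */
static int num_items;			/* current dependency load */

static int
should_slow_down(void)
{
	int hard_limit;

	hard_limit = max_items * 11 / 10;	/* soft limit plus 10% */
	return (num_items >= hard_limit);
}
#endif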
5431
5432/*
5433 * Called by the allocation routines when they are about to fail
5434 * in the hope that we can free up some disk space.
5435 *
5436 * First check to see if the work list has anything on it. If it has,
5437 * clean up entries until we successfully free some space. Because this
5438 * process holds inodes locked, we cannot handle any remove requests
5439 * that might block on a locked inode as that could lead to deadlock.
5440 * If the worklist yields no free space, encourage the syncer daemon
5441 * to help us. In no event will we try for longer than tickdelay seconds.
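 * (A sketch of this deadline-bounded loop follows the function below.)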
5442 */
5443int
5444softdep_request_cleanup(fs, vp)
5445	struct fs *fs;
5446	struct vnode *vp;
5447{
5448	struct ufsmount *ump;
5449	long starttime;
5450	ufs2_daddr_t needed;
5451	int error;
5452
5453	ump = VTOI(vp)->i_ump;
5454	mtx_assert(UFS_MTX(ump), MA_OWNED);
5455	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
5456	starttime = time_second + tickdelay;
5457	/*
5458	 * If we are being called because of a process doing a
5459	 * copy-on-write, then it is not safe to update the vnode
5460	 * as we may recurse into the copy-on-write routine.
5461	 */
5462	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
5463		UFS_UNLOCK(ump);
5464		error = ffs_update(vp, 1);
5465		UFS_LOCK(ump);
5466		if (error != 0)
5467			return (0);
5468	}
5469	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
5470		if (time_second > starttime)
5471			return (0);
5472		UFS_UNLOCK(ump);
5473		ACQUIRE_LOCK(&lk);
5474		if (num_on_worklist > 0 &&
5475		    process_worklist_item(NULL, LK_NOWAIT) != -1) {
5476			stat_worklist_push += 1;
5477			FREE_LOCK(&lk);
5478			UFS_LOCK(ump);
5479			continue;
5480		}
5481		request_cleanup(FLUSH_REMOVE_WAIT);
5482		FREE_LOCK(&lk);
5483		UFS_LOCK(ump);
5484	}
5485	return (1);
5486}
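
/*
 * A minimal userland sketch of the deadline-bounded self-help loop in
 * softdep_request_cleanup, assuming the C library time(3) interface;
 * shortage() and process_one() are illustrative stand-ins for the
 * free-space test and process_worklist_item().
 */
#if 0	/* illustrative sketch, not compiled */
#include <time.h>

static int
bounded_cleanup(int (*shortage)(void), int (*process_one)(void),
	int max_seconds)
{
	time_t deadline;

	deadline = time(NULL) + max_seconds;
	while ((*shortage)()) {
		if (time(NULL) > deadline)
			return (0);	/* give up; caller must fail */
		(void)(*process_one)();
	}
	return (1);			/* the shortage was relieved */
}
#endif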
5487
5488/*
5489 * If memory utilization has gotten too high, deliberately slow things
5490 * down and speed up the I/O processing.
5491 */
5492static int
5493request_cleanup(resource)
5494	int resource;
5495{
5496	struct thread *td = curthread;
5497
5498	mtx_assert(&lk, MA_OWNED);
5499	/*
5500	 * We never hold up the filesystem syncer process.
5501	 */
5502	if (td == filesys_syncer || (td->td_pflags & TDP_SOFTDEP))
5503		return (0);
5504	/*
5505	 * First check to see if the work list has gotten backlogged.
5506	 * If it has, co-opt this process to help clean up two entries.
5507	 * Because this process may hold inodes locked, we cannot
5508	 * handle any remove requests that might block on a locked
5509	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
5510	 * to avoid recursively processing the worklist.
5511	 */
5512	if (num_on_worklist > max_softdeps / 10) {
5513		td->td_pflags |= TDP_SOFTDEP;
5514		process_worklist_item(NULL, LK_NOWAIT);
5515		process_worklist_item(NULL, LK_NOWAIT);
5516		td->td_pflags &= ~TDP_SOFTDEP;
5517		stat_worklist_push += 2;
5518		return (1);
5519	}
5520	/*
5521	 * Next, we attempt to speed up the syncer process. If that
5522	 * is successful, then we allow the process to continue.
5523	 */
5524	if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
5525		return (0);
5526	/*
5527	 * If we are resource constrained on inode dependencies, try
5528	 * flushing some dirty inodes. Otherwise, we are constrained
5529	 * by file deletions, so try accelerating flushes of directories
5530	 * with removal dependencies. We would like to do the cleanup
5531	 * here, but we probably hold an inode locked at this point and
5532	 * that might deadlock against one that we try to clean. So,
5533	 * the best that we can do is request the syncer daemon to do
5534	 * the cleanup for us.
5535	 */
5536	switch (resource) {
5537
5538	case FLUSH_INODES:
5539		stat_ino_limit_push += 1;
5540		req_clear_inodedeps += 1;
5541		stat_countp = &stat_ino_limit_hit;
5542		break;
5543
5544	case FLUSH_REMOVE:
5545	case FLUSH_REMOVE_WAIT:
5546		stat_blk_limit_push += 1;
5547		req_clear_remove += 1;
5548		stat_countp = &stat_blk_limit_hit;
5549		break;
5550
5551	default:
5552		panic("request_cleanup: unknown type");
5553	}
5554	/*
5555	 * Hopefully the syncer daemon will catch up and awaken us.
5556	 * We wait at most tickdelay before proceeding in any case.
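	 * (A userland sketch of this bounded wait follows pause_timer
	 * below.)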
5557	 */
5558	proc_waiting += 1;
5559	if (handle.callout == NULL)
5560		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5561	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
5562	proc_waiting -= 1;
5563	return (1);
5564}
5565
5566/*
5567 * Awaken processes pausing in request_cleanup and clear handle.callout
5568 * when no waiters remain, to indicate that no timer is running.
5569 */
5570static void
5571pause_timer(arg)
5572	void *arg;
5573{
5574
5575	ACQUIRE_LOCK(&lk);
5576	*stat_countp += 1;
5577	wakeup_one(&proc_waiting);
5578	if (proc_waiting > 0)
5579		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5580	else
5581		handle.callout = NULL;
5582	FREE_LOCK(&lk);
5583}
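
/*
 * A userland sketch of the msleep()/timeout() pairing used by
 * request_cleanup and pause_timer above, assuming POSIX threads:
 * block until either a cleanup daemon signals progress or a timeout
 * expires, so a stalled daemon can never wedge the waiter forever.
 * All names are illustrative stand-ins. (The kernel version also
 * rearms the timer while waiters remain; that detail is omitted.)
 */
#if 0	/* illustrative sketch, not compiled */
#include <pthread.h>
#include <time.h>

static pthread_mutex_t cleanup_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cleanup_cv = PTHREAD_COND_INITIALIZER;

static void
wait_for_cleanup(int max_seconds)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += max_seconds;
	pthread_mutex_lock(&cleanup_mtx);
	/* Returns on pthread_cond_signal() or on ETIMEDOUT. */
	(void)pthread_cond_timedwait(&cleanup_cv, &cleanup_mtx, &ts);
	pthread_mutex_unlock(&cleanup_mtx);
}

static void
cleanup_progress(void)			/* the daemon's side */
{

	pthread_mutex_lock(&cleanup_mtx);
	pthread_cond_signal(&cleanup_cv);
	pthread_mutex_unlock(&cleanup_mtx);
}
#endif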
5584
5585/*
5586 * Flush out a directory with at least one removal dependency in an effort to
5587 * reduce the number of dirrem, freefile, and freeblks dependency structures.
5588 */
5589static void
5590clear_remove(td)
5591	struct thread *td;
5592{
5593	struct pagedep_hashhead *pagedephd;
5594	struct pagedep *pagedep;
5595	static int next = 0;
5596	struct mount *mp;
5597	struct vnode *vp;
5598	int error, cnt;
5599	ino_t ino;
5600
5601	mtx_assert(&lk, MA_OWNED);
5602
5603	for (cnt = 0; cnt < pagedep_hash; cnt++) {
5604		pagedephd = &pagedep_hashtbl[next++];
5605		if (next >= pagedep_hash)
5606			next = 0;
5607		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5608			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
5609				continue;
5610			mp = pagedep->pd_mnt;
5611			ino = pagedep->pd_ino;
5612			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5613				continue;
5614			FREE_LOCK(&lk);
5615			if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
5616				softdep_error("clear_remove: vget", error);
5617				vn_finished_write(mp);
5618				ACQUIRE_LOCK(&lk);
5619				return;
5620			}
5621			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
5622				softdep_error("clear_remove: fsync", error);
5623			VI_LOCK(vp);
5624			drain_output(vp);
5625			VI_UNLOCK(vp);
5626			vput(vp);
5627			vn_finished_write(mp);
5628			ACQUIRE_LOCK(&lk);
5629			return;
5630		}
5631	}
5632}
5633
5634/*
5635 * Clear out a block of dirty inodes in an effort to reduce
5636 * the number of inodedep dependency structures.
5637 */
5638static void
5639clear_inodedeps(td)
5640	struct thread *td;
5641{
5642	struct inodedep_hashhead *inodedephd;
5643	struct inodedep *inodedep;
5644	static int next = 0;
5645	struct mount *mp;
5646	struct vnode *vp;
5647	struct fs *fs;
5648	int error, cnt;
5649	ino_t firstino, lastino, ino;
5650
5651	mtx_assert(&lk, MA_OWNED);
5652	/*
5653	 * Pick a random inode dependency to be cleared.
5654	 * We will then gather up all the inodes in its block
5655	 * that have dependencies and flush them out.
5656	 */
5657	for (cnt = 0; cnt < inodedep_hash; cnt++) {
5658		inodedephd = &inodedep_hashtbl[next++];
5659		if (next >= inodedep_hash)
5660			next = 0;
5661		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5662			break;
5663	}
5664	if (inodedep == NULL)
5665		return;
5666	/*
5667	 * Ugly code to find mount point given pointer to superblock.
5668	 */
5669	fs = inodedep->id_fs;
5670	TAILQ_FOREACH(mp, &mountlist, mnt_list)
5671		if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
5672			break;
5673	/*
5674	 * Find the last inode in the block with dependencies.
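	 * (The power-of-two mask computed for firstino below is
	 * demonstrated in a sketch after this function.)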
5675	 */
5676	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5677	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5678		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
5679			break;
5680	/*
5681	 * Asynchronously push all but the last inode with dependencies.
5682	 * Synchronously push the last inode with dependencies to ensure
5683	 * that the inode block gets written to free up the inodedeps.
5684	 */
5685	for (ino = firstino; ino <= lastino; ino++) {
5686		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5687			continue;
5688		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5689			continue;
5690		FREE_LOCK(&lk);
5691		if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
5692			softdep_error("clear_inodedeps: vget", error);
5693			vn_finished_write(mp);
5694			ACQUIRE_LOCK(&lk);
5695			return;
5696		}
5697		if (ino == lastino) {
5698			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
5699				softdep_error("clear_inodedeps: fsync1", error);
5700		} else {
5701			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
5702				softdep_error("clear_inodedeps: fsync2", error);
5703			VI_LOCK(vp);
5704			drain_output(vp);
5705			VI_UNLOCK(vp);
5706		}
5707		vput(vp);
5708		vn_finished_write(mp);
5709		ACQUIRE_LOCK(&lk);
5710	}
5711}
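
/*
 * The firstino computation in clear_inodedeps relies on INOPB(fs)
 * being a power of two: masking with ~(n - 1) rounds an inode number
 * down to the first inode of its block. A standalone demonstration,
 * assuming a hypothetical 16 inodes per block:
 */
#if 0	/* illustrative sketch, not compiled */
#include <assert.h>

#define	INOPB_EXAMPLE	16		/* must be a power of two */

static unsigned int
first_in_block(unsigned int ino)
{

	return (ino & ~(INOPB_EXAMPLE - 1));
}

static void
demo(void)
{

	assert(first_in_block(37) == 32);	/* block covers 32..47 */
	assert(first_in_block(32) == 32);
	assert(first_in_block(47) == 32);
}
#endif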
5712
5713/*
5714 * Function to determine if the buffer has outstanding dependencies
5715 * that will cause a roll-back if the buffer is written. If wantcount
5716 * is set, return number of dependencies, otherwise just yes or no.
5717 * is set, return the number of dependencies; otherwise just yes or no.
5718static int
5719softdep_count_dependencies(bp, wantcount)
5720	struct buf *bp;
5721	int wantcount;
5722{
5723	struct worklist *wk;
5724	struct inodedep *inodedep;
5725	struct indirdep *indirdep;
5726	struct allocindir *aip;
5727	struct pagedep *pagedep;
5728	struct diradd *dap;
5729	int i, retval;
5730
5731	retval = 0;
5732	ACQUIRE_LOCK(&lk);
5733	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5734		switch (wk->wk_type) {
5735
5736		case D_INODEDEP:
5737			inodedep = WK_INODEDEP(wk);
5738			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5739				/* bitmap allocation dependency */
5740				retval += 1;
5741				if (!wantcount)
5742					goto out;
5743			}
5744			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
5745				/* direct block pointer dependency */
5746				retval += 1;
5747				if (!wantcount)
5748					goto out;
5749			}
5750			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
5751				/* direct block pointer dependency */
5752				retval += 1;
5753				if (!wantcount)
5754					goto out;
5755			}
5756			continue;
5757
5758		case D_INDIRDEP:
5759			indirdep = WK_INDIRDEP(wk);
5760
5761			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
5762				/* indirect block pointer dependency */
5763				retval += 1;
5764				if (!wantcount)
5765					goto out;
5766			}
5767			continue;
5768
5769		case D_PAGEDEP:
5770			pagedep = WK_PAGEDEP(wk);
5771			for (i = 0; i < DAHASHSZ; i++) {
5772
5773				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
5774					/* directory entry dependency */
5775					retval += 1;
5776					if (!wantcount)
5777						goto out;
5778				}
5779			}
5780			continue;
5781
5782		case D_BMSAFEMAP:
5783		case D_ALLOCDIRECT:
5784		case D_ALLOCINDIR:
5785		case D_MKDIR:
5786			/* never a dependency on these blocks */
5787			continue;
5788
5789		default:
5790			panic("softdep_count_dependencies: Unexpected type %s",
5791			    TYPENAME(wk->wk_type));
5792			/* NOTREACHED */
5793		}
5794	}
5795out:
5796	FREE_LOCK(&lk);
5797	return (retval);
5798}
5799
5800/*
5801 * Acquire exclusive access to a buffer.
5802 * Must be called with the mtx parameter held.
5803 * Return acquired buffer or NULL on failure.
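 * (A userland sketch of this acquisition protocol follows the
 * function below.)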
5804 */
5805static struct buf *
5806getdirtybuf(bp, mtx, waitfor)
5807	struct buf *bp;
5808	struct mtx *mtx;
5809	int waitfor;
5810{
5811	int error;
5812
5813	mtx_assert(mtx, MA_OWNED);
5814	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
5815		if (waitfor != MNT_WAIT)
5816			return (NULL);
5817		error = BUF_LOCK(bp,
5818		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
5819		/*
5820		 * Even if we successfully acquire bp here, we have dropped
5821		 * mtx, which may violate our guarantee.
5822		 */
5823		if (error == 0)
5824			BUF_UNLOCK(bp);
5825		else if (error != ENOLCK)
5826			panic("getdirtybuf: inconsistent lock: %d", error);
5827		mtx_lock(mtx);
5828		return (NULL);
5829	}
5830	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
5831		BUF_UNLOCK(bp);
5832		if (waitfor != MNT_WAIT)
5833			return (NULL);
5834		/*
5835		 * The mtx argument must be bp->b_vp's mutex in
5836		 * this case.
5837		 */
5838#ifdef	DEBUG_VFS_LOCKS
5839		if (bp->b_vp->v_type != VCHR)
5840			ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
5841#endif
5842		bp->b_vflags |= BV_BKGRDWAIT;
5843		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
5844		return (NULL);
5845	}
5846	if ((bp->b_flags & B_DELWRI) == 0) {
5847		BUF_UNLOCK(bp);
5848		return (NULL);
5849	}
5850	bremfree(bp);
5851	return (bp);
5852}
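
/*
 * A userland sketch of getdirtybuf's acquisition protocol, assuming
 * POSIX threads: try the lock without blocking; if asked to wait,
 * block for it, but still report failure, because blocking required
 * dropping the caller's interlock and the object must therefore be
 * revalidated from scratch. struct obj and all names here are
 * illustrative stand-ins, not kernel interfaces.
 */
#if 0	/* illustrative sketch, not compiled */
#include <pthread.h>
#include <stddef.h>

struct obj {
	pthread_mutex_t	lock;
	int		dirty;
};

static struct obj *
acquire_dirty(struct obj *op, pthread_mutex_t *interlock, int dowait)
{

	if (pthread_mutex_trylock(&op->lock) != 0) {
		if (!dowait)
			return (NULL);
		pthread_mutex_unlock(interlock);
		pthread_mutex_lock(&op->lock);
		/*
		 * We slept without the interlock, so the caller's
		 * view of "op" may be stale: drop everything and
		 * make the caller look the object up again, much
		 * as LK_SLEEPFAIL does above.
		 */
		pthread_mutex_unlock(&op->lock);
		pthread_mutex_lock(interlock);
		return (NULL);
	}
	if (op->dirty == 0) {
		pthread_mutex_unlock(&op->lock);
		return (NULL);
	}
	return (op);
}
#endif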
5853
5854/*
5855 * Wait for pending output on a vnode to complete.
5856 * Must be called with vnode lock and interlock locked.
5857 *
5858 * XXX: Should just be a call to bufobj_wwait().
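 * (A userland sketch of the wait loop follows the function below.)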
5859 */
5860static void
5861drain_output(vp)
5862	struct vnode *vp;
5863{
5864	ASSERT_VOP_LOCKED(vp, "drain_output");
5865	ASSERT_VI_LOCKED(vp, "drain_output");
5866
5867	while (vp->v_bufobj.bo_numoutput) {
5868		vp->v_bufobj.bo_flag |= BO_WWAIT;
5869		msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
5870		    VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
5871	}
5872}
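
/*
 * A userland sketch of the drain_output pattern, assuming POSIX
 * threads: sleep until an in-flight counter drops to zero, rechecking
 * the condition after every wakeup; the completion path decrements
 * and broadcasts. All names are illustrative stand-ins.
 */
#if 0	/* illustrative sketch, not compiled */
#include <pthread.h>

static pthread_mutex_t io_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_done = PTHREAD_COND_INITIALIZER;
static int io_outstanding;

static void
drain(void)
{

	pthread_mutex_lock(&io_mtx);
	while (io_outstanding > 0)
		pthread_cond_wait(&io_done, &io_mtx);
	pthread_mutex_unlock(&io_mtx);
}

static void
io_complete(void)		/* called when one write finishes */
{

	pthread_mutex_lock(&io_mtx);
	if (--io_outstanding == 0)
		pthread_cond_broadcast(&io_done);
	pthread_mutex_unlock(&io_mtx);
}
#endif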
5873
5874/*
5875 * Called whenever a buffer that is being invalidated or reallocated
5876 * contains dependencies. This should only happen if an I/O error has
5877 * occurred. The routine is called with the buffer locked.
5878 */
5879static void
5880softdep_deallocate_dependencies(bp)
5881	struct buf *bp;
5882{
5883
5884	if ((bp->b_ioflags & BIO_ERROR) == 0)
5885		panic("softdep_deallocate_dependencies: dangling deps");
5886	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5887	panic("softdep_deallocate_dependencies: unrecovered I/O error");
5888}
5889
5890/*
5891 * Function to handle asynchronous write errors in the filesystem.
5892 */
5893static void
5894softdep_error(func, error)
5895	char *func;
5896	int error;
5897{
5898
5899	/* XXX should do something better! */
5900	printf("%s: got error %d while accessing filesystem\n", func, error);
5901}
5902
5903#endif /* SOFTUPDATES */
5904