ffs_softdep.c revision 36900
1
2/*
3 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
4 *
5 * The soft updates code is derived from the appendix of a University
6 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
7 * "Soft Updates: A Solution to the Metadata Update Problem in File
8 * Systems", CSE-TR-254-95, August 1995).
9 *
10 * The following are the copyrights and redistribution conditions that
11 * apply to this copy of the soft update software. For a license
12 * to use, redistribute or sell the soft update software under
13 * conditions other than those described here, please contact the
14 * author at one of the following addresses:
15 *
16 *	Marshall Kirk McKusick		mckusick@mckusick.com
17 *	1614 Oxford Street		+1-510-843-9542
18 *	Berkeley, CA 94709-1608
19 *	USA
20 *
21 * Redistribution and use in source and binary forms, with or without
22 * modification, are permitted provided that the following conditions
23 * are met:
24 *
25 * 1. Redistributions of source code must retain the above copyright
26 *    notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 *    notice, this list of conditions and the following disclaimer in the
29 *    documentation and/or other materials provided with the distribution.
30 * 3. None of the names of McKusick, Ganger, Patt, or the University of
31 *    Michigan may be used to endorse or promote products derived from
32 *    this software without specific prior written permission.
33 * 4. Redistributions in any form must be accompanied by information on
34 *    how to obtain complete source code for any accompanying software
35 *    that uses this software. This source code must either be included
36 *    in the distribution or be available for no more than the cost of
37 *    distribution plus a nominal fee, and must be freely redistributable
38 *    under reasonable conditions. For an executable file, complete
39 *    source code means the source code for all modules it contains.
40 *    It does not mean source code for modules or files that typically
41 *    accompany the operating system on which the executable file runs,
42 *    e.g., standard library modules or system header files.
43 *
44 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
45 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
46 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
47 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
48 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
49 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
50 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
51 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
52 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
53 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * SUCH DAMAGE.
55 *
56 *	@(#)ffs_softdep.c	9.23 (McKusick) 2/20/98
57 *	$Id: ffs_softdep.c,v 1.9 1998/06/10 20:45:46 julian Exp $
58 */
59
60/*
61 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
62 */
63#ifndef DIAGNOSTIC
64#define DIAGNOSTIC
65#endif
66#ifndef DEBUG
67#define DEBUG
68#endif
69
70#include <sys/param.h>
71#include <sys/buf.h>
72#include <sys/kernel.h>
73#include <sys/malloc.h>
74#include <sys/mount.h>
75#include <sys/proc.h>
76#include <sys/syslog.h>
77#include <sys/systm.h>
78#include <sys/vnode.h>
79#include <miscfs/specfs/specdev.h>
80#include <ufs/ufs/dir.h>
81#include <ufs/ufs/quota.h>
82#include <ufs/ufs/inode.h>
83#include <ufs/ufs/ufsmount.h>
84#include <ufs/ffs/fs.h>
85#include <ufs/ffs/softdep.h>
86#include <ufs/ffs/ffs_extern.h>
87#include <ufs/ufs/ufs_extern.h>
88
89/*
90 * These definitions need to be adapted to the system to which
91 * this file is being ported.
92 */
93/*
94 * malloc types defined for the softdep system.
95 */
96MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
97MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
98MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
99MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
100MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
101MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
102MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
103MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
104MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
105MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
106MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
107MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
108MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
109
110#define	D_PAGEDEP	0
111#define	D_INODEDEP	1
112#define	D_NEWBLK	2
113#define	D_BMSAFEMAP	3
114#define	D_ALLOCDIRECT	4
115#define	D_INDIRDEP	5
116#define	D_ALLOCINDIR	6
117#define	D_FREEFRAG	7
118#define	D_FREEBLKS	8
119#define	D_FREEFILE	9
120#define	D_DIRADD	10
121#define	D_MKDIR		11
122#define	D_DIRREM	12
123#define D_LAST		D_DIRREM
124
125/*
126 * translate from workitem type to memory type
127 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
128 */
129static struct malloc_type *memtype[] = {
130	M_PAGEDEP,
131	M_INODEDEP,
132	M_NEWBLK,
133	M_BMSAFEMAP,
134	M_ALLOCDIRECT,
135	M_INDIRDEP,
136	M_ALLOCINDIR,
137	M_FREEFRAG,
138	M_FREEBLKS,
139	M_FREEFILE,
140	M_DIRADD,
141	M_MKDIR,
142	M_DIRREM
143};
144
145#define DtoM(type) (memtype[type])
146
147/*
148 * Names of malloc types.
149 */
150#define TYPENAME(type)  \
151	((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
152#define CURPROC curproc
153/*
154 * End system adaptation definitions.
155 */
156
157/*
158 * Internal function prototypes.
159 */
160static	void softdep_error __P((char *, int));
161static	int getdirtybuf __P((struct buf **, int));
162static	int flush_pagedep_deps __P((struct vnode *, struct mount *,
163	    struct diraddhd *));
164static	int flush_inodedep_deps __P((struct fs *, ino_t));
165static	int handle_written_filepage __P((struct pagedep *, struct buf *));
166static  void diradd_inode_written __P((struct diradd *, struct inodedep *));
167static	int handle_written_inodeblock __P((struct inodedep *, struct buf *));
168static	void handle_allocdirect_partdone __P((struct allocdirect *));
169static	void handle_allocindir_partdone __P((struct allocindir *));
170static	void initiate_write_filepage __P((struct pagedep *, struct buf *));
171static	void handle_written_mkdir __P((struct mkdir *, int));
172static	void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
173static	void handle_workitem_freefile __P((struct freefile *));
174static	void handle_workitem_remove __P((struct dirrem *));
175static	struct dirrem *newdirrem __P((struct buf *, struct inode *,
176	    struct inode *, int));
177static	void free_diradd __P((struct diradd *));
178static	void free_allocindir __P((struct allocindir *, struct inodedep *));
179static	int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
180	    long *));
181static	void deallocate_dependencies __P((struct buf *, struct inodedep *));
182static	void free_allocdirect __P((struct allocdirectlst *,
183	    struct allocdirect *, int));
184static	int free_inodedep __P((struct inodedep *));
185static	void handle_workitem_freeblocks __P((struct freeblks *));
186static	void merge_inode_lists __P((struct inodedep *));
187static	void setup_allocindir_phase2 __P((struct buf *, struct inode *,
188	    struct allocindir *));
189static	struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
190	    ufs_daddr_t));
191static	void handle_workitem_freefrag __P((struct freefrag *));
192static	struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
193static	void allocdirect_merge __P((struct allocdirectlst *,
194	    struct allocdirect *, struct allocdirect *));
195static	struct bmsafemap *bmsafemap_lookup __P((struct buf *));
196static	int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
197	    struct newblk **));
198static	int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
199static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
200	    struct pagedep **));
201static	void pause_timer __P((void *));
202static	int checklimit __P((long *, int));
203static	void add_to_worklist __P((struct worklist *));
204
205/*
206 * Exported softdep operations.
207 */
208struct bio_ops bioops = {
209	softdep_disk_io_initiation,		/* io_start */
210	softdep_disk_write_complete,		/* io_complete */
211	softdep_deallocate_dependencies,	/* io_deallocate */
212	softdep_process_worklist,		/* io_sync */
213};
214
215/*
216 * Locking primitives.
217 *
218 * For a uniprocessor, all we need to do is protect against disk
219 * interrupts. For a multiprocessor, this lock would have to be
220 * a mutex. A single mutex is used throughout this file, though
221 * finer grain locking could be used if contention warranted it.
222 *
223 * For a multiprocessor, the sleep call would accept a lock and
224 * release it after the sleep processing was complete. In a uniprocessor
225 * implementation there is no such interlock, so we simply mark
226 * the places where it needs to be done with the `interlocked' form
227 * of the lock calls. Since the uniprocessor sleep already interlocks
228 * the spl, there is nothing that really needs to be done.
229 */
230#ifndef /* NOT */ DEBUG
231static struct lockit {
232	int	lkt_spl;
233} lk = { 0 };
234#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
235#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
236#define ACQUIRE_LOCK_INTERLOCKED(lk)
237#define FREE_LOCK_INTERLOCKED(lk)
238
239#else /* DEBUG */
240static struct lockit {
241	int	lkt_spl;
242	pid_t	lkt_held;
243} lk = { 0, -1 };
244static int lockcnt;
245
246static	void acquire_lock __P((struct lockit *));
247static	void free_lock __P((struct lockit *));
248static	void acquire_lock_interlocked __P((struct lockit *));
249static	void free_lock_interlocked __P((struct lockit *));
250
251#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
252#define FREE_LOCK(lk)			free_lock(lk)
253#define ACQUIRE_LOCK_INTERLOCKED(lk)	acquire_lock_interlocked(lk)
254#define FREE_LOCK_INTERLOCKED(lk)	free_lock_interlocked(lk)
255
256static void
257acquire_lock(lk)
258	struct lockit *lk;
259{
260
261	if (lk->lkt_held != -1)
262		if (lk->lkt_held == CURPROC->p_pid)
263			panic("softdep_lock: locking against myself");
264		else
265			panic("softdep_lock: lock held by %d", lk->lkt_held);
266	lk->lkt_spl = splbio();
267	lk->lkt_held = CURPROC->p_pid;
268	lockcnt++;
269}
270
271static void
272free_lock(lk)
273	struct lockit *lk;
274{
275
276	if (lk->lkt_held == -1)
277		panic("softdep_unlock: lock not held");
278	lk->lkt_held = -1;
279	splx(lk->lkt_spl);
280}
281
282static void
283acquire_lock_interlocked(lk)
284	struct lockit *lk;
285{
286
287	if (lk->lkt_held != -1)
288		if (lk->lkt_held == CURPROC->p_pid)
289			panic("softdep_lock_interlocked: locking against self");
290		else
291			panic("softdep_lock_interlocked: lock held by %d",
292			    lk->lkt_held);
293	lk->lkt_held = CURPROC->p_pid;
294	lockcnt++;
295}
296
297static void
298free_lock_interlocked(lk)
299	struct lockit *lk;
300{
301
302	if (lk->lkt_held == -1)
303		panic("softdep_unlock_interlocked: lock not held");
304	lk->lkt_held = -1;
305}
306#endif /* DEBUG */
307
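/*
 * For example (mirroring checklimit below), dependency manipulation is
 * bracketed by ACQUIRE_LOCK/FREE_LOCK, and a sleep taken while logically
 * holding the lock uses the interlocked forms so that the spl is dropped
 * across the tsleep:
 *
 *	ACQUIRE_LOCK(&lk);
 *	... examine or modify dependency structures ...
 *	FREE_LOCK_INTERLOCKED(&lk);
 *	(void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0);
 *	ACQUIRE_LOCK_INTERLOCKED(&lk);
 *	... finish up ...
 *	FREE_LOCK(&lk);
 */
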
308/*
309 * Place holder for real semaphores.
310 */
311struct sema {
312	int	value;
313	pid_t	holder;
314	char	*name;
315	int	prio;
316	int	timo;
317};
318static	void sema_init __P((struct sema *, char *, int, int));
319static	int sema_get __P((struct sema *, struct lockit *));
320static	void sema_release __P((struct sema *));
321
322static void
323sema_init(semap, name, prio, timo)
324	struct sema *semap;
325	char *name;
326	int prio, timo;
327{
328
329	semap->holder = -1;
330	semap->value = 0;
331	semap->name = name;
332	semap->prio = prio;
333	semap->timo = timo;
334}
335
336static int
337sema_get(semap, interlock)
338	struct sema *semap;
339	struct lockit *interlock;
340{
341
342	if (semap->value++ > 0) {
343		if (interlock != NULL)
344			FREE_LOCK_INTERLOCKED(interlock);
345		tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
346		if (interlock != NULL) {
347			ACQUIRE_LOCK_INTERLOCKED(interlock);
348			FREE_LOCK(interlock);
349		}
350		return (0);
351	}
352	semap->holder = CURPROC->p_pid;
353	if (interlock != NULL)
354		FREE_LOCK(interlock);
355	return (1);
356}
357
358static void
359sema_release(semap)
360	struct sema *semap;
361{
362
363	if (semap->value <= 0 || semap->holder != CURPROC->p_pid)
364		panic("sema_release: not held");
365	if (--semap->value > 0) {
366		semap->value = 0;
367		wakeup(semap);
368	}
369	semap->holder = -1;
370}
371
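/*
 * As used by the hash lookup routines below (e.g. pagedep_lookup), the
 * calling convention is: a return of 0 from sema_get means the caller
 * slept and must redo its lookup; a return of 1 means the semaphore is
 * now held and the interlock, if one was given, has been released.
 *
 *	if (sema_get(&pagedep_in_progress, &lk) == 0) {
 *		ACQUIRE_LOCK(&lk);
 *		goto top;
 *	}
 *	... allocate and initialize the new structure ...
 *	ACQUIRE_LOCK(&lk);
 *	... insert it into its hash chain ...
 *	sema_release(&pagedep_in_progress);
 */
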
372/*
373 * Worklist queue management.
374 * These routines require that the lock be held.
375 */
376#ifndef /* NOT */ DEBUG
377#define WORKLIST_INSERT(head, item) do {	\
378	(item)->wk_state |= ONWORKLIST;		\
379	LIST_INSERT_HEAD(head, item, wk_list);	\
380} while (0)
381#define WORKLIST_REMOVE(item) do {		\
382	(item)->wk_state &= ~ONWORKLIST;	\
383	LIST_REMOVE(item, wk_list);		\
384} while (0)
385#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
386
387#else /* DEBUG */
388static	void worklist_insert __P((struct workhead *, struct worklist *));
389static	void worklist_remove __P((struct worklist *));
390static	void workitem_free __P((struct worklist *, int));
391
392#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
393#define WORKLIST_REMOVE(item) worklist_remove(item)
394#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
395
396static void
397worklist_insert(head, item)
398	struct workhead *head;
399	struct worklist *item;
400{
401
402	if (lk.lkt_held == -1)
403		panic("worklist_insert: lock not held");
404	if (item->wk_state & ONWORKLIST)
405		panic("worklist_insert: already on list");
406	item->wk_state |= ONWORKLIST;
407	LIST_INSERT_HEAD(head, item, wk_list);
408}
409
410static void
411worklist_remove(item)
412	struct worklist *item;
413{
414
415	if (lk.lkt_held == -1)
416		panic("worklist_remove: lock not held");
417	if ((item->wk_state & ONWORKLIST) == 0)
418		panic("worklist_remove: not on list");
419	item->wk_state &= ~ONWORKLIST;
420	LIST_REMOVE(item, wk_list);
421}
422
423static void
424workitem_free(item, type)
425	struct worklist *item;
426	int type;
427{
428
429	if (item->wk_state & ONWORKLIST)
430		panic("workitem_free: still on list");
431	if (item->wk_type != type)
432		panic("workitem_free: type mismatch");
433	FREE(item, DtoM(type));
434}
435#endif /* DEBUG */
436
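/*
 * Drawn from softdep_setup_allocdirect below: a dependency structure is
 * tied to the buffer whose write it tracks by linking its embedded
 * worklist entry onto the buffer's b_dep list and, once the dependency
 * has been satisfied, removing and freeing it:
 *
 *	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 *	...
 *	WORKLIST_REMOVE(&adp->ad_list);
 *	WORKITEM_FREE(adp, D_ALLOCDIRECT);
 */
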
437/*
438 * Workitem queue management
439 */
440static struct workhead softdep_workitem_pending;
441static int softdep_worklist_busy;
442static int max_softdeps;	/* maximum number of structs before slowdown */
443static int tickdelay = 2;	/* number of ticks to pause during slowdown */
444static int max_limit_hit;	/* number of times slowdown imposed */
445static int rush_requests;	/* number of times I/O speeded up */
446static int proc_waiting;	/* tracks whether we have a timeout posted */
447static pid_t filesys_syncer_pid;/* records pid of filesystem syncer process */
448#ifdef DEBUG
449#include <vm/vm.h>
450#include <sys/sysctl.h>
451#if defined(__FreeBSD__)
452SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
453SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
454SYSCTL_INT(_debug, OID_AUTO, max_limit_hit, CTLFLAG_RW, &max_limit_hit, 0, "");
455SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, "");
456#else /* !__FreeBSD__ */
457struct ctldebug debug8 = { "max_softdeps", &max_softdeps };
458struct ctldebug debug9 = { "tickdelay", &tickdelay };
459struct ctldebug debug10 = { "max_limit_hit", &max_limit_hit };
460struct ctldebug debug11 = { "rush_requests", &rush_requests };
461#endif	/* !__FreeBSD__ */
462
463#endif /* DEBUG */
464
465/*
466 * Add an item to the end of the work queue.
467 * This routine requires that the lock be held.
468 * This is the only routine that adds items to the list.
469 * The following routine is the only one that removes items
470 * and does so in order from first to last.
471 */
472static void
473add_to_worklist(wk)
474	struct worklist *wk;
475{
476	static struct worklist *worklist_tail;
477
478	if (wk->wk_state & ONWORKLIST)
479		panic("add_to_worklist: already on list");
480	wk->wk_state |= ONWORKLIST;
481	if (LIST_FIRST(&softdep_workitem_pending) == NULL) {
482		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
483	} else {
484		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
485	}
486	worklist_tail = wk;
487}
488
489/*
490 * Process that runs once per second to handle items in the background queue.
491 *
492 * Note that we ensure that items are processed in the order in which they
493 * appear in the queue. The code below depends on this property to ensure
494 * that blocks of a file are freed before the inode itself is freed. This
495 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
496 * until all the old ones have been purged from the dependency lists.
497 */
498int
499softdep_process_worklist(matchmnt)
500	struct mount *matchmnt;
501{
502	struct proc *p = CURPROC;
503	struct worklist *wk;
504	struct fs *matchfs;
505	int matchcnt;
506
507	/*
508	 * Record the process identifier of our caller so that we can
509	 * give this process preferential treatment in checklimit below.
510	 */
511	filesys_syncer_pid = p->p_pid;
512	matchcnt = 0;
513	matchfs = NULL;
514	if (matchmnt != NULL)
515		matchfs = VFSTOUFS(matchmnt)->um_fs;
516	/*
517	 * There is no danger of having multiple processes run this
518	 * code. It is single threaded solely so that softdep_flushfiles
519	 * (below) can get an accurate count of the number of items
520	 * related to its mount point that are in the list.
521	 */
522	if (softdep_worklist_busy && matchmnt == NULL)
523		return (-1);
524	ACQUIRE_LOCK(&lk);
525	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
526		WORKLIST_REMOVE(wk);
527		FREE_LOCK(&lk);
528		switch (wk->wk_type) {
529
530		case D_DIRREM:
531			/* removal of a directory entry */
532			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
533				matchcnt += 1;
534			handle_workitem_remove(WK_DIRREM(wk));
535			break;
536
537		case D_FREEBLKS:
538			/* releasing blocks and/or fragments from a file */
539			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
540				matchcnt += 1;
541			handle_workitem_freeblocks(WK_FREEBLKS(wk));
542			break;
543
544		case D_FREEFRAG:
545			/* releasing a fragment when replaced as a file grows */
546			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
547				matchcnt += 1;
548			handle_workitem_freefrag(WK_FREEFRAG(wk));
549			break;
550
551		case D_FREEFILE:
552			/* releasing an inode when its link count drops to 0 */
553			if (WK_FREEFILE(wk)->fx_fs == matchfs)
554				matchcnt += 1;
555			handle_workitem_freefile(WK_FREEFILE(wk));
556			break;
557
558		default:
559			panic("%s_process_worklist: Unknown type %s",
560			    "softdep", TYPENAME(wk->wk_type));
561			/* NOTREACHED */
562		}
563		if (softdep_worklist_busy && matchmnt == NULL)
564			return (-1);
565		ACQUIRE_LOCK(&lk);
566	}
567	FREE_LOCK(&lk);
568	return (matchcnt);
569}
570
571/*
572 * Purge the work list of all items associated with a particular mount point.
573 */
574int
575softdep_flushfiles(oldmnt, flags, p)
576	struct mount *oldmnt;
577	int flags;
578	struct proc *p;
579{
580	struct vnode *devvp;
581	int error, loopcnt;
582
583	/*
584	 * Await our turn to clear out the queue.
585	 */
586	while (softdep_worklist_busy)
587		tsleep(&lbolt, PRIBIO, "softflush", 0);
588	softdep_worklist_busy = 1;
589	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
590		softdep_worklist_busy = 0;
591		return (error);
592	}
593	/*
594	 * Alternately flush the block device associated with the mount
595	 * point and process any dependencies that the flushing
596	 * creates. In theory, this loop should iterate at most twice,
597	 * but we allow it a few extra passes just to be sure.
598	 */
599	devvp = VFSTOUFS(oldmnt)->um_devvp;
600	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
601		if (softdep_process_worklist(oldmnt) == 0) {
602			/*
603			 * Do another flush in case any vnodes were brought in
604			 * as part of the cleanup operations.
605			 */
606			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
607				break;
608			/*
609			 * If we still found nothing to do, we are really done.
610			 */
611			if (softdep_process_worklist(oldmnt) == 0)
612				break;
613		}
614		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
615		error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
616		VOP_UNLOCK(devvp, 0, p);
617		if (error)
618			break;
619	}
620	softdep_worklist_busy = 0;
621	/*
622	 * If we are unmounting then it is an error to fail. If we
623	 * are simply trying to downgrade to read-only, then filesystem
624	 * activity can keep us busy forever, so we just fail with EBUSY.
625	 */
626	if (loopcnt == 0) {
627		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
628			panic("softdep_flushfiles: looping");
629		error = EBUSY;
630	}
631	return (error);
632}
633
634/*
635 * A large burst of file addition or deletion activity can drive the
636 * memory load excessively high. Therefore we deliberately slow things
637 * down and speed up the I/O processing if we find ourselves with too
638 * many dependencies in progress.
639 */
640static int
641checklimit(resource, islocked)
642	long *resource;
643	int islocked;
644{
645	struct proc *p = CURPROC;
646
647	/*
648	 * If we are under our limit, just proceed.
649	 */
650	if (*resource < max_softdeps)
651		return (0);
652	/*
653	 * We never hold up the filesystem syncer process.
654	 */
655	if (p->p_pid == filesys_syncer_pid)
656		return (0);
657	/*
658	 * Our first approach is to speed up the syncer process.
659	 * We never push it to speed up more than half of its
660	 * normal turn time, otherwise it could take over the cpu.
661	 */
662	if (rushjob < syncdelay / 2) {
663		rushjob += 1;
664		rush_requests += 1;
665		return (0);
666	}
667	/*
668	 * Every trick has failed, so we pause momentarily to let
669	 * the filesystem syncer process catch up.
670	 */
671	if (islocked == 0)
672		ACQUIRE_LOCK(&lk);
673	if (proc_waiting == 0) {
674		proc_waiting = 1;
675		timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2);
676	}
677	FREE_LOCK_INTERLOCKED(&lk);
678	(void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0);
679	ACQUIRE_LOCK_INTERLOCKED(&lk);
680	if (islocked == 0)
681		FREE_LOCK(&lk);
682	max_limit_hit += 1;
683	return (1);
684}
685
686/*
687 * Awaken processes pausing in checklimit and clear proc_waiting
688 * to indicate that there is no longer a timer running.
689 */
690void
691pause_timer(arg)
692	void *arg;
693{
694
695	proc_waiting = 0;
696	wakeup(&proc_waiting);
697}
698
699/*
700 * Structure hashing.
701 *
702 * There are three types of structures that can be looked up:
703 *	1) pagedep structures identified by mount point, inode number,
704 *	   and logical block.
705 *	2) inodedep structures identified by mount point and inode number.
706 *	3) newblk structures identified by mount point and
707 *	   physical block number.
708 *
709 * The "pagedep" and "inodedep" dependency structures are hashed
710 * separately from the file blocks and inodes to which they correspond.
711 * This separation helps when the in-memory copy of an inode or
712 * file block must be replaced. It also obviates the need to access
713 * an inode or file page when simply updating (or de-allocating)
714 * dependency structures. Lookup of newblk structures is needed to
715 * find newly allocated blocks when trying to associate them with
716 * their allocdirect or allocindir structure.
717 *
718 * The lookup routines optionally create and hash a new instance when
719 * an existing entry is not found.
720 */
721#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
722
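/*
 * The setup routines further below (e.g. softdep_setup_allocdirect) follow
 * this convention: the lock is held across the call, DEPALLOC requests
 * creation on a miss, and the return value distinguishes a pre-existing
 * entry (1) from a newly created one (0):
 *
 *	ACQUIRE_LOCK(&lk);
 *	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
 *	... attach new dependencies to the inodedep ...
 *	FREE_LOCK(&lk);
 */
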
723/*
724 * Structures and routines associated with pagedep caching.
725 */
726LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
727u_long	pagedep_hash;		/* size of hash table - 1 */
728#define	PAGEDEP_HASH(mp, inum, lbn) \
729	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
730	    pagedep_hash])
731static struct sema pagedep_in_progress;
732
733/*
734 * Look up a pagedep. Return 1 if found, 0 if not found.
735 * If not found, allocate if DEPALLOC flag is passed.
736 * Found or allocated entry is returned in pagedeppp.
737 * This routine must be called with splbio interrupts blocked.
738 */
739static int
740pagedep_lookup(ip, lbn, flags, pagedeppp)
741	struct inode *ip;
742	ufs_lbn_t lbn;
743	int flags;
744	struct pagedep **pagedeppp;
745{
746	struct pagedep *pagedep;
747	struct pagedep_hashhead *pagedephd;
748	struct mount *mp;
749	int i;
750
751#ifdef DEBUG
752	if (lk.lkt_held == -1)
753		panic("pagedep_lookup: lock not held");
754#endif
755	mp = ITOV(ip)->v_mount;
756	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
757top:
758	for (pagedep = LIST_FIRST(pagedephd); pagedep;
759	     pagedep = LIST_NEXT(pagedep, pd_hash))
760		if (ip->i_number == pagedep->pd_ino &&
761		    lbn == pagedep->pd_lbn &&
762		    mp == pagedep->pd_mnt)
763			break;
764	if (pagedep) {
765		*pagedeppp = pagedep;
766		return (1);
767	}
768	if ((flags & DEPALLOC) == 0) {
769		*pagedeppp = NULL;
770		return (0);
771	}
772	if (sema_get(&pagedep_in_progress, &lk) == 0) {
773		ACQUIRE_LOCK(&lk);
774		goto top;
775	}
776	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
777		M_WAITOK);
778	bzero(pagedep, sizeof(struct pagedep));
779	pagedep->pd_list.wk_type = D_PAGEDEP;
780	pagedep->pd_mnt = mp;
781	pagedep->pd_ino = ip->i_number;
782	pagedep->pd_lbn = lbn;
783	LIST_INIT(&pagedep->pd_dirremhd);
784	LIST_INIT(&pagedep->pd_pendinghd);
785	for (i = 0; i < DAHASHSZ; i++)
786		LIST_INIT(&pagedep->pd_diraddhd[i]);
787	ACQUIRE_LOCK(&lk);
788	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
789	sema_release(&pagedep_in_progress);
790	*pagedeppp = pagedep;
791	return (0);
792}
793
794/*
795 * Structures and routines associated with inodedep caching.
796 */
797LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
798static u_long	inodedep_hash;	/* size of hash table - 1 */
799static long	num_inodedep;	/* number of inodedep allocated */
800#define	INODEDEP_HASH(fs, inum) \
801      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
802static struct sema inodedep_in_progress;
803
804/*
805 * Look up an inodedep. Return 1 if found, 0 if not found.
806 * If not found, allocate if DEPALLOC flag is passed.
807 * Found or allocated entry is returned in inodedeppp.
808 * This routine must be called with splbio interrupts blocked.
809 */
810static int
811inodedep_lookup(fs, inum, flags, inodedeppp)
812	struct fs *fs;
813	ino_t inum;
814	int flags;
815	struct inodedep **inodedeppp;
816{
817	struct inodedep *inodedep;
818	struct inodedep_hashhead *inodedephd;
819	int firsttry;
820
821#ifdef DEBUG
822	if (lk.lkt_held == -1)
823		panic("inodedep_lookup: lock not held");
824#endif
825	firsttry = 1;
826	inodedephd = INODEDEP_HASH(fs, inum);
827top:
828	for (inodedep = LIST_FIRST(inodedephd); inodedep;
829	     inodedep = LIST_NEXT(inodedep, id_hash))
830		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
831			break;
832	if (inodedep) {
833		*inodedeppp = inodedep;
834		return (1);
835	}
836	if ((flags & DEPALLOC) == 0) {
837		*inodedeppp = NULL;
838		return (0);
839	}
840	if (firsttry && checklimit(&num_inodedep, 1) == 1) {
841		firsttry = 0;
842		goto top;
843	}
844	if (sema_get(&inodedep_in_progress, &lk) == 0) {
845		ACQUIRE_LOCK(&lk);
846		goto top;
847	}
848	num_inodedep += 1;
849	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
850		M_INODEDEP, M_WAITOK);
851	inodedep->id_list.wk_type = D_INODEDEP;
852	inodedep->id_fs = fs;
853	inodedep->id_ino = inum;
854	inodedep->id_state = ALLCOMPLETE;
855	inodedep->id_nlinkdelta = 0;
856	inodedep->id_savedino = NULL;
857	inodedep->id_savedsize = -1;
858	inodedep->id_buf = NULL;
859	LIST_INIT(&inodedep->id_pendinghd);
860	LIST_INIT(&inodedep->id_inowait);
861	LIST_INIT(&inodedep->id_bufwait);
862	TAILQ_INIT(&inodedep->id_inoupdt);
863	TAILQ_INIT(&inodedep->id_newinoupdt);
864	ACQUIRE_LOCK(&lk);
865	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
866	sema_release(&inodedep_in_progress);
867	*inodedeppp = inodedep;
868	return (0);
869}
870
871/*
872 * Structures and routines associated with newblk caching.
873 */
874LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
875u_long	newblk_hash;		/* size of hash table - 1 */
876#define	NEWBLK_HASH(fs, inum) \
877	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
878static struct sema newblk_in_progress;
879
880/*
881 * Look up a newblk. Return 1 if found, 0 if not found.
882 * If not found, allocate if DEPALLOC flag is passed.
883 * Found or allocated entry is returned in newblkpp.
884 */
885static int
886newblk_lookup(fs, newblkno, flags, newblkpp)
887	struct fs *fs;
888	ufs_daddr_t newblkno;
889	int flags;
890	struct newblk **newblkpp;
891{
892	struct newblk *newblk;
893	struct newblk_hashhead *newblkhd;
894
895	newblkhd = NEWBLK_HASH(fs, newblkno);
896top:
897	for (newblk = LIST_FIRST(newblkhd); newblk;
898	     newblk = LIST_NEXT(newblk, nb_hash))
899		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
900			break;
901	if (newblk) {
902		*newblkpp = newblk;
903		return (1);
904	}
905	if ((flags & DEPALLOC) == 0) {
906		*newblkpp = NULL;
907		return (0);
908	}
909	if (sema_get(&newblk_in_progress, 0) == 0)
910		goto top;
911	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
912		M_NEWBLK, M_WAITOK);
913	newblk->nb_state = 0;
914	newblk->nb_fs = fs;
915	newblk->nb_newblkno = newblkno;
916	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
917	sema_release(&newblk_in_progress);
918	*newblkpp = newblk;
919	return (0);
920}
921
922/*
923 * Executed during filesystem initialization before
924 * mounting any file systems.
925 */
926void
927softdep_initialize()
928{
929
930	LIST_INIT(&mkdirlisthd);
931	LIST_INIT(&softdep_workitem_pending);
932	max_softdeps = desiredvnodes * 8;
933	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
934	    &pagedep_hash);
935	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
936	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
937	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
938	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
939	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
940}
941
942/*
943 * Called at mount time to notify the dependency code that a
944 * filesystem wishes to use it.
945 */
946int
947softdep_mount(devvp, mp, fs, cred)
948	struct vnode *devvp;
949	struct mount *mp;
950	struct fs *fs;
951	struct ucred *cred;
952{
953	struct csum cstotal;
954	struct cg *cgp;
955	struct buf *bp;
956	int error, cyl;
957
958	mp->mnt_flag &= ~MNT_ASYNC;
959	mp->mnt_flag |= MNT_SOFTDEP;
960	/*
961	 * When doing soft updates, the counters in the
962	 * superblock may have gotten out of sync, so we have
963	 * to scan the cylinder groups and recalculate them.
964	 */
965	if (fs->fs_clean != 0)
966		return (0);
967	bzero(&cstotal, sizeof cstotal);
968	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
969		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
970		    fs->fs_cgsize, cred, &bp)) != 0) {
971			brelse(bp);
972			return (error);
973		}
974		cgp = (struct cg *)bp->b_data;
975		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
976		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
977		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
978		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
979		fs->fs_cs(fs, cyl) = cgp->cg_cs;
980		brelse(bp);
981	}
982#ifdef DEBUG
983	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
984		printf("ffs_mountfs: superblock updated\n");
985#endif
986	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
987	return (0);
988}
989
990/*
991 * Protecting the freemaps (or bitmaps).
992 *
993 * To eliminate the need to execute fsck before mounting a file system
994 * after a power failure, one must (conservatively) guarantee that the
995 * on-disk copy of the bitmaps never indicate that a live inode or block is
996 * free.  So, when a block or inode is allocated, the bitmap should be
997 * updated (on disk) before any new pointers.  When a block or inode is
998 * freed, the bitmap should not be updated until all pointers have been
999 * reset.  The latter dependency is handled by the delayed de-allocation
1000 * approach described below for block and inode de-allocation.  The former
1001 * dependency is handled by calling the following procedure when a block or
1002 * inode is allocated. When an inode is allocated an "inodedep" is created
1003 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1004 * Each "inodedep" is also inserted into the hash indexing structure so
1005 * that any additional link additions can be made dependent on the inode
1006 * allocation.
1007 *
1008 * The ufs file system maintains a number of free block counts (e.g., per
1009 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1010 * in addition to the bitmaps.  These counts are used to improve efficiency
1011 * during allocation and therefore must be consistent with the bitmaps.
1012 * There is no convenient way to guarantee post-crash consistency of these
1013 * counts with simple update ordering, for two main reasons: (1) The counts
1014 * and bitmaps for a single cylinder group block are not in the same disk
1015 * sector.  If a disk write is interrupted (e.g., by power failure), one may
1016 * be written and the other not.  (2) Some of the counts are located in the
1017 * superblock rather than the cylinder group block. So, we focus our soft
1018 * updates implementation on protecting the bitmaps. When mounting a
1019 * filesystem, we recompute the auxiliary counts from the bitmaps.
1020 */
1021
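/*
 * A hypothetical caller, modeled on the block allocation path in
 * ffs_alloc.c (the surrounding steps and the blkno name here are only
 * placeholders), would record the bitmap dependency as soon as the block
 * is marked allocated in the in-core cylinder group buffer, before any
 * pointer to it is written, and then schedule a delayed write of the map:
 *
 *	... mark the block allocated in the cylinder group map held in bp ...
 *	softdep_setup_blkmapdep(bp, fs, blkno);
 *	bdwrite(bp);
 */
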
1022/*
1023 * Called just after updating the cylinder group block to allocate an inode.
1024 */
1025void
1026softdep_setup_inomapdep(bp, ip, newinum)
1027	struct buf *bp;		/* buffer for cylgroup block with inode map */
1028	struct inode *ip;	/* inode related to allocation */
1029	ino_t newinum;		/* new inode number being allocated */
1030{
1031	struct inodedep *inodedep;
1032	struct bmsafemap *bmsafemap;
1033
1034	/*
1035	 * Create a dependency for the newly allocated inode.
1036	 * Panic if it already exists as something is seriously wrong.
1037	 * Otherwise add it to the dependency list for the buffer holding
1038	 * the cylinder group map from which it was allocated.
1039	 */
1040	ACQUIRE_LOCK(&lk);
1041	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
1042		panic("softdep_setup_inomapdep: found inode");
1043	inodedep->id_buf = bp;
1044	inodedep->id_state &= ~DEPCOMPLETE;
1045	bmsafemap = bmsafemap_lookup(bp);
1046	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1047	FREE_LOCK(&lk);
1048}
1049
1050/*
1051 * Called just after updating the cylinder group block to
1052 * allocate block or fragment.
1053 */
1054void
1055softdep_setup_blkmapdep(bp, fs, newblkno)
1056	struct buf *bp;		/* buffer for cylgroup block with block map */
1057	struct fs *fs;		/* filesystem doing allocation */
1058	ufs_daddr_t newblkno;	/* number of newly allocated block */
1059{
1060	struct newblk *newblk;
1061	struct bmsafemap *bmsafemap;
1062
1063	/*
1064	 * Create a dependency for the newly allocated block.
1065	 * Add it to the dependency list for the buffer holding
1066	 * the cylinder group map from which it was allocated.
1067	 */
1068	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1069		panic("softdep_setup_blkmapdep: found block");
1070	ACQUIRE_LOCK(&lk);
1071	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1072	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1073	FREE_LOCK(&lk);
1074}
1075
1076/*
1077 * Find the bmsafemap associated with a cylinder group buffer.
1078 * If none exists, create one. The buffer must be locked when
1079 * this routine is called and this routine must be called with
1080 * splbio interrupts blocked.
1081 */
1082static struct bmsafemap *
1083bmsafemap_lookup(bp)
1084	struct buf *bp;
1085{
1086	struct bmsafemap *bmsafemap;
1087	struct worklist *wk;
1088
1089#ifdef DEBUG
1090	if (lk.lkt_held == -1)
1091		panic("bmsafemap_lookup: lock not held");
1092#endif
1093	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list))
1094		if (wk->wk_type == D_BMSAFEMAP)
1095			return (WK_BMSAFEMAP(wk));
1096	FREE_LOCK(&lk);
1097	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1098		M_BMSAFEMAP, M_WAITOK);
1099	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1100	bmsafemap->sm_list.wk_state = 0;
1101	bmsafemap->sm_buf = bp;
1102	LIST_INIT(&bmsafemap->sm_allocdirecthd);
1103	LIST_INIT(&bmsafemap->sm_allocindirhd);
1104	LIST_INIT(&bmsafemap->sm_inodedephd);
1105	LIST_INIT(&bmsafemap->sm_newblkhd);
1106	ACQUIRE_LOCK(&lk);
1107	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1108	return (bmsafemap);
1109}
1110
1111/*
1112 * Direct block allocation dependencies.
1113 *
1114 * When a new block is allocated, the corresponding disk locations must be
1115 * initialized (with zeros or new data) before the on-disk inode points to
1116 * them.  Also, the freemap from which the block was allocated must be
1117 * updated (on disk) before the inode's pointer. These two dependencies are
1118 * independent of each other and are needed for all file blocks and indirect
1119 * blocks that are pointed to directly by the inode.  Just before the
1120 * "in-core" version of the inode is updated with a newly allocated block
1121 * number, a procedure (below) is called to setup allocation dependency
1122 * structures.  These structures are removed when the corresponding
1123 * dependencies are satisfied or when the block allocation becomes obsolete
1124 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1125 * fragment that gets upgraded).  All of these cases are handled in
1126 * procedures described later.
1127 *
1128 * When a file extension causes a fragment to be upgraded, either to a larger
1129 * fragment or to a full block, the on-disk location may change (if the
1130 * previous fragment could not simply be extended). In this case, the old
1131 * fragment must be de-allocated, but not until after the inode's pointer has
1132 * been updated. In most cases, this is handled by later procedures, which
1133 * will construct a "freefrag" structure to be added to the workitem queue
1134 * when the inode update is complete (or obsolete).  The main exception to
1135 * this is when an allocation occurs while a pending allocation dependency
1136 * (for the same block pointer) remains.  This case is handled in the main
1137 * allocation dependency setup procedure by immediately freeing the
1138 * unreferenced fragments.
1139 */
1140void
1141softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1142	struct inode *ip;	/* inode to which block is being added */
1143	ufs_lbn_t lbn;		/* block pointer within inode */
1144	ufs_daddr_t newblkno;	/* disk block number being added */
1145	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
1146	long newsize;		/* size of new block */
1147	long oldsize;		/* size of previous block */
1148	struct buf *bp;		/* bp for allocated block */
1149{
1150	struct allocdirect *adp, *oldadp;
1151	struct allocdirectlst *adphead;
1152	struct bmsafemap *bmsafemap;
1153	struct inodedep *inodedep;
1154	struct pagedep *pagedep;
1155	struct newblk *newblk;
1156
1157	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1158		M_ALLOCDIRECT, M_WAITOK);
1159	bzero(adp, sizeof(struct allocdirect));
1160	adp->ad_list.wk_type = D_ALLOCDIRECT;
1161	adp->ad_lbn = lbn;
1162	adp->ad_newblkno = newblkno;
1163	adp->ad_oldblkno = oldblkno;
1164	adp->ad_newsize = newsize;
1165	adp->ad_oldsize = oldsize;
1166	adp->ad_state = ATTACHED;
1167	if (newblkno == oldblkno)
1168		adp->ad_freefrag = NULL;
1169	else
1170		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1171
1172	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1173		panic("softdep_setup_allocdirect: lost block");
1174
1175	ACQUIRE_LOCK(&lk);
1176	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
1177	adp->ad_inodedep = inodedep;
1178
1179	if (newblk->nb_state == DEPCOMPLETE) {
1180		adp->ad_state |= DEPCOMPLETE;
1181		adp->ad_buf = NULL;
1182	} else {
1183		bmsafemap = newblk->nb_bmsafemap;
1184		adp->ad_buf = bmsafemap->sm_buf;
1185		LIST_REMOVE(newblk, nb_deps);
1186		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1187	}
1188	LIST_REMOVE(newblk, nb_hash);
1189	FREE(newblk, M_NEWBLK);
1190
1191	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1192	if (lbn >= NDADDR) {
1193		/* allocating an indirect block */
1194		if (oldblkno != 0)
1195			panic("softdep_setup_allocdirect: non-zero indir");
1196	} else {
1197		/*
1198		 * Allocating a direct block.
1199		 *
1200		 * If we are allocating a directory block, then we must
1201		 * allocate an associated pagedep to track additions and
1202		 * deletions.
1203		 */
1204		if ((ip->i_mode & IFMT) == IFDIR &&
1205		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1206			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1207	}
1208	/*
1209	 * The list of allocdirects must be kept in sorted, ascending
1210	 * order so that the rollback routines can quickly determine the
1211	 * first uncommitted block (the size of the file stored on disk
1212	 * ends at the end of the lowest committed fragment, or if there
1213	 * are no fragments, at the end of the highest committed block).
1214	 * Since files generally grow, the typical case is that the new
1215	 * block is to be added at the end of the list. We speed this
1216	 * special case by checking against the last allocdirect in the
1217	 * list before laboriously traversing the list looking for the
1218	 * insertion point.
1219	 */
1220	adphead = &inodedep->id_newinoupdt;
1221	oldadp = TAILQ_LAST(adphead, allocdirectlst);
1222	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1223		/* insert at end of list */
1224		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1225		if (oldadp != NULL && oldadp->ad_lbn == lbn)
1226			allocdirect_merge(adphead, adp, oldadp);
1227		FREE_LOCK(&lk);
1228		return;
1229	}
1230	for (oldadp = TAILQ_FIRST(adphead); oldadp;
1231	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
1232		if (oldadp->ad_lbn >= lbn)
1233			break;
1234	}
1235	if (oldadp == NULL)
1236		panic("softdep_setup_allocdirect: lost entry");
1237	/* insert in middle of list */
1238	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1239	if (oldadp->ad_lbn == lbn)
1240		allocdirect_merge(adphead, adp, oldadp);
1241	FREE_LOCK(&lk);
1242}
1243
1244/*
1245 * Replace an old allocdirect dependency with a newer one.
1246 * This routine must be called with splbio interrupts blocked.
1247 */
1248static void
1249allocdirect_merge(adphead, newadp, oldadp)
1250	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
1251	struct allocdirect *newadp;	/* allocdirect being added */
1252	struct allocdirect *oldadp;	/* existing allocdirect being checked */
1253{
1254	struct freefrag *freefrag;
1255
1256#ifdef DEBUG
1257	if (lk.lkt_held == -1)
1258		panic("allocdirect_merge: lock not held");
1259#endif
1260	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1261	    newadp->ad_oldsize != oldadp->ad_newsize ||
1262	    newadp->ad_lbn >= NDADDR)
1263		panic("allocdirect_merge: old %d != new %d || lbn %d >= %d",
1264		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1265		    NDADDR);
1266	newadp->ad_oldblkno = oldadp->ad_oldblkno;
1267	newadp->ad_oldsize = oldadp->ad_oldsize;
1268	/*
1269	 * If the old dependency had a fragment to free or had never
1270	 * previously had a block allocated, then the new dependency
1271	 * can immediately post its freefrag and adopt the old freefrag.
1272	 * This action is done by swapping the freefrag dependencies.
1273	 * The new dependency gains the old one's freefrag, and the
1274	 * old one gets the new one and then immediately puts it on
1275	 * the worklist when it is freed by free_allocdirect. It is
1276	 * not possible to do this swap when the old dependency had a
1277	 * non-zero size but no previous fragment to free. This condition
1278	 * arises when the new block is an extension of the old block.
1279	 * Here, the first part of the fragment allocated to the new
1280	 * dependency is part of the block currently claimed on disk by
1281	 * the old dependency, so cannot legitimately be freed until the
1282	 * conditions for the new dependency are fulfilled.
1283	 */
1284	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1285		freefrag = newadp->ad_freefrag;
1286		newadp->ad_freefrag = oldadp->ad_freefrag;
1287		oldadp->ad_freefrag = freefrag;
1288	}
1289	free_allocdirect(adphead, oldadp, 0);
1290}
1291
1292/*
1293 * Allocate a new freefrag structure if needed.
1294 */
1295static struct freefrag *
1296newfreefrag(ip, blkno, size)
1297	struct inode *ip;
1298	ufs_daddr_t blkno;
1299	long size;
1300{
1301	struct freefrag *freefrag;
1302	struct fs *fs;
1303
1304	if (blkno == 0)
1305		return (NULL);
1306	fs = ip->i_fs;
1307	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1308		panic("newfreefrag: frag size");
1309	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1310		M_FREEFRAG, M_WAITOK);
1311	freefrag->ff_list.wk_type = D_FREEFRAG;
1312	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
1313	freefrag->ff_inum = ip->i_number;
1314	freefrag->ff_fs = fs;
1315	freefrag->ff_devvp = ip->i_devvp;
1316	freefrag->ff_blkno = blkno;
1317	freefrag->ff_fragsize = size;
1318	return (freefrag);
1319}
1320
1321/*
1322 * This workitem de-allocates fragments that were replaced during
1323 * file block allocation.
1324 */
1325static void
1326handle_workitem_freefrag(freefrag)
1327	struct freefrag *freefrag;
1328{
1329	struct inode tip;
1330
1331	tip.i_fs = freefrag->ff_fs;
1332	tip.i_devvp = freefrag->ff_devvp;
1333	tip.i_dev = freefrag->ff_devvp->v_rdev;
1334	tip.i_number = freefrag->ff_inum;
1335	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
1336	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1337	FREE(freefrag, M_FREEFRAG);
1338}
1339
1340/*
1341 * Indirect block allocation dependencies.
1342 *
1343 * The same dependencies that exist for a direct block also exist when
1344 * a new block is allocated and pointed to by an entry in a block of
1345 * indirect pointers. The undo/redo states described above are also
1346 * used here. Because an indirect block contains many pointers that
1347 * may have dependencies, a second copy of the entire in-memory indirect
1348 * block is kept. The buffer cache copy is always completely up-to-date.
1349 * The second copy, which is used only as a source for disk writes,
1350 * contains only the safe pointers (i.e., those that have no remaining
1351 * update dependencies). The second copy is freed when all pointers
1352 * are safe. The cache is not allowed to replace indirect blocks with
1353 * pending update dependencies. If a buffer containing an indirect
1354 * block with dependencies is written, these routines will mark it
1355 * dirty again. It can only be successfully written once all the
1356 * dependencies are removed. The ffs_fsync routine and the
1357 * softdep_sync_metadata routine work together to get all the dependencies
1358 * removed so that a file can be successfully written to disk. Three
1359 * procedures are used when setting up indirect block pointer
1360 * dependencies. The division is necessary because of the organization
1361 * of the "balloc" routine and because of the distinction between file
1362 * pages and file metadata blocks.
1363 */
1364
1365/*
1366 * Allocate a new allocindir structure.
1367 */
1368static struct allocindir *
1369newallocindir(ip, ptrno, newblkno, oldblkno)
1370	struct inode *ip;	/* inode for file being extended */
1371	int ptrno;		/* offset of pointer in indirect block */
1372	ufs_daddr_t newblkno;	/* disk block number being added */
1373	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
1374{
1375	struct allocindir *aip;
1376
1377	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1378		M_ALLOCINDIR, M_WAITOK);
1379	bzero(aip, sizeof(struct allocindir));
1380	aip->ai_list.wk_type = D_ALLOCINDIR;
1381	aip->ai_state = ATTACHED;
1382	aip->ai_offset = ptrno;
1383	aip->ai_newblkno = newblkno;
1384	aip->ai_oldblkno = oldblkno;
1385	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1386	return (aip);
1387}
1388
1389/*
1390 * Called just before setting an indirect block pointer
1391 * to a newly allocated file page.
1392 */
1393void
1394softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1395	struct inode *ip;	/* inode for file being extended */
1396	ufs_lbn_t lbn;		/* allocated block number within file */
1397	struct buf *bp;		/* buffer with indirect blk referencing page */
1398	int ptrno;		/* offset of pointer in indirect block */
1399	ufs_daddr_t newblkno;	/* disk block number being added */
1400	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
1401	struct buf *nbp;	/* buffer holding allocated page */
1402{
1403	struct allocindir *aip;
1404	struct pagedep *pagedep;
1405
1406	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1407	ACQUIRE_LOCK(&lk);
1408	/*
1409	 * If we are allocating a directory page, then we must
1410	 * allocate an associated pagedep to track additions and
1411	 * deletions.
1412	 */
1413	if ((ip->i_mode & IFMT) == IFDIR &&
1414	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1415		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1416	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1417	FREE_LOCK(&lk);
1418	setup_allocindir_phase2(bp, ip, aip);
1419}
1420
1421/*
1422 * Called just before setting an indirect block pointer to a
1423 * newly allocated indirect block.
1424 */
1425void
1426softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1427	struct buf *nbp;	/* newly allocated indirect block */
1428	struct inode *ip;	/* inode for file being extended */
1429	struct buf *bp;		/* indirect block referencing allocated block */
1430	int ptrno;		/* offset of pointer in indirect block */
1431	ufs_daddr_t newblkno;	/* disk block number being added */
1432{
1433	struct allocindir *aip;
1434
1435	aip = newallocindir(ip, ptrno, newblkno, 0);
1436	ACQUIRE_LOCK(&lk);
1437	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1438	FREE_LOCK(&lk);
1439	setup_allocindir_phase2(bp, ip, aip);
1440}
1441
1442/*
1443 * Called to finish setting up the "aip" allocated
1444 * by one of the two routines above.
1445 */
1446static void
1447setup_allocindir_phase2(bp, ip, aip)
1448	struct buf *bp;		/* in-memory copy of the indirect block */
1449	struct inode *ip;	/* inode for file being extended */
1450	struct allocindir *aip;	/* allocindir allocated by the above routines */
1451{
1452	struct worklist *wk;
1453	struct indirdep *indirdep, *newindirdep;
1454	struct bmsafemap *bmsafemap;
1455	struct allocindir *oldaip;
1456	struct freefrag *freefrag;
1457	struct newblk *newblk;
1458
1459	if (bp->b_lblkno >= 0)
1460		panic("setup_allocindir_phase2: not indir blk");
1461	for (indirdep = NULL, newindirdep = NULL; ; ) {
1462		ACQUIRE_LOCK(&lk);
1463		for (wk = LIST_FIRST(&bp->b_dep); wk;
1464		     wk = LIST_NEXT(wk, wk_list)) {
1465			if (wk->wk_type != D_INDIRDEP)
1466				continue;
1467			indirdep = WK_INDIRDEP(wk);
1468			break;
1469		}
1470		if (indirdep == NULL && newindirdep) {
1471			indirdep = newindirdep;
1472			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1473			newindirdep = NULL;
1474		}
1475		FREE_LOCK(&lk);
1476		if (indirdep) {
1477			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1478			    &newblk) == 0)
1479				panic("setup_allocindir: lost block");
1480			ACQUIRE_LOCK(&lk);
1481			if (newblk->nb_state == DEPCOMPLETE) {
1482				aip->ai_state |= DEPCOMPLETE;
1483				aip->ai_buf = NULL;
1484			} else {
1485				bmsafemap = newblk->nb_bmsafemap;
1486				aip->ai_buf = bmsafemap->sm_buf;
1487				LIST_REMOVE(newblk, nb_deps);
1488				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1489				    aip, ai_deps);
1490			}
1491			LIST_REMOVE(newblk, nb_hash);
1492			FREE(newblk, M_NEWBLK);
1493			aip->ai_indirdep = indirdep;
1494			/*
1495			 * Check to see if there is an existing dependency
1496			 * for this block. If there is, merge the old
1497			 * dependency into the new one.
1498			 */
1499			if (aip->ai_oldblkno == 0)
1500				oldaip = NULL;
1501			else
1502				for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd);
1503				    oldaip; oldaip = LIST_NEXT(oldaip, ai_next))
1504					if (oldaip->ai_offset == aip->ai_offset)
1505						break;
1506			if (oldaip != NULL) {
1507				if (oldaip->ai_newblkno != aip->ai_oldblkno)
1508					panic("setup_allocindir_phase2: blkno");
1509				aip->ai_oldblkno = oldaip->ai_oldblkno;
1510				freefrag = oldaip->ai_freefrag;
1511				oldaip->ai_freefrag = aip->ai_freefrag;
1512				aip->ai_freefrag = freefrag;
1513				free_allocindir(oldaip, NULL);
1514			}
1515			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1516			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1517			    [aip->ai_offset] = aip->ai_oldblkno;
1518			FREE_LOCK(&lk);
1519		}
1520		if (newindirdep) {
1521			if (newindirdep->ir_savebp != NULL)
1522				brelse(newindirdep->ir_savebp);
1523			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1524		}
1525		if (indirdep)
1526			break;
1527		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1528			M_INDIRDEP, M_WAITOK);
1529		newindirdep->ir_list.wk_type = D_INDIRDEP;
1530		newindirdep->ir_state = ATTACHED;
1531		LIST_INIT(&newindirdep->ir_deplisthd);
1532		LIST_INIT(&newindirdep->ir_donehd);
1533#ifdef __FreeBSD__
1534		if (bp->b_blkno == bp->b_lblkno) {
1535#if 0 /* we know this happens.. research suggested.. */
1536			printf("setup_allocindir_phase2: need bmap, blk %d\n",
1537				bp->b_lblkno);
1538#endif
1539			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1540				NULL, NULL);
1541		}
1542#endif /* __FreeBSD__ */
1543		newindirdep->ir_savebp =
1544		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1545		bp->b_flags |= B_XXX;
1546		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1547	}
1548}
1549
1550/*
1551 * Block de-allocation dependencies.
1552 *
1553 * When blocks are de-allocated, the on-disk pointers must be nullified before
1554 * the blocks are made available for use by other files.  (The true
1555 * requirement is that old pointers must be nullified before new on-disk
1556 * pointers are set.  We chose this slightly more stringent requirement to
1557 * reduce complexity.) Our implementation handles this dependency by updating
1558 * the inode (or indirect block) appropriately but delaying the actual block
1559 * de-allocation (i.e., freemap and free space count manipulation) until
1560 * after the updated versions reach stable storage.  After the disk is
1561 * updated, the blocks can be safely de-allocated whenever it is convenient.
1562 * This implementation handles only the common case of reducing a file's
1563 * length to zero. Other cases are handled by the conventional synchronous
1564 * write approach.
1565 *
1566 * The ffs implementation with which we worked double-checks
1567 * the state of the block pointers and file size as it reduces
1568 * a file's length.  Some of this code is replicated here in our
1569 * soft updates implementation.  The freeblks->fb_chkcnt field is
1570 * used to transfer a part of this information to the procedure
1571 * that eventually de-allocates the blocks.
1572 *
1573 * This routine should be called from the routine that shortens
1574 * a file's length, before the inode's size or block pointers
1575 * are modified. It will save the block pointer information for
1576 * later release and zero the inode so that the calling routine
1577 * can release it.
1578 */
1579static long num_freeblks;	/* number of freeblks allocated */
1580void
1581softdep_setup_freeblocks(ip, length)
1582	struct inode *ip;	/* The inode whose length is to be reduced */
1583	off_t length;		/* The new length for the file */
1584{
1585	struct freeblks *freeblks;
1586	struct inodedep *inodedep;
1587	struct allocdirect *adp;
1588	struct vnode *vp;
1589	struct buf *bp;
1590	struct fs *fs;
1591	int i, error;
1592
1593	fs = ip->i_fs;
1594	if (length != 0)
1595		panic("softde_setup_freeblocks: non-zero length");
1596	(void) checklimit(&num_freeblks, 0);
1597	num_freeblks += 1;
1598	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1599		M_FREEBLKS, M_WAITOK);
1600	bzero(freeblks, sizeof(struct freeblks));
1601	freeblks->fb_list.wk_type = D_FREEBLKS;
1602	freeblks->fb_uid = ip->i_uid;
1603	freeblks->fb_previousinum = ip->i_number;
1604	freeblks->fb_devvp = ip->i_devvp;
1605	freeblks->fb_fs = fs;
1606	freeblks->fb_oldsize = ip->i_size;
1607	freeblks->fb_newsize = length;
1608	freeblks->fb_chkcnt = ip->i_blocks;
1609	for (i = 0; i < NDADDR; i++) {
1610		freeblks->fb_dblks[i] = ip->i_db[i];
1611		ip->i_db[i] = 0;
1612	}
1613	for (i = 0; i < NIADDR; i++) {
1614		freeblks->fb_iblks[i] = ip->i_ib[i];
1615		ip->i_ib[i] = 0;
1616	}
1617	ip->i_blocks = 0;
1618	ip->i_size = 0;
1619	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
1621	 * to delete its dependencies below. Once the dependencies are gone
1622	 * the buffer can be safely released.
1623	 */
1624	if ((error = bread(ip->i_devvp,
1625	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1626	    (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1627		softdep_error("softdep_setup_freeblocks", error);
1628	*((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1629	    ip->i_din;
1630	/*
1631	 * Find and eliminate any inode dependencies.
1632	 */
1633	ACQUIRE_LOCK(&lk);
1634	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1635	if ((inodedep->id_state & IOSTARTED) != 0)
1636		panic("softdep_setup_freeblocks: inode busy");
1637	/*
1638	 * Add the freeblks structure to the list of operations that
1639	 * must await the zero'ed inode being written to disk.
1640	 */
1641	WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1642	/*
1643	 * Because the file length has been truncated to zero, any
1644	 * pending block allocation dependency structures associated
1645	 * with this inode are obsolete and can simply be de-allocated.
1646	 * We must first merge the two dependency lists to get rid of
1647	 * any duplicate freefrag structures, then purge the merged list.
1648	 */
1649	merge_inode_lists(inodedep);
1650	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1651		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
1652	bdwrite(bp);
1653	/*
1654	 * We must wait for any I/O in progress to finish so that
1655	 * all potential buffers on the dirty list will be visible.
1656	 * Once they are all there, walk the list and get rid of
1657	 * any dependencies.
1658	 */
1659	vp = ITOV(ip);
1660	while (vp->v_numoutput) {
1661		vp->v_flag |= VBWAIT;
1662		FREE_LOCK_INTERLOCKED(&lk);
1663		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "sdsetf", 0);
1664		ACQUIRE_LOCK_INTERLOCKED(&lk);
1665	}
1666	while (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1667		bp = LIST_FIRST(&vp->v_dirtyblkhd);
1668		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1669		deallocate_dependencies(bp, inodedep);
1670		bp->b_flags |= B_INVAL | B_NOCACHE;
1671		brelse(bp);
1672	}
1673	/*
1674	 * Try freeing the inodedep in case that was the last dependency.
1675	 */
1676	if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0)
1677		(void) free_inodedep(inodedep);
1678	FREE_LOCK(&lk);
1679}
1680
1681/*
1682 * Reclaim any dependency structures from a buffer that is about to
1683 * be reallocated to a new vnode. The buffer must be locked, thus,
1684 * no I/O completion operations can occur while we are manipulating
1685 * its associated dependencies. The mutex is held so that other I/O's
1686 * associated with related dependencies do not occur.
1687 */
1688static void
1689deallocate_dependencies(bp, inodedep)
1690	struct buf *bp;
1691	struct inodedep *inodedep;
1692{
1693	struct worklist *wk;
1694	struct indirdep *indirdep;
1695	struct allocindir *aip;
1696	struct pagedep *pagedep;
1697	struct dirrem *dirrem;
1698	struct diradd *dap;
1699	int i;
1700
1701	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1702		switch (wk->wk_type) {
1703
1704		case D_INDIRDEP:
1705			indirdep = WK_INDIRDEP(wk);
1706			/*
1707			 * None of the indirect pointers will ever be visible,
1708			 * so they can simply be tossed. GOINGAWAY ensures
1709			 * that allocated pointers will be saved in the buffer
1710			 * cache until they are freed. Note that they will
1711			 * only be able to be found by their physical address
1712			 * since the inode mapping the logical address will
1713			 * be gone. The save buffer used for the safe copy
1714			 * was allocated in setup_allocindir_phase2 using
1715			 * the physical address so it could be used for this
1716			 * purpose. Hence we swap the safe copy with the real
1717			 * copy, allowing the safe copy to be freed and holding
1718			 * on to the real copy for later use in indir_trunc.
1719			 */
1720			if (indirdep->ir_state & GOINGAWAY)
1721				panic("deallocate_dependencies: already gone");
1722			indirdep->ir_state |= GOINGAWAY;
1723			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1724				free_allocindir(aip, inodedep);
1725			if (bp->b_lblkno >= 0 ||
1726			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
1727				panic("deallocate_dependencies: not indir");
1728			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1729			    bp->b_bcount);
1730			WORKLIST_REMOVE(wk);
1731			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1732			continue;
1733
1734		case D_PAGEDEP:
1735			pagedep = WK_PAGEDEP(wk);
1736			/*
1737			 * None of the directory additions will ever be
1738			 * visible, so they can simply be tossed.
1739			 */
1740			for (i = 0; i < DAHASHSZ; i++)
1741				while (dap=LIST_FIRST(&pagedep->pd_diraddhd[i]))
1742					free_diradd(dap);
1743			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1744				free_diradd(dap);
1745			/*
1746			 * Copy any directory remove dependencies to the list
1747			 * to be processed after the zero'ed inode is written.
1748			 * If the inode has already been written, then they
1749			 * can be dumped directly onto the work list.
1750			 */
1751			for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem;
1752			     dirrem = LIST_NEXT(dirrem, dm_next)) {
1753				LIST_REMOVE(dirrem, dm_next);
1754				dirrem->dm_dirinum = pagedep->pd_ino;
1755				if (inodedep == NULL)
1756					add_to_worklist(&dirrem->dm_list);
1757				else
1758					WORKLIST_INSERT(&inodedep->id_bufwait,
1759					    &dirrem->dm_list);
1760			}
1761			WORKLIST_REMOVE(&pagedep->pd_list);
1762			LIST_REMOVE(pagedep, pd_hash);
1763			WORKITEM_FREE(pagedep, D_PAGEDEP);
1764			continue;
1765
1766		case D_ALLOCINDIR:
1767			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1768			continue;
1769
1770		case D_ALLOCDIRECT:
1771		case D_INODEDEP:
1772			panic("deallocate_dependencies: Unexpected type %s",
1773			    TYPENAME(wk->wk_type));
1774			/* NOTREACHED */
1775
1776		default:
1777			panic("deallocate_dependencies: Unknown type %s",
1778			    TYPENAME(wk->wk_type));
1779			/* NOTREACHED */
1780		}
1781	}
1782}
1783
1784/*
1785 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1786 * This routine must be called with splbio interrupts blocked.
1787 */
1788static void
1789free_allocdirect(adphead, adp, delay)
1790	struct allocdirectlst *adphead;
1791	struct allocdirect *adp;
1792	int delay;
1793{
1794
1795#ifdef DEBUG
1796	if (lk.lkt_held == -1)
1797		panic("free_allocdirect: lock not held");
1798#endif
1799	if ((adp->ad_state & DEPCOMPLETE) == 0)
1800		LIST_REMOVE(adp, ad_deps);
1801	TAILQ_REMOVE(adphead, adp, ad_next);
1802	if ((adp->ad_state & COMPLETE) == 0)
1803		WORKLIST_REMOVE(&adp->ad_list);
1804	if (adp->ad_freefrag != NULL) {
1805		if (delay)
1806			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1807			    &adp->ad_freefrag->ff_list);
1808		else
1809			add_to_worklist(&adp->ad_freefrag->ff_list);
1810	}
1811	WORKITEM_FREE(adp, D_ALLOCDIRECT);
1812}
1813
1814/*
1815 * Prepare an inode to be freed. The actual free operation is not
1816 * done until the zero'ed inode has been written to disk.
1817 */
1818static long num_freefile;	/* number of freefile allocated */
1819void
1820softdep_freefile(pvp, ino, mode)
1821		struct vnode *pvp;
1822		ino_t ino;
1823		int mode;
1824{
1825	struct inode *ip = VTOI(pvp);
1826	struct inodedep *inodedep;
1827	struct freefile *freefile;
1828
1829	/*
1830	 * This sets up the inode de-allocation dependency.
1831	 */
1832	(void) checklimit(&num_freefile, 0);
1833	num_freefile += 1;
1834	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1835		M_FREEFILE, M_WAITOK);
1836	freefile->fx_list.wk_type = D_FREEFILE;
1837	freefile->fx_list.wk_state = 0;
1838	freefile->fx_mode = mode;
1839	freefile->fx_oldinum = ino;
1840	freefile->fx_devvp = ip->i_devvp;
1841	freefile->fx_fs = ip->i_fs;
1842
1843	/*
1844	 * If the inodedep does not exist, then the zero'ed inode has
1845	 * been written to disk and we can free the file immediately.
1846	 */
1847	ACQUIRE_LOCK(&lk);
1848	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
1849		add_to_worklist(&freefile->fx_list);
1850		FREE_LOCK(&lk);
1851		return;
1852	}
1853
1854	/*
1855	 * If we still have a bitmap dependency, then the inode has never
1856	 * been written to disk. Drop the dependency as it is no longer
	 * necessary since the inode is being deallocated. We could process
	 * the freefile immediately, but then we would have to clear the
	 * id_inowait dependencies here; it is easier to let the zero'ed
	 * inode be written and have them cleaned up by the normal followup
	 * actions that accompany the inode write.
1862	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
1864		inodedep->id_state |= DEPCOMPLETE;
1865		LIST_REMOVE(inodedep, id_deps);
1866		inodedep->id_buf = NULL;
1867	}
1868	/*
1869	 * If the inodedep has no dependencies associated with it,
1870	 * then we must free it here and free the file immediately.
1871	 * This case arises when an early allocation fails (for
1872	 * example, the user is over their file quota).
1873	 */
1874	if (free_inodedep(inodedep) == 0)
1875		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
1876	else
1877		add_to_worklist(&freefile->fx_list);
1878	FREE_LOCK(&lk);
1879}
1880
1881/*
1882 * Try to free an inodedep structure. Return 1 if it could be freed.
1883 */
1884static int
1885free_inodedep(inodedep)
1886	struct inodedep *inodedep;
1887{
1888
1889	if ((inodedep->id_state & ONWORKLIST) != 0 ||
1890	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
1891	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
1892	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
1893	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
1894	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
1895	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
1896	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
1897		return (0);
1898	LIST_REMOVE(inodedep, id_hash);
1899	WORKITEM_FREE(inodedep, D_INODEDEP);
1900	num_inodedep -= 1;
1901	return (1);
1902}
1903
1904/*
1905 * This workitem routine performs the block de-allocation.
1906 * The workitem is added to the pending list after the updated
1907 * inode block has been written to disk.  As mentioned above,
1908 * checks regarding the number of blocks de-allocated (compared
1909 * to the number of blocks allocated for the file) are also
1910 * performed in this function.
1911 */
1912static void
1913handle_workitem_freeblocks(freeblks)
1914	struct freeblks *freeblks;
1915{
1916	struct inode tip;
1917	ufs_daddr_t bn;
1918	struct fs *fs;
1919	int i, level, bsize;
1920	long nblocks, blocksreleased = 0;
1921	int error, allerror = 0;
1922	ufs_lbn_t baselbns[NIADDR], tmpval;
1923
1924	tip.i_number = freeblks->fb_previousinum;
1925	tip.i_devvp = freeblks->fb_devvp;
1926	tip.i_dev = freeblks->fb_devvp->v_rdev;
1927	tip.i_fs = freeblks->fb_fs;
1928	tip.i_size = freeblks->fb_oldsize;
1929	tip.i_uid = freeblks->fb_uid;
1930	fs = freeblks->fb_fs;
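	/*
	 * Compute the first logical block number mapped by each level
	 * of indirect block.
	 */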
1931	tmpval = 1;
1932	baselbns[0] = NDADDR;
1933	for (i = 1; i < NIADDR; i++) {
1934		tmpval *= NINDIR(fs);
1935		baselbns[i] = baselbns[i - 1] + tmpval;
1936	}
1937	nblocks = btodb(fs->fs_bsize);
1938	blocksreleased = 0;
1939	/*
1940	 * Indirect blocks first.
1941	 */
1942	for (level = (NIADDR - 1); level >= 0; level--) {
1943		if ((bn = freeblks->fb_iblks[level]) == 0)
1944			continue;
		if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
		    baselbns[level], &blocksreleased)) != 0)
1947			allerror = error;
1948		ffs_blkfree(&tip, bn, fs->fs_bsize);
1949		blocksreleased += nblocks;
1950	}
1951	/*
1952	 * All direct blocks or frags.
1953	 */
1954	for (i = (NDADDR - 1); i >= 0; i--) {
1955		if ((bn = freeblks->fb_dblks[i]) == 0)
1956			continue;
1957		bsize = blksize(fs, &tip, i);
1958		ffs_blkfree(&tip, bn, bsize);
1959		blocksreleased += btodb(bsize);
1960	}
1961
1962#ifdef DIAGNOSTIC
1963	if (freeblks->fb_chkcnt != blocksreleased)
1964		panic("handle_workitem_freeblocks: block count");
1965	if (allerror)
1966		softdep_error("handle_workitem_freeblks", allerror);
1967#endif /* DIAGNOSTIC */
1968	WORKITEM_FREE(freeblks, D_FREEBLKS);
1969	num_freeblks -= 1;
1970}
1971
1972/*
1973 * Release blocks associated with the inode ip and stored in the indirect
1974 * block dbn. If level is greater than SINGLE, the block is an indirect block
1975 * and recursive calls to indirtrunc must be used to cleanse other indirect
1976 * blocks.
1977 */
1978static int
1979indir_trunc(ip, dbn, level, lbn, countp)
1980	struct inode *ip;
1981	ufs_daddr_t dbn;
1982	int level;
1983	ufs_lbn_t lbn;
1984	long *countp;
1985{
1986	struct buf *bp;
1987	ufs_daddr_t *bap;
1988	ufs_daddr_t nb;
1989	struct fs *fs;
1990	struct worklist *wk;
1991	struct indirdep *indirdep;
1992	int i, lbnadd, nblocks;
1993	int error, allerror = 0;
1994
1995	fs = ip->i_fs;
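	/*
	 * Compute the number of logical blocks spanned by each pointer
	 * at this indirect level; it is used to derive the logical
	 * block numbers passed to the recursive calls below.
	 */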
1996	lbnadd = 1;
1997	for (i = level; i > 0; i--)
1998		lbnadd *= NINDIR(fs);
1999	/*
2000	 * Get buffer of block pointers to be freed. This routine is not
2001	 * called until the zero'ed inode has been written, so it is safe
2002	 * to free blocks as they are encountered. Because the inode has
2003	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2004	 * have to use the on-disk address and the block device for the
2005	 * filesystem to look them up. If the file was deleted before its
2006	 * indirect blocks were all written to disk, the routine that set
2007	 * us up (deallocate_dependencies) will have arranged to leave
2008	 * a complete copy of the indirect block in memory for our use.
2009	 * Otherwise we have to read the blocks in from the disk.
2010	 */
2011	ACQUIRE_LOCK(&lk);
2012	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2013	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2014		if (wk->wk_type != D_INDIRDEP ||
2015		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2016		    (indirdep->ir_state & GOINGAWAY) == 0)
2017			panic("indir_trunc: lost indirdep");
2018		WORKLIST_REMOVE(wk);
2019		WORKITEM_FREE(indirdep, D_INDIRDEP);
2020		if (LIST_FIRST(&bp->b_dep) != NULL)
2021			panic("indir_trunc: dangling dep");
2022		FREE_LOCK(&lk);
2023	} else {
2024		FREE_LOCK(&lk);
2025		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2026		if (error)
2027			return (error);
2028	}
2029	/*
2030	 * Recursively free indirect blocks.
2031	 */
2032	bap = (ufs_daddr_t *)bp->b_data;
2033	nblocks = btodb(fs->fs_bsize);
2034	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2035		if ((nb = bap[i]) == 0)
2036			continue;
2037		if (level != 0) {
2038			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2039			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2040				allerror = error;
2041		}
2042		ffs_blkfree(ip, nb, fs->fs_bsize);
2043		*countp += nblocks;
2044	}
2045	bp->b_flags |= B_INVAL | B_NOCACHE;
2046	bp->b_flags &= ~B_XXX;
2047	brelse(bp);
2048	return (allerror);
2049}
2050
2051/*
2052 * Free an allocindir.
2053 * This routine must be called with splbio interrupts blocked.
2054 */
2055static void
2056free_allocindir(aip, inodedep)
2057	struct allocindir *aip;
2058	struct inodedep *inodedep;
2059{
2060	struct freefrag *freefrag;
2061
2062#ifdef DEBUG
2063	if (lk.lkt_held == -1)
2064		panic("free_allocindir: lock not held");
2065#endif
2066	if ((aip->ai_state & DEPCOMPLETE) == 0)
2067		LIST_REMOVE(aip, ai_deps);
2068	if (aip->ai_state & ONWORKLIST)
2069		WORKLIST_REMOVE(&aip->ai_list);
2070	LIST_REMOVE(aip, ai_next);
2071	if ((freefrag = aip->ai_freefrag) != NULL) {
2072		if (inodedep == NULL)
2073			add_to_worklist(&freefrag->ff_list);
2074		else
2075			WORKLIST_INSERT(&inodedep->id_bufwait,
2076			    &freefrag->ff_list);
2077	}
2078	WORKITEM_FREE(aip, D_ALLOCINDIR);
2079}
2080
2081/*
2082 * Directory entry addition dependencies.
2083 *
2084 * When adding a new directory entry, the inode (with its incremented link
2085 * count) must be written to disk before the directory entry's pointer to it.
2086 * Also, if the inode is newly allocated, the corresponding freemap must be
2087 * updated (on disk) before the directory entry's pointer. These requirements
2088 * are met via undo/redo on the directory entry's pointer, which consists
2089 * simply of the inode number.
2090 *
2091 * As directory entries are added and deleted, the free space within a
2092 * directory block can become fragmented.  The ufs file system will compact
2093 * a fragmented directory block to make space for a new entry. When this
2094 * occurs, the offsets of previously added entries change. Any "diradd"
2095 * dependency structures corresponding to these entries must be updated with
2096 * the new offsets.
2097 */
2098
2099/*
2100 * This routine is called after the in-memory inode's link
2101 * count has been incremented, but before the directory entry's
2102 * pointer to the inode has been set.
2103 */
2104void
2105softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2106	struct buf *bp;		/* buffer containing directory block */
2107	struct inode *dp;	/* inode for directory */
2108	off_t diroffset;	/* offset of new entry in directory */
2109	long newinum;		/* inode referenced by new directory entry */
2110	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2111{
2112	int offset;		/* offset of new entry within directory block */
2113	ufs_lbn_t lbn;		/* block in directory containing new entry */
2114	struct fs *fs;
2115	struct diradd *dap;
2116	struct pagedep *pagedep;
2117	struct inodedep *inodedep;
2118	struct mkdir *mkdir1, *mkdir2;
2119
2120	/*
2121	 * Whiteouts have no dependencies.
2122	 */
2123	if (newinum == WINO) {
2124		if (newdirbp != NULL)
2125			bdwrite(newdirbp);
2126		return;
2127	}
2128
2129	fs = dp->i_fs;
2130	lbn = lblkno(fs, diroffset);
2131	offset = blkoff(fs, diroffset);
2132	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
2133	bzero(dap, sizeof(struct diradd));
2134	dap->da_list.wk_type = D_DIRADD;
2135	dap->da_offset = offset;
2136	dap->da_newinum = newinum;
2137	dap->da_state = ATTACHED;
2138	if (newdirbp == NULL) {
2139		dap->da_state |= DEPCOMPLETE;
2140		ACQUIRE_LOCK(&lk);
2141	} else {
2142		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2143		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2144		    M_WAITOK);
2145		mkdir1->md_list.wk_type = D_MKDIR;
2146		mkdir1->md_state = MKDIR_BODY;
2147		mkdir1->md_diradd = dap;
2148		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2149		    M_WAITOK);
2150		mkdir2->md_list.wk_type = D_MKDIR;
2151		mkdir2->md_state = MKDIR_PARENT;
2152		mkdir2->md_diradd = dap;
2153		ACQUIRE_LOCK(&lk);
2154		/*
2155		 * Dependency on "." and ".." being written to disk.
2156		 */
2157		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2158		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2159		bdwrite(newdirbp);
2160		/*
2161		 * Dependency on link count increase for parent directory
2162		 */
2163		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2164		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2165			dap->da_state &= ~MKDIR_PARENT;
2166			WORKITEM_FREE(mkdir2, D_MKDIR);
2167		} else {
2168			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2169			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2170		}
2171	}
2172	/*
2173	 * Link into parent directory pagedep to await its being written.
2174	 */
2175	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2176		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2177	dap->da_pagedep = pagedep;
2178	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2179	    da_pdlist);
2180	/*
2181	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2182	 * is not yet written. If it is written, do the post-inode write
2183	 * processing to put it on the id_pendinghd list.
2184	 */
2185	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2186	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2187		diradd_inode_written(dap, inodedep);
2188	else
2189		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2190	FREE_LOCK(&lk);
2191}
2192
2193/*
2194 * This procedure is called to change the offset of a directory
2195 * entry when compacting a directory block which must be owned
2196 * exclusively by the caller. Note that the actual entry movement
2197 * must be done in this procedure to ensure that no I/O completions
2198 * occur while the move is in progress.
2199 */
2200void
2201softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2202	struct inode *dp;	/* inode for directory */
2203	caddr_t base;		/* address of dp->i_offset */
2204	caddr_t oldloc;		/* address of old directory location */
2205	caddr_t newloc;		/* address of new directory location */
2206	int entrysize;		/* size of directory entry */
2207{
2208	int offset, oldoffset, newoffset;
2209	struct pagedep *pagedep;
2210	struct diradd *dap;
2211	ufs_lbn_t lbn;
2212
2213	ACQUIRE_LOCK(&lk);
2214	lbn = lblkno(dp->i_fs, dp->i_offset);
2215	offset = blkoff(dp->i_fs, dp->i_offset);
2216	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2217		goto done;
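	/*
	 * Compute the old and new offsets of the entry within the
	 * directory block and update any diradd that refers to the
	 * old location, rehashing it if necessary.
	 */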
2218	oldoffset = offset + (oldloc - base);
2219	newoffset = offset + (newloc - base);
2220	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
2221	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2222		if (dap->da_offset != oldoffset)
2223			continue;
2224		dap->da_offset = newoffset;
2225		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2226			break;
2227		LIST_REMOVE(dap, da_pdlist);
2228		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2229		    dap, da_pdlist);
2230		break;
2231	}
2232done:
2233	bcopy(oldloc, newloc, entrysize);
2234	FREE_LOCK(&lk);
2235}
2236
2237/*
2238 * Free a diradd dependency structure. This routine must be called
2239 * with splbio interrupts blocked.
2240 */
2241static void
2242free_diradd(dap)
2243	struct diradd *dap;
2244{
2245	struct dirrem *dirrem;
2246	struct pagedep *pagedep;
2247	struct inodedep *inodedep;
2248	struct mkdir *mkdir, *nextmd;
2249
2250#ifdef DEBUG
2251	if (lk.lkt_held == -1)
2252		panic("free_diradd: lock not held");
2253#endif
2254	WORKLIST_REMOVE(&dap->da_list);
2255	LIST_REMOVE(dap, da_pdlist);
2256	if ((dap->da_state & DIRCHG) == 0) {
2257		pagedep = dap->da_pagedep;
2258	} else {
2259		dirrem = dap->da_previous;
2260		pagedep = dirrem->dm_pagedep;
2261		add_to_worklist(&dirrem->dm_list);
2262	}
2263	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2264	    0, &inodedep) != 0)
2265		(void) free_inodedep(inodedep);
2266	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2267		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2268			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2269			if (mkdir->md_diradd != dap)
2270				continue;
2271			dap->da_state &= ~mkdir->md_state;
2272			WORKLIST_REMOVE(&mkdir->md_list);
2273			LIST_REMOVE(mkdir, md_mkdirs);
2274			WORKITEM_FREE(mkdir, D_MKDIR);
2275		}
2276		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
2277			panic("free_diradd: unfound ref");
2278	}
2279	WORKITEM_FREE(dap, D_DIRADD);
2280}
2281
2282/*
2283 * Directory entry removal dependencies.
2284 *
2285 * When removing a directory entry, the entry's inode pointer must be
2286 * zero'ed on disk before the corresponding inode's link count is decremented
2287 * (possibly freeing the inode for re-use). This dependency is handled by
2288 * updating the directory entry but delaying the inode count reduction until
2289 * after the directory block has been written to disk. After this point, the
2290 * inode count can be decremented whenever it is convenient.
2291 */
2292
2293/*
2294 * This routine should be called immediately after removing
2295 * a directory entry.  The inode's link count should not be
2296 * decremented by the calling procedure -- the soft updates
2297 * code will do this task when it is safe.
2298 */
2299void
2300softdep_setup_remove(bp, dp, ip, isrmdir)
2301	struct buf *bp;		/* buffer containing directory block */
2302	struct inode *dp;	/* inode for the directory being modified */
2303	struct inode *ip;	/* inode for directory entry being removed */
2304	int isrmdir;		/* indicates if doing RMDIR */
2305{
2306	struct dirrem *dirrem;
2307
2308	/*
2309	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2310	 */
2311	dirrem = newdirrem(bp, dp, ip, isrmdir);
2312	if ((dirrem->dm_state & COMPLETE) == 0) {
2313		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2314		    dm_next);
2315	} else {
2316		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2317		add_to_worklist(&dirrem->dm_list);
2318	}
2319	FREE_LOCK(&lk);
2320}
2321
2322/*
2323 * Allocate a new dirrem if appropriate and return it along with
2324 * its associated pagedep. Called without a lock, returns with lock.
2325 */
2326static struct dirrem *
2327newdirrem(bp, dp, ip, isrmdir)
2328	struct buf *bp;		/* buffer containing directory block */
2329	struct inode *dp;	/* inode for the directory being modified */
2330	struct inode *ip;	/* inode for directory entry being removed */
2331	int isrmdir;		/* indicates if doing RMDIR */
2332{
2333	int offset;
2334	ufs_lbn_t lbn;
2335	struct diradd *dap;
2336	struct dirrem *dirrem;
2337	struct pagedep *pagedep;
2338
2339	/*
2340	 * Whiteouts have no deletion dependencies.
2341	 */
2342	if (ip == NULL)
2343		panic("newdirrem: whiteout");
2344	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2345		M_DIRREM, M_WAITOK);
2346	bzero(dirrem, sizeof(struct dirrem));
2347	dirrem->dm_list.wk_type = D_DIRREM;
2348	dirrem->dm_state = isrmdir ? RMDIR : 0;
2349	dirrem->dm_mnt = ITOV(ip)->v_mount;
2350	dirrem->dm_oldinum = ip->i_number;
2351
2352	ACQUIRE_LOCK(&lk);
2353	lbn = lblkno(dp->i_fs, dp->i_offset);
2354	offset = blkoff(dp->i_fs, dp->i_offset);
2355	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2356		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2357	dirrem->dm_pagedep = pagedep;
2358	/*
2359	 * Check for a diradd dependency for the same directory entry.
2360	 * If present, then both dependencies become obsolete and can
	 * be de-allocated. Check for an entry on both the pd_diraddhd
2362	 * list and the pd_pendinghd list.
2363	 */
2364	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
2365	     dap; dap = LIST_NEXT(dap, da_pdlist))
2366		if (dap->da_offset == offset)
2367			break;
2368	if (dap == NULL) {
2369		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2370		     dap; dap = LIST_NEXT(dap, da_pdlist))
2371			if (dap->da_offset == offset)
2372				break;
2373		if (dap == NULL)
2374			return (dirrem);
2375	}
2376	/*
2377	 * Must be ATTACHED at this point, so just delete it.
2378	 */
2379	if ((dap->da_state & ATTACHED) == 0)
2380		panic("newdirrem: not ATTACHED");
2381	if (dap->da_newinum != ip->i_number)
2382		panic("newdirrem: inum %d should be %d",
2383		    ip->i_number, dap->da_newinum);
2384	free_diradd(dap);
2385	dirrem->dm_state |= COMPLETE;
2386	return (dirrem);
2387}
2388
2389/*
2390 * Directory entry change dependencies.
2391 *
2392 * Changing an existing directory entry requires that an add operation
2393 * be completed first followed by a deletion. The semantics for the addition
2394 * are identical to the description of adding a new entry above except
2395 * that the rollback is to the old inode number rather than zero. Once
2396 * the addition dependency is completed, the removal is done as described
2397 * in the removal routine above.
2398 */
2399
2400/*
2401 * This routine should be called immediately after changing
2402 * a directory entry.  The inode's link count should not be
2403 * decremented by the calling procedure -- the soft updates
2404 * code will perform this task when it is safe.
2405 */
2406void
2407softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2408	struct buf *bp;		/* buffer containing directory block */
2409	struct inode *dp;	/* inode for the directory being modified */
2410	struct inode *ip;	/* inode for directory entry being removed */
2411	long newinum;		/* new inode number for changed entry */
2412	int isrmdir;		/* indicates if doing RMDIR */
2413{
2414	int offset;
2415	struct diradd *dap;
2416	struct dirrem *dirrem;
2417	struct inodedep *inodedep;
2418
2419	offset = blkoff(dp->i_fs, dp->i_offset);
2420
2421	/*
2422	 * Whiteouts have no addition dependencies.
2423	 */
2424	if (newinum == WINO) {
2425		dap = NULL;
2426	} else {
2427		MALLOC(dap, struct diradd *, sizeof(struct diradd),
2428		    M_DIRADD, M_WAITOK);
2429		bzero(dap, sizeof(struct diradd));
2430		dap->da_list.wk_type = D_DIRADD;
2431		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2432		dap->da_offset = offset;
2433		dap->da_newinum = newinum;
2434	}
2435
2436	/*
2437	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2438	 */
2439	dirrem = newdirrem(bp, dp, ip, isrmdir);
2440
2441	/*
2442	 * If the inode has already been written, then no addition
2443	 * dependency needs to be created.
2444	 */
2445	if (inodedep_lookup(dp->i_fs, newinum, 0, &inodedep) == 0 ||
2446	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		if (dap != NULL)
			WORKITEM_FREE(dap, D_DIRADD);
2448		dap = NULL;
2449	}
2450
2451	if (dap) {
2452		dap->da_previous = dirrem;
2453		LIST_INSERT_HEAD(
2454		    &dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)],
2455		    dap, da_pdlist);
2456		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2457	} else if ((dirrem->dm_state & COMPLETE) == 0) {
2458		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2459		    dm_next);
2460	} else {
2461		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2462		add_to_worklist(&dirrem->dm_list);
2463	}
2464	FREE_LOCK(&lk);
2465}
2466
2467/*
2468 * Called whenever the link count on an inode is increased.
2469 * It creates an inode dependency so that the new reference(s)
2470 * to the inode cannot be committed to disk until the updated
2471 * inode has been written.
2472 */
2473void
2474softdep_increase_linkcnt(ip)
2475	struct inode *ip;	/* the inode with the increased link count */
2476{
2477	struct inodedep *inodedep;
2478
2479	ACQUIRE_LOCK(&lk);
2480	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2481	FREE_LOCK(&lk);
2482}
2483
2484/*
2485 * This workitem decrements the inode's link count.
2486 * If the link count reaches zero, the file is removed.
2487 */
2488static void
2489handle_workitem_remove(dirrem)
2490	struct dirrem *dirrem;
2491{
2492	struct proc *p = CURPROC;	/* XXX */
2493	struct inodedep *inodedep;
2494	struct vnode *vp;
2495	struct inode *ip;
2496	int error;
2497
2498	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2499		softdep_error("handle_workitem_remove: vget", error);
2500		return;
2501	}
2502	ip = VTOI(vp);
2503	/*
2504	 * Normal file deletion.
2505	 */
2506	if ((dirrem->dm_state & RMDIR) == 0) {
2507		ip->i_nlink--;
2508		if (ip->i_nlink < ip->i_effnlink)
2509			panic("handle_workitem_remove: bad file delta");
2510		ip->i_flag |= IN_CHANGE;
2511		vput(vp);
2512		WORKITEM_FREE(dirrem, D_DIRREM);
2513		return;
2514	}
2515	/*
2516	 * Directory deletion. Decrement reference count for both the
2517	 * just deleted parent directory entry and the reference for ".".
2518	 * Next truncate the directory to length zero. When the
2519	 * truncation completes, arrange to have the reference count on
2520	 * the parent decremented to account for the loss of "..".
2521	 */
2522	ip->i_nlink -= 2;
2523	if (ip->i_nlink < ip->i_effnlink)
2524		panic("handle_workitem_remove: bad dir delta");
2525	ip->i_flag |= IN_CHANGE;
2526	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2527		softdep_error("handle_workitem_remove: truncate", error);
2528	ACQUIRE_LOCK(&lk);
2529	(void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC,
2530	    &inodedep);
2531	dirrem->dm_state = 0;
2532	dirrem->dm_oldinum = dirrem->dm_dirinum;
2533	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2534	FREE_LOCK(&lk);
2535	vput(vp);
2536}
2537
2538/*
2539 * Inode de-allocation dependencies.
2540 *
2541 * When an inode's link count is reduced to zero, it can be de-allocated. We
2542 * found it convenient to postpone de-allocation until after the inode is
2543 * written to disk with its new link count (zero).  At this point, all of the
2544 * on-disk inode's block pointers are nullified and, with careful dependency
2545 * list ordering, all dependencies related to the inode will be satisfied and
2546 * the corresponding dependency structures de-allocated.  So, if/when the
2547 * inode is reused, there will be no mixing of old dependencies with new
2548 * ones.  This artificial dependency is set up by the block de-allocation
2549 * procedure above (softdep_setup_freeblocks) and completed by the
2550 * following procedure.
2551 */
2552static void
2553handle_workitem_freefile(freefile)
2554	struct freefile *freefile;
2555{
2556	struct vnode vp;
2557	struct inode tip;
2558	struct inodedep *idp;
2559	int error;
2560
2561#ifdef DEBUG
2562	ACQUIRE_LOCK(&lk);
2563	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
2564		panic("handle_workitem_freefile: inodedep survived");
2565	FREE_LOCK(&lk);
2566#endif
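	/*
	 * Construct a minimal vnode and inode with just enough state
	 * for the call to ffs_freefile() below.
	 */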
2567	tip.i_devvp = freefile->fx_devvp;
2568	tip.i_dev = freefile->fx_devvp->v_rdev;
2569	tip.i_fs = freefile->fx_fs;
2570	vp.v_data = &tip;
2571	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2572		softdep_error("handle_workitem_freefile", error);
2573	WORKITEM_FREE(freefile, D_FREEFILE);
2574	num_freefile -= 1;
2575}
2576
2577/*
2578 * Disk writes.
2579 *
2580 * The dependency structures constructed above are most actively used when file
2581 * system blocks are written to disk.  No constraints are placed on when a
2582 * block can be written, but unsatisfied update dependencies are made safe by
2583 * modifying (or replacing) the source memory for the duration of the disk
2584 * write.  When the disk write completes, the memory block is again brought
2585 * up-to-date.
2586 *
2587 * In-core inode structure reclamation.
2588 *
2589 * Because there are a finite number of "in-core" inode structures, they are
2590 * reused regularly.  By transferring all inode-related dependencies to the
2591 * in-memory inode block and indexing them separately (via "inodedep"s), we
2592 * can allow "in-core" inode structures to be reused at any time and avoid
2593 * any increase in contention.
2594 *
2595 * Called just before entering the device driver to initiate a new disk I/O.
2596 * The buffer must be locked, thus, no I/O completion operations can occur
2597 * while we are manipulating its associated dependencies.
2598 */
2599void
2600softdep_disk_io_initiation(bp)
2601	struct buf *bp;		/* structure describing disk write to occur */
2602{
2603	struct worklist *wk, *nextwk;
2604	struct indirdep *indirdep;
2605
2606	/*
2607	 * We only care about write operations. There should never
2608	 * be dependencies for reads.
2609	 */
2610	if (bp->b_flags & B_READ)
2611		panic("softdep_disk_io_initiation: read");
2612	/*
2613	 * Do any necessary pre-I/O processing.
2614	 */
2615	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2616		nextwk = LIST_NEXT(wk, wk_list);
2617		switch (wk->wk_type) {
2618
2619		case D_PAGEDEP:
2620			initiate_write_filepage(WK_PAGEDEP(wk), bp);
2621			continue;
2622
2623		case D_INODEDEP:
2624			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2625			continue;
2626
2627		case D_INDIRDEP:
2628			indirdep = WK_INDIRDEP(wk);
2629			if (indirdep->ir_state & GOINGAWAY)
2630				panic("disk_io_initiation: indirdep gone");
2631			/*
2632			 * If there are no remaining dependencies, this
2633			 * will be writing the real pointers, so the
2634			 * dependency can be freed.
2635			 */
2636			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2637				indirdep->ir_savebp->b_flags &= ~B_XXX;
2638				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2639				brelse(indirdep->ir_savebp);
2640				/* inline expand WORKLIST_REMOVE(wk); */
2641				wk->wk_state &= ~ONWORKLIST;
2642				LIST_REMOVE(wk, wk_list);
2643				WORKITEM_FREE(indirdep, D_INDIRDEP);
2644				continue;
2645			}
2646			/*
2647			 * Replace up-to-date version with safe version.
2648			 */
2649			ACQUIRE_LOCK(&lk);
2650			indirdep->ir_state &= ~ATTACHED;
2651			indirdep->ir_state |= UNDONE;
2652			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2653			    M_INDIRDEP, M_WAITOK);
2654			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2655			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2656			    bp->b_bcount);
2657			FREE_LOCK(&lk);
2658			continue;
2659
2660		case D_MKDIR:
2661		case D_BMSAFEMAP:
2662		case D_ALLOCDIRECT:
2663		case D_ALLOCINDIR:
2664			continue;
2665
2666		default:
2667			panic("handle_disk_io_initiation: Unexpected type %s",
2668			    TYPENAME(wk->wk_type));
2669			/* NOTREACHED */
2670		}
2671	}
2672}
2673
2674/*
2675 * Called from within the procedure above to deal with unsatisfied
2676 * allocation dependencies in a directory. The buffer must be locked,
2677 * thus, no I/O completion operations can occur while we are
2678 * manipulating its associated dependencies.
2679 */
2680static void
2681initiate_write_filepage(pagedep, bp)
2682	struct pagedep *pagedep;
2683	struct buf *bp;
2684{
2685	struct diradd *dap;
2686	struct direct *ep;
2687	int i;
2688
2689	if (pagedep->pd_state & IOSTARTED) {
2690		/*
2691		 * This can only happen if there is a driver that does not
2692		 * understand chaining. Here biodone will reissue the call
2693		 * to strategy for the incomplete buffers.
2694		 */
2695		printf("initiate_write_filepage: already started\n");
2696		return;
2697	}
2698	pagedep->pd_state |= IOSTARTED;
2699	ACQUIRE_LOCK(&lk);
2700	for (i = 0; i < DAHASHSZ; i++) {
2701		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
2702		     dap = LIST_NEXT(dap, da_pdlist)) {
2703			ep = (struct direct *)
2704			    ((char *)bp->b_data + dap->da_offset);
2705			if (ep->d_ino != dap->da_newinum)
2706				panic("%s: dir inum %d != new %d",
2707				    "initiate_write_filepage",
2708				    ep->d_ino, dap->da_newinum);
2709			if (dap->da_state & DIRCHG)
2710				ep->d_ino = dap->da_previous->dm_oldinum;
2711			else
2712				ep->d_ino = 0;
2713			dap->da_state &= ~ATTACHED;
2714			dap->da_state |= UNDONE;
2715		}
2716	}
2717	FREE_LOCK(&lk);
2718}
2719
2720/*
2721 * Called from within the procedure above to deal with unsatisfied
2722 * allocation dependencies in an inodeblock. The buffer must be
2723 * locked, thus, no I/O completion operations can occur while we
2724 * are manipulating its associated dependencies.
2725 */
2726static void
2727initiate_write_inodeblock(inodedep, bp)
2728	struct inodedep *inodedep;
2729	struct buf *bp;			/* The inode block */
2730{
2731	struct allocdirect *adp, *lastadp;
2732	struct dinode *dp;
2733	struct fs *fs;
2734	ufs_lbn_t prevlbn = 0;
2735	int i, deplist;
2736
2737	if (inodedep->id_state & IOSTARTED)
2738		panic("initiate_write_inodeblock: already started");
2739	inodedep->id_state |= IOSTARTED;
2740	fs = inodedep->id_fs;
2741	dp = (struct dinode *)bp->b_data +
2742	    ino_to_fsbo(fs, inodedep->id_ino);
2743	/*
2744	 * If the bitmap is not yet written, then the allocated
2745	 * inode cannot be written to disk.
2746	 */
2747	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2748		if (inodedep->id_savedino != NULL)
2749			panic("initiate_write_inodeblock: already doing I/O");
2750		MALLOC(inodedep->id_savedino, struct dinode *,
2751		    sizeof(struct dinode), M_INODEDEP, M_WAITOK);
2752		*inodedep->id_savedino = *dp;
2753		bzero((caddr_t)dp, sizeof(struct dinode));
2754		return;
2755	}
2756	/*
2757	 * If no dependencies, then there is nothing to roll back.
2758	 */
2759	inodedep->id_savedsize = dp->di_size;
2760	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
2761		return;
2762	/*
2763	 * Set the dependencies to busy.
2764	 */
2765	ACQUIRE_LOCK(&lk);
2766	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2767	     adp = TAILQ_NEXT(adp, ad_next)) {
2768#ifdef DIAGNOSTIC
2769		if (deplist != 0 && prevlbn >= adp->ad_lbn)
2770			panic("softdep_write_inodeblock: lbn order");
2771		prevlbn = adp->ad_lbn;
2772		if (adp->ad_lbn < NDADDR &&
2773		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
2774			panic("%s: direct pointer #%d mismatch %d != %d",
2775			    "softdep_write_inodeblock", adp->ad_lbn,
2776			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
2777		if (adp->ad_lbn >= NDADDR &&
2778		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
2779			panic("%s: indirect pointer #%d mismatch %d != %d",
2780			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
2781			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
2782		deplist |= 1 << adp->ad_lbn;
2783		if ((adp->ad_state & ATTACHED) == 0)
2784			panic("softdep_write_inodeblock: Unknown state 0x%x",
2785			    adp->ad_state);
2786#endif /* DIAGNOSTIC */
2787		adp->ad_state &= ~ATTACHED;
2788		adp->ad_state |= UNDONE;
2789	}
2790	/*
2791	 * The on-disk inode cannot claim to be any larger than the last
2792	 * fragment that has been written. Otherwise, the on-disk inode
2793	 * might have fragments that were not the last block in the file
2794	 * which would corrupt the filesystem.
2795	 */
2796	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2797	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
2798		if (adp->ad_lbn >= NDADDR)
2799			break;
2800		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
2801		/* keep going until hitting a rollback to a frag */
2802		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
2803			continue;
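		/*
		 * Roll the on-disk size back to end at this fragment and
		 * clear all later block pointers; the DIAGNOSTIC checks
		 * verify that each cleared pointer has a dependency that
		 * will roll it forward once it is safe to do so.
		 */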
2804		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
2805		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
2806#ifdef DIAGNOSTIC
2807			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
2808				panic("softdep_write_inodeblock: lost dep1");
2809#endif /* DIAGNOSTIC */
2810			dp->di_db[i] = 0;
2811		}
2812		for (i = 0; i < NIADDR; i++) {
2813#ifdef DIAGNOSTIC
2814			if (dp->di_ib[i] != 0 &&
2815			    (deplist & ((1 << NDADDR) << i)) == 0)
2816				panic("softdep_write_inodeblock: lost dep2");
2817#endif /* DIAGNOSTIC */
2818			dp->di_ib[i] = 0;
2819		}
2820		FREE_LOCK(&lk);
2821		return;
2822	}
2823	/*
2824	 * If we have zero'ed out the last allocated block of the file,
2825	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is full-sized as
2827	 * we already checked for fragments in the loop above.
2828	 */
2829	if (lastadp != NULL &&
2830	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
2831		for (i = lastadp->ad_lbn; i >= 0; i--)
2832			if (dp->di_db[i] != 0)
2833				break;
2834		dp->di_size = (i + 1) * fs->fs_bsize;
2835	}
2836	/*
2837	 * The only dependencies are for indirect blocks.
2838	 *
2839	 * The file size for indirect block additions is not guaranteed.
2840	 * Such a guarantee would be non-trivial to achieve. The conventional
2841	 * synchronous write implementation also does not make this guarantee.
2842	 * Fsck should catch and fix discrepancies. Arguably, the file size
2843	 * can be over-estimated without destroying integrity when the file
2844	 * moves into the indirect blocks (i.e., is large). If we want to
2845	 * postpone fsck, we are stuck with this argument.
2846	 */
2847	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
2848		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
2849	FREE_LOCK(&lk);
2850}
2851
2852/*
2853 * This routine is called during the completion interrupt
2854 * service routine for a disk write (from the procedure called
2855 * by the device driver to inform the file system caches of
2856 * a request completion).  It should be called early in this
2857 * procedure, before the block is made available to other
2858 * processes or other routines are called.
2859 */
2860void
2861softdep_disk_write_complete(bp)
2862	struct buf *bp;		/* describes the completed disk write */
2863{
2864	struct worklist *wk;
2865	struct workhead reattach;
2866	struct newblk *newblk;
2867	struct allocindir *aip;
2868	struct allocdirect *adp;
2869	struct indirdep *indirdep;
2870	struct inodedep *inodedep;
2871	struct bmsafemap *bmsafemap;
2872
2873#ifdef DEBUG
2874	if (lk.lkt_held != -1)
2875		panic("softdep_disk_write_complete: lock is held");
2876	lk.lkt_held = -2;
2877#endif
2878	LIST_INIT(&reattach);
2879	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2880		WORKLIST_REMOVE(wk);
2881		switch (wk->wk_type) {
2882
2883		case D_PAGEDEP:
2884			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
2885				WORKLIST_INSERT(&reattach, wk);
2886			continue;
2887
2888		case D_INODEDEP:
2889			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
2890				WORKLIST_INSERT(&reattach, wk);
2891			continue;
2892
2893		case D_BMSAFEMAP:
2894			bmsafemap = WK_BMSAFEMAP(wk);
2895			while (newblk = LIST_FIRST(&bmsafemap->sm_newblkhd)) {
2896				newblk->nb_state |= DEPCOMPLETE;
2897				newblk->nb_bmsafemap = NULL;
2898				LIST_REMOVE(newblk, nb_deps);
2899			}
2900			while (adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd)) {
2901				adp->ad_state |= DEPCOMPLETE;
2902				adp->ad_buf = NULL;
2903				LIST_REMOVE(adp, ad_deps);
2904				handle_allocdirect_partdone(adp);
2905			}
2906			while (aip = LIST_FIRST(&bmsafemap->sm_allocindirhd)) {
2907				aip->ai_state |= DEPCOMPLETE;
2908				aip->ai_buf = NULL;
2909				LIST_REMOVE(aip, ai_deps);
2910				handle_allocindir_partdone(aip);
2911			}
2912			while ((inodedep =
2913			       LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
2914				inodedep->id_state |= DEPCOMPLETE;
2915				LIST_REMOVE(inodedep, id_deps);
2916				inodedep->id_buf = NULL;
2917			}
2918			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
2919			continue;
2920
2921		case D_MKDIR:
2922			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
2923			continue;
2924
2925		case D_ALLOCDIRECT:
2926			adp = WK_ALLOCDIRECT(wk);
2927			adp->ad_state |= COMPLETE;
2928			handle_allocdirect_partdone(adp);
2929			continue;
2930
2931		case D_ALLOCINDIR:
2932			aip = WK_ALLOCINDIR(wk);
2933			aip->ai_state |= COMPLETE;
2934			handle_allocindir_partdone(aip);
2935			continue;
2936
2937		case D_INDIRDEP:
2938			indirdep = WK_INDIRDEP(wk);
2939			if (indirdep->ir_state & GOINGAWAY)
2940				panic("disk_write_complete: indirdep gone");
2941			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
2942			FREE(indirdep->ir_saveddata, M_INDIRDEP);
2943			indirdep->ir_saveddata = 0;
2944			indirdep->ir_state &= ~UNDONE;
2945			indirdep->ir_state |= ATTACHED;
2946			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
2947				LIST_REMOVE(aip, ai_next);
2948				handle_allocindir_partdone(aip);
2949			}
2950			WORKLIST_INSERT(&reattach, wk);
2951			bdirty(bp);
2952			continue;
2953
2954		default:
2955			panic("handle_disk_write_complete: Unknown type %s",
2956			    TYPENAME(wk->wk_type));
2957			/* NOTREACHED */
2958		}
2959	}
2960	/*
2961	 * Reattach any requests that must be redone.
2962	 */
2963	while ((wk = LIST_FIRST(&reattach)) != NULL) {
2964		WORKLIST_REMOVE(wk);
2965		WORKLIST_INSERT(&bp->b_dep, wk);
2966	}
2967#ifdef DEBUG
2968	if (lk.lkt_held != -2)
2969		panic("softdep_disk_write_complete: lock lost");
2970	lk.lkt_held = -1;
2971#endif
2972}
2973
2974/*
2975 * Called from within softdep_disk_write_complete above. Note that
2976 * this routine is always called from interrupt level with further
2977 * splbio interrupts blocked.
2978 */
2979static void
2980handle_allocdirect_partdone(adp)
2981	struct allocdirect *adp;	/* the completed allocdirect */
2982{
2983	struct allocdirect *listadp;
2984	struct inodedep *inodedep;
2985	long bsize;
2986
2987	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
2988		return;
2989	if (adp->ad_buf != NULL)
2990		panic("handle_allocdirect_partdone: dangling dep");
2991	/*
2992	 * The on-disk inode cannot claim to be any larger than the last
2993	 * fragment that has been written. Otherwise, the on-disk inode
2994	 * might have fragments that were not the last block in the file
2995	 * which would corrupt the filesystem. Thus, we cannot free any
2996	 * allocdirects after one whose ad_oldblkno claims a fragment as
2997	 * these blocks must be rolled back to zero before writing the inode.
2998	 * We check the currently active set of allocdirects in id_inoupdt.
2999	 */
3000	inodedep = adp->ad_inodedep;
3001	bsize = inodedep->id_fs->fs_bsize;
3002	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp;
3003	     listadp = TAILQ_NEXT(listadp, ad_next)) {
3004		/* found our block */
3005		if (listadp == adp)
3006			break;
		/* continue if the old block is not a fragment */
3008		if (listadp->ad_oldsize == 0 ||
3009		    listadp->ad_oldsize == bsize)
3010			continue;
3011		/* hit a fragment */
3012		return;
3013	}
3014	/*
3015	 * If we have reached the end of the current list without
3016	 * finding the just finished dependency, then it must be
3017	 * on the future dependency list. Future dependencies cannot
3018	 * be freed until they are moved to the current list.
3019	 */
3020	if (listadp == NULL) {
3021#ifdef DEBUG
3022		for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp;
3023		     listadp = TAILQ_NEXT(listadp, ad_next))
3024			/* found our block */
3025			if (listadp == adp)
3026				break;
3027		if (listadp == NULL)
3028			panic("handle_allocdirect_partdone: lost dep");
3029#endif /* DEBUG */
3030		return;
3031	}
3032	/*
3033	 * If we have found the just finished dependency, then free
3034	 * it along with anything that follows it that is complete.
3035	 */
3036	for (; adp; adp = listadp) {
3037		listadp = TAILQ_NEXT(adp, ad_next);
3038		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3039			return;
3040		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3041	}
3042}
3043
3044/*
3045 * Called from within softdep_disk_write_complete above. Note that
3046 * this routine is always called from interrupt level with further
3047 * splbio interrupts blocked.
3048 */
3049static void
3050handle_allocindir_partdone(aip)
3051	struct allocindir *aip;		/* the completed allocindir */
3052{
3053	struct indirdep *indirdep;
3054
3055	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3056		return;
3057	if (aip->ai_buf != NULL)
3058		panic("handle_allocindir_partdone: dangling dependency");
3059	indirdep = aip->ai_indirdep;
3060	if (indirdep->ir_state & UNDONE) {
3061		LIST_REMOVE(aip, ai_next);
3062		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3063		return;
3064	}
3065	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3066	    aip->ai_newblkno;
3067	LIST_REMOVE(aip, ai_next);
3068	if (aip->ai_freefrag != NULL)
3069		add_to_worklist(&aip->ai_freefrag->ff_list);
3070	WORKITEM_FREE(aip, D_ALLOCINDIR);
3071}
3072
3073/*
3074 * Called from within softdep_disk_write_complete above to restore
3075 * in-memory inode block contents to their most up-to-date state. Note
3076 * that this routine is always called from interrupt level with further
3077 * splbio interrupts blocked.
3078 */
3079static int
3080handle_written_inodeblock(inodedep, bp)
3081	struct inodedep *inodedep;
3082	struct buf *bp;		/* buffer containing the inode block */
3083{
3084	struct pagedep *pagedep;
3085	struct worklist *wk, *filefree;
3086	struct allocdirect *adp, *nextadp;
3087	struct dinode *dp;
3088	struct diradd *dap;
3089	int hadchanges;
3090
3091	if ((inodedep->id_state & IOSTARTED) == 0)
3092		panic("handle_written_inodeblock: not started");
3093	inodedep->id_state &= ~IOSTARTED;
3094	inodedep->id_state |= COMPLETE;
3095	dp = (struct dinode *)bp->b_data +
3096	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3097	/*
	 * If we had to roll back the inode allocation because the
	 * bitmap had not yet been written, then simply restore it.
3100	 * Keep the block dirty so that it will not be reclaimed until
3101	 * all associated dependencies have been cleared and the
3102	 * corresponding updates written to disk.
3103	 */
3104	if (inodedep->id_savedino != NULL) {
3105		*dp = *inodedep->id_savedino;
3106		FREE(inodedep->id_savedino, M_INODEDEP);
3107		inodedep->id_savedino = NULL;
3108		bdirty(bp);
3109		return (1);
3110	}
3111	/*
3112	 * Roll forward anything that had to be rolled back before
3113	 * the inode could be updated.
3114	 */
3115	hadchanges = 0;
3116	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3117		nextadp = TAILQ_NEXT(adp, ad_next);
3118		if (adp->ad_state & ATTACHED)
3119			panic("handle_written_inodeblock: new entry");
3120		if (adp->ad_lbn < NDADDR) {
3121			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
3122				panic("%s: %s #%d mismatch %d != %d",
3123				    "handle_written_inodeblock",
3124				    "direct pointer", adp->ad_lbn,
3125				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3126			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3127		} else {
3128			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
3129				panic("%s: %s #%d allocated as %d",
3130				    "handle_written_inodeblock",
3131				    "indirect pointer", adp->ad_lbn - NDADDR,
3132				    dp->di_ib[adp->ad_lbn - NDADDR]);
3133			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3134		}
3135		adp->ad_state &= ~UNDONE;
3136		adp->ad_state |= ATTACHED;
3137		hadchanges = 1;
3138	}
3139	/*
3140	 * Reset the file size to its most up-to-date value.
3141	 */
3142	if (inodedep->id_savedsize == -1)
3143		panic("handle_written_inodeblock: bad size");
3144	if (dp->di_size != inodedep->id_savedsize) {
3145		dp->di_size = inodedep->id_savedsize;
3146		hadchanges = 1;
3147	}
3148	inodedep->id_savedsize = -1;
3149	/*
3150	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
3152	 * its correct form.
3153	 */
3154	if (hadchanges)
3155		bdirty(bp);
3156	/*
3157	 * Process any allocdirects that completed during the update.
3158	 */
3159	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3160		handle_allocdirect_partdone(adp);
3161	/*
3162	 * Process deallocations that were held pending until the
3163	 * inode had been written to disk. Freeing of the inode
3164	 * is delayed until after all blocks have been freed to
3165	 * avoid creation of new <vfsid, inum, lbn> triples
3166	 * before the old ones have been deleted.
3167	 */
3168	filefree = NULL;
3169	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3170		WORKLIST_REMOVE(wk);
3171		switch (wk->wk_type) {
3172
3173		case D_FREEFILE:
3174			/*
3175			 * We defer adding filefree to the worklist until
3176			 * all other additions have been made to ensure
3177			 * that it will be done after all the old blocks
3178			 * have been freed.
3179			 */
3180			if (filefree != NULL)
3181				panic("handle_written_inodeblock: filefree");
3182			filefree = wk;
3183			continue;
3184
3185		case D_MKDIR:
3186			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3187			continue;
3188
3189		case D_DIRADD:
3190			diradd_inode_written(WK_DIRADD(wk), inodedep);
3191			continue;
3192
3193		case D_FREEBLKS:
3194		case D_FREEFRAG:
3195		case D_DIRREM:
3196			add_to_worklist(wk);
3197			continue;
3198
3199		default:
3200			panic("handle_written_inodeblock: Unknown type %s",
3201			    TYPENAME(wk->wk_type));
3202			/* NOTREACHED */
3203		}
3204	}
3205	if (filefree != NULL) {
3206		if (free_inodedep(inodedep) == 0)
3207			panic("handle_written_inodeblock: live inodedep");
3208		add_to_worklist(filefree);
3209		return (0);
3210	}
3211
3212	/*
3213	 * If no outstanding dependencies, free it.
3214	 */
3215	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3216		return (0);
3217	return (hadchanges);
3218}
3219
3220/*
3221 * Process a diradd entry after its dependent inode has been written.
3222 * This routine must be called with splbio interrupts blocked.
3223 */
3224static void
3225diradd_inode_written(dap, inodedep)
3226	struct diradd *dap;
3227	struct inodedep *inodedep;
3228{
3229	struct pagedep *pagedep;
3230
3231	dap->da_state |= COMPLETE;
3232	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3233		if (dap->da_state & DIRCHG)
3234			pagedep = dap->da_previous->dm_pagedep;
3235		else
3236			pagedep = dap->da_pagedep;
3237		LIST_REMOVE(dap, da_pdlist);
3238		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3239	}
3240	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3241}
3242
3243/*
3244 * Handle the completion of a mkdir dependency.
3245 */
3246static void
3247handle_written_mkdir(mkdir, type)
3248	struct mkdir *mkdir;
3249	int type;
3250{
3251	struct diradd *dap;
3252	struct pagedep *pagedep;
3253
3254	if (mkdir->md_state != type)
3255		panic("handle_written_mkdir: bad type");
3256	dap = mkdir->md_diradd;
3257	dap->da_state &= ~type;
3258	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3259		dap->da_state |= DEPCOMPLETE;
3260	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3261		if (dap->da_state & DIRCHG)
3262			pagedep = dap->da_previous->dm_pagedep;
3263		else
3264			pagedep = dap->da_pagedep;
3265		LIST_REMOVE(dap, da_pdlist);
3266		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3267	}
3268	LIST_REMOVE(mkdir, md_mkdirs);
3269	WORKITEM_FREE(mkdir, D_MKDIR);
3270}
3271
3272/*
3273 * Called from within softdep_disk_write_complete above.
3274 * A write operation was just completed. Removed inodes can
3275 * now be freed and associated block pointers may be committed.
3276 * Note that this routine is always called from interrupt level
3277 * with further splbio interrupts blocked.
3278 */
3279static int
3280handle_written_filepage(pagedep, bp)
3281	struct pagedep *pagedep;
3282	struct buf *bp;		/* buffer containing the written page */
3283{
3284	struct dirrem *dirrem;
3285	struct diradd *dap, *nextdap;
3286	struct direct *ep;
3287	int i, chgs;
3288
3289	if ((pagedep->pd_state & IOSTARTED) == 0)
3290		panic("handle_written_filepage: not started");
3291	pagedep->pd_state &= ~IOSTARTED;
3292	/*
3293	 * Process any directory removals that have been committed.
3294	 */
3295	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3296		LIST_REMOVE(dirrem, dm_next);
3297		dirrem->dm_dirinum = pagedep->pd_ino;
3298		add_to_worklist(&dirrem->dm_list);
3299	}
3300	/*
3301	 * Free any directory additions that have been committed.
3302	 */
3303	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3304		free_diradd(dap);
3305	/*
3306	 * Uncommitted directory entries must be restored.
3307	 */
3308	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3309		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3310		     dap = nextdap) {
3311			nextdap = LIST_NEXT(dap, da_pdlist);
3312			if (dap->da_state & ATTACHED)
3313				panic("handle_written_filepage: attached");
3314			ep = (struct direct *)
3315			    ((char *)bp->b_data + dap->da_offset);
3316			ep->d_ino = dap->da_newinum;
3317			dap->da_state &= ~UNDONE;
3318			dap->da_state |= ATTACHED;
3319			chgs = 1;
3320			/*
3321			 * If the inode referenced by the directory has
3322			 * been written out, then the dependency can be
3323			 * moved to the pending list.
3324			 */
3325			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3326				LIST_REMOVE(dap, da_pdlist);
3327				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3328				    da_pdlist);
3329			}
3330		}
3331	}
3332	/*
3333	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
3335	 * its correct form.
3336	 */
3337	if (chgs)
3338		bdirty(bp);
3339	/*
3340	 * If no dependencies remain, the pagedep will be freed.
3341	 * Otherwise it will remain to update the page before it
3342	 * is written back to disk.
3343	 */
3344	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3345		for (i = 0; i < DAHASHSZ; i++)
3346			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3347				break;
3348		if (i == DAHASHSZ) {
3349			LIST_REMOVE(pagedep, pd_hash);
3350			WORKITEM_FREE(pagedep, D_PAGEDEP);
3351			return (0);
3352		}
3353	}
3354	return (1);
3355}
3356
3357/*
3358 * Writing back in-core inode structures.
3359 *
3360 * The file system only accesses an inode's contents when it occupies an
3361 * "in-core" inode structure.  These "in-core" structures are separate from
3362 * the page frames used to cache inode blocks.  Only the latter are
3363 * transferred to/from the disk.  So, when the updated contents of the
3364 * "in-core" inode structure are copied to the corresponding in-memory inode
3365 * block, the dependencies are also transferred.  The following procedure is
3366 * called when copying a dirty "in-core" inode to a cached inode block.
3367 */
3368
3369/*
3370 * Called when an inode is loaded from disk. If the effective link count
3371 * differed from the actual link count when it was last flushed, then we
3372 * need to ensure that the correct effective link count is put back.
3373 */
3374void
3375softdep_load_inodeblock(ip)
3376	struct inode *ip;	/* the "in_core" copy of the inode */
3377{
3378	struct inodedep *inodedep;
3379	int error, gotit;
3380
3381	/*
3382	 * Check for alternate nlink count.
3383	 */
3384	ip->i_effnlink = ip->i_nlink;
3385	ACQUIRE_LOCK(&lk);
3386	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3387		FREE_LOCK(&lk);
3388		return;
3389	}
3390	if (inodedep->id_nlinkdelta != 0) {
3391		ip->i_effnlink -= inodedep->id_nlinkdelta;
3392		ip->i_flag |= IN_MODIFIED;
3393		inodedep->id_nlinkdelta = 0;
3394		(void) free_inodedep(inodedep);
3395	}
3396	FREE_LOCK(&lk);
3397}
3398
3399/*
3400 * This routine is called just before the "in-core" inode
3401 * information is to be copied to the in-memory inode block.
3402 * Recall that an inode block contains several inodes. If
3403 * the force flag is set, then the dependencies will be
3404 * cleared so that the update can always be made. Note that
3405 * the buffer is locked when this routine is called, so we
3406 * will never be in the middle of writing the inode block
3407 * to disk.
3408 */
3409void
3410softdep_update_inodeblock(ip, bp, waitfor)
3411	struct inode *ip;	/* the "in_core" copy of the inode */
3412	struct buf *bp;		/* the buffer containing the inode block */
3413	int waitfor;		/* 1 => update must be allowed */
3414{
3415	struct inodedep *inodedep;
3416	struct worklist *wk;
3417	int error, gotit;
3418
3419	/*
3420	 * If the effective link count is not equal to the actual link
3421	 * count, then we must track the difference in an inodedep while
3422	 * the inode is (potentially) tossed out of the cache. Otherwise,
3423	 * if there is no existing inodedep, then there are no dependencies
3424	 * to track.
3425	 */
3426	ACQUIRE_LOCK(&lk);
3427	if (ip->i_effnlink != ip->i_nlink) {
3428		(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC,
3429		    &inodedep);
3430	} else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3431		FREE_LOCK(&lk);
3432		return;
3433	}
3434	if (ip->i_nlink < ip->i_effnlink)
3435		panic("softdep_update_inodeblock: bad delta");
3436	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3437	/*
3438	 * Changes have been initiated. Anything depending on these
3439	 * changes cannot occur until this inode has been written.
3440	 */
3441	inodedep->id_state &= ~COMPLETE;
3442	if ((inodedep->id_state & ONWORKLIST) == 0)
3443		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3444	/*
3445	 * Any new dependencies associated with the incore inode must
3446	 * now be moved to the list associated with the buffer holding
	 * the in-memory copy of the inode. Once merged, process any
3448	 * allocdirects that are completed by the merger.
3449	 */
3450	merge_inode_lists(inodedep);
3451	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3452		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3453	/*
3454	 * Now that the inode has been pushed into the buffer, the
3455	 * operations dependent on the inode being written to disk
3456	 * can be moved to the id_bufwait so that they will be
3457	 * processed when the buffer I/O completes.
3458	 */
3459	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3460		WORKLIST_REMOVE(wk);
3461		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3462	}
3463	/*
3464	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them has been written (indicated by
3466	 * DEPCOMPLETE being set in id_state). If we are doing a
3467	 * forced sync (e.g., an fsync on a file), we force the bitmap
3468	 * to be written so that the update can be done.
3469	 */
3470	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3471		FREE_LOCK(&lk);
3472		return;
3473	}
3474	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3475	FREE_LOCK(&lk);
3476	if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0)
3477		softdep_error("softdep_update_inodeblock: bwrite", error);
3478	if ((inodedep->id_state & DEPCOMPLETE) == 0)
3479		panic("softdep_update_inodeblock: update failed");
3480}
3481
3482/*
3483 * Merge the new inode dependency list (id_newinoupdt) into the old
3484 * inode dependency list (id_inoupdt). This routine must be called
3485 * with splbio interrupts blocked.
3486 */
3487static void
3488merge_inode_lists(inodedep)
3489	struct inodedep *inodedep;
3490{
3491	struct allocdirect *listadp, *newadp;
3492
3493	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3494	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3495		if (listadp->ad_lbn < newadp->ad_lbn) {
3496			listadp = TAILQ_NEXT(listadp, ad_next);
3497			continue;
3498		}
3499		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3500		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3501		if (listadp->ad_lbn == newadp->ad_lbn) {
3502			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3503			    listadp);
3504			listadp = newadp;
3505		}
3506		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3507	}
3508	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3509		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3510		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3511	}
3512}
3513
3514/*
3515 * If we are doing an fsync, then we must ensure that any directory
3516 * entries for the inode have been written after the inode gets to disk.
3517 */
3518int
3519softdep_fsync(vp)
3520	struct vnode *vp;	/* the "in_core" copy of the inode */
3521{
3522	struct diradd *dap, *olddap;
3523	struct inodedep *inodedep;
3524	struct pagedep *pagedep;
3525	struct worklist *wk;
3526	struct mount *mnt;
3527	struct vnode *pvp;
3528	struct inode *ip;
3529	struct buf *bp;
3530	struct fs *fs;
3531	struct proc *p = CURPROC;		/* XXX */
3532	int error, ret, flushparent;
3533	struct timeval tv;
3534	ino_t parentino;
3535	ufs_lbn_t lbn;
3536
3537	ip = VTOI(vp);
3538	fs = ip->i_fs;
3539	for (error = 0, flushparent = 0, olddap = NULL; ; ) {
3540		ACQUIRE_LOCK(&lk);
3541		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
3542			break;
3543		if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3544		    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3545		    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3546		    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
3547			panic("softdep_fsync: pending ops");
3548		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3549			break;
3550		if (wk->wk_type != D_DIRADD)
3551			panic("softdep_fsync: Unexpected type %s",
3552			    TYPENAME(wk->wk_type));
3553		dap = WK_DIRADD(wk);
3554		/*
3555		 * If we have failed to get rid of all the dependencies
3556		 * then something is seriously wrong.
3557		 */
3558		if (dap == olddap)
3559			panic("softdep_fsync: flush failed");
3560		olddap = dap;
3561		/*
3562		 * Flush our parent if this directory entry
3563		 * has a MKDIR_PARENT dependency.
3564		 */
3565		if (dap->da_state & DIRCHG)
3566			pagedep = dap->da_previous->dm_pagedep;
3567		else
3568			pagedep = dap->da_pagedep;
3569		mnt = pagedep->pd_mnt;
3570		parentino = pagedep->pd_ino;
3571		lbn = pagedep->pd_lbn;
3572		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
3573			panic("softdep_fsync: dirty");
3574		flushparent = dap->da_state & MKDIR_PARENT;
3575		/*
3576		 * If we are being fsync'ed as part of vgone'ing this vnode,
3577		 * then we will not be able to release and recover the
3578		 * vnode below, so we just have to give up on writing its
3579		 * directory entry out. It will eventually be written, just
3580		 * not now, but then the user was not asking to have it
3581		 * written, so we are not breaking any promises.
3582		 */
3583		if (vp->v_flag & VXLOCK)
3584			break;
3585		/*
3586		 * We prevent deadlock by always fetching inodes from the
3587		 * root, moving down the directory tree. Thus, when fetching
3588		 * our parent directory, we must unlock ourselves before
3589		 * requesting the lock on our parent. See the comment in
3590		 * ufs_lookup for details on possible races.
3591		 */
3592		FREE_LOCK(&lk);
3593		VOP_UNLOCK(vp, 0, p);
3594		if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) {
3595			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3596			return (error);
3597		}
3598		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3599		if (flushparent) {
3600#ifndef __FreeBSD__
3601			tv = time;
3602#else
3603			getmicrotime(&tv);
3604#endif /* __FreeBSD__ */
			if ((error = UFS_UPDATE(pvp, &tv, &tv, MNT_WAIT)) != 0) {
3606				vput(pvp);
3607				return (error);
3608			}
3609		}
3610		/*
3611		 * Flush directory page containing the inode's name.
3612		 */
3613		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3614		    &bp);
3615		ret = VOP_BWRITE(bp);
3616		vput(pvp);
3617		if (error != 0)
3618			return (error);
3619		if (ret != 0)
3620			return (ret);
3621	}
3622	FREE_LOCK(&lk);
3623	return (0);
3624}
3625
3626/*
3627 * This routine is called when we are trying to synchronously flush a
3628 * file. This routine must eliminate any filesystem metadata dependencies
3629 * so that the syncing routine can succeed by pushing the dirty blocks
3630 * associated with the file. If any I/O errors occur, they are returned.
3631 */
3632int
3633softdep_sync_metadata(ap)
3634	struct vop_fsync_args /* {
3635		struct vnode *a_vp;
3636		struct ucred *a_cred;
3637		int a_waitfor;
3638		struct proc *a_p;
3639	} */ *ap;
3640{
3641	struct vnode *vp = ap->a_vp;
3642	struct pagedep *pagedep;
3643	struct allocdirect *adp;
3644	struct allocindir *aip;
3645	struct buf *bp, *nbp;
3646	struct worklist *wk;
3647	int i, error, waitfor;
3648
3649	/*
3650	 * Check whether this vnode is involved in a filesystem
3651	 * that is doing soft dependency processing.
3652	 */
3653	if (vp->v_type != VBLK) {
3654		if (!DOINGSOFTDEP(vp))
3655			return (0);
3656	} else
3657		if (vp->v_specmountpoint == NULL ||
3658		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
3659			return (0);
3660	/*
3661	 * Ensure that any direct block dependencies have been cleared.
3662	 */
3663	ACQUIRE_LOCK(&lk);
	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs,
	    VTOI(vp)->i_number)) != 0) {
3665		FREE_LOCK(&lk);
3666		return (error);
3667	}
3668	/*
3669	 * For most files, the only metadata dependencies are the
3670	 * cylinder group maps that allocate their inode or blocks.
3671	 * The block allocation dependencies can be found by traversing
3672	 * the dependency lists for any buffers that remain on their
3673	 * dirty buffer list. The inode allocation dependency will
3674	 * be resolved when the inode is updated with MNT_WAIT.
3675	 * This work is done in two passes. The first pass grabs most
3676	 * of the buffers and begins asynchronously writing them. The
3677	 * only way to wait for these asynchronous writes is to sleep
3678	 * on the filesystem vnode which may stay busy for a long time
3679	 * if the filesystem is active. So, instead, we make a second
3680	 * pass over the dependencies blocking on each write. In the
3681	 * usual case we will be blocking against a write that we
3682	 * initiated, so when it is done the dependency will have been
3683	 * resolved. Thus the second pass is expected to end quickly.
3684	 */
3685	waitfor = MNT_NOWAIT;
3686top:
3687	if (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
3688		while (vp->v_numoutput) {
3689			vp->v_flag |= VBWAIT;
3690			FREE_LOCK_INTERLOCKED(&lk);
3691			tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1,
3692				"sdsynm", 0);
3693			ACQUIRE_LOCK_INTERLOCKED(&lk);
3694		}
3695		FREE_LOCK(&lk);
3696		return (0);
3697	}
3698	bp = LIST_FIRST(&vp->v_dirtyblkhd);
3699loop:
3700	/*
3701	 * As we hold the buffer locked, none of its dependencies
3702	 * will disappear.
3703	 */
3704	for (wk = LIST_FIRST(&bp->b_dep); wk;
3705	     wk = LIST_NEXT(wk, wk_list)) {
3706		switch (wk->wk_type) {
3707
3708		case D_ALLOCDIRECT:
3709			adp = WK_ALLOCDIRECT(wk);
3710			if (adp->ad_state & DEPCOMPLETE)
3711				break;
3712			nbp = adp->ad_buf;
3713			if (getdirtybuf(&nbp, waitfor) == 0)
3714				break;
3715			FREE_LOCK(&lk);
3716			if (waitfor == MNT_NOWAIT) {
3717				bawrite(nbp);
3718			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3719				bawrite(bp);
3720				return (error);
3721			}
3722			ACQUIRE_LOCK(&lk);
3723			break;
3724
3725		case D_ALLOCINDIR:
3726			aip = WK_ALLOCINDIR(wk);
3727			if (aip->ai_state & DEPCOMPLETE)
3728				break;
3729			nbp = aip->ai_buf;
3730			if (getdirtybuf(&nbp, waitfor) == 0)
3731				break;
3732			FREE_LOCK(&lk);
3733			if (waitfor == MNT_NOWAIT) {
3734				bawrite(nbp);
3735			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3736				bawrite(bp);
3737				return (error);
3738			}
3739			ACQUIRE_LOCK(&lk);
3740			break;
3741
3742		case D_INDIRDEP:
3743		restart:
3744			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
3745			     aip; aip = LIST_NEXT(aip, ai_next)) {
3746				if (aip->ai_state & DEPCOMPLETE)
3747					continue;
3748				nbp = aip->ai_buf;
3749				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
3750					goto restart;
3751				FREE_LOCK(&lk);
3752				if ((error = VOP_BWRITE(nbp)) != 0) {
3753					bawrite(bp);
3754					return (error);
3755				}
3756				ACQUIRE_LOCK(&lk);
3757				goto restart;
3758			}
3759			break;
3760
3761		case D_INODEDEP:
3762			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
3763			    WK_INODEDEP(wk)->id_ino)) != 0) {
3764				FREE_LOCK(&lk);
3765				bawrite(bp);
3766				return (error);
3767			}
3768			break;
3769
3770		case D_PAGEDEP:
3771			/*
3772			 * We are trying to sync a directory that may
			 * have dependencies both on its own metadata
			 * and on the inodes of any recently allocated
			 * files. We walk its diradd lists, pushing out
			 * the associated inode.
3777			 */
3778			pagedep = WK_PAGEDEP(wk);
3779			for (i = 0; i < DAHASHSZ; i++) {
3780				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
3781					continue;
				if ((error = flush_pagedep_deps(vp,
				    pagedep->pd_mnt,
				    &pagedep->pd_diraddhd[i])) != 0) {
3784					FREE_LOCK(&lk);
3785					bawrite(bp);
3786					return (error);
3787				}
3788			}
3789			break;
3790
3791		default:
3792			panic("softdep_sync_metadata: Unknown type %s",
3793			    TYPENAME(wk->wk_type));
3794			/* NOTREACHED */
3795		}
3796	}
3797	(void) getdirtybuf(&LIST_NEXT(bp, b_vnbufs), MNT_WAIT);
3798	nbp = LIST_NEXT(bp, b_vnbufs);
3799	FREE_LOCK(&lk);
3800	bawrite(bp);
3801	ACQUIRE_LOCK(&lk);
3802	if (nbp != NULL) {
3803		bp = nbp;
3804		goto loop;
3805	}
3806	/*
3807	 * We must wait for any I/O in progress to finish so that
3808	 * all potential buffers on the dirty list will be visible.
3809	 * Once they are all there, proceed with the second pass
3810	 * which will wait for the I/O as per above.
3811	 */
3812	while (vp->v_numoutput) {
3813		vp->v_flag |= VBWAIT;
3814		FREE_LOCK_INTERLOCKED(&lk);
3815		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "sdsynm", 0);
3816		ACQUIRE_LOCK_INTERLOCKED(&lk);
3817	}
3818	/*
3819	 * The brief unlock is to allow any pent up dependency
3820	 * processing to be done.
3821	 */
3822	if (waitfor == MNT_NOWAIT) {
3823		waitfor = MNT_WAIT;
3824		FREE_LOCK(&lk);
3825		ACQUIRE_LOCK(&lk);
3826		goto top;
3827	}
3828
3829	/*
3830	 * If we have managed to get rid of all the dirty buffers,
3831	 * then we are done. For certain directories and block
3832	 * devices, we may need to do further work.
3833	 */
3834	if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
3835		FREE_LOCK(&lk);
3836		return (0);
3837	}
3838
3839	FREE_LOCK(&lk);
3840	/*
3841	 * If we are trying to sync a block device, some of its buffers may
3842	 * contain metadata that cannot be written until the contents of some
3843	 * partially written files have been written to disk. The only easy
3844	 * way to accomplish this is to sync the entire filesystem (luckily
3845	 * this happens rarely).
3846	 */
3847	if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
3848	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
3849	     ap->a_p)) != 0)
3850		return (error);
3851	return (0);
3852}
3853
3854/*
3855 * Flush the dependencies associated with an inodedep.
3856 * Called with splbio blocked.
3857 */
3858static int
3859flush_inodedep_deps(fs, ino)
3860	struct fs *fs;
3861	ino_t ino;
3862{
3863	struct inodedep *inodedep;
3864	struct allocdirect *adp;
3865	int error, waitfor;
3866	struct buf *bp;
3867
3868	/*
3869	 * This work is done in two passes. The first pass grabs most
3870	 * of the buffers and begins asynchronously writing them. The
3871	 * only way to wait for these asynchronous writes is to sleep
3872	 * on the filesystem vnode which may stay busy for a long time
3873	 * if the filesystem is active. So, instead, we make a second
3874	 * pass over the dependencies blocking on each write. In the
3875	 * usual case we will be blocking against a write that we
3876	 * initiated, so when it is done the dependency will have been
3877	 * resolved. Thus the second pass is expected to end quickly.
3878	 * We give a brief window at the top of the loop to allow
3879	 * any pending I/O to complete.
3880	 */
3881	for (waitfor = MNT_NOWAIT; ; ) {
3882		FREE_LOCK(&lk);
3883		ACQUIRE_LOCK(&lk);
3884		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
3885			return (0);
3886		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3887		     adp = TAILQ_NEXT(adp, ad_next)) {
3888			if (adp->ad_state & DEPCOMPLETE)
3889				continue;
3890			bp = adp->ad_buf;
3891			if (getdirtybuf(&bp, waitfor) == 0) {
3892				if (waitfor == MNT_NOWAIT)
3893					continue;
3894				break;
3895			}
3896			FREE_LOCK(&lk);
3897			if (waitfor == MNT_NOWAIT) {
3898				bawrite(bp);
3899			} else if ((error = VOP_BWRITE(bp)) != 0) {
3900				ACQUIRE_LOCK(&lk);
3901				return (error);
3902			}
3903			ACQUIRE_LOCK(&lk);
3904			break;
3905		}
3906		if (adp != NULL)
3907			continue;
3908		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
3909		     adp = TAILQ_NEXT(adp, ad_next)) {
3910			if (adp->ad_state & DEPCOMPLETE)
3911				continue;
3912			bp = adp->ad_buf;
3913			if (getdirtybuf(&bp, waitfor) == 0) {
3914				if (waitfor == MNT_NOWAIT)
3915					continue;
3916				break;
3917			}
3918			FREE_LOCK(&lk);
3919			if (waitfor == MNT_NOWAIT) {
3920				bawrite(bp);
3921			} else if ((error = VOP_BWRITE(bp)) != 0) {
3922				ACQUIRE_LOCK(&lk);
3923				return (error);
3924			}
3925			ACQUIRE_LOCK(&lk);
3926			break;
3927		}
3928		if (adp != NULL)
3929			continue;
3930		/*
		 * If this was pass 2, we are done; otherwise do pass 2.
3932		 */
3933		if (waitfor == MNT_WAIT)
3934			break;
3935		waitfor = MNT_WAIT;
3936	}
3937	/*
3938	 * Try freeing inodedep in case all dependencies have been removed.
3939	 */
3940	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
3941		(void) free_inodedep(inodedep);
3942	return (0);
3943}
3944
3945/*
3946 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
3947 * Called with splbio blocked.
3948 */
3949static int
3950flush_pagedep_deps(pvp, mp, diraddhdp)
3951	struct vnode *pvp;
3952	struct mount *mp;
3953	struct diraddhd *diraddhdp;
3954{
3955	struct proc *p = CURPROC;	/* XXX */
3956	struct inodedep *inodedep;
3957	struct ufsmount *ump;
3958	struct diradd *dap;
3959	struct timeval tv;
3960	struct vnode *vp;
3961	int gotit, error = 0;
3962	struct buf *bp;
3963	ino_t inum;
3964
3965	ump = VFSTOUFS(mp);
3966	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
3967		/*
3968		 * Flush ourselves if this directory entry
3969		 * has a MKDIR_PARENT dependency.
3970		 */
3971		if (dap->da_state & MKDIR_PARENT) {
3972#ifndef __FreeBSD__
3973			tv = time;
3974#else
3975			getmicrotime(&tv);
3976#endif /* __FreeBSD__ */
3977			FREE_LOCK(&lk);
			if ((error = UFS_UPDATE(pvp, &tv, &tv, MNT_WAIT)) != 0)
3979				break;
3980			ACQUIRE_LOCK(&lk);
3981			/*
			 * If that cleared the dependencies, go on to the next entry.
3983			 */
3984			if (dap != LIST_FIRST(diraddhdp))
3985				continue;
3986			if (dap->da_state & MKDIR_PARENT)
3987				panic("flush_pagedep_deps: MKDIR");
3988		}
3989		/*
3990		 * Flush the file on which the directory entry depends.
3991		 * If the inode has already been pushed out of the cache,
3992		 * then all the block dependencies will have been flushed
3993		 * leaving only inode dependencies (e.g., bitmaps). Thus,
3994		 * we do a ufs_ihashget to check for the vnode in the cache.
3995		 * If it is there, we do a full flush. If it is no longer
3996		 * there we need only dispose of any remaining bitmap
3997		 * dependencies and write the inode to disk.
3998		 */
3999		inum = dap->da_newinum;
4000		FREE_LOCK(&lk);
4001		if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) {
4002			ACQUIRE_LOCK(&lk);
4003			if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0
4004			    && dap == LIST_FIRST(diraddhdp))
4005				panic("flush_pagedep_deps: flush 1 failed");
4006			/*
4007			 * If the inode still has bitmap dependencies,
4008			 * push them to disk.
4009			 */
4010			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4011				gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT);
4012				FREE_LOCK(&lk);
4013				if (gotit &&
4014				    (error = VOP_BWRITE(inodedep->id_buf)) != 0)
4015					break;
4016				ACQUIRE_LOCK(&lk);
4017			}
4018			if (dap != LIST_FIRST(diraddhdp))
4019				continue;
4020			/*
4021			 * If the inode is still sitting in a buffer waiting
4022			 * to be written, push it to disk.
4023			 */
4024			FREE_LOCK(&lk);
4025			if ((error = bread(ump->um_devvp,
4026			    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4027			    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4028				break;
4029			if ((error = VOP_BWRITE(bp)) != 0)
4030				break;
4031			ACQUIRE_LOCK(&lk);
4032			if (dap == LIST_FIRST(diraddhdp))
4033				panic("flush_pagedep_deps: flush 2 failed");
4034			continue;
4035		}
4036		if (vp->v_type == VDIR) {
4037			/*
4038			 * A newly allocated directory must have its "." and
4039			 * ".." entries written out before its name can be
4040			 * committed in its parent. We do not want or need
4041			 * the full semantics of a synchronous VOP_FSYNC as
4042			 * that may end up here again, once for each directory
4043			 * level in the filesystem. Instead, we push the blocks
4044			 * and wait for them to clear.
4045			 */
			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT,
			    p)) != 0) {
4047				vput(vp);
4048				break;
4049			}
4050			ACQUIRE_LOCK(&lk);
4051			while (vp->v_numoutput) {
4052				vp->v_flag |= VBWAIT;
4053				FREE_LOCK_INTERLOCKED(&lk);
4054				tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1,
4055					"sdflpd", 0);
4056				ACQUIRE_LOCK_INTERLOCKED(&lk);
4057			}
4058			FREE_LOCK(&lk);
4059		}
4060#ifndef __FreeBSD__
4061		tv = time;
4062#else
4063		getmicrotime(&tv);
4064#endif /* __FreeBSD__ */
4065		error = UFS_UPDATE(vp, &tv, &tv, MNT_WAIT);
4066		vput(vp);
4067		if (error)
4068			break;
4069		/*
4070		 * If we have failed to get rid of all the dependencies
4071		 * then something is seriously wrong.
4072		 */
4073		if (dap == LIST_FIRST(diraddhdp))
4074			panic("flush_pagedep_deps: flush 3 failed");
4075		ACQUIRE_LOCK(&lk);
4076	}
4077	if (error)
4078		ACQUIRE_LOCK(&lk);
4079	return (error);
4080}
4081
4082/*
4083 * Acquire exclusive access to a buffer.
4084 * Must be called with splbio blocked.
4085 * Return 1 if buffer was acquired.
4086 */
4087static int
4088getdirtybuf(bpp, waitfor)
4089	struct buf **bpp;
4090	int waitfor;
4091{
4092	struct buf *bp;
4093
4094	for (;;) {
4095		if ((bp = *bpp) == NULL)
4096			return (0);
4097		if ((bp->b_flags & B_BUSY) == 0)
4098			break;
4099		if (waitfor != MNT_WAIT)
4100			return (0);
4101		bp->b_flags |= B_WANTED;
4102		FREE_LOCK_INTERLOCKED(&lk);
4103		tsleep((caddr_t)bp, PRIBIO + 1, "sdsdty", 0);
4104		ACQUIRE_LOCK_INTERLOCKED(&lk);
4105	}
4106	if ((bp->b_flags & B_DELWRI) == 0)
4107		return (0);
4108	bremfree(bp);
4109	bp->b_flags |= B_BUSY;
4110	return (1);
4111}
4112
4113/*
4114 * Called whenever a buffer that is being invalidated or reallocated
4115 * contains dependencies. This should only happen if an I/O error has
4116 * occurred. The routine is called with the buffer locked.
4117 */
4118void
4119softdep_deallocate_dependencies(bp)
4120	struct buf *bp;
4121{
4122	struct worklist *wk;
4123
4124	if ((bp->b_flags & B_ERROR) == 0)
4125		panic("softdep_deallocate_dependencies: dangling deps");
4126	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4127	ACQUIRE_LOCK(&lk);
4128	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
4129		WORKLIST_REMOVE(wk);
4130		FREE_LOCK(&lk);
4131		switch (wk->wk_type) {
4132		/*
4133		 * XXX - should really clean up, but for now we will
4134		 * just leak memory and not worry about it. Also should
4135		 * mark the filesystem permanently dirty so that it will
4136		 * force fsck to be run (though this would best be done
4137		 * in the mainline code).
4138		 */
4139		case D_PAGEDEP:
4140		case D_INODEDEP:
4141		case D_BMSAFEMAP:
4142		case D_ALLOCDIRECT:
4143		case D_INDIRDEP:
4144		case D_ALLOCINDIR:
4145		case D_MKDIR:
4146#ifdef DEBUG
4147			printf("Lost type %s\n", TYPENAME(wk->wk_type));
4148#endif
4149			break;
4150		default:
4151			panic("%s: Unexpected type %s",
4152			    "softdep_deallocate_dependencies",
4153			    TYPENAME(wk->wk_type));
4154			/* NOTREACHED */
4155		}
4156		ACQUIRE_LOCK(&lk);
4157	}
4158	FREE_LOCK(&lk);
4159}
4160
4161/*
4162 * Function to handle asynchronous write errors in the filesystem.
4163 */
4164void
4165softdep_error(func, error)
4166	char *func;
4167	int error;
4168{
4169
4170	/* XXX should do something better! */
4171	log(LOG_ERR, "%s: got error %d while accessing filesystem\n",
4172	    func, error);
4173}
4174
4175