ffs_softdep.c revision 44383
1/*
2 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * The soft updates code is derived from the appendix of a University
5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6 * "Soft Updates: A Solution to the Metadata Update Problem in File
7 * Systems", CSE-TR-254-95, August 1995).
8 *
9 * The following are the copyrights and redistribution conditions that
10 * apply to this copy of the soft update software. For a license
11 * to use, redistribute or sell the soft update software under
12 * conditions other than those described here, please contact the
13 * author at one of the following addresses:
14 *
15 *	Marshall Kirk McKusick		mckusick@mckusick.com
16 *	1614 Oxford Street		+1-510-843-9542
17 *	Berkeley, CA 94709-1608
18 *	USA
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
22 * are met:
23 *
24 * 1. Redistributions of source code must retain the above copyright
25 *    notice, this list of conditions and the following disclaimer.
26 * 2. Redistributions in binary form must reproduce the above copyright
27 *    notice, this list of conditions and the following disclaimer in the
28 *    documentation and/or other materials provided with the distribution.
29 * 3. None of the names of McKusick, Ganger, Patt, or the University of
30 *    Michigan may be used to endorse or promote products derived from
31 *    this software without specific prior written permission.
32 * 4. Redistributions in any form must be accompanied by information on
33 *    how to obtain complete source code for any accompanying software
34 *    that uses this software. This source code must either be included
35 *    in the distribution or be available for no more than the cost of
36 *    distribution plus a nominal fee, and must be freely redistributable
37 *    under reasonable conditions. For an executable file, complete
38 *    source code means the source code for all modules it contains.
39 *    It does not mean source code for modules or files that typically
40 *    accompany the operating system on which the executable file runs,
41 *    e.g., standard library modules or system header files.
42 *
43 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
44 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
45 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
47 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 *	from: @(#)ffs_softdep.c	9.33 (McKusick) 2/25/99
56 *	$Id: ffs_softdep.c,v 1.22 1999/02/17 20:01:20 mckusick Exp $
57 */
58
59/*
60 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
61 */
62#ifndef DIAGNOSTIC
63#define DIAGNOSTIC
64#endif
65#ifndef DEBUG
66#define DEBUG
67#endif
68
69#include <sys/param.h>
70#include <sys/buf.h>
71#include <sys/kernel.h>
72#include <sys/malloc.h>
73#include <sys/mount.h>
74#include <sys/proc.h>
75#include <sys/syslog.h>
76#include <sys/systm.h>
77#include <sys/vnode.h>
78#include <miscfs/specfs/specdev.h>
79#include <ufs/ufs/dir.h>
80#include <ufs/ufs/quota.h>
81#include <ufs/ufs/inode.h>
82#include <ufs/ufs/ufsmount.h>
83#include <ufs/ffs/fs.h>
84#include <ufs/ffs/softdep.h>
85#include <ufs/ffs/ffs_extern.h>
86#include <ufs/ufs/ufs_extern.h>
87
88/*
89 * These definitions need to be adapted to the system to which
90 * this file is being ported.
91 */
92/*
93 * malloc types defined for the softdep system.
94 */
95MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
96MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
97MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
98MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
99MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
100MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
101MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
102MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
103MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
104MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
105MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
106MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
107MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
108
109#define	D_PAGEDEP	0
110#define	D_INODEDEP	1
111#define	D_NEWBLK	2
112#define	D_BMSAFEMAP	3
113#define	D_ALLOCDIRECT	4
114#define	D_INDIRDEP	5
115#define	D_ALLOCINDIR	6
116#define	D_FREEFRAG	7
117#define	D_FREEBLKS	8
118#define	D_FREEFILE	9
119#define	D_DIRADD	10
120#define	D_MKDIR		11
121#define	D_DIRREM	12
122#define D_LAST		D_DIRREM
123
124/*
125 * translate from workitem type to memory type
126 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
127 */
128static struct malloc_type *memtype[] = {
129	M_PAGEDEP,
130	M_INODEDEP,
131	M_NEWBLK,
132	M_BMSAFEMAP,
133	M_ALLOCDIRECT,
134	M_INDIRDEP,
135	M_ALLOCINDIR,
136	M_FREEFRAG,
137	M_FREEBLKS,
138	M_FREEFILE,
139	M_DIRADD,
140	M_MKDIR,
141	M_DIRREM
142};
143
144#define DtoM(type) (memtype[type])
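/*
 * For example, DtoM(D_PAGEDEP) yields M_PAGEDEP, so a workitem is always
 * freed back to the malloc type from which it was allocated.
 */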
145
146/*
147 * Names of malloc types.
148 */
149#define TYPENAME(type)  \
150	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
151#define CURPROC curproc
152/*
153 * End system adaptation definitions.
154 */
155
156/*
157 * Internal function prototypes.
158 */
159static	void softdep_error __P((char *, int));
160static	int getdirtybuf __P((struct buf **, int));
161static	int flush_pagedep_deps __P((struct vnode *, struct mount *,
162	    struct diraddhd *));
163static	int flush_inodedep_deps __P((struct fs *, ino_t));
164static	int handle_written_filepage __P((struct pagedep *, struct buf *));
165static  void diradd_inode_written __P((struct diradd *, struct inodedep *));
166static	int handle_written_inodeblock __P((struct inodedep *, struct buf *));
167static	void handle_allocdirect_partdone __P((struct allocdirect *));
168static	void handle_allocindir_partdone __P((struct allocindir *));
169static	void initiate_write_filepage __P((struct pagedep *, struct buf *));
170static	void handle_written_mkdir __P((struct mkdir *, int));
171static	void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
172static	void handle_workitem_freefile __P((struct freefile *));
173static	void handle_workitem_remove __P((struct dirrem *));
174static	struct dirrem *newdirrem __P((struct buf *, struct inode *,
175	    struct inode *, int));
176static	void free_diradd __P((struct diradd *));
177static	void free_allocindir __P((struct allocindir *, struct inodedep *));
178static	int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
179	    long *));
180static	void deallocate_dependencies __P((struct buf *, struct inodedep *));
181static	void free_allocdirect __P((struct allocdirectlst *,
182	    struct allocdirect *, int));
183static	int free_inodedep __P((struct inodedep *));
184static	void handle_workitem_freeblocks __P((struct freeblks *));
185static	void merge_inode_lists __P((struct inodedep *));
186static	void setup_allocindir_phase2 __P((struct buf *, struct inode *,
187	    struct allocindir *));
188static	struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
189	    ufs_daddr_t));
190static	void handle_workitem_freefrag __P((struct freefrag *));
191static	struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
192static	void allocdirect_merge __P((struct allocdirectlst *,
193	    struct allocdirect *, struct allocdirect *));
194static	struct bmsafemap *bmsafemap_lookup __P((struct buf *));
195static	int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
196	    struct newblk **));
197static	int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
198static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
199	    struct pagedep **));
200static	void pause_timer __P((void *));
201static	int checklimit __P((long *, int));
202static	void add_to_worklist __P((struct worklist *));
203
204/*
205 * Exported softdep operations.
206 */
207struct bio_ops bioops = {
208	softdep_disk_io_initiation,		/* io_start */
209	softdep_disk_write_complete,		/* io_complete */
210	softdep_deallocate_dependencies,	/* io_deallocate */
211	softdep_fsync,				/* io_fsync */
212	softdep_process_worklist,		/* io_sync */
213};
214
215/*
216 * Locking primitives.
217 *
218 * For a uniprocessor, all we need to do is protect against disk
219 * interrupts. For a multiprocessor, this lock would have to be
220 * a mutex. A single mutex is used throughout this file, though
221 * finer grain locking could be used if contention warranted it.
222 *
223 * For a multiprocessor, the sleep call would accept a lock and
224 * release it after the sleep processing was complete. In a uniprocessor
225 * implementation there is no such interlock, so we simply mark
226 * the places where it needs to be done with the `interlocked' form
227 * of the lock calls. Since the uniprocessor sleep already interlocks
228 * the spl, there is nothing that really needs to be done.
229 */
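/*
 * A minimal sketch of the pattern used throughout this file:
 *
 *	ACQUIRE_LOCK(&lk);
 *	... examine or modify dependency structures ...
 *	FREE_LOCK(&lk);
 *
 * The interlocked forms are used only around tsleep calls, where the
 * sleep itself provides the interlock.
 */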
230#ifndef /* NOT */ DEBUG
231static struct lockit {
232	int	lkt_spl;
233} lk = { 0 };
234#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
235#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
236#define ACQUIRE_LOCK_INTERLOCKED(lk)
237#define FREE_LOCK_INTERLOCKED(lk)
238
239#else /* DEBUG */
240static struct lockit {
241	int	lkt_spl;
242	pid_t	lkt_held;
243} lk = { 0, -1 };
244static int lockcnt;
245
246static	void acquire_lock __P((struct lockit *));
247static	void free_lock __P((struct lockit *));
248static	void acquire_lock_interlocked __P((struct lockit *));
249static	void free_lock_interlocked __P((struct lockit *));
250
251#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
252#define FREE_LOCK(lk)			free_lock(lk)
253#define ACQUIRE_LOCK_INTERLOCKED(lk)	acquire_lock_interlocked(lk)
254#define FREE_LOCK_INTERLOCKED(lk)	free_lock_interlocked(lk)
255
256static void
257acquire_lock(lk)
258	struct lockit *lk;
259{
260
261	if (lk->lkt_held != -1)
262		if (lk->lkt_held == CURPROC->p_pid)
263			panic("softdep_lock: locking against myself");
264		else
265			panic("softdep_lock: lock held by %d", lk->lkt_held);
266	lk->lkt_spl = splbio();
267	lk->lkt_held = CURPROC->p_pid;
268	lockcnt++;
269}
270
271static void
272free_lock(lk)
273	struct lockit *lk;
274{
275
276	if (lk->lkt_held == -1)
277		panic("softdep_unlock: lock not held");
278	lk->lkt_held = -1;
279	splx(lk->lkt_spl);
280}
281
282static void
283acquire_lock_interlocked(lk)
284	struct lockit *lk;
285{
286
287	if (lk->lkt_held != -1)
288		if (lk->lkt_held == CURPROC->p_pid)
289			panic("softdep_lock_interlocked: locking against self");
290		else
291			panic("softdep_lock_interlocked: lock held by %d",
292			    lk->lkt_held);
293	lk->lkt_held = CURPROC->p_pid;
294	lockcnt++;
295}
296
297static void
298free_lock_interlocked(lk)
299	struct lockit *lk;
300{
301
302	if (lk->lkt_held == -1)
303		panic("softdep_unlock_interlocked: lock not held");
304	lk->lkt_held = -1;
305}
306#endif /* DEBUG */
307
308/*
309 * Placeholder for real semaphores.
310 */
311struct sema {
312	int	value;
313	pid_t	holder;
314	char	*name;
315	int	prio;
316	int	timo;
317};
318static	void sema_init __P((struct sema *, char *, int, int));
319static	int sema_get __P((struct sema *, struct lockit *));
320static	void sema_release __P((struct sema *));
321
322static void
323sema_init(semap, name, prio, timo)
324	struct sema *semap;
325	char *name;
326	int prio, timo;
327{
328
329	semap->holder = -1;
330	semap->value = 0;
331	semap->name = name;
332	semap->prio = prio;
333	semap->timo = timo;
334}
335
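/*
 * Acquire the semaphore.  Returns 1 if it was free and is now held by
 * the caller.  Returns 0 after sleeping for a release (or timing out),
 * in which case the caller is expected to redo its lookup and retry.
 * In either case the interlock, if one is given, is released before
 * returning.
 */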
336static int
337sema_get(semap, interlock)
338	struct sema *semap;
339	struct lockit *interlock;
340{
341
342	if (semap->value++ > 0) {
343		if (interlock != NULL)
344			FREE_LOCK_INTERLOCKED(interlock);
345		tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
346		if (interlock != NULL) {
347			ACQUIRE_LOCK_INTERLOCKED(interlock);
348			FREE_LOCK(interlock);
349		}
350		return (0);
351	}
352	semap->holder = CURPROC->p_pid;
353	if (interlock != NULL)
354		FREE_LOCK(interlock);
355	return (1);
356}
357
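/*
 * Release the semaphore.  If any process slept on it while it was held,
 * the value exceeds one; it is reset to zero and all sleepers are woken
 * so that they can retry their operations.
 */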
358static void
359sema_release(semap)
360	struct sema *semap;
361{
362
363	if (semap->value <= 0 || semap->holder != CURPROC->p_pid)
364		panic("sema_release: not held");
365	if (--semap->value > 0) {
366		semap->value = 0;
367		wakeup(semap);
368	}
369	semap->holder = -1;
370}
371
372/*
373 * Worklist queue management.
374 * These routines require that the lock be held.
375 */
376#ifndef /* NOT */ DEBUG
377#define WORKLIST_INSERT(head, item) do {	\
378	(item)->wk_state |= ONWORKLIST;		\
379	LIST_INSERT_HEAD(head, item, wk_list);	\
380} while (0)
381#define WORKLIST_REMOVE(item) do {		\
382	(item)->wk_state &= ~ONWORKLIST;	\
383	LIST_REMOVE(item, wk_list);		\
384} while (0)
385#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
386
387#else /* DEBUG */
388static	void worklist_insert __P((struct workhead *, struct worklist *));
389static	void worklist_remove __P((struct worklist *));
390static	void workitem_free __P((struct worklist *, int));
391
392#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
393#define WORKLIST_REMOVE(item) worklist_remove(item)
394#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
395
396static void
397worklist_insert(head, item)
398	struct workhead *head;
399	struct worklist *item;
400{
401
402	if (lk.lkt_held == -1)
403		panic("worklist_insert: lock not held");
404	if (item->wk_state & ONWORKLIST)
405		panic("worklist_insert: already on list");
406	item->wk_state |= ONWORKLIST;
407	LIST_INSERT_HEAD(head, item, wk_list);
408}
409
410static void
411worklist_remove(item)
412	struct worklist *item;
413{
414
415	if (lk.lkt_held == -1)
416		panic("worklist_remove: lock not held");
417	if ((item->wk_state & ONWORKLIST) == 0)
418		panic("worklist_remove: not on list");
419	item->wk_state &= ~ONWORKLIST;
420	LIST_REMOVE(item, wk_list);
421}
422
423static void
424workitem_free(item, type)
425	struct worklist *item;
426	int type;
427{
428
429	if (item->wk_state & ONWORKLIST)
430		panic("workitem_free: still on list");
431	if (item->wk_type != type)
432		panic("workitem_free: type mismatch");
433	FREE(item, DtoM(type));
434}
435#endif /* DEBUG */
436
437/*
438 * Workitem queue management
439 */
440static struct workhead softdep_workitem_pending;
441static int softdep_worklist_busy;
442static int max_softdeps;	/* maximum number of structs before slowdown */
443static int tickdelay = 2;	/* number of ticks to pause during slowdown */
444static int max_limit_hit;	/* number of times slowdown imposed */
445static int rush_requests;	/* number of times I/O speeded up */
446static int proc_waiting;	/* tracks whether we have a timeout posted */
447static pid_t filesys_syncer_pid;/* records pid of filesystem syncer process */
448#ifdef DEBUG
449#include <vm/vm.h>
450#include <sys/sysctl.h>
451#if defined(__FreeBSD__)
452SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
453SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
454SYSCTL_INT(_debug, OID_AUTO, max_limit_hit, CTLFLAG_RW, &max_limit_hit, 0, "");
455SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, "");
456#else /* !__FreeBSD__ */
457struct ctldebug debug8 = { "max_softdeps", &max_softdeps };
458struct ctldebug debug9 = { "tickdelay", &tickdelay };
459struct ctldebug debug10 = { "max_limit_hit", &max_limit_hit };
460struct ctldebug debug11 = { "rush_requests", &rush_requests };
461#endif	/* !__FreeBSD__ */
462
463#endif /* DEBUG */
464
465/*
466 * Add an item to the end of the work queue.
467 * This routine requires that the lock be held.
468 * This is the only routine that adds items to the list.
469 * The following routine is the only one that removes items
470 * and does so in order from first to last.
471 */
472static void
473add_to_worklist(wk)
474	struct worklist *wk;
475{
476	static struct worklist *worklist_tail;
477
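	/*
	 * worklist_tail remembers the most recently appended item so that
	 * new work can be added at the end of the queue without walking it.
	 */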
478	if (wk->wk_state & ONWORKLIST)
479		panic("add_to_worklist: already on list");
480	wk->wk_state |= ONWORKLIST;
481	if (LIST_FIRST(&softdep_workitem_pending) == NULL) {
482		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
483	} else {
484		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
485	}
486	worklist_tail = wk;
487}
488
489/*
490 * Process that runs once per second to handle items in the background queue.
491 *
492 * Note that we ensure that items are processed in the order in which they
493 * appear in the queue. The code below depends on this property to ensure
494 * that blocks of a file are freed before the inode itself is freed. This
495 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
496 * until all the old ones have been purged from the dependency lists.
497 */
498int
499softdep_process_worklist(matchmnt)
500	struct mount *matchmnt;
501{
502	struct proc *p = CURPROC;
503	struct worklist *wk;
504	struct fs *matchfs;
505	int matchcnt;
506
507	/*
508	 * Record the process identifier of our caller so that we can
509	 * give this process preferential treatment in checklimit below.
510	 */
511	filesys_syncer_pid = p->p_pid;
512	matchcnt = 0;
513	matchfs = NULL;
514	if (matchmnt != NULL)
515		matchfs = VFSTOUFS(matchmnt)->um_fs;
516	/*
517	 * There is no danger of having multiple processes run this
518	 * code. It is single threaded solely so that softdep_flushfiles
519	 * (below) can get an accurate count of the number of items
520	 * related to its mount point that are in the list.
521	 */
522	if (softdep_worklist_busy && matchmnt == NULL)
523		return (-1);
524	ACQUIRE_LOCK(&lk);
525	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
526		WORKLIST_REMOVE(wk);
527		FREE_LOCK(&lk);
528		switch (wk->wk_type) {
529
530		case D_DIRREM:
531			/* removal of a directory entry */
532			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
533				matchcnt += 1;
534			handle_workitem_remove(WK_DIRREM(wk));
535			break;
536
537		case D_FREEBLKS:
538			/* releasing blocks and/or fragments from a file */
539			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
540				matchcnt += 1;
541			handle_workitem_freeblocks(WK_FREEBLKS(wk));
542			break;
543
544		case D_FREEFRAG:
545			/* releasing a fragment when replaced as a file grows */
546			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
547				matchcnt += 1;
548			handle_workitem_freefrag(WK_FREEFRAG(wk));
549			break;
550
551		case D_FREEFILE:
552			/* releasing an inode when its link count drops to 0 */
553			if (WK_FREEFILE(wk)->fx_fs == matchfs)
554				matchcnt += 1;
555			handle_workitem_freefile(WK_FREEFILE(wk));
556			break;
557
558		default:
559			panic("%s_process_worklist: Unknown type %s",
560			    "softdep", TYPENAME(wk->wk_type));
561			/* NOTREACHED */
562		}
563		if (softdep_worklist_busy && matchmnt == NULL)
564			return (-1);
565		ACQUIRE_LOCK(&lk);
566	}
567	FREE_LOCK(&lk);
568	return (matchcnt);
569}
570
571/*
572 * Purge the work list of all items associated with a particular mount point.
573 */
574int
575softdep_flushfiles(oldmnt, flags, p)
576	struct mount *oldmnt;
577	int flags;
578	struct proc *p;
579{
580	struct vnode *devvp;
581	int error, loopcnt;
582
583	/*
584	 * Await our turn to clear out the queue.
585	 */
586	while (softdep_worklist_busy)
587		tsleep(&lbolt, PRIBIO, "softflush", 0);
588	softdep_worklist_busy = 1;
589	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
590		softdep_worklist_busy = 0;
591		return (error);
592	}
593	/*
594	 * Alternately flush the block device associated with the mount
595	 * point and process any dependencies that the flushing
596	 * creates. In theory, this loop should need at most two iterations,
597	 * but we allow a few extra just to be sure.
598	 */
599	devvp = VFSTOUFS(oldmnt)->um_devvp;
600	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
601		if (softdep_process_worklist(oldmnt) == 0) {
602			/*
603			 * Do another flush in case any vnodes were brought in
604			 * as part of the cleanup operations.
605			 */
606			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
607				break;
608			/*
609			 * If we still found nothing to do, we are really done.
610			 */
611			if (softdep_process_worklist(oldmnt) == 0)
612				break;
613		}
614		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
615		error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
616		VOP_UNLOCK(devvp, 0, p);
617		if (error)
618			break;
619	}
620	softdep_worklist_busy = 0;
621	/*
622	 * If we are unmounting then it is an error to fail. If we
623	 * are simply trying to downgrade to read-only, then filesystem
624	 * activity can keep us busy forever, so we just fail with EBUSY.
625	 */
626	if (loopcnt == 0) {
627		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
628			panic("softdep_flushfiles: looping");
629		error = EBUSY;
630	}
631	return (error);
632}
633
634/*
635 * A large burst of file addition or deletion activity can drive the
636 * memory load excessively high. Therefore we deliberately slow things
637 * down and speed up the I/O processing if we find ourselves with too
638 * many dependencies in progress.
639 */
640static int
641checklimit(resource, islocked)
642	long *resource;
643	int islocked;
644{
645	struct proc *p = CURPROC;
646
647	/*
648	 * If we are under our limit, just proceed.
649	 */
650	if (*resource < max_softdeps)
651		return (0);
652	/*
653	 * We never hold up the filesystem syncer process.
654	 */
655	if (p->p_pid == filesys_syncer_pid)
656		return (0);
657	/*
658	 * Our first approach is to speed up the syncer process.
659	 * We never push it to speed up more than half of its
660	 * normal turn time, otherwise it could take over the cpu.
661	 */
662	if (rushjob < syncdelay / 2) {
663		rushjob += 1;
664		rush_requests += 1;
665		return (0);
666	}
667	/*
668	 * Every trick has failed, so we pause momentarily to let
669	 * the filesystem syncer process catch up.
670	 */
671	if (islocked == 0)
672		ACQUIRE_LOCK(&lk);
673	if (proc_waiting == 0) {
674		proc_waiting = 1;
675		timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2);
676	}
677	FREE_LOCK_INTERLOCKED(&lk);
678	(void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0);
679	ACQUIRE_LOCK_INTERLOCKED(&lk);
680	if (islocked == 0)
681		FREE_LOCK(&lk);
682	max_limit_hit += 1;
683	return (1);
684}
685
686/*
687 * Awaken processes pausing in checklimit and clear proc_waiting
688 * to indicate that there is no longer a timer running.
689 */
690void
691pause_timer(arg)
692	void *arg;
693{
694
695	proc_waiting = 0;
696	wakeup(&proc_waiting);
697}
698
699/*
700 * Structure hashing.
701 *
702 * There are three types of structures that can be looked up:
703 *	1) pagedep structures identified by mount point, inode number,
704 *	   and logical block.
705 *	2) inodedep structures identified by mount point and inode number.
706 *	3) newblk structures identified by mount point and
707 *	   physical block number.
708 *
709 * The "pagedep" and "inodedep" dependency structures are hashed
710 * separately from the file blocks and inodes to which they correspond.
711 * This separation helps when the in-memory copy of an inode or
712 * file block must be replaced. It also obviates the need to access
713 * an inode or file page when simply updating (or de-allocating)
714 * dependency structures. Lookup of newblk structures is needed to
715 * find newly allocated blocks when trying to associate them with
716 * their allocdirect or allocindir structure.
717 *
718 * The lookup routines optionally create and hash a new instance when
719 * an existing entry is not found.
720 */
721#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
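/*
 * A typical creating lookup, as done in softdep_setup_inomapdep below:
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
 *		... the entry already existed ...
 *	FREE_LOCK(&lk);
 */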
722
723/*
724 * Structures and routines associated with pagedep caching.
725 */
726LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
727u_long	pagedep_hash;		/* size of hash table - 1 */
728#define	PAGEDEP_HASH(mp, inum, lbn) \
729	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
730	    pagedep_hash])
731static struct sema pagedep_in_progress;
732
733/*
734 * Look up a pagedep. Return 1 if found, 0 if not found.
735 * If not found, allocate if DEPALLOC flag is passed.
736 * Found or allocated entry is returned in pagedeppp.
737 * This routine must be called with splbio interrupts blocked.
738 */
739static int
740pagedep_lookup(ip, lbn, flags, pagedeppp)
741	struct inode *ip;
742	ufs_lbn_t lbn;
743	int flags;
744	struct pagedep **pagedeppp;
745{
746	struct pagedep *pagedep;
747	struct pagedep_hashhead *pagedephd;
748	struct mount *mp;
749	int i;
750
751#ifdef DEBUG
752	if (lk.lkt_held == -1)
753		panic("pagedep_lookup: lock not held");
754#endif
755	mp = ITOV(ip)->v_mount;
756	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
757top:
758	for (pagedep = LIST_FIRST(pagedephd); pagedep;
759	     pagedep = LIST_NEXT(pagedep, pd_hash))
760		if (ip->i_number == pagedep->pd_ino &&
761		    lbn == pagedep->pd_lbn &&
762		    mp == pagedep->pd_mnt)
763			break;
764	if (pagedep) {
765		*pagedeppp = pagedep;
766		return (1);
767	}
768	if ((flags & DEPALLOC) == 0) {
769		*pagedeppp = NULL;
770		return (0);
771	}
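	/*
	 * Another process may already be allocating a pagedep; if so, wait
	 * for it to finish and rescan the hash chain.  Otherwise the
	 * semaphore is now held and the new structure can be allocated
	 * without the softdep lock.
	 */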
772	if (sema_get(&pagedep_in_progress, &lk) == 0) {
773		ACQUIRE_LOCK(&lk);
774		goto top;
775	}
776	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
777		M_WAITOK);
778	bzero(pagedep, sizeof(struct pagedep));
779	pagedep->pd_list.wk_type = D_PAGEDEP;
780	pagedep->pd_mnt = mp;
781	pagedep->pd_ino = ip->i_number;
782	pagedep->pd_lbn = lbn;
783	LIST_INIT(&pagedep->pd_dirremhd);
784	LIST_INIT(&pagedep->pd_pendinghd);
785	for (i = 0; i < DAHASHSZ; i++)
786		LIST_INIT(&pagedep->pd_diraddhd[i]);
787	ACQUIRE_LOCK(&lk);
788	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
789	sema_release(&pagedep_in_progress);
790	*pagedeppp = pagedep;
791	return (0);
792}
793
794/*
795 * Structures and routines associated with inodedep caching.
796 */
797LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
798static u_long	inodedep_hash;	/* size of hash table - 1 */
799static long	num_inodedep;	/* number of inodedep allocated */
800#define	INODEDEP_HASH(fs, inum) \
801      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
802static struct sema inodedep_in_progress;
803
804/*
805 * Look up an inodedep. Return 1 if found, 0 if not found.
806 * If not found, allocate if DEPALLOC flag is passed.
807 * Found or allocated entry is returned in inodedeppp.
808 * This routine must be called with splbio interrupts blocked.
809 */
810static int
811inodedep_lookup(fs, inum, flags, inodedeppp)
812	struct fs *fs;
813	ino_t inum;
814	int flags;
815	struct inodedep **inodedeppp;
816{
817	struct inodedep *inodedep;
818	struct inodedep_hashhead *inodedephd;
819	int firsttry;
820
821#ifdef DEBUG
822	if (lk.lkt_held == -1)
823		panic("inodedep_lookup: lock not held");
824#endif
825	firsttry = 1;
826	inodedephd = INODEDEP_HASH(fs, inum);
827top:
828	for (inodedep = LIST_FIRST(inodedephd); inodedep;
829	     inodedep = LIST_NEXT(inodedep, id_hash))
830		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
831			break;
832	if (inodedep) {
833		*inodedeppp = inodedep;
834		return (1);
835	}
836	if ((flags & DEPALLOC) == 0) {
837		*inodedeppp = NULL;
838		return (0);
839	}
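	/*
	 * If the number of inodedep structures has hit its limit, pause
	 * (at most once) to let the filesystem syncer catch up, then
	 * rescan the hash chain since the entry may have been created
	 * while we slept.
	 */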
840	if (firsttry && checklimit(&num_inodedep, 1) == 1) {
841		firsttry = 0;
842		goto top;
843	}
844	if (sema_get(&inodedep_in_progress, &lk) == 0) {
845		ACQUIRE_LOCK(&lk);
846		goto top;
847	}
848	num_inodedep += 1;
849	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
850		M_INODEDEP, M_WAITOK);
851	inodedep->id_list.wk_type = D_INODEDEP;
852	inodedep->id_fs = fs;
853	inodedep->id_ino = inum;
854	inodedep->id_state = ALLCOMPLETE;
855	inodedep->id_nlinkdelta = 0;
856	inodedep->id_savedino = NULL;
857	inodedep->id_savedsize = -1;
858	inodedep->id_buf = NULL;
859	LIST_INIT(&inodedep->id_pendinghd);
860	LIST_INIT(&inodedep->id_inowait);
861	LIST_INIT(&inodedep->id_bufwait);
862	TAILQ_INIT(&inodedep->id_inoupdt);
863	TAILQ_INIT(&inodedep->id_newinoupdt);
864	ACQUIRE_LOCK(&lk);
865	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
866	sema_release(&inodedep_in_progress);
867	*inodedeppp = inodedep;
868	return (0);
869}
870
871/*
872 * Structures and routines associated with newblk caching.
873 */
874LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
875u_long	newblk_hash;		/* size of hash table - 1 */
876#define	NEWBLK_HASH(fs, inum) \
877	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
878static struct sema newblk_in_progress;
879
880/*
881 * Look up a newblk. Return 1 if found, 0 if not found.
882 * If not found, allocate if DEPALLOC flag is passed.
883 * Found or allocated entry is returned in newblkpp.
884 */
885static int
886newblk_lookup(fs, newblkno, flags, newblkpp)
887	struct fs *fs;
888	ufs_daddr_t newblkno;
889	int flags;
890	struct newblk **newblkpp;
891{
892	struct newblk *newblk;
893	struct newblk_hashhead *newblkhd;
894
895	newblkhd = NEWBLK_HASH(fs, newblkno);
896top:
897	for (newblk = LIST_FIRST(newblkhd); newblk;
898	     newblk = LIST_NEXT(newblk, nb_hash))
899		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
900			break;
901	if (newblk) {
902		*newblkpp = newblk;
903		return (1);
904	}
905	if ((flags & DEPALLOC) == 0) {
906		*newblkpp = NULL;
907		return (0);
908	}
909	if (sema_get(&newblk_in_progress, 0) == 0)
910		goto top;
911	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
912		M_NEWBLK, M_WAITOK);
913	newblk->nb_state = 0;
914	newblk->nb_fs = fs;
915	newblk->nb_newblkno = newblkno;
916	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
917	sema_release(&newblk_in_progress);
918	*newblkpp = newblk;
919	return (0);
920}
921
922/*
923 * Executed during filesystem initialization before
924 * mounting any file systems.
925 */
926void
927softdep_initialize()
928{
929
930	LIST_INIT(&mkdirlisthd);
931	LIST_INIT(&softdep_workitem_pending);
932	max_softdeps = desiredvnodes * 8;
933	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
934	    &pagedep_hash);
935	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
936	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
937	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
938	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
939	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
940}
941
942/*
943 * Called at mount time to notify the dependency code that a
944 * filesystem wishes to use it.
945 */
946int
947softdep_mount(devvp, mp, fs, cred)
948	struct vnode *devvp;
949	struct mount *mp;
950	struct fs *fs;
951	struct ucred *cred;
952{
953	struct csum cstotal;
954	struct cg *cgp;
955	struct buf *bp;
956	int error, cyl;
957
958	mp->mnt_flag &= ~MNT_ASYNC;
959	mp->mnt_flag |= MNT_SOFTDEP;
960	/*
961	 * When doing soft updates, the counters in the
962	 * superblock may have gotten out of sync, so we have
963	 * to scan the cylinder groups and recalculate them.
964	 */
965	if (fs->fs_clean != 0)
966		return (0);
967	bzero(&cstotal, sizeof cstotal);
968	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
969		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
970		    fs->fs_cgsize, cred, &bp)) != 0) {
971			brelse(bp);
972			return (error);
973		}
974		cgp = (struct cg *)bp->b_data;
975		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
976		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
977		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
978		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
979		fs->fs_cs(fs, cyl) = cgp->cg_cs;
980		brelse(bp);
981	}
982#ifdef DEBUG
983	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
984		printf("ffs_mountfs: superblock updated for soft updates\n");
985#endif
986	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
987	return (0);
988}
989
990/*
991 * Protecting the freemaps (or bitmaps).
992 *
993 * To eliminate the need to execute fsck before mounting a file system
994 * after a power failure, one must (conservatively) guarantee that the
995 * on-disk copy of the bitmaps never indicate that a live inode or block is
996 * free.  So, when a block or inode is allocated, the bitmap should be
997 * updated (on disk) before any new pointers.  When a block or inode is
998 * freed, the bitmap should not be updated until all pointers have been
999 * reset.  The latter dependency is handled by the delayed de-allocation
1000 * approach described below for block and inode de-allocation.  The former
1001 * dependency is handled by calling the following procedure when a block or
1002 * inode is allocated. When an inode is allocated an "inodedep" is created
1003 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1004 * Each "inodedep" is also inserted into the hash indexing structure so
1005 * that any additional link additions can be made dependent on the inode
1006 * allocation.
1007 *
1008 * The ufs file system maintains a number of free block counts (e.g., per
1009 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1010 * in addition to the bitmaps.  These counts are used to improve efficiency
1011 * during allocation and therefore must be consistent with the bitmaps.
1012 * There is no convenient way to guarantee post-crash consistency of these
1013 * counts with simple update ordering, for two main reasons: (1) The counts
1014 * and bitmaps for a single cylinder group block are not in the same disk
1015 * sector.  If a disk write is interrupted (e.g., by power failure), one may
1016 * be written and the other not.  (2) Some of the counts are located in the
1017 * superblock rather than the cylinder group block. So, we focus our soft
1018 * updates implementation on protecting the bitmaps. When mounting a
1019 * filesystem, we recompute the auxiliary counts from the bitmaps.
1020 */
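/*
 * Concretely, when a new block is allocated, softdep_setup_blkmapdep below
 * records a dependency on the buffer holding the cylinder group map so that
 * the pointer to the new block is not committed to disk before the updated
 * bitmap.
 */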
1021
1022/*
1023 * Called just after updating the cylinder group block to allocate an inode.
1024 */
1025void
1026softdep_setup_inomapdep(bp, ip, newinum)
1027	struct buf *bp;		/* buffer for cylgroup block with inode map */
1028	struct inode *ip;	/* inode related to allocation */
1029	ino_t newinum;		/* new inode number being allocated */
1030{
1031	struct inodedep *inodedep;
1032	struct bmsafemap *bmsafemap;
1033
1034	/*
1035	 * Create a dependency for the newly allocated inode.
1036	 * Panic if it already exists as something is seriously wrong.
1037	 * Otherwise add it to the dependency list for the buffer holding
1038	 * the cylinder group map from which it was allocated.
1039	 */
1040	ACQUIRE_LOCK(&lk);
1041	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
1042		panic("softdep_setup_inomapdep: found inode");
1043	inodedep->id_buf = bp;
1044	inodedep->id_state &= ~DEPCOMPLETE;
1045	bmsafemap = bmsafemap_lookup(bp);
1046	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1047	FREE_LOCK(&lk);
1048}
1049
1050/*
1051 * Called just after updating the cylinder group block to
1052 * allocate block or fragment.
1053 */
1054void
1055softdep_setup_blkmapdep(bp, fs, newblkno)
1056	struct buf *bp;		/* buffer for cylgroup block with block map */
1057	struct fs *fs;		/* filesystem doing allocation */
1058	ufs_daddr_t newblkno;	/* number of newly allocated block */
1059{
1060	struct newblk *newblk;
1061	struct bmsafemap *bmsafemap;
1062
1063	/*
1064	 * Create a dependency for the newly allocated block.
1065	 * Add it to the dependency list for the buffer holding
1066	 * the cylinder group map from which it was allocated.
1067	 */
1068	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1069		panic("softdep_setup_blkmapdep: found block");
1070	ACQUIRE_LOCK(&lk);
1071	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1072	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1073	FREE_LOCK(&lk);
1074}
1075
1076/*
1077 * Find the bmsafemap associated with a cylinder group buffer.
1078 * If none exists, create one. The buffer must be locked when
1079 * this routine is called and this routine must be called with
1080 * splbio interrupts blocked.
1081 */
1082static struct bmsafemap *
1083bmsafemap_lookup(bp)
1084	struct buf *bp;
1085{
1086	struct bmsafemap *bmsafemap;
1087	struct worklist *wk;
1088
1089#ifdef DEBUG
1090	if (lk.lkt_held == -1)
1091		panic("bmsafemap_lookup: lock not held");
1092#endif
1093	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list))
1094		if (wk->wk_type == D_BMSAFEMAP)
1095			return (WK_BMSAFEMAP(wk));
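	/*
	 * None found, so create one.  The lock is dropped across the
	 * allocation (M_WAITOK may sleep) and reacquired before the new
	 * bmsafemap is linked onto the buffer's dependency list.
	 */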
1096	FREE_LOCK(&lk);
1097	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1098		M_BMSAFEMAP, M_WAITOK);
1099	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1100	bmsafemap->sm_list.wk_state = 0;
1101	bmsafemap->sm_buf = bp;
1102	LIST_INIT(&bmsafemap->sm_allocdirecthd);
1103	LIST_INIT(&bmsafemap->sm_allocindirhd);
1104	LIST_INIT(&bmsafemap->sm_inodedephd);
1105	LIST_INIT(&bmsafemap->sm_newblkhd);
1106	ACQUIRE_LOCK(&lk);
1107	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1108	return (bmsafemap);
1109}
1110
1111/*
1112 * Direct block allocation dependencies.
1113 *
1114 * When a new block is allocated, the corresponding disk locations must be
1115 * initialized (with zeros or new data) before the on-disk inode points to
1116 * them.  Also, the freemap from which the block was allocated must be
1117 * updated (on disk) before the inode's pointer. These two dependencies are
1118 * independent of each other and are needed for all file blocks and indirect
1119 * blocks that are pointed to directly by the inode.  Just before the
1120 * "in-core" version of the inode is updated with a newly allocated block
1121 * number, a procedure (below) is called to setup allocation dependency
1122 * structures.  These structures are removed when the corresponding
1123 * dependencies are satisfied or when the block allocation becomes obsolete
1124 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1125 * fragment that gets upgraded).  All of these cases are handled in
1126 * procedures described later.
1127 *
1128 * When a file extension causes a fragment to be upgraded, either to a larger
1129 * fragment or to a full block, the on-disk location may change (if the
1130 * previous fragment could not simply be extended). In this case, the old
1131 * fragment must be de-allocated, but not until after the inode's pointer has
1132 * been updated. In most cases, this is handled by later procedures, which
1133 * will construct a "freefrag" structure to be added to the workitem queue
1134 * when the inode update is complete (or obsolete).  The main exception to
1135 * this is when an allocation occurs while a pending allocation dependency
1136 * (for the same block pointer) remains.  This case is handled in the main
1137 * allocation dependency setup procedure by immediately freeing the
1138 * unreferenced fragments.
1139 */
1140void
1141softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1142	struct inode *ip;	/* inode to which block is being added */
1143	ufs_lbn_t lbn;		/* block pointer within inode */
1144	ufs_daddr_t newblkno;	/* disk block number being added */
1145	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
1146	long newsize;		/* size of new block */
1147	long oldsize;		/* size of old block */
1148	struct buf *bp;		/* bp for allocated block */
1149{
1150	struct allocdirect *adp, *oldadp;
1151	struct allocdirectlst *adphead;
1152	struct bmsafemap *bmsafemap;
1153	struct inodedep *inodedep;
1154	struct pagedep *pagedep;
1155	struct newblk *newblk;
1156
1157	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1158		M_ALLOCDIRECT, M_WAITOK);
1159	bzero(adp, sizeof(struct allocdirect));
1160	adp->ad_list.wk_type = D_ALLOCDIRECT;
1161	adp->ad_lbn = lbn;
1162	adp->ad_newblkno = newblkno;
1163	adp->ad_oldblkno = oldblkno;
1164	adp->ad_newsize = newsize;
1165	adp->ad_oldsize = oldsize;
1166	adp->ad_state = ATTACHED;
1167	if (newblkno == oldblkno)
1168		adp->ad_freefrag = NULL;
1169	else
1170		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1171
1172	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1173		panic("softdep_setup_allocdirect: lost block");
1174
1175	ACQUIRE_LOCK(&lk);
1176	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
1177	adp->ad_inodedep = inodedep;
1178
1179	if (newblk->nb_state == DEPCOMPLETE) {
1180		adp->ad_state |= DEPCOMPLETE;
1181		adp->ad_buf = NULL;
1182	} else {
1183		bmsafemap = newblk->nb_bmsafemap;
1184		adp->ad_buf = bmsafemap->sm_buf;
1185		LIST_REMOVE(newblk, nb_deps);
1186		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1187	}
1188	LIST_REMOVE(newblk, nb_hash);
1189	FREE(newblk, M_NEWBLK);
1190
1191	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1192	if (lbn >= NDADDR) {
1193		/* allocating an indirect block */
1194		if (oldblkno != 0)
1195			panic("softdep_setup_allocdirect: non-zero indir");
1196	} else {
1197		/*
1198		 * Allocating a direct block.
1199		 *
1200		 * If we are allocating a directory block, then we must
1201		 * allocate an associated pagedep to track additions and
1202		 * deletions.
1203		 */
1204		if ((ip->i_mode & IFMT) == IFDIR &&
1205		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1206			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1207	}
1208	/*
1209	 * The list of allocdirects must be kept in sorted and ascending
1210	 * order so that the rollback routines can quickly determine the
1211	 * first uncommitted block (the size of the file stored on disk
1212	 * ends at the end of the lowest committed fragment, or if there
1213	 * are no fragments, at the end of the highest committed block).
1214	 * Since files generally grow, the typical case is that the new
1215	 * block is to be added at the end of the list. We speed this
1216	 * special case by checking against the last allocdirect in the
1217	 * list before laboriously traversing the list looking for the
1218	 * insertion point.
1219	 */
1220	adphead = &inodedep->id_newinoupdt;
1221	oldadp = TAILQ_LAST(adphead, allocdirectlst);
1222	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1223		/* insert at end of list */
1224		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1225		if (oldadp != NULL && oldadp->ad_lbn == lbn)
1226			allocdirect_merge(adphead, adp, oldadp);
1227		FREE_LOCK(&lk);
1228		return;
1229	}
1230	for (oldadp = TAILQ_FIRST(adphead); oldadp;
1231	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
1232		if (oldadp->ad_lbn >= lbn)
1233			break;
1234	}
1235	if (oldadp == NULL)
1236		panic("softdep_setup_allocdirect: lost entry");
1237	/* insert in middle of list */
1238	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1239	if (oldadp->ad_lbn == lbn)
1240		allocdirect_merge(adphead, adp, oldadp);
1241	FREE_LOCK(&lk);
1242}
1243
1244/*
1245 * Replace an old allocdirect dependency with a newer one.
1246 * This routine must be called with splbio interrupts blocked.
1247 */
1248static void
1249allocdirect_merge(adphead, newadp, oldadp)
1250	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
1251	struct allocdirect *newadp;	/* allocdirect being added */
1252	struct allocdirect *oldadp;	/* existing allocdirect being checked */
1253{
1254	struct freefrag *freefrag;
1255
1256#ifdef DEBUG
1257	if (lk.lkt_held == -1)
1258		panic("allocdirect_merge: lock not held");
1259#endif
1260	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1261	    newadp->ad_oldsize != oldadp->ad_newsize ||
1262	    newadp->ad_lbn >= NDADDR)
1263		panic("allocdirect_check: old %d != new %d || lbn %ld >= %d",
1264		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1265		    NDADDR);
1266	newadp->ad_oldblkno = oldadp->ad_oldblkno;
1267	newadp->ad_oldsize = oldadp->ad_oldsize;
1268	/*
1269	 * If the old dependency had a fragment to free or had never
1270	 * previously had a block allocated, then the new dependency
1271	 * can immediately post its freefrag and adopt the old freefrag.
1272	 * This action is done by swapping the freefrag dependencies.
1273	 * The new dependency gains the old one's freefrag, and the
1274	 * old one gets the new one and then immediately puts it on
1275	 * the worklist when it is freed by free_allocdirect. It is
1276	 * not possible to do this swap when the old dependency had a
1277	 * non-zero size but no previous fragment to free. This condition
1278	 * arises when the new block is an extension of the old block.
1279	 * Here, the first part of the fragment allocated to the new
1280	 * dependency is part of the block currently claimed on disk by
1281	 * the old dependency, so cannot legitimately be freed until the
1282	 * conditions for the new dependency are fulfilled.
1283	 */
1284	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1285		freefrag = newadp->ad_freefrag;
1286		newadp->ad_freefrag = oldadp->ad_freefrag;
1287		oldadp->ad_freefrag = freefrag;
1288	}
1289	free_allocdirect(adphead, oldadp, 0);
1290}
1291
1292/*
1293 * Allocate a new freefrag structure if needed.
1294 */
1295static struct freefrag *
1296newfreefrag(ip, blkno, size)
1297	struct inode *ip;
1298	ufs_daddr_t blkno;
1299	long size;
1300{
1301	struct freefrag *freefrag;
1302	struct fs *fs;
1303
1304	if (blkno == 0)
1305		return (NULL);
1306	fs = ip->i_fs;
1307	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1308		panic("newfreefrag: frag size");
1309	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1310		M_FREEFRAG, M_WAITOK);
1311	freefrag->ff_list.wk_type = D_FREEFRAG;
1312	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
1313	freefrag->ff_inum = ip->i_number;
1314	freefrag->ff_fs = fs;
1315	freefrag->ff_devvp = ip->i_devvp;
1316	freefrag->ff_blkno = blkno;
1317	freefrag->ff_fragsize = size;
1318	return (freefrag);
1319}
1320
1321/*
1322 * This workitem de-allocates fragments that were replaced during
1323 * file block allocation.
1324 */
1325static void
1326handle_workitem_freefrag(freefrag)
1327	struct freefrag *freefrag;
1328{
1329	struct inode tip;
1330
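	/*
	 * Build a throw-away inode holding just the fields that
	 * ffs_blkfree needs, then release the replaced fragment.
	 */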
1331	tip.i_fs = freefrag->ff_fs;
1332	tip.i_devvp = freefrag->ff_devvp;
1333	tip.i_dev = freefrag->ff_devvp->v_rdev;
1334	tip.i_number = freefrag->ff_inum;
1335	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
1336	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1337	FREE(freefrag, M_FREEFRAG);
1338}
1339
1340/*
1341 * Indirect block allocation dependencies.
1342 *
1343 * The same dependencies that exist for a direct block also exist when
1344 * a new block is allocated and pointed to by an entry in a block of
1345 * indirect pointers. The undo/redo states described above are also
1346 * used here. Because an indirect block contains many pointers that
1347 * may have dependencies, a second copy of the entire in-memory indirect
1348 * block is kept. The buffer cache copy is always completely up-to-date.
1349 * The second copy, which is used only as a source for disk writes,
1350 * contains only the safe pointers (i.e., those that have no remaining
1351 * update dependencies). The second copy is freed when all pointers
1352 * are safe. The cache is not allowed to replace indirect blocks with
1353 * pending update dependencies. If a buffer containing an indirect
1354 * block with dependencies is written, these routines will mark it
1355 * dirty again. It can only be successfully written once all the
1356 * dependencies are removed. The ffs_fsync routine in conjunction with
1357 * softdep_sync_metadata work together to get all the dependencies
1358 * removed so that a file can be successfully written to disk. Three
1359 * procedures are used when setting up indirect block pointer
1360 * dependencies. The division is necessary because of the organization
1361 * of the "balloc" routine and because of the distinction between file
1362 * pages and file metadata blocks.
1363 */
1364
1365/*
1366 * Allocate a new allocindir structure.
1367 */
1368static struct allocindir *
1369newallocindir(ip, ptrno, newblkno, oldblkno)
1370	struct inode *ip;	/* inode for file being extended */
1371	int ptrno;		/* offset of pointer in indirect block */
1372	ufs_daddr_t newblkno;	/* disk block number being added */
1373	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
1374{
1375	struct allocindir *aip;
1376
1377	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1378		M_ALLOCINDIR, M_WAITOK);
1379	bzero(aip, sizeof(struct allocindir));
1380	aip->ai_list.wk_type = D_ALLOCINDIR;
1381	aip->ai_state = ATTACHED;
1382	aip->ai_offset = ptrno;
1383	aip->ai_newblkno = newblkno;
1384	aip->ai_oldblkno = oldblkno;
1385	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1386	return (aip);
1387}
1388
1389/*
1390 * Called just before setting an indirect block pointer
1391 * to a newly allocated file page.
1392 */
1393void
1394softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1395	struct inode *ip;	/* inode for file being extended */
1396	ufs_lbn_t lbn;		/* allocated block number within file */
1397	struct buf *bp;		/* buffer with indirect blk referencing page */
1398	int ptrno;		/* offset of pointer in indirect block */
1399	ufs_daddr_t newblkno;	/* disk block number being added */
1400	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
1401	struct buf *nbp;	/* buffer holding allocated page */
1402{
1403	struct allocindir *aip;
1404	struct pagedep *pagedep;
1405
1406	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1407	ACQUIRE_LOCK(&lk);
1408	/*
1409	 * If we are allocating a directory page, then we must
1410	 * allocate an associated pagedep to track additions and
1411	 * deletions.
1412	 */
1413	if ((ip->i_mode & IFMT) == IFDIR &&
1414	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1415		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1416	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1417	FREE_LOCK(&lk);
1418	setup_allocindir_phase2(bp, ip, aip);
1419}
1420
1421/*
1422 * Called just before setting an indirect block pointer to a
1423 * newly allocated indirect block.
1424 */
1425void
1426softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1427	struct buf *nbp;	/* newly allocated indirect block */
1428	struct inode *ip;	/* inode for file being extended */
1429	struct buf *bp;		/* indirect block referencing allocated block */
1430	int ptrno;		/* offset of pointer in indirect block */
1431	ufs_daddr_t newblkno;	/* disk block number being added */
1432{
1433	struct allocindir *aip;
1434
1435	aip = newallocindir(ip, ptrno, newblkno, 0);
1436	ACQUIRE_LOCK(&lk);
1437	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1438	FREE_LOCK(&lk);
1439	setup_allocindir_phase2(bp, ip, aip);
1440}
1441
1442/*
1443 * Called to finish the setup of the "aip" allocated
1444 * by one of the two routines above.
1445 */
1446static void
1447setup_allocindir_phase2(bp, ip, aip)
1448	struct buf *bp;		/* in-memory copy of the indirect block */
1449	struct inode *ip;	/* inode for file being extended */
1450	struct allocindir *aip;	/* allocindir allocated by the above routines */
1451{
1452	struct worklist *wk;
1453	struct indirdep *indirdep, *newindirdep;
1454	struct bmsafemap *bmsafemap;
1455	struct allocindir *oldaip;
1456	struct freefrag *freefrag;
1457	struct newblk *newblk;
1458
1459	if (bp->b_lblkno >= 0)
1460		panic("setup_allocindir_phase2: not indir blk");
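	/*
	 * Find or create the indirdep for this indirect block.  The loop
	 * below makes at most two passes: the first searches the buffer's
	 * dependency list; if no indirdep is found, one is allocated
	 * (with the lock dropped) and the search is repeated, adopting
	 * the new structure only if no other indirdep appeared in the
	 * meantime.
	 */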
1461	for (indirdep = NULL, newindirdep = NULL; ; ) {
1462		ACQUIRE_LOCK(&lk);
1463		for (wk = LIST_FIRST(&bp->b_dep); wk;
1464		     wk = LIST_NEXT(wk, wk_list)) {
1465			if (wk->wk_type != D_INDIRDEP)
1466				continue;
1467			indirdep = WK_INDIRDEP(wk);
1468			break;
1469		}
1470		if (indirdep == NULL && newindirdep) {
1471			indirdep = newindirdep;
1472			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1473			newindirdep = NULL;
1474		}
1475		FREE_LOCK(&lk);
1476		if (indirdep) {
1477			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1478			    &newblk) == 0)
1479				panic("setup_allocindir: lost block");
1480			ACQUIRE_LOCK(&lk);
1481			if (newblk->nb_state == DEPCOMPLETE) {
1482				aip->ai_state |= DEPCOMPLETE;
1483				aip->ai_buf = NULL;
1484			} else {
1485				bmsafemap = newblk->nb_bmsafemap;
1486				aip->ai_buf = bmsafemap->sm_buf;
1487				LIST_REMOVE(newblk, nb_deps);
1488				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1489				    aip, ai_deps);
1490			}
1491			LIST_REMOVE(newblk, nb_hash);
1492			FREE(newblk, M_NEWBLK);
1493			aip->ai_indirdep = indirdep;
1494			/*
1495			 * Check to see if there is an existing dependency
1496			 * for this block. If there is, merge the old
1497			 * dependency into the new one.
1498			 */
1499			if (aip->ai_oldblkno == 0)
1500				oldaip = NULL;
1501			else
1502				for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd);
1503				    oldaip; oldaip = LIST_NEXT(oldaip, ai_next))
1504					if (oldaip->ai_offset == aip->ai_offset)
1505						break;
1506			if (oldaip != NULL) {
1507				if (oldaip->ai_newblkno != aip->ai_oldblkno)
1508					panic("setup_allocindir_phase2: blkno");
1509				aip->ai_oldblkno = oldaip->ai_oldblkno;
1510				freefrag = oldaip->ai_freefrag;
1511				oldaip->ai_freefrag = aip->ai_freefrag;
1512				aip->ai_freefrag = freefrag;
1513				free_allocindir(oldaip, NULL);
1514			}
1515			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1516			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1517			    [aip->ai_offset] = aip->ai_oldblkno;
1518			FREE_LOCK(&lk);
1519		}
1520		if (newindirdep) {
1521			if (newindirdep->ir_savebp != NULL)
1522				brelse(newindirdep->ir_savebp);
1523			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1524		}
1525		if (indirdep)
1526			break;
1527		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1528			M_INDIRDEP, M_WAITOK);
1529		newindirdep->ir_list.wk_type = D_INDIRDEP;
1530		newindirdep->ir_state = ATTACHED;
1531		LIST_INIT(&newindirdep->ir_deplisthd);
1532		LIST_INIT(&newindirdep->ir_donehd);
1533#ifdef __FreeBSD__
1534		if (bp->b_blkno == bp->b_lblkno) {
1535#if 0 /* we know this happens.. research suggested.. */
1536			printf("setup_allocindir_phase2: need bmap, blk %d\n",
1537				bp->b_lblkno);
1538#endif
1539			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1540				NULL, NULL);
1541		}
1542#endif /* __FreeBSD__ */
1543		newindirdep->ir_savebp =
1544		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1545		bp->b_flags |= B_XXX;
1546		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1547	}
1548}
1549
1550/*
1551 * Block de-allocation dependencies.
1552 *
1553 * When blocks are de-allocated, the on-disk pointers must be nullified before
1554 * the blocks are made available for use by other files.  (The true
1555 * requirement is that old pointers must be nullified before new on-disk
1556 * pointers are set.  We chose this slightly more stringent requirement to
1557 * reduce complexity.) Our implementation handles this dependency by updating
1558 * the inode (or indirect block) appropriately but delaying the actual block
1559 * de-allocation (i.e., freemap and free space count manipulation) until
1560 * after the updated versions reach stable storage.  After the disk is
1561 * updated, the blocks can be safely de-allocated whenever it is convenient.
1562 * This implementation handles only the common case of reducing a file's
1563 * length to zero. Other cases are handled by the conventional synchronous
1564 * write approach.
1565 *
1566 * The ffs implementation with which we worked double-checks
1567 * the state of the block pointers and file size as it reduces
1568 * a file's length.  Some of this code is replicated here in our
1569 * soft updates implementation.  The freeblks->fb_chkcnt field is
1570 * used to transfer a part of this information to the procedure
1571 * that eventually de-allocates the blocks.
1572 *
1573 * This routine should be called from the routine that shortens
1574 * a file's length, before the inode's size or block pointers
1575 * are modified. It will save the block pointer information for
1576 * later release and zero the inode so that the calling routine
1577 * can release it.
1578 */
1579static long num_freeblks;	/* number of freeblks allocated */
1580void
1581softdep_setup_freeblocks(ip, length)
1582	struct inode *ip;	/* The inode whose length is to be reduced */
1583	off_t length;		/* The new length for the file */
1584{
1585	struct freeblks *freeblks;
1586	struct inodedep *inodedep;
1587	struct allocdirect *adp;
1588	struct vnode *vp;
1589	struct buf *bp;
1590	struct fs *fs;
1591	int i, error;
1592
1593	fs = ip->i_fs;
1594	if (length != 0)
1595		panic("softde_setup_freeblocks: non-zero length");
1596	(void) checklimit(&num_freeblks, 0);
1597	num_freeblks += 1;
1598	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1599		M_FREEBLKS, M_WAITOK);
1600	bzero(freeblks, sizeof(struct freeblks));
1601	freeblks->fb_list.wk_type = D_FREEBLKS;
1602	freeblks->fb_uid = ip->i_uid;
1603	freeblks->fb_previousinum = ip->i_number;
1604	freeblks->fb_devvp = ip->i_devvp;
1605	freeblks->fb_fs = fs;
1606	freeblks->fb_oldsize = ip->i_size;
1607	freeblks->fb_newsize = length;
1608	freeblks->fb_chkcnt = ip->i_blocks;
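	/*
	 * Save the block pointers in the freeblks structure and clear
	 * them in the in-core inode so that the caller can release it.
	 */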
1609	for (i = 0; i < NDADDR; i++) {
1610		freeblks->fb_dblks[i] = ip->i_db[i];
1611		ip->i_db[i] = 0;
1612	}
1613	for (i = 0; i < NIADDR; i++) {
1614		freeblks->fb_iblks[i] = ip->i_ib[i];
1615		ip->i_ib[i] = 0;
1616	}
1617	ip->i_blocks = 0;
1618	ip->i_size = 0;
1619	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
1621	 * to delete its dependencies below. Once the dependencies are gone
1622	 * the buffer can be safely released.
1623	 */
1624	if ((error = bread(ip->i_devvp,
1625	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1626	    (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1627		softdep_error("softdep_setup_freeblocks", error);
1628	*((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1629	    ip->i_din;
1630	/*
1631	 * Find and eliminate any inode dependencies.
1632	 */
1633	ACQUIRE_LOCK(&lk);
1634	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1635	if ((inodedep->id_state & IOSTARTED) != 0)
1636		panic("softdep_setup_freeblocks: inode busy");
1637	/*
1638	 * Add the freeblks structure to the list of operations that
1639	 * must await the zero'ed inode being written to disk.
1640	 */
1641	WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1642	/*
1643	 * Because the file length has been truncated to zero, any
1644	 * pending block allocation dependency structures associated
1645	 * with this inode are obsolete and can simply be de-allocated.
1646	 * We must first merge the two dependency lists to get rid of
1647	 * any duplicate freefrag structures, then purge the merged list.
1648	 */
1649	merge_inode_lists(inodedep);
1650	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1651		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
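	/*
	 * Release the inode buffer as a delayed write; the freeblks
	 * item queued above cannot run until the zero'ed inode has
	 * reached the disk.
	 */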
1652	bdwrite(bp);
1653	/*
1654	 * We must wait for any I/O in progress to finish so that
1655	 * all potential buffers on the dirty list will be visible.
1656	 * Once they are all there, walk the list and get rid of
1657	 * any dependencies.
1658	 */
1659	vp = ITOV(ip);
1660	while (vp->v_numoutput) {
1661		vp->v_flag |= VBWAIT;
1662		FREE_LOCK_INTERLOCKED(&lk);
1663		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "sdsetf", 0);
1664		ACQUIRE_LOCK_INTERLOCKED(&lk);
1665	}
1666	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1667		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1668		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1669		deallocate_dependencies(bp, inodedep);
1670		bp->b_flags |= B_INVAL | B_NOCACHE;
1671		brelse(bp);
1672	}
1673	/*
1674	 * Try freeing the inodedep in case that was the last dependency.
1675	 */
1676	if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0)
1677		(void) free_inodedep(inodedep);
1678	FREE_LOCK(&lk);
1679}
1680
1681/*
1682 * Reclaim any dependency structures from a buffer that is about to
1683 * be reallocated to a new vnode. The buffer must be locked, thus,
1684 * no I/O completion operations can occur while we are manipulating
1685 * its associated dependencies. The mutex is held so that other I/O's
1686 * associated with related dependencies do not occur.
1687 */
1688static void
1689deallocate_dependencies(bp, inodedep)
1690	struct buf *bp;
1691	struct inodedep *inodedep;
1692{
1693	struct worklist *wk;
1694	struct indirdep *indirdep;
1695	struct allocindir *aip;
1696	struct pagedep *pagedep;
1697	struct dirrem *dirrem;
1698	struct diradd *dap;
1699	int i;
1700
1701	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1702		switch (wk->wk_type) {
1703
1704		case D_INDIRDEP:
1705			indirdep = WK_INDIRDEP(wk);
1706			/*
1707			 * None of the indirect pointers will ever be visible,
1708			 * so they can simply be tossed. GOINGAWAY ensures
1709			 * that allocated pointers will be saved in the buffer
1710			 * cache until they are freed. Note that they will
1711			 * only be able to be found by their physical address
1712			 * since the inode mapping the logical address will
1713			 * be gone. The save buffer used for the safe copy
1714			 * was allocated in setup_allocindir_phase2 using
1715			 * the physical address so it could be used for this
1716			 * purpose. Hence we swap the safe copy with the real
1717			 * copy, allowing the safe copy to be freed and holding
1718			 * on to the real copy for later use in indir_trunc.
1719			 */
1720			if (indirdep->ir_state & GOINGAWAY)
1721				panic("deallocate_dependencies: already gone");
1722			indirdep->ir_state |= GOINGAWAY;
1723			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1724				free_allocindir(aip, inodedep);
1725			if (bp->b_lblkno >= 0 ||
1726			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
1727				panic("deallocate_dependencies: not indir");
1728			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1729			    bp->b_bcount);
1730			WORKLIST_REMOVE(wk);
1731			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1732			continue;
1733
1734		case D_PAGEDEP:
1735			pagedep = WK_PAGEDEP(wk);
1736			/*
1737			 * None of the directory additions will ever be
1738			 * visible, so they can simply be tossed.
1739			 */
1740			for (i = 0; i < DAHASHSZ; i++)
1741				while (dap=LIST_FIRST(&pagedep->pd_diraddhd[i]))
1742					free_diradd(dap);
1743			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1744				free_diradd(dap);
1745			/*
1746			 * Copy any directory remove dependencies to the list
1747			 * to be processed after the zero'ed inode is written.
1748			 * If the inode has already been written, then they
1749			 * can be dumped directly onto the work list.
1750			 */
			while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd))
			    != NULL) {
1753				LIST_REMOVE(dirrem, dm_next);
1754				dirrem->dm_dirinum = pagedep->pd_ino;
1755				if (inodedep == NULL)
1756					add_to_worklist(&dirrem->dm_list);
1757				else
1758					WORKLIST_INSERT(&inodedep->id_bufwait,
1759					    &dirrem->dm_list);
1760			}
1761			WORKLIST_REMOVE(&pagedep->pd_list);
1762			LIST_REMOVE(pagedep, pd_hash);
1763			WORKITEM_FREE(pagedep, D_PAGEDEP);
1764			continue;
1765
1766		case D_ALLOCINDIR:
1767			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1768			continue;
1769
1770		case D_ALLOCDIRECT:
1771		case D_INODEDEP:
1772			panic("deallocate_dependencies: Unexpected type %s",
1773			    TYPENAME(wk->wk_type));
1774			/* NOTREACHED */
1775
1776		default:
1777			panic("deallocate_dependencies: Unknown type %s",
1778			    TYPENAME(wk->wk_type));
1779			/* NOTREACHED */
1780		}
1781	}
1782}
1783
1784/*
1785 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1786 * This routine must be called with splbio interrupts blocked.
1787 */
1788static void
1789free_allocdirect(adphead, adp, delay)
1790	struct allocdirectlst *adphead;
1791	struct allocdirect *adp;
1792	int delay;
1793{
1794
1795#ifdef DEBUG
1796	if (lk.lkt_held == -1)
1797		panic("free_allocdirect: lock not held");
1798#endif
1799	if ((adp->ad_state & DEPCOMPLETE) == 0)
1800		LIST_REMOVE(adp, ad_deps);
1801	TAILQ_REMOVE(adphead, adp, ad_next);
1802	if ((adp->ad_state & COMPLETE) == 0)
1803		WORKLIST_REMOVE(&adp->ad_list);
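	/*
	 * If a freefrag is attached, either hold it on the inodedep's
	 * id_bufwait list until the zero'ed inode has been written
	 * (delay) or hand it to the work queue immediately.
	 */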
1804	if (adp->ad_freefrag != NULL) {
1805		if (delay)
1806			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1807			    &adp->ad_freefrag->ff_list);
1808		else
1809			add_to_worklist(&adp->ad_freefrag->ff_list);
1810	}
1811	WORKITEM_FREE(adp, D_ALLOCDIRECT);
1812}
1813
1814/*
1815 * Prepare an inode to be freed. The actual free operation is not
1816 * done until the zero'ed inode has been written to disk.
1817 */
1818static long num_freefile;	/* number of freefile allocated */
1819void
1820softdep_freefile(pvp, ino, mode)
1821		struct vnode *pvp;
1822		ino_t ino;
1823		int mode;
1824{
1825	struct inode *ip = VTOI(pvp);
1826	struct inodedep *inodedep;
1827	struct freefile *freefile;
1828
1829	/*
1830	 * This sets up the inode de-allocation dependency.
1831	 */
1832	(void) checklimit(&num_freefile, 0);
1833	num_freefile += 1;
1834	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1835		M_FREEFILE, M_WAITOK);
1836	freefile->fx_list.wk_type = D_FREEFILE;
1837	freefile->fx_list.wk_state = 0;
1838	freefile->fx_mode = mode;
1839	freefile->fx_oldinum = ino;
1840	freefile->fx_devvp = ip->i_devvp;
1841	freefile->fx_fs = ip->i_fs;
1842
1843	/*
1844	 * If the inodedep does not exist, then the zero'ed inode has
1845	 * been written to disk and we can free the file immediately.
1846	 */
1847	ACQUIRE_LOCK(&lk);
1848	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
1849		add_to_worklist(&freefile->fx_list);
1850		FREE_LOCK(&lk);
1851		return;
1852	}
1853
1854	/*
1855	 * If we still have a bitmap dependency, then the inode has never
1856	 * been written to disk. Drop the dependency as it is no longer
1857	 * necessary since the inode is being deallocated. We could process
1858	 * the freefile immediately, but then we would have to clear the
1859	 * id_inowait dependencies here and it is easier just to let the
1860	 * zero'ed inode be written and let them be cleaned up in the
	 * normal followup processing after the inode write.
1862	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
1864		inodedep->id_state |= DEPCOMPLETE;
1865		LIST_REMOVE(inodedep, id_deps);
1866		inodedep->id_buf = NULL;
1867	}
1868	/*
1869	 * If the inodedep has no dependencies associated with it,
1870	 * then we must free it here and free the file immediately.
1871	 * This case arises when an early allocation fails (for
1872	 * example, the user is over their file quota).
1873	 */
1874	if (free_inodedep(inodedep) == 0)
1875		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
1876	else
1877		add_to_worklist(&freefile->fx_list);
1878	FREE_LOCK(&lk);
1879}
1880
1881/*
1882 * Try to free an inodedep structure. Return 1 if it could be freed.
1883 */
1884static int
1885free_inodedep(inodedep)
1886	struct inodedep *inodedep;
1887{
1888
1889	if ((inodedep->id_state & ONWORKLIST) != 0 ||
1890	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
1891	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
1892	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
1893	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
1894	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
1895	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
1896	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
1897		return (0);
1898	LIST_REMOVE(inodedep, id_hash);
1899	WORKITEM_FREE(inodedep, D_INODEDEP);
1900	num_inodedep -= 1;
1901	return (1);
1902}
1903
1904/*
1905 * This workitem routine performs the block de-allocation.
1906 * The workitem is added to the pending list after the updated
1907 * inode block has been written to disk.  As mentioned above,
1908 * checks regarding the number of blocks de-allocated (compared
1909 * to the number of blocks allocated for the file) are also
1910 * performed in this function.
1911 */
1912static void
1913handle_workitem_freeblocks(freeblks)
1914	struct freeblks *freeblks;
1915{
1916	struct inode tip;
1917	ufs_daddr_t bn;
1918	struct fs *fs;
1919	int i, level, bsize;
1920	long nblocks, blocksreleased = 0;
1921	int error, allerror = 0;
1922	ufs_lbn_t baselbns[NIADDR], tmpval;
1923
1924	tip.i_number = freeblks->fb_previousinum;
1925	tip.i_devvp = freeblks->fb_devvp;
1926	tip.i_dev = freeblks->fb_devvp->v_rdev;
1927	tip.i_fs = freeblks->fb_fs;
1928	tip.i_size = freeblks->fb_oldsize;
1929	tip.i_uid = freeblks->fb_uid;
1930	fs = freeblks->fb_fs;
1931	tmpval = 1;
1932	baselbns[0] = NDADDR;
1933	for (i = 1; i < NIADDR; i++) {
1934		tmpval *= NINDIR(fs);
1935		baselbns[i] = baselbns[i - 1] + tmpval;
1936	}
1937	nblocks = btodb(fs->fs_bsize);
1938	blocksreleased = 0;
1939	/*
1940	 * Indirect blocks first.
1941	 */
1942	for (level = (NIADDR - 1); level >= 0; level--) {
1943		if ((bn = freeblks->fb_iblks[level]) == 0)
1944			continue;
1945		if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
		    baselbns[level], &blocksreleased)) != 0)
1947			allerror = error;
1948		ffs_blkfree(&tip, bn, fs->fs_bsize);
1949		blocksreleased += nblocks;
1950	}
1951	/*
1952	 * All direct blocks or frags.
1953	 */
1954	for (i = (NDADDR - 1); i >= 0; i--) {
1955		if ((bn = freeblks->fb_dblks[i]) == 0)
1956			continue;
1957		bsize = blksize(fs, &tip, i);
1958		ffs_blkfree(&tip, bn, bsize);
1959		blocksreleased += btodb(bsize);
1960	}
1961
1962#ifdef DIAGNOSTIC
1963	if (freeblks->fb_chkcnt != blocksreleased)
1964		panic("handle_workitem_freeblocks: block count");
1965	if (allerror)
1966		softdep_error("handle_workitem_freeblks", allerror);
1967#endif /* DIAGNOSTIC */
1968	WORKITEM_FREE(freeblks, D_FREEBLKS);
1969	num_freeblks -= 1;
1970}
1971
1972/*
1973 * Release blocks associated with the inode ip and stored in the indirect
1974 * block dbn. If level is greater than SINGLE, the block is an indirect block
 * and recursive calls to indir_trunc must be used to cleanse other indirect
1976 * blocks.
1977 */
1978static int
1979indir_trunc(ip, dbn, level, lbn, countp)
1980	struct inode *ip;
1981	ufs_daddr_t dbn;
1982	int level;
1983	ufs_lbn_t lbn;
1984	long *countp;
1985{
1986	struct buf *bp;
1987	ufs_daddr_t *bap;
1988	ufs_daddr_t nb;
1989	struct fs *fs;
1990	struct worklist *wk;
1991	struct indirdep *indirdep;
1992	int i, lbnadd, nblocks;
1993	int error, allerror = 0;
1994
1995	fs = ip->i_fs;
1996	lbnadd = 1;
1997	for (i = level; i > 0; i--)
1998		lbnadd *= NINDIR(fs);
1999	/*
2000	 * Get buffer of block pointers to be freed. This routine is not
2001	 * called until the zero'ed inode has been written, so it is safe
2002	 * to free blocks as they are encountered. Because the inode has
2003	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2004	 * have to use the on-disk address and the block device for the
2005	 * filesystem to look them up. If the file was deleted before its
2006	 * indirect blocks were all written to disk, the routine that set
2007	 * us up (deallocate_dependencies) will have arranged to leave
2008	 * a complete copy of the indirect block in memory for our use.
2009	 * Otherwise we have to read the blocks in from the disk.
2010	 */
2011	ACQUIRE_LOCK(&lk);
2012	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2013	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2014		if (wk->wk_type != D_INDIRDEP ||
2015		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2016		    (indirdep->ir_state & GOINGAWAY) == 0)
2017			panic("indir_trunc: lost indirdep");
2018		WORKLIST_REMOVE(wk);
2019		WORKITEM_FREE(indirdep, D_INDIRDEP);
2020		if (LIST_FIRST(&bp->b_dep) != NULL)
2021			panic("indir_trunc: dangling dep");
2022		FREE_LOCK(&lk);
2023	} else {
2024		FREE_LOCK(&lk);
2025		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2026		if (error)
2027			return (error);
2028	}
2029	/*
2030	 * Recursively free indirect blocks.
2031	 */
2032	bap = (ufs_daddr_t *)bp->b_data;
2033	nblocks = btodb(fs->fs_bsize);
2034	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2035		if ((nb = bap[i]) == 0)
2036			continue;
2037		if (level != 0) {
2038			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2039			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2040				allerror = error;
2041		}
2042		ffs_blkfree(ip, nb, fs->fs_bsize);
2043		*countp += nblocks;
2044	}
2045	bp->b_flags |= B_INVAL | B_NOCACHE;
2046	bp->b_flags &= ~B_XXX;
2047	brelse(bp);
2048	return (allerror);
2049}
2050
2051/*
2052 * Free an allocindir.
2053 * This routine must be called with splbio interrupts blocked.
2054 */
2055static void
2056free_allocindir(aip, inodedep)
2057	struct allocindir *aip;
2058	struct inodedep *inodedep;
2059{
2060	struct freefrag *freefrag;
2061
2062#ifdef DEBUG
2063	if (lk.lkt_held == -1)
2064		panic("free_allocindir: lock not held");
2065#endif
2066	if ((aip->ai_state & DEPCOMPLETE) == 0)
2067		LIST_REMOVE(aip, ai_deps);
2068	if (aip->ai_state & ONWORKLIST)
2069		WORKLIST_REMOVE(&aip->ai_list);
2070	LIST_REMOVE(aip, ai_next);
2071	if ((freefrag = aip->ai_freefrag) != NULL) {
2072		if (inodedep == NULL)
2073			add_to_worklist(&freefrag->ff_list);
2074		else
2075			WORKLIST_INSERT(&inodedep->id_bufwait,
2076			    &freefrag->ff_list);
2077	}
2078	WORKITEM_FREE(aip, D_ALLOCINDIR);
2079}
2080
2081/*
2082 * Directory entry addition dependencies.
2083 *
2084 * When adding a new directory entry, the inode (with its incremented link
2085 * count) must be written to disk before the directory entry's pointer to it.
2086 * Also, if the inode is newly allocated, the corresponding freemap must be
2087 * updated (on disk) before the directory entry's pointer. These requirements
2088 * are met via undo/redo on the directory entry's pointer, which consists
2089 * simply of the inode number.
2090 *
2091 * As directory entries are added and deleted, the free space within a
2092 * directory block can become fragmented.  The ufs file system will compact
2093 * a fragmented directory block to make space for a new entry. When this
2094 * occurs, the offsets of previously added entries change. Any "diradd"
2095 * dependency structures corresponding to these entries must be updated with
2096 * the new offsets.
2097 */
2098
2099/*
2100 * This routine is called after the in-memory inode's link
2101 * count has been incremented, but before the directory entry's
2102 * pointer to the inode has been set.
2103 */
2104void
2105softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2106	struct buf *bp;		/* buffer containing directory block */
2107	struct inode *dp;	/* inode for directory */
2108	off_t diroffset;	/* offset of new entry in directory */
2109	long newinum;		/* inode referenced by new directory entry */
2110	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2111{
2112	int offset;		/* offset of new entry within directory block */
2113	ufs_lbn_t lbn;		/* block in directory containing new entry */
2114	struct fs *fs;
2115	struct diradd *dap;
2116	struct pagedep *pagedep;
2117	struct inodedep *inodedep;
2118	struct mkdir *mkdir1, *mkdir2;
2119
2120	/*
2121	 * Whiteouts have no dependencies.
2122	 */
2123	if (newinum == WINO) {
2124		if (newdirbp != NULL)
2125			bdwrite(newdirbp);
2126		return;
2127	}
2128
2129	fs = dp->i_fs;
2130	lbn = lblkno(fs, diroffset);
2131	offset = blkoff(fs, diroffset);
2132	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
2133	bzero(dap, sizeof(struct diradd));
2134	dap->da_list.wk_type = D_DIRADD;
2135	dap->da_offset = offset;
2136	dap->da_newinum = newinum;
2137	dap->da_state = ATTACHED;
2138	if (newdirbp == NULL) {
2139		dap->da_state |= DEPCOMPLETE;
2140		ACQUIRE_LOCK(&lk);
2141	} else {
2142		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
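		/*
		 * The entry for a new directory cannot be committed until
		 * both the new directory's "." and ".." entries have been
		 * written (MKDIR_BODY) and the parent's increased link
		 * count has been written (MKDIR_PARENT), so allocate a
		 * mkdir dependency for each.
		 */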
2143		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2144		    M_WAITOK);
2145		mkdir1->md_list.wk_type = D_MKDIR;
2146		mkdir1->md_state = MKDIR_BODY;
2147		mkdir1->md_diradd = dap;
2148		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2149		    M_WAITOK);
2150		mkdir2->md_list.wk_type = D_MKDIR;
2151		mkdir2->md_state = MKDIR_PARENT;
2152		mkdir2->md_diradd = dap;
2153		ACQUIRE_LOCK(&lk);
2154		/*
2155		 * Dependency on "." and ".." being written to disk.
2156		 */
2157		mkdir1->md_buf = newdirbp;
2158		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2159		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2160		bdwrite(newdirbp);
2161		/*
2162		 * Dependency on link count increase for parent directory
2163		 */
2164		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2165		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2166			dap->da_state &= ~MKDIR_PARENT;
2167			WORKITEM_FREE(mkdir2, D_MKDIR);
2168		} else {
2169			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2170			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2171		}
2172	}
2173	/*
2174	 * Link into parent directory pagedep to await its being written.
2175	 */
2176	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2177		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2178	dap->da_pagedep = pagedep;
2179	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2180	    da_pdlist);
2181	/*
2182	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2183	 * is not yet written. If it is written, do the post-inode write
2184	 * processing to put it on the id_pendinghd list.
2185	 */
2186	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2187	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2188		diradd_inode_written(dap, inodedep);
2189	else
2190		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2191	FREE_LOCK(&lk);
2192}
2193
2194/*
2195 * This procedure is called to change the offset of a directory
 * entry when compacting a directory block, which must be owned
2197 * exclusively by the caller. Note that the actual entry movement
2198 * must be done in this procedure to ensure that no I/O completions
2199 * occur while the move is in progress.
2200 */
2201void
2202softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2203	struct inode *dp;	/* inode for directory */
2204	caddr_t base;		/* address of dp->i_offset */
2205	caddr_t oldloc;		/* address of old directory location */
2206	caddr_t newloc;		/* address of new directory location */
2207	int entrysize;		/* size of directory entry */
2208{
2209	int offset, oldoffset, newoffset;
2210	struct pagedep *pagedep;
2211	struct diradd *dap;
2212	ufs_lbn_t lbn;
2213
2214	ACQUIRE_LOCK(&lk);
2215	lbn = lblkno(dp->i_fs, dp->i_offset);
2216	offset = blkoff(dp->i_fs, dp->i_offset);
2217	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2218		goto done;
2219	oldoffset = offset + (oldloc - base);
2220	newoffset = offset + (newloc - base);
2221	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
2222	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2223		if (dap->da_offset != oldoffset)
2224			continue;
2225		dap->da_offset = newoffset;
2226		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2227			break;
2228		LIST_REMOVE(dap, da_pdlist);
2229		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2230		    dap, da_pdlist);
2231		break;
2232	}
2233	if (dap == NULL) {
2234		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2235		     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2236			if (dap->da_offset == oldoffset) {
2237				dap->da_offset = newoffset;
2238				break;
2239			}
2240		}
2241	}
2242done:
2243	bcopy(oldloc, newloc, entrysize);
2244	FREE_LOCK(&lk);
2245}
2246
2247/*
2248 * Free a diradd dependency structure. This routine must be called
2249 * with splbio interrupts blocked.
2250 */
2251static void
2252free_diradd(dap)
2253	struct diradd *dap;
2254{
2255	struct dirrem *dirrem;
2256	struct pagedep *pagedep;
2257	struct inodedep *inodedep;
2258	struct mkdir *mkdir, *nextmd;
2259
2260#ifdef DEBUG
2261	if (lk.lkt_held == -1)
2262		panic("free_diradd: lock not held");
2263#endif
2264	WORKLIST_REMOVE(&dap->da_list);
2265	LIST_REMOVE(dap, da_pdlist);
2266	if ((dap->da_state & DIRCHG) == 0) {
2267		pagedep = dap->da_pagedep;
2268	} else {
2269		dirrem = dap->da_previous;
2270		pagedep = dirrem->dm_pagedep;
2271		dirrem->dm_dirinum = pagedep->pd_ino;
2272		add_to_worklist(&dirrem->dm_list);
2273	}
2274	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2275	    0, &inodedep) != 0)
2276		(void) free_inodedep(inodedep);
2277	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2278		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2279			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2280			if (mkdir->md_diradd != dap)
2281				continue;
2282			dap->da_state &= ~mkdir->md_state;
2283			WORKLIST_REMOVE(&mkdir->md_list);
2284			LIST_REMOVE(mkdir, md_mkdirs);
2285			WORKITEM_FREE(mkdir, D_MKDIR);
2286		}
2287		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
2288			panic("free_diradd: unfound ref");
2289	}
2290	WORKITEM_FREE(dap, D_DIRADD);
2291}
2292
2293/*
2294 * Directory entry removal dependencies.
2295 *
2296 * When removing a directory entry, the entry's inode pointer must be
2297 * zero'ed on disk before the corresponding inode's link count is decremented
2298 * (possibly freeing the inode for re-use). This dependency is handled by
2299 * updating the directory entry but delaying the inode count reduction until
2300 * after the directory block has been written to disk. After this point, the
2301 * inode count can be decremented whenever it is convenient.
2302 */
2303
2304/*
2305 * This routine should be called immediately after removing
2306 * a directory entry.  The inode's link count should not be
2307 * decremented by the calling procedure -- the soft updates
2308 * code will do this task when it is safe.
2309 */
2310void
2311softdep_setup_remove(bp, dp, ip, isrmdir)
2312	struct buf *bp;		/* buffer containing directory block */
2313	struct inode *dp;	/* inode for the directory being modified */
2314	struct inode *ip;	/* inode for directory entry being removed */
2315	int isrmdir;		/* indicates if doing RMDIR */
2316{
2317	struct dirrem *dirrem;
2318
2319	/*
2320	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2321	 */
2322	dirrem = newdirrem(bp, dp, ip, isrmdir);
2323	if ((dirrem->dm_state & COMPLETE) == 0) {
2324		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2325		    dm_next);
2326	} else {
2327		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2328		add_to_worklist(&dirrem->dm_list);
2329	}
2330	FREE_LOCK(&lk);
2331}
2332
2333/*
2334 * Allocate a new dirrem if appropriate and return it along with
2335 * its associated pagedep. Called without a lock, returns with lock.
2336 */
2337static struct dirrem *
2338newdirrem(bp, dp, ip, isrmdir)
2339	struct buf *bp;		/* buffer containing directory block */
2340	struct inode *dp;	/* inode for the directory being modified */
2341	struct inode *ip;	/* inode for directory entry being removed */
2342	int isrmdir;		/* indicates if doing RMDIR */
2343{
2344	int offset;
2345	ufs_lbn_t lbn;
2346	struct diradd *dap;
2347	struct dirrem *dirrem;
2348	struct pagedep *pagedep;
2349
2350	/*
2351	 * Whiteouts have no deletion dependencies.
2352	 */
2353	if (ip == NULL)
2354		panic("newdirrem: whiteout");
2355	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2356		M_DIRREM, M_WAITOK);
2357	bzero(dirrem, sizeof(struct dirrem));
2358	dirrem->dm_list.wk_type = D_DIRREM;
2359	dirrem->dm_state = isrmdir ? RMDIR : 0;
2360	dirrem->dm_mnt = ITOV(ip)->v_mount;
2361	dirrem->dm_oldinum = ip->i_number;
2362
2363	ACQUIRE_LOCK(&lk);
2364	lbn = lblkno(dp->i_fs, dp->i_offset);
2365	offset = blkoff(dp->i_fs, dp->i_offset);
2366	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2367		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2368	dirrem->dm_pagedep = pagedep;
2369	/*
2370	 * Check for a diradd dependency for the same directory entry.
2371	 * If present, then both dependencies become obsolete and can
	 * be de-allocated. Check for an entry on both the pd_diraddhd
2373	 * list and the pd_pendinghd list.
2374	 */
2375	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
2376	     dap; dap = LIST_NEXT(dap, da_pdlist))
2377		if (dap->da_offset == offset)
2378			break;
2379	if (dap == NULL) {
2380		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2381		     dap; dap = LIST_NEXT(dap, da_pdlist))
2382			if (dap->da_offset == offset)
2383				break;
2384		if (dap == NULL)
2385			return (dirrem);
2386	}
2387	/*
2388	 * Must be ATTACHED at this point, so just delete it.
2389	 */
2390	if ((dap->da_state & ATTACHED) == 0)
2391		panic("newdirrem: not ATTACHED");
2392	if (dap->da_newinum != ip->i_number)
2393		panic("newdirrem: inum %d should be %d",
2394		    ip->i_number, dap->da_newinum);
2395	free_diradd(dap);
2396	dirrem->dm_state |= COMPLETE;
2397	return (dirrem);
2398}
2399
2400/*
2401 * Directory entry change dependencies.
2402 *
2403 * Changing an existing directory entry requires that an add operation
2404 * be completed first followed by a deletion. The semantics for the addition
2405 * are identical to the description of adding a new entry above except
2406 * that the rollback is to the old inode number rather than zero. Once
2407 * the addition dependency is completed, the removal is done as described
2408 * in the removal routine above.
2409 */
2410
2411/*
2412 * This routine should be called immediately after changing
2413 * a directory entry.  The inode's link count should not be
2414 * decremented by the calling procedure -- the soft updates
2415 * code will perform this task when it is safe.
2416 */
2417void
2418softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2419	struct buf *bp;		/* buffer containing directory block */
2420	struct inode *dp;	/* inode for the directory being modified */
2421	struct inode *ip;	/* inode for directory entry being removed */
2422	long newinum;		/* new inode number for changed entry */
2423	int isrmdir;		/* indicates if doing RMDIR */
2424{
2425	int offset;
2426	struct diradd *dap = NULL;
2427	struct dirrem *dirrem;
2428	struct pagedep *pagedep;
2429	struct inodedep *inodedep;
2430
2431	offset = blkoff(dp->i_fs, dp->i_offset);
2432
2433	/*
2434	 * Whiteouts do not need diradd dependencies.
2435	 */
2436	if (newinum != WINO) {
2437		MALLOC(dap, struct diradd *, sizeof(struct diradd),
2438		    M_DIRADD, M_WAITOK);
2439		bzero(dap, sizeof(struct diradd));
2440		dap->da_list.wk_type = D_DIRADD;
2441		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2442		dap->da_offset = offset;
2443		dap->da_newinum = newinum;
2444	}
2445
2446	/*
2447	 * Allocate a new dirrem and ACQUIRE_LOCK.
2448	 */
2449	dirrem = newdirrem(bp, dp, ip, isrmdir);
2450	pagedep = dirrem->dm_pagedep;
2451	/*
2452	 * The possible values for isrmdir:
2453	 *	0 - non-directory file rename
2454	 *	1 - directory rename within same directory
2455	 *   inum - directory rename to new directory of given inode number
2456	 * When renaming to a new directory, we are both deleting and
2457	 * creating a new directory entry, so the link count on the new
2458	 * directory should not change. Thus we do not need the followup
2459	 * dirrem which is usually done in handle_workitem_remove. We set
2460	 * the DIRCHG flag to tell handle_workitem_remove to skip the
2461	 * followup dirrem.
2462	 */
2463	if (isrmdir > 1)
2464		dirrem->dm_state |= DIRCHG;
2465
2466	/*
2467	 * Whiteouts have no additional dependencies,
2468	 * so just put the dirrem on the correct list.
2469	 */
2470	if (newinum == WINO) {
2471		if ((dirrem->dm_state & COMPLETE) == 0) {
2472			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2473			    dm_next);
2474		} else {
2475			dirrem->dm_dirinum = pagedep->pd_ino;
2476			add_to_worklist(&dirrem->dm_list);
2477		}
2478		FREE_LOCK(&lk);
2479		return;
2480	}
2481
2482	/*
2483	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2484	 * is not yet written. If it is written, do the post-inode write
2485	 * processing to put it on the id_pendinghd list.
2486	 */
2487	dap->da_previous = dirrem;
2488	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2489	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2490		dap->da_state |= COMPLETE;
2491		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2492		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2493	} else {
2494		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2495		    dap, da_pdlist);
2496		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2497	}
2498	/*
2499	 * If the previous inode was never written or its previous directory
2500	 * entry was never written, then we do not want to roll back to this
2501	 * previous value. Instead we want to roll back to zero and immediately
2502	 * free the unwritten or unreferenced inode.
2503	 */
2504	if (dirrem->dm_state & COMPLETE) {
2505		dap->da_state &= ~DIRCHG;
2506		dap->da_pagedep = pagedep;
2507		dirrem->dm_dirinum = pagedep->pd_ino;
2508		add_to_worklist(&dirrem->dm_list);
2509	}
2510	FREE_LOCK(&lk);
2511}
2512
2513/*
2514 * Called whenever the link count on an inode is increased.
2515 * It creates an inode dependency so that the new reference(s)
2516 * to the inode cannot be committed to disk until the updated
2517 * inode has been written.
2518 */
2519void
2520softdep_increase_linkcnt(ip)
2521	struct inode *ip;	/* the inode with the increased link count */
2522{
2523	struct inodedep *inodedep;
2524
2525	ACQUIRE_LOCK(&lk);
2526	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2527	FREE_LOCK(&lk);
2528}
2529
2530/*
2531 * This workitem decrements the inode's link count.
2532 * If the link count reaches zero, the file is removed.
2533 */
2534static void
2535handle_workitem_remove(dirrem)
2536	struct dirrem *dirrem;
2537{
2538	struct proc *p = CURPROC;	/* XXX */
2539	struct inodedep *inodedep;
2540	struct vnode *vp;
2541	struct inode *ip;
2542	int error;
2543
2544	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2545		softdep_error("handle_workitem_remove: vget", error);
2546		return;
2547	}
2548	ip = VTOI(vp);
2549	/*
2550	 * Normal file deletion.
2551	 */
2552	if ((dirrem->dm_state & RMDIR) == 0) {
2553		ip->i_nlink--;
2554		if (ip->i_nlink < ip->i_effnlink)
2555			panic("handle_workitem_remove: bad file delta");
2556		ip->i_flag |= IN_CHANGE;
2557		vput(vp);
2558		WORKITEM_FREE(dirrem, D_DIRREM);
2559		return;
2560	}
2561	/*
2562	 * Directory deletion. Decrement reference count for both the
2563	 * just deleted parent directory entry and the reference for ".".
2564	 * Next truncate the directory to length zero. When the
2565	 * truncation completes, arrange to have the reference count on
2566	 * the parent decremented to account for the loss of "..".
2567	 */
2568	ip->i_nlink -= 2;
2569	if (ip->i_nlink < ip->i_effnlink)
2570		panic("handle_workitem_remove: bad dir delta");
2571	ip->i_flag |= IN_CHANGE;
2572	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2573		softdep_error("handle_workitem_remove: truncate", error);
2574	/*
	 * Rename a directory to a new parent. Since we are both deleting
2576	 * and creating a new directory entry, the link count on the new
2577	 * directory should not change. Thus we skip the followup dirrem.
2578	 */
2579	if (dirrem->dm_state & DIRCHG) {
2580		vput(vp);
2581		WORKITEM_FREE(dirrem, D_DIRREM);
2582		return;
2583	}
2584	ACQUIRE_LOCK(&lk);
2585	(void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC,
2586	    &inodedep);
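	/*
	 * Reuse the dirrem to decrement the parent directory's link
	 * count for the lost ".." reference once the zero'ed inode
	 * has been written to disk.
	 */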
2587	dirrem->dm_state = 0;
2588	dirrem->dm_oldinum = dirrem->dm_dirinum;
2589	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2590	FREE_LOCK(&lk);
2591	vput(vp);
2592}
2593
2594/*
2595 * Inode de-allocation dependencies.
2596 *
2597 * When an inode's link count is reduced to zero, it can be de-allocated. We
2598 * found it convenient to postpone de-allocation until after the inode is
2599 * written to disk with its new link count (zero).  At this point, all of the
2600 * on-disk inode's block pointers are nullified and, with careful dependency
2601 * list ordering, all dependencies related to the inode will be satisfied and
2602 * the corresponding dependency structures de-allocated.  So, if/when the
2603 * inode is reused, there will be no mixing of old dependencies with new
2604 * ones.  This artificial dependency is set up by the block de-allocation
2605 * procedure above (softdep_setup_freeblocks) and completed by the
2606 * following procedure.
2607 */
2608static void
2609handle_workitem_freefile(freefile)
2610	struct freefile *freefile;
2611{
2612	struct vnode vp;
2613	struct inode tip;
2614	struct inodedep *idp;
2615	int error;
2616
2617#ifdef DEBUG
2618	ACQUIRE_LOCK(&lk);
2619	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
2620		panic("handle_workitem_freefile: inodedep survived");
2621	FREE_LOCK(&lk);
2622#endif
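	/*
	 * Build a throw-away vnode/inode pair carrying just enough state
	 * (device and filesystem) for ffs_freefile to do its work.
	 */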
2623	tip.i_devvp = freefile->fx_devvp;
2624	tip.i_dev = freefile->fx_devvp->v_rdev;
2625	tip.i_fs = freefile->fx_fs;
2626	vp.v_data = &tip;
2627	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2628		softdep_error("handle_workitem_freefile", error);
2629	WORKITEM_FREE(freefile, D_FREEFILE);
2630	num_freefile -= 1;
2631}
2632
2633/*
2634 * Disk writes.
2635 *
2636 * The dependency structures constructed above are most actively used when file
2637 * system blocks are written to disk.  No constraints are placed on when a
2638 * block can be written, but unsatisfied update dependencies are made safe by
2639 * modifying (or replacing) the source memory for the duration of the disk
2640 * write.  When the disk write completes, the memory block is again brought
2641 * up-to-date.
2642 *
2643 * In-core inode structure reclamation.
2644 *
2645 * Because there are a finite number of "in-core" inode structures, they are
2646 * reused regularly.  By transferring all inode-related dependencies to the
2647 * in-memory inode block and indexing them separately (via "inodedep"s), we
2648 * can allow "in-core" inode structures to be reused at any time and avoid
2649 * any increase in contention.
2650 *
2651 * Called just before entering the device driver to initiate a new disk I/O.
2652 * The buffer must be locked, thus, no I/O completion operations can occur
2653 * while we are manipulating its associated dependencies.
2654 */
2655void
2656softdep_disk_io_initiation(bp)
2657	struct buf *bp;		/* structure describing disk write to occur */
2658{
2659	struct worklist *wk, *nextwk;
2660	struct indirdep *indirdep;
2661
2662	/*
2663	 * We only care about write operations. There should never
2664	 * be dependencies for reads.
2665	 */
2666	if (bp->b_flags & B_READ)
2667		panic("softdep_disk_io_initiation: read");
2668	/*
2669	 * Do any necessary pre-I/O processing.
2670	 */
2671	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2672		nextwk = LIST_NEXT(wk, wk_list);
2673		switch (wk->wk_type) {
2674
2675		case D_PAGEDEP:
2676			initiate_write_filepage(WK_PAGEDEP(wk), bp);
2677			continue;
2678
2679		case D_INODEDEP:
2680			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2681			continue;
2682
2683		case D_INDIRDEP:
2684			indirdep = WK_INDIRDEP(wk);
2685			if (indirdep->ir_state & GOINGAWAY)
2686				panic("disk_io_initiation: indirdep gone");
2687			/*
2688			 * If there are no remaining dependencies, this
2689			 * will be writing the real pointers, so the
2690			 * dependency can be freed.
2691			 */
2692			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2693				indirdep->ir_savebp->b_flags &= ~B_XXX;
2694				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2695				brelse(indirdep->ir_savebp);
2696				/* inline expand WORKLIST_REMOVE(wk); */
2697				wk->wk_state &= ~ONWORKLIST;
2698				LIST_REMOVE(wk, wk_list);
2699				WORKITEM_FREE(indirdep, D_INDIRDEP);
2700				continue;
2701			}
2702			/*
2703			 * Replace up-to-date version with safe version.
2704			 */
2705			ACQUIRE_LOCK(&lk);
2706			indirdep->ir_state &= ~ATTACHED;
2707			indirdep->ir_state |= UNDONE;
2708			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2709			    M_INDIRDEP, M_WAITOK);
2710			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2711			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2712			    bp->b_bcount);
2713			FREE_LOCK(&lk);
2714			continue;
2715
2716		case D_MKDIR:
2717		case D_BMSAFEMAP:
2718		case D_ALLOCDIRECT:
2719		case D_ALLOCINDIR:
2720			continue;
2721
2722		default:
2723			panic("handle_disk_io_initiation: Unexpected type %s",
2724			    TYPENAME(wk->wk_type));
2725			/* NOTREACHED */
2726		}
2727	}
2728}
2729
2730/*
2731 * Called from within the procedure above to deal with unsatisfied
2732 * allocation dependencies in a directory. The buffer must be locked,
2733 * thus, no I/O completion operations can occur while we are
2734 * manipulating its associated dependencies.
2735 */
2736static void
2737initiate_write_filepage(pagedep, bp)
2738	struct pagedep *pagedep;
2739	struct buf *bp;
2740{
2741	struct diradd *dap;
2742	struct direct *ep;
2743	int i;
2744
2745	if (pagedep->pd_state & IOSTARTED) {
2746		/*
2747		 * This can only happen if there is a driver that does not
2748		 * understand chaining. Here biodone will reissue the call
2749		 * to strategy for the incomplete buffers.
2750		 */
2751		printf("initiate_write_filepage: already started\n");
2752		return;
2753	}
2754	pagedep->pd_state |= IOSTARTED;
2755	ACQUIRE_LOCK(&lk);
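	/*
	 * Roll back each new directory entry's inode number, either to
	 * the previous inode (DIRCHG) or to zero, so that the entry
	 * never appears on disk before the inode it references.
	 */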
2756	for (i = 0; i < DAHASHSZ; i++) {
2757		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
2758		     dap = LIST_NEXT(dap, da_pdlist)) {
2759			ep = (struct direct *)
2760			    ((char *)bp->b_data + dap->da_offset);
2761			if (ep->d_ino != dap->da_newinum)
2762				panic("%s: dir inum %d != new %d",
2763				    "initiate_write_filepage",
2764				    ep->d_ino, dap->da_newinum);
2765			if (dap->da_state & DIRCHG)
2766				ep->d_ino = dap->da_previous->dm_oldinum;
2767			else
2768				ep->d_ino = 0;
2769			dap->da_state &= ~ATTACHED;
2770			dap->da_state |= UNDONE;
2771		}
2772	}
2773	FREE_LOCK(&lk);
2774}
2775
2776/*
2777 * Called from within the procedure above to deal with unsatisfied
2778 * allocation dependencies in an inodeblock. The buffer must be
2779 * locked, thus, no I/O completion operations can occur while we
2780 * are manipulating its associated dependencies.
2781 */
2782static void
2783initiate_write_inodeblock(inodedep, bp)
2784	struct inodedep *inodedep;
2785	struct buf *bp;			/* The inode block */
2786{
2787	struct allocdirect *adp, *lastadp;
2788	struct dinode *dp;
2789	struct fs *fs;
2790	ufs_lbn_t prevlbn = 0;
2791	int i, deplist;
2792
2793	if (inodedep->id_state & IOSTARTED)
2794		panic("initiate_write_inodeblock: already started");
2795	inodedep->id_state |= IOSTARTED;
2796	fs = inodedep->id_fs;
2797	dp = (struct dinode *)bp->b_data +
2798	    ino_to_fsbo(fs, inodedep->id_ino);
2799	/*
2800	 * If the bitmap is not yet written, then the allocated
2801	 * inode cannot be written to disk.
2802	 */
2803	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2804		if (inodedep->id_savedino != NULL)
2805			panic("initiate_write_inodeblock: already doing I/O");
2806		MALLOC(inodedep->id_savedino, struct dinode *,
2807		    sizeof(struct dinode), M_INODEDEP, M_WAITOK);
2808		*inodedep->id_savedino = *dp;
2809		bzero((caddr_t)dp, sizeof(struct dinode));
2810		return;
2811	}
2812	/*
2813	 * If no dependencies, then there is nothing to roll back.
2814	 */
2815	inodedep->id_savedsize = dp->di_size;
2816	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
2817		return;
2818	/*
2819	 * Set the dependencies to busy.
2820	 */
2821	ACQUIRE_LOCK(&lk);
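	/*
	 * The deplist bitmask records one bit per block pointer with an
	 * outstanding dependency: bits 0..NDADDR-1 for direct blocks and
	 * bits NDADDR..NDADDR+NIADDR-1 for indirect blocks. It is used
	 * only by the DIAGNOSTIC checks below to verify that every
	 * pointer rolled back to zero has a matching dependency.
	 */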
2822	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2823	     adp = TAILQ_NEXT(adp, ad_next)) {
2824#ifdef DIAGNOSTIC
2825		if (deplist != 0 && prevlbn >= adp->ad_lbn)
2826			panic("softdep_write_inodeblock: lbn order");
2827		prevlbn = adp->ad_lbn;
2828		if (adp->ad_lbn < NDADDR &&
2829		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
2830			panic("%s: direct pointer #%ld mismatch %d != %d",
2831			    "softdep_write_inodeblock", adp->ad_lbn,
2832			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
2833		if (adp->ad_lbn >= NDADDR &&
2834		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
2835			panic("%s: indirect pointer #%ld mismatch %d != %d",
2836			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
2837			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
2838		deplist |= 1 << adp->ad_lbn;
2839		if ((adp->ad_state & ATTACHED) == 0)
2840			panic("softdep_write_inodeblock: Unknown state 0x%x",
2841			    adp->ad_state);
2842#endif /* DIAGNOSTIC */
2843		adp->ad_state &= ~ATTACHED;
2844		adp->ad_state |= UNDONE;
2845	}
2846	/*
2847	 * The on-disk inode cannot claim to be any larger than the last
2848	 * fragment that has been written. Otherwise, the on-disk inode
2849	 * might have fragments that were not the last block in the file
2850	 * which would corrupt the filesystem.
2851	 */
2852	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2853	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
2854		if (adp->ad_lbn >= NDADDR)
2855			break;
2856		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
2857		/* keep going until hitting a rollback to a frag */
2858		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
2859			continue;
2860		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
2861		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
2862#ifdef DIAGNOSTIC
2863			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
2864				panic("softdep_write_inodeblock: lost dep1");
2865#endif /* DIAGNOSTIC */
2866			dp->di_db[i] = 0;
2867		}
2868		for (i = 0; i < NIADDR; i++) {
2869#ifdef DIAGNOSTIC
2870			if (dp->di_ib[i] != 0 &&
2871			    (deplist & ((1 << NDADDR) << i)) == 0)
2872				panic("softdep_write_inodeblock: lost dep2");
2873#endif /* DIAGNOSTIC */
2874			dp->di_ib[i] = 0;
2875		}
2876		FREE_LOCK(&lk);
2877		return;
2878	}
2879	/*
2880	 * If we have zero'ed out the last allocated block of the file,
2881	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is full-sized as
2883	 * we already checked for fragments in the loop above.
2884	 */
2885	if (lastadp != NULL &&
2886	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
2887		for (i = lastadp->ad_lbn; i >= 0; i--)
2888			if (dp->di_db[i] != 0)
2889				break;
2890		dp->di_size = (i + 1) * fs->fs_bsize;
2891	}
2892	/*
2893	 * The only dependencies are for indirect blocks.
2894	 *
2895	 * The file size for indirect block additions is not guaranteed.
2896	 * Such a guarantee would be non-trivial to achieve. The conventional
2897	 * synchronous write implementation also does not make this guarantee.
2898	 * Fsck should catch and fix discrepancies. Arguably, the file size
2899	 * can be over-estimated without destroying integrity when the file
2900	 * moves into the indirect blocks (i.e., is large). If we want to
2901	 * postpone fsck, we are stuck with this argument.
2902	 */
2903	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
2904		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
2905	FREE_LOCK(&lk);
2906}
2907
2908/*
2909 * This routine is called during the completion interrupt
2910 * service routine for a disk write (from the procedure called
2911 * by the device driver to inform the file system caches of
2912 * a request completion).  It should be called early in this
2913 * procedure, before the block is made available to other
2914 * processes or other routines are called.
2915 */
2916void
2917softdep_disk_write_complete(bp)
2918	struct buf *bp;		/* describes the completed disk write */
2919{
2920	struct worklist *wk;
2921	struct workhead reattach;
2922	struct newblk *newblk;
2923	struct allocindir *aip;
2924	struct allocdirect *adp;
2925	struct indirdep *indirdep;
2926	struct inodedep *inodedep;
2927	struct bmsafemap *bmsafemap;
2928
2929#ifdef DEBUG
2930	if (lk.lkt_held != -1)
2931		panic("softdep_disk_write_complete: lock is held");
2932	lk.lkt_held = -2;
2933#endif
2934	LIST_INIT(&reattach);
2935	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2936		WORKLIST_REMOVE(wk);
2937		switch (wk->wk_type) {
2938
2939		case D_PAGEDEP:
2940			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
2941				WORKLIST_INSERT(&reattach, wk);
2942			continue;
2943
2944		case D_INODEDEP:
2945			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
2946				WORKLIST_INSERT(&reattach, wk);
2947			continue;
2948
2949		case D_BMSAFEMAP:
2950			bmsafemap = WK_BMSAFEMAP(wk);
2951			while (newblk = LIST_FIRST(&bmsafemap->sm_newblkhd)) {
2952				newblk->nb_state |= DEPCOMPLETE;
2953				newblk->nb_bmsafemap = NULL;
2954				LIST_REMOVE(newblk, nb_deps);
2955			}
2956			while (adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd)) {
2957				adp->ad_state |= DEPCOMPLETE;
2958				adp->ad_buf = NULL;
2959				LIST_REMOVE(adp, ad_deps);
2960				handle_allocdirect_partdone(adp);
2961			}
2962			while (aip = LIST_FIRST(&bmsafemap->sm_allocindirhd)) {
2963				aip->ai_state |= DEPCOMPLETE;
2964				aip->ai_buf = NULL;
2965				LIST_REMOVE(aip, ai_deps);
2966				handle_allocindir_partdone(aip);
2967			}
2968			while ((inodedep =
2969			       LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
2970				inodedep->id_state |= DEPCOMPLETE;
2971				LIST_REMOVE(inodedep, id_deps);
2972				inodedep->id_buf = NULL;
2973			}
2974			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
2975			continue;
2976
2977		case D_MKDIR:
2978			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
2979			continue;
2980
2981		case D_ALLOCDIRECT:
2982			adp = WK_ALLOCDIRECT(wk);
2983			adp->ad_state |= COMPLETE;
2984			handle_allocdirect_partdone(adp);
2985			continue;
2986
2987		case D_ALLOCINDIR:
2988			aip = WK_ALLOCINDIR(wk);
2989			aip->ai_state |= COMPLETE;
2990			handle_allocindir_partdone(aip);
2991			continue;
2992
2993		case D_INDIRDEP:
2994			indirdep = WK_INDIRDEP(wk);
2995			if (indirdep->ir_state & GOINGAWAY)
2996				panic("disk_write_complete: indirdep gone");
2997			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
2998			FREE(indirdep->ir_saveddata, M_INDIRDEP);
2999			indirdep->ir_saveddata = 0;
3000			indirdep->ir_state &= ~UNDONE;
3001			indirdep->ir_state |= ATTACHED;
3002			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3003				handle_allocindir_partdone(aip);
3004				if (aip == LIST_FIRST(&indirdep->ir_donehd))
3005					panic("disk_write_complete: not gone");
3006			}
3007			WORKLIST_INSERT(&reattach, wk);
3008			bdirty(bp);
3009			continue;
3010
3011		default:
3012			panic("handle_disk_write_complete: Unknown type %s",
3013			    TYPENAME(wk->wk_type));
3014			/* NOTREACHED */
3015		}
3016	}
3017	/*
3018	 * Reattach any requests that must be redone.
3019	 */
3020	while ((wk = LIST_FIRST(&reattach)) != NULL) {
3021		WORKLIST_REMOVE(wk);
3022		WORKLIST_INSERT(&bp->b_dep, wk);
3023	}
3024#ifdef DEBUG
3025	if (lk.lkt_held != -2)
3026		panic("softdep_disk_write_complete: lock lost");
3027	lk.lkt_held = -1;
3028#endif
3029}
3030
3031/*
3032 * Called from within softdep_disk_write_complete above. Note that
3033 * this routine is always called from interrupt level with further
3034 * splbio interrupts blocked.
3035 */
3036static void
3037handle_allocdirect_partdone(adp)
3038	struct allocdirect *adp;	/* the completed allocdirect */
3039{
3040	struct allocdirect *listadp;
3041	struct inodedep *inodedep;
3042	long bsize;
3043
3044	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3045		return;
3046	if (adp->ad_buf != NULL)
3047		panic("handle_allocdirect_partdone: dangling dep");
3048	/*
3049	 * The on-disk inode cannot claim to be any larger than the last
3050	 * fragment that has been written. Otherwise, the on-disk inode
3051	 * might have fragments that were not the last block in the file
3052	 * which would corrupt the filesystem. Thus, we cannot free any
3053	 * allocdirects after one whose ad_oldblkno claims a fragment as
3054	 * these blocks must be rolled back to zero before writing the inode.
3055	 * We check the currently active set of allocdirects in id_inoupdt.
3056	 */
3057	inodedep = adp->ad_inodedep;
3058	bsize = inodedep->id_fs->fs_bsize;
3059	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp;
3060	     listadp = TAILQ_NEXT(listadp, ad_next)) {
3061		/* found our block */
3062		if (listadp == adp)
3063			break;
		/* continue if ad_oldblkno is not a fragment */
3065		if (listadp->ad_oldsize == 0 ||
3066		    listadp->ad_oldsize == bsize)
3067			continue;
3068		/* hit a fragment */
3069		return;
3070	}
3071	/*
3072	 * If we have reached the end of the current list without
3073	 * finding the just finished dependency, then it must be
3074	 * on the future dependency list. Future dependencies cannot
3075	 * be freed until they are moved to the current list.
3076	 */
3077	if (listadp == NULL) {
3078#ifdef DEBUG
3079		for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp;
3080		     listadp = TAILQ_NEXT(listadp, ad_next))
3081			/* found our block */
3082			if (listadp == adp)
3083				break;
3084		if (listadp == NULL)
3085			panic("handle_allocdirect_partdone: lost dep");
3086#endif /* DEBUG */
3087		return;
3088	}
3089	/*
3090	 * If we have found the just finished dependency, then free
3091	 * it along with anything that follows it that is complete.
3092	 */
3093	for (; adp; adp = listadp) {
3094		listadp = TAILQ_NEXT(adp, ad_next);
3095		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3096			return;
3097		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3098	}
3099}
3100
3101/*
3102 * Called from within softdep_disk_write_complete above. Note that
3103 * this routine is always called from interrupt level with further
3104 * splbio interrupts blocked.
3105 */
3106static void
3107handle_allocindir_partdone(aip)
3108	struct allocindir *aip;		/* the completed allocindir */
3109{
3110	struct indirdep *indirdep;
3111
3112	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3113		return;
3114	if (aip->ai_buf != NULL)
3115		panic("handle_allocindir_partdone: dangling dependency");
3116	indirdep = aip->ai_indirdep;
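	/*
	 * If the indirect block is currently being written with its
	 * rolled-back contents, move this allocindir to the done list;
	 * it is finished in softdep_disk_write_complete once the write
	 * ends.
	 */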
3117	if (indirdep->ir_state & UNDONE) {
3118		LIST_REMOVE(aip, ai_next);
3119		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3120		return;
3121	}
3122	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3123	    aip->ai_newblkno;
3124	LIST_REMOVE(aip, ai_next);
3125	if (aip->ai_freefrag != NULL)
3126		add_to_worklist(&aip->ai_freefrag->ff_list);
3127	WORKITEM_FREE(aip, D_ALLOCINDIR);
3128}
3129
3130/*
3131 * Called from within softdep_disk_write_complete above to restore
3132 * in-memory inode block contents to their most up-to-date state. Note
3133 * that this routine is always called from interrupt level with further
3134 * splbio interrupts blocked.
3135 */
3136static int
3137handle_written_inodeblock(inodedep, bp)
3138	struct inodedep *inodedep;
3139	struct buf *bp;		/* buffer containing the inode block */
3140{
3141	struct worklist *wk, *filefree;
3142	struct allocdirect *adp, *nextadp;
3143	struct dinode *dp;
3144	int hadchanges;
3145
3146	if ((inodedep->id_state & IOSTARTED) == 0)
3147		panic("handle_written_inodeblock: not started");
3148	inodedep->id_state &= ~IOSTARTED;
3149	inodedep->id_state |= COMPLETE;
3150	dp = (struct dinode *)bp->b_data +
3151	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3152	/*
3153	 * If we had to rollback the inode allocation because of
3154	 * bitmaps being incomplete, then simply restore it.
3155	 * Keep the block dirty so that it will not be reclaimed until
3156	 * all associated dependencies have been cleared and the
3157	 * corresponding updates written to disk.
3158	 */
3159	if (inodedep->id_savedino != NULL) {
3160		*dp = *inodedep->id_savedino;
3161		FREE(inodedep->id_savedino, M_INODEDEP);
3162		inodedep->id_savedino = NULL;
3163		bdirty(bp);
3164		return (1);
3165	}
3166	/*
3167	 * Roll forward anything that had to be rolled back before
3168	 * the inode could be updated.
3169	 */
3170	hadchanges = 0;
3171	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3172		nextadp = TAILQ_NEXT(adp, ad_next);
3173		if (adp->ad_state & ATTACHED)
3174			panic("handle_written_inodeblock: new entry");
3175		if (adp->ad_lbn < NDADDR) {
3176			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
3177				panic("%s: %s #%ld mismatch %d != %d",
3178				    "handle_written_inodeblock",
3179				    "direct pointer", adp->ad_lbn,
3180				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3181			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3182		} else {
3183			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
3184				panic("%s: %s #%ld allocated as %d",
3185				    "handle_written_inodeblock",
3186				    "indirect pointer", adp->ad_lbn - NDADDR,
3187				    dp->di_ib[adp->ad_lbn - NDADDR]);
3188			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3189		}
3190		adp->ad_state &= ~UNDONE;
3191		adp->ad_state |= ATTACHED;
3192		hadchanges = 1;
3193	}
3194	/*
3195	 * Reset the file size to its most up-to-date value.
3196	 */
3197	if (inodedep->id_savedsize == -1)
3198		panic("handle_written_inodeblock: bad size");
3199	if (dp->di_size != inodedep->id_savedsize) {
3200		dp->di_size = inodedep->id_savedsize;
3201		hadchanges = 1;
3202	}
3203	inodedep->id_savedsize = -1;
3204	/*
3205	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
3207	 * its correct form.
3208	 */
3209	if (hadchanges)
3210		bdirty(bp);
3211	/*
3212	 * Process any allocdirects that completed during the update.
3213	 */
3214	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3215		handle_allocdirect_partdone(adp);
3216	/*
3217	 * Process deallocations that were held pending until the
3218	 * inode had been written to disk. Freeing of the inode
3219	 * is delayed until after all blocks have been freed to
3220	 * avoid creation of new <vfsid, inum, lbn> triples
3221	 * before the old ones have been deleted.
3222	 */
3223	filefree = NULL;
3224	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3225		WORKLIST_REMOVE(wk);
3226		switch (wk->wk_type) {
3227
3228		case D_FREEFILE:
3229			/*
3230			 * We defer adding filefree to the worklist until
3231			 * all other additions have been made to ensure
3232			 * that it will be done after all the old blocks
3233			 * have been freed.
3234			 */
3235			if (filefree != NULL)
3236				panic("handle_written_inodeblock: filefree");
3237			filefree = wk;
3238			continue;
3239
3240		case D_MKDIR:
3241			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3242			continue;
3243
3244		case D_DIRADD:
3245			diradd_inode_written(WK_DIRADD(wk), inodedep);
3246			continue;
3247
3248		case D_FREEBLKS:
3249		case D_FREEFRAG:
3250		case D_DIRREM:
3251			add_to_worklist(wk);
3252			continue;
3253
3254		default:
3255			panic("handle_written_inodeblock: Unknown type %s",
3256			    TYPENAME(wk->wk_type));
3257			/* NOTREACHED */
3258		}
3259	}
3260	if (filefree != NULL) {
3261		if (free_inodedep(inodedep) == 0)
3262			panic("handle_written_inodeblock: live inodedep");
3263		add_to_worklist(filefree);
3264		return (0);
3265	}
3266
3267	/*
3268	 * If no outstanding dependencies, free it.
3269	 */
3270	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3271		return (0);
3272	return (hadchanges);
3273}
3274
3275/*
3276 * Process a diradd entry after its dependent inode has been written.
3277 * This routine must be called with splbio interrupts blocked.
3278 */
3279static void
3280diradd_inode_written(dap, inodedep)
3281	struct diradd *dap;
3282	struct inodedep *inodedep;
3283{
3284	struct pagedep *pagedep;
3285
3286	dap->da_state |= COMPLETE;
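	/*
	 * Now that the referenced inode is on disk, the entry can be moved
	 * to the pagedep's pending list once its remaining dependencies
	 * are satisfied; it is freed from there when the directory page
	 * itself is written. The entry is also listed on the inodedep so
	 * that fsync can find directory entries still awaiting writing.
	 */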
3287	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3288		if (dap->da_state & DIRCHG)
3289			pagedep = dap->da_previous->dm_pagedep;
3290		else
3291			pagedep = dap->da_pagedep;
3292		LIST_REMOVE(dap, da_pdlist);
3293		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3294	}
3295	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3296}
3297
3298/*
3299 * Handle the completion of a mkdir dependency.
3300 */
3301static void
3302handle_written_mkdir(mkdir, type)
3303	struct mkdir *mkdir;
3304	int type;
3305{
3306	struct diradd *dap;
3307	struct pagedep *pagedep;
3308
3309	if (mkdir->md_state != type)
3310		panic("handle_written_mkdir: bad type");
3311	dap = mkdir->md_diradd;
3312	dap->da_state &= ~type;
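	/*
	 * Once neither the parent directory nor the block holding the new
	 * directory's "." and ".." entries remain to be written, the
	 * diradd has no outstanding dependencies of its own.
	 */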
3313	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3314		dap->da_state |= DEPCOMPLETE;
3315	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3316		if (dap->da_state & DIRCHG)
3317			pagedep = dap->da_previous->dm_pagedep;
3318		else
3319			pagedep = dap->da_pagedep;
3320		LIST_REMOVE(dap, da_pdlist);
3321		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3322	}
3323	LIST_REMOVE(mkdir, md_mkdirs);
3324	WORKITEM_FREE(mkdir, D_MKDIR);
3325}
3326
3327/*
3328 * Called from within softdep_disk_write_complete above.
3329 * A write operation was just completed. Removed inodes can
3330 * now be freed and associated block pointers may be committed.
3331 * Note that this routine is always called from interrupt level
3332 * with further splbio interrupts blocked.
3333 */
3334static int
3335handle_written_filepage(pagedep, bp)
3336	struct pagedep *pagedep;
3337	struct buf *bp;		/* buffer containing the written page */
3338{
3339	struct dirrem *dirrem;
3340	struct diradd *dap, *nextdap;
3341	struct direct *ep;
3342	int i, chgs;
3343
3344	if ((pagedep->pd_state & IOSTARTED) == 0)
3345		panic("handle_written_filepage: not started");
3346	pagedep->pd_state &= ~IOSTARTED;
3347	/*
3348	 * Process any directory removals that have been committed.
3349	 */
3350	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3351		LIST_REMOVE(dirrem, dm_next);
3352		dirrem->dm_dirinum = pagedep->pd_ino;
3353		add_to_worklist(&dirrem->dm_list);
3354	}
3355	/*
3356	 * Free any directory additions that have been committed.
3357	 */
3358	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3359		free_diradd(dap);
3360	/*
3361	 * Uncommitted directory entries must be restored.
3362	 */
3363	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3364		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3365		     dap = nextdap) {
3366			nextdap = LIST_NEXT(dap, da_pdlist);
3367			if (dap->da_state & ATTACHED)
3368				panic("handle_written_filepage: attached");
3369			ep = (struct direct *)
3370			    ((char *)bp->b_data + dap->da_offset);
3371			ep->d_ino = dap->da_newinum;
3372			dap->da_state &= ~UNDONE;
3373			dap->da_state |= ATTACHED;
3374			chgs = 1;
3375			/*
3376			 * If the inode referenced by the directory has
3377			 * been written out, then the dependency can be
3378			 * moved to the pending list.
3379			 */
3380			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3381				LIST_REMOVE(dap, da_pdlist);
3382				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3383				    da_pdlist);
3384			}
3385		}
3386	}
3387	/*
3388	 * If there were any rollbacks in the directory, then it must be
3389	 * marked dirty so that it will eventually get written back in
3390	 * its correct form.
3391	 */
3392	if (chgs)
3393		bdirty(bp);
3394	/*
3395	 * If no dependencies remain, the pagedep will be freed.
3396	 * Otherwise it will remain to update the page before it
3397	 * is written back to disk.
3398	 */
3399	if (LIST_FIRST(&pagedep->pd_pendinghd) == NULL) {
3400		for (i = 0; i < DAHASHSZ; i++)
3401			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3402				break;
3403		if (i == DAHASHSZ) {
3404			LIST_REMOVE(pagedep, pd_hash);
3405			WORKITEM_FREE(pagedep, D_PAGEDEP);
3406			return (0);
3407		}
3408	}
3409	return (1);
3410}
3411
3412/*
3413 * Writing back in-core inode structures.
3414 *
3415 * The file system only accesses an inode's contents when it occupies an
3416 * "in-core" inode structure.  These "in-core" structures are separate from
3417 * the page frames used to cache inode blocks.  Only the latter are
3418 * transferred to/from the disk.  So, when the updated contents of the
3419 * "in-core" inode structure are copied to the corresponding in-memory inode
3420 * block, the dependencies are also transferred.  The following procedure is
3421 * called when copying a dirty "in-core" inode to a cached inode block.
3422 */
3423
3424/*
3425 * Called when an inode is loaded from disk. If the effective link count
3426 * differed from the actual link count when it was last flushed, then we
3427 * need to ensure that the correct effective link count is put back.
3428 */
3429void
3430softdep_load_inodeblock(ip)
3431	struct inode *ip;	/* the "in_core" copy of the inode */
3432{
3433	struct inodedep *inodedep;
3434
3435	/*
3436	 * Check for alternate nlink count.
3437	 */
3438	ip->i_effnlink = ip->i_nlink;
3439	ACQUIRE_LOCK(&lk);
3440	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3441		FREE_LOCK(&lk);
3442		return;
3443	}
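	/*
	 * The difference between the on-disk link count and the effective
	 * link count was recorded in the inodedep when the inode was last
	 * flushed; subtract it to restore the correct effective count.
	 */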
3444	if (inodedep->id_nlinkdelta != 0) {
3445		ip->i_effnlink -= inodedep->id_nlinkdelta;
3446		ip->i_flag |= IN_MODIFIED;
3447		inodedep->id_nlinkdelta = 0;
3448		(void) free_inodedep(inodedep);
3449	}
3450	FREE_LOCK(&lk);
3451}
3452
3453/*
3454 * This routine is called just before the "in-core" inode
3455 * information is to be copied to the in-memory inode block.
3456 * Recall that an inode block contains several inodes. If
3457 * the force flag is set, then the dependencies will be
3458 * cleared so that the update can always be made. Note that
3459 * the buffer is locked when this routine is called, so we
3460 * will never be in the middle of writing the inode block
3461 * to disk.
3462 */
3463void
3464softdep_update_inodeblock(ip, bp, waitfor)
3465	struct inode *ip;	/* the "in_core" copy of the inode */
3466	struct buf *bp;		/* the buffer containing the inode block */
3467	int waitfor;		/* nonzero => update must be allowed */
3468{
3469	struct inodedep *inodedep;
3470	struct worklist *wk;
3471	int error, gotit;
3472
3473	/*
3474	 * If the effective link count is not equal to the actual link
3475	 * count, then we must track the difference in an inodedep while
3476	 * the inode is (potentially) tossed out of the cache. Otherwise,
3477	 * if there is no existing inodedep, then there are no dependencies
3478	 * to track.
3479	 */
3480	ACQUIRE_LOCK(&lk);
3481	if (ip->i_effnlink != ip->i_nlink) {
3482		(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC,
3483		    &inodedep);
3484	} else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3485		FREE_LOCK(&lk);
3486		return;
3487	}
3488	if (ip->i_nlink < ip->i_effnlink)
3489		panic("softdep_update_inodeblock: bad delta");
3490	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3491	/*
3492	 * Changes have been initiated. Anything depending on these
3493	 * changes cannot occur until this inode has been written.
3494	 */
3495	inodedep->id_state &= ~COMPLETE;
3496	if ((inodedep->id_state & ONWORKLIST) == 0)
3497		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3498	/*
3499	 * Any new dependencies associated with the incore inode must
3500	 * now be moved to the list associated with the buffer holding
3501	 * the in-memory copy of the inode. Once merged, process any
3502	 * allocdirects that are completed by the merger.
3503	 */
3504	merge_inode_lists(inodedep);
3505	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3506		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3507	/*
3508	 * Now that the inode has been pushed into the buffer, the
3509	 * operations dependent on the inode being written to disk
3510	 * can be moved to the id_bufwait so that they will be
3511	 * processed when the buffer I/O completes.
3512	 */
3513	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3514		WORKLIST_REMOVE(wk);
3515		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3516	}
3517	/*
3518	 * Newly allocated inodes cannot be written until the bitmap
3519	 * that allocates them has been written (indicated by
3520	 * DEPCOMPLETE being set in id_state). If we are doing a
3521	 * forced sync (e.g., an fsync on a file), we force the bitmap
3522	 * to be written so that the update can be done.
3523	 */
3524	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3525		FREE_LOCK(&lk);
3526		return;
3527	}
3528	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3529	FREE_LOCK(&lk);
3530	if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0)
3531		softdep_error("softdep_update_inodeblock: bwrite", error);
3532	if ((inodedep->id_state & DEPCOMPLETE) == 0)
3533		panic("softdep_update_inodeblock: update failed");
3534}
3535
3536/*
3537 * Merge the new inode dependency list (id_newinoupdt) into the old
3538 * inode dependency list (id_inoupdt). This routine must be called
3539 * with splbio interrupts blocked.
3540 */
3541static void
3542merge_inode_lists(inodedep)
3543	struct inodedep *inodedep;
3544{
3545	struct allocdirect *listadp, *newadp;
3546
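	/*
	 * Both lists are kept sorted by logical block number. Insert each
	 * entry from the new list in front of the first old entry with an
	 * equal or greater block number; when both lists describe the same
	 * block, the two dependencies are merged into one.
	 */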
3547	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3548	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3549		if (listadp->ad_lbn < newadp->ad_lbn) {
3550			listadp = TAILQ_NEXT(listadp, ad_next);
3551			continue;
3552		}
3553		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3554		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3555		if (listadp->ad_lbn == newadp->ad_lbn) {
3556			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3557			    listadp);
3558			listadp = newadp;
3559		}
3560		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3561	}
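	/*
	 * Any remaining new entries are for blocks beyond the end of the
	 * old list and are simply appended in order.
	 */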
3562	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3563		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3564		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3565	}
3566}
3567
3568/*
3569 * If we are doing an fsync, then we must ensure that any directory
3570 * entries for the inode have been written after the inode gets to disk.
3571 */
3572int
3573softdep_fsync(vp)
3574	struct vnode *vp;	/* the "in_core" copy of the inode */
3575{
3576	struct diradd *dap, *olddap;
3577	struct inodedep *inodedep;
3578	struct pagedep *pagedep;
3579	struct worklist *wk;
3580	struct mount *mnt;
3581	struct vnode *pvp;
3582	struct inode *ip;
3583	struct buf *bp;
3584	struct fs *fs;
3585	struct proc *p = CURPROC;		/* XXX */
3586	int error, ret, flushparent;
3587#ifndef __FreeBSD__
3588	struct timeval tv;
3589#endif
3590	ino_t parentino;
3591	ufs_lbn_t lbn;
3592
3593	ip = VTOI(vp);
3594	fs = ip->i_fs;
3595	for (error = 0, flushparent = 0, olddap = NULL; ; ) {
3596		ACQUIRE_LOCK(&lk);
3597		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
3598			break;
3599		if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3600		    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3601		    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3602		    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
3603			panic("softdep_fsync: pending ops");
3604		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3605			break;
3606		if (wk->wk_type != D_DIRADD)
3607			panic("softdep_fsync: Unexpected type %s",
3608			    TYPENAME(wk->wk_type));
3609		dap = WK_DIRADD(wk);
3610		/*
3611		 * If we have failed to get rid of all the dependencies,
3612		 * then something is seriously wrong.
3613		 */
3614		if (dap == olddap)
3615			panic("softdep_fsync: flush failed");
3616		olddap = dap;
3617		/*
3618		 * Flush our parent if this directory entry
3619		 * has a MKDIR_PARENT dependency.
3620		 */
3621		if (dap->da_state & DIRCHG)
3622			pagedep = dap->da_previous->dm_pagedep;
3623		else
3624			pagedep = dap->da_pagedep;
3625		mnt = pagedep->pd_mnt;
3626		parentino = pagedep->pd_ino;
3627		lbn = pagedep->pd_lbn;
3628		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
3629			panic("softdep_fsync: dirty");
3630		flushparent = dap->da_state & MKDIR_PARENT;
3631		/*
3632		 * If we are being fsync'ed as part of vgone'ing this vnode,
3633		 * then we will not be able to release and recover the
3634		 * vnode below, so we just have to give up on writing its
3635		 * directory entry out. It will eventually be written, just
3636		 * not now, but then the user was not asking to have it
3637		 * written, so we are not breaking any promises.
3638		 */
3639		if (vp->v_flag & VXLOCK)
3640			break;
3641		/*
3642		 * We prevent deadlock by always fetching inodes from the
3643		 * root, moving down the directory tree. Thus, when fetching
3644		 * our parent directory, we must unlock ourselves before
3645		 * requesting the lock on our parent. See the comment in
3646		 * ufs_lookup for details on possible races.
3647		 */
3648		FREE_LOCK(&lk);
3649		VOP_UNLOCK(vp, 0, p);
3650		if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) {
3651			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3652			return (error);
3653		}
3654		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3655		if (flushparent) {
3656#ifdef __FreeBSD__
3657			error = UFS_UPDATE(pvp, 1);
3658#else
3659			tv = time;
3660			error = UFS_UPDATE(pvp, &tv, &tv, 1);
3661#endif
3662			if (error) {
3663				vput(pvp);
3664				return (error);
3665			}
3666		}
3667		/*
3668		 * Flush directory page containing the inode's name.
3669		 */
3670		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3671		    &bp);
3672		ret = VOP_BWRITE(bp);
3673		vput(pvp);
3674		if (error != 0)
3675			return (error);
3676		if (ret != 0)
3677			return (ret);
3678	}
3679	FREE_LOCK(&lk);
3680	return (0);
3681}
3682
3683/*
3684 * This routine is called when we are trying to synchronously flush a
3685 * file. This routine must eliminate any filesystem metadata dependencies
3686 * so that the syncing routine can succeed by pushing the dirty blocks
3687 * associated with the file. If any I/O errors occur, they are returned.
3688 */
3689int
3690softdep_sync_metadata(ap)
3691	struct vop_fsync_args /* {
3692		struct vnode *a_vp;
3693		struct ucred *a_cred;
3694		int a_waitfor;
3695		struct proc *a_p;
3696	} */ *ap;
3697{
3698	struct vnode *vp = ap->a_vp;
3699	struct pagedep *pagedep;
3700	struct allocdirect *adp;
3701	struct allocindir *aip;
3702	struct buf *bp, *nbp;
3703	struct worklist *wk;
3704	int i, error, waitfor;
3705
3706	/*
3707	 * Check whether this vnode is involved in a filesystem
3708	 * that is doing soft dependency processing.
3709	 */
3710	if (vp->v_type != VBLK) {
3711		if (!DOINGSOFTDEP(vp))
3712			return (0);
3713	} else
3714		if (vp->v_specmountpoint == NULL ||
3715		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
3716			return (0);
3717	/*
3718	 * Ensure that any direct block dependencies have been cleared.
3719	 */
3720	ACQUIRE_LOCK(&lk);
3721	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number)) != 0) {
3722		FREE_LOCK(&lk);
3723		return (error);
3724	}
3725	/*
3726	 * For most files, the only metadata dependencies are the
3727	 * cylinder group maps that allocate their inode or blocks.
3728	 * The block allocation dependencies can be found by traversing
3729	 * the dependency lists for any buffers that remain on their
3730	 * dirty buffer list. The inode allocation dependency will
3731	 * be resolved when the inode is updated with MNT_WAIT.
3732	 * This work is done in two passes. The first pass grabs most
3733	 * of the buffers and begins asynchronously writing them. The
3734	 * only way to wait for these asynchronous writes is to sleep
3735	 * on the filesystem vnode which may stay busy for a long time
3736	 * if the filesystem is active. So, instead, we make a second
3737	 * pass over the dependencies blocking on each write. In the
3738	 * usual case we will be blocking against a write that we
3739	 * initiated, so when it is done the dependency will have been
3740	 * resolved. Thus the second pass is expected to end quickly.
3741	 */
3742	waitfor = MNT_NOWAIT;
3743top:
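	/*
	 * If there is no dirty buffer to acquire, just wait for any writes
	 * already in progress to drain and we are done.
	 */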
3744	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
3745		while (vp->v_numoutput) {
3746			vp->v_flag |= VBWAIT;
3747			FREE_LOCK_INTERLOCKED(&lk);
3748			tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1,
3749				"sdsynm", 0);
3750			ACQUIRE_LOCK_INTERLOCKED(&lk);
3751		}
3752		FREE_LOCK(&lk);
3753		return (0);
3754	}
3755	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3756loop:
3757	/*
3758	 * As we hold the buffer locked, none of its dependencies
3759	 * will disappear.
3760	 */
3761	for (wk = LIST_FIRST(&bp->b_dep); wk;
3762	     wk = LIST_NEXT(wk, wk_list)) {
3763		switch (wk->wk_type) {
3764
3765		case D_ALLOCDIRECT:
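			/*
			 * The new block pointer cannot be committed until
			 * the cylinder group map that allocated the block
			 * has been written. Push that buffer, asynchronously
			 * on the first pass and synchronously on the second.
			 */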
3766			adp = WK_ALLOCDIRECT(wk);
3767			if (adp->ad_state & DEPCOMPLETE)
3768				break;
3769			nbp = adp->ad_buf;
3770			if (getdirtybuf(&nbp, waitfor) == 0)
3771				break;
3772			FREE_LOCK(&lk);
3773			if (waitfor == MNT_NOWAIT) {
3774				bawrite(nbp);
3775			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3776				bawrite(bp);
3777				return (error);
3778			}
3779			ACQUIRE_LOCK(&lk);
3780			break;
3781
3782		case D_ALLOCINDIR:
3783			aip = WK_ALLOCINDIR(wk);
3784			if (aip->ai_state & DEPCOMPLETE)
3785				break;
3786			nbp = aip->ai_buf;
3787			if (getdirtybuf(&nbp, waitfor) == 0)
3788				break;
3789			FREE_LOCK(&lk);
3790			if (waitfor == MNT_NOWAIT) {
3791				bawrite(nbp);
3792			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3793				bawrite(bp);
3794				return (error);
3795			}
3796			ACQUIRE_LOCK(&lk);
3797			break;
3798
3799		case D_INDIRDEP:
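			/*
			 * Push the bitmap buffers for any allocindirs that
			 * are still incomplete. Because the lock is dropped
			 * around each write, the dependency list may change
			 * underneath us, so the scan restarts from the head
			 * after every sleep or write.
			 */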
3800		restart:
3801			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
3802			     aip; aip = LIST_NEXT(aip, ai_next)) {
3803				if (aip->ai_state & DEPCOMPLETE)
3804					continue;
3805				nbp = aip->ai_buf;
3806				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
3807					goto restart;
3808				FREE_LOCK(&lk);
3809				if ((error = VOP_BWRITE(nbp)) != 0) {
3810					bawrite(bp);
3811					return (error);
3812				}
3813				ACQUIRE_LOCK(&lk);
3814				goto restart;
3815			}
3816			break;
3817
3818		case D_INODEDEP:
3819			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
3820			    WK_INODEDEP(wk)->id_ino)) != 0) {
3821				FREE_LOCK(&lk);
3822				bawrite(bp);
3823				return (error);
3824			}
3825			break;
3826
3827		case D_PAGEDEP:
3828			/*
3829			 * We are trying to sync a directory that may
3830			 * have dependencies on both its own metadata
3831			 * and/or dependencies on the inodes of any
3832			 * recently allocated files. We walk its diradd
3833			 * lists pushing out the associated inode.
3834			 */
3835			pagedep = WK_PAGEDEP(wk);
3836			for (i = 0; i < DAHASHSZ; i++) {
3837				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL)
3838					continue;
3839				if ((error = flush_pagedep_deps(vp,
3840				    pagedep->pd_mnt, &pagedep->pd_diraddhd[i])) != 0) {
3841					FREE_LOCK(&lk);
3842					bawrite(bp);
3843					return (error);
3844				}
3845			}
3846			break;
3847
3848		case D_MKDIR:
3849			/*
3850			 * This case should never happen if the vnode has
3851			 * been properly sync'ed. However, if this function
3852			 * is used at a place where the vnode has not yet
3853			 * been sync'ed, this dependency can show up. So,
3854			 * rather than panic, just flush it.
3855			 */
3856			nbp = WK_MKDIR(wk)->md_buf;
3857			if (getdirtybuf(&nbp, waitfor) == 0)
3858				break;
3859			FREE_LOCK(&lk);
3860			if (waitfor == MNT_NOWAIT) {
3861				bawrite(nbp);
3862			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3863				bawrite(bp);
3864				return (error);
3865			}
3866			ACQUIRE_LOCK(&lk);
3867			break;
3868
3869		case D_BMSAFEMAP:
3870			/*
3871			 * This case should never happen if the vnode has
3872			 * been properly sync'ed. However, if this function
3873			 * is used at a place where the vnode has not yet
3874			 * been sync'ed, this dependency can show up. So,
3875			 * rather than panic, just flush it.
3876			 */
3877			nbp = WK_BMSAFEMAP(wk)->sm_buf;
3878			if (getdirtybuf(&nbp, waitfor) == 0)
3879				break;
3880			FREE_LOCK(&lk);
3881			if (waitfor == MNT_NOWAIT) {
3882				bawrite(nbp);
3883			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3884				bawrite(bp);
3885				return (error);
3886			}
3887			ACQUIRE_LOCK(&lk);
3888			break;
3889
3890		default:
3891			panic("softdep_sync_metadata: Unknown type %s",
3892			    TYPENAME(wk->wk_type));
3893			/* NOTREACHED */
3894		}
3895	}
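	/*
	 * Acquire the next buffer on the dirty list (marking it busy)
	 * before starting the write of the current one, so that it cannot
	 * disappear from the list while the lock is dropped.
	 */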
3896	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
3897	nbp = TAILQ_NEXT(bp, b_vnbufs);
3898	FREE_LOCK(&lk);
3899	bawrite(bp);
3900	ACQUIRE_LOCK(&lk);
3901	if (nbp != NULL) {
3902		bp = nbp;
3903		goto loop;
3904	}
3905	/*
3906	 * We must wait for any I/O in progress to finish so that
3907	 * all potential buffers on the dirty list will be visible.
3908	 * Once they are all there, proceed with the second pass
3909	 * which will wait for the I/O as per above.
3910	 */
3911	while (vp->v_numoutput) {
3912		vp->v_flag |= VBWAIT;
3913		FREE_LOCK_INTERLOCKED(&lk);
3914		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "sdsynm", 0);
3915		ACQUIRE_LOCK_INTERLOCKED(&lk);
3916	}
3917	/*
3918	 * The brief unlock is to allow any pent up dependency
3919	 * processing to be done.
3920	 */
3921	if (waitfor == MNT_NOWAIT) {
3922		waitfor = MNT_WAIT;
3923		FREE_LOCK(&lk);
3924		ACQUIRE_LOCK(&lk);
3925		goto top;
3926	}
3927
3928	/*
3929	 * If we have managed to get rid of all the dirty buffers,
3930	 * then we are done. For certain directories and block
3931	 * devices, we may need to do further work.
3932	 */
3933	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
3934		FREE_LOCK(&lk);
3935		return (0);
3936	}
3937
3938	FREE_LOCK(&lk);
3939	/*
3940	 * If we are trying to sync a block device, some of its buffers may
3941	 * contain metadata that cannot be written until the contents of some
3942	 * partially written files have been written to disk. The only easy
3943	 * way to accomplish this is to sync the entire filesystem (luckily
3944	 * this happens rarely).
3945	 */
3946	if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
3947	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
3948	     ap->a_p)) != 0)
3949		return (error);
3950	return (0);
3951}
3952
3953/*
3954 * Flush the dependencies associated with an inodedep.
3955 * Called with splbio blocked.
3956 */
3957static int
3958flush_inodedep_deps(fs, ino)
3959	struct fs *fs;
3960	ino_t ino;
3961{
3962	struct inodedep *inodedep;
3963	struct allocdirect *adp;
3964	int error, waitfor;
3965	struct buf *bp;
3966
3967	/*
3968	 * This work is done in two passes. The first pass grabs most
3969	 * of the buffers and begins asynchronously writing them. The
3970	 * only way to wait for these asynchronous writes is to sleep
3971	 * on the filesystem vnode which may stay busy for a long time
3972	 * if the filesystem is active. So, instead, we make a second
3973	 * pass over the dependencies blocking on each write. In the
3974	 * usual case we will be blocking against a write that we
3975	 * initiated, so when it is done the dependency will have been
3976	 * resolved. Thus the second pass is expected to end quickly.
3977	 * We give a brief window at the top of the loop to allow
3978	 * any pending I/O to complete.
3979	 */
3980	for (waitfor = MNT_NOWAIT; ; ) {
3981		FREE_LOCK(&lk);
3982		ACQUIRE_LOCK(&lk);
3983		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
3984			return (0);
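		/*
		 * Push out the buffers (typically cylinder group maps) on
		 * which any unfinished allocdirects depend, first for the
		 * old dependency list and then for the new one.
		 */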
3985		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3986		     adp = TAILQ_NEXT(adp, ad_next)) {
3987			if (adp->ad_state & DEPCOMPLETE)
3988				continue;
3989			bp = adp->ad_buf;
3990			if (getdirtybuf(&bp, waitfor) == 0) {
3991				if (waitfor == MNT_NOWAIT)
3992					continue;
3993				break;
3994			}
3995			FREE_LOCK(&lk);
3996			if (waitfor == MNT_NOWAIT) {
3997				bawrite(bp);
3998			} else if ((error = VOP_BWRITE(bp)) != 0) {
3999				ACQUIRE_LOCK(&lk);
4000				return (error);
4001			}
4002			ACQUIRE_LOCK(&lk);
4003			break;
4004		}
4005		if (adp != NULL)
4006			continue;
4007		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
4008		     adp = TAILQ_NEXT(adp, ad_next)) {
4009			if (adp->ad_state & DEPCOMPLETE)
4010				continue;
4011			bp = adp->ad_buf;
4012			if (getdirtybuf(&bp, waitfor) == 0) {
4013				if (waitfor == MNT_NOWAIT)
4014					continue;
4015				break;
4016			}
4017			FREE_LOCK(&lk);
4018			if (waitfor == MNT_NOWAIT) {
4019				bawrite(bp);
4020			} else if ((error = VOP_BWRITE(bp)) != 0) {
4021				ACQUIRE_LOCK(&lk);
4022				return (error);
4023			}
4024			ACQUIRE_LOCK(&lk);
4025			break;
4026		}
4027		if (adp != NULL)
4028			continue;
4029		/*
4030		 * If this was pass 2, we are done; otherwise do pass 2.
4031		 */
4032		if (waitfor == MNT_WAIT)
4033			break;
4034		waitfor = MNT_WAIT;
4035	}
4036	/*
4037	 * Try freeing inodedep in case all dependencies have been removed.
4038	 */
4039	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4040		(void) free_inodedep(inodedep);
4041	return (0);
4042}
4043
4044/*
4045 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4046 * Called with splbio blocked.
4047 */
4048static int
4049flush_pagedep_deps(pvp, mp, diraddhdp)
4050	struct vnode *pvp;
4051	struct mount *mp;
4052	struct diraddhd *diraddhdp;
4053{
4054	struct proc *p = CURPROC;	/* XXX */
4055	struct inodedep *inodedep;
4056	struct ufsmount *ump;
4057	struct diradd *dap;
4058#ifndef __FreeBSD__
4059	struct timeval tv;
4060#endif
4061	struct vnode *vp;
4062	int gotit, error = 0;
4063	struct buf *bp;
4064	ino_t inum;
4065
4066	ump = VFSTOUFS(mp);
4067	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4068		/*
4069		 * Flush ourselves if this directory entry
4070		 * has a MKDIR_PARENT dependency.
4071		 */
4072		if (dap->da_state & MKDIR_PARENT) {
4073			FREE_LOCK(&lk);
4074#ifdef __FreeBSD__
4075			error = UFS_UPDATE(pvp, 1);
4076#else
4077			tv = time;
4078			error = UFS_UPDATE(pvp, &tv, &tv, 1);
4079#endif
4080			if (error)
4081				break;
4082			ACQUIRE_LOCK(&lk);
4083			/*
4084			 * If that cleared dependencies, go on to next.
4085			 */
4086			if (dap != LIST_FIRST(diraddhdp))
4087				continue;
4088			if (dap->da_state & MKDIR_PARENT)
4089				panic("flush_pagedep_deps: MKDIR");
4090		}
4091		/*
4092		 * Flush the file on which the directory entry depends.
4093		 * If the inode has already been pushed out of the cache,
4094		 * then all the block dependencies will have been flushed
4095		 * leaving only inode dependencies (e.g., bitmaps). Thus,
4096		 * we do a ufs_ihashget to check for the vnode in the cache.
4097		 * If it is there, we do a full flush. If it is no longer
4098		 * there we need only dispose of any remaining bitmap
4099		 * dependencies and write the inode to disk.
4100		 */
4101		inum = dap->da_newinum;
4102		FREE_LOCK(&lk);
4103		if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) {
4104			ACQUIRE_LOCK(&lk);
4105			if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0
4106			    && dap == LIST_FIRST(diraddhdp))
4107				panic("flush_pagedep_deps: flush 1 failed");
4108			/*
4109			 * If the inode still has bitmap dependencies,
4110			 * push them to disk.
4111			 */
4112			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4113				gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT);
4114				FREE_LOCK(&lk);
4115				if (gotit &&
4116				    (error = VOP_BWRITE(inodedep->id_buf)) != 0)
4117					break;
4118				ACQUIRE_LOCK(&lk);
4119			}
4120			if (dap != LIST_FIRST(diraddhdp))
4121				continue;
4122			/*
4123			 * If the inode is still sitting in a buffer waiting
4124			 * to be written, push it to disk.
4125			 */
4126			FREE_LOCK(&lk);
4127			if ((error = bread(ump->um_devvp,
4128			    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4129			    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4130				break;
4131			if ((error = VOP_BWRITE(bp)) != 0)
4132				break;
4133			ACQUIRE_LOCK(&lk);
4134			if (dap == LIST_FIRST(diraddhdp))
4135				panic("flush_pagedep_deps: flush 2 failed");
4136			continue;
4137		}
4138		if (vp->v_type == VDIR) {
4139			/*
4140			 * A newly allocated directory must have its "." and
4141			 * ".." entries written out before its name can be
4142			 * committed in its parent. We do not want or need
4143			 * the full semantics of a synchronous VOP_FSYNC as
4144			 * that may end up here again, once for each directory
4145			 * level in the filesystem. Instead, we push the blocks
4146			 * and wait for them to clear.
4147			 */
4148			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) != 0) {
4149				vput(vp);
4150				break;
4151			}
4152			ACQUIRE_LOCK(&lk);
4153			while (vp->v_numoutput) {
4154				vp->v_flag |= VBWAIT;
4155				FREE_LOCK_INTERLOCKED(&lk);
4156				tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1,
4157					"sdflpd", 0);
4158				ACQUIRE_LOCK_INTERLOCKED(&lk);
4159			}
4160			FREE_LOCK(&lk);
4161		}
4162#ifdef __FreeBSD__
4163		error = UFS_UPDATE(vp, 1);
4164#else
4165		tv = time;
4166		error = UFS_UPDATE(vp, &tv, &tv, 1);
4167#endif
4168		vput(vp);
4169		if (error)
4170			break;
4171		 * If we have failed to get rid of all the dependencies,
4172		 * If we have failed to get rid of all the dependencies
4173		 * then something is seriously wrong.
4174		 */
4175		if (dap == LIST_FIRST(diraddhdp))
4176			panic("flush_pagedep_deps: flush 3 failed");
4177		ACQUIRE_LOCK(&lk);
4178	}
4179	if (error)
4180		ACQUIRE_LOCK(&lk);
4181	return (error);
4182}
4183
4184/*
4185 * Acquire exclusive access to a buffer.
4186 * Must be called with splbio blocked.
4187 * Return 1 if buffer was acquired.
4188 */
4189static int
4190getdirtybuf(bpp, waitfor)
4191	struct buf **bpp;
4192	int waitfor;
4193{
4194	struct buf *bp;
4195
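	/*
	 * Wait, if requested, for the buffer to come free. Re-evaluate
	 * *bpp on each iteration, as the pointer we were handed may be
	 * cleared or changed while we sleep.
	 */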
4196	for (;;) {
4197		if ((bp = *bpp) == NULL)
4198			return (0);
4199		if ((bp->b_flags & B_BUSY) == 0)
4200			break;
4201		if (waitfor != MNT_WAIT)
4202			return (0);
4203		bp->b_flags |= B_WANTED;
4204		FREE_LOCK_INTERLOCKED(&lk);
4205		tsleep((caddr_t)bp, PRIBIO + 1, "sdsdty", 0);
4206		ACQUIRE_LOCK_INTERLOCKED(&lk);
4207	}
4208	if ((bp->b_flags & B_DELWRI) == 0)
4209		return (0);
4210	bremfree(bp);
4211	bp->b_flags |= B_BUSY;
4212	return (1);
4213}
4214
4215/*
4216 * Called whenever a buffer that is being invalidated or reallocated
4217 * contains dependencies. This should only happen if an I/O error has
4218 * occurred. The routine is called with the buffer locked.
4219 */
4220void
4221softdep_deallocate_dependencies(bp)
4222	struct buf *bp;
4223{
4224	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4225	panic("softdep_deallocate_dependencies: dangling deps");
4226}
4227
4228/*
4229 * Function to handle asynchronous write errors in the filesystem.
4230 */
4231void
4232softdep_error(func, error)
4233	char *func;
4234	int error;
4235{
4236	/* XXX should do something better! */
4237	printf("%s: got error %d while accessing filesystem\n", func, error);
4238}
4239