ffs_softdep.c revision 42354

/*
 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * The following are the copyrights and redistribution conditions that
 * apply to this copy of the soft update software. For a license
 * to use, redistribute or sell the soft update software under
 * conditions other than those described here, please contact the
 * author at one of the following addresses:
 *
 *	Marshall Kirk McKusick		mckusick@mckusick.com
 *	1614 Oxford Street		+1-510-843-9542
 *	Berkeley, CA 94709-1608
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. None of the names of McKusick, Ganger, Patt, or the University of
 *    Michigan may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 * 4. Redistributions in any form must be accompanied by information on
 *    how to obtain complete source code for any accompanying software
 *    that uses this software. This source code must either be included
 *    in the distribution or be available for no more than the cost of
 *    distribution plus a nominal fee, and must be freely redistributable
 *    under reasonable conditions. For an executable file, complete
 *    source code means the source code for all modules it contains.
 *    It does not mean source code for modules or files that typically
 *    accompany the operating system on which the executable file runs,
 *    e.g., standard library modules or system header files.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.28 (McKusick) 8/8/98
 *	$Id: ffs_softdep.c,v 1.18 1998/12/10 20:11:47 julian Exp $
 */

/*
 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
 */
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

/*
 * These definitions need to be adapted to the system to which
 * this file is being ported.
 */
/*
 * malloc types defined for the softdep system.
 */
MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");

#define	D_PAGEDEP	0
#define	D_INODEDEP	1
#define	D_NEWBLK	2
#define	D_BMSAFEMAP	3
#define	D_ALLOCDIRECT	4
#define	D_INDIRDEP	5
#define	D_ALLOCINDIR	6
#define	D_FREEFRAG	7
#define	D_FREEBLKS	8
#define	D_FREEFILE	9
#define	D_DIRADD	10
#define	D_MKDIR		11
#define	D_DIRREM	12
#define D_LAST		D_DIRREM

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_NEWBLK,
	M_BMSAFEMAP,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM
};

#define DtoM(type) (memtype[type])

/*
 * Names of malloc types.
 */
#define TYPENAME(type)  \
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
#define CURPROC curproc
/*
 * End system adaptation definitions.
 */

/*
 * Internal function prototypes.
 */
static	void softdep_error __P((char *, int));
static	int getdirtybuf __P((struct buf **, int));
static	int flush_pagedep_deps __P((struct vnode *, struct mount *,
	    struct diraddhd *));
static	int flush_inodedep_deps __P((struct fs *, ino_t));
static	int handle_written_filepage __P((struct pagedep *, struct buf *));
static  void diradd_inode_written __P((struct diradd *, struct inodedep *));
static	int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_allocdirect_partdone __P((struct allocdirect *));
static	void handle_allocindir_partdone __P((struct allocindir *));
static	void initiate_write_filepage __P((struct pagedep *, struct buf *));
static	void handle_written_mkdir __P((struct mkdir *, int));
static	void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_workitem_freefile __P((struct freefile *));
static	void handle_workitem_remove __P((struct dirrem *));
static	struct dirrem *newdirrem __P((struct buf *, struct inode *,
	    struct inode *, int));
static	void free_diradd __P((struct diradd *));
static	void free_allocindir __P((struct allocindir *, struct inodedep *));
static	int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	    long *));
static	void deallocate_dependencies __P((struct buf *, struct inodedep *));
static	void free_allocdirect __P((struct allocdirectlst *,
	    struct allocdirect *, int));
static	int free_inodedep __P((struct inodedep *));
static	void handle_workitem_freeblocks __P((struct freeblks *));
static	void merge_inode_lists __P((struct inodedep *));
static	void setup_allocindir_phase2 __P((struct buf *, struct inode *,
	    struct allocindir *));
static	struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
	    ufs_daddr_t));
static	void handle_workitem_freefrag __P((struct freefrag *));
static	struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
static	void allocdirect_merge __P((struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *));
static	struct bmsafemap *bmsafemap_lookup __P((struct buf *));
static	int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	    struct newblk **));
static	int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	    struct pagedep **));
static	void pause_timer __P((void *));
static	int checklimit __P((long *, int));
static	void add_to_worklist __P((struct worklist *));

/*
 * Exported softdep operations.
 */
struct bio_ops bioops = {
	softdep_disk_io_initiation,		/* io_start */
	softdep_disk_write_complete,		/* io_complete */
	softdep_deallocate_dependencies,	/* io_deallocate */
	softdep_fsync,				/* io_fsync */
	softdep_process_worklist,		/* io_sync */
};

/*
 * Locking primitives.
 *
 * For a uniprocessor, all we need to do is protect against disk
 * interrupts. For a multiprocessor, this lock would have to be
 * a mutex. A single mutex is used throughout this file, though
 * finer grain locking could be used if contention warranted it.
 *
 * For a multiprocessor, the sleep call would accept a lock and
 * release it after the sleep processing was complete. In a uniprocessor
 * implementation there is no such interlock, so we simply mark
 * the places where it needs to be done with the `interlocked' form
 * of the lock calls. Since the uniprocessor sleep already interlocks
 * the spl, there is nothing that really needs to be done.
 */
#ifndef /* NOT */ DEBUG
static struct lockit {
	int	lkt_spl;
} lk = { 0 };
#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
#define ACQUIRE_LOCK_INTERLOCKED(lk)
#define FREE_LOCK_INTERLOCKED(lk)

#else /* DEBUG */
static struct lockit {
	int	lkt_spl;
	pid_t	lkt_held;
} lk = { 0, -1 };
static int lockcnt;

static	void acquire_lock __P((struct lockit *));
static	void free_lock __P((struct lockit *));
static	void acquire_lock_interlocked __P((struct lockit *));
static	void free_lock_interlocked __P((struct lockit *));

#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
#define FREE_LOCK(lk)			free_lock(lk)
#define ACQUIRE_LOCK_INTERLOCKED(lk)	acquire_lock_interlocked(lk)
#define FREE_LOCK_INTERLOCKED(lk)	free_lock_interlocked(lk)

static void
acquire_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1)
		if (lk->lkt_held == CURPROC->p_pid)
			panic("softdep_lock: locking against myself");
		else
			panic("softdep_lock: lock held by %d", lk->lkt_held);
	lk->lkt_spl = splbio();
	lk->lkt_held = CURPROC->p_pid;
	lockcnt++;
}

static void
free_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock: lock not held");
	lk->lkt_held = -1;
	splx(lk->lkt_spl);
}

static void
acquire_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1)
		if (lk->lkt_held == CURPROC->p_pid)
			panic("softdep_lock_interlocked: locking against self");
		else
			panic("softdep_lock_interlocked: lock held by %d",
			    lk->lkt_held);
	lk->lkt_held = CURPROC->p_pid;
	lockcnt++;
}

static void
free_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock_interlocked: lock not held");
	lk->lkt_held = -1;
}
#endif /* DEBUG */

/*
 * Place holder for real semaphores.
 */
struct sema {
	int	value;
	pid_t	holder;
	char	*name;
	int	prio;
	int	timo;
};
static	void sema_init __P((struct sema *, char *, int, int));
static	int sema_get __P((struct sema *, struct lockit *));
static	void sema_release __P((struct sema *));

static void
sema_init(semap, name, prio, timo)
	struct sema *semap;
	char *name;
	int prio, timo;
{

	semap->holder = -1;
	semap->value = 0;
	semap->name = name;
	semap->prio = prio;
	semap->timo = timo;
}

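/*
 * sema_get records interest in the semaphore by bumping its value. If the
 * value was already non-zero, the caller sleeps (dropping any interlock
 * while asleep) and 0 is returned so that the caller knows to retry its
 * lookup. Otherwise the caller becomes the holder and 1 is returned.
 * sema_release wakes any sleepers and resets the value to zero.
 */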
static int
sema_get(semap, interlock)
	struct sema *semap;
	struct lockit *interlock;
{

	if (semap->value++ > 0) {
		if (interlock != NULL)
			FREE_LOCK_INTERLOCKED(interlock);
		tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
		if (interlock != NULL) {
			ACQUIRE_LOCK_INTERLOCKED(interlock);
			FREE_LOCK(interlock);
		}
		return (0);
	}
	semap->holder = CURPROC->p_pid;
	if (interlock != NULL)
		FREE_LOCK(interlock);
	return (1);
}

static void
sema_release(semap)
	struct sema *semap;
{

	if (semap->value <= 0 || semap->holder != CURPROC->p_pid)
		panic("sema_release: not held");
	if (--semap->value > 0) {
		semap->value = 0;
		wakeup(semap);
	}
	semap->holder = -1;
}

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))

#else /* DEBUG */
static	void worklist_insert __P((struct workhead *, struct worklist *));
static	void worklist_remove __P((struct worklist *));
static	void workitem_free __P((struct worklist *, int));

#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)

static void
worklist_insert(head, item)
	struct workhead *head;
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_insert: lock not held");
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: already on list");
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item)
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_remove: lock not held");
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: not on list");
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{

	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: still on list");
	if (item->wk_type != type)
		panic("workitem_free: type mismatch");
	FREE(item, DtoM(type));
}
#endif /* DEBUG */

/*
 * Workitem queue management
 */
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;
static int max_softdeps;	/* maximum number of structs before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int max_limit_hit;	/* number of times slowdown imposed */
static int rush_requests;	/* number of times I/O speeded up */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static pid_t filesys_syncer_pid;/* records pid of filesystem syncer process */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
#if defined(__FreeBSD__)
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, max_limit_hit, CTLFLAG_RW, &max_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, "");
#else /* !__FreeBSD__ */
struct ctldebug debug8 = { "max_softdeps", &max_softdeps };
struct ctldebug debug9 = { "tickdelay", &tickdelay };
struct ctldebug debug10 = { "max_limit_hit", &max_limit_hit };
struct ctldebug debug11 = { "rush_requests", &rush_requests };
#endif	/* !__FreeBSD__ */

#endif /* DEBUG */

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{
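	/*
	 * worklist_tail remembers the most recently appended item so that
	 * new work can be appended in constant time without walking the list.
	 */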
	static struct worklist *worklist_tail;

	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	if (LIST_FIRST(&softdep_workitem_pending) == NULL) {
		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	} else {
		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	}
	worklist_tail = wk;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that items are processed in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 */
int
softdep_process_worklist(matchmnt)
	struct mount *matchmnt;
{
	struct proc *p = CURPROC;
	struct worklist *wk;
	struct fs *matchfs;
	int matchcnt;

	/*
	 * Record the process identifier of our caller so that we can
	 * give this process preferential treatment in checklimit below.
	 */
	filesys_syncer_pid = p->p_pid;
	matchcnt = 0;
	matchfs = NULL;
	if (matchmnt != NULL)
		matchfs = VFSTOUFS(matchmnt)->um_fs;
	/*
	 * There is no danger of having multiple processes run this
	 * code. It is single threaded solely so that softdep_flushfiles
	 * (below) can get an accurate count of the number of items
	 * related to its mount point that are in the list.
	 */
	if (softdep_worklist_busy && matchmnt == NULL)
		return (-1);
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
		WORKLIST_REMOVE(wk);
		FREE_LOCK(&lk);
		switch (wk->wk_type) {

		case D_DIRREM:
			/* removal of a directory entry */
			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
				matchcnt += 1;
			handle_workitem_remove(WK_DIRREM(wk));
			break;

		case D_FREEBLKS:
			/* releasing blocks and/or fragments from a file */
			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freeblocks(WK_FREEBLKS(wk));
			break;

		case D_FREEFRAG:
			/* releasing a fragment when replaced as a file grows */
			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefrag(WK_FREEFRAG(wk));
			break;

		case D_FREEFILE:
			/* releasing an inode when its link count drops to 0 */
			if (WK_FREEFILE(wk)->fx_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefile(WK_FREEFILE(wk));
			break;

		default:
			panic("%s_process_worklist: Unknown type %s",
			    "softdep", TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		if (softdep_worklist_busy && matchmnt == NULL)
			return (-1);
		ACQUIRE_LOCK(&lk);
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushfiles(oldmnt, flags, p)
	struct mount *oldmnt;
	int flags;
	struct proc *p;
{
	struct vnode *devvp;
	int error, loopcnt;

	/*
	 * Await our turn to clear out the queue.
	 */
	while (softdep_worklist_busy)
		tsleep(&lbolt, PRIBIO, "softflush", 0);
	softdep_worklist_busy = 1;
	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
		softdep_worklist_busy = 0;
		return (error);
	}
	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	devvp = VFSTOUFS(oldmnt)->um_devvp;
	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
		if (softdep_process_worklist(oldmnt) == 0) {
			/*
			 * Do another flush in case any vnodes were brought in
			 * as part of the cleanup operations.
			 */
			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
				break;
			/*
			 * If we still found nothing to do, we are really done.
			 */
			if (softdep_process_worklist(oldmnt) == 0)
				break;
		}
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
		VOP_UNLOCK(devvp, 0, p);
		if (error)
			break;
	}
	softdep_worklist_busy = 0;
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	return (error);
}

/*
 * A large burst of file addition or deletion activity can drive the
 * memory load excessively high. Therefore we deliberately slow things
 * down and speed up the I/O processing if we find ourselves with too
 * many dependencies in progress.
 */
static int
checklimit(resource, islocked)
	long *resource;
	int islocked;
{
	struct proc *p = CURPROC;

	/*
	 * If we are under our limit, just proceed.
	 */
	if (*resource < max_softdeps)
		return (0);
	/*
	 * We never hold up the filesystem syncer process.
	 */
	if (p->p_pid == filesys_syncer_pid)
		return (0);
	/*
	 * Our first approach is to speed up the syncer process.
	 * We never push it to speed up more than half of its
	 * normal turn time, otherwise it could take over the cpu.
	 */
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		rush_requests += 1;
		return (0);
	}
	/*
	 * Every trick has failed, so we pause momentarily to let
	 * the filesystem syncer process catch up.
	 */
	if (islocked == 0)
		ACQUIRE_LOCK(&lk);
	if (proc_waiting == 0) {
		proc_waiting = 1;
		timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2);
	}
	FREE_LOCK_INTERLOCKED(&lk);
	(void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0);
	ACQUIRE_LOCK_INTERLOCKED(&lk);
	if (islocked == 0)
		FREE_LOCK(&lk);
	max_limit_hit += 1;
	return (1);
}

/*
 * Awaken processes pausing in checklimit and clear proc_waiting
 * to indicate that there is no longer a timer running.
 */
void
pause_timer(arg)
	void *arg;
{

	proc_waiting = 0;
	wakeup(&proc_waiting);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])
static struct sema pagedep_in_progress;

/*
 * Look up a pagedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(ip, lbn, flags, pagedeppp)
	struct inode *ip;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	struct mount *mp;
	int i;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("pagedep_lookup: lock not held");
#endif
	mp = ITOV(ip)->v_mount;
	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
top:
	for (pagedep = LIST_FIRST(pagedephd); pagedep;
	     pagedep = LIST_NEXT(pagedep, pd_hash))
		if (ip->i_number == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_mnt)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*pagedeppp = NULL;
		return (0);
	}
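	/*
	 * Another process may already be allocating an entry. If so, wait
	 * for it to finish and then rescan the hash chain, since it may
	 * have installed the pagedep we are looking for.
	 */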
	if (sema_get(&pagedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
		M_WAITOK);
	bzero(pagedep, sizeof(struct pagedep));
	pagedep->pd_list.wk_type = D_PAGEDEP;
	pagedep->pd_mnt = mp;
	pagedep->pd_ino = ip->i_number;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	sema_release(&pagedep_in_progress);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
static long	num_inodedep;	/* number of inodedep allocated */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;

/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(fs, inum, flags, inodedeppp)
	struct fs *fs;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	int firsttry;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("inodedep_lookup: lock not held");
#endif
	firsttry = 1;
	inodedephd = INODEDEP_HASH(fs, inum);
top:
	for (inodedep = LIST_FIRST(inodedephd); inodedep;
	     inodedep = LIST_NEXT(inodedep, id_hash))
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*inodedeppp = NULL;
		return (0);
	}
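	/*
	 * If too many inodedep structures are already in use, slow down
	 * by pausing in checklimit once, then rescan the hash chain in
	 * case the entry was created while we slept.
	 */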
	if (firsttry && checklimit(&num_inodedep, 1) == 1) {
		firsttry = 0;
		goto top;
	}
	if (sema_get(&inodedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	num_inodedep += 1;
	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
		M_INODEDEP, M_WAITOK);
	inodedep->id_list.wk_type = D_INODEDEP;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_buf = NULL;
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	sema_release(&inodedep_in_progress);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static struct sema newblk_in_progress;

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(fs, newblkno, flags, newblkpp)
	struct fs *fs;
	ufs_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(fs, newblkno);
top:
	for (newblk = LIST_FIRST(newblkhd); newblk;
	     newblk = LIST_NEXT(newblk, nb_hash))
		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
			break;
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*newblkpp = NULL;
		return (0);
	}
	if (sema_get(&newblk_in_progress, 0) == 0)
		goto top;
	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
		M_NEWBLK, M_WAITOK);
	newblk->nb_state = 0;
	newblk->nb_fs = fs;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	sema_release(&newblk_in_progress);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during filesystem initialization before
 * mounting any file systems.
 */
void
softdep_initialize()
{

	LIST_INIT(&mkdirlisthd);
	LIST_INIT(&softdep_workitem_pending);
	max_softdeps = desiredvnodes * 8;
	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	    &pagedep_hash);
	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
}

/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum cstotal;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	mp->mnt_flag &= ~MNT_ASYNC;
	mp->mnt_flag |= MNT_SOFTDEP;
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync, so we have
	 * to scan the cylinder groups and recalculate them.
	 */
	if (fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (!bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("ffs_mountfs: superblock updated for soft updates\n");
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a file system
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs file system maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not.  (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */

/*
 * Called just after updating the cylinder group block to allocate an inode.
 */
void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;		/* buffer for cylgroup block with inode map */
	struct inode *ip;	/* inode related to allocation */
	ino_t newinum;		/* new inode number being allocated */
{
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated inode.
	 * Panic if it already exists as something is seriously wrong.
	 * Otherwise add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
		panic("softdep_setup_inomapdep: found inode");
	inodedep->id_buf = bp;
	inodedep->id_state &= ~DEPCOMPLETE;
	bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

/*
 * Called just after updating the cylinder group block to
 * allocate block or fragment.
 */
void
softdep_setup_blkmapdep(bp, fs, newblkno)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct fs *fs;		/* filesystem doing allocation */
	ufs_daddr_t newblkno;	/* number of newly allocated block */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
		panic("softdep_setup_blkmapdep: found block");
	ACQUIRE_LOCK(&lk);
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	FREE_LOCK(&lk);
}

/*
 * Find the bmsafemap associated with a cylinder group buffer.
 * If none exists, create one. The buffer must be locked when
 * this routine is called and this routine must be called with
 * splbio interrupts blocked.
 */
static struct bmsafemap *
bmsafemap_lookup(bp)
	struct buf *bp;
{
	struct bmsafemap *bmsafemap;
	struct worklist *wk;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("bmsafemap_lookup: lock not held");
#endif
	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list))
		if (wk->wk_type == D_BMSAFEMAP)
			return (WK_BMSAFEMAP(wk));
	FREE_LOCK(&lk);
	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
		M_BMSAFEMAP, M_WAITOK);
	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	bmsafemap->sm_list.wk_state = 0;
	bmsafemap->sm_buf = bp;
	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	LIST_INIT(&bmsafemap->sm_allocindirhd);
	LIST_INIT(&bmsafemap->sm_inodedephd);
	LIST_INIT(&bmsafemap->sm_newblkhd);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	return (bmsafemap);
}

/*
 * Direct block allocation dependencies.
 *
 * When a new block is allocated, the corresponding disk locations must be
 * initialized (with zeros or new data) before the on-disk inode points to
 * them.  Also, the freemap from which the block was allocated must be
 * updated (on disk) before the inode's pointer. These two dependencies are
 * independent of each other and are needed for all file blocks and indirect
 * blocks that are pointed to directly by the inode.  Just before the
 * "in-core" version of the inode is updated with a newly allocated block
 * number, a procedure (below) is called to setup allocation dependency
 * structures.  These structures are removed when the corresponding
 * dependencies are satisfied or when the block allocation becomes obsolete
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
 * fragment that gets upgraded).  All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete).  The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains.  This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_WAITOK);
	bzero(adp, sizeof(struct allocdirect));
	adp->ad_list.wk_type = D_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	adp->ad_inodedep = inodedep;

	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	for (oldadp = TAILQ_FIRST(adphead); oldadp;
	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("allocdirect_merge: lock not held");
#endif
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("allocdirect_check: old %d != new %d || lbn %ld >= %d",
		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
		    NDADDR);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_allocdirect. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		freefrag = newadp->ad_freefrag;
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	free_allocdirect(adphead, oldadp, 0);
}

/*
 * Allocate a new freefrag structure if needed.
 */
static struct freefrag *
newfreefrag(ip, blkno, size)
	struct inode *ip;
	ufs_daddr_t blkno;
	long size;
{
	struct freefrag *freefrag;
	struct fs *fs;

	if (blkno == 0)
		return (NULL);
	fs = ip->i_fs;
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
		M_FREEFRAG, M_WAITOK);
	freefrag->ff_list.wk_type = D_FREEFRAG;
	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_fs = fs;
	freefrag->ff_devvp = ip->i_devvp;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;
	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct inode tip;

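	/*
	 * Build a throw-away in-core inode carrying just the fields needed
	 * for the ffs_blkfree call, then release the fragment and the
	 * workitem itself.
	 */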
1332	tip.i_fs = freefrag->ff_fs;
1333	tip.i_devvp = freefrag->ff_devvp;
1334	tip.i_dev = freefrag->ff_devvp->v_rdev;
1335	tip.i_number = freefrag->ff_inum;
1336	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
1337	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1338	FREE(freefrag, M_FREEFRAG);
1339}
1340
1341/*
1342 * Indirect block allocation dependencies.
1343 *
1344 * The same dependencies that exist for a direct block also exist when
1345 * a new block is allocated and pointed to by an entry in a block of
1346 * indirect pointers. The undo/redo states described above are also
1347 * used here. Because an indirect block contains many pointers that
1348 * may have dependencies, a second copy of the entire in-memory indirect
1349 * block is kept. The buffer cache copy is always completely up-to-date.
1350 * The second copy, which is used only as a source for disk writes,
1351 * contains only the safe pointers (i.e., those that have no remaining
1352 * update dependencies). The second copy is freed when all pointers
1353 * are safe. The cache is not allowed to replace indirect blocks with
1354 * pending update dependencies. If a buffer containing an indirect
1355 * block with dependencies is written, these routines will mark it
1356 * dirty again. It can only be successfully written once all the
1357 * dependencies are removed. The ffs_fsync routine in conjunction with
1358 * softdep_sync_metadata work together to get all the dependencies
1359 * removed so that a file can be successfully written to disk. Three
1360 * procedures are used when setting up indirect block pointer
1361 * dependencies. The division is necessary because of the organization
1362 * of the "balloc" routine and because of the distinction between file
1363 * pages and file metadata blocks.
1364 */
1365
1366/*
1367 * Allocate a new allocindir structure.
1368 */
1369static struct allocindir *
1370newallocindir(ip, ptrno, newblkno, oldblkno)
1371	struct inode *ip;	/* inode for file being extended */
1372	int ptrno;		/* offset of pointer in indirect block */
1373	ufs_daddr_t newblkno;	/* disk block number being added */
1374	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
1375{
1376	struct allocindir *aip;
1377
1378	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1379		M_ALLOCINDIR, M_WAITOK);
1380	bzero(aip, sizeof(struct allocindir));
1381	aip->ai_list.wk_type = D_ALLOCINDIR;
1382	aip->ai_state = ATTACHED;
1383	aip->ai_offset = ptrno;
1384	aip->ai_newblkno = newblkno;
1385	aip->ai_oldblkno = oldblkno;
1386	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1387	return (aip);
1388}
1389
1390/*
1391 * Called just before setting an indirect block pointer
1392 * to a newly allocated file page.
1393 */
1394void
1395softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1396	struct inode *ip;	/* inode for file being extended */
1397	ufs_lbn_t lbn;		/* allocated block number within file */
1398	struct buf *bp;		/* buffer with indirect blk referencing page */
1399	int ptrno;		/* offset of pointer in indirect block */
1400	ufs_daddr_t newblkno;	/* disk block number being added */
1401	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
1402	struct buf *nbp;	/* buffer holding allocated page */
1403{
1404	struct allocindir *aip;
1405	struct pagedep *pagedep;
1406
1407	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1408	ACQUIRE_LOCK(&lk);
1409	/*
1410	 * If we are allocating a directory page, then we must
1411	 * allocate an associated pagedep to track additions and
1412	 * deletions.
1413	 */
1414	if ((ip->i_mode & IFMT) == IFDIR &&
1415	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1416		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1417	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1418	FREE_LOCK(&lk);
1419	setup_allocindir_phase2(bp, ip, aip);
1420}
1421
1422/*
1423 * Called just before setting an indirect block pointer to a
1424 * newly allocated indirect block.
1425 */
1426void
1427softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1428	struct buf *nbp;	/* newly allocated indirect block */
1429	struct inode *ip;	/* inode for file being extended */
1430	struct buf *bp;		/* indirect block referencing allocated block */
1431	int ptrno;		/* offset of pointer in indirect block */
1432	ufs_daddr_t newblkno;	/* disk block number being added */
1433{
1434	struct allocindir *aip;
1435
1436	aip = newallocindir(ip, ptrno, newblkno, 0);
1437	ACQUIRE_LOCK(&lk);
1438	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1439	FREE_LOCK(&lk);
1440	setup_allocindir_phase2(bp, ip, aip);
1441}
1442
1443/*
1444 * Called to finish the allocation of the "aip" allocated
1445 * by one of the two routines above.
1446 */
1447static void
1448setup_allocindir_phase2(bp, ip, aip)
1449	struct buf *bp;		/* in-memory copy of the indirect block */
1450	struct inode *ip;	/* inode for file being extended */
1451	struct allocindir *aip;	/* allocindir allocated by the above routines */
1452{
1453	struct worklist *wk;
1454	struct indirdep *indirdep, *newindirdep;
1455	struct bmsafemap *bmsafemap;
1456	struct allocindir *oldaip;
1457	struct freefrag *freefrag;
1458	struct newblk *newblk;
1459
1460	if (bp->b_lblkno >= 0)
1461		panic("setup_allocindir_phase2: not indir blk");
1462	for (indirdep = NULL, newindirdep = NULL; ; ) {
1463		ACQUIRE_LOCK(&lk);
1464		for (wk = LIST_FIRST(&bp->b_dep); wk;
1465		     wk = LIST_NEXT(wk, wk_list)) {
1466			if (wk->wk_type != D_INDIRDEP)
1467				continue;
1468			indirdep = WK_INDIRDEP(wk);
1469			break;
1470		}
1471		if (indirdep == NULL && newindirdep) {
1472			indirdep = newindirdep;
1473			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1474			newindirdep = NULL;
1475		}
1476		FREE_LOCK(&lk);
1477		if (indirdep) {
1478			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1479			    &newblk) == 0)
1480				panic("setup_allocindir: lost block");
1481			ACQUIRE_LOCK(&lk);
1482			if (newblk->nb_state == DEPCOMPLETE) {
1483				aip->ai_state |= DEPCOMPLETE;
1484				aip->ai_buf = NULL;
1485			} else {
1486				bmsafemap = newblk->nb_bmsafemap;
1487				aip->ai_buf = bmsafemap->sm_buf;
1488				LIST_REMOVE(newblk, nb_deps);
1489				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1490				    aip, ai_deps);
1491			}
1492			LIST_REMOVE(newblk, nb_hash);
1493			FREE(newblk, M_NEWBLK);
1494			aip->ai_indirdep = indirdep;
1495			/*
1496			 * Check to see if there is an existing dependency
1497			 * for this block. If there is, merge the old
1498			 * dependency into the new one.
1499			 */
1500			if (aip->ai_oldblkno == 0)
1501				oldaip = NULL;
1502			else
1503				for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd);
1504				    oldaip; oldaip = LIST_NEXT(oldaip, ai_next))
1505					if (oldaip->ai_offset == aip->ai_offset)
1506						break;
1507			if (oldaip != NULL) {
1508				if (oldaip->ai_newblkno != aip->ai_oldblkno)
1509					panic("setup_allocindir_phase2: blkno");
1510				aip->ai_oldblkno = oldaip->ai_oldblkno;
1511				freefrag = oldaip->ai_freefrag;
1512				oldaip->ai_freefrag = aip->ai_freefrag;
1513				aip->ai_freefrag = freefrag;
1514				free_allocindir(oldaip, NULL);
1515			}
1516			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1517			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1518			    [aip->ai_offset] = aip->ai_oldblkno;
1519			FREE_LOCK(&lk);
1520		}
1521		if (newindirdep) {
1522			if (indirdep->ir_savebp != NULL)
1523				brelse(newindirdep->ir_savebp);
1524			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1525		}
1526		if (indirdep)
1527			break;
1528		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1529			M_INDIRDEP, M_WAITOK);
1530		newindirdep->ir_list.wk_type = D_INDIRDEP;
1531		newindirdep->ir_state = ATTACHED;
1532		LIST_INIT(&newindirdep->ir_deplisthd);
1533		LIST_INIT(&newindirdep->ir_donehd);
1534#ifdef __FreeBSD__
1535		if (bp->b_blkno == bp->b_lblkno) {
1536#if 0 /* we know this happens.. research suggested.. */
1537			printf("setup_allocindir_phase2: need bmap, blk %d\n",
1538				bp->b_lblkno);
1539#endif
1540			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1541				NULL, NULL);
1542		}
1543#endif /* __FreeBSD__ */
1544		newindirdep->ir_savebp =
1545		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1546		bp->b_flags |= B_XXX;
1547		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1548	}
1549}
1550
1551/*
1552 * Block de-allocation dependencies.
1553 *
1554 * When blocks are de-allocated, the on-disk pointers must be nullified before
1555 * the blocks are made available for use by other files.  (The true
1556 * requirement is that old pointers must be nullified before new on-disk
1557 * pointers are set.  We chose this slightly more stringent requirement to
1558 * reduce complexity.) Our implementation handles this dependency by updating
1559 * the inode (or indirect block) appropriately but delaying the actual block
1560 * de-allocation (i.e., freemap and free space count manipulation) until
1561 * after the updated versions reach stable storage.  After the disk is
1562 * updated, the blocks can be safely de-allocated whenever it is convenient.
1563 * This implementation handles only the common case of reducing a file's
1564 * length to zero. Other cases are handled by the conventional synchronous
1565 * write approach.
1566 *
1567 * The ffs implementation with which we worked double-checks
1568 * the state of the block pointers and file size as it reduces
1569 * a file's length.  Some of this code is replicated here in our
1570 * soft updates implementation.  The freeblks->fb_chkcnt field is
1571 * used to transfer a part of this information to the procedure
1572 * that eventually de-allocates the blocks.
1573 *
1574 * This routine should be called from the routine that shortens
1575 * a file's length, before the inode's size or block pointers
1576 * are modified. It will save the block pointer information for
1577 * later release and zero the inode so that the calling routine
1578 * can release it.
1579 */
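/*
 * Illustrative sketch, not part of the original code: under soft updates,
 * the routine that shortens a file to zero length (typically ffs_truncate)
 * is expected to do roughly
 *
 *	softdep_setup_freeblocks(ip, (off_t)0);
 *	(then release the now zero'ed inode as usual)
 *
 * i.e., this routine must run before the inode's size or block pointers
 * are modified, as described above.
 */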
1580static long num_freeblks;	/* number of freeblks allocated */
1581void
1582softdep_setup_freeblocks(ip, length)
1583	struct inode *ip;	/* The inode whose length is to be reduced */
1584	off_t length;		/* The new length for the file */
1585{
1586	struct freeblks *freeblks;
1587	struct inodedep *inodedep;
1588	struct allocdirect *adp;
1589	struct vnode *vp;
1590	struct buf *bp;
1591	struct fs *fs;
1592	int i, error;
1593
1594	fs = ip->i_fs;
1595	if (length != 0)
1596		panic("softdep_setup_freeblocks: non-zero length");
1597	(void) checklimit(&num_freeblks, 0);
1598	num_freeblks += 1;
1599	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1600		M_FREEBLKS, M_WAITOK);
1601	bzero(freeblks, sizeof(struct freeblks));
1602	freeblks->fb_list.wk_type = D_FREEBLKS;
1603	freeblks->fb_uid = ip->i_uid;
1604	freeblks->fb_previousinum = ip->i_number;
1605	freeblks->fb_devvp = ip->i_devvp;
1606	freeblks->fb_fs = fs;
1607	freeblks->fb_oldsize = ip->i_size;
1608	freeblks->fb_newsize = length;
1609	freeblks->fb_chkcnt = ip->i_blocks;
1610	for (i = 0; i < NDADDR; i++) {
1611		freeblks->fb_dblks[i] = ip->i_db[i];
1612		ip->i_db[i] = 0;
1613	}
1614	for (i = 0; i < NIADDR; i++) {
1615		freeblks->fb_iblks[i] = ip->i_ib[i];
1616		ip->i_ib[i] = 0;
1617	}
1618	ip->i_blocks = 0;
1619	ip->i_size = 0;
1620	/*
1621	 * Push the zero'ed inode to its disk buffer so that we are free
1622	 * to delete its dependencies below. Once the dependencies are gone
1623	 * the buffer can be safely released.
1624	 */
1625	if ((error = bread(ip->i_devvp,
1626	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1627	    (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1628		softdep_error("softdep_setup_freeblocks", error);
1629	*((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1630	    ip->i_din;
1631	/*
1632	 * Find and eliminate any inode dependencies.
1633	 */
1634	ACQUIRE_LOCK(&lk);
1635	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1636	if ((inodedep->id_state & IOSTARTED) != 0)
1637		panic("softdep_setup_freeblocks: inode busy");
1638	/*
1639	 * Add the freeblks structure to the list of operations that
1640	 * must await the zero'ed inode being written to disk.
1641	 */
1642	WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1643	/*
1644	 * Because the file length has been truncated to zero, any
1645	 * pending block allocation dependency structures associated
1646	 * with this inode are obsolete and can simply be de-allocated.
1647	 * We must first merge the two dependency lists to get rid of
1648	 * any duplicate freefrag structures, then purge the merged list.
1649	 */
1650	merge_inode_lists(inodedep);
1651	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1652		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
1653	bdwrite(bp);
1654	/*
1655	 * We must wait for any I/O in progress to finish so that
1656	 * all potential buffers on the dirty list will be visible.
1657	 * Once they are all there, walk the list and get rid of
1658	 * any dependencies.
1659	 */
1660	vp = ITOV(ip);
1661	while (vp->v_numoutput) {
1662		vp->v_flag |= VBWAIT;
1663		FREE_LOCK_INTERLOCKED(&lk);
1664		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "sdsetf", 0);
1665		ACQUIRE_LOCK_INTERLOCKED(&lk);
1666	}
1667	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1668		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1669		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1670		deallocate_dependencies(bp, inodedep);
1671		bp->b_flags |= B_INVAL | B_NOCACHE;
1672		brelse(bp);
1673	}
1674	/*
1675	 * Try freeing the inodedep in case that was the last dependency.
1676	 */
1677	if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0)
1678		(void) free_inodedep(inodedep);
1679	FREE_LOCK(&lk);
1680}
1681
1682/*
1683 * Reclaim any dependency structures from a buffer that is about to
1684 * be reallocated to a new vnode. The buffer must be locked, thus,
1685 * no I/O completion operations can occur while we are manipulating
1686 * its associated dependencies. The mutex is held so that other I/O's
1687 * associated with related dependencies do not occur.
1688 */
1689static void
1690deallocate_dependencies(bp, inodedep)
1691	struct buf *bp;
1692	struct inodedep *inodedep;
1693{
1694	struct worklist *wk;
1695	struct indirdep *indirdep;
1696	struct allocindir *aip;
1697	struct pagedep *pagedep;
1698	struct dirrem *dirrem;
1699	struct diradd *dap;
1700	int i;
1701
1702	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1703		switch (wk->wk_type) {
1704
1705		case D_INDIRDEP:
1706			indirdep = WK_INDIRDEP(wk);
1707			/*
1708			 * None of the indirect pointers will ever be visible,
1709			 * so they can simply be tossed. GOINGAWAY ensures
1710			 * that allocated pointers will be saved in the buffer
1711			 * cache until they are freed. Note that they will
1712			 * only be able to be found by their physical address
1713			 * since the inode mapping the logical address will
1714			 * be gone. The save buffer used for the safe copy
1715			 * was allocated in setup_allocindir_phase2 using
1716			 * the physical address so it could be used for this
1717			 * purpose. Hence we swap the safe copy with the real
1718			 * copy, allowing the safe copy to be freed and holding
1719			 * on to the real copy for later use in indir_trunc.
1720			 */
1721			if (indirdep->ir_state & GOINGAWAY)
1722				panic("deallocate_dependencies: already gone");
1723			indirdep->ir_state |= GOINGAWAY;
1724			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1725				free_allocindir(aip, inodedep);
1726			if (bp->b_lblkno >= 0 ||
1727			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
1728				panic("deallocate_dependencies: not indir");
1729			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1730			    bp->b_bcount);
1731			WORKLIST_REMOVE(wk);
1732			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1733			continue;
1734
1735		case D_PAGEDEP:
1736			pagedep = WK_PAGEDEP(wk);
1737			/*
1738			 * None of the directory additions will ever be
1739			 * visible, so they can simply be tossed.
1740			 */
1741			for (i = 0; i < DAHASHSZ; i++)
1742				while (dap=LIST_FIRST(&pagedep->pd_diraddhd[i]))
1743					free_diradd(dap);
1744			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1745				free_diradd(dap);
1746			/*
1747			 * Copy any directory remove dependencies to the list
1748			 * to be processed after the zero'ed inode is written.
1749			 * If the inode has already been written, then they
1750			 * can be dumped directly onto the work list.
1751			 */
1752			for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem;
1753			     dirrem = LIST_NEXT(dirrem, dm_next)) {
1754				LIST_REMOVE(dirrem, dm_next);
1755				dirrem->dm_dirinum = pagedep->pd_ino;
1756				if (inodedep == NULL)
1757					add_to_worklist(&dirrem->dm_list);
1758				else
1759					WORKLIST_INSERT(&inodedep->id_bufwait,
1760					    &dirrem->dm_list);
1761			}
1762			WORKLIST_REMOVE(&pagedep->pd_list);
1763			LIST_REMOVE(pagedep, pd_hash);
1764			WORKITEM_FREE(pagedep, D_PAGEDEP);
1765			continue;
1766
1767		case D_ALLOCINDIR:
1768			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1769			continue;
1770
1771		case D_ALLOCDIRECT:
1772		case D_INODEDEP:
1773			panic("deallocate_dependencies: Unexpected type %s",
1774			    TYPENAME(wk->wk_type));
1775			/* NOTREACHED */
1776
1777		default:
1778			panic("deallocate_dependencies: Unknown type %s",
1779			    TYPENAME(wk->wk_type));
1780			/* NOTREACHED */
1781		}
1782	}
1783}
1784
1785/*
1786 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1787 * This routine must be called with splbio interrupts blocked.
1788 */
1789static void
1790free_allocdirect(adphead, adp, delay)
1791	struct allocdirectlst *adphead;
1792	struct allocdirect *adp;
1793	int delay;
1794{
1795
1796#ifdef DEBUG
1797	if (lk.lkt_held == -1)
1798		panic("free_allocdirect: lock not held");
1799#endif
1800	if ((adp->ad_state & DEPCOMPLETE) == 0)
1801		LIST_REMOVE(adp, ad_deps);
1802	TAILQ_REMOVE(adphead, adp, ad_next);
1803	if ((adp->ad_state & COMPLETE) == 0)
1804		WORKLIST_REMOVE(&adp->ad_list);
1805	if (adp->ad_freefrag != NULL) {
1806		if (delay)
1807			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1808			    &adp->ad_freefrag->ff_list);
1809		else
1810			add_to_worklist(&adp->ad_freefrag->ff_list);
1811	}
1812	WORKITEM_FREE(adp, D_ALLOCDIRECT);
1813}
1814
1815/*
1816 * Prepare an inode to be freed. The actual free operation is not
1817 * done until the zero'ed inode has been written to disk.
1818 */
1819static long num_freefile;	/* number of freefile allocated */
1820void
1821softdep_freefile(pvp, ino, mode)
1822		struct vnode *pvp;
1823		ino_t ino;
1824		int mode;
1825{
1826	struct inode *ip = VTOI(pvp);
1827	struct inodedep *inodedep;
1828	struct freefile *freefile;
1829
1830	/*
1831	 * This sets up the inode de-allocation dependency.
1832	 */
1833	(void) checklimit(&num_freefile, 0);
1834	num_freefile += 1;
1835	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1836		M_FREEFILE, M_WAITOK);
1837	freefile->fx_list.wk_type = D_FREEFILE;
1838	freefile->fx_list.wk_state = 0;
1839	freefile->fx_mode = mode;
1840	freefile->fx_oldinum = ino;
1841	freefile->fx_devvp = ip->i_devvp;
1842	freefile->fx_fs = ip->i_fs;
1843
1844	/*
1845	 * If the inodedep does not exist, then the zero'ed inode has
1846	 * been written to disk and we can free the file immediately.
1847	 */
1848	ACQUIRE_LOCK(&lk);
1849	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
1850		add_to_worklist(&freefile->fx_list);
1851		FREE_LOCK(&lk);
1852		return;
1853	}
1854
1855	/*
1856	 * If we still have a bitmap dependency, then the inode has never
1857	 * been written to disk. Drop the dependency as it is no longer
1858	 * necessary since the inode is being deallocated. We could process
1859	 * the freefile immediately, but then we would have to clear the
1860	 * id_inowait dependencies here and it is easier just to let the
1861	 * zero'ed inode be written and let them be cleaned up in the
1862	 * normal followup actions that follow the inode write.
1863	 */
1864	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
1865		inodedep->id_state |= DEPCOMPLETE;
1866		LIST_REMOVE(inodedep, id_deps);
1867		inodedep->id_buf = NULL;
1868	}
1869	/*
1870	 * If the inodedep has no dependencies associated with it,
1871	 * then we must free it here and free the file immediately.
1872	 * This case arises when an early allocation fails (for
1873	 * example, the user is over their file quota).
1874	 */
1875	if (free_inodedep(inodedep) == 0)
1876		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
1877	else
1878		add_to_worklist(&freefile->fx_list);
1879	FREE_LOCK(&lk);
1880}
1881
1882/*
1883 * Try to free an inodedep structure. Return 1 if it could be freed.
1884 */
1885static int
1886free_inodedep(inodedep)
1887	struct inodedep *inodedep;
1888{
1889
1890	if ((inodedep->id_state & ONWORKLIST) != 0 ||
1891	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
1892	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
1893	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
1894	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
1895	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
1896	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
1897	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
1898		return (0);
1899	LIST_REMOVE(inodedep, id_hash);
1900	WORKITEM_FREE(inodedep, D_INODEDEP);
1901	num_inodedep -= 1;
1902	return (1);
1903}
1904
1905/*
1906 * This workitem routine performs the block de-allocation.
1907 * The workitem is added to the pending list after the updated
1908 * inode block has been written to disk.  As mentioned above,
1909 * checks regarding the number of blocks de-allocated (compared
1910 * to the number of blocks allocated for the file) are also
1911 * performed in this function.
1912 */
1913static void
1914handle_workitem_freeblocks(freeblks)
1915	struct freeblks *freeblks;
1916{
1917	struct inode tip;
1918	ufs_daddr_t bn;
1919	struct fs *fs;
1920	int i, level, bsize;
1921	long nblocks, blocksreleased = 0;
1922	int error, allerror = 0;
1923	ufs_lbn_t baselbns[NIADDR], tmpval;
1924
1925	tip.i_number = freeblks->fb_previousinum;
1926	tip.i_devvp = freeblks->fb_devvp;
1927	tip.i_dev = freeblks->fb_devvp->v_rdev;
1928	tip.i_fs = freeblks->fb_fs;
1929	tip.i_size = freeblks->fb_oldsize;
1930	tip.i_uid = freeblks->fb_uid;
1931	fs = freeblks->fb_fs;
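	/*
	 * baselbns[i] is the first logical block number mapped through the
	 * level-i indirect block: NDADDR for the single indirect block,
	 * NDADDR + NINDIR(fs) for the double indirect block, and so on.
	 */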
1932	tmpval = 1;
1933	baselbns[0] = NDADDR;
1934	for (i = 1; i < NIADDR; i++) {
1935		tmpval *= NINDIR(fs);
1936		baselbns[i] = baselbns[i - 1] + tmpval;
1937	}
1938	nblocks = btodb(fs->fs_bsize);
1939	blocksreleased = 0;
1940	/*
1941	 * Indirect blocks first.
1942	 */
1943	for (level = (NIADDR - 1); level >= 0; level--) {
1944		if ((bn = freeblks->fb_iblks[level]) == 0)
1945			continue;
1946		if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
1947		    baselbns[level], &blocksreleased)) != 0)
1948			allerror = error;
1949		ffs_blkfree(&tip, bn, fs->fs_bsize);
1950		blocksreleased += nblocks;
1951	}
1952	/*
1953	 * All direct blocks or frags.
1954	 */
1955	for (i = (NDADDR - 1); i >= 0; i--) {
1956		if ((bn = freeblks->fb_dblks[i]) == 0)
1957			continue;
1958		bsize = blksize(fs, &tip, i);
1959		ffs_blkfree(&tip, bn, bsize);
1960		blocksreleased += btodb(bsize);
1961	}
1962
1963#ifdef DIAGNOSTIC
1964	if (freeblks->fb_chkcnt != blocksreleased)
1965		panic("handle_workitem_freeblocks: block count");
1966	if (allerror)
1967		softdep_error("handle_workitem_freeblks", allerror);
1968#endif /* DIAGNOSTIC */
1969	WORKITEM_FREE(freeblks, D_FREEBLKS);
1970	num_freeblks -= 1;
1971}
1972
1973/*
1974 * Release blocks associated with the inode ip and stored in the indirect
1975 * block dbn. If level is greater than SINGLE, the block is an indirect block
1976 * and recursive calls to indirtrunc must be used to cleanse other indirect
1977 * blocks.
1978 */
1979static int
1980indir_trunc(ip, dbn, level, lbn, countp)
1981	struct inode *ip;
1982	ufs_daddr_t dbn;
1983	int level;
1984	ufs_lbn_t lbn;
1985	long *countp;
1986{
1987	struct buf *bp;
1988	ufs_daddr_t *bap;
1989	ufs_daddr_t nb;
1990	struct fs *fs;
1991	struct worklist *wk;
1992	struct indirdep *indirdep;
1993	int i, lbnadd, nblocks;
1994	int error, allerror = 0;
1995
1996	fs = ip->i_fs;
1997	lbnadd = 1;
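	/*
	 * Compute lbnadd = NINDIR(fs)^level: each pointer in this indirect
	 * block maps that many logical blocks, so child i covers logical
	 * blocks starting at lbn + i * lbnadd.
	 */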
1998	for (i = level; i > 0; i--)
1999		lbnadd *= NINDIR(fs);
2000	/*
2001	 * Get buffer of block pointers to be freed. This routine is not
2002	 * called until the zero'ed inode has been written, so it is safe
2003	 * to free blocks as they are encountered. Because the inode has
2004	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2005	 * have to use the on-disk address and the block device for the
2006	 * filesystem to look them up. If the file was deleted before its
2007	 * indirect blocks were all written to disk, the routine that set
2008	 * us up (deallocate_dependencies) will have arranged to leave
2009	 * a complete copy of the indirect block in memory for our use.
2010	 * Otherwise we have to read the blocks in from the disk.
2011	 */
2012	ACQUIRE_LOCK(&lk);
2013	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2014	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2015		if (wk->wk_type != D_INDIRDEP ||
2016		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2017		    (indirdep->ir_state & GOINGAWAY) == 0)
2018			panic("indir_trunc: lost indirdep");
2019		WORKLIST_REMOVE(wk);
2020		WORKITEM_FREE(indirdep, D_INDIRDEP);
2021		if (LIST_FIRST(&bp->b_dep) != NULL)
2022			panic("indir_trunc: dangling dep");
2023		FREE_LOCK(&lk);
2024	} else {
2025		FREE_LOCK(&lk);
2026		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2027		if (error)
2028			return (error);
2029	}
2030	/*
2031	 * Recursively free indirect blocks.
2032	 */
2033	bap = (ufs_daddr_t *)bp->b_data;
2034	nblocks = btodb(fs->fs_bsize);
2035	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2036		if ((nb = bap[i]) == 0)
2037			continue;
2038		if (level != 0) {
2039			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2040			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2041				allerror = error;
2042		}
2043		ffs_blkfree(ip, nb, fs->fs_bsize);
2044		*countp += nblocks;
2045	}
2046	bp->b_flags |= B_INVAL | B_NOCACHE;
2047	bp->b_flags &= ~B_XXX;
2048	brelse(bp);
2049	return (allerror);
2050}
2051
2052/*
2053 * Free an allocindir.
2054 * This routine must be called with splbio interrupts blocked.
2055 */
2056static void
2057free_allocindir(aip, inodedep)
2058	struct allocindir *aip;
2059	struct inodedep *inodedep;
2060{
2061	struct freefrag *freefrag;
2062
2063#ifdef DEBUG
2064	if (lk.lkt_held == -1)
2065		panic("free_allocindir: lock not held");
2066#endif
2067	if ((aip->ai_state & DEPCOMPLETE) == 0)
2068		LIST_REMOVE(aip, ai_deps);
2069	if (aip->ai_state & ONWORKLIST)
2070		WORKLIST_REMOVE(&aip->ai_list);
2071	LIST_REMOVE(aip, ai_next);
2072	if ((freefrag = aip->ai_freefrag) != NULL) {
2073		if (inodedep == NULL)
2074			add_to_worklist(&freefrag->ff_list);
2075		else
2076			WORKLIST_INSERT(&inodedep->id_bufwait,
2077			    &freefrag->ff_list);
2078	}
2079	WORKITEM_FREE(aip, D_ALLOCINDIR);
2080}
2081
2082/*
2083 * Directory entry addition dependencies.
2084 *
2085 * When adding a new directory entry, the inode (with its incremented link
2086 * count) must be written to disk before the directory entry's pointer to it.
2087 * Also, if the inode is newly allocated, the corresponding freemap must be
2088 * updated (on disk) before the directory entry's pointer. These requirements
2089 * are met via undo/redo on the directory entry's pointer, which consists
2090 * simply of the inode number.
2091 *
2092 * As directory entries are added and deleted, the free space within a
2093 * directory block can become fragmented.  The ufs file system will compact
2094 * a fragmented directory block to make space for a new entry. When this
2095 * occurs, the offsets of previously added entries change. Any "diradd"
2096 * dependency structures corresponding to these entries must be updated with
2097 * the new offsets.
2098 */
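/*
 * Illustrative summary, not part of the original comments: the undo/redo on
 * a new entry's inode-number field works roughly as follows.  If the
 * directory block must be written before the new inode (or its bitmap) is
 * safely on disk, initiate_write_filepage() rolls the entry's d_ino back to
 * zero (or to the old inode number for a changed entry); when the write
 * completes, the true inode number is restored in handle_written_filepage()
 * and the block is redirtied so the correct value eventually reaches disk.
 */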
2099
2100/*
2101 * This routine is called after the in-memory inode's link
2102 * count has been incremented, but before the directory entry's
2103 * pointer to the inode has been set.
2104 */
2105void
2106softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2107	struct buf *bp;		/* buffer containing directory block */
2108	struct inode *dp;	/* inode for directory */
2109	off_t diroffset;	/* offset of new entry in directory */
2110	long newinum;		/* inode referenced by new directory entry */
2111	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2112{
2113	int offset;		/* offset of new entry within directory block */
2114	ufs_lbn_t lbn;		/* block in directory containing new entry */
2115	struct fs *fs;
2116	struct diradd *dap;
2117	struct pagedep *pagedep;
2118	struct inodedep *inodedep;
2119	struct mkdir *mkdir1, *mkdir2;
2120
2121	/*
2122	 * Whiteouts have no dependencies.
2123	 */
2124	if (newinum == WINO) {
2125		if (newdirbp != NULL)
2126			bdwrite(newdirbp);
2127		return;
2128	}
2129
2130	fs = dp->i_fs;
2131	lbn = lblkno(fs, diroffset);
2132	offset = blkoff(fs, diroffset);
2133	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
2134	bzero(dap, sizeof(struct diradd));
2135	dap->da_list.wk_type = D_DIRADD;
2136	dap->da_offset = offset;
2137	dap->da_newinum = newinum;
2138	dap->da_state = ATTACHED;
2139	if (newdirbp == NULL) {
2140		dap->da_state |= DEPCOMPLETE;
2141		ACQUIRE_LOCK(&lk);
2142	} else {
2143		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2144		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2145		    M_WAITOK);
2146		mkdir1->md_list.wk_type = D_MKDIR;
2147		mkdir1->md_state = MKDIR_BODY;
2148		mkdir1->md_diradd = dap;
2149		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2150		    M_WAITOK);
2151		mkdir2->md_list.wk_type = D_MKDIR;
2152		mkdir2->md_state = MKDIR_PARENT;
2153		mkdir2->md_diradd = dap;
2154		ACQUIRE_LOCK(&lk);
2155		/*
2156		 * Dependency on "." and ".." being written to disk.
2157		 */
2158		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2159		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2160		bdwrite(newdirbp);
2161		/*
2162		 * Dependency on link count increase for parent directory
2163		 */
2164		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2165		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2166			dap->da_state &= ~MKDIR_PARENT;
2167			WORKITEM_FREE(mkdir2, D_MKDIR);
2168		} else {
2169			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2170			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2171		}
2172	}
2173	/*
2174	 * Link into parent directory pagedep to await its being written.
2175	 */
2176	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2177		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2178	dap->da_pagedep = pagedep;
2179	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2180	    da_pdlist);
2181	/*
2182	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2183	 * is not yet written. If it is written, do the post-inode write
2184	 * processing to put it on the id_pendinghd list.
2185	 */
2186	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2187	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2188		diradd_inode_written(dap, inodedep);
2189	else
2190		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2191	FREE_LOCK(&lk);
2192}
2193
2194/*
2195 * This procedure is called to change the offset of a directory
2196 * entry when compacting a directory block which must be owned
2197 * exclusively by the caller. Note that the actual entry movement
2198 * must be done in this procedure to ensure that no I/O completions
2199 * occur while the move is in progress.
2200 */
2201void
2202softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2203	struct inode *dp;	/* inode for directory */
2204	caddr_t base;		/* address of dp->i_offset */
2205	caddr_t oldloc;		/* address of old directory location */
2206	caddr_t newloc;		/* address of new directory location */
2207	int entrysize;		/* size of directory entry */
2208{
2209	int offset, oldoffset, newoffset;
2210	struct pagedep *pagedep;
2211	struct diradd *dap;
2212	ufs_lbn_t lbn;
2213
2214	ACQUIRE_LOCK(&lk);
2215	lbn = lblkno(dp->i_fs, dp->i_offset);
2216	offset = blkoff(dp->i_fs, dp->i_offset);
2217	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2218		goto done;
2219	oldoffset = offset + (oldloc - base);
2220	newoffset = offset + (newloc - base);
2221	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
2222	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2223		if (dap->da_offset != oldoffset)
2224			continue;
2225		dap->da_offset = newoffset;
2226		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2227			break;
2228		LIST_REMOVE(dap, da_pdlist);
2229		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2230		    dap, da_pdlist);
2231		break;
2232	}
2233	if (dap == NULL) {
2234		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2235		     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2236			if (dap->da_offset == oldoffset) {
2237				dap->da_offset = newoffset;
2238				break;
2239			}
2240		}
2241	}
2242done:
2243	bcopy(oldloc, newloc, entrysize);
2244	FREE_LOCK(&lk);
2245}
2246
2247/*
2248 * Free a diradd dependency structure. This routine must be called
2249 * with splbio interrupts blocked.
2250 */
2251static void
2252free_diradd(dap)
2253	struct diradd *dap;
2254{
2255	struct dirrem *dirrem;
2256	struct pagedep *pagedep;
2257	struct inodedep *inodedep;
2258	struct mkdir *mkdir, *nextmd;
2259
2260#ifdef DEBUG
2261	if (lk.lkt_held == -1)
2262		panic("free_diradd: lock not held");
2263#endif
2264	WORKLIST_REMOVE(&dap->da_list);
2265	LIST_REMOVE(dap, da_pdlist);
2266	if ((dap->da_state & DIRCHG) == 0) {
2267		pagedep = dap->da_pagedep;
2268	} else {
2269		dirrem = dap->da_previous;
2270		pagedep = dirrem->dm_pagedep;
2271		dirrem->dm_dirinum = pagedep->pd_ino;
2272		add_to_worklist(&dirrem->dm_list);
2273	}
2274	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2275	    0, &inodedep) != 0)
2276		(void) free_inodedep(inodedep);
2277	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2278		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2279			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2280			if (mkdir->md_diradd != dap)
2281				continue;
2282			dap->da_state &= ~mkdir->md_state;
2283			WORKLIST_REMOVE(&mkdir->md_list);
2284			LIST_REMOVE(mkdir, md_mkdirs);
2285			WORKITEM_FREE(mkdir, D_MKDIR);
2286		}
2287		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
2288			panic("free_diradd: unfound ref");
2289	}
2290	WORKITEM_FREE(dap, D_DIRADD);
2291}
2292
2293/*
2294 * Directory entry removal dependencies.
2295 *
2296 * When removing a directory entry, the entry's inode pointer must be
2297 * zero'ed on disk before the corresponding inode's link count is decremented
2298 * (possibly freeing the inode for re-use). This dependency is handled by
2299 * updating the directory entry but delaying the inode count reduction until
2300 * after the directory block has been written to disk. After this point, the
2301 * inode count can be decremented whenever it is convenient.
2302 */
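/*
 * Illustrative ordering, not part of the original comments: for a plain
 * "rm file" the required sequence is (1) write the directory block with the
 * entry's inode pointer cleared, then (2) decrement the inode's link count
 * and, if it reaches zero, free the inode.  The dirrem work item created
 * by the routines below records step (2) and is deferred until step (1)
 * has reached the disk.
 */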
2303
2304/*
2305 * This routine should be called immediately after removing
2306 * a directory entry.  The inode's link count should not be
2307 * decremented by the calling procedure -- the soft updates
2308 * code will do this task when it is safe.
2309 */
2310void
2311softdep_setup_remove(bp, dp, ip, isrmdir)
2312	struct buf *bp;		/* buffer containing directory block */
2313	struct inode *dp;	/* inode for the directory being modified */
2314	struct inode *ip;	/* inode for directory entry being removed */
2315	int isrmdir;		/* indicates if doing RMDIR */
2316{
2317	struct dirrem *dirrem;
2318
2319	/*
2320	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2321	 */
2322	dirrem = newdirrem(bp, dp, ip, isrmdir);
2323	if ((dirrem->dm_state & COMPLETE) == 0) {
2324		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2325		    dm_next);
2326	} else {
2327		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2328		add_to_worklist(&dirrem->dm_list);
2329	}
2330	FREE_LOCK(&lk);
2331}
2332
2333/*
2334 * Allocate a new dirrem if appropriate and return it along with
2335 * its associated pagedep. Called without a lock, returns with lock.
2336 */
2337static struct dirrem *
2338newdirrem(bp, dp, ip, isrmdir)
2339	struct buf *bp;		/* buffer containing directory block */
2340	struct inode *dp;	/* inode for the directory being modified */
2341	struct inode *ip;	/* inode for directory entry being removed */
2342	int isrmdir;		/* indicates if doing RMDIR */
2343{
2344	int offset;
2345	ufs_lbn_t lbn;
2346	struct diradd *dap;
2347	struct dirrem *dirrem;
2348	struct pagedep *pagedep;
2349
2350	/*
2351	 * Whiteouts have no deletion dependencies.
2352	 */
2353	if (ip == NULL)
2354		panic("newdirrem: whiteout");
2355	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2356		M_DIRREM, M_WAITOK);
2357	bzero(dirrem, sizeof(struct dirrem));
2358	dirrem->dm_list.wk_type = D_DIRREM;
2359	dirrem->dm_state = isrmdir ? RMDIR : 0;
2360	dirrem->dm_mnt = ITOV(ip)->v_mount;
2361	dirrem->dm_oldinum = ip->i_number;
2362
2363	ACQUIRE_LOCK(&lk);
2364	lbn = lblkno(dp->i_fs, dp->i_offset);
2365	offset = blkoff(dp->i_fs, dp->i_offset);
2366	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2367		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2368	dirrem->dm_pagedep = pagedep;
2369	/*
2370	 * Check for a diradd dependency for the same directory entry.
2371	 * If present, then both dependencies become obsolete and can
2372	 * be de-allocated. Check for an entry on both the pd_diraddhd
2373	 * list and the pd_pendinghd list.
2374	 */
2375	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
2376	     dap; dap = LIST_NEXT(dap, da_pdlist))
2377		if (dap->da_offset == offset)
2378			break;
2379	if (dap == NULL) {
2380		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2381		     dap; dap = LIST_NEXT(dap, da_pdlist))
2382			if (dap->da_offset == offset)
2383				break;
2384		if (dap == NULL)
2385			return (dirrem);
2386	}
2387	/*
2388	 * Must be ATTACHED at this point, so just delete it.
2389	 */
2390	if ((dap->da_state & ATTACHED) == 0)
2391		panic("newdirrem: not ATTACHED");
2392	if (dap->da_newinum != ip->i_number)
2393		panic("newdirrem: inum %d should be %d",
2394		    ip->i_number, dap->da_newinum);
2395	free_diradd(dap);
2396	dirrem->dm_state |= COMPLETE;
2397	return (dirrem);
2398}
2399
2400/*
2401 * Directory entry change dependencies.
2402 *
2403 * Changing an existing directory entry requires that an add operation
2404 * be completed first followed by a deletion. The semantics for the addition
2405 * are identical to the description of adding a new entry above except
2406 * that the rollback is to the old inode number rather than zero. Once
2407 * the addition dependency is completed, the removal is done as described
2408 * in the removal routine above.
2409 */
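/*
 * Example, not part of the original comments: for an overwriting rename
 * ("mv new old" where "old" exists) the entry keeps its offset but its
 * inode pointer changes.  If the directory block must be written before the
 * new inode is safe, the pointer is rolled back to the previous inode number
 * (not to zero), so the on-disk directory keeps referring to the old, still
 * valid inode until the new one has been committed.  (If the previous inode
 * or entry was itself never written, the rollback is to zero instead; see
 * softdep_setup_directory_change() below.)
 */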
2410
2411/*
2412 * This routine should be called immediately after changing
2413 * a directory entry.  The inode's link count should not be
2414 * decremented by the calling procedure -- the soft updates
2415 * code will perform this task when it is safe.
2416 */
2417void
2418softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2419	struct buf *bp;		/* buffer containing directory block */
2420	struct inode *dp;	/* inode for the directory being modified */
2421	struct inode *ip;	/* inode for directory entry being removed */
2422	long newinum;		/* new inode number for changed entry */
2423	int isrmdir;		/* indicates if doing RMDIR */
2424{
2425	int offset;
2426	struct diradd *dap = NULL;
2427	struct dirrem *dirrem;
2428	struct pagedep *pagedep;
2429	struct inodedep *inodedep;
2430
2431	offset = blkoff(dp->i_fs, dp->i_offset);
2432
2433	/*
2434	 * Whiteouts do not need diradd dependencies.
2435	 */
2436	if (newinum != WINO) {
2437		MALLOC(dap, struct diradd *, sizeof(struct diradd),
2438		    M_DIRADD, M_WAITOK);
2439		bzero(dap, sizeof(struct diradd));
2440		dap->da_list.wk_type = D_DIRADD;
2441		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2442		dap->da_offset = offset;
2443		dap->da_newinum = newinum;
2444	}
2445
2446	/*
2447	 * Allocate a new dirrem and ACQUIRE_LOCK.
2448	 */
2449	dirrem = newdirrem(bp, dp, ip, isrmdir);
2450	pagedep = dirrem->dm_pagedep;
2451	/*
2452	 * The possible values for isrmdir:
2453	 *	0 - non-directory file rename
2454	 *	1 - directory rename within same directory
2455	 *   inum - directory rename to new directory of given inode number
2456	 * When renaming to a new directory, we are both deleting and
2457	 * creating a new directory entry, so the link count on the new
2458	 * directory should not change. Thus we do not need the followup
2459	 * dirrem which is usually done in handle_workitem_remove. We set
2460	 * the DIRCHG flag to tell handle_workitem_remove to skip the
2461	 * followup dirrem.
2462	 */
2463	if (isrmdir > 1)
2464		dirrem->dm_state |= DIRCHG;
2465
2466	/*
2467	 * Whiteouts have no additional dependencies,
2468	 * so just put the dirrem on the correct list.
2469	 */
2470	if (newinum == WINO) {
2471		if ((dirrem->dm_state & COMPLETE) == 0) {
2472			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2473			    dm_next);
2474		} else {
2475			dirrem->dm_dirinum = pagedep->pd_ino;
2476			add_to_worklist(&dirrem->dm_list);
2477		}
2478		FREE_LOCK(&lk);
2479		return;
2480	}
2481
2482	/*
2483	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2484	 * is not yet written. If it is written, do the post-inode write
2485	 * processing to put it on the id_pendinghd list.
2486	 */
2487	dap->da_previous = dirrem;
2488	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2489	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2490		dap->da_state |= COMPLETE;
2491		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2492		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2493	} else {
2494		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2495		    dap, da_pdlist);
2496		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2497	}
2498	/*
2499	 * If the previous inode was never written or its previous directory
2500	 * entry was never written, then we do not want to roll back to this
2501	 * previous value. Instead we want to roll back to zero and immediately
2502	 * free the unwritten or unreferenced inode.
2503	 */
2504	if (dirrem->dm_state & COMPLETE) {
2505		dap->da_state &= ~DIRCHG;
2506		dap->da_pagedep = pagedep;
2507		dirrem->dm_dirinum = pagedep->pd_ino;
2508		add_to_worklist(&dirrem->dm_list);
2509	}
2510	FREE_LOCK(&lk);
2511}
2512
2513/*
2514 * Called whenever the link count on an inode is increased.
2515 * It creates an inode dependency so that the new reference(s)
2516 * to the inode cannot be committed to disk until the updated
2517 * inode has been written.
2518 */
2519void
2520softdep_increase_linkcnt(ip)
2521	struct inode *ip;	/* the inode with the increased link count */
2522{
2523	struct inodedep *inodedep;
2524
2525	ACQUIRE_LOCK(&lk);
2526	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2527	FREE_LOCK(&lk);
2528}
2529
2530/*
2531 * This workitem decrements the inode's link count.
2532 * If the link count reaches zero, the file is removed.
2533 */
2534static void
2535handle_workitem_remove(dirrem)
2536	struct dirrem *dirrem;
2537{
2538	struct proc *p = CURPROC;	/* XXX */
2539	struct inodedep *inodedep;
2540	struct vnode *vp;
2541	struct inode *ip;
2542	int error;
2543
2544	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2545		softdep_error("handle_workitem_remove: vget", error);
2546		return;
2547	}
2548	ip = VTOI(vp);
2549	/*
2550	 * Normal file deletion.
2551	 */
2552	if ((dirrem->dm_state & RMDIR) == 0) {
2553		ip->i_nlink--;
2554		if (ip->i_nlink < ip->i_effnlink)
2555			panic("handle_workitem_remove: bad file delta");
2556		ip->i_flag |= IN_CHANGE;
2557		vput(vp);
2558		WORKITEM_FREE(dirrem, D_DIRREM);
2559		return;
2560	}
2561	/*
2562	 * Directory deletion. Decrement reference count for both the
2563	 * just deleted parent directory entry and the reference for ".".
2564	 * Next truncate the directory to length zero. When the
2565	 * truncation completes, arrange to have the reference count on
2566	 * the parent decremented to account for the loss of "..".
2567	 */
2568	ip->i_nlink -= 2;
2569	if (ip->i_nlink < ip->i_effnlink)
2570		panic("handle_workitem_remove: bad dir delta");
2571	ip->i_flag |= IN_CHANGE;
2572	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2573		softdep_error("handle_workitem_remove: truncate", error);
2574	/*
2575	 * Rename a directory to a new parent. Since we are both deleting
2576	 * and creating a new directory entry, the link count on the new
2577	 * directory should not change. Thus we skip the followup dirrem.
2578	 */
2579	if (dirrem->dm_state & DIRCHG) {
2580		vput(vp);
2581		WORKITEM_FREE(dirrem, D_DIRREM);
2582		return;
2583	}
2584	ACQUIRE_LOCK(&lk);
2585	(void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC,
2586	    &inodedep);
2587	dirrem->dm_state = 0;
2588	dirrem->dm_oldinum = dirrem->dm_dirinum;
2589	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2590	FREE_LOCK(&lk);
2591	vput(vp);
2592}
2593
2594/*
2595 * Inode de-allocation dependencies.
2596 *
2597 * When an inode's link count is reduced to zero, it can be de-allocated. We
2598 * found it convenient to postpone de-allocation until after the inode is
2599 * written to disk with its new link count (zero).  At this point, all of the
2600 * on-disk inode's block pointers are nullified and, with careful dependency
2601 * list ordering, all dependencies related to the inode will be satisfied and
2602 * the corresponding dependency structures de-allocated.  So, if/when the
2603 * inode is reused, there will be no mixing of old dependencies with new
2604 * ones.  This artificial dependency is set up by the block de-allocation
2605 * procedure above (softdep_setup_freeblocks) and completed by the
2606 * following procedure.
2607 */
2608static void
2609handle_workitem_freefile(freefile)
2610	struct freefile *freefile;
2611{
2612	struct vnode vp;
2613	struct inode tip;
2614	struct inodedep *idp;
2615	int error;
2616
2617#ifdef DEBUG
2618	ACQUIRE_LOCK(&lk);
2619	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
2620		panic("handle_workitem_freefile: inodedep survived");
2621	FREE_LOCK(&lk);
2622#endif
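	/*
	 * Construct a minimal stack-allocated vnode/inode pair carrying just
	 * the device and filesystem information that ffs_freefile() needs;
	 * the real in-core inode may no longer exist at this point.
	 */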
2623	tip.i_devvp = freefile->fx_devvp;
2624	tip.i_dev = freefile->fx_devvp->v_rdev;
2625	tip.i_fs = freefile->fx_fs;
2626	vp.v_data = &tip;
2627	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2628		softdep_error("handle_workitem_freefile", error);
2629	WORKITEM_FREE(freefile, D_FREEFILE);
2630	num_freefile -= 1;
2631}
2632
2633/*
2634 * Disk writes.
2635 *
2636 * The dependency structures constructed above are most actively used when file
2637 * system blocks are written to disk.  No constraints are placed on when a
2638 * block can be written, but unsatisfied update dependencies are made safe by
2639 * modifying (or replacing) the source memory for the duration of the disk
2640 * write.  When the disk write completes, the memory block is again brought
2641 * up-to-date.
2642 *
2643 * In-core inode structure reclamation.
2644 *
2645 * Because there are a finite number of "in-core" inode structures, they are
2646 * reused regularly.  By transferring all inode-related dependencies to the
2647 * in-memory inode block and indexing them separately (via "inodedep"s), we
2648 * can allow "in-core" inode structures to be reused at any time and avoid
2649 * any increase in contention.
2650 *
2651 * Called just before entering the device driver to initiate a new disk I/O.
2652 * The buffer must be locked, thus, no I/O completion operations can occur
2653 * while we are manipulating its associated dependencies.
2654 */
2655void
2656softdep_disk_io_initiation(bp)
2657	struct buf *bp;		/* structure describing disk write to occur */
2658{
2659	struct worklist *wk, *nextwk;
2660	struct indirdep *indirdep;
2661
2662	/*
2663	 * We only care about write operations. There should never
2664	 * be dependencies for reads.
2665	 */
2666	if (bp->b_flags & B_READ)
2667		panic("softdep_disk_io_initiation: read");
2668	/*
2669	 * Do any necessary pre-I/O processing.
2670	 */
2671	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2672		nextwk = LIST_NEXT(wk, wk_list);
2673		switch (wk->wk_type) {
2674
2675		case D_PAGEDEP:
2676			initiate_write_filepage(WK_PAGEDEP(wk), bp);
2677			continue;
2678
2679		case D_INODEDEP:
2680			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2681			continue;
2682
2683		case D_INDIRDEP:
2684			indirdep = WK_INDIRDEP(wk);
2685			if (indirdep->ir_state & GOINGAWAY)
2686				panic("disk_io_initiation: indirdep gone");
2687			/*
2688			 * If there are no remaining dependencies, this
2689			 * will be writing the real pointers, so the
2690			 * dependency can be freed.
2691			 */
2692			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2693				indirdep->ir_savebp->b_flags &= ~B_XXX;
2694				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2695				brelse(indirdep->ir_savebp);
2696				/* inline expand WORKLIST_REMOVE(wk); */
2697				wk->wk_state &= ~ONWORKLIST;
2698				LIST_REMOVE(wk, wk_list);
2699				WORKITEM_FREE(indirdep, D_INDIRDEP);
2700				continue;
2701			}
2702			/*
2703			 * Replace up-to-date version with safe version.
2704			 */
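			/*
			 * The real pointers saved in ir_saveddata are copied
			 * back by softdep_disk_write_complete() once this
			 * write has finished.
			 */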
2705			ACQUIRE_LOCK(&lk);
2706			indirdep->ir_state &= ~ATTACHED;
2707			indirdep->ir_state |= UNDONE;
2708			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2709			    M_INDIRDEP, M_WAITOK);
2710			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2711			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2712			    bp->b_bcount);
2713			FREE_LOCK(&lk);
2714			continue;
2715
2716		case D_MKDIR:
2717		case D_BMSAFEMAP:
2718		case D_ALLOCDIRECT:
2719		case D_ALLOCINDIR:
2720			continue;
2721
2722		default:
2723			panic("handle_disk_io_initiation: Unexpected type %s",
2724			    TYPENAME(wk->wk_type));
2725			/* NOTREACHED */
2726		}
2727	}
2728}
2729
2730/*
2731 * Called from within the procedure above to deal with unsatisfied
2732 * allocation dependencies in a directory. The buffer must be locked,
2733 * thus, no I/O completion operations can occur while we are
2734 * manipulating its associated dependencies.
2735 */
2736static void
2737initiate_write_filepage(pagedep, bp)
2738	struct pagedep *pagedep;
2739	struct buf *bp;
2740{
2741	struct diradd *dap;
2742	struct direct *ep;
2743	int i;
2744
2745	if (pagedep->pd_state & IOSTARTED) {
2746		/*
2747		 * This can only happen if there is a driver that does not
2748		 * understand chaining. Here biodone will reissue the call
2749		 * to strategy for the incomplete buffers.
2750		 */
2751		printf("initiate_write_filepage: already started\n");
2752		return;
2753	}
2754	pagedep->pd_state |= IOSTARTED;
2755	ACQUIRE_LOCK(&lk);
2756	for (i = 0; i < DAHASHSZ; i++) {
2757		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
2758		     dap = LIST_NEXT(dap, da_pdlist)) {
2759			ep = (struct direct *)
2760			    ((char *)bp->b_data + dap->da_offset);
2761			if (ep->d_ino != dap->da_newinum)
2762				panic("%s: dir inum %d != new %d",
2763				    "initiate_write_filepage",
2764				    ep->d_ino, dap->da_newinum);
2765			if (dap->da_state & DIRCHG)
2766				ep->d_ino = dap->da_previous->dm_oldinum;
2767			else
2768				ep->d_ino = 0;
2769			dap->da_state &= ~ATTACHED;
2770			dap->da_state |= UNDONE;
2771		}
2772	}
2773	FREE_LOCK(&lk);
2774}
2775
2776/*
2777 * Called from within the procedure above to deal with unsatisfied
2778 * allocation dependencies in an inodeblock. The buffer must be
2779 * locked, thus, no I/O completion operations can occur while we
2780 * are manipulating its associated dependencies.
2781 */
2782static void
2783initiate_write_inodeblock(inodedep, bp)
2784	struct inodedep *inodedep;
2785	struct buf *bp;			/* The inode block */
2786{
2787	struct allocdirect *adp, *lastadp;
2788	struct dinode *dp;
2789	struct fs *fs;
2790	ufs_lbn_t prevlbn = 0;
2791	int i, deplist;
2792
2793	if (inodedep->id_state & IOSTARTED)
2794		panic("initiate_write_inodeblock: already started");
2795	inodedep->id_state |= IOSTARTED;
2796	fs = inodedep->id_fs;
2797	dp = (struct dinode *)bp->b_data +
2798	    ino_to_fsbo(fs, inodedep->id_ino);
2799	/*
2800	 * If the bitmap is not yet written, then the allocated
2801	 * inode cannot be written to disk.
2802	 */
2803	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2804		if (inodedep->id_savedino != NULL)
2805			panic("initiate_write_inodeblock: already doing I/O");
2806		MALLOC(inodedep->id_savedino, struct dinode *,
2807		    sizeof(struct dinode), M_INODEDEP, M_WAITOK);
2808		*inodedep->id_savedino = *dp;
2809		bzero((caddr_t)dp, sizeof(struct dinode));
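		/*
		 * The saved copy is rolled back into place by
		 * handle_written_inodeblock() when this write completes.
		 */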
2810		return;
2811	}
2812	/*
2813	 * If no dependencies, then there is nothing to roll back.
2814	 */
2815	inodedep->id_savedsize = dp->di_size;
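	/*
	 * (id_savedsize lets handle_written_inodeblock() restore the true
	 * size after any fragment rollback performed below.)
	 */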
2816	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
2817		return;
2818	/*
2819	 * Set the dependencies to busy.
2820	 */
2821	ACQUIRE_LOCK(&lk);
2822	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2823	     adp = TAILQ_NEXT(adp, ad_next)) {
2824#ifdef DIAGNOSTIC
2825		if (deplist != 0 && prevlbn >= adp->ad_lbn)
2826			panic("softdep_write_inodeblock: lbn order");
2827		prevlbn = adp->ad_lbn;
2828		if (adp->ad_lbn < NDADDR &&
2829		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
2830			panic("%s: direct pointer #%ld mismatch %d != %d",
2831			    "softdep_write_inodeblock", adp->ad_lbn,
2832			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
2833		if (adp->ad_lbn >= NDADDR &&
2834		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
2835			panic("%s: indirect pointer #%ld mismatch %d != %d",
2836			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
2837			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
2838		deplist |= 1 << adp->ad_lbn;
2839		if ((adp->ad_state & ATTACHED) == 0)
2840			panic("softdep_write_inodeblock: Unknown state 0x%x",
2841			    adp->ad_state);
2842#endif /* DIAGNOSTIC */
2843		adp->ad_state &= ~ATTACHED;
2844		adp->ad_state |= UNDONE;
2845	}
2846	/*
2847	 * The on-disk inode cannot claim to be any larger than the last
2848	 * fragment that has been written. Otherwise, the on-disk inode
2849	 * might have fragments that were not the last block in the file,
2850	 * which would corrupt the filesystem.
2851	 */
2852	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2853	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
2854		if (adp->ad_lbn >= NDADDR)
2855			break;
2856		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
2857		/* keep going until hitting a rollback to a frag */
2858		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
2859			continue;
2860		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
2861		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
2862#ifdef DIAGNOSTIC
2863			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
2864				panic("softdep_write_inodeblock: lost dep1");
2865#endif /* DIAGNOSTIC */
2866			dp->di_db[i] = 0;
2867		}
2868		for (i = 0; i < NIADDR; i++) {
2869#ifdef DIAGNOSTIC
2870			if (dp->di_ib[i] != 0 &&
2871			    (deplist & ((1 << NDADDR) << i)) == 0)
2872				panic("softdep_write_inodeblock: lost dep2");
2873#endif /* DIAGNOSTIC */
2874			dp->di_ib[i] = 0;
2875		}
2876		FREE_LOCK(&lk);
2877		return;
2878	}
2879	/*
2880	 * If we have zero'ed out the last allocated block of the file,
2881	 * roll back the size to the last currently allocated block.
2882	 * We know that this last allocated block is full-sized, as
2883	 * we already checked for fragments in the loop above.
2884	 */
2885	if (lastadp != NULL &&
2886	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
2887		for (i = lastadp->ad_lbn; i >= 0; i--)
2888			if (dp->di_db[i] != 0)
2889				break;
2890		dp->di_size = (i + 1) * fs->fs_bsize;
2891	}
2892	/*
2893	 * The only dependencies are for indirect blocks.
2894	 *
2895	 * The file size for indirect block additions is not guaranteed.
2896	 * Such a guarantee would be non-trivial to achieve. The conventional
2897	 * synchronous write implementation also does not make this guarantee.
2898	 * Fsck should catch and fix discrepancies. Arguably, the file size
2899	 * can be over-estimated without destroying integrity when the file
2900	 * moves into the indirect blocks (i.e., is large). If we want to
2901	 * postpone fsck, we are stuck with this argument.
2902	 */
2903	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
2904		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
2905	FREE_LOCK(&lk);
2906}
2907
2908/*
2909 * This routine is called during the completion interrupt
2910 * service routine for a disk write (from the procedure called
2911 * by the device driver to inform the file system caches of
2912 * a request completion).  It should be called early in this
2913 * procedure, before the block is made available to other
2914 * processes or other routines are called.
2915 */
2916void
2917softdep_disk_write_complete(bp)
2918	struct buf *bp;		/* describes the completed disk write */
2919{
2920	struct worklist *wk;
2921	struct workhead reattach;
2922	struct newblk *newblk;
2923	struct allocindir *aip;
2924	struct allocdirect *adp;
2925	struct indirdep *indirdep;
2926	struct inodedep *inodedep;
2927	struct bmsafemap *bmsafemap;
2928
2929#ifdef DEBUG
2930	if (lk.lkt_held != -1)
2931		panic("softdep_disk_write_complete: lock is held");
2932	lk.lkt_held = -2;
2933#endif
2934	LIST_INIT(&reattach);
2935	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2936		WORKLIST_REMOVE(wk);
2937		switch (wk->wk_type) {
2938
2939		case D_PAGEDEP:
2940			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
2941				WORKLIST_INSERT(&reattach, wk);
2942			continue;
2943
2944		case D_INODEDEP:
2945			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
2946				WORKLIST_INSERT(&reattach, wk);
2947			continue;
2948
2949		case D_BMSAFEMAP:
2950			bmsafemap = WK_BMSAFEMAP(wk);
2951			while (newblk = LIST_FIRST(&bmsafemap->sm_newblkhd)) {
2952				newblk->nb_state |= DEPCOMPLETE;
2953				newblk->nb_bmsafemap = NULL;
2954				LIST_REMOVE(newblk, nb_deps);
2955			}
2956			while (adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd)) {
2957				adp->ad_state |= DEPCOMPLETE;
2958				adp->ad_buf = NULL;
2959				LIST_REMOVE(adp, ad_deps);
2960				handle_allocdirect_partdone(adp);
2961			}
2962			while (aip = LIST_FIRST(&bmsafemap->sm_allocindirhd)) {
2963				aip->ai_state |= DEPCOMPLETE;
2964				aip->ai_buf = NULL;
2965				LIST_REMOVE(aip, ai_deps);
2966				handle_allocindir_partdone(aip);
2967			}
2968			while ((inodedep =
2969			       LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
2970				inodedep->id_state |= DEPCOMPLETE;
2971				LIST_REMOVE(inodedep, id_deps);
2972				inodedep->id_buf = NULL;
2973			}
2974			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
2975			continue;
2976
2977		case D_MKDIR:
2978			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
2979			continue;
2980
2981		case D_ALLOCDIRECT:
2982			adp = WK_ALLOCDIRECT(wk);
2983			adp->ad_state |= COMPLETE;
2984			handle_allocdirect_partdone(adp);
2985			continue;
2986
2987		case D_ALLOCINDIR:
2988			aip = WK_ALLOCINDIR(wk);
2989			aip->ai_state |= COMPLETE;
2990			handle_allocindir_partdone(aip);
2991			continue;
2992
2993		case D_INDIRDEP:
2994			indirdep = WK_INDIRDEP(wk);
2995			if (indirdep->ir_state & GOINGAWAY)
2996				panic("disk_write_complete: indirdep gone");
2997			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
2998			FREE(indirdep->ir_saveddata, M_INDIRDEP);
2999			indirdep->ir_saveddata = 0;
3000			indirdep->ir_state &= ~UNDONE;
3001			indirdep->ir_state |= ATTACHED;
3002			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3003				LIST_REMOVE(aip, ai_next);
3004				handle_allocindir_partdone(aip);
3005			}
3006			WORKLIST_INSERT(&reattach, wk);
3007			bdirty(bp);
3008			continue;
3009
3010		default:
3011			panic("handle_disk_write_complete: Unknown type %s",
3012			    TYPENAME(wk->wk_type));
3013			/* NOTREACHED */
3014		}
3015	}
3016	/*
3017	 * Reattach any requests that must be redone.
3018	 */
3019	while ((wk = LIST_FIRST(&reattach)) != NULL) {
3020		WORKLIST_REMOVE(wk);
3021		WORKLIST_INSERT(&bp->b_dep, wk);
3022	}
3023#ifdef DEBUG
3024	if (lk.lkt_held != -2)
3025		panic("softdep_disk_write_complete: lock lost");
3026	lk.lkt_held = -1;
3027#endif
3028}
3029
3030/*
3031 * Called from within softdep_disk_write_complete above. Note that
3032 * this routine is always called from interrupt level with further
3033 * splbio interrupts blocked.
3034 */
3035static void
3036handle_allocdirect_partdone(adp)
3037	struct allocdirect *adp;	/* the completed allocdirect */
3038{
3039	struct allocdirect *listadp;
3040	struct inodedep *inodedep;
3041	long bsize;
3042
3043	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3044		return;
3045	if (adp->ad_buf != NULL)
3046		panic("handle_allocdirect_partdone: dangling dep");
3047	/*
3048	 * The on-disk inode cannot claim to be any larger than the last
3049	 * fragment that has been written. Otherwise, the on-disk inode
3050	 * might have fragments that were not the last block in the file,
3051	 * which would corrupt the filesystem. Thus, we cannot free any
3052	 * allocdirects after one whose ad_oldblkno claims a fragment as
3053	 * these blocks must be rolled back to zero before writing the inode.
3054	 * We check the currently active set of allocdirects in id_inoupdt.
3055	 */
3056	inodedep = adp->ad_inodedep;
3057	bsize = inodedep->id_fs->fs_bsize;
3058	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp;
3059	     listadp = TAILQ_NEXT(listadp, ad_next)) {
3060		/* found our block */
3061		if (listadp == adp)
3062			break;
3063		/* continue if ad_oldlbn is not a fragment */
3064		if (listadp->ad_oldsize == 0 ||
3065		    listadp->ad_oldsize == bsize)
3066			continue;
3067		/* hit a fragment */
3068		return;
3069	}
3070	/*
3071	 * If we have reached the end of the current list without
3072	 * finding the just finished dependency, then it must be
3073	 * on the future dependency list. Future dependencies cannot
3074	 * be freed until they are moved to the current list.
3075	 */
3076	if (listadp == NULL) {
3077#ifdef DEBUG
3078		for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp;
3079		     listadp = TAILQ_NEXT(listadp, ad_next))
3080			/* found our block */
3081			if (listadp == adp)
3082				break;
3083		if (listadp == NULL)
3084			panic("handle_allocdirect_partdone: lost dep");
3085#endif /* DEBUG */
3086		return;
3087	}
3088	/*
3089	 * If we have found the just finished dependency, then free
3090	 * it along with anything that follows it that is complete.
3091	 */
3092	for (; adp; adp = listadp) {
3093		listadp = TAILQ_NEXT(adp, ad_next);
3094		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3095			return;
3096		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3097	}
3098}
3099
3100/*
3101 * Called from within softdep_disk_write_complete above. Note that
3102 * this routine is always called from interrupt level with further
3103 * splbio interrupts blocked.
3104 */
3105static void
3106handle_allocindir_partdone(aip)
3107	struct allocindir *aip;		/* the completed allocindir */
3108{
3109	struct indirdep *indirdep;
3110
3111	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3112		return;
3113	if (aip->ai_buf != NULL)
3114		panic("handle_allocindir_partdone: dangling dependency");
3115	indirdep = aip->ai_indirdep;
3116	if (indirdep->ir_state & UNDONE) {
3117		LIST_REMOVE(aip, ai_next);
3118		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3119		return;
3120	}
3121	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3122	    aip->ai_newblkno;
3123	LIST_REMOVE(aip, ai_next);
3124	if (aip->ai_freefrag != NULL)
3125		add_to_worklist(&aip->ai_freefrag->ff_list);
3126	WORKITEM_FREE(aip, D_ALLOCINDIR);
3127}
3128
3129/*
3130 * Called from within softdep_disk_write_complete above to restore
3131 * in-memory inode block contents to their most up-to-date state. Note
3132 * that this routine is always called from interrupt level with further
3133 * splbio interrupts blocked.
3134 */
3135static int
3136handle_written_inodeblock(inodedep, bp)
3137	struct inodedep *inodedep;
3138	struct buf *bp;		/* buffer containing the inode block */
3139{
3140	struct worklist *wk, *filefree;
3141	struct allocdirect *adp, *nextadp;
3142	struct dinode *dp;
3143	int hadchanges;
3144
3145	if ((inodedep->id_state & IOSTARTED) == 0)
3146		panic("handle_written_inodeblock: not started");
3147	inodedep->id_state &= ~IOSTARTED;
3148	inodedep->id_state |= COMPLETE;
3149	dp = (struct dinode *)bp->b_data +
3150	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3151	/*
3152	 * If we had to rollback the inode allocation because of
3153	 * bitmaps being incomplete, then simply restore it.
3154	 * Keep the block dirty so that it will not be reclaimed until
3155	 * all associated dependencies have been cleared and the
3156	 * corresponding updates written to disk.
3157	 */
3158	if (inodedep->id_savedino != NULL) {
3159		*dp = *inodedep->id_savedino;
3160		FREE(inodedep->id_savedino, M_INODEDEP);
3161		inodedep->id_savedino = NULL;
3162		bdirty(bp);
3163		return (1);
3164	}
3165	/*
3166	 * Roll forward anything that had to be rolled back before
3167	 * the inode could be updated.
3168	 */
3169	hadchanges = 0;
3170	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3171		nextadp = TAILQ_NEXT(adp, ad_next);
3172		if (adp->ad_state & ATTACHED)
3173			panic("handle_written_inodeblock: new entry");
3174		if (adp->ad_lbn < NDADDR) {
3175			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
3176				panic("%s: %s #%ld mismatch %d != %d",
3177				    "handle_written_inodeblock",
3178				    "direct pointer", adp->ad_lbn,
3179				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3180			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3181		} else {
3182			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
3183				panic("%s: %s #%ld allocated as %d",
3184				    "handle_written_inodeblock",
3185				    "indirect pointer", adp->ad_lbn - NDADDR,
3186				    dp->di_ib[adp->ad_lbn - NDADDR]);
3187			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3188		}
3189		adp->ad_state &= ~UNDONE;
3190		adp->ad_state |= ATTACHED;
3191		hadchanges = 1;
3192	}
3193	/*
3194	 * Reset the file size to its most up-to-date value.
3195	 */
3196	if (inodedep->id_savedsize == -1)
3197		panic("handle_written_inodeblock: bad size");
3198	if (dp->di_size != inodedep->id_savedsize) {
3199		dp->di_size = inodedep->id_savedsize;
3200		hadchanges = 1;
3201	}
3202	inodedep->id_savedsize = -1;
3203	/*
3204	 * If there were any rollbacks in the inode block, then it must be
3205	 * marked dirty so that it will eventually get written back in
3206	 * its correct form.
3207	 */
3208	if (hadchanges)
3209		bdirty(bp);
3210	/*
3211	 * Process any allocdirects that completed during the update.
3212	 */
3213	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3214		handle_allocdirect_partdone(adp);
3215	/*
3216	 * Process deallocations that were held pending until the
3217	 * inode had been written to disk. Freeing of the inode
3218	 * is delayed until after all blocks have been freed to
3219	 * avoid creation of new <vfsid, inum, lbn> triples
3220	 * before the old ones have been deleted.
3221	 */
3222	filefree = NULL;
3223	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3224		WORKLIST_REMOVE(wk);
3225		switch (wk->wk_type) {
3226
3227		case D_FREEFILE:
3228			/*
3229			 * We defer adding filefree to the worklist until
3230			 * all other additions have been made to ensure
3231			 * that it will be done after all the old blocks
3232			 * have been freed.
3233			 */
3234			if (filefree != NULL)
3235				panic("handle_written_inodeblock: filefree");
3236			filefree = wk;
3237			continue;
3238
3239		case D_MKDIR:
3240			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3241			continue;
3242
3243		case D_DIRADD:
3244			diradd_inode_written(WK_DIRADD(wk), inodedep);
3245			continue;
3246
3247		case D_FREEBLKS:
3248		case D_FREEFRAG:
3249		case D_DIRREM:
3250			add_to_worklist(wk);
3251			continue;
3252
3253		default:
3254			panic("handle_written_inodeblock: Unknown type %s",
3255			    TYPENAME(wk->wk_type));
3256			/* NOTREACHED */
3257		}
3258	}
3259	if (filefree != NULL) {
3260		if (free_inodedep(inodedep) == 0)
3261			panic("handle_written_inodeblock: live inodedep");
3262		add_to_worklist(filefree);
3263		return (0);
3264	}
3265
3266	/*
3267	 * If no outstanding dependencies, free it.
3268	 */
3269	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3270		return (0);
3271	return (hadchanges);
3272}
3273
3274/*
3275 * Process a diradd entry after its dependent inode has been written.
3276 * This routine must be called with splbio interrupts blocked.
3277 */
3278static void
3279diradd_inode_written(dap, inodedep)
3280	struct diradd *dap;
3281	struct inodedep *inodedep;
3282{
3283	struct pagedep *pagedep;
3284
3285	dap->da_state |= COMPLETE;
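	/*
	 * The inode that the entry references is now written.  If no other
	 * dependencies remain, move the entry to the pagedep's pending
	 * list so that it is freed once the directory page is written.
	 */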
3286	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3287		if (dap->da_state & DIRCHG)
3288			pagedep = dap->da_previous->dm_pagedep;
3289		else
3290			pagedep = dap->da_pagedep;
3291		LIST_REMOVE(dap, da_pdlist);
3292		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3293	}
3294	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3295}
3296
3297/*
3298 * Handle the completion of a mkdir dependency.
3299 */
3300static void
3301handle_written_mkdir(mkdir, type)
3302	struct mkdir *mkdir;
3303	int type;
3304{
3305	struct diradd *dap;
3306	struct pagedep *pagedep;
3307
3308	if (mkdir->md_state != type)
3309		panic("handle_written_mkdir: bad type");
3310	dap = mkdir->md_diradd;
3311	dap->da_state &= ~type;
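	/*
	 * Once both the parent directory (MKDIR_PARENT) and the new
	 * directory's "." and ".." block (MKDIR_BODY) have been written,
	 * the diradd has no outstanding mkdir dependencies.
	 */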
3312	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3313		dap->da_state |= DEPCOMPLETE;
3314	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3315		if (dap->da_state & DIRCHG)
3316			pagedep = dap->da_previous->dm_pagedep;
3317		else
3318			pagedep = dap->da_pagedep;
3319		LIST_REMOVE(dap, da_pdlist);
3320		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3321	}
3322	LIST_REMOVE(mkdir, md_mkdirs);
3323	WORKITEM_FREE(mkdir, D_MKDIR);
3324}
3325
3326/*
3327 * Called from within softdep_disk_write_complete above.
3328 * A write operation was just completed. Removed inodes can
3329 * now be freed and associated block pointers may be committed.
3330 * Note that this routine is always called from interrupt level
3331 * with further splbio interrupts blocked.
3332 */
3333static int
3334handle_written_filepage(pagedep, bp)
3335	struct pagedep *pagedep;
3336	struct buf *bp;		/* buffer containing the written page */
3337{
3338	struct dirrem *dirrem;
3339	struct diradd *dap, *nextdap;
3340	struct direct *ep;
3341	int i, chgs;
3342
3343	if ((pagedep->pd_state & IOSTARTED) == 0)
3344		panic("handle_written_filepage: not started");
3345	pagedep->pd_state &= ~IOSTARTED;
3346	/*
3347	 * Process any directory removals that have been committed.
3348	 */
3349	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3350		LIST_REMOVE(dirrem, dm_next);
3351		dirrem->dm_dirinum = pagedep->pd_ino;
3352		add_to_worklist(&dirrem->dm_list);
3353	}
3354	/*
3355	 * Free any directory additions that have been committed.
3356	 */
3357	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3358		free_diradd(dap);
3359	/*
3360	 * Uncommitted directory entries must be restored.
3361	 */
3362	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3363		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3364		     dap = nextdap) {
3365			nextdap = LIST_NEXT(dap, da_pdlist);
3366			if (dap->da_state & ATTACHED)
3367				panic("handle_written_filepage: attached");
3368			ep = (struct direct *)
3369			    ((char *)bp->b_data + dap->da_offset);
3370			ep->d_ino = dap->da_newinum;
3371			dap->da_state &= ~UNDONE;
3372			dap->da_state |= ATTACHED;
3373			chgs = 1;
3374			/*
3375			 * If the inode referenced by the directory has
3376			 * been written out, then the dependency can be
3377			 * moved to the pending list.
3378			 */
3379			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3380				LIST_REMOVE(dap, da_pdlist);
3381				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3382				    da_pdlist);
3383			}
3384		}
3385	}
3386	/*
3387	 * If there were any rollbacks in the directory, then it must be
3388	 * marked dirty so that it will eventually get written back in
3389	 * its correct form.
3390	 */
3391	if (chgs)
3392		bdirty(bp);
3393	/*
3394	 * If no dependencies remain, the pagedep will be freed.
3395	 * Otherwise it will remain to update the page before it
3396	 * is written back to disk.
3397	 */
3398	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3399		for (i = 0; i < DAHASHSZ; i++)
3400			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3401				break;
3402		if (i == DAHASHSZ) {
3403			LIST_REMOVE(pagedep, pd_hash);
3404			WORKITEM_FREE(pagedep, D_PAGEDEP);
3405			return (0);
3406		}
3407	}
3408	return (1);
3409}
3410
3411/*
3412 * Writing back in-core inode structures.
3413 *
3414 * The file system only accesses an inode's contents when it occupies an
3415 * "in-core" inode structure.  These "in-core" structures are separate from
3416 * the page frames used to cache inode blocks.  Only the latter are
3417 * transferred to/from the disk.  So, when the updated contents of the
3418 * "in-core" inode structure are copied to the corresponding in-memory inode
3419 * block, the dependencies are also transferred.  The following procedure is
3420 * called when copying a dirty "in-core" inode to a cached inode block.
3421 */
3422
3423/*
3424 * Called when an inode is loaded from disk. If the effective link count
3425 * differed from the actual link count when it was last flushed, then we
3426 * need to ensure that the correct effective link count is put back.
3427 */
3428void
3429softdep_load_inodeblock(ip)
3430	struct inode *ip;	/* the "in_core" copy of the inode */
3431{
3432	struct inodedep *inodedep;
3433
3434	/*
3435	 * Check for alternate nlink count.
3436	 */
3437	ip->i_effnlink = ip->i_nlink;
3438	ACQUIRE_LOCK(&lk);
3439	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3440		FREE_LOCK(&lk);
3441		return;
3442	}
3443	if (inodedep->id_nlinkdelta != 0) {
3444		ip->i_effnlink -= inodedep->id_nlinkdelta;
3445		ip->i_flag |= IN_MODIFIED;
3446		inodedep->id_nlinkdelta = 0;
3447		(void) free_inodedep(inodedep);
3448	}
3449	FREE_LOCK(&lk);
3450}
3451
3452/*
3453 * This routine is called just before the "in-core" inode
3454 * information is to be copied to the in-memory inode block.
3455 * Recall that an inode block contains several inodes. If
3456 * the force flag is set, then the dependencies will be
3457 * cleared so that the update can always be made. Note that
3458 * the buffer is locked when this routine is called, so we
3459 * will never be in the middle of writing the inode block
3460 * to disk.
3461 */
3462void
3463softdep_update_inodeblock(ip, bp, waitfor)
3464	struct inode *ip;	/* the "in_core" copy of the inode */
3465	struct buf *bp;		/* the buffer containing the inode block */
3466	int waitfor;		/* nonzero => update must be allowed */
3467{
3468	struct inodedep *inodedep;
3469	struct worklist *wk;
3470	int error, gotit;
3471
3472	/*
3473	 * If the effective link count is not equal to the actual link
3474	 * count, then we must track the difference in an inodedep while
3475	 * the inode is (potentially) tossed out of the cache. Otherwise,
3476	 * if there is no existing inodedep, then there are no dependencies
3477	 * to track.
3478	 */
3479	ACQUIRE_LOCK(&lk);
3480	if (ip->i_effnlink != ip->i_nlink) {
3481		(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC,
3482		    &inodedep);
3483	} else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3484		FREE_LOCK(&lk);
3485		return;
3486	}
3487	if (ip->i_nlink < ip->i_effnlink)
3488		panic("softdep_update_inodeblock: bad delta");
3489	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3490	/*
3491	 * Changes have been initiated. Anything depending on these
3492	 * changes cannot occur until this inode has been written.
3493	 */
3494	inodedep->id_state &= ~COMPLETE;
3495	if ((inodedep->id_state & ONWORKLIST) == 0)
3496		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3497	/*
3498	 * Any new dependencies associated with the incore inode must
3499	 * now be moved to the list associated with the buffer holding
3500	 * the in-memory copy of the inode. Once merged, process any
3501	 * allocdirects that are completed by the merger.
3502	 */
3503	merge_inode_lists(inodedep);
3504	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3505		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3506	/*
3507	 * Now that the inode has been pushed into the buffer, the
3508	 * operations dependent on the inode being written to disk
3509	 * can be moved to the id_bufwait so that they will be
3510	 * processed when the buffer I/O completes.
3511	 */
3512	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3513		WORKLIST_REMOVE(wk);
3514		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3515	}
3516	/*
3517	 * Newly allocated inodes cannot be written until the bitmap
3518	 * that allocates them has been written (indicated by
3519	 * DEPCOMPLETE being set in id_state). If we are doing a
3520	 * forced sync (e.g., an fsync on a file), we force the bitmap
3521	 * to be written so that the update can be done.
3522	 */
3523	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3524		FREE_LOCK(&lk);
3525		return;
3526	}
3527	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3528	FREE_LOCK(&lk);
3529	if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0)
3530		softdep_error("softdep_update_inodeblock: bwrite", error);
3531	if ((inodedep->id_state & DEPCOMPLETE) == 0)
3532		panic("softdep_update_inodeblock: update failed");
3533}
3534
3535/*
3536 * Merge the new inode dependency list (id_newinoupdt) into the old
3537 * inode dependency list (id_inoupdt). This routine must be called
3538 * with splbio interrupts blocked.
3539 */
3540static void
3541merge_inode_lists(inodedep)
3542	struct inodedep *inodedep;
3543{
3544	struct allocdirect *listadp, *newadp;
3545
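	/*
	 * Both lists are kept sorted by logical block number.  Walk the
	 * current list, inserting each new dependency before the first
	 * entry with an equal or greater block number; dependencies for
	 * the same block are combined by allocdirect_merge.  Anything
	 * left on the new list is appended at the end.
	 */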
3546	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3547	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3548		if (listadp->ad_lbn < newadp->ad_lbn) {
3549			listadp = TAILQ_NEXT(listadp, ad_next);
3550			continue;
3551		}
3552		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3553		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3554		if (listadp->ad_lbn == newadp->ad_lbn) {
3555			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3556			    listadp);
3557			listadp = newadp;
3558		}
3559		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3560	}
3561	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3562		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3563		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3564	}
3565}
3566
3567/*
3568 * If we are doing an fsync, then we must ensure that any directory
3569 * entries for the inode have been written after the inode gets to disk.
3570 */
3571int
3572softdep_fsync(vp)
3573	struct vnode *vp;	/* the vnode of the file being fsync'ed */
3574{
3575	struct diradd *dap, *olddap;
3576	struct inodedep *inodedep;
3577	struct pagedep *pagedep;
3578	struct worklist *wk;
3579	struct mount *mnt;
3580	struct vnode *pvp;
3581	struct inode *ip;
3582	struct buf *bp;
3583	struct fs *fs;
3584	struct proc *p = CURPROC;		/* XXX */
3585	int error, ret, flushparent;
3586	struct timeval tv;
3587	ino_t parentino;
3588	ufs_lbn_t lbn;
3589
3590	ip = VTOI(vp);
3591	fs = ip->i_fs;
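	/*
	 * Loop until this inode has no pending directory-entry (diradd)
	 * dependencies.  Each pass updates the parent directory's inode
	 * if necessary and then writes the directory page containing the
	 * new name.
	 */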
3592	for (error = 0, flushparent = 0, olddap = NULL; ; ) {
3593		ACQUIRE_LOCK(&lk);
3594		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
3595			break;
3596		if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3597		    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3598		    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3599		    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
3600			panic("softdep_fsync: pending ops");
3601		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3602			break;
3603		if (wk->wk_type != D_DIRADD)
3604			panic("softdep_fsync: Unexpected type %s",
3605			    TYPENAME(wk->wk_type));
3606		dap = WK_DIRADD(wk);
3607		/*
3608		 * If we have failed to get rid of all the dependencies
3609		 * then something is seriously wrong.
3610		 */
3611		if (dap == olddap)
3612			panic("softdep_fsync: flush failed");
3613		olddap = dap;
3614		/*
3615		 * Flush our parent if this directory entry
3616		 * has a MKDIR_PARENT dependency.
3617		 */
3618		if (dap->da_state & DIRCHG)
3619			pagedep = dap->da_previous->dm_pagedep;
3620		else
3621			pagedep = dap->da_pagedep;
3622		mnt = pagedep->pd_mnt;
3623		parentino = pagedep->pd_ino;
3624		lbn = pagedep->pd_lbn;
3625		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
3626			panic("softdep_fsync: dirty");
3627		flushparent = dap->da_state & MKDIR_PARENT;
3628		/*
3629		 * If we are being fsync'ed as part of vgone'ing this vnode,
3630		 * then we will not be able to release and recover the
3631		 * vnode below, so we just have to give up on writing its
3632		 * directory entry out. It will eventually be written, just
3633		 * not now, but then the user was not asking to have it
3634		 * written, so we are not breaking any promises.
3635		 */
3636		if (vp->v_flag & VXLOCK)
3637			break;
3638		/*
3639		 * We prevent deadlock by always fetching inodes from the
3640		 * root, moving down the directory tree. Thus, when fetching
3641		 * our parent directory, we must unlock ourselves before
3642		 * requesting the lock on our parent. See the comment in
3643		 * ufs_lookup for details on possible races.
3644		 */
3645		FREE_LOCK(&lk);
3646		VOP_UNLOCK(vp, 0, p);
3647		if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) {
3648			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3649			return (error);
3650		}
3651		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3652		if (flushparent) {
3653#ifndef __FreeBSD__
3654			tv = time;
3655#else
3656			getmicrotime(&tv);
3657#endif /* __FreeBSD__ */
3658			if ((error = UFS_UPDATE(pvp, &tv, &tv, 1)) != 0) {
3659				vput(pvp);
3660				return (error);
3661			}
3662		}
3663		/*
3664		 * Flush directory page containing the inode's name.
3665		 */
3666		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3667		    &bp);
3668		ret = VOP_BWRITE(bp);
3669		vput(pvp);
3670		if (error != 0)
3671			return (error);
3672		if (ret != 0)
3673			return (ret);
3674	}
3675	FREE_LOCK(&lk);
3676	return (0);
3677}
3678
3679/*
3680 * This routine is called when we are trying to synchronously flush a
3681 * file. This routine must eliminate any filesystem metadata dependencies
3682 * so that the syncing routine can succeed by pushing the dirty blocks
3683 * associated with the file. If any I/O errors occur, they are returned.
3684 */
3685int
3686softdep_sync_metadata(ap)
3687	struct vop_fsync_args /* {
3688		struct vnode *a_vp;
3689		struct ucred *a_cred;
3690		int a_waitfor;
3691		struct proc *a_p;
3692	} */ *ap;
3693{
3694	struct vnode *vp = ap->a_vp;
3695	struct pagedep *pagedep;
3696	struct allocdirect *adp;
3697	struct allocindir *aip;
3698	struct buf *bp, *nbp;
3699	struct worklist *wk;
3700	int i, error, waitfor;
3701
3702	/*
3703	 * Check whether this vnode is involved in a filesystem
3704	 * that is doing soft dependency processing.
3705	 */
3706	if (vp->v_type != VBLK) {
3707		if (!DOINGSOFTDEP(vp))
3708			return (0);
3709	} else
3710		if (vp->v_specmountpoint == NULL ||
3711		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
3712			return (0);
3713	/*
3714	 * Ensure that any direct block dependencies have been cleared.
3715	 */
3716	ACQUIRE_LOCK(&lk);
3717	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number)) != 0) {
3718		FREE_LOCK(&lk);
3719		return (error);
3720	}
3721	/*
3722	 * For most files, the only metadata dependencies are the
3723	 * cylinder group maps that allocate their inode or blocks.
3724	 * The block allocation dependencies can be found by traversing
3725	 * the dependency lists for any buffers that remain on their
3726	 * dirty buffer list. The inode allocation dependency will
3727	 * be resolved when the inode is updated with MNT_WAIT.
3728	 * This work is done in two passes. The first pass grabs most
3729	 * of the buffers and begins asynchronously writing them. The
3730	 * only way to wait for these asynchronous writes is to sleep
3731	 * on the filesystem vnode which may stay busy for a long time
3732	 * if the filesystem is active. So, instead, we make a second
3733	 * pass over the dependencies blocking on each write. In the
3734	 * usual case we will be blocking against a write that we
3735	 * initiated, so when it is done the dependency will have been
3736	 * resolved. Thus the second pass is expected to end quickly.
3737	 */
3738	waitfor = MNT_NOWAIT;
3739top:
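	/*
	 * If no dirty buffers remain on the vnode, just wait for any
	 * writes already in progress to finish and we are done.
	 */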
3740	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
3741		while (vp->v_numoutput) {
3742			vp->v_flag |= VBWAIT;
3743			FREE_LOCK_INTERLOCKED(&lk);
3744			tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1,
3745				"sdsynm", 0);
3746			ACQUIRE_LOCK_INTERLOCKED(&lk);
3747		}
3748		FREE_LOCK(&lk);
3749		return (0);
3750	}
3751	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3752loop:
3753	/*
3754	 * As we hold the buffer locked, none of its dependencies
3755	 * will disappear.
3756	 */
3757	for (wk = LIST_FIRST(&bp->b_dep); wk;
3758	     wk = LIST_NEXT(wk, wk_list)) {
3759		switch (wk->wk_type) {
3760
3761		case D_ALLOCDIRECT:
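			/*
			 * If the cylinder group map recording this
			 * allocation has not yet been written, push the
			 * bitmap buffer to disk: asynchronously on the
			 * first (MNT_NOWAIT) pass, synchronously on the
			 * second.
			 */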
3762			adp = WK_ALLOCDIRECT(wk);
3763			if (adp->ad_state & DEPCOMPLETE)
3764				break;
3765			nbp = adp->ad_buf;
3766			if (getdirtybuf(&nbp, waitfor) == 0)
3767				break;
3768			FREE_LOCK(&lk);
3769			if (waitfor == MNT_NOWAIT) {
3770				bawrite(nbp);
3771			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3772				bawrite(bp);
3773				return (error);
3774			}
3775			ACQUIRE_LOCK(&lk);
3776			break;
3777
3778		case D_ALLOCINDIR:
3779			aip = WK_ALLOCINDIR(wk);
3780			if (aip->ai_state & DEPCOMPLETE)
3781				break;
3782			nbp = aip->ai_buf;
3783			if (getdirtybuf(&nbp, waitfor) == 0)
3784				break;
3785			FREE_LOCK(&lk);
3786			if (waitfor == MNT_NOWAIT) {
3787				bawrite(nbp);
3788			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3789				bawrite(bp);
3790				return (error);
3791			}
3792			ACQUIRE_LOCK(&lk);
3793			break;
3794
3795		case D_INDIRDEP:
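			/*
			 * Push out the bitmap buffers of any allocindirs
			 * that are still incomplete.  The dependency list
			 * can change whenever the lock is dropped for a
			 * write, so rescan it from the start each time.
			 */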
3796		restart:
3797			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
3798			     aip; aip = LIST_NEXT(aip, ai_next)) {
3799				if (aip->ai_state & DEPCOMPLETE)
3800					continue;
3801				nbp = aip->ai_buf;
3802				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
3803					goto restart;
3804				FREE_LOCK(&lk);
3805				if ((error = VOP_BWRITE(nbp)) != 0) {
3806					bawrite(bp);
3807					return (error);
3808				}
3809				ACQUIRE_LOCK(&lk);
3810				goto restart;
3811			}
3812			break;
3813
3814		case D_INODEDEP:
3815			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
3816			    WK_INODEDEP(wk)->id_ino)) != 0) {
3817				FREE_LOCK(&lk);
3818				bawrite(bp);
3819				return (error);
3820			}
3821			break;
3822
3823		case D_PAGEDEP:
3824			/*
3825			 * We are trying to sync a directory that may
3826			 * have dependencies on both its own metadata
3827			 * and/or dependencies on the inodes of any
3828			 * recently allocated files. We walk its diradd
3829			 * lists pushing out the associated inode.
3830			 */
3831			pagedep = WK_PAGEDEP(wk);
3832			for (i = 0; i < DAHASHSZ; i++) {
3833				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
3834					continue;
3835				if ((error = flush_pagedep_deps(vp,
3836				    pagedep->pd_mnt, &pagedep->pd_diraddhd[i])) != 0) {
3837					FREE_LOCK(&lk);
3838					bawrite(bp);
3839					return (error);
3840				}
3841			}
3842			break;
3843
3844		default:
3845			panic("softdep_sync_metadata: Unknown type %s",
3846			    TYPENAME(wk->wk_type));
3847			/* NOTREACHED */
3848		}
3849	}
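	/*
	 * Lock the next buffer on the vnode's dirty list before letting
	 * go of this one, so that it cannot disappear while the current
	 * buffer is being written.
	 */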
3850	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
3851	nbp = TAILQ_NEXT(bp, b_vnbufs);
3852	FREE_LOCK(&lk);
3853	bawrite(bp);
3854	ACQUIRE_LOCK(&lk);
3855	if (nbp != NULL) {
3856		bp = nbp;
3857		goto loop;
3858	}
3859	/*
3860	 * We must wait for any I/O in progress to finish so that
3861	 * all potential buffers on the dirty list will be visible.
3862	 * Once they are all there, proceed with the second pass
3863	 * which will wait for the I/O as per above.
3864	 */
3865	while (vp->v_numoutput) {
3866		vp->v_flag |= VBWAIT;
3867		FREE_LOCK_INTERLOCKED(&lk);
3868		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "sdsynm", 0);
3869		ACQUIRE_LOCK_INTERLOCKED(&lk);
3870	}
3871	/*
3872	 * The brief unlock is to allow any pent up dependency
3873	 * processing to be done.
3874	 */
3875	if (waitfor == MNT_NOWAIT) {
3876		waitfor = MNT_WAIT;
3877		FREE_LOCK(&lk);
3878		ACQUIRE_LOCK(&lk);
3879		goto top;
3880	}
3881
3882	/*
3883	 * If we have managed to get rid of all the dirty buffers,
3884	 * then we are done. For certain directories and block
3885	 * devices, we may need to do further work.
3886	 */
3887	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
3888		FREE_LOCK(&lk);
3889		return (0);
3890	}
3891
3892	FREE_LOCK(&lk);
3893	/*
3894	 * If we are trying to sync a block device, some of its buffers may
3895	 * contain metadata that cannot be written until the contents of some
3896	 * partially written files have been written to disk. The only easy
3897	 * way to accomplish this is to sync the entire filesystem (luckily
3898	 * this happens rarely).
3899	 */
3900	if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
3901	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
3902	     ap->a_p)) != 0)
3903		return (error);
3904	return (0);
3905}
3906
3907/*
3908 * Flush the dependencies associated with an inodedep.
3909 * Called with splbio blocked.
3910 */
3911static int
3912flush_inodedep_deps(fs, ino)
3913	struct fs *fs;
3914	ino_t ino;
3915{
3916	struct inodedep *inodedep;
3917	struct allocdirect *adp;
3918	int error, waitfor;
3919	struct buf *bp;
3920
3921	/*
3922	 * This work is done in two passes. The first pass grabs most
3923	 * of the buffers and begins asynchronously writing them. The
3924	 * only way to wait for these asynchronous writes is to sleep
3925	 * on the filesystem vnode which may stay busy for a long time
3926	 * if the filesystem is active. So, instead, we make a second
3927	 * pass over the dependencies blocking on each write. In the
3928	 * usual case we will be blocking against a write that we
3929	 * initiated, so when it is done the dependency will have been
3930	 * resolved. Thus the second pass is expected to end quickly.
3931	 * We give a brief window at the top of the loop to allow
3932	 * any pending I/O to complete.
3933	 */
3934	for (waitfor = MNT_NOWAIT; ; ) {
3935		FREE_LOCK(&lk);
3936		ACQUIRE_LOCK(&lk);
3937		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
3938			return (0);
3939		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3940		     adp = TAILQ_NEXT(adp, ad_next)) {
3941			if (adp->ad_state & DEPCOMPLETE)
3942				continue;
3943			bp = adp->ad_buf;
3944			if (getdirtybuf(&bp, waitfor) == 0) {
3945				if (waitfor == MNT_NOWAIT)
3946					continue;
3947				break;
3948			}
3949			FREE_LOCK(&lk);
3950			if (waitfor == MNT_NOWAIT) {
3951				bawrite(bp);
3952			} else if ((error = VOP_BWRITE(bp)) != 0) {
3953				ACQUIRE_LOCK(&lk);
3954				return (error);
3955			}
3956			ACQUIRE_LOCK(&lk);
3957			break;
3958		}
3959		if (adp != NULL)
3960			continue;
3961		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
3962		     adp = TAILQ_NEXT(adp, ad_next)) {
3963			if (adp->ad_state & DEPCOMPLETE)
3964				continue;
3965			bp = adp->ad_buf;
3966			if (getdirtybuf(&bp, waitfor) == 0) {
3967				if (waitfor == MNT_NOWAIT)
3968					continue;
3969				break;
3970			}
3971			FREE_LOCK(&lk);
3972			if (waitfor == MNT_NOWAIT) {
3973				bawrite(bp);
3974			} else if ((error = VOP_BWRITE(bp)) != 0) {
3975				ACQUIRE_LOCK(&lk);
3976				return (error);
3977			}
3978			ACQUIRE_LOCK(&lk);
3979			break;
3980		}
3981		if (adp != NULL)
3982			continue;
3983		/*
3984		 * If this was pass 2, we are done; otherwise do pass 2.
3985		 */
3986		if (waitfor == MNT_WAIT)
3987			break;
3988		waitfor = MNT_WAIT;
3989	}
3990	/*
3991	 * Try freeing inodedep in case all dependencies have been removed.
3992	 */
3993	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
3994		(void) free_inodedep(inodedep);
3995	return (0);
3996}
3997
3998/*
3999 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4000 * Called with splbio blocked.
4001 */
4002static int
4003flush_pagedep_deps(pvp, mp, diraddhdp)
4004	struct vnode *pvp;
4005	struct mount *mp;
4006	struct diraddhd *diraddhdp;
4007{
4008	struct proc *p = CURPROC;	/* XXX */
4009	struct inodedep *inodedep;
4010	struct ufsmount *ump;
4011	struct diradd *dap;
4012	struct timeval tv;
4013	struct vnode *vp;
4014	int gotit, error = 0;
4015	struct buf *bp;
4016	ino_t inum;
4017
4018	ump = VFSTOUFS(mp);
4019	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4020		/*
4021		 * Flush ourselves if this directory entry
4022		 * has a MKDIR_PARENT dependency.
4023		 */
4024		if (dap->da_state & MKDIR_PARENT) {
4025#ifndef __FreeBSD__
4026			tv = time;
4027#else
4028			getmicrotime(&tv);
4029#endif /* __FreeBSD__ */
4030			FREE_LOCK(&lk);
4031			if ((error = UFS_UPDATE(pvp, &tv, &tv, 1)) != 0)
4032				break;
4033			ACQUIRE_LOCK(&lk);
4034			/*
4035			 * If that cleared dependencies, go on to next.
4036			 */
4037			if (dap != LIST_FIRST(diraddhdp))
4038				continue;
4039			if (dap->da_state & MKDIR_PARENT)
4040				panic("flush_pagedep_deps: MKDIR");
4041		}
4042		/*
4043		 * Flush the file on which the directory entry depends.
4044		 * If the inode has already been pushed out of the cache,
4045		 * then all the block dependencies will have been flushed
4046		 * leaving only inode dependencies (e.g., bitmaps). Thus,
4047		 * we do a ufs_ihashget to check for the vnode in the cache.
4048		 * If it is there, we do a full flush. If it is no longer
4049		 * there, we need only dispose of any remaining bitmap
4050		 * dependencies and write the inode to disk.
4051		 */
4052		inum = dap->da_newinum;
4053		FREE_LOCK(&lk);
4054		if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) {
4055			ACQUIRE_LOCK(&lk);
4056			if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0
4057			    && dap == LIST_FIRST(diraddhdp))
4058				panic("flush_pagedep_deps: flush 1 failed");
4059			/*
4060			 * If the inode still has bitmap dependencies,
4061			 * push them to disk.
4062			 */
4063			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4064				gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT);
4065				FREE_LOCK(&lk);
4066				if (gotit &&
4067				    (error = VOP_BWRITE(inodedep->id_buf)) != 0)
4068					break;
4069				ACQUIRE_LOCK(&lk);
4070			}
4071			if (dap != LIST_FIRST(diraddhdp))
4072				continue;
4073			/*
4074			 * If the inode is still sitting in a buffer waiting
4075			 * to be written, push it to disk.
4076			 */
4077			FREE_LOCK(&lk);
4078			if ((error = bread(ump->um_devvp,
4079			    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4080			    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4081				break;
4082			if ((error = VOP_BWRITE(bp)) != 0)
4083				break;
4084			ACQUIRE_LOCK(&lk);
4085			if (dap == LIST_FIRST(diraddhdp))
4086				panic("flush_pagedep_deps: flush 2 failed");
4087			continue;
4088		}
4089		if (vp->v_type == VDIR) {
4090			/*
4091			 * A newly allocated directory must have its "." and
4092			 * ".." entries written out before its name can be
4093			 * committed in its parent. We do not want or need
4094			 * the full semantics of a synchronous VOP_FSYNC as
4095			 * that may end up here again, once for each directory
4096			 * level in the filesystem. Instead, we push the blocks
4097			 * and wait for them to clear.
4098			 */
4099			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) != 0) {
4100				vput(vp);
4101				break;
4102			}
4103			ACQUIRE_LOCK(&lk);
4104			while (vp->v_numoutput) {
4105				vp->v_flag |= VBWAIT;
4106				FREE_LOCK_INTERLOCKED(&lk);
4107				tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1,
4108					"sdflpd", 0);
4109				ACQUIRE_LOCK_INTERLOCKED(&lk);
4110			}
4111			FREE_LOCK(&lk);
4112		}
4113#ifndef __FreeBSD__
4114		tv = time;
4115#else
4116		getmicrotime(&tv);
4117#endif /* __FreeBSD__ */
4118		error = UFS_UPDATE(vp, &tv, &tv, 1);
4119		vput(vp);
4120		if (error)
4121			break;
4122		/*
4123		 * If we have failed to get rid of all the dependencies
4124		 * then something is seriously wrong.
4125		 */
4126		if (dap == LIST_FIRST(diraddhdp))
4127			panic("flush_pagedep_deps: flush 3 failed");
4128		ACQUIRE_LOCK(&lk);
4129	}
4130	if (error)
4131		ACQUIRE_LOCK(&lk);
4132	return (error);
4133}
4134
4135/*
4136 * Acquire exclusive access to a buffer.
4137 * Must be called with splbio blocked.
4138 * Return 1 if buffer was acquired.
4139 */
4140static int
4141getdirtybuf(bpp, waitfor)
4142	struct buf **bpp;
4143	int waitfor;
4144{
4145	struct buf *bp;
4146
4147	for (;;) {
4148		if ((bp = *bpp) == NULL)
4149			return (0);
4150		if ((bp->b_flags & B_BUSY) == 0)
4151			break;
4152		if (waitfor != MNT_WAIT)
4153			return (0);
4154		bp->b_flags |= B_WANTED;
4155		FREE_LOCK_INTERLOCKED(&lk);
4156		tsleep((caddr_t)bp, PRIBIO + 1, "sdsdty", 0);
4157		ACQUIRE_LOCK_INTERLOCKED(&lk);
4158	}
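	/*
	 * Only delayed-write (dirty) buffers are of interest; a clean
	 * buffer holds nothing that still needs to reach the disk.
	 */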
4159	if ((bp->b_flags & B_DELWRI) == 0)
4160		return (0);
4161	bremfree(bp);
4162	bp->b_flags |= B_BUSY;
4163	return (1);
4164}
4165
4166/*
4167 * Called whenever a buffer that is being invalidated or reallocated
4168 * contains dependencies. This should only happen if an I/O error has
4169 * occurred. The routine is called with the buffer locked.
4170 */
4171void
4172softdep_deallocate_dependencies(bp)
4173	struct buf *bp;
4174{
4175	struct worklist *wk;
4176
4177	if ((bp->b_flags & B_ERROR) == 0)
4178		panic("softdep_deallocate_dependencies: dangling deps");
4179	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4180	ACQUIRE_LOCK(&lk);
4181	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
4182		WORKLIST_REMOVE(wk);
4183		FREE_LOCK(&lk);
4184		switch (wk->wk_type) {
4185		/*
4186		 * XXX - should really clean up, but for now we will
4187		 * just leak memory and not worry about it. Also should
4188		 * mark the filesystem permanently dirty so that it will
4189		 * force fsck to be run (though this would best be done
4190		 * in the mainline code).
4191		 */
4192		case D_PAGEDEP:
4193		case D_INODEDEP:
4194		case D_BMSAFEMAP:
4195		case D_ALLOCDIRECT:
4196		case D_INDIRDEP:
4197		case D_ALLOCINDIR:
4198		case D_MKDIR:
4199#ifdef DEBUG
4200			printf("Lost type %s\n", TYPENAME(wk->wk_type));
4201#endif
4202			break;
4203		default:
4204			panic("%s: Unexpected type %s",
4205			    "softdep_deallocate_dependencies",
4206			    TYPENAME(wk->wk_type));
4207			/* NOTREACHED */
4208		}
4209		ACQUIRE_LOCK(&lk);
4210	}
4211	FREE_LOCK(&lk);
4212}
4213
4214/*
4215 * Function to handle asynchronous write errors in the filesystem.
4216 */
4217void
4218softdep_error(func, error)
4219	char *func;
4220	int error;
4221{
4222
4223	/* XXX should do something better! */
4224	log(LOG_ERR, "%s: got error %d while accessing filesystem\n",
4225	    func, error);
4226}
4227
4228