/*
 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * The following are the copyrights and redistribution conditions that
 * apply to this copy of the soft update software. For a license
 * to use, redistribute or sell the soft update software under
 * conditions other than those described here, please contact the
 * author at one of the following addresses:
 *
 *	Marshall Kirk McKusick		mckusick@mckusick.com
 *	1614 Oxford Street		+1-510-843-9542
 *	Berkeley, CA 94709-1608
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. None of the names of McKusick, Ganger, Patt, or the University of
 *    Michigan may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 * 4. Redistributions in any form must be accompanied by information on
 *    how to obtain complete source code for any accompanying software
 *    that uses this software. This source code must either be included
 *    in the distribution or be available for no more than the cost of
 *    distribution plus a nominal fee, and must be freely redistributable
 *    under reasonable conditions. For an executable file, complete
 *    source code means the source code for all modules it contains.
 *    It does not mean source code for modules or files that typically
 *    accompany the operating system on which the executable file runs,
 *    e.g., standard library modules or system header files.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.50 (McKusick) 1/12/00
 * $FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 55928 2000-01-13 18:48:12Z peter $
 */

/*
 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
 */
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

/*
 * These definitions need to be adapted to the system to which
 * this file is being ported.
 */
/*
 * malloc types defined for the softdep system.
 */
MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");

#define	D_PAGEDEP	0
#define	D_INODEDEP	1
#define	D_NEWBLK	2
#define	D_BMSAFEMAP	3
#define	D_ALLOCDIRECT	4
#define	D_INDIRDEP	5
#define	D_ALLOCINDIR	6
#define	D_FREEFRAG	7
#define	D_FREEBLKS	8
#define	D_FREEFILE	9
#define	D_DIRADD	10
#define	D_MKDIR		11
#define	D_DIRREM	12
#define D_LAST		D_DIRREM

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_NEWBLK,
	M_BMSAFEMAP,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM
};

#define DtoM(type) (memtype[type])

/*
 * Names of malloc types.
 */
#define TYPENAME(type)  \
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
#define CURPROC curproc
/*
 * End system adaptation definitions.
 */

/*
 * Internal function prototypes.
 */
static	void softdep_error __P((char *, int));
static	void drain_output __P((struct vnode *, int));
static	int getdirtybuf __P((struct buf **, int));
static	void clear_remove __P((struct proc *));
static	void clear_inodedeps __P((struct proc *));
static	int flush_pagedep_deps __P((struct vnode *, struct mount *,
	    struct diraddhd *));
static	int flush_inodedep_deps __P((struct fs *, ino_t));
static	int handle_written_filepage __P((struct pagedep *, struct buf *));
static  void diradd_inode_written __P((struct diradd *, struct inodedep *));
static	int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_allocdirect_partdone __P((struct allocdirect *));
static	void handle_allocindir_partdone __P((struct allocindir *));
static	void initiate_write_filepage __P((struct pagedep *, struct buf *));
static	void handle_written_mkdir __P((struct mkdir *, int));
static	void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_workitem_freefile __P((struct freefile *));
static	void handle_workitem_remove __P((struct dirrem *));
static	struct dirrem *newdirrem __P((struct buf *, struct inode *,
	    struct inode *, int));
static	void free_diradd __P((struct diradd *));
static	void free_allocindir __P((struct allocindir *, struct inodedep *));
static	int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	    long *));
static	void deallocate_dependencies __P((struct buf *, struct inodedep *));
static	void free_allocdirect __P((struct allocdirectlst *,
	    struct allocdirect *, int));
static	int free_inodedep __P((struct inodedep *));
static	void handle_workitem_freeblocks __P((struct freeblks *));
static	void merge_inode_lists __P((struct inodedep *));
static	void setup_allocindir_phase2 __P((struct buf *, struct inode *,
	    struct allocindir *));
static	struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
	    ufs_daddr_t));
static	void handle_workitem_freefrag __P((struct freefrag *));
static	struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
static	void allocdirect_merge __P((struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *));
static	struct bmsafemap *bmsafemap_lookup __P((struct buf *));
static	int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	    struct newblk **));
static	int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	    struct pagedep **));
static	void pause_timer __P((void *));
static	int request_cleanup __P((int, int));
static	void add_to_worklist __P((struct worklist *));

/*
 * Exported softdep operations.
 */
static	void softdep_disk_io_initiation __P((struct buf *));
static	void softdep_disk_write_complete __P((struct buf *));
static	void softdep_deallocate_dependencies __P((struct buf *));
static	int softdep_fsync __P((struct vnode *));
static	int softdep_process_worklist __P((struct mount *));
static	void softdep_move_dependencies __P((struct buf *, struct buf *));
static	int softdep_count_dependencies __P((struct buf *bp, int));

struct bio_ops bioops = {
	softdep_disk_io_initiation,		/* io_start */
	softdep_disk_write_complete,		/* io_complete */
	softdep_deallocate_dependencies,	/* io_deallocate */
	softdep_fsync,				/* io_fsync */
	softdep_process_worklist,		/* io_sync */
	softdep_move_dependencies,		/* io_movedeps */
	softdep_count_dependencies,		/* io_countdeps */
};

/*
 * Locking primitives.
 *
 * For a uniprocessor, all we need to do is protect against disk
 * interrupts. For a multiprocessor, this lock would have to be
 * a mutex. A single mutex is used throughout this file, though
 * finer grain locking could be used if contention warranted it.
 *
 * For a multiprocessor, the sleep call would accept a lock and
 * release it after the sleep processing was complete. In a uniprocessor
 * implementation there is no such interlock, so we simply mark
 * the places where it needs to be done with the `interlocked' form
 * of the lock calls. Since the uniprocessor sleep already interlocks
 * the spl, there is nothing that really needs to be done.
 */
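
/*
 * Illustrative sketch of the convention (not a verbatim excerpt from
 * this file): a critical section brackets its work with ACQUIRE_LOCK
 * and FREE_LOCK, and a sleep inside such a section uses the
 * interlocked forms, as sema_get() below does:
 *
 *	ACQUIRE_LOCK(&lk);
 *	...examine or modify dependency structures...
 *	FREE_LOCK_INTERLOCKED(&lk);
 *	tsleep(chan, PRIBIO, "wmesg", 0);	(chan and "wmesg" are placeholders)
 *	ACQUIRE_LOCK_INTERLOCKED(&lk);
 *	...
 *	FREE_LOCK(&lk);
 */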
#ifndef /* NOT */ DEBUG
static struct lockit {
	int	lkt_spl;
} lk = { 0 };
#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
#define ACQUIRE_LOCK_INTERLOCKED(lk)
#define FREE_LOCK_INTERLOCKED(lk)

#else /* DEBUG */
static struct lockit {
	int	lkt_spl;
	pid_t	lkt_held;
} lk = { 0, -1 };
static int lockcnt;

static	void acquire_lock __P((struct lockit *));
static	void free_lock __P((struct lockit *));
static	void acquire_lock_interlocked __P((struct lockit *));
static	void free_lock_interlocked __P((struct lockit *));

#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
#define FREE_LOCK(lk)			free_lock(lk)
#define ACQUIRE_LOCK_INTERLOCKED(lk)	acquire_lock_interlocked(lk)
#define FREE_LOCK_INTERLOCKED(lk)	free_lock_interlocked(lk)

static void
acquire_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1) {
		if (lk->lkt_held == CURPROC->p_pid)
			panic("softdep_lock: locking against myself");
		else
			panic("softdep_lock: lock held by %d", lk->lkt_held);
	}
	lk->lkt_spl = splbio();
	lk->lkt_held = CURPROC->p_pid;
	lockcnt++;
}

static void
free_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock: lock not held");
	lk->lkt_held = -1;
	splx(lk->lkt_spl);
}

static void
acquire_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1) {
		if (lk->lkt_held == CURPROC->p_pid)
			panic("softdep_lock_interlocked: locking against self");
		else
			panic("softdep_lock_interlocked: lock held by %d",
			    lk->lkt_held);
	}
	lk->lkt_held = CURPROC->p_pid;
	lockcnt++;
}

static void
free_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock_interlocked: lock not held");
	lk->lkt_held = -1;
}
#endif /* DEBUG */

/*
 * Place holder for real semaphores.
 */
struct sema {
	int	value;
	pid_t	holder;
	char	*name;
	int	prio;
	int	timo;
};
static	void sema_init __P((struct sema *, char *, int, int));
static	int sema_get __P((struct sema *, struct lockit *));
static	void sema_release __P((struct sema *));

static void
sema_init(semap, name, prio, timo)
	struct sema *semap;
	char *name;
	int prio, timo;
{

	semap->holder = -1;
	semap->value = 0;
	semap->name = name;
	semap->prio = prio;
	semap->timo = timo;
}

static int
sema_get(semap, interlock)
	struct sema *semap;
	struct lockit *interlock;
{

	if (semap->value++ > 0) {
		if (interlock != NULL)
			FREE_LOCK_INTERLOCKED(interlock);
		tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
		if (interlock != NULL) {
			ACQUIRE_LOCK_INTERLOCKED(interlock);
			FREE_LOCK(interlock);
		}
		return (0);
	}
	semap->holder = CURPROC->p_pid;
	if (interlock != NULL)
		FREE_LOCK(interlock);
	return (1);
}

static void
sema_release(semap)
	struct sema *semap;
{

	if (semap->value <= 0 || semap->holder != CURPROC->p_pid)
		panic("sema_release: not held");
	if (--semap->value > 0) {
		semap->value = 0;
		wakeup(semap);
	}
	semap->holder = -1;
}
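
/*
 * Sketch of how these semaphores serialize allocation of new dependency
 * structures (mirrors pagedep_lookup() and its siblings below; "foo" is
 * a placeholder name):
 *
 *	if (sema_get(&foo_in_progress, &lk) == 0) {
 *		ACQUIRE_LOCK(&lk);
 *		goto top;		(lost a race; redo the lookup)
 *	}
 *	...allocate and initialize the new structure...
 *	ACQUIRE_LOCK(&lk);
 *	LIST_INSERT_HEAD(foohd, foo, f_hash);
 *	sema_release(&foo_in_progress);
 */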

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))

#else /* DEBUG */
static	void worklist_insert __P((struct workhead *, struct worklist *));
static	void worklist_remove __P((struct worklist *));
static	void workitem_free __P((struct worklist *, int));

#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)

static void
worklist_insert(head, item)
	struct workhead *head;
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_insert: lock not held");
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: already on list");
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item)
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_remove: lock not held");
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: not on list");
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{

	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: still on list");
	if (item->wk_type != type)
		panic("workitem_free: type mismatch");
	FREE(item, DtoM(type));
}
#endif /* DEBUG */
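
/*
 * Typical life cycle of a work item, in outline (an illustrative
 * sketch; D_TYPE and the field names stand in for the real instances
 * found throughout this file):
 *
 *	item->it_list.wk_type = D_TYPE;
 *	WORKLIST_INSERT(&bp->b_dep, &item->it_list);
 *	...the write completes and the dependency is satisfied...
 *	WORKLIST_REMOVE(&item->it_list);
 *	WORKITEM_FREE(item, D_TYPE);
 */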

/*
 * Workitem queue management
 */
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;
static int max_softdeps;	/* maximum number of structs before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static struct proc *filesys_syncer; /* proc of filesystem syncer process */
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
#define FLUSH_INODES	1
static int req_clear_remove;	/* syncer process flush some freeblks */
#define FLUSH_REMOVE	2
/*
 * runtime statistics
 */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
#endif /* DEBUG */

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{
	static struct worklist *worklist_tail;

	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	worklist_tail = wk;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that everything is done in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 */
static int
softdep_process_worklist(matchmnt)
	struct mount *matchmnt;
{
	struct proc *p = CURPROC;
	struct worklist *wk;
	struct fs *matchfs;
	int matchcnt, loopcount;

	/*
	 * Record the process identifier of our caller so that we can give
	 * this process preferential treatment in request_cleanup below.
	 */
	filesys_syncer = p;
	matchcnt = 0;
	matchfs = NULL;
	if (matchmnt != NULL)
		matchfs = VFSTOUFS(matchmnt)->um_fs;
	/*
	 * There is no danger of having multiple processes run this
	 * code. It is single threaded solely so that softdep_flushfiles
	 * (below) can get an accurate count of the number of items
	 * related to its mount point that are in the list.
	 */
	if (softdep_worklist_busy && matchmnt == NULL)
		return (-1);
	/*
	 * If requested, try removing inode or removal dependencies.
	 */
	if (req_clear_inodedeps) {
		clear_inodedeps(p);
		req_clear_inodedeps = 0;
		wakeup(&proc_waiting);
	}
	if (req_clear_remove) {
		clear_remove(p);
		req_clear_remove = 0;
		wakeup(&proc_waiting);
	}
	ACQUIRE_LOCK(&lk);
	loopcount = 1;
	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
		WORKLIST_REMOVE(wk);
		FREE_LOCK(&lk);
		switch (wk->wk_type) {

		case D_DIRREM:
			/* removal of a directory entry */
			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
				matchcnt += 1;
			handle_workitem_remove(WK_DIRREM(wk));
			break;

		case D_FREEBLKS:
			/* releasing blocks and/or fragments from a file */
			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freeblocks(WK_FREEBLKS(wk));
			break;

		case D_FREEFRAG:
			/* releasing a fragment when replaced as a file grows */
			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefrag(WK_FREEFRAG(wk));
			break;

		case D_FREEFILE:
			/* releasing an inode when its link count drops to 0 */
			if (WK_FREEFILE(wk)->fx_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefile(WK_FREEFILE(wk));
			break;

		default:
			panic("%s_process_worklist: Unknown type %s",
			    "softdep", TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		if (softdep_worklist_busy && matchmnt == NULL)
			return (-1);
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(p);
			req_clear_inodedeps = 0;
			wakeup(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(p);
			req_clear_remove = 0;
			wakeup(&proc_waiting);
		}
		/*
		 * We do not generally want to stop for buffer space, but if
		 * we are really being a buffer hog, we will stop and wait.
		 */
		if (loopcount++ % 128 == 0)
			bwillwrite();
		ACQUIRE_LOCK(&lk);
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Move dependencies from one buffer to another.
 */
static void
softdep_move_dependencies(oldbp, newbp)
	struct buf *oldbp;
	struct buf *newbp;
{
	struct worklist *wk, *wktail;

	if (LIST_FIRST(&newbp->b_dep) != NULL)
		panic("softdep_move_dependencies: need merge code");
	wktail = 0;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
		LIST_REMOVE(wk, wk_list);
		if (wktail == 0)
			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
		else
			LIST_INSERT_AFTER(wktail, wk, wk_list);
		wktail = wk;
	}
	FREE_LOCK(&lk);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushfiles(oldmnt, flags, p)
	struct mount *oldmnt;
	int flags;
	struct proc *p;
{
	struct vnode *devvp;
	int error, loopcnt;

	/*
	 * Await our turn to clear out the queue.
	 */
	while (softdep_worklist_busy)
		tsleep(&lbolt, PRIBIO, "softflush", 0);
	softdep_worklist_busy = 1;
	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
		softdep_worklist_busy = 0;
		return (error);
	}
	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	devvp = VFSTOUFS(oldmnt)->um_devvp;
	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
		if (softdep_process_worklist(oldmnt) == 0) {
			/*
			 * Do another flush in case any vnodes were brought in
			 * as part of the cleanup operations.
			 */
			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
				break;
			/*
			 * If we still found nothing to do, we are really done.
			 */
			if (softdep_process_worklist(oldmnt) == 0)
				break;
		}
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
		VOP_UNLOCK(devvp, 0, p);
		if (error)
			break;
	}
	softdep_worklist_busy = 0;
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
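
/*
 * Illustrative lookup pattern (a sketch of the calls made throughout
 * this file; inodedep_lookup() is shown, the other lookups are
 * analogous):
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(fs, inum, DEPALLOC, &inodedep) == 0)
 *		...a new inodedep was allocated and hashed...
 *	else
 *		...an existing inodedep was found...
 *	FREE_LOCK(&lk);
 */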

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])
static struct sema pagedep_in_progress;

/*
 * Look up a pagedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(ip, lbn, flags, pagedeppp)
	struct inode *ip;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	struct mount *mp;
	int i;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("pagedep_lookup: lock not held");
#endif
	mp = ITOV(ip)->v_mount;
	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
top:
	for (pagedep = LIST_FIRST(pagedephd); pagedep;
	     pagedep = LIST_NEXT(pagedep, pd_hash))
		if (ip->i_number == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_mnt)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*pagedeppp = NULL;
		return (0);
	}
	if (sema_get(&pagedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
		M_WAITOK);
	bzero(pagedep, sizeof(struct pagedep));
	pagedep->pd_list.wk_type = D_PAGEDEP;
	pagedep->pd_mnt = mp;
	pagedep->pd_ino = ip->i_number;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	sema_release(&pagedep_in_progress);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
static long	num_inodedep;	/* number of inodedep allocated */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;

/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(fs, inum, flags, inodedeppp)
	struct fs *fs;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	int firsttry;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("inodedep_lookup: lock not held");
#endif
	firsttry = 1;
	inodedephd = INODEDEP_HASH(fs, inum);
top:
	for (inodedep = LIST_FIRST(inodedephd); inodedep;
	     inodedep = LIST_NEXT(inodedep, id_hash))
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*inodedeppp = NULL;
		return (0);
	}
	/*
	 * If we are over our limit, try to improve the situation.
	 */
	if (num_inodedep > max_softdeps && firsttry && speedup_syncer() == 0 &&
	    request_cleanup(FLUSH_INODES, 1)) {
		firsttry = 0;
		goto top;
	}
	if (sema_get(&inodedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	num_inodedep += 1;
	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
		M_INODEDEP, M_WAITOK);
	inodedep->id_list.wk_type = D_INODEDEP;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_buf = NULL;
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	sema_release(&inodedep_in_progress);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static struct sema newblk_in_progress;

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(fs, newblkno, flags, newblkpp)
	struct fs *fs;
	ufs_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(fs, newblkno);
top:
	for (newblk = LIST_FIRST(newblkhd); newblk;
	     newblk = LIST_NEXT(newblk, nb_hash))
		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
			break;
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*newblkpp = NULL;
		return (0);
	}
	if (sema_get(&newblk_in_progress, 0) == 0)
		goto top;
	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
		M_NEWBLK, M_WAITOK);
	newblk->nb_state = 0;
	newblk->nb_fs = fs;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	sema_release(&newblk_in_progress);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during filesystem initialization before
 * mounting any file systems.
 */
void
softdep_initialize()
{

	LIST_INIT(&mkdirlisthd);
	LIST_INIT(&softdep_workitem_pending);
	max_softdeps = desiredvnodes * 8;
	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	    &pagedep_hash);
	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
}

/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum cstotal;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	mp->mnt_flag &= ~MNT_ASYNC;
	mp->mnt_flag |= MNT_SOFTDEP;
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync, so we have
	 * to scan the cylinder groups and recalculate them.
	 */
	if (fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("ffs_mountfs: superblock updated for soft updates\n");
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a file system
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs file system maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not.  (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */
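
/*
 * In outline, the allocation-side rule above is implemented by the two
 * hooks that follow (a sketch, not a verbatim caller):
 *
 *	...mark the inode or block allocated in the cg map held in bp...
 *	softdep_setup_inomapdep(bp, ip, newinum);	(inode allocation)
 *	softdep_setup_blkmapdep(bp, fs, newblkno);	(block allocation)
 *
 * The new inodedep or newblk remains !DEPCOMPLETE until bp, the bitmap
 * buffer, has been written to disk.
 */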

/*
 * Called just after updating the cylinder group block to allocate an inode.
 */
void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;		/* buffer for cylgroup block with inode map */
	struct inode *ip;	/* inode related to allocation */
	ino_t newinum;		/* new inode number being allocated */
{
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated inode.
	 * Panic if it already exists as something is seriously wrong.
	 * Otherwise add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
		panic("softdep_setup_inomapdep: found inode");
	inodedep->id_buf = bp;
	inodedep->id_state &= ~DEPCOMPLETE;
	bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

/*
 * Called just after updating the cylinder group block to
 * allocate block or fragment.
 */
void
softdep_setup_blkmapdep(bp, fs, newblkno)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct fs *fs;		/* filesystem doing allocation */
	ufs_daddr_t newblkno;	/* number of newly allocated block */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
		panic("softdep_setup_blkmapdep: found block");
	ACQUIRE_LOCK(&lk);
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	FREE_LOCK(&lk);
}

/*
 * Find the bmsafemap associated with a cylinder group buffer.
 * If none exists, create one. The buffer must be locked when
 * this routine is called and this routine must be called with
 * splbio interrupts blocked.
 */
static struct bmsafemap *
bmsafemap_lookup(bp)
	struct buf *bp;
{
	struct bmsafemap *bmsafemap;
	struct worklist *wk;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("bmsafemap_lookup: lock not held");
#endif
	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list))
		if (wk->wk_type == D_BMSAFEMAP)
			return (WK_BMSAFEMAP(wk));
	FREE_LOCK(&lk);
	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
		M_BMSAFEMAP, M_WAITOK);
	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	bmsafemap->sm_list.wk_state = 0;
	bmsafemap->sm_buf = bp;
	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	LIST_INIT(&bmsafemap->sm_allocindirhd);
	LIST_INIT(&bmsafemap->sm_inodedephd);
	LIST_INIT(&bmsafemap->sm_newblkhd);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	return (bmsafemap);
}

/*
 * Direct block allocation dependencies.
 *
 * When a new block is allocated, the corresponding disk locations must be
 * initialized (with zeros or new data) before the on-disk inode points to
 * them.  Also, the freemap from which the block was allocated must be
 * updated (on disk) before the inode's pointer. These two dependencies are
 * independent of each other and are needed for all file blocks and indirect
 * blocks that are pointed to directly by the inode.  Just before the
 * "in-core" version of the inode is updated with a newly allocated block
 * number, a procedure (below) is called to setup allocation dependency
 * structures.  These structures are removed when the corresponding
 * dependencies are satisfied or when the block allocation becomes obsolete
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
 * fragment that gets upgraded).  All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete).  The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains.  This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
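
/*
 * Illustrative call sequence (a sketch of how a balloc-style caller
 * uses this hook, per the comment above; not a verbatim excerpt from
 * ffs_balloc):
 *
 *	bp = getblk(vp, lbn, nsize, 0, 0);	(buffer for the new block)
 *	softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno,
 *	    nsize, osize, bp);
 *	ip->i_db[lbn] = newblkno;	(in-core update, now safe to make)
 */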
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_WAITOK);
	bzero(adp, sizeof(struct allocdirect));
	adp->ad_list.wk_type = D_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	adp->ad_inodedep = inodedep;

	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	for (oldadp = TAILQ_FIRST(adphead); oldadp;
	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("allocdirect_merge: lock not held");
#endif
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("allocdirect_check: old %d != new %d || lbn %ld >= %d",
		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
		    NDADDR);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_allocdirect. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		freefrag = newadp->ad_freefrag;
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	free_allocdirect(adphead, oldadp, 0);
}

/*
 * Allocate a new freefrag structure if needed.
 */
static struct freefrag *
newfreefrag(ip, blkno, size)
	struct inode *ip;
	ufs_daddr_t blkno;
	long size;
{
	struct freefrag *freefrag;
	struct fs *fs;

	if (blkno == 0)
		return (NULL);
	fs = ip->i_fs;
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
		M_FREEFRAG, M_WAITOK);
	freefrag->ff_list.wk_type = D_FREEFRAG;
	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_fs = fs;
	freefrag->ff_devvp = ip->i_devvp;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;
	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct inode tip;

	tip.i_fs = freefrag->ff_fs;
	tip.i_devvp = freefrag->ff_devvp;
	tip.i_dev = freefrag->ff_devvp->v_rdev;
	tip.i_number = freefrag->ff_inum;
	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	FREE(freefrag, M_FREEFRAG);
}

/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here. Because an indirect block contains many pointers that
 * may have dependencies, a second copy of the entire in-memory indirect
 * block is kept. The buffer cache copy is always completely up-to-date.
 * The second copy, which is used only as a source for disk writes,
 * contains only the safe pointers (i.e., those that have no remaining
 * update dependencies). The second copy is freed when all pointers
 * are safe. The cache is not allowed to replace indirect blocks with
 * pending update dependencies. If a buffer containing an indirect
 * block with dependencies is written, these routines will mark it
 * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine in conjunction with
 * softdep_sync_metadata work together to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
 * procedures are used when setting up indirect block pointer
 * dependencies. The division is necessary because of the organization
 * of the "balloc" routine and because of the distinction between file
 * pages and file metadata blocks.
 */

/*
 * Allocate a new allocindir structure.
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
{
	struct allocindir *aip;

	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
		M_ALLOCINDIR, M_WAITOK);
	bzero(aip, sizeof(struct allocindir));
	aip->ai_list.wk_type = D_ALLOCINDIR;
	aip->ai_state = ATTACHED;
	aip->ai_offset = ptrno;
	aip->ai_newblkno = newblkno;
	aip->ai_oldblkno = oldblkno;
	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
 */
void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;	/* inode for file being extended */
	ufs_lbn_t lbn;		/* allocated block number within file */
	struct buf *bp;		/* buffer with indirect blk referencing page */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
	struct buf *nbp;	/* buffer holding allocated page */
{
	struct allocindir *aip;
	struct pagedep *pagedep;

	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	ACQUIRE_LOCK(&lk);
	/*
	 * If we are allocating a directory page, then we must
	 * allocate an associated pagedep to track additions and
	 * deletions.
	 */
	if ((ip->i_mode & IFMT) == IFDIR &&
	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called just before setting an indirect block pointer to a
 * newly allocated indirect block.
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
{
	struct allocindir *aip;

	aip = newallocindir(ip, ptrno, newblkno, 0);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
 */
static void
setup_allocindir_phase2(bp, ip, aip)
	struct buf *bp;		/* in-memory copy of the indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct allocindir *aip;	/* allocindir allocated by the above routines */
{
	struct worklist *wk;
	struct indirdep *indirdep, *newindirdep;
	struct bmsafemap *bmsafemap;
	struct allocindir *oldaip;
	struct freefrag *freefrag;
	struct newblk *newblk;

	if (bp->b_lblkno >= 0)
		panic("setup_allocindir_phase2: not indir blk");
	for (indirdep = NULL, newindirdep = NULL; ; ) {
		ACQUIRE_LOCK(&lk);
		for (wk = LIST_FIRST(&bp->b_dep); wk;
		     wk = LIST_NEXT(wk, wk_list)) {
			if (wk->wk_type != D_INDIRDEP)
				continue;
			indirdep = WK_INDIRDEP(wk);
			break;
		}
		if (indirdep == NULL && newindirdep) {
			indirdep = newindirdep;
			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
			newindirdep = NULL;
		}
		FREE_LOCK(&lk);
		if (indirdep) {
			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
			    &newblk) == 0)
				panic("setup_allocindir: lost block");
			ACQUIRE_LOCK(&lk);
			if (newblk->nb_state == DEPCOMPLETE) {
				aip->ai_state |= DEPCOMPLETE;
				aip->ai_buf = NULL;
			} else {
				bmsafemap = newblk->nb_bmsafemap;
				aip->ai_buf = bmsafemap->sm_buf;
				LIST_REMOVE(newblk, nb_deps);
				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
				    aip, ai_deps);
			}
			LIST_REMOVE(newblk, nb_hash);
			FREE(newblk, M_NEWBLK);
			aip->ai_indirdep = indirdep;
			/*
			 * Check to see if there is an existing dependency
			 * for this block. If there is, merge the old
			 * dependency into the new one.
			 */
			if (aip->ai_oldblkno == 0)
				oldaip = NULL;
			else
				for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd);
				    oldaip; oldaip = LIST_NEXT(oldaip, ai_next))
					if (oldaip->ai_offset == aip->ai_offset)
						break;
			if (oldaip != NULL) {
				if (oldaip->ai_newblkno != aip->ai_oldblkno)
					panic("setup_allocindir_phase2: blkno");
				aip->ai_oldblkno = oldaip->ai_oldblkno;
				freefrag = oldaip->ai_freefrag;
				oldaip->ai_freefrag = aip->ai_freefrag;
				aip->ai_freefrag = freefrag;
				free_allocindir(oldaip, NULL);
			}
			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
			    [aip->ai_offset] = aip->ai_oldblkno;
			FREE_LOCK(&lk);
		}
		if (newindirdep) {
			if (newindirdep->ir_savebp != NULL)
				brelse(newindirdep->ir_savebp);
			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
		}
		if (indirdep)
			break;
		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
			M_INDIRDEP, M_WAITOK);
		newindirdep->ir_list.wk_type = D_INDIRDEP;
		newindirdep->ir_state = ATTACHED;
		LIST_INIT(&newindirdep->ir_deplisthd);
		LIST_INIT(&newindirdep->ir_donehd);
		if (bp->b_blkno == bp->b_lblkno) {
			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
				NULL, NULL);
		}
		newindirdep->ir_savebp =
		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
		BUF_KERNPROC(newindirdep->ir_savebp);
		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
	}
}

/*
 * Block de-allocation dependencies.
 *
 * When blocks are de-allocated, the on-disk pointers must be nullified before
 * the blocks are made available for use by other files.  (The true
 * requirement is that old pointers must be nullified before new on-disk
 * pointers are set.  We chose this slightly more stringent requirement to
 * reduce complexity.) Our implementation handles this dependency by updating
 * the inode (or indirect block) appropriately but delaying the actual block
 * de-allocation (i.e., freemap and free space count manipulation) until
 * after the updated versions reach stable storage.  After the disk is
 * updated, the blocks can be safely de-allocated whenever it is convenient.
 * This implementation handles only the common case of reducing a file's
 * length to zero. Other cases are handled by the conventional synchronous
 * write approach.
 *
 * The ffs implementation with which we worked double-checks
 * the state of the block pointers and file size as it reduces
 * a file's length.  Some of this code is replicated here in our
 * soft updates implementation.  The freeblks->fb_chkcnt field is
 * used to transfer a part of this information to the procedure
 * that eventually de-allocates the blocks.
 *
 * This routine should be called from the routine that shortens
 * a file's length, before the inode's size or block pointers
 * are modified. It will save the block pointer information for
 * later release and zero the inode so that the calling routine
 * can release it.
 */
void
softdep_setup_freeblocks(ip, length)
	struct inode *ip;	/* The inode whose length is to be reduced */
	off_t length;		/* The new length for the file */
{
	struct freeblks *freeblks;
	struct inodedep *inodedep;
	struct allocdirect *adp;
	struct vnode *vp;
	struct buf *bp;
	struct fs *fs;
	int i, error;

	fs = ip->i_fs;
	if (length != 0)
1610		panic("softdep_setup_freeblocks: non-zero length");
1611	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1612		M_FREEBLKS, M_WAITOK);
1613	bzero(freeblks, sizeof(struct freeblks));
1614	freeblks->fb_list.wk_type = D_FREEBLKS;
1615	freeblks->fb_uid = ip->i_uid;
1616	freeblks->fb_previousinum = ip->i_number;
1617	freeblks->fb_devvp = ip->i_devvp;
1618	freeblks->fb_fs = fs;
1619	freeblks->fb_oldsize = ip->i_size;
1620	freeblks->fb_newsize = length;
1621	freeblks->fb_chkcnt = ip->i_blocks;
1622	for (i = 0; i < NDADDR; i++) {
1623		freeblks->fb_dblks[i] = ip->i_db[i];
1624		ip->i_db[i] = 0;
1625	}
1626	for (i = 0; i < NIADDR; i++) {
1627		freeblks->fb_iblks[i] = ip->i_ib[i];
1628		ip->i_ib[i] = 0;
1629	}
1630	ip->i_blocks = 0;
1631	ip->i_size = 0;
1632	/*
1633	 * Push the zero'ed inode to its disk buffer so that we are free
1634	 * to delete its dependencies below. Once the dependencies are gone
1635	 * the buffer can be safely released.
1636	 */
1637	if ((error = bread(ip->i_devvp,
1638	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1639	    (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1640		softdep_error("softdep_setup_freeblocks", error);
1641	*((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1642	    ip->i_din;
1643	/*
1644	 * Find and eliminate any inode dependencies.
1645	 */
1646	ACQUIRE_LOCK(&lk);
1647	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1648	if ((inodedep->id_state & IOSTARTED) != 0)
1649		panic("softdep_setup_freeblocks: inode busy");
1650	/*
1651	 * Because the file length has been truncated to zero, any
1652	 * pending block allocation dependency structures associated
1653	 * with this inode are obsolete and can simply be de-allocated.
1654	 * We must first merge the two dependency lists to get rid of
1655	 * any duplicate freefrag structures, then purge the merged list.
1656	 */
1657	merge_inode_lists(inodedep);
1658	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1659		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
1660	FREE_LOCK(&lk);
1661	bdwrite(bp);
1662	/*
1663	 * We must wait for any I/O in progress to finish so that
1664	 * all potential buffers on the dirty list will be visible.
1665	 * Once they are all there, walk the list and get rid of
1666	 * any dependencies.
1667	 */
1668	vp = ITOV(ip);
1669	ACQUIRE_LOCK(&lk);
1670	drain_output(vp, 1);
1671	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1672		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1673		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1674		deallocate_dependencies(bp, inodedep);
1675		bp->b_flags |= B_INVAL | B_NOCACHE;
1676		FREE_LOCK(&lk);
1677		brelse(bp);
1678		ACQUIRE_LOCK(&lk);
1679	}
1680	/*
1681	 * Add the freeblks structure to the list of operations that
1682	 * must await the zero'ed inode being written to disk. If we
1683	 * still have a bitmap dependency, then the inode has never been
1684	 * written to disk, so we can process the freeblks immediately.
1685	 * If the inodedep does not exist, then the zero'ed inode has
1686	 * been written and we can also proceed.
1687	 */
1688	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0 ||
1689	    free_inodedep(inodedep) ||
1690	    (inodedep->id_state & DEPCOMPLETE) == 0) {
1691		FREE_LOCK(&lk);
1692		handle_workitem_freeblocks(freeblks);
1693	} else {
1694		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1695		FREE_LOCK(&lk);
1696	}
1697}
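
/*
 * A reader's summary of the ordering established above: (1) the block
 * pointers are saved in the freeblks structure and zeroed in the
 * in-core inode; (2) the zeroed dinode is pushed into its disk buffer
 * and scheduled with bdwrite(); (3) the now-obsolete allocation
 * dependencies are purged; and (4) only once the zeroed inode is known
 * to be on disk (or to have never been written at all) does
 * handle_workitem_freeblocks() return the blocks to the free maps.
 */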
1698
1699/*
1700 * Reclaim any dependency structures from a buffer that is about to
1701 * be reallocated to a new vnode. The buffer must be locked, thus,
1702 * no I/O completion operations can occur while we are manipulating
1703 * its associated dependencies. The mutex is held so that other I/O's
1704 * associated with related dependencies do not occur.
1705 */
1706static void
1707deallocate_dependencies(bp, inodedep)
1708	struct buf *bp;
1709	struct inodedep *inodedep;
1710{
1711	struct worklist *wk;
1712	struct indirdep *indirdep;
1713	struct allocindir *aip;
1714	struct pagedep *pagedep;
1715	struct dirrem *dirrem;
1716	struct diradd *dap;
1717	int i;
1718
1719	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1720		switch (wk->wk_type) {
1721
1722		case D_INDIRDEP:
1723			indirdep = WK_INDIRDEP(wk);
1724			/*
1725			 * None of the indirect pointers will ever be visible,
1726			 * so they can simply be tossed. GOINGAWAY ensures
1727			 * that allocated pointers will be saved in the buffer
1728			 * cache until they are freed. Note that they will
1729			 * only be able to be found by their physical address
1730			 * since the inode mapping the logical address will
1731			 * be gone. The save buffer used for the safe copy
1732			 * was allocated in setup_allocindir_phase2 using
1733			 * the physical address so it could be used for this
1734			 * purpose. Hence we swap the safe copy with the real
1735			 * copy, allowing the safe copy to be freed and holding
1736			 * on to the real copy for later use in indir_trunc.
1737			 */
1738			if (indirdep->ir_state & GOINGAWAY)
1739				panic("deallocate_dependencies: already gone");
1740			indirdep->ir_state |= GOINGAWAY;
1741			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1742				free_allocindir(aip, inodedep);
1743			if (bp->b_lblkno >= 0 ||
1744			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
1745				panic("deallocate_dependencies: not indir");
1746			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1747			    bp->b_bcount);
1748			WORKLIST_REMOVE(wk);
1749			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1750			continue;
1751
1752		case D_PAGEDEP:
1753			pagedep = WK_PAGEDEP(wk);
1754			/*
1755			 * None of the directory additions will ever be
1756			 * visible, so they can simply be tossed.
1757			 */
1758			for (i = 0; i < DAHASHSZ; i++)
1759				while ((dap =
1760				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
1761					free_diradd(dap);
1762			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1763				free_diradd(dap);
1764			/*
1765			 * Copy any directory remove dependencies to the list
1766			 * to be processed after the zero'ed inode is written.
1767			 * If the inode has already been written, then they
1768			 * can be dumped directly onto the work list.
1769			 */
1770			while ((dirrem =
1771			    LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
1772				LIST_REMOVE(dirrem, dm_next);
1773				dirrem->dm_dirinum = pagedep->pd_ino;
1774				if (inodedep == NULL)
1775					add_to_worklist(&dirrem->dm_list);
1776				else
1777					WORKLIST_INSERT(&inodedep->id_bufwait,
1778					    &dirrem->dm_list);
1779			}
1780			WORKLIST_REMOVE(&pagedep->pd_list);
1781			LIST_REMOVE(pagedep, pd_hash);
1782			WORKITEM_FREE(pagedep, D_PAGEDEP);
1783			continue;
1784
1785		case D_ALLOCINDIR:
1786			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1787			continue;
1788
1789		case D_ALLOCDIRECT:
1790		case D_INODEDEP:
1791			panic("deallocate_dependencies: Unexpected type %s",
1792			    TYPENAME(wk->wk_type));
1793			/* NOTREACHED */
1794
1795		default:
1796			panic("deallocate_dependencies: Unknown type %s",
1797			    TYPENAME(wk->wk_type));
1798			/* NOTREACHED */
1799		}
1800	}
1801}
1802
1803/*
1804 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1805 * This routine must be called with splbio interrupts blocked.
1806 */
1807static void
1808free_allocdirect(adphead, adp, delay)
1809	struct allocdirectlst *adphead;
1810	struct allocdirect *adp;
1811	int delay;
1812{
1813
1814#ifdef DEBUG
1815	if (lk.lkt_held == -1)
1816		panic("free_allocdirect: lock not held");
1817#endif
1818	if ((adp->ad_state & DEPCOMPLETE) == 0)
1819		LIST_REMOVE(adp, ad_deps);
1820	TAILQ_REMOVE(adphead, adp, ad_next);
1821	if ((adp->ad_state & COMPLETE) == 0)
1822		WORKLIST_REMOVE(&adp->ad_list);
1823	if (adp->ad_freefrag != NULL) {
1824		if (delay)
1825			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1826			    &adp->ad_freefrag->ff_list);
1827		else
1828			add_to_worklist(&adp->ad_freefrag->ff_list);
1829	}
1830	WORKITEM_FREE(adp, D_ALLOCDIRECT);
1831}
1832
1833/*
1834 * Prepare an inode to be freed. The actual free operation is not
1835 * done until the zero'ed inode has been written to disk.
1836 */
1837void
1838softdep_freefile(pvp, ino, mode)
1839		struct vnode *pvp;
1840		ino_t ino;
1841		int mode;
1842{
1843	struct inode *ip = VTOI(pvp);
1844	struct inodedep *inodedep;
1845	struct freefile *freefile;
1846
1847	/*
1848	 * This sets up the inode de-allocation dependency.
1849	 */
1850	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1851		M_FREEFILE, M_WAITOK);
1852	freefile->fx_list.wk_type = D_FREEFILE;
1853	freefile->fx_list.wk_state = 0;
1854	freefile->fx_mode = mode;
1855	freefile->fx_oldinum = ino;
1856	freefile->fx_devvp = ip->i_devvp;
1857	freefile->fx_fs = ip->i_fs;
1858
1859	/*
1860	 * If the inodedep does not exist, then the zero'ed inode has
1861	 * been written to disk and we can free the file immediately.
1862	 */
1863	ACQUIRE_LOCK(&lk);
1864	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
1865		FREE_LOCK(&lk);
1866		handle_workitem_freefile(freefile);
1867		return;
1868	}
1869
1870	/*
1871	 * If we still have a bitmap dependency, then the inode has never
1872	 * been written to disk. Drop the dependency as it is no longer
1873	 * necessary since the inode is being deallocated. We set the
1874	 * ALLCOMPLETE flags since the bitmap now properly shows that the
1875	 * inode is not allocated. Even if the inode is actively being
1876	 * written, it has been rolled back to its zero'ed state, so we
1877	 * are ensured that a zero inode is what is on the disk. For short
1878	 * lived files, this change will usually result in removing all the
1879	 * dependencies from the inode so that it can be freed immediately.
1880	 */
1881	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
1882		inodedep->id_state |= ALLCOMPLETE;
1883		LIST_REMOVE(inodedep, id_deps);
1884		inodedep->id_buf = NULL;
1885		WORKLIST_REMOVE(&inodedep->id_list);
1886	}
1887	if (free_inodedep(inodedep) == 0) {
1888		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
1889		FREE_LOCK(&lk);
1890	} else {
1891		FREE_LOCK(&lk);
1892		handle_workitem_freefile(freefile);
1893	}
1894}
1895
1896/*
1897 * Try to free an inodedep structure. Return 1 if it could be freed.
1898 */
1899static int
1900free_inodedep(inodedep)
1901	struct inodedep *inodedep;
1902{
1903
1904	if ((inodedep->id_state & ONWORKLIST) != 0 ||
1905	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
1906	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
1907	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
1908	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
1909	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
1910	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
1911	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
1912		return (0);
1913	LIST_REMOVE(inodedep, id_hash);
1914	WORKITEM_FREE(inodedep, D_INODEDEP);
1915	num_inodedep -= 1;
1916	return (1);
1917}
1918
1919/*
1920 * This workitem routine performs the block de-allocation.
1921 * The workitem is added to the pending list after the updated
1922 * inode block has been written to disk.  As mentioned above,
1923 * checks regarding the number of blocks de-allocated (compared
1924 * to the number of blocks allocated for the file) are also
1925 * performed in this function.
1926 */
1927static void
1928handle_workitem_freeblocks(freeblks)
1929	struct freeblks *freeblks;
1930{
1931	struct inode tip;
1932	ufs_daddr_t bn;
1933	struct fs *fs;
1934	int i, level, bsize;
1935	long nblocks, blocksreleased = 0;
1936	int error, allerror = 0;
1937	ufs_lbn_t baselbns[NIADDR], tmpval;
1938
1939	tip.i_number = freeblks->fb_previousinum;
1940	tip.i_devvp = freeblks->fb_devvp;
1941	tip.i_dev = freeblks->fb_devvp->v_rdev;
1942	tip.i_fs = freeblks->fb_fs;
1943	tip.i_size = freeblks->fb_oldsize;
1944	tip.i_uid = freeblks->fb_uid;
1945	fs = freeblks->fb_fs;
1946	tmpval = 1;
1947	baselbns[0] = NDADDR;
1948	for (i = 1; i < NIADDR; i++) {
1949		tmpval *= NINDIR(fs);
1950		baselbns[i] = baselbns[i - 1] + tmpval;
1951	}
1952	nblocks = btodb(fs->fs_bsize);
1953	blocksreleased = 0;
1954	/*
1955	 * Indirect blocks first.
1956	 */
1957	for (level = (NIADDR - 1); level >= 0; level--) {
1958		if ((bn = freeblks->fb_iblks[level]) == 0)
1959			continue;
1960		if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
1961		    baselbns[level], &blocksreleased)) != 0)
1962			allerror = error;
1963		ffs_blkfree(&tip, bn, fs->fs_bsize);
1964		blocksreleased += nblocks;
1965	}
1966	/*
1967	 * All direct blocks or frags.
1968	 */
1969	for (i = (NDADDR - 1); i >= 0; i--) {
1970		if ((bn = freeblks->fb_dblks[i]) == 0)
1971			continue;
1972		bsize = blksize(fs, &tip, i);
1973		ffs_blkfree(&tip, bn, bsize);
1974		blocksreleased += btodb(bsize);
1975	}
1976
1977#ifdef DIAGNOSTIC
1978	if (freeblks->fb_chkcnt != blocksreleased)
1979		panic("handle_workitem_freeblocks: block count");
1980	if (allerror)
1981		softdep_error("handle_workitem_freeblocks", allerror);
1982#endif /* DIAGNOSTIC */
1983	WORKITEM_FREE(freeblks, D_FREEBLKS);
1984}
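
/*
 * A worked example of the baselbns[] computation above, assuming 8K
 * blocks and 4-byte disk addresses so that NINDIR(fs) == 2048 and
 * NDADDR == 12: baselbns[0] == 12 (first logical block reached through
 * the single indirect), baselbns[1] == 12 + 2048 == 2060 (double
 * indirect), and baselbns[2] == 2060 + 2048 * 2048 == 4196364 (triple
 * indirect). These are the starting logical block numbers handed to
 * indir_trunc() for each level.
 */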
1985
1986/*
1987 * Release blocks associated with the inode ip and stored in the indirect
1988 * block dbn. If level is greater than SINGLE, the block is an indirect block
1989 * and recursive calls to indir_trunc must be used to cleanse other indirect
1990 * blocks.
1991 */
1992static int
1993indir_trunc(ip, dbn, level, lbn, countp)
1994	struct inode *ip;
1995	ufs_daddr_t dbn;
1996	int level;
1997	ufs_lbn_t lbn;
1998	long *countp;
1999{
2000	struct buf *bp;
2001	ufs_daddr_t *bap;
2002	ufs_daddr_t nb;
2003	struct fs *fs;
2004	struct worklist *wk;
2005	struct indirdep *indirdep;
2006	int i, lbnadd, nblocks;
2007	int error, allerror = 0;
2008
2009	fs = ip->i_fs;
2010	lbnadd = 1;
2011	for (i = level; i > 0; i--)
2012		lbnadd *= NINDIR(fs);
2013	/*
2014	 * Get buffer of block pointers to be freed. This routine is not
2015	 * called until the zero'ed inode has been written, so it is safe
2016	 * to free blocks as they are encountered. Because the inode has
2017	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2018	 * have to use the on-disk address and the block device for the
2019	 * filesystem to look them up. If the file was deleted before its
2020	 * indirect blocks were all written to disk, the routine that set
2021	 * us up (deallocate_dependencies) will have arranged to leave
2022	 * a complete copy of the indirect block in memory for our use.
2023	 * Otherwise we have to read the blocks in from the disk.
2024	 */
2025	ACQUIRE_LOCK(&lk);
2026	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2027	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2028		if (wk->wk_type != D_INDIRDEP ||
2029		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2030		    (indirdep->ir_state & GOINGAWAY) == 0)
2031			panic("indir_trunc: lost indirdep");
2032		WORKLIST_REMOVE(wk);
2033		WORKITEM_FREE(indirdep, D_INDIRDEP);
2034		if (LIST_FIRST(&bp->b_dep) != NULL)
2035			panic("indir_trunc: dangling dep");
2036		FREE_LOCK(&lk);
2037	} else {
2038		FREE_LOCK(&lk);
2039		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2040		if (error)
2041			return (error);
2042	}
2043	/*
2044	 * Recursively free indirect blocks.
2045	 */
2046	bap = (ufs_daddr_t *)bp->b_data;
2047	nblocks = btodb(fs->fs_bsize);
2048	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2049		if ((nb = bap[i]) == 0)
2050			continue;
2051		if (level != 0) {
2052			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2053			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2054				allerror = error;
2055		}
2056		ffs_blkfree(ip, nb, fs->fs_bsize);
2057		*countp += nblocks;
2058	}
2059	bp->b_flags |= B_INVAL | B_NOCACHE;
2060	brelse(bp);
2061	return (allerror);
2062}
2063
2064/*
2065 * Free an allocindir.
2066 * This routine must be called with splbio interrupts blocked.
2067 */
2068static void
2069free_allocindir(aip, inodedep)
2070	struct allocindir *aip;
2071	struct inodedep *inodedep;
2072{
2073	struct freefrag *freefrag;
2074
2075#ifdef DEBUG
2076	if (lk.lkt_held == -1)
2077		panic("free_allocindir: lock not held");
2078#endif
2079	if ((aip->ai_state & DEPCOMPLETE) == 0)
2080		LIST_REMOVE(aip, ai_deps);
2081	if (aip->ai_state & ONWORKLIST)
2082		WORKLIST_REMOVE(&aip->ai_list);
2083	LIST_REMOVE(aip, ai_next);
2084	if ((freefrag = aip->ai_freefrag) != NULL) {
2085		if (inodedep == NULL)
2086			add_to_worklist(&freefrag->ff_list);
2087		else
2088			WORKLIST_INSERT(&inodedep->id_bufwait,
2089			    &freefrag->ff_list);
2090	}
2091	WORKITEM_FREE(aip, D_ALLOCINDIR);
2092}
2093
2094/*
2095 * Directory entry addition dependencies.
2096 *
2097 * When adding a new directory entry, the inode (with its incremented link
2098 * count) must be written to disk before the directory entry's pointer to it.
2099 * Also, if the inode is newly allocated, the corresponding freemap must be
2100 * updated (on disk) before the directory entry's pointer. These requirements
2101 * are met via undo/redo on the directory entry's pointer, which consists
2102 * simply of the inode number.
2103 *
2104 * As directory entries are added and deleted, the free space within a
2105 * directory block can become fragmented.  The ufs file system will compact
2106 * a fragmented directory block to make space for a new entry. When this
2107 * occurs, the offsets of previously added entries change. Any "diradd"
2108 * dependency structures corresponding to these entries must be updated with
2109 * the new offsets.
2110 */
2111
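/*
 * An illustrative timeline of the undo/redo described above: a new
 * entry naming inode I is added to its directory block, but if that
 * block is written before I's initialized inode and bitmap,
 * initiate_write_filepage() rolls the entry's d_ino back to zero for
 * the duration of each such write; on completion,
 * handle_written_filepage() rolls it forward again and redirties the
 * block, repeating until the inode dependencies are satisfied and the
 * entry can reach the disk intact.
 */
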
2112/*
2113 * This routine is called after the in-memory inode's link
2114 * count has been incremented, but before the directory entry's
2115 * pointer to the inode has been set.
2116 */
2117void
2118softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2119	struct buf *bp;		/* buffer containing directory block */
2120	struct inode *dp;	/* inode for directory */
2121	off_t diroffset;	/* offset of new entry in directory */
2122	long newinum;		/* inode referenced by new directory entry */
2123	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2124{
2125	int offset;		/* offset of new entry within directory block */
2126	ufs_lbn_t lbn;		/* block in directory containing new entry */
2127	struct fs *fs;
2128	struct diradd *dap;
2129	struct pagedep *pagedep;
2130	struct inodedep *inodedep;
2131	struct mkdir *mkdir1, *mkdir2;
2132
2133	/*
2134	 * Whiteouts have no dependencies.
2135	 */
2136	if (newinum == WINO) {
2137		if (newdirbp != NULL)
2138			bdwrite(newdirbp);
2139		return;
2140	}
2141
2142	fs = dp->i_fs;
2143	lbn = lblkno(fs, diroffset);
2144	offset = blkoff(fs, diroffset);
2145	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
2146	bzero(dap, sizeof(struct diradd));
2147	dap->da_list.wk_type = D_DIRADD;
2148	dap->da_offset = offset;
2149	dap->da_newinum = newinum;
2150	dap->da_state = ATTACHED;
2151	if (newdirbp == NULL) {
2152		dap->da_state |= DEPCOMPLETE;
2153		ACQUIRE_LOCK(&lk);
2154	} else {
2155		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2156		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2157		    M_WAITOK);
2158		mkdir1->md_list.wk_type = D_MKDIR;
2159		mkdir1->md_state = MKDIR_BODY;
2160		mkdir1->md_diradd = dap;
2161		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2162		    M_WAITOK);
2163		mkdir2->md_list.wk_type = D_MKDIR;
2164		mkdir2->md_state = MKDIR_PARENT;
2165		mkdir2->md_diradd = dap;
2166		/*
2167		 * Dependency on "." and ".." being written to disk.
2168		 */
2169		mkdir1->md_buf = newdirbp;
2170		ACQUIRE_LOCK(&lk);
2171		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2172		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2173		FREE_LOCK(&lk);
2174		bdwrite(newdirbp);
2175		/*
2176		 * Dependency on link count increase for parent directory
2177		 */
2178		ACQUIRE_LOCK(&lk);
2179		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2180		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2181			dap->da_state &= ~MKDIR_PARENT;
2182			WORKITEM_FREE(mkdir2, D_MKDIR);
2183		} else {
2184			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2185			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2186		}
2187	}
2188	/*
2189	 * Link into parent directory pagedep to await its being written.
2190	 */
2191	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2192		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2193	dap->da_pagedep = pagedep;
2194	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2195	    da_pdlist);
2196	/*
2197	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2198	 * is not yet written. If it is written, do the post-inode write
2199	 * processing to put it on the id_pendinghd list.
2200	 */
2201	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2202	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2203		diradd_inode_written(dap, inodedep);
2204	else
2205		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2206	FREE_LOCK(&lk);
2207}
2208
2209/*
2210 * This procedure is called to change the offset of a directory
2211 * entry when compacting a directory block, which must be owned
2212 * exclusively by the caller. Note that the actual entry movement
2213 * must be done in this procedure to ensure that no I/O completions
2214 * occur while the move is in progress.
2215 */
2216void
2217softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2218	struct inode *dp;	/* inode for directory */
2219	caddr_t base;		/* address of dp->i_offset */
2220	caddr_t oldloc;		/* address of old directory location */
2221	caddr_t newloc;		/* address of new directory location */
2222	int entrysize;		/* size of directory entry */
2223{
2224	int offset, oldoffset, newoffset;
2225	struct pagedep *pagedep;
2226	struct diradd *dap;
2227	ufs_lbn_t lbn;
2228
2229	ACQUIRE_LOCK(&lk);
2230	lbn = lblkno(dp->i_fs, dp->i_offset);
2231	offset = blkoff(dp->i_fs, dp->i_offset);
2232	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2233		goto done;
2234	oldoffset = offset + (oldloc - base);
2235	newoffset = offset + (newloc - base);
2236	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
2237	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2238		if (dap->da_offset != oldoffset)
2239			continue;
2240		dap->da_offset = newoffset;
2241		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2242			break;
2243		LIST_REMOVE(dap, da_pdlist);
2244		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2245		    dap, da_pdlist);
2246		break;
2247	}
2248	if (dap == NULL) {
2249		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2250		     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2251			if (dap->da_offset == oldoffset) {
2252				dap->da_offset = newoffset;
2253				break;
2254			}
2255		}
2256	}
2257done:
2258	bcopy(oldloc, newloc, entrysize);
2259	FREE_LOCK(&lk);
2260}
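
/*
 * Example of the offset arithmetic above: da_offset values are byte
 * offsets within the directory block. If blkoff(fs, dp->i_offset) is
 * 256 and compaction slides an entry from (base + 56) to (base + 40),
 * then oldoffset == 256 + 56 == 312 and newoffset == 256 + 40 == 296;
 * a diradd recorded at 312 is updated to 296 (and rehashed if
 * DIRADDHASH differs) before the entry bytes themselves are moved by
 * the bcopy() above.
 */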
2261
2262/*
2263 * Free a diradd dependency structure. This routine must be called
2264 * with splbio interrupts blocked.
2265 */
2266static void
2267free_diradd(dap)
2268	struct diradd *dap;
2269{
2270	struct dirrem *dirrem;
2271	struct pagedep *pagedep;
2272	struct inodedep *inodedep;
2273	struct mkdir *mkdir, *nextmd;
2274
2275#ifdef DEBUG
2276	if (lk.lkt_held == -1)
2277		panic("free_diradd: lock not held");
2278#endif
2279	WORKLIST_REMOVE(&dap->da_list);
2280	LIST_REMOVE(dap, da_pdlist);
2281	if ((dap->da_state & DIRCHG) == 0) {
2282		pagedep = dap->da_pagedep;
2283	} else {
2284		dirrem = dap->da_previous;
2285		pagedep = dirrem->dm_pagedep;
2286		dirrem->dm_dirinum = pagedep->pd_ino;
2287		add_to_worklist(&dirrem->dm_list);
2288	}
2289	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2290	    0, &inodedep) != 0)
2291		(void) free_inodedep(inodedep);
2292	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2293		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2294			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2295			if (mkdir->md_diradd != dap)
2296				continue;
2297			dap->da_state &= ~mkdir->md_state;
2298			WORKLIST_REMOVE(&mkdir->md_list);
2299			LIST_REMOVE(mkdir, md_mkdirs);
2300			WORKITEM_FREE(mkdir, D_MKDIR);
2301		}
2302		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
2303			panic("free_diradd: unfound ref");
2304	}
2305	WORKITEM_FREE(dap, D_DIRADD);
2306}
2307
2308/*
2309 * Directory entry removal dependencies.
2310 *
2311 * When removing a directory entry, the entry's inode pointer must be
2312 * zero'ed on disk before the corresponding inode's link count is decremented
2313 * (possibly freeing the inode for re-use). This dependency is handled by
2314 * updating the directory entry but delaying the inode count reduction until
2315 * after the directory block has been written to disk. After this point, the
2316 * inode count can be decremented whenever it is convenient.
2317 */
2318
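/*
 * For example: "rm foo" clears foo's entry in the directory block and
 * queues a dirrem; only after that block reaches the disk, so that no
 * on-disk entry references the inode, does handle_workitem_remove()
 * decrement the inode's link count and, if the count reaches zero,
 * arrange for the inode and its blocks to be freed.
 */
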
2319/*
2320 * This routine should be called immediately after removing
2321 * a directory entry.  The inode's link count should not be
2322 * decremented by the calling procedure -- the soft updates
2323 * code will do this task when it is safe.
2324 */
2325void
2326softdep_setup_remove(bp, dp, ip, isrmdir)
2327	struct buf *bp;		/* buffer containing directory block */
2328	struct inode *dp;	/* inode for the directory being modified */
2329	struct inode *ip;	/* inode for directory entry being removed */
2330	int isrmdir;		/* indicates if doing RMDIR */
2331{
2332	struct dirrem *dirrem;
2333
2334	/*
2335	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2336	 */
2337	dirrem = newdirrem(bp, dp, ip, isrmdir);
2338	if ((dirrem->dm_state & COMPLETE) == 0) {
2339		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2340		    dm_next);
2341		FREE_LOCK(&lk);
2342	} else {
2343		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2344		FREE_LOCK(&lk);
2345		handle_workitem_remove(dirrem);
2346	}
2347}
2348
2349/*
2350 * Allocate a new dirrem if appropriate and return it along with
2351 * its associated pagedep. Called without a lock, returns with lock.
2352 */
2353static long num_dirrem;		/* number of dirrem allocated */
2354static struct dirrem *
2355newdirrem(bp, dp, ip, isrmdir)
2356	struct buf *bp;		/* buffer containing directory block */
2357	struct inode *dp;	/* inode for the directory being modified */
2358	struct inode *ip;	/* inode for directory entry being removed */
2359	int isrmdir;		/* indicates if doing RMDIR */
2360{
2361	int offset;
2362	ufs_lbn_t lbn;
2363	struct diradd *dap;
2364	struct dirrem *dirrem;
2365	struct pagedep *pagedep;
2366
2367	/*
2368	 * Whiteouts have no deletion dependencies.
2369	 */
2370	if (ip == NULL)
2371		panic("newdirrem: whiteout");
2372	/*
2373	 * If we are over our limit, try to improve the situation.
2374	 * Limiting the number of dirrem structures will also limit
2375	 * the number of freefile and freeblks structures.
2376	 */
2377	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
2378		(void) request_cleanup(FLUSH_REMOVE, 0);
2379	num_dirrem += 1;
2380	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2381		M_DIRREM, M_WAITOK);
2382	bzero(dirrem, sizeof(struct dirrem));
2383	dirrem->dm_list.wk_type = D_DIRREM;
2384	dirrem->dm_state = isrmdir ? RMDIR : 0;
2385	dirrem->dm_mnt = ITOV(ip)->v_mount;
2386	dirrem->dm_oldinum = ip->i_number;
2387
2388	ACQUIRE_LOCK(&lk);
2389	lbn = lblkno(dp->i_fs, dp->i_offset);
2390	offset = blkoff(dp->i_fs, dp->i_offset);
2391	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2392		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2393	dirrem->dm_pagedep = pagedep;
2394	/*
2395	 * Check for a diradd dependency for the same directory entry.
2396	 * If present, then both dependencies become obsolete and can
2397	 * be de-allocated. Check for an entry on both the pd_diraddhd
2398	 * list and the pd_pendinghd list.
2399	 */
2400	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
2401	     dap; dap = LIST_NEXT(dap, da_pdlist))
2402		if (dap->da_offset == offset)
2403			break;
2404	if (dap == NULL) {
2405		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2406		     dap; dap = LIST_NEXT(dap, da_pdlist))
2407			if (dap->da_offset == offset)
2408				break;
2409		if (dap == NULL)
2410			return (dirrem);
2411	}
2412	/*
2413	 * Must be ATTACHED at this point, so just delete it.
2414	 */
2415	if ((dap->da_state & ATTACHED) == 0)
2416		panic("newdirrem: not ATTACHED");
2417	if (dap->da_newinum != ip->i_number)
2418		panic("newdirrem: inum %d should be %d",
2419		    ip->i_number, dap->da_newinum);
2420	free_diradd(dap);
2421	dirrem->dm_state |= COMPLETE;
2422	return (dirrem);
2423}
2424
2425/*
2426 * Directory entry change dependencies.
2427 *
2428 * Changing an existing directory entry requires that an add operation
2429 * be completed first followed by a deletion. The semantics for the addition
2430 * are identical to the description of adding a new entry above except
2431 * that the rollback is to the old inode number rather than zero. Once
2432 * the addition dependency is completed, the removal is done as described
2433 * in the removal routine above.
2434 */
2435
2436/*
2437 * This routine should be called immediately after changing
2438 * a directory entry.  The inode's link count should not be
2439 * decremented by the calling procedure -- the soft updates
2440 * code will perform this task when it is safe.
2441 */
2442void
2443softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2444	struct buf *bp;		/* buffer containing directory block */
2445	struct inode *dp;	/* inode for the directory being modified */
2446	struct inode *ip;	/* inode for directory entry being removed */
2447	long newinum;		/* new inode number for changed entry */
2448	int isrmdir;		/* indicates if doing RMDIR */
2449{
2450	int offset;
2451	struct diradd *dap = NULL;
2452	struct dirrem *dirrem;
2453	struct pagedep *pagedep;
2454	struct inodedep *inodedep;
2455
2456	offset = blkoff(dp->i_fs, dp->i_offset);
2457
2458	/*
2459	 * Whiteouts do not need diradd dependencies.
2460	 */
2461	if (newinum != WINO) {
2462		MALLOC(dap, struct diradd *, sizeof(struct diradd),
2463		    M_DIRADD, M_WAITOK);
2464		bzero(dap, sizeof(struct diradd));
2465		dap->da_list.wk_type = D_DIRADD;
2466		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2467		dap->da_offset = offset;
2468		dap->da_newinum = newinum;
2469	}
2470
2471	/*
2472	 * Allocate a new dirrem and ACQUIRE_LOCK.
2473	 */
2474	dirrem = newdirrem(bp, dp, ip, isrmdir);
2475	pagedep = dirrem->dm_pagedep;
2476	/*
2477	 * The possible values for isrmdir:
2478	 *	0 - non-directory file rename
2479	 *	1 - directory rename within same directory
2480	 *   inum - directory rename to new directory of given inode number
2481	 * When renaming to a new directory, we are both deleting and
2482	 * creating a new directory entry, so the link count on the new
2483	 * directory should not change. Thus we do not need the followup
2484	 * dirrem which is usually done in handle_workitem_remove. We set
2485	 * the DIRCHG flag to tell handle_workitem_remove to skip the
2486	 * followup dirrem.
2487	 */
2488	if (isrmdir > 1)
2489		dirrem->dm_state |= DIRCHG;
2490
2491	/*
2492	 * Whiteouts have no additional dependencies,
2493	 * so just put the dirrem on the correct list.
2494	 */
2495	if (newinum == WINO) {
2496		if ((dirrem->dm_state & COMPLETE) == 0) {
2497			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2498			    dm_next);
2499		} else {
2500			dirrem->dm_dirinum = pagedep->pd_ino;
2501			add_to_worklist(&dirrem->dm_list);
2502		}
2503		FREE_LOCK(&lk);
2504		return;
2505	}
2506
2507	/*
2508	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2509	 * is not yet written. If it is written, do the post-inode write
2510	 * processing to put it on the id_pendinghd list.
2511	 */
2512	dap->da_previous = dirrem;
2513	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2514	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2515		dap->da_state |= COMPLETE;
2516		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2517		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2518	} else {
2519		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2520		    dap, da_pdlist);
2521		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2522	}
2523	/*
2524	 * If the previous inode was never written or its previous directory
2525	 * entry was never written, then we do not want to roll back to this
2526	 * previous value. Instead we want to roll back to zero and immediately
2527	 * free the unwritten or unreferenced inode.
2528	 */
2529	if (dirrem->dm_state & COMPLETE) {
2530		dap->da_state &= ~DIRCHG;
2531		dap->da_pagedep = pagedep;
2532		dirrem->dm_dirinum = pagedep->pd_ino;
2533		add_to_worklist(&dirrem->dm_list);
2534	}
2535	FREE_LOCK(&lk);
2536}
2537
2538/*
2539 * Called whenever the link count on an inode is changed.
2540 * It creates an inode dependency so that the new reference(s)
2541 * to the inode cannot be committed to disk until the updated
2542 * inode has been written.
2543 */
2544void
2545softdep_change_linkcnt(ip)
2546	struct inode *ip;	/* the inode with the increased link count */
2547{
2548	struct inodedep *inodedep;
2549
2550	ACQUIRE_LOCK(&lk);
2551	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2552	if (ip->i_nlink < ip->i_effnlink)
2553		panic("softdep_change_linkcnt: bad delta");
2554	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2555	FREE_LOCK(&lk);
2556}
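
/*
 * A concrete reading of id_nlinkdelta as maintained above: i_effnlink
 * is adjusted as soon as an operation is performed in memory, while
 * i_nlink is not decremented until the change is safe to commit (see
 * handle_workitem_remove below), so the non-negative delta counts the
 * removals that cannot yet go to disk. For example, unlinking a file
 * whose counts were both 1 leaves i_effnlink == 0, i_nlink == 1, and
 * id_nlinkdelta == 1 until the cleared directory entry is on disk.
 */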
2557
2558/*
2559 * This workitem decrements the inode's link count.
2560 * If the link count reaches zero, the file is removed.
2561 */
2562static void
2563handle_workitem_remove(dirrem)
2564	struct dirrem *dirrem;
2565{
2566	struct proc *p = CURPROC;	/* XXX */
2567	struct inodedep *inodedep;
2568	struct vnode *vp;
2569	struct inode *ip;
2570	ino_t oldinum;
2571	int error;
2572
2573	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2574		softdep_error("handle_workitem_remove: vget", error);
2575		return;
2576	}
2577	ip = VTOI(vp);
2578	ACQUIRE_LOCK(&lk);
2579	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
2580		panic("handle_workitem_remove: lost inodedep");
2581	/*
2582	 * Normal file deletion.
2583	 */
2584	if ((dirrem->dm_state & RMDIR) == 0) {
2585		ip->i_nlink--;
2586		ip->i_flag |= IN_CHANGE;
2587		if (ip->i_nlink < ip->i_effnlink)
2588			panic("handle_workitem_remove: bad file delta");
2589		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2590		FREE_LOCK(&lk);
2591		vput(vp);
2592		num_dirrem -= 1;
2593		WORKITEM_FREE(dirrem, D_DIRREM);
2594		return;
2595	}
2596	/*
2597	 * Directory deletion. Decrement reference count for both the
2598	 * just deleted parent directory entry and the reference for ".".
2599	 * Next truncate the directory to length zero. When the
2600	 * truncation completes, arrange to have the reference count on
2601	 * the parent decremented to account for the loss of "..".
2602	 */
2603	ip->i_nlink -= 2;
2604	ip->i_flag |= IN_CHANGE;
2605	if (ip->i_nlink < ip->i_effnlink)
2606		panic("handle_workitem_remove: bad dir delta");
2607	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2608	FREE_LOCK(&lk);
2609	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2610		softdep_error("handle_workitem_remove: truncate", error);
2611	/*
2612	 * Rename a directory to a new parent. Since we are both deleting
2613	 * and creating a new directory entry, the link count on the new
2614	 * directory should not change. Thus we skip the followup dirrem.
2615	 */
2616	if (dirrem->dm_state & DIRCHG) {
2617		vput(vp);
2618		num_dirrem -= 1;
2619		WORKITEM_FREE(dirrem, D_DIRREM);
2620		return;
2621	}
2622	/*
2623	 * If there is no inode dependency then we can free immediately.
2624	 * If we still have a bitmap dependency, then the inode has never
2625	 * been written to disk. Drop the dependency as it is no longer
2626	 * necessary since the inode is being deallocated. We set the
2627	 * ALLCOMPLETE flags since the bitmap now properly shows that the
2628	 * inode is not allocated. Even if the inode is actively being
2629	 * written, it has been rolled back to its zero'ed state, so we
2630	 * are ensured that a zero inode is what is on the disk. For short
2631	 * lived files, this change will usually result in removing all the
2632	 * dependencies from the inode so that it can be freed immediately.
2633	 */
2634	ACQUIRE_LOCK(&lk);
2635	dirrem->dm_state = 0;
2636	oldinum = dirrem->dm_oldinum;
2637	dirrem->dm_oldinum = dirrem->dm_dirinum;
2638	if ((inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep)) == 0)
2639		goto out;
2640	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2641		inodedep->id_state |= ALLCOMPLETE;
2642		LIST_REMOVE(inodedep, id_deps);
2643		inodedep->id_buf = NULL;
2644		WORKLIST_REMOVE(&inodedep->id_list);
2645	}
2646	if (free_inodedep(inodedep) == 0) {
2647		WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2648		FREE_LOCK(&lk);
2649		vput(vp);
2650		return;
2651	}
2652out:
2653	FREE_LOCK(&lk);
2654	vput(vp);
2655	handle_workitem_remove(dirrem);
2656}
2657
2658/*
2659 * Inode de-allocation dependencies.
2660 *
2661 * When an inode's link count is reduced to zero, it can be de-allocated. We
2662 * found it convenient to postpone de-allocation until after the inode is
2663 * written to disk with its new link count (zero).  At this point, all of the
2664 * on-disk inode's block pointers are nullified and, with careful dependency
2665 * list ordering, all dependencies related to the inode will be satisfied and
2666 * the corresponding dependency structures de-allocated.  So, if/when the
2667 * inode is reused, there will be no mixing of old dependencies with new
2668 * ones.  This artificial dependency is set up by the block de-allocation
2669 * procedure above (softdep_setup_freeblocks) and completed by the
2670 * following procedure.
2671 */
2672static void
2673handle_workitem_freefile(freefile)
2674	struct freefile *freefile;
2675{
2676	struct vnode vp;
2677	struct inode tip;
2678	struct inodedep *idp;
2679	int error;
2680
2681#ifdef DEBUG
2682	ACQUIRE_LOCK(&lk);
2683	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
2684		panic("handle_workitem_freefile: inodedep survived");
2685	FREE_LOCK(&lk);
2686#endif
2687	tip.i_devvp = freefile->fx_devvp;
2688	tip.i_dev = freefile->fx_devvp->v_rdev;
2689	tip.i_fs = freefile->fx_fs;
2690	vp.v_data = &tip;
2691	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2692		softdep_error("handle_workitem_freefile", error);
2693	WORKITEM_FREE(freefile, D_FREEFILE);
2694}
2695
2696/*
2697 * Disk writes.
2698 *
2699 * The dependency structures constructed above are most actively used when file
2700 * system blocks are written to disk.  No constraints are placed on when a
2701 * block can be written, but unsatisfied update dependencies are made safe by
2702 * modifying (or replacing) the source memory for the duration of the disk
2703 * write.  When the disk write completes, the memory block is again brought
2704 * up-to-date.
2705 *
2706 * In-core inode structure reclamation.
2707 *
2708 * Because there are a finite number of "in-core" inode structures, they are
2709 * reused regularly.  By transferring all inode-related dependencies to the
2710 * in-memory inode block and indexing them separately (via "inodedep"s), we
2711 * can allow "in-core" inode structures to be reused at any time and avoid
2712 * any increase in contention.
2713 *
2714 * Called just before entering the device driver to initiate a new disk I/O.
2715 * The buffer must be locked, thus, no I/O completion operations can occur
2716 * while we are manipulating its associated dependencies.
2717 */
2718static void
2719softdep_disk_io_initiation(bp)
2720	struct buf *bp;		/* structure describing disk write to occur */
2721{
2722	struct worklist *wk, *nextwk;
2723	struct indirdep *indirdep;
2724
2725	/*
2726	 * We only care about write operations. There should never
2727	 * be dependencies for reads.
2728	 */
2729	if (bp->b_flags & B_READ)
2730		panic("softdep_disk_io_initiation: read");
2731	/*
2732	 * Do any necessary pre-I/O processing.
2733	 */
2734	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2735		nextwk = LIST_NEXT(wk, wk_list);
2736		switch (wk->wk_type) {
2737
2738		case D_PAGEDEP:
2739			initiate_write_filepage(WK_PAGEDEP(wk), bp);
2740			continue;
2741
2742		case D_INODEDEP:
2743			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2744			continue;
2745
2746		case D_INDIRDEP:
2747			indirdep = WK_INDIRDEP(wk);
2748			if (indirdep->ir_state & GOINGAWAY)
2749				panic("disk_io_initiation: indirdep gone");
2750			/*
2751			 * If there are no remaining dependencies, this
2752			 * will be writing the real pointers, so the
2753			 * dependency can be freed.
2754			 */
2755			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2756				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2757				brelse(indirdep->ir_savebp);
2758				/* inline expand WORKLIST_REMOVE(wk); */
2759				wk->wk_state &= ~ONWORKLIST;
2760				LIST_REMOVE(wk, wk_list);
2761				WORKITEM_FREE(indirdep, D_INDIRDEP);
2762				continue;
2763			}
2764			/*
2765			 * Replace up-to-date version with safe version.
2766			 */
2767			ACQUIRE_LOCK(&lk);
2768			indirdep->ir_state &= ~ATTACHED;
2769			indirdep->ir_state |= UNDONE;
2770			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2771			    M_INDIRDEP, M_WAITOK);
2772			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2773			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2774			    bp->b_bcount);
2775			FREE_LOCK(&lk);
2776			continue;
2777
2778		case D_MKDIR:
2779		case D_BMSAFEMAP:
2780		case D_ALLOCDIRECT:
2781		case D_ALLOCINDIR:
2782			continue;
2783
2784		default:
2785			panic("handle_disk_io_initiation: Unexpected type %s",
2786			    TYPENAME(wk->wk_type));
2787			/* NOTREACHED */
2788		}
2789	}
2790}
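
/*
 * A reader's summary of the D_INDIRDEP case above: ir_savebp holds a
 * copy of the indirect block containing only disk-safe pointers, so
 * before the write the up-to-date contents are parked in ir_saveddata
 * and the safe copy is substituted into bp->b_data. On completion,
 * softdep_disk_write_complete() copies ir_saveddata back, reattaches
 * the dependency, and redirties the buffer so that the real pointers
 * go out once their dependencies are satisfied.
 */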
2791
2792/*
2793 * Called from within the procedure above to deal with unsatisfied
2794 * allocation dependencies in a directory. The buffer must be locked,
2795 * thus, no I/O completion operations can occur while we are
2796 * manipulating its associated dependencies.
2797 */
2798static void
2799initiate_write_filepage(pagedep, bp)
2800	struct pagedep *pagedep;
2801	struct buf *bp;
2802{
2803	struct diradd *dap;
2804	struct direct *ep;
2805	int i;
2806
2807	if (pagedep->pd_state & IOSTARTED) {
2808		/*
2809		 * This can only happen if there is a driver that does not
2810		 * understand chaining. Here biodone will reissue the call
2811		 * to strategy for the incomplete buffers.
2812		 */
2813		printf("initiate_write_filepage: already started\n");
2814		return;
2815	}
2816	pagedep->pd_state |= IOSTARTED;
2817	ACQUIRE_LOCK(&lk);
2818	for (i = 0; i < DAHASHSZ; i++) {
2819		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
2820		     dap = LIST_NEXT(dap, da_pdlist)) {
2821			ep = (struct direct *)
2822			    ((char *)bp->b_data + dap->da_offset);
2823			if (ep->d_ino != dap->da_newinum)
2824				panic("%s: dir inum %d != new %d",
2825				    "initiate_write_filepage",
2826				    ep->d_ino, dap->da_newinum);
2827			if (dap->da_state & DIRCHG)
2828				ep->d_ino = dap->da_previous->dm_oldinum;
2829			else
2830				ep->d_ino = 0;
2831			dap->da_state &= ~ATTACHED;
2832			dap->da_state |= UNDONE;
2833		}
2834	}
2835	FREE_LOCK(&lk);
2836}
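
/*
 * Worked example of the rollback above: suppose a new entry for inode
 * 37 has been added but inode 37's initialized dinode is not yet on
 * disk. This directory block is then written with the entry's d_ino
 * rolled back to 0 (or, in the DIRCHG case, to the previous inode
 * number), so a crash during the window can at worst leave an
 * unreferenced inode, never an entry naming an uninitialized one.
 */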
2837
2838/*
2839 * Called from within the procedure above to deal with unsatisfied
2840 * allocation dependencies in an inodeblock. The buffer must be
2841 * locked, thus, no I/O completion operations can occur while we
2842 * are manipulating its associated dependencies.
2843 */
2844static void
2845initiate_write_inodeblock(inodedep, bp)
2846	struct inodedep *inodedep;
2847	struct buf *bp;			/* The inode block */
2848{
2849	struct allocdirect *adp, *lastadp;
2850	struct dinode *dp;
2851	struct fs *fs;
2852	ufs_lbn_t prevlbn = 0;
2853	int i, deplist;
2854
2855	if (inodedep->id_state & IOSTARTED)
2856		panic("initiate_write_inodeblock: already started");
2857	inodedep->id_state |= IOSTARTED;
2858	fs = inodedep->id_fs;
2859	dp = (struct dinode *)bp->b_data +
2860	    ino_to_fsbo(fs, inodedep->id_ino);
2861	/*
2862	 * If the bitmap is not yet written, then the allocated
2863	 * inode cannot be written to disk.
2864	 */
2865	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2866		if (inodedep->id_savedino != NULL)
2867			panic("initiate_write_inodeblock: already doing I/O");
2868		MALLOC(inodedep->id_savedino, struct dinode *,
2869		    sizeof(struct dinode), M_INODEDEP, M_WAITOK);
2870		*inodedep->id_savedino = *dp;
2871		bzero((caddr_t)dp, sizeof(struct dinode));
2872		return;
2873	}
2874	/*
2875	 * If no dependencies, then there is nothing to roll back.
2876	 */
2877	inodedep->id_savedsize = dp->di_size;
2878	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
2879		return;
2880	/*
2881	 * Set the dependencies to busy.
2882	 */
2883	ACQUIRE_LOCK(&lk);
2884	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2885	     adp = TAILQ_NEXT(adp, ad_next)) {
2886#ifdef DIAGNOSTIC
2887		if (deplist != 0 && prevlbn >= adp->ad_lbn)
2888			panic("softdep_write_inodeblock: lbn order");
2889		prevlbn = adp->ad_lbn;
2890		if (adp->ad_lbn < NDADDR &&
2891		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
2892			panic("%s: direct pointer #%ld mismatch %d != %d",
2893			    "softdep_write_inodeblock", adp->ad_lbn,
2894			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
2895		if (adp->ad_lbn >= NDADDR &&
2896		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
2897			panic("%s: indirect pointer #%ld mismatch %d != %d",
2898			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
2899			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
2900		deplist |= 1 << adp->ad_lbn;
2901		if ((adp->ad_state & ATTACHED) == 0)
2902			panic("softdep_write_inodeblock: Unknown state 0x%x",
2903			    adp->ad_state);
2904#endif /* DIAGNOSTIC */
2905		adp->ad_state &= ~ATTACHED;
2906		adp->ad_state |= UNDONE;
2907	}
2908	/*
2909	 * The on-disk inode cannot claim to be any larger than the last
2910	 * fragment that has been written. Otherwise, the on-disk inode
2911	 * might have fragments that were not the last block in the file,
2912	 * which would corrupt the filesystem.
2913	 */
2914	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2915	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
2916		if (adp->ad_lbn >= NDADDR)
2917			break;
2918		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
2919		/* keep going until hitting a rollback to a frag */
2920		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
2921			continue;
2922		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
2923		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
2924#ifdef DIAGNOSTIC
2925			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
2926				panic("softdep_write_inodeblock: lost dep1");
2927#endif /* DIAGNOSTIC */
2928			dp->di_db[i] = 0;
2929		}
2930		for (i = 0; i < NIADDR; i++) {
2931#ifdef DIAGNOSTIC
2932			if (dp->di_ib[i] != 0 &&
2933			    (deplist & ((1 << NDADDR) << i)) == 0)
2934				panic("softdep_write_inodeblock: lost dep2");
2935#endif /* DIAGNOSTIC */
2936			dp->di_ib[i] = 0;
2937		}
2938		FREE_LOCK(&lk);
2939		return;
2940	}
2941	/*
2942	 * If we have zero'ed out the last allocated block of the file,
2943	 * roll back the size to the last currently allocated block.
2944	 * We know that this last allocated block is full-sized, as
2945	 * we already checked for fragments in the loop above.
2946	 */
2947	if (lastadp != NULL &&
2948	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
2949		for (i = lastadp->ad_lbn; i >= 0; i--)
2950			if (dp->di_db[i] != 0)
2951				break;
2952		dp->di_size = (i + 1) * fs->fs_bsize;
2953	}
2954	/*
2955	 * The only dependencies are for indirect blocks.
2956	 *
2957	 * The file size for indirect block additions is not guaranteed.
2958	 * Such a guarantee would be non-trivial to achieve. The conventional
2959	 * synchronous write implementation also does not make this guarantee.
2960	 * Fsck should catch and fix discrepancies. Arguably, the file size
2961	 * can be over-estimated without destroying integrity when the file
2962	 * moves into the indirect blocks (i.e., is large). If we want to
2963	 * postpone fsck, we are stuck with this argument.
2964	 */
2965	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
2966		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
2967	FREE_LOCK(&lk);
2968}
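
/*
 * Worked example of the size rollback above: with 8K blocks, if the
 * first fragment-sized rollback in id_inoupdt is at lbn 3 with
 * ad_oldsize == 2048, the inode is written with di_size ==
 * 3 * 8192 + 2048 == 26624 and with all later direct and indirect
 * pointers zeroed, so the on-disk inode never claims a fragment that
 * is not its last block. handle_written_inodeblock() below restores
 * di_size from id_savedsize once the write completes.
 */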
2969
2970/*
2971 * This routine is called during the completion interrupt
2972 * service routine for a disk write (from the procedure called
2973 * by the device driver to inform the file system caches of
2974 * a request completion).  It should be called early in this
2975 * procedure, before the block is made available to other
2976 * processes or other routines are called.
2977 */
2978static void
2979softdep_disk_write_complete(bp)
2980	struct buf *bp;		/* describes the completed disk write */
2981{
2982	struct worklist *wk;
2983	struct workhead reattach;
2984	struct newblk *newblk;
2985	struct allocindir *aip;
2986	struct allocdirect *adp;
2987	struct indirdep *indirdep;
2988	struct inodedep *inodedep;
2989	struct bmsafemap *bmsafemap;
2990
2991#ifdef DEBUG
2992	if (lk.lkt_held != -1)
2993		panic("softdep_disk_write_complete: lock is held");
2994	lk.lkt_held = -2;
2995#endif
2996	LIST_INIT(&reattach);
2997	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2998		WORKLIST_REMOVE(wk);
2999		switch (wk->wk_type) {
3000
3001		case D_PAGEDEP:
3002			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3003				WORKLIST_INSERT(&reattach, wk);
3004			continue;
3005
3006		case D_INODEDEP:
3007			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3008				WORKLIST_INSERT(&reattach, wk);
3009			continue;
3010
3011		case D_BMSAFEMAP:
3012			bmsafemap = WK_BMSAFEMAP(wk);
3013			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3014				newblk->nb_state |= DEPCOMPLETE;
3015				newblk->nb_bmsafemap = NULL;
3016				LIST_REMOVE(newblk, nb_deps);
3017			}
3018			while ((adp =
3019			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3020				adp->ad_state |= DEPCOMPLETE;
3021				adp->ad_buf = NULL;
3022				LIST_REMOVE(adp, ad_deps);
3023				handle_allocdirect_partdone(adp);
3024			}
3025			while ((aip =
3026			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3027				aip->ai_state |= DEPCOMPLETE;
3028				aip->ai_buf = NULL;
3029				LIST_REMOVE(aip, ai_deps);
3030				handle_allocindir_partdone(aip);
3031			}
3032			while ((inodedep =
3033			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3034				inodedep->id_state |= DEPCOMPLETE;
3035				LIST_REMOVE(inodedep, id_deps);
3036				inodedep->id_buf = NULL;
3037			}
3038			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3039			continue;
3040
3041		case D_MKDIR:
3042			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3043			continue;
3044
3045		case D_ALLOCDIRECT:
3046			adp = WK_ALLOCDIRECT(wk);
3047			adp->ad_state |= COMPLETE;
3048			handle_allocdirect_partdone(adp);
3049			continue;
3050
3051		case D_ALLOCINDIR:
3052			aip = WK_ALLOCINDIR(wk);
3053			aip->ai_state |= COMPLETE;
3054			handle_allocindir_partdone(aip);
3055			continue;
3056
3057		case D_INDIRDEP:
3058			indirdep = WK_INDIRDEP(wk);
3059			if (indirdep->ir_state & GOINGAWAY)
3060				panic("disk_write_complete: indirdep gone");
3061			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3062			FREE(indirdep->ir_saveddata, M_INDIRDEP);
3063			indirdep->ir_saveddata = 0;
3064			indirdep->ir_state &= ~UNDONE;
3065			indirdep->ir_state |= ATTACHED;
3066			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3067				handle_allocindir_partdone(aip);
3068				if (aip == LIST_FIRST(&indirdep->ir_donehd))
3069					panic("disk_write_complete: not gone");
3070			}
3071			WORKLIST_INSERT(&reattach, wk);
3072			if ((bp->b_flags & B_DELWRI) == 0)
3073				stat_indir_blk_ptrs++;
3074			bdirty(bp);
3075			continue;
3076
3077		default:
3078			panic("handle_disk_write_complete: Unknown type %s",
3079			    TYPENAME(wk->wk_type));
3080			/* NOTREACHED */
3081		}
3082	}
3083	/*
3084	 * Reattach any requests that must be redone.
3085	 */
3086	while ((wk = LIST_FIRST(&reattach)) != NULL) {
3087		WORKLIST_REMOVE(wk);
3088		WORKLIST_INSERT(&bp->b_dep, wk);
3089	}
3090#ifdef DEBUG
3091	if (lk.lkt_held != -2)
3092		panic("softdep_disk_write_complete: lock lost");
3093	lk.lkt_held = -1;
3094#endif
3095}
3096
3097/*
3098 * Called from within softdep_disk_write_complete above. Note that
3099 * this routine is always called from interrupt level with further
3100 * splbio interrupts blocked.
3101 */
3102static void
3103handle_allocdirect_partdone(adp)
3104	struct allocdirect *adp;	/* the completed allocdirect */
3105{
3106	struct allocdirect *listadp;
3107	struct inodedep *inodedep;
3108	long bsize;
3109
3110	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3111		return;
3112	if (adp->ad_buf != NULL)
3113		panic("handle_allocdirect_partdone: dangling dep");
3114	/*
3115	 * The on-disk inode cannot claim to be any larger than the last
3116	 * fragment that has been written. Otherwise, the on-disk inode
3117	 * might have fragments that were not the last block in the file,
3118	 * which would corrupt the filesystem. Thus, we cannot free any
3119	 * allocdirects after one whose ad_oldblkno claims a fragment as
3120	 * these blocks must be rolled back to zero before writing the inode.
3121	 * We check the currently active set of allocdirects in id_inoupdt.
3122	 */
3123	inodedep = adp->ad_inodedep;
3124	bsize = inodedep->id_fs->fs_bsize;
3125	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp;
3126	     listadp = TAILQ_NEXT(listadp, ad_next)) {
3127		/* found our block */
3128		if (listadp == adp)
3129			break;
3130		/* continue if ad_oldsize is not a fragment */
3131		if (listadp->ad_oldsize == 0 ||
3132		    listadp->ad_oldsize == bsize)
3133			continue;
3134		/* hit a fragment */
3135		return;
3136	}
3137	/*
3138	 * If we have reached the end of the current list without
3139	 * finding the just finished dependency, then it must be
3140	 * on the future dependency list. Future dependencies cannot
3141	 * be freed until they are moved to the current list.
3142	 */
3143	if (listadp == NULL) {
3144#ifdef DEBUG
3145		for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp;
3146		     listadp = TAILQ_NEXT(listadp, ad_next))
3147			/* found our block */
3148			if (listadp == adp)
3149				break;
3150		if (listadp == NULL)
3151			panic("handle_allocdirect_partdone: lost dep");
3152#endif /* DEBUG */
3153		return;
3154	}
3155	/*
3156	 * If we have found the just finished dependency, then free
3157	 * it along with anything that follows it that is complete.
3158	 */
3159	for (; adp; adp = listadp) {
3160		listadp = TAILQ_NEXT(adp, ad_next);
3161		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3162			return;
3163		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3164	}
3165}
3166
3167/*
3168 * Called from within softdep_disk_write_complete above. Note that
3169 * this routine is always called from interrupt level with further
3170 * splbio interrupts blocked.
3171 */
3172static void
3173handle_allocindir_partdone(aip)
3174	struct allocindir *aip;		/* the completed allocindir */
3175{
3176	struct indirdep *indirdep;
3177
3178	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3179		return;
3180	if (aip->ai_buf != NULL)
3181		panic("handle_allocindir_partdone: dangling dependency");
3182	indirdep = aip->ai_indirdep;
3183	if (indirdep->ir_state & UNDONE) {
3184		LIST_REMOVE(aip, ai_next);
3185		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3186		return;
3187	}
3188	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3189	    aip->ai_newblkno;
3190	LIST_REMOVE(aip, ai_next);
3191	if (aip->ai_freefrag != NULL)
3192		add_to_worklist(&aip->ai_freefrag->ff_list);
3193	WORKITEM_FREE(aip, D_ALLOCINDIR);
3194}
3195
3196/*
3197 * Called from within softdep_disk_write_complete above to restore
3198 * in-memory inode block contents to their most up-to-date state. Note
3199 * that this routine is always called from interrupt level with further
3200 * splbio interrupts blocked.
3201 */
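/*
 * In outline, the steps below are: restore the saved copy of the
 * on-disk inode if the write had been rolled back for an incomplete
 * bitmap; roll the direct and indirect block pointers forward to
 * their new values; reset the file size; and move the deallocation
 * work items that were waiting on this write to the worklist.
 */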
3202static int
3203handle_written_inodeblock(inodedep, bp)
3204	struct inodedep *inodedep;
3205	struct buf *bp;		/* buffer containing the inode block */
3206{
3207	struct worklist *wk, *filefree;
3208	struct allocdirect *adp, *nextadp;
3209	struct dinode *dp;
3210	int hadchanges;
3211
3212	if ((inodedep->id_state & IOSTARTED) == 0)
3213		panic("handle_written_inodeblock: not started");
3214	inodedep->id_state &= ~IOSTARTED;
3215	inodedep->id_state |= COMPLETE;
3216	dp = (struct dinode *)bp->b_data +
3217	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3218	/*
3219	 * If we had to roll back the inode allocation because of
3220	 * bitmaps being incomplete, then simply restore it.
3221	 * Keep the block dirty so that it will not be reclaimed until
3222	 * all associated dependencies have been cleared and the
3223	 * corresponding updates written to disk.
3224	 */
3225	if (inodedep->id_savedino != NULL) {
3226		*dp = *inodedep->id_savedino;
3227		FREE(inodedep->id_savedino, M_INODEDEP);
3228		inodedep->id_savedino = NULL;
3229		if ((bp->b_flags & B_DELWRI) == 0)
3230			stat_inode_bitmap++;
3231		bdirty(bp);
3232		return (1);
3233	}
3234	/*
3235	 * Roll forward anything that had to be rolled back before
3236	 * the inode could be updated.
3237	 */
3238	hadchanges = 0;
3239	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3240		nextadp = TAILQ_NEXT(adp, ad_next);
3241		if (adp->ad_state & ATTACHED)
3242			panic("handle_written_inodeblock: new entry");
3243		if (adp->ad_lbn < NDADDR) {
3244			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
3245				panic("%s: %s #%ld mismatch %d != %d",
3246				    "handle_written_inodeblock",
3247				    "direct pointer", adp->ad_lbn,
3248				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3249			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3250		} else {
3251			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
3252				panic("%s: %s #%ld allocated as %d",
3253				    "handle_written_inodeblock",
3254				    "indirect pointer", adp->ad_lbn - NDADDR,
3255				    dp->di_ib[adp->ad_lbn - NDADDR]);
3256			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3257		}
3258		adp->ad_state &= ~UNDONE;
3259		adp->ad_state |= ATTACHED;
3260		hadchanges = 1;
3261	}
3262	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3263		stat_direct_blk_ptrs++;
3264	/*
3265	 * Reset the file size to its most up-to-date value.
3266	 */
3267	if (inodedep->id_savedsize == -1)
3268		panic("handle_written_inodeblock: bad size");
3269	if (dp->di_size != inodedep->id_savedsize) {
3270		dp->di_size = inodedep->id_savedsize;
3271		hadchanges = 1;
3272	}
3273	inodedep->id_savedsize = -1;
3274	/*
3275	 * If there were any rollbacks in the inode block, then it must be
3276	 * marked dirty so that it will eventually get written back in
3277	 * its correct form.
3278	 */
3279	if (hadchanges)
3280		bdirty(bp);
3281	/*
3282	 * Process any allocdirects that completed during the update.
3283	 */
3284	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3285		handle_allocdirect_partdone(adp);
3286	/*
3287	 * Process deallocations that were held pending until the
3288	 * inode had been written to disk. Freeing of the inode
3289	 * is delayed until after all blocks have been freed to
3290	 * avoid creation of new <vfsid, inum, lbn> triples
3291	 * before the old ones have been deleted.
3292	 */
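	/*
	 * The hazard being avoided, roughly: if a freed block or inode
	 * were reallocated to a new file before the inode write that
	 * stops claiming it had reached the disk, a crash could leave
	 * two files claiming the same resource. Holding the freeing
	 * work items here until this write completes preserves the
	 * required ordering.
	 */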
3293	filefree = NULL;
3294	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3295		WORKLIST_REMOVE(wk);
3296		switch (wk->wk_type) {
3297
3298		case D_FREEFILE:
3299			/*
3300			 * We defer adding filefree to the worklist until
3301			 * all other additions have been made to ensure
3302			 * that it will be done after all the old blocks
3303			 * have been freed.
3304			 */
3305			if (filefree != NULL)
3306				panic("handle_written_inodeblock: filefree");
3307			filefree = wk;
3308			continue;
3309
3310		case D_MKDIR:
3311			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3312			continue;
3313
3314		case D_DIRADD:
3315			diradd_inode_written(WK_DIRADD(wk), inodedep);
3316			continue;
3317
3318		case D_FREEBLKS:
3319		case D_FREEFRAG:
3320		case D_DIRREM:
3321			add_to_worklist(wk);
3322			continue;
3323
3324		default:
3325			panic("handle_written_inodeblock: Unknown type %s",
3326			    TYPENAME(wk->wk_type));
3327			/* NOTREACHED */
3328		}
3329	}
3330	if (filefree != NULL) {
3331		if (free_inodedep(inodedep) == 0)
3332			panic("handle_written_inodeblock: live inodedep");
3333		add_to_worklist(filefree);
3334		return (0);
3335	}
3336
3337	/*
3338	 * If no outstanding dependencies, free it.
3339	 */
3340	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3341		return (0);
3342	return (hadchanges);
3343}
3344
3345/*
3346 * Process a diradd entry after its dependent inode has been written.
3347 * This routine must be called with splbio interrupts blocked.
3348 */
3349static void
3350diradd_inode_written(dap, inodedep)
3351	struct diradd *dap;
3352	struct inodedep *inodedep;
3353{
3354	struct pagedep *pagedep;
3355
3356	dap->da_state |= COMPLETE;
3357	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3358		if (dap->da_state & DIRCHG)
3359			pagedep = dap->da_previous->dm_pagedep;
3360		else
3361			pagedep = dap->da_pagedep;
3362		LIST_REMOVE(dap, da_pdlist);
3363		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3364	}
3365	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3366}
3367
3368/*
3369 * Handle the completion of a mkdir dependency.
3370 */
3371static void
3372handle_written_mkdir(mkdir, type)
3373	struct mkdir *mkdir;
3374	int type;
3375{
3376	struct diradd *dap;
3377	struct pagedep *pagedep;
3378
3379	if (mkdir->md_state != type)
3380		panic("handle_written_mkdir: bad type");
3381	dap = mkdir->md_diradd;
3382	dap->da_state &= ~type;
3383	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3384		dap->da_state |= DEPCOMPLETE;
3385	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3386		if (dap->da_state & DIRCHG)
3387			pagedep = dap->da_previous->dm_pagedep;
3388		else
3389			pagedep = dap->da_pagedep;
3390		LIST_REMOVE(dap, da_pdlist);
3391		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3392	}
3393	LIST_REMOVE(mkdir, md_mkdirs);
3394	WORKITEM_FREE(mkdir, D_MKDIR);
3395}
3396
3397/*
3398 * Called from within softdep_disk_write_complete above.
3399 * A write operation was just completed. Removed inodes can
3400 * now be freed and associated block pointers may be committed.
3401 * Note that this routine is always called from interrupt level
3402 * with further splbio interrupts blocked.
3403 */
3404static int
3405handle_written_filepage(pagedep, bp)
3406	struct pagedep *pagedep;
3407	struct buf *bp;		/* buffer containing the written page */
3408{
3409	struct dirrem *dirrem;
3410	struct diradd *dap, *nextdap;
3411	struct direct *ep;
3412	int i, chgs;
3413
3414	if ((pagedep->pd_state & IOSTARTED) == 0)
3415		panic("handle_written_filepage: not started");
3416	pagedep->pd_state &= ~IOSTARTED;
3417	/*
3418	 * Process any directory removals that have been committed.
3419	 */
3420	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3421		LIST_REMOVE(dirrem, dm_next);
3422		dirrem->dm_dirinum = pagedep->pd_ino;
3423		add_to_worklist(&dirrem->dm_list);
3424	}
3425	/*
3426	 * Free any directory additions that have been committed.
3427	 */
3428	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3429		free_diradd(dap);
3430	/*
3431	 * Uncommitted directory entries must be restored.
3432	 */
3433	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3434		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3435		     dap = nextdap) {
3436			nextdap = LIST_NEXT(dap, da_pdlist);
3437			if (dap->da_state & ATTACHED)
3438				panic("handle_written_filepage: attached");
3439			ep = (struct direct *)
3440			    ((char *)bp->b_data + dap->da_offset);
3441			ep->d_ino = dap->da_newinum;
3442			dap->da_state &= ~UNDONE;
3443			dap->da_state |= ATTACHED;
3444			chgs = 1;
3445			/*
3446			 * If the inode referenced by the directory has
3447			 * been written out, then the dependency can be
3448			 * moved to the pending list.
3449			 */
3450			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3451				LIST_REMOVE(dap, da_pdlist);
3452				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3453				    da_pdlist);
3454			}
3455		}
3456	}
3457	/*
3458	 * If there were any rollbacks in the directory, then it must be
3459	 * marked dirty so that it will eventually get written back in
3460	 * its correct form.
3461	 */
3462	if (chgs) {
3463		if ((bp->b_flags & B_DELWRI) == 0)
3464			stat_dir_entry++;
3465		bdirty(bp);
3466	}
3467	/*
3468	 * If no dependencies remain, the pagedep will be freed.
3469	 * Otherwise it will remain to update the page before it
3470	 * is written back to disk.
3471	 */
3472	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3473		for (i = 0; i < DAHASHSZ; i++)
3474			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3475				break;
3476		if (i == DAHASHSZ) {
3477			LIST_REMOVE(pagedep, pd_hash);
3478			WORKITEM_FREE(pagedep, D_PAGEDEP);
3479			return (0);
3480		}
3481	}
3482	return (1);
3483}
3484
3485/*
3486 * Writing back in-core inode structures.
3487 *
3488 * The file system only accesses an inode's contents when it occupies an
3489 * "in-core" inode structure.  These "in-core" structures are separate from
3490 * the page frames used to cache inode blocks.  Only the latter are
3491 * transferred to/from the disk.  So, when the updated contents of the
3492 * "in-core" inode structure are copied to the corresponding in-memory inode
3493 * block, the dependencies are also transferred.  The following procedure is
3494 * called when copying a dirty "in-core" inode to a cached inode block.
3495 */
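/*
 * Roughly, a caller's sequence (a sketch of ffs_update(), not an
 * exact trace) is:
 *
 *	bp = bread(...the buffer holding the inode block...);
 *	softdep_update_inodeblock(ip, bp, waitfor);
 *	*dp = ip->i_din;		(copy the in-core inode into bp)
 *	bwrite(bp) or bdwrite(bp);
 *
 * so the dependency bookkeeping runs with the buffer locked, just
 * before the in-core contents are copied into it.
 */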
3496
3497/*
3498 * Called when an inode is loaded from disk. If the effective link count
3499 * differed from the actual link count when it was last flushed, then we
3500 * need to ensure that the correct effective link count is put back.
3501 */
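/*
 * For instance, with illustrative numbers: if a file with two names
 * had one of them removed, but the directory block recording the
 * removal has not yet been safely written, the on-disk inode still
 * shows a link count of 2 while id_nlinkdelta records the one
 * uncommitted removal; the code below restores i_effnlink to
 * 2 - 1 = 1.
 */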
3502void
3503softdep_load_inodeblock(ip)
3504	struct inode *ip;	/* the "in_core" copy of the inode */
3505{
3506	struct inodedep *inodedep;
3507
3508	/*
3509	 * Check for alternate nlink count.
3510	 */
3511	ip->i_effnlink = ip->i_nlink;
3512	ACQUIRE_LOCK(&lk);
3513	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3514		FREE_LOCK(&lk);
3515		return;
3516	}
3517	ip->i_effnlink -= inodedep->id_nlinkdelta;
3518	FREE_LOCK(&lk);
3519}
3520
3521/*
3522 * This routine is called just before the "in-core" inode
3523 * information is to be copied to the in-memory inode block.
3524 * Recall that an inode block contains several inodes. If
3525 * the force flag is set, then the dependencies will be
3526 * cleared so that the update can always be made. Note that
3527 * the buffer is locked when this routine is called, so we
3528 * will never be in the middle of writing the inode block
3529 * to disk.
3530 */
3531void
3532softdep_update_inodeblock(ip, bp, waitfor)
3533	struct inode *ip;	/* the "in_core" copy of the inode */
3534	struct buf *bp;		/* the buffer containing the inode block */
3535	int waitfor;		/* nonzero => update must be allowed */
3536{
3537	struct inodedep *inodedep;
3538	struct worklist *wk;
3539	int error, gotit;
3540
3541	/*
3542	 * If the effective link count is not equal to the actual link
3543	 * count, then we must track the difference in an inodedep while
3544	 * the inode is (potentially) tossed out of the cache. Otherwise,
3545	 * if there is no existing inodedep, then there are no dependencies
3546	 * to track.
3547	 */
3548	ACQUIRE_LOCK(&lk);
3549	if (ip->i_effnlink != ip->i_nlink) {
3550		(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC,
3551		    &inodedep);
3552	} else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3553		FREE_LOCK(&lk);
3554		return;
3555	}
3556	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
3557		panic("softdep_update_inodeblock: bad delta");
3558	/*
3559	 * Changes have been initiated. Anything depending on these
3560	 * changes cannot occur until this inode has been written.
3561	 */
3562	inodedep->id_state &= ~COMPLETE;
3563	if ((inodedep->id_state & ONWORKLIST) == 0)
3564		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3565	/*
3566	 * Any new dependencies associated with the incore inode must
3567	 * now be moved to the list associated with the buffer holding
3568	 * the in-memory copy of the inode. Once merged, process any
3569	 * allocdirects that are completed by the merger.
3570	 */
3571	merge_inode_lists(inodedep);
3572	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3573		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3574	/*
3575	 * Now that the inode has been pushed into the buffer, the
3576	 * operations dependent on the inode being written to disk
3577	 * can be moved to the id_bufwait so that they will be
3578	 * processed when the buffer I/O completes.
3579	 */
3580	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3581		WORKLIST_REMOVE(wk);
3582		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3583	}
3584	/*
3585	 * Newly allocated inodes cannot be written until the bitmap
3586	 * that allocates them has been written (indicated by
3587	 * DEPCOMPLETE being set in id_state). If we are doing a
3588	 * forced sync (e.g., an fsync on a file), we force the bitmap
3589	 * to be written so that the update can be done.
3590	 */
3591	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3592		FREE_LOCK(&lk);
3593		return;
3594	}
3595	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3596	FREE_LOCK(&lk);
3597	if (gotit &&
3598	    (error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0)
3599		softdep_error("softdep_update_inodeblock: bwrite", error);
3600	if ((inodedep->id_state & DEPCOMPLETE) == 0)
3601		panic("softdep_update_inodeblock: update failed");
3602}
3603
3604/*
3605 * Merge the new inode dependency list (id_newinoupdt) into the old
3606 * inode dependency list (id_inoupdt). This routine must be called
3607 * with splbio interrupts blocked.
3608 */
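/*
 * Both lists are kept sorted by logical block number, so this is the
 * standard merge of two sorted lists. For example, with hypothetical
 * lbn values: merging a new list {1, 3} into an old list {0, 3, 7}
 * yields {0, 1, 3, 7}, the two lbn 3 entries having been combined by
 * allocdirect_merge().
 */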
3609static void
3610merge_inode_lists(inodedep)
3611	struct inodedep *inodedep;
3612{
3613	struct allocdirect *listadp, *newadp;
3614
3615	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3616	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3617		if (listadp->ad_lbn < newadp->ad_lbn) {
3618			listadp = TAILQ_NEXT(listadp, ad_next);
3619			continue;
3620		}
3621		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3622		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3623		if (listadp->ad_lbn == newadp->ad_lbn) {
3624			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3625			    listadp);
3626			listadp = newadp;
3627		}
3628		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3629	}
3630	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3631		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3632		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3633	}
3634}
3635
3636/*
3637 * If we are doing an fsync, then we must ensure that any directory
3638 * entries for the inode have been written after the inode gets to disk.
3639 */
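/*
 * Roughly: an fsync of a newly created file or directory must also
 * push the directory page in the parent that names it. If the entry
 * carries a MKDIR_PARENT dependency (the parent's link count grew
 * for ".."), the parent inode is written first via UFS_UPDATE()
 * before the page containing the name is flushed.
 */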
3640static int
3641softdep_fsync(vp)
3642	struct vnode *vp;	/* the vnode being fsync'ed */
3643{
3644	struct inodedep *inodedep;
3645	struct pagedep *pagedep;
3646	struct worklist *wk;
3647	struct diradd *dap;
3648	struct mount *mnt;
3649	struct vnode *pvp;
3650	struct inode *ip;
3651	struct buf *bp;
3652	struct fs *fs;
3653	struct proc *p = CURPROC;		/* XXX */
3654	int error, flushparent;
3655	ino_t parentino;
3656	ufs_lbn_t lbn;
3657
3658	ip = VTOI(vp);
3659	fs = ip->i_fs;
3660	ACQUIRE_LOCK(&lk);
3661	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3662		FREE_LOCK(&lk);
3663		return (0);
3664	}
3665	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3666	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3667	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3668	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
3669		panic("softdep_fsync: pending ops");
3670	for (error = 0, flushparent = 0; ; ) {
3671		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3672			break;
3673		if (wk->wk_type != D_DIRADD)
3674			panic("softdep_fsync: Unexpected type %s",
3675			    TYPENAME(wk->wk_type));
3676		dap = WK_DIRADD(wk);
3677		/*
3678		 * Flush our parent if this directory entry
3679		 * has a MKDIR_PARENT dependency.
3680		 */
3681		if (dap->da_state & DIRCHG)
3682			pagedep = dap->da_previous->dm_pagedep;
3683		else
3684			pagedep = dap->da_pagedep;
3685		mnt = pagedep->pd_mnt;
3686		parentino = pagedep->pd_ino;
3687		lbn = pagedep->pd_lbn;
3688		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
3689			panic("softdep_fsync: dirty");
3690		flushparent = dap->da_state & MKDIR_PARENT;
3691		/*
3692		 * If we are being fsync'ed as part of vgone'ing this vnode,
3693		 * then we will not be able to release and recover the
3694		 * vnode below, so we just have to give up on writing its
3695		 * directory entry out. It will eventually be written, just
3696		 * not now, but then the user was not asking to have it
3697		 * written, so we are not breaking any promises.
3698		 */
3699		if (vp->v_flag & VXLOCK)
3700			break;
3701		/*
3702		 * We prevent deadlock by always fetching inodes from the
3703		 * root, moving down the directory tree. Thus, when fetching
3704		 * our parent directory, we must unlock ourselves before
3705		 * requesting the lock on our parent. See the comment in
3706		 * ufs_lookup for details on possible races.
3707		 */
3708		FREE_LOCK(&lk);
3709		VOP_UNLOCK(vp, 0, p);
3710		error = VFS_VGET(mnt, parentino, &pvp);
3711		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3712		if (error != 0)
3713			return (error);
3714		if (flushparent) {
3715			if ((error = UFS_UPDATE(pvp, 1)) != 0) {
3716				vput(pvp);
3717				return (error);
3718			}
3719		}
3720		/*
3721		 * Flush directory page containing the inode's name.
3722		 */
3723		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3724		    &bp);
3725		if (error == 0)
3726			error = VOP_BWRITE(bp->b_vp, bp);
3727		vput(pvp);
3728		if (error != 0)
3729			return (error);
3730		ACQUIRE_LOCK(&lk);
3731		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
3732			break;
3733	}
3734	FREE_LOCK(&lk);
3735	return (0);
3736}
3737
3738/*
3739 * Flush all the dirty bitmaps associated with the block device
3740 * before flushing the rest of the dirty blocks so as to reduce
3741 * the number of dependencies that will have to be rolled back.
3742 */
3743void
3744softdep_fsync_mountdev(vp)
3745	struct vnode *vp;
3746{
3747	struct buf *bp, *nbp;
3748	struct worklist *wk;
3749
3750	if (!vn_isdisk(vp, NULL))
3751		panic("softdep_fsync_mountdev: vnode not a disk");
3752	ACQUIRE_LOCK(&lk);
3753	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
3754		nbp = TAILQ_NEXT(bp, b_vnbufs);
3755		/*
3756		 * If it is already scheduled, skip to the next buffer.
3757		 */
3758		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
3759			continue;
3760		if ((bp->b_flags & B_DELWRI) == 0)
3761			panic("softdep_fsync_mountdev: not dirty");
3762		/*
3763		 * We are only interested in bitmaps with outstanding
3764		 * dependencies.
3765		 */
3766		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
3767		    wk->wk_type != D_BMSAFEMAP) {
3768			BUF_UNLOCK(bp);
3769			continue;
3770		}
3771		bremfree(bp);
3772		FREE_LOCK(&lk);
3773		(void) bawrite(bp);
3774		ACQUIRE_LOCK(&lk);
3775		/*
3776		 * Since we may have slept during the I/O, we need
3777		 * to start from a known point.
3778		 */
3779		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3780	}
3781	drain_output(vp, 1);
3782	FREE_LOCK(&lk);
3783}
3784
3785/*
3786 * This routine is called when we are trying to synchronously flush a
3787 * file. This routine must eliminate any filesystem metadata dependencies
3788 * so that the syncing routine can succeed by pushing the dirty blocks
3789 * associated with the file. If any I/O errors occur, they are returned.
3790 */
3791int
3792softdep_sync_metadata(ap)
3793	struct vop_fsync_args /* {
3794		struct vnode *a_vp;
3795		struct ucred *a_cred;
3796		int a_waitfor;
3797		struct proc *a_p;
3798	} */ *ap;
3799{
3800	struct vnode *vp = ap->a_vp;
3801	struct pagedep *pagedep;
3802	struct allocdirect *adp;
3803	struct allocindir *aip;
3804	struct buf *bp, *nbp;
3805	struct worklist *wk;
3806	int i, error, waitfor;
3807
3808	/*
3809	 * Check whether this vnode is involved in a filesystem
3810	 * that is doing soft dependency processing.
3811	 */
3812	if (!vn_isdisk(vp, NULL)) {
3813		if (!DOINGSOFTDEP(vp))
3814			return (0);
3815	} else
3816		if (vp->v_specmountpoint == NULL ||
3817		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
3818			return (0);
3819	/*
3820	 * Ensure that any direct block dependencies have been cleared.
3821	 */
3822	ACQUIRE_LOCK(&lk);
3823	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
3824		FREE_LOCK(&lk);
3825		return (error);
3826	}
3827	/*
3828	 * For most files, the only metadata dependencies are the
3829	 * cylinder group maps that allocate their inode or blocks.
3830	 * The block allocation dependencies can be found by traversing
3831	 * the dependency lists for any buffers that remain on their
3832	 * dirty buffer list. The inode allocation dependency will
3833	 * be resolved when the inode is updated with MNT_WAIT.
3834	 * This work is done in two passes. The first pass grabs most
3835	 * of the buffers and begins asynchronously writing them. The
3836	 * only way to wait for these asynchronous writes is to sleep
3837	 * on the filesystem vnode which may stay busy for a long time
3838	 * if the filesystem is active. So, instead, we make a second
3839	 * pass over the dependencies blocking on each write. In the
3840	 * usual case we will be blocking against a write that we
3841	 * initiated, so when it is done the dependency will have been
3842	 * resolved. Thus the second pass is expected to end quickly.
3843	 */
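	/*
	 * In outline (an illustrative timeline, not a trace):
	 *
	 *	pass 1 (MNT_NOWAIT): bawrite() each dependency buffer,
	 *		starting the writes without waiting for them;
	 *	drain_output(): wait for the writes now in progress;
	 *	pass 2 (MNT_WAIT): VOP_BWRITE() anything still dirty,
	 *		blocking until each write completes.
	 */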
3844	waitfor = MNT_NOWAIT;
3845top:
3846	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
3847		FREE_LOCK(&lk);
3848		return (0);
3849	}
3850	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3851loop:
3852	/*
3853	 * As we hold the buffer locked, none of its dependencies
3854	 * will disappear.
3855	 */
3856	for (wk = LIST_FIRST(&bp->b_dep); wk;
3857	     wk = LIST_NEXT(wk, wk_list)) {
3858		switch (wk->wk_type) {
3859
3860		case D_ALLOCDIRECT:
3861			adp = WK_ALLOCDIRECT(wk);
3862			if (adp->ad_state & DEPCOMPLETE)
3863				break;
3864			nbp = adp->ad_buf;
3865			if (getdirtybuf(&nbp, waitfor) == 0)
3866				break;
3867			FREE_LOCK(&lk);
3868			if (waitfor == MNT_NOWAIT) {
3869				bawrite(nbp);
3870			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3871				bawrite(bp);
3872				return (error);
3873			}
3874			ACQUIRE_LOCK(&lk);
3875			break;
3876
3877		case D_ALLOCINDIR:
3878			aip = WK_ALLOCINDIR(wk);
3879			if (aip->ai_state & DEPCOMPLETE)
3880				break;
3881			nbp = aip->ai_buf;
3882			if (getdirtybuf(&nbp, waitfor) == 0)
3883				break;
3884			FREE_LOCK(&lk);
3885			if (waitfor == MNT_NOWAIT) {
3886				bawrite(nbp);
3887			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3888				bawrite(bp);
3889				return (error);
3890			}
3891			ACQUIRE_LOCK(&lk);
3892			break;
3893
3894		case D_INDIRDEP:
3895		restart:
3896			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
3897			     aip; aip = LIST_NEXT(aip, ai_next)) {
3898				if (aip->ai_state & DEPCOMPLETE)
3899					continue;
3900				nbp = aip->ai_buf;
3901				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
3902					goto restart;
3903				FREE_LOCK(&lk);
3904				if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3905					bawrite(bp);
3906					return (error);
3907				}
3908				ACQUIRE_LOCK(&lk);
3909				goto restart;
3910			}
3911			break;
3912
3913		case D_INODEDEP:
3914			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
3915			    WK_INODEDEP(wk)->id_ino)) != 0) {
3916				FREE_LOCK(&lk);
3917				bawrite(bp);
3918				return (error);
3919			}
3920			break;
3921
3922		case D_PAGEDEP:
3923			/*
3924			 * We are trying to sync a directory that may
3925			 * have dependencies on both its own metadata
3926			 * and/or dependencies on the inodes of any
3927			 * recently allocated files. We walk its diradd
3928			 * lists pushing out the associated inode.
3929			 */
3930			pagedep = WK_PAGEDEP(wk);
3931			for (i = 0; i < DAHASHSZ; i++) {
3932				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
3933					continue;
3934				if ((error =
3935				    flush_pagedep_deps(vp, pagedep->pd_mnt,
3936						&pagedep->pd_diraddhd[i]))) {
3937					FREE_LOCK(&lk);
3938					bawrite(bp);
3939					return (error);
3940				}
3941			}
3942			break;
3943
3944		case D_MKDIR:
3945			/*
3946			 * This case should never happen if the vnode has
3947			 * been properly sync'ed. However, if this function
3948			 * is used at a place where the vnode has not yet
3949			 * been sync'ed, this dependency can show up. So,
3950			 * rather than panic, just flush it.
3951			 */
3952			nbp = WK_MKDIR(wk)->md_buf;
3953			if (getdirtybuf(&nbp, waitfor) == 0)
3954				break;
3955			FREE_LOCK(&lk);
3956			if (waitfor == MNT_NOWAIT) {
3957				bawrite(nbp);
3958			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3959				bawrite(bp);
3960				return (error);
3961			}
3962			ACQUIRE_LOCK(&lk);
3963			break;
3964
3965		case D_BMSAFEMAP:
3966			/*
3967			 * This case should never happen if the vnode has
3968			 * been properly sync'ed. However, if this function
3969			 * is used at a place where the vnode has not yet
3970			 * been sync'ed, this dependency can show up. So,
3971			 * rather than panic, just flush it.
3972			 */
3973			nbp = WK_BMSAFEMAP(wk)->sm_buf;
3974			if (getdirtybuf(&nbp, waitfor) == 0)
3975				break;
3976			FREE_LOCK(&lk);
3977			if (waitfor == MNT_NOWAIT) {
3978				bawrite(nbp);
3979			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3980				bawrite(bp);
3981				return (error);
3982			}
3983			ACQUIRE_LOCK(&lk);
3984			break;
3985
3986		default:
3987			panic("softdep_sync_metadata: Unknown type %s",
3988			    TYPENAME(wk->wk_type));
3989			/* NOTREACHED */
3990		}
3991	}
3992	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
3993	nbp = TAILQ_NEXT(bp, b_vnbufs);
3994	FREE_LOCK(&lk);
3995	bawrite(bp);
3996	ACQUIRE_LOCK(&lk);
3997	if (nbp != NULL) {
3998		bp = nbp;
3999		goto loop;
4000	}
4001	/*
4002	 * We must wait for any I/O in progress to finish so that
4003	 * all potential buffers on the dirty list will be visible.
4004	 * Once they are all there, proceed with the second pass
4005	 * which will wait for the I/O as per above.
4006	 */
4007	drain_output(vp, 1);
4008	/*
4009	 * The brief unlock is to allow any pent-up dependency
4010	 * processing to be done.
4011	 */
4012	if (waitfor == MNT_NOWAIT) {
4013		waitfor = MNT_WAIT;
4014		FREE_LOCK(&lk);
4015		ACQUIRE_LOCK(&lk);
4016		goto top;
4017	}
4018
4019	/*
4020	 * If we have managed to get rid of all the dirty buffers,
4021	 * then we are done. For certain directories and block
4022	 * devices, we may need to do further work.
4023	 */
4024	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4025		FREE_LOCK(&lk);
4026		return (0);
4027	}
4028
4029	FREE_LOCK(&lk);
4030	/*
4031	 * If we are trying to sync a block device, some of its buffers may
4032	 * contain metadata that cannot be written until the contents of some
4033	 * partially written files have been written to disk. The only easy
4034	 * way to accomplish this is to sync the entire filesystem (luckily
4035	 * this happens rarely).
4036	 */
4037	if (vn_isdisk(vp, NULL) &&
4038	    vp->v_specmountpoint && !VOP_ISLOCKED(vp, NULL) &&
4039	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
4040	     ap->a_p)) != 0)
4041		return (error);
4042	return (0);
4043}
4044
4045/*
4046 * Flush the dependencies associated with an inodedep.
4047 * Called with splbio blocked.
4048 */
4049static int
4050flush_inodedep_deps(fs, ino)
4051	struct fs *fs;
4052	ino_t ino;
4053{
4054	struct inodedep *inodedep;
4055	struct allocdirect *adp;
4056	int error, waitfor;
4057	struct buf *bp;
4058
4059	/*
4060	 * This work is done in two passes. The first pass grabs most
4061	 * of the buffers and begins asynchronously writing them. The
4062	 * only way to wait for these asynchronous writes is to sleep
4063	 * on the filesystem vnode which may stay busy for a long time
4064	 * if the filesystem is active. So, instead, we make a second
4065	 * pass over the dependencies blocking on each write. In the
4066	 * usual case we will be blocking against a write that we
4067	 * initiated, so when it is done the dependency will have been
4068	 * resolved. Thus the second pass is expected to end quickly.
4069	 * We give a brief window at the top of the loop to allow
4070	 * any pending I/O to complete.
4071	 */
4072	for (waitfor = MNT_NOWAIT; ; ) {
4073		FREE_LOCK(&lk);
4074		ACQUIRE_LOCK(&lk);
4075		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4076			return (0);
4077		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4078		     adp = TAILQ_NEXT(adp, ad_next)) {
4079			if (adp->ad_state & DEPCOMPLETE)
4080				continue;
4081			bp = adp->ad_buf;
4082			if (getdirtybuf(&bp, waitfor) == 0) {
4083				if (waitfor == MNT_NOWAIT)
4084					continue;
4085				break;
4086			}
4087			FREE_LOCK(&lk);
4088			if (waitfor == MNT_NOWAIT) {
4089				bawrite(bp);
4090			} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
4091				ACQUIRE_LOCK(&lk);
4092				return (error);
4093			}
4094			ACQUIRE_LOCK(&lk);
4095			break;
4096		}
4097		if (adp != NULL)
4098			continue;
4099		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
4100		     adp = TAILQ_NEXT(adp, ad_next)) {
4101			if (adp->ad_state & DEPCOMPLETE)
4102				continue;
4103			bp = adp->ad_buf;
4104			if (getdirtybuf(&bp, waitfor) == 0) {
4105				if (waitfor == MNT_NOWAIT)
4106					continue;
4107				break;
4108			}
4109			FREE_LOCK(&lk);
4110			if (waitfor == MNT_NOWAIT) {
4111				bawrite(bp);
4112			} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
4113				ACQUIRE_LOCK(&lk);
4114				return (error);
4115			}
4116			ACQUIRE_LOCK(&lk);
4117			break;
4118		}
4119		if (adp != NULL)
4120			continue;
4121		/*
4122		 * If this was pass 2, we are done; otherwise, start pass 2.
4123		 */
4124		if (waitfor == MNT_WAIT)
4125			break;
4126		waitfor = MNT_WAIT;
4127	}
4128	/*
4129	 * Try freeing inodedep in case all dependencies have been removed.
4130	 */
4131	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4132		(void) free_inodedep(inodedep);
4133	return (0);
4134}
4135
4136/*
4137 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4138 * Called with splbio blocked.
4139 */
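/*
 * Each diradd names a newly allocated inode. In outline, the loop
 * below writes the parent itself (for MKDIR_PARENT), then flushes
 * the file the entry points at: through its vnode if it is still in
 * the cache, otherwise by pushing its bitmap and inode block
 * directly; it panics if a dependency refuses to clear.
 */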
4140static int
4141flush_pagedep_deps(pvp, mp, diraddhdp)
4142	struct vnode *pvp;
4143	struct mount *mp;
4144	struct diraddhd *diraddhdp;
4145{
4146	struct proc *p = CURPROC;	/* XXX */
4147	struct inodedep *inodedep;
4148	struct ufsmount *ump;
4149	struct diradd *dap;
4150	struct vnode *vp;
4151	int gotit, error = 0;
4152	struct buf *bp;
4153	ino_t inum;
4154
4155	ump = VFSTOUFS(mp);
4156	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4157		/*
4158		 * Flush ourselves if this directory entry
4159		 * has a MKDIR_PARENT dependency.
4160		 */
4161		if (dap->da_state & MKDIR_PARENT) {
4162			FREE_LOCK(&lk);
4163			if ((error = UFS_UPDATE(pvp, 1)) != 0)
4164				break;
4165			ACQUIRE_LOCK(&lk);
4166			/*
4167			 * If that cleared dependencies, go on to next.
4168			 */
4169			if (dap != LIST_FIRST(diraddhdp))
4170				continue;
4171			if (dap->da_state & MKDIR_PARENT)
4172				panic("flush_pagedep_deps: MKDIR");
4173		}
4174		/*
4175		 * Flush the file on which the directory entry depends.
4176		 * If the inode has already been pushed out of the cache,
4177		 * then all the block dependencies will have been flushed
4178		 * leaving only inode dependencies (e.g., bitmaps). Thus,
4179		 * we do a ufs_ihashget to check for the vnode in the cache.
4180		 * If it is there, we do a full flush. If it is no longer
4181		 * there we need only dispose of any remaining bitmap
4182		 * dependencies and write the inode to disk.
4183		 */
4184		inum = dap->da_newinum;
4185		FREE_LOCK(&lk);
4186		if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) {
4187			ACQUIRE_LOCK(&lk);
4188			if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0
4189			    && dap == LIST_FIRST(diraddhdp))
4190				panic("flush_pagedep_deps: flush 1 failed");
4191			/*
4192			 * If the inode still has bitmap dependencies,
4193			 * push them to disk.
4194			 */
4195			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4196				gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT);
4197				FREE_LOCK(&lk);
4198				if (gotit &&
4199				    (error = VOP_BWRITE(inodedep->id_buf->b_vp,
4200				     inodedep->id_buf)) != 0)
4201					break;
4202				ACQUIRE_LOCK(&lk);
4203			}
4204			if (dap != LIST_FIRST(diraddhdp))
4205				continue;
4206			/*
4207			 * If the inode is still sitting in a buffer waiting
4208			 * to be written, push it to disk.
4209			 */
4210			FREE_LOCK(&lk);
4211			if ((error = bread(ump->um_devvp,
4212			    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4213			    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4214				break;
4215			if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0)
4216				break;
4217			ACQUIRE_LOCK(&lk);
4218			if (dap == LIST_FIRST(diraddhdp))
4219				panic("flush_pagedep_deps: flush 2 failed");
4220			continue;
4221		}
4222		if (vp->v_type == VDIR) {
4223			/*
4224			 * A newly allocated directory must have its "." and
4225			 * ".." entries written out before its name can be
4226			 * committed in its parent. We do not want or need
4227			 * the full semantics of a synchronous VOP_FSYNC as
4228			 * that may end up here again, once for each directory
4229			 * level in the filesystem. Instead, we push the blocks
4230			 * and wait for them to clear. We have to fsync twice
4231			 * because the first call may choose to defer blocks
4232			 * that still have dependencies, but deferral will
4233			 * happen at most once.
4234			 */
4235			if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
4236			    (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
4237				vput(vp);
4238				break;
4239			}
4240			drain_output(vp, 0);
4241		}
4242		error = UFS_UPDATE(vp, 1);
4243		vput(vp);
4244		if (error)
4245			break;
4246		/*
4247		 * If we have failed to get rid of all the dependencies
4248		 * then something is seriously wrong.
4249		 */
4250		if (dap == LIST_FIRST(diraddhdp))
4251			panic("flush_pagedep_deps: flush 3 failed");
4252		ACQUIRE_LOCK(&lk);
4253	}
4254	if (error)
4255		ACQUIRE_LOCK(&lk);
4256	return (error);
4257}
4258
4259/*
4260 * A large burst of file addition or deletion activity can drive the
4261 * memory load excessively high. Therefore we deliberately slow things
4262 * down and speed up the I/O processing if we find ourselves with too
4263 * many dependencies in progress.
4264 */
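/*
 * A hypothetical sketch of a call site; the real checks live in the
 * resource allocation paths earlier in this file and their exact
 * thresholds may differ:
 *
 *	if (num_inodedep > max_softdeps &&
 *	    request_cleanup(FLUSH_INODES, 1))
 *		goto retry;	(resources have been flushed, try again)
 */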
4265static int
4266request_cleanup(resource, islocked)
4267	int resource;
4268	int islocked;
4269{
4270	struct callout_handle handle;
4271	struct proc *p = CURPROC;
4272
4273	/*
4274	 * We never hold up the filesystem syncer process.
4275	 */
4276	if (p == filesys_syncer)
4277		return (0);
4278	/*
4279	 * If we are resource constrained on inode dependencies, try
4280	 * flushing some dirty inodes. Otherwise, we are constrained
4281	 * by file deletions, so try accelerating flushes of directories
4282	 * with removal dependencies. We would like to do the cleanup
4283	 * here, but we probably hold an inode locked at this point and
4284	 * that might deadlock against one that we try to clean. So,
4285	 * the best that we can do is request the syncer daemon to do
4286	 * the cleanup for us.
4287	 */
4288	switch (resource) {
4289
4290	case FLUSH_INODES:
4291		stat_ino_limit_push += 1;
4292		req_clear_inodedeps = 1;
4293		break;
4294
4295	case FLUSH_REMOVE:
4296		stat_blk_limit_push += 1;
4297		req_clear_remove = 1;
4298		break;
4299
4300	default:
4301		panic("request_cleanup: unknown type");
4302	}
4303	/*
4304	 * Hopefully the syncer daemon will catch up and awaken us.
4305	 * We wait at most tickdelay before proceeding in any case.
4306	 */
4307	if (islocked == 0)
4308		ACQUIRE_LOCK(&lk);
4309	if (proc_waiting == 0) {
4310		proc_waiting = 1;
4311		handle = timeout(pause_timer, NULL,
4312		    tickdelay > 2 ? tickdelay : 2);
4313	}
4314	FREE_LOCK_INTERLOCKED(&lk);
4315	(void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
4316	ACQUIRE_LOCK_INTERLOCKED(&lk);
4317	if (proc_waiting) {
4318		untimeout(pause_timer, NULL, handle);
4319		proc_waiting = 0;
4320	} else {
4321		switch (resource) {
4322
4323		case FLUSH_INODES:
4324			stat_ino_limit_hit += 1;
4325			break;
4326
4327		case FLUSH_REMOVE:
4328			stat_blk_limit_hit += 1;
4329			break;
4330		}
4331	}
4332	if (islocked == 0)
4333		FREE_LOCK(&lk);
4334	return (1);
4335}
4336
4337/*
4338 * Awaken processes pausing in request_cleanup and clear proc_waiting
4339 * to indicate that there is no longer a timer running.
4340 */
4341void
4342pause_timer(arg)
4343	void *arg;
4344{
4345
4346	proc_waiting = 0;
4347	wakeup(&proc_waiting);
4348}
4349
4350/*
4351 * Flush out a directory with at least one removal dependency in an effort to
4352 * reduce the number of dirrem, freefile, and freeblks dependency structures.
4353 */
4354static void
4355clear_remove(p)
4356	struct proc *p;
4357{
4358	struct pagedep_hashhead *pagedephd;
4359	struct pagedep *pagedep;
4360	static int next = 0;
4361	struct mount *mp;
4362	struct vnode *vp;
4363	int error, cnt;
4364	ino_t ino;
4365
4366	ACQUIRE_LOCK(&lk);
4367	for (cnt = 0; cnt < pagedep_hash; cnt++) {
4368		pagedephd = &pagedep_hashtbl[next++];
4369		if (next >= pagedep_hash)
4370			next = 0;
4371		for (pagedep = LIST_FIRST(pagedephd); pagedep;
4372		     pagedep = LIST_NEXT(pagedep, pd_hash)) {
4373			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4374				continue;
4375			mp = pagedep->pd_mnt;
4376			ino = pagedep->pd_ino;
4377			FREE_LOCK(&lk);
4378			if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4379				softdep_error("clear_remove: vget", error);
4380				return;
4381			}
4382			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4383				softdep_error("clear_remove: fsync", error);
4384			drain_output(vp, 0);
4385			vput(vp);
4386			return;
4387		}
4388	}
4389	FREE_LOCK(&lk);
4390}
4391
4392/*
4393 * Clear out a block of dirty inodes in an effort to reduce
4394 * the number of inodedep dependency structures.
4395 */
4396static void
4397clear_inodedeps(p)
4398	struct proc *p;
4399{
4400	struct inodedep_hashhead *inodedephd;
4401	struct inodedep *inodedep;
4402	static int next = 0;
4403	struct mount *mp;
4404	struct vnode *vp;
4405	struct fs *fs;
4406	int error, cnt;
4407	ino_t firstino, lastino, ino;
4408
4409	ACQUIRE_LOCK(&lk);
4410	/*
4411	 * Pick an inode dependency to be cleared, cycling round-robin
4412	 * through the hash chains. We will then gather up all the
4413	 * inodes in its block that have dependencies and flush them out.
4414	 */
4415	for (cnt = 0; cnt < inodedep_hash; cnt++) {
4416		inodedephd = &inodedep_hashtbl[next++];
4417		if (next >= inodedep_hash)
4418			next = 0;
4419		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4420			break;
4421	}
	if (inodedep == NULL) {
		/* Every hash chain was empty; nothing to flush. */
		FREE_LOCK(&lk);
		return;
	}
4422	/*
4423	 * Ugly code to find mount point given pointer to superblock.
4424	 */
4425	fs = inodedep->id_fs;
4426	TAILQ_FOREACH(mp, &mountlist, mnt_list)
4427		if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
4428			break;
4429	/*
4430	 * Find the last inode in the block with dependencies.
4431	 */
4432	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4433	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4434		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4435			break;
4436	/*
4437	 * Asynchronously push all but the last inode with dependencies.
4438	 * Synchronously push the last inode with dependencies to ensure
4439	 * that the inode block gets written to free up the inodedeps.
4440	 */
4441	for (ino = firstino; ino <= lastino; ino++) {
4442		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4443			continue;
4444		FREE_LOCK(&lk);
4445		if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4446			softdep_error("clear_inodedeps: vget", error);
4447			return;
4448		}
4449		if (ino == lastino) {
4450			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
4451				softdep_error("clear_inodedeps: fsync1", error);
4452		} else {
4453			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4454				softdep_error("clear_inodedeps: fsync2", error);
4455			drain_output(vp, 0);
4456		}
4457		vput(vp);
4458		ACQUIRE_LOCK(&lk);
4459	}
4460	FREE_LOCK(&lk);
4461}
4462
4463/*
4464 * Function to determine if the buffer has outstanding dependencies
4465 * that will cause a roll-back if the buffer is written. If wantcount
4466 * is set, return number of dependencies, otherwise just yes or no.
4467 */
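/*
 * This routine is hooked into the buffer cache (via the bioops
 * vector registered at initialization, presumably as the
 * count-dependencies entry) so that the buffer-flushing code can
 * prefer buffers whose writes will not be rolled back.
 */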
4468static int
4469softdep_count_dependencies(bp, wantcount)
4470	struct buf *bp;
4471	int wantcount;
4472{
4473	struct worklist *wk;
4474	struct inodedep *inodedep;
4475	struct indirdep *indirdep;
4476	struct allocindir *aip;
4477	struct pagedep *pagedep;
4478	struct diradd *dap;
4479	int i, retval;
4480
4481	retval = 0;
4482	ACQUIRE_LOCK(&lk);
4483	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) {
4484		switch (wk->wk_type) {
4485
4486		case D_INODEDEP:
4487			inodedep = WK_INODEDEP(wk);
4488			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4489				/* bitmap allocation dependency */
4490				retval += 1;
4491				if (!wantcount)
4492					goto out;
4493			}
4494			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4495				/* direct block pointer dependency */
4496				retval += 1;
4497				if (!wantcount)
4498					goto out;
4499			}
4500			continue;
4501
4502		case D_INDIRDEP:
4503			indirdep = WK_INDIRDEP(wk);
4504			for (aip = LIST_FIRST(&indirdep->ir_deplisthd);
4505			     aip; aip = LIST_NEXT(aip, ai_next)) {
4506				/* indirect block pointer dependency */
4507				retval += 1;
4508				if (!wantcount)
4509					goto out;
4510			}
4511			continue;
4512
4513		case D_PAGEDEP:
4514			pagedep = WK_PAGEDEP(wk);
4515			for (i = 0; i < DAHASHSZ; i++) {
4516				for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]);
4517				     dap; dap = LIST_NEXT(dap, da_pdlist)) {
4518					/* directory entry dependency */
4519					retval += 1;
4520					if (!wantcount)
4521						goto out;
4522				}
4523			}
4524			continue;
4525
4526		case D_BMSAFEMAP:
4527		case D_ALLOCDIRECT:
4528		case D_ALLOCINDIR:
4529		case D_MKDIR:
4530			/* never a dependency on these blocks */
4531			continue;
4532
4533		default:
4534			panic("softdep_count_dependencies: Unexpected type %s",
4535			    TYPENAME(wk->wk_type));
4536			/* NOTREACHED */
4537		}
4538	}
4539out:
4540	FREE_LOCK(&lk);
4541	return (retval);
4542}
4543
4544/*
4545 * Acquire exclusive access to a buffer.
4546 * Must be called with splbio blocked.
4547 * Return 1 if buffer was acquired.
4548 */
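/*
 * Typical usage, as in softdep_update_inodeblock() above:
 *
 *	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
 *	FREE_LOCK(&lk);
 *	if (gotit)
 *		error = VOP_BWRITE(inodedep->id_buf->b_vp,
 *		    inodedep->id_buf);
 *
 * The buffer comes back locked and removed from its free list
 * (bremfree), ready to be handed to bawrite() or VOP_BWRITE().
 */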
4549static int
4550getdirtybuf(bpp, waitfor)
4551	struct buf **bpp;
4552	int waitfor;
4553{
4554	struct buf *bp;
4555
4556	for (;;) {
4557		if ((bp = *bpp) == NULL)
4558			return (0);
4559		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
4560			if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
4561				break;
4562			BUF_UNLOCK(bp);
4563			if (waitfor != MNT_WAIT)
4564				return (0);
4565			bp->b_xflags |= BX_BKGRDWAIT;
4566			FREE_LOCK_INTERLOCKED(&lk);
4567			tsleep(&bp->b_xflags, PRIBIO, "getbuf", 0);
4568			ACQUIRE_LOCK_INTERLOCKED(&lk);
4569			if (bp->b_xflags & BX_BKGRDINPROG)
4570				panic("getdirtybuf: still writing");
4571			continue;
4572		}
4573		if (waitfor != MNT_WAIT)
4574			return (0);
4575		FREE_LOCK_INTERLOCKED(&lk);
4576		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK)
4577			panic("getdirtybuf: inconsistent lock");
4578		ACQUIRE_LOCK_INTERLOCKED(&lk);
4579	}
4580	if ((bp->b_flags & B_DELWRI) == 0) {
4581		BUF_UNLOCK(bp);
4582		return (0);
4583	}
4584	bremfree(bp);
4585	return (1);
4586}
4587
4588/*
4589 * Wait for pending output on a vnode to complete.
4590 * Must be called with vnode locked.
4591 */
4592static void
4593drain_output(vp, islocked)
4594	struct vnode *vp;
4595	int islocked;
4596{
4597
4598	if (!islocked)
4599		ACQUIRE_LOCK(&lk);
4600	while (vp->v_numoutput) {
4601		vp->v_flag |= VBWAIT;
4602		FREE_LOCK_INTERLOCKED(&lk);
4603		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0);
4604		ACQUIRE_LOCK_INTERLOCKED(&lk);
4605	}
4606	if (!islocked)
4607		FREE_LOCK(&lk);
4608}
4609
4610/*
4611 * Called whenever a buffer that is being invalidated or reallocated
4612 * contains dependencies. This should only happen if an I/O error has
4613 * occurred. The routine is called with the buffer locked.
4614 */
4615static void
4616softdep_deallocate_dependencies(bp)
4617	struct buf *bp;
4618{
4619
4620	if ((bp->b_flags & B_ERROR) == 0)
4621		panic("softdep_deallocate_dependencies: dangling deps");
4622	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4623	panic("softdep_deallocate_dependencies: unrecovered I/O error");
4624}
4625
4626/*
4627 * Function to handle asynchronous write errors in the filesystem.
4628 */
4629void
4630softdep_error(func, error)
4631	char *func;
4632	int error;
4633{
4634
4635	/* XXX should do something better! */
4636	printf("%s: got error %d while accessing filesystem\n", func, error);
4637}
4638