ffs_softdep.c revision 58934
1/*
2 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * The soft updates code is derived from the appendix of a University
5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6 * "Soft Updates: A Solution to the Metadata Update Problem in File
7 * Systems", CSE-TR-254-95, August 1995).
8 *
9 * The following are the copyrights and redistribution conditions that
10 * apply to this copy of the soft update software. For a license
11 * to use, redistribute or sell the soft update software under
12 * conditions other than those described here, please contact the
13 * author at one of the following addresses:
14 *
15 *	Marshall Kirk McKusick		mckusick@mckusick.com
16 *	1614 Oxford Street		+1-510-843-9542
17 *	Berkeley, CA 94709-1608
18 *	USA
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
22 * are met:
23 *
24 * 1. Redistributions of source code must retain the above copyright
25 *    notice, this list of conditions and the following disclaimer.
26 * 2. Redistributions in binary form must reproduce the above copyright
27 *    notice, this list of conditions and the following disclaimer in the
28 *    documentation and/or other materials provided with the distribution.
29 * 3. None of the names of McKusick, Ganger, Patt, or the University of
30 *    Michigan may be used to endorse or promote products derived from
31 *    this software without specific prior written permission.
32 * 4. Redistributions in any form must be accompanied by information on
33 *    how to obtain complete source code for any accompanying software
34 *    that uses this software. This source code must either be included
35 *    in the distribution or be available for no more than the cost of
36 *    distribution plus a nominal fee, and must be freely redistributable
37 *    under reasonable conditions. For an executable file, complete
38 *    source code means the source code for all modules it contains.
39 *    It does not mean source code for modules or files that typically
40 *    accompany the operating system on which the executable file runs,
41 *    e.g., standard library modules or system header files.
42 *
43 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
44 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
45 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
47 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 *	from: @(#)ffs_softdep.c	9.56 (McKusick) 1/17/00
56 * $FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 58934 2000-04-02 15:24:56Z phk $
57 */
58
59/*
60 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
61 */
62#ifndef DIAGNOSTIC
63#define DIAGNOSTIC
64#endif
65#ifndef DEBUG
66#define DEBUG
67#endif
68
69#include <sys/param.h>
70#include <sys/kernel.h>
71#include <sys/systm.h>
72#include <sys/buf.h>
73#include <sys/malloc.h>
74#include <sys/mount.h>
75#include <sys/proc.h>
76#include <sys/syslog.h>
77#include <sys/vnode.h>
78#include <sys/conf.h>
79#include <ufs/ufs/dir.h>
80#include <ufs/ufs/quota.h>
81#include <ufs/ufs/inode.h>
82#include <ufs/ufs/ufsmount.h>
83#include <ufs/ffs/fs.h>
84#include <ufs/ffs/softdep.h>
85#include <ufs/ffs/ffs_extern.h>
86#include <ufs/ufs/ufs_extern.h>
87
88/*
89 * These definitions need to be adapted to the system to which
90 * this file is being ported.
91 */
92/*
93 * malloc types defined for the softdep system.
94 */
95MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
96MALLOC_DEFINE(M_INODEDEP, "inodedep", "Inode dependencies");
97MALLOC_DEFINE(M_NEWBLK, "newblk", "New block allocation");
98MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap", "Block or frag allocated from cyl group map");
99MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect", "Block or frag dependency for an inode");
100MALLOC_DEFINE(M_INDIRDEP, "indirdep", "Indirect block dependencies");
101MALLOC_DEFINE(M_ALLOCINDIR, "allocindir", "Block dependency for an indirect block");
102MALLOC_DEFINE(M_FREEFRAG, "freefrag", "Previously used frag for an inode");
103MALLOC_DEFINE(M_FREEBLKS, "freeblks", "Blocks freed from an inode");
104MALLOC_DEFINE(M_FREEFILE, "freefile", "Inode deallocated");
105MALLOC_DEFINE(M_DIRADD, "diradd", "New directory entry");
106MALLOC_DEFINE(M_MKDIR, "mkdir", "New directory");
107MALLOC_DEFINE(M_DIRREM, "dirrem", "Directory entry deleted");
108
109#define	D_PAGEDEP	0
110#define	D_INODEDEP	1
111#define	D_NEWBLK	2
112#define	D_BMSAFEMAP	3
113#define	D_ALLOCDIRECT	4
114#define	D_INDIRDEP	5
115#define	D_ALLOCINDIR	6
116#define	D_FREEFRAG	7
117#define	D_FREEBLKS	8
118#define	D_FREEFILE	9
119#define	D_DIRADD	10
120#define	D_MKDIR		11
121#define	D_DIRREM	12
122#define D_LAST		D_DIRREM
123
124/*
125 * Translate from workitem type to memory type.
126 * MUST match the defines above, such that memtype[D_XXX] == M_XXX.
127 */
128static struct malloc_type *memtype[] = {
129	M_PAGEDEP,
130	M_INODEDEP,
131	M_NEWBLK,
132	M_BMSAFEMAP,
133	M_ALLOCDIRECT,
134	M_INDIRDEP,
135	M_ALLOCINDIR,
136	M_FREEFRAG,
137	M_FREEBLKS,
138	M_FREEFILE,
139	M_DIRADD,
140	M_MKDIR,
141	M_DIRREM
142};
143
144#define DtoM(type) (memtype[type])
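
/*
 * For example, DtoM(D_PAGEDEP) yields M_PAGEDEP. If the D_* defines
 * above and memtype[] ever fall out of step, WORKITEM_FREE will hand
 * memory back to the wrong malloc type.
 */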
145
146/*
147 * Names of malloc types.
148 */
149#define TYPENAME(type)  \
150	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
151#define CURPROC curproc
152/*
153 * End system adaptation definitions.
154 */
155
156/*
157 * Internal function prototypes.
158 */
159static	void softdep_error __P((char *, int));
160static	void drain_output __P((struct vnode *, int));
161static	int getdirtybuf __P((struct buf **, int));
162static	void clear_remove __P((struct proc *));
163static	void clear_inodedeps __P((struct proc *));
164static	int flush_pagedep_deps __P((struct vnode *, struct mount *,
165	    struct diraddhd *));
166static	int flush_inodedep_deps __P((struct fs *, ino_t));
167static	int handle_written_filepage __P((struct pagedep *, struct buf *));
168static  void diradd_inode_written __P((struct diradd *, struct inodedep *));
169static	int handle_written_inodeblock __P((struct inodedep *, struct buf *));
170static	void handle_allocdirect_partdone __P((struct allocdirect *));
171static	void handle_allocindir_partdone __P((struct allocindir *));
172static	void initiate_write_filepage __P((struct pagedep *, struct buf *));
173static	void handle_written_mkdir __P((struct mkdir *, int));
174static	void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
175static	void handle_workitem_freefile __P((struct freefile *));
176static	void handle_workitem_remove __P((struct dirrem *));
177static	struct dirrem *newdirrem __P((struct buf *, struct inode *,
178	    struct inode *, int, struct dirrem **));
179static	void free_diradd __P((struct diradd *));
180static	void free_allocindir __P((struct allocindir *, struct inodedep *));
181static	int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
182	    long *));
183static	void deallocate_dependencies __P((struct buf *, struct inodedep *));
184static	void free_allocdirect __P((struct allocdirectlst *,
185	    struct allocdirect *, int));
186static	int check_inode_unwritten __P((struct inodedep *));
187static	int free_inodedep __P((struct inodedep *));
188static	void handle_workitem_freeblocks __P((struct freeblks *));
189static	void merge_inode_lists __P((struct inodedep *));
190static	void setup_allocindir_phase2 __P((struct buf *, struct inode *,
191	    struct allocindir *));
192static	struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
193	    ufs_daddr_t));
194static	void handle_workitem_freefrag __P((struct freefrag *));
195static	struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
196static	void allocdirect_merge __P((struct allocdirectlst *,
197	    struct allocdirect *, struct allocdirect *));
198static	struct bmsafemap *bmsafemap_lookup __P((struct buf *));
199static	int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
200	    struct newblk **));
201static	int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
202static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
203	    struct pagedep **));
204static	void pause_timer __P((void *));
205static	int request_cleanup __P((int, int));
206static	void add_to_worklist __P((struct worklist *));
207
208/*
209 * Exported softdep operations.
210 */
211static	void softdep_disk_io_initiation __P((struct buf *));
212static	void softdep_disk_write_complete __P((struct buf *));
213static	void softdep_deallocate_dependencies __P((struct buf *));
214static	int softdep_fsync __P((struct vnode *));
215static	int softdep_process_worklist __P((struct mount *));
216static	void softdep_move_dependencies __P((struct buf *, struct buf *));
217static	int softdep_count_dependencies __P((struct buf *bp, int));
218
219struct bio_ops bioops = {
220	softdep_disk_io_initiation,		/* io_start */
221	softdep_disk_write_complete,		/* io_complete */
222	softdep_deallocate_dependencies,	/* io_deallocate */
223	softdep_fsync,				/* io_fsync */
224	softdep_process_worklist,		/* io_sync */
225	softdep_move_dependencies,		/* io_movedeps */
226	softdep_count_dependencies,		/* io_countdeps */
227};
228
229/*
230 * Locking primitives.
231 *
232 * For a uniprocessor, all we need to do is protect against disk
233 * interrupts. For a multiprocessor, this lock would have to be
234 * a mutex. A single mutex is used throughout this file, though
235 * finer grain locking could be used if contention warranted it.
236 *
237 * For a multiprocessor, the sleep call would accept a lock and
238 * release it after the sleep processing was complete. In a uniprocessor
239 * implementation there is no such interlock, so we simply mark
240 * the places where it needs to be done with the `interlocked' form
241 * of the lock calls. Since the uniprocessor sleep already interlocks
242 * the spl, there is nothing that really needs to be done.
243 */
244#ifndef /* NOT */ DEBUG
245static struct lockit {
246	int	lkt_spl;
247} lk = { 0 };
248#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
249#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
250#define ACQUIRE_LOCK_INTERLOCKED(lk)
251#define FREE_LOCK_INTERLOCKED(lk)
252
253#else /* DEBUG */
254static struct lockit {
255	int	lkt_spl;
256	pid_t	lkt_held;
257} lk = { 0, -1 };
258static int lockcnt;
259
260static	void acquire_lock __P((struct lockit *));
261static	void free_lock __P((struct lockit *));
262static	void acquire_lock_interlocked __P((struct lockit *));
263static	void free_lock_interlocked __P((struct lockit *));
264
265#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
266#define FREE_LOCK(lk)			free_lock(lk)
267#define ACQUIRE_LOCK_INTERLOCKED(lk)	acquire_lock_interlocked(lk)
268#define FREE_LOCK_INTERLOCKED(lk)	free_lock_interlocked(lk)
269
270static void
271acquire_lock(lk)
272	struct lockit *lk;
273{
274
275	if (lk->lkt_held != -1) {
276		if (lk->lkt_held == CURPROC->p_pid)
277			panic("softdep_lock: locking against myself");
278		else
279			panic("softdep_lock: lock held by %d", lk->lkt_held);
280	}
281	lk->lkt_spl = splbio();
282	lk->lkt_held = CURPROC->p_pid;
283	lockcnt++;
284}
285
286static void
287free_lock(lk)
288	struct lockit *lk;
289{
290
291	if (lk->lkt_held == -1)
292		panic("softdep_unlock: lock not held");
293	lk->lkt_held = -1;
294	splx(lk->lkt_spl);
295}
296
297static void
298acquire_lock_interlocked(lk)
299	struct lockit *lk;
300{
301
302	if (lk->lkt_held != -1) {
303		if (lk->lkt_held == CURPROC->p_pid)
304			panic("softdep_lock_interlocked: locking against self");
305		else
306			panic("softdep_lock_interlocked: lock held by %d",
307			    lk->lkt_held);
308	}
309	lk->lkt_held = CURPROC->p_pid;
310	lockcnt++;
311}
312
313static void
314free_lock_interlocked(lk)
315	struct lockit *lk;
316{
317
318	if (lk->lkt_held == -1)
319		panic("softdep_unlock_interlocked: lock not held");
320	lk->lkt_held = -1;
321}
322#endif /* DEBUG */
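
/*
 * Usage sketch (illustrative, not part of the original code): callers
 * bracket all dependency-list manipulation with these macros so that
 * a disk interrupt cannot observe a half-updated list:
 *
 *	ACQUIRE_LOCK(&lk);
 *	... inspect or modify dependency structures ...
 *	FREE_LOCK(&lk);
 *
 * The DEBUG variants add self-deadlock and ownership checks but keep
 * the same calling convention.
 */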
323
324/*
325 * Placeholder for real semaphores.
326 */
327struct sema {
328	int	value;
329	pid_t	holder;
330	char	*name;
331	int	prio;
332	int	timo;
333};
334static	void sema_init __P((struct sema *, char *, int, int));
335static	int sema_get __P((struct sema *, struct lockit *));
336static	void sema_release __P((struct sema *));
337
338static void
339sema_init(semap, name, prio, timo)
340	struct sema *semap;
341	char *name;
342	int prio, timo;
343{
344
345	semap->holder = -1;
346	semap->value = 0;
347	semap->name = name;
348	semap->prio = prio;
349	semap->timo = timo;
350}
351
352static int
353sema_get(semap, interlock)
354	struct sema *semap;
355	struct lockit *interlock;
356{
357
358	if (semap->value++ > 0) {
359		if (interlock != NULL)
360			FREE_LOCK_INTERLOCKED(interlock);
361		tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
362		if (interlock != NULL) {
363			ACQUIRE_LOCK_INTERLOCKED(interlock);
364			FREE_LOCK(interlock);
365		}
366		return (0);
367	}
368	semap->holder = CURPROC->p_pid;
369	if (interlock != NULL)
370		FREE_LOCK(interlock);
371	return (1);
372}
373
374static void
375sema_release(semap)
376	struct sema *semap;
377{
378
379	if (semap->value <= 0 || semap->holder != CURPROC->p_pid)
380		panic("sema_release: not held");
381	if (--semap->value > 0) {
382		semap->value = 0;
383		wakeup(semap);
384	}
385	semap->holder = -1;
386}
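
/*
 * Usage sketch, mirroring the lookup routines below: sema_get() either
 * grants entry (returning 1) or sleeps and returns 0, in which case the
 * caller re-acquires the lock and retries its search:
 *
 *	if (sema_get(&pagedep_in_progress, &lk) == 0) {
 *		ACQUIRE_LOCK(&lk);
 *		goto top;
 *	}
 *	... allocate and insert the new structure ...
 *	sema_release(&pagedep_in_progress);
 */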
387
388/*
389 * Worklist queue management.
390 * These routines require that the lock be held.
391 */
392#ifndef /* NOT */ DEBUG
393#define WORKLIST_INSERT(head, item) do {	\
394	(item)->wk_state |= ONWORKLIST;		\
395	LIST_INSERT_HEAD(head, item, wk_list);	\
396} while (0)
397#define WORKLIST_REMOVE(item) do {		\
398	(item)->wk_state &= ~ONWORKLIST;	\
399	LIST_REMOVE(item, wk_list);		\
400} while (0)
401#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
402
403#else /* DEBUG */
404static	void worklist_insert __P((struct workhead *, struct worklist *));
405static	void worklist_remove __P((struct worklist *));
406static	void workitem_free __P((struct worklist *, int));
407
408#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
409#define WORKLIST_REMOVE(item) worklist_remove(item)
410#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
411
412static void
413worklist_insert(head, item)
414	struct workhead *head;
415	struct worklist *item;
416{
417
418	if (lk.lkt_held == -1)
419		panic("worklist_insert: lock not held");
420	if (item->wk_state & ONWORKLIST)
421		panic("worklist_insert: already on list");
422	item->wk_state |= ONWORKLIST;
423	LIST_INSERT_HEAD(head, item, wk_list);
424}
425
426static void
427worklist_remove(item)
428	struct worklist *item;
429{
430
431	if (lk.lkt_held == -1)
432		panic("worklist_remove: lock not held");
433	if ((item->wk_state & ONWORKLIST) == 0)
434		panic("worklist_remove: not on list");
435	item->wk_state &= ~ONWORKLIST;
436	LIST_REMOVE(item, wk_list);
437}
438
439static void
440workitem_free(item, type)
441	struct worklist *item;
442	int type;
443{
444
445	if (item->wk_state & ONWORKLIST)
446		panic("workitem_free: still on list");
447	if (item->wk_type != type)
448		panic("workitem_free: type mismatch");
449	FREE(item, DtoM(type));
450}
451#endif /* DEBUG */
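
/*
 * Lifecycle sketch (derived from the routines in this file): a work
 * item is allocated with its wk_type set, linked onto a dependency
 * list, and eventually unlinked and freed to its matching malloc type:
 *
 *	adp->ad_list.wk_type = D_ALLOCDIRECT;
 *	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 *	...
 *	WORKLIST_REMOVE(&adp->ad_list);
 *	WORKITEM_FREE(adp, D_ALLOCDIRECT);
 */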
452
453/*
454 * Workitem queue management
455 */
456static struct workhead softdep_workitem_pending;
457static int softdep_worklist_busy;
458static int max_softdeps;	/* maximum number of structs before slowdown */
459static int tickdelay = 2;	/* number of ticks to pause during slowdown */
460static int proc_waiting;	/* tracks whether we have a timeout posted */
461static struct proc *filesys_syncer; /* proc of filesystem syncer process */
462static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
463#define FLUSH_INODES	1
464static int req_clear_remove;	/* syncer process flush some freeblks */
465#define FLUSH_REMOVE	2
466/*
467 * runtime statistics
468 */
469static int stat_blk_limit_push;	/* number of times block limit neared */
470static int stat_ino_limit_push;	/* number of times inode limit neared */
471static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
472static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
473static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
474static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
475static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
476static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
477#ifdef DEBUG
478#include <vm/vm.h>
479#include <sys/sysctl.h>
480SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
481SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
482SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
483SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
484SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
485SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
486SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
487SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
488SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
489SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
490#endif /* DEBUG */
491
492/*
493 * Add an item to the end of the work queue.
494 * This routine requires that the lock be held.
495 * This is the only routine that adds items to the list.
496 * The following routine is the only one that removes items
497 * and does so in order from first to last.
498 */
499static void
500add_to_worklist(wk)
501	struct worklist *wk;
502{
503	static struct worklist *worklist_tail;
504
505	if (wk->wk_state & ONWORKLIST)
506		panic("add_to_worklist: already on list");
507	wk->wk_state |= ONWORKLIST;
508	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
509		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
510	else
511		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
512	worklist_tail = wk;
513}
514
515/*
516 * Process that runs once per second to handle items in the background queue.
517 *
518 * Note that items are processed in the order in which they
519 * appear in the queue. The code below depends on this property to ensure
520 * that blocks of a file are freed before the inode itself is freed. This
521 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
522 * until all the old ones have been purged from the dependency lists.
523 */
524static int
525softdep_process_worklist(matchmnt)
526	struct mount *matchmnt;
527{
528	struct proc *p = CURPROC;
529	struct worklist *wk;
530	struct fs *matchfs;
531	int matchcnt, loopcount;
532
533	/*
534	 * Record the process identifier of our caller so that we can give
535	 * this process preferential treatment in request_cleanup below.
536	 */
537	filesys_syncer = p;
538	matchcnt = 0;
539	matchfs = NULL;
540	if (matchmnt != NULL)
541		matchfs = VFSTOUFS(matchmnt)->um_fs;
542	/*
543	 * There is no danger of having multiple processes run this
544	 * code. It is single threaded solely so that softdep_flushfiles
545	 * (below) can get an accurate count of the number of items
546	 * related to its mount point that are in the list.
547	 */
548	if (softdep_worklist_busy && matchmnt == NULL)
549		return (-1);
550	/*
551	 * If requested, try removing inode or removal dependencies.
552	 */
553	if (req_clear_inodedeps) {
554		clear_inodedeps(p);
555		req_clear_inodedeps = 0;
556		wakeup(&proc_waiting);
557	}
558	if (req_clear_remove) {
559		clear_remove(p);
560		req_clear_remove = 0;
561		wakeup(&proc_waiting);
562	}
563	ACQUIRE_LOCK(&lk);
564	loopcount = 1;
565	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
566		WORKLIST_REMOVE(wk);
567		FREE_LOCK(&lk);
568		switch (wk->wk_type) {
569
570		case D_DIRREM:
571			/* removal of a directory entry */
572			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
573				matchcnt += 1;
574			handle_workitem_remove(WK_DIRREM(wk));
575			break;
576
577		case D_FREEBLKS:
578			/* releasing blocks and/or fragments from a file */
579			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
580				matchcnt += 1;
581			handle_workitem_freeblocks(WK_FREEBLKS(wk));
582			break;
583
584		case D_FREEFRAG:
585			/* releasing a fragment when replaced as a file grows */
586			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
587				matchcnt += 1;
588			handle_workitem_freefrag(WK_FREEFRAG(wk));
589			break;
590
591		case D_FREEFILE:
592			/* releasing an inode when its link count drops to 0 */
593			if (WK_FREEFILE(wk)->fx_fs == matchfs)
594				matchcnt += 1;
595			handle_workitem_freefile(WK_FREEFILE(wk));
596			break;
597
598		default:
599			panic("%s_process_worklist: Unknown type %s",
600			    "softdep", TYPENAME(wk->wk_type));
601			/* NOTREACHED */
602		}
603		if (softdep_worklist_busy && matchmnt == NULL)
604			return (-1);
605		/*
606		 * If requested, try removing inode or removal dependencies.
607		 */
608		if (req_clear_inodedeps) {
609			clear_inodedeps(p);
610			req_clear_inodedeps = 0;
611			wakeup(&proc_waiting);
612		}
613		if (req_clear_remove) {
614			clear_remove(p);
615			req_clear_remove = 0;
616			wakeup(&proc_waiting);
617		}
618		/*
619		 * We do not generally want to stop for buffer space, but if
620		 * we are really being a buffer hog, we will stop and wait.
621		 */
622		if (loopcount++ % 128 == 0)
623			bwillwrite();
624		ACQUIRE_LOCK(&lk);
625	}
626	FREE_LOCK(&lk);
627	return (matchcnt);
628}
629
630/*
631 * Move dependencies from one buffer to another.
632 */
633static void
634softdep_move_dependencies(oldbp, newbp)
635	struct buf *oldbp;
636	struct buf *newbp;
637{
638	struct worklist *wk, *wktail;
639
640	if (LIST_FIRST(&newbp->b_dep) != NULL)
641		panic("softdep_move_dependencies: need merge code");
642	wktail = 0;
643	ACQUIRE_LOCK(&lk);
644	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
645		LIST_REMOVE(wk, wk_list);
646		if (wktail == 0)
647			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
648		else
649			LIST_INSERT_AFTER(wktail, wk, wk_list);
650		wktail = wk;
651	}
652	FREE_LOCK(&lk);
653}
654
655/*
656 * Purge the work list of all items associated with a particular mount point.
657 */
658int
659softdep_flushfiles(oldmnt, flags, p)
660	struct mount *oldmnt;
661	int flags;
662	struct proc *p;
663{
664	struct vnode *devvp;
665	int error, loopcnt;
666
667	/*
668	 * Await our turn to clear out the queue.
669	 */
670	while (softdep_worklist_busy)
671		tsleep(&lbolt, PRIBIO, "softflush", 0);
672	softdep_worklist_busy = 1;
673	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
674		softdep_worklist_busy = 0;
675		return (error);
676	}
677	/*
678	 * Alternately flush the block device associated with the mount
679	 * point and process any dependencies that the flushing
680	 * creates. In theory, this loop can iterate at most twice,
681	 * but we give it a few extra passes just to be sure.
682	 */
683	devvp = VFSTOUFS(oldmnt)->um_devvp;
684	for (loopcnt = 10; loopcnt > 0; ) {
685		if (softdep_process_worklist(oldmnt) == 0) {
686			loopcnt--;
687			/*
688			 * Do another flush in case any vnodes were brought in
689			 * as part of the cleanup operations.
690			 */
691			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
692				break;
693			/*
694			 * If we still found nothing to do, we are really done.
695			 */
696			if (softdep_process_worklist(oldmnt) == 0)
697				break;
698		}
699		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
700		error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
701		VOP_UNLOCK(devvp, 0, p);
702		if (error)
703			break;
704	}
705	softdep_worklist_busy = 0;
706	/*
707	 * If we are unmounting then it is an error to fail. If we
708	 * are simply trying to downgrade to read-only, then filesystem
709	 * activity can keep us busy forever, so we just fail with EBUSY.
710	 */
711	if (loopcnt == 0) {
712		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
713			panic("softdep_flushfiles: looping");
714		error = EBUSY;
715	}
716	return (error);
717}
718
719/*
720 * Structure hashing.
721 *
722 * There are three types of structures that can be looked up:
723 *	1) pagedep structures identified by mount point, inode number,
724 *	   and logical block.
725 *	2) inodedep structures identified by mount point and inode number.
726 *	3) newblk structures identified by mount point and
727 *	   physical block number.
728 *
729 * The "pagedep" and "inodedep" dependency structures are hashed
730 * separately from the file blocks and inodes to which they correspond.
731 * This separation helps when the in-memory copy of an inode or
732 * file block must be replaced. It also obviates the need to access
733 * an inode or file page when simply updating (or de-allocating)
734 * dependency structures. Lookup of newblk structures is needed to
735 * find newly allocated blocks when trying to associate them with
736 * their allocdirect or allocindir structure.
737 *
738 * The lookup routines optionally create and hash a new instance when
739 * an existing entry is not found.
740 */
741#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
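
/*
 * Convention sketch: each lookup below returns 1 when an existing entry
 * is found and 0 when it is not; with DEPALLOC set, a return of 0 means
 * a fresh entry was allocated and hashed:
 *
 *	if (inodedep_lookup(fs, inum, DEPALLOC, &inodedep) == 0)
 *		... a new inodedep was created; set up its dependencies ...
 */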
742
743/*
744 * Structures and routines associated with pagedep caching.
745 */
746LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
747u_long	pagedep_hash;		/* size of hash table - 1 */
748#define	PAGEDEP_HASH(mp, inum, lbn) \
749	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
750	    pagedep_hash])
751static struct sema pagedep_in_progress;
752
753/*
754 * Look up a pagedep. Return 1 if found, 0 if not found.
755 * If not found, allocate if DEPALLOC flag is passed.
756 * Found or allocated entry is returned in pagedeppp.
757 * This routine must be called with splbio interrupts blocked.
758 */
759static int
760pagedep_lookup(ip, lbn, flags, pagedeppp)
761	struct inode *ip;
762	ufs_lbn_t lbn;
763	int flags;
764	struct pagedep **pagedeppp;
765{
766	struct pagedep *pagedep;
767	struct pagedep_hashhead *pagedephd;
768	struct mount *mp;
769	int i;
770
771#ifdef DEBUG
772	if (lk.lkt_held == -1)
773		panic("pagedep_lookup: lock not held");
774#endif
775	mp = ITOV(ip)->v_mount;
776	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
777top:
778	for (pagedep = LIST_FIRST(pagedephd); pagedep;
779	     pagedep = LIST_NEXT(pagedep, pd_hash))
780		if (ip->i_number == pagedep->pd_ino &&
781		    lbn == pagedep->pd_lbn &&
782		    mp == pagedep->pd_mnt)
783			break;
784	if (pagedep) {
785		*pagedeppp = pagedep;
786		return (1);
787	}
788	if ((flags & DEPALLOC) == 0) {
789		*pagedeppp = NULL;
790		return (0);
791	}
792	if (sema_get(&pagedep_in_progress, &lk) == 0) {
793		ACQUIRE_LOCK(&lk);
794		goto top;
795	}
796	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
797		M_WAITOK);
798	bzero(pagedep, sizeof(struct pagedep));
799	pagedep->pd_list.wk_type = D_PAGEDEP;
800	pagedep->pd_mnt = mp;
801	pagedep->pd_ino = ip->i_number;
802	pagedep->pd_lbn = lbn;
803	LIST_INIT(&pagedep->pd_dirremhd);
804	LIST_INIT(&pagedep->pd_pendinghd);
805	for (i = 0; i < DAHASHSZ; i++)
806		LIST_INIT(&pagedep->pd_diraddhd[i]);
807	ACQUIRE_LOCK(&lk);
808	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
809	sema_release(&pagedep_in_progress);
810	*pagedeppp = pagedep;
811	return (0);
812}
813
814/*
815 * Structures and routines associated with inodedep caching.
816 */
817LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
818static u_long	inodedep_hash;	/* size of hash table - 1 */
819static long	num_inodedep;	/* number of inodedep allocated */
820#define	INODEDEP_HASH(fs, inum) \
821      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
822static struct sema inodedep_in_progress;
823
824/*
825 * Look up an inodedep. Return 1 if found, 0 if not found.
826 * If not found, allocate if DEPALLOC flag is passed.
827 * Found or allocated entry is returned in inodedeppp.
828 * This routine must be called with splbio interrupts blocked.
829 */
830static int
831inodedep_lookup(fs, inum, flags, inodedeppp)
832	struct fs *fs;
833	ino_t inum;
834	int flags;
835	struct inodedep **inodedeppp;
836{
837	struct inodedep *inodedep;
838	struct inodedep_hashhead *inodedephd;
839	int firsttry;
840
841#ifdef DEBUG
842	if (lk.lkt_held == -1)
843		panic("inodedep_lookup: lock not held");
844#endif
845	firsttry = 1;
846	inodedephd = INODEDEP_HASH(fs, inum);
847top:
848	for (inodedep = LIST_FIRST(inodedephd); inodedep;
849	     inodedep = LIST_NEXT(inodedep, id_hash))
850		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
851			break;
852	if (inodedep) {
853		*inodedeppp = inodedep;
854		return (1);
855	}
856	if ((flags & DEPALLOC) == 0) {
857		*inodedeppp = NULL;
858		return (0);
859	}
860	/*
861	 * If we are over our limit, try to improve the situation.
862	 */
863	if (num_inodedep > max_softdeps && firsttry && speedup_syncer() == 0 &&
864	    request_cleanup(FLUSH_INODES, 1)) {
865		firsttry = 0;
866		goto top;
867	}
868	if (sema_get(&inodedep_in_progress, &lk) == 0) {
869		ACQUIRE_LOCK(&lk);
870		goto top;
871	}
872	num_inodedep += 1;
873	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
874		M_INODEDEP, M_WAITOK);
875	inodedep->id_list.wk_type = D_INODEDEP;
876	inodedep->id_fs = fs;
877	inodedep->id_ino = inum;
878	inodedep->id_state = ALLCOMPLETE;
879	inodedep->id_nlinkdelta = 0;
880	inodedep->id_savedino = NULL;
881	inodedep->id_savedsize = -1;
882	inodedep->id_buf = NULL;
883	LIST_INIT(&inodedep->id_pendinghd);
884	LIST_INIT(&inodedep->id_inowait);
885	LIST_INIT(&inodedep->id_bufwait);
886	TAILQ_INIT(&inodedep->id_inoupdt);
887	TAILQ_INIT(&inodedep->id_newinoupdt);
888	ACQUIRE_LOCK(&lk);
889	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
890	sema_release(&inodedep_in_progress);
891	*inodedeppp = inodedep;
892	return (0);
893}
894
895/*
896 * Structures and routines associated with newblk caching.
897 */
898LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
899u_long	newblk_hash;		/* size of hash table - 1 */
900#define	NEWBLK_HASH(fs, inum) \
901	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
902static struct sema newblk_in_progress;
903
904/*
905 * Look up a newblk. Return 1 if found, 0 if not found.
906 * If not found, allocate if DEPALLOC flag is passed.
907 * Found or allocated entry is returned in newblkpp.
908 */
909static int
910newblk_lookup(fs, newblkno, flags, newblkpp)
911	struct fs *fs;
912	ufs_daddr_t newblkno;
913	int flags;
914	struct newblk **newblkpp;
915{
916	struct newblk *newblk;
917	struct newblk_hashhead *newblkhd;
918
919	newblkhd = NEWBLK_HASH(fs, newblkno);
920top:
921	for (newblk = LIST_FIRST(newblkhd); newblk;
922	     newblk = LIST_NEXT(newblk, nb_hash))
923		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
924			break;
925	if (newblk) {
926		*newblkpp = newblk;
927		return (1);
928	}
929	if ((flags & DEPALLOC) == 0) {
930		*newblkpp = NULL;
931		return (0);
932	}
933	if (sema_get(&newblk_in_progress, 0) == 0)
934		goto top;
935	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
936		M_NEWBLK, M_WAITOK);
937	newblk->nb_state = 0;
938	newblk->nb_fs = fs;
939	newblk->nb_newblkno = newblkno;
940	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
941	sema_release(&newblk_in_progress);
942	*newblkpp = newblk;
943	return (0);
944}
945
946/*
947 * Executed during filesystem initialization before
948 * mounting any file systems.
949 */
950void
951softdep_initialize()
952{
953
954	LIST_INIT(&mkdirlisthd);
955	LIST_INIT(&softdep_workitem_pending);
956	max_softdeps = desiredvnodes * 8;
957	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
958	    &pagedep_hash);
959	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
960	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
961	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
962	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
963	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
964}
965
966/*
967 * Called at mount time to notify the dependency code that a
968 * filesystem wishes to use it.
969 */
970int
971softdep_mount(devvp, mp, fs, cred)
972	struct vnode *devvp;
973	struct mount *mp;
974	struct fs *fs;
975	struct ucred *cred;
976{
977	struct csum cstotal;
978	struct cg *cgp;
979	struct buf *bp;
980	int error, cyl;
981
982	mp->mnt_flag &= ~MNT_ASYNC;
983	mp->mnt_flag |= MNT_SOFTDEP;
984	/*
985	 * When doing soft updates, the counters in the
986	 * superblock may have gotten out of sync, so we have
987	 * to scan the cylinder groups and recalculate them.
988	 */
989	if (fs->fs_clean != 0)
990		return (0);
991	bzero(&cstotal, sizeof cstotal);
992	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
993		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
994		    fs->fs_cgsize, cred, &bp)) != 0) {
995			brelse(bp);
996			return (error);
997		}
998		cgp = (struct cg *)bp->b_data;
999		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1000		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1001		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1002		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1003		fs->fs_cs(fs, cyl) = cgp->cg_cs;
1004		brelse(bp);
1005	}
1006#ifdef DEBUG
1007	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1008		printf("ffs_mountfs: superblock updated for soft updates\n");
1009#endif
1010	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1011	return (0);
1012}
1013
1014/*
1015 * Protecting the freemaps (or bitmaps).
1016 *
1017 * To eliminate the need to execute fsck before mounting a file system
1018 * after a power failure, one must (conservatively) guarantee that the
1019 * on-disk copy of the bitmaps never indicate that a live inode or block is
1020 * free.  So, when a block or inode is allocated, the bitmap should be
1021 * updated (on disk) before any new pointers.  When a block or inode is
1022 * freed, the bitmap should not be updated until all pointers have been
1023 * reset.  The latter dependency is handled by the delayed de-allocation
1024 * approach described below for block and inode de-allocation.  The former
1025 * dependency is handled by calling the following procedure when a block or
1026 * inode is allocated. When an inode is allocated an "inodedep" is created
1027 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1028 * Each "inodedep" is also inserted into the hash indexing structure so
1029 * that any additional link additions can be made dependent on the inode
1030 * allocation.
1031 *
1032 * The ufs file system maintains a number of free block counts (e.g., per
1033 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1034 * in addition to the bitmaps.  These counts are used to improve efficiency
1035 * during allocation and therefore must be consistent with the bitmaps.
1036 * There is no convenient way to guarantee post-crash consistency of these
1037 * counts with simple update ordering, for two main reasons: (1) The counts
1038 * and bitmaps for a single cylinder group block are not in the same disk
1039 * sector.  If a disk write is interrupted (e.g., by power failure), one may
1040 * be written and the other not.  (2) Some of the counts are located in the
1041 * superblock rather than the cylinder group block. So, we focus our soft
1042 * updates implementation on protecting the bitmaps. When mounting a
1043 * filesystem, we recompute the auxiliary counts from the bitmaps.
1044 */
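
/*
 * In outline, the two setup routines below work as follows: the cylinder
 * group buffer is updated first, then the new inodedep or newblk is tied
 * to that buffer with its DEPCOMPLETE flag clear. Only when the bitmap
 * buffer reaches the disk is DEPCOMPLETE set, which in turn permits the
 * dependent inode block or block pointer to be written.
 */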
1045
1046/*
1047 * Called just after updating the cylinder group block to allocate an inode.
1048 */
1049void
1050softdep_setup_inomapdep(bp, ip, newinum)
1051	struct buf *bp;		/* buffer for cylgroup block with inode map */
1052	struct inode *ip;	/* inode related to allocation */
1053	ino_t newinum;		/* new inode number being allocated */
1054{
1055	struct inodedep *inodedep;
1056	struct bmsafemap *bmsafemap;
1057
1058	/*
1059	 * Create a dependency for the newly allocated inode.
1060	 * Panic if it already exists as something is seriously wrong.
1061	 * Otherwise add it to the dependency list for the buffer holding
1062	 * the cylinder group map from which it was allocated.
1063	 */
1064	ACQUIRE_LOCK(&lk);
1065	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
1066		panic("softdep_setup_inomapdep: found inode");
1067	inodedep->id_buf = bp;
1068	inodedep->id_state &= ~DEPCOMPLETE;
1069	bmsafemap = bmsafemap_lookup(bp);
1070	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1071	FREE_LOCK(&lk);
1072}
1073
1074/*
1075 * Called just after updating the cylinder group block to
1076 * allocate block or fragment.
1077 */
1078void
1079softdep_setup_blkmapdep(bp, fs, newblkno)
1080	struct buf *bp;		/* buffer for cylgroup block with block map */
1081	struct fs *fs;		/* filesystem doing allocation */
1082	ufs_daddr_t newblkno;	/* number of newly allocated block */
1083{
1084	struct newblk *newblk;
1085	struct bmsafemap *bmsafemap;
1086
1087	/*
1088	 * Create a dependency for the newly allocated block.
1089	 * Add it to the dependency list for the buffer holding
1090	 * the cylinder group map from which it was allocated.
1091	 */
1092	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1093		panic("softdep_setup_blkmapdep: found block");
1094	ACQUIRE_LOCK(&lk);
1095	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1096	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1097	FREE_LOCK(&lk);
1098}
1099
1100/*
1101 * Find the bmsafemap associated with a cylinder group buffer.
1102 * If none exists, create one. The buffer must be locked when
1103 * this routine is called and this routine must be called with
1104 * splbio interrupts blocked.
1105 */
1106static struct bmsafemap *
1107bmsafemap_lookup(bp)
1108	struct buf *bp;
1109{
1110	struct bmsafemap *bmsafemap;
1111	struct worklist *wk;
1112
1113#ifdef DEBUG
1114	if (lk.lkt_held == -1)
1115		panic("bmsafemap_lookup: lock not held");
1116#endif
1117	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list))
1118		if (wk->wk_type == D_BMSAFEMAP)
1119			return (WK_BMSAFEMAP(wk));
1120	FREE_LOCK(&lk);
1121	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1122		M_BMSAFEMAP, M_WAITOK);
1123	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1124	bmsafemap->sm_list.wk_state = 0;
1125	bmsafemap->sm_buf = bp;
1126	LIST_INIT(&bmsafemap->sm_allocdirecthd);
1127	LIST_INIT(&bmsafemap->sm_allocindirhd);
1128	LIST_INIT(&bmsafemap->sm_inodedephd);
1129	LIST_INIT(&bmsafemap->sm_newblkhd);
1130	ACQUIRE_LOCK(&lk);
1131	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1132	return (bmsafemap);
1133}
1134
1135/*
1136 * Direct block allocation dependencies.
1137 *
1138 * When a new block is allocated, the corresponding disk locations must be
1139 * initialized (with zeros or new data) before the on-disk inode points to
1140 * them.  Also, the freemap from which the block was allocated must be
1141 * updated (on disk) before the inode's pointer. These two dependencies are
1142 * independent of each other and are needed for all file blocks and indirect
1143 * blocks that are pointed to directly by the inode.  Just before the
1144 * "in-core" version of the inode is updated with a newly allocated block
1145 * number, a procedure (below) is called to setup allocation dependency
1146 * structures.  These structures are removed when the corresponding
1147 * dependencies are satisfied or when the block allocation becomes obsolete
1148 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1149 * fragment that gets upgraded).  All of these cases are handled in
1150 * procedures described later.
1151 *
1152 * When a file extension causes a fragment to be upgraded, either to a larger
1153 * fragment or to a full block, the on-disk location may change (if the
1154 * previous fragment could not simply be extended). In this case, the old
1155 * fragment must be de-allocated, but not until after the inode's pointer has
1156 * been updated. In most cases, this is handled by later procedures, which
1157 * will construct a "freefrag" structure to be added to the workitem queue
1158 * when the inode update is complete (or obsolete).  The main exception to
1159 * this is when an allocation occurs while a pending allocation dependency
1160 * (for the same block pointer) remains.  This case is handled in the main
1161 * allocation dependency setup procedure by immediately freeing the
1162 * unreferenced fragments.
1163 */
1164void
1165softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1166	struct inode *ip;	/* inode to which block is being added */
1167	ufs_lbn_t lbn;		/* block pointer within inode */
1168	ufs_daddr_t newblkno;	/* disk block number being added */
1169	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
1170	long newsize;		/* size of new block */
1171	long oldsize;		/* size of old block */
1172	struct buf *bp;		/* bp for allocated block */
1173{
1174	struct allocdirect *adp, *oldadp;
1175	struct allocdirectlst *adphead;
1176	struct bmsafemap *bmsafemap;
1177	struct inodedep *inodedep;
1178	struct pagedep *pagedep;
1179	struct newblk *newblk;
1180
1181	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1182		M_ALLOCDIRECT, M_WAITOK);
1183	bzero(adp, sizeof(struct allocdirect));
1184	adp->ad_list.wk_type = D_ALLOCDIRECT;
1185	adp->ad_lbn = lbn;
1186	adp->ad_newblkno = newblkno;
1187	adp->ad_oldblkno = oldblkno;
1188	adp->ad_newsize = newsize;
1189	adp->ad_oldsize = oldsize;
1190	adp->ad_state = ATTACHED;
1191	if (newblkno == oldblkno)
1192		adp->ad_freefrag = NULL;
1193	else
1194		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1195
1196	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1197		panic("softdep_setup_allocdirect: lost block");
1198
1199	ACQUIRE_LOCK(&lk);
1200	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
1201	adp->ad_inodedep = inodedep;
1202
1203	if (newblk->nb_state == DEPCOMPLETE) {
1204		adp->ad_state |= DEPCOMPLETE;
1205		adp->ad_buf = NULL;
1206	} else {
1207		bmsafemap = newblk->nb_bmsafemap;
1208		adp->ad_buf = bmsafemap->sm_buf;
1209		LIST_REMOVE(newblk, nb_deps);
1210		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1211	}
1212	LIST_REMOVE(newblk, nb_hash);
1213	FREE(newblk, M_NEWBLK);
1214
1215	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1216	if (lbn >= NDADDR) {
1217		/* allocating an indirect block */
1218		if (oldblkno != 0)
1219			panic("softdep_setup_allocdirect: non-zero indir");
1220	} else {
1221		/*
1222		 * Allocating a direct block.
1223		 *
1224		 * If we are allocating a directory block, then we must
1225		 * allocate an associated pagedep to track additions and
1226		 * deletions.
1227		 */
1228		if ((ip->i_mode & IFMT) == IFDIR &&
1229		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1230			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1231	}
1232	/*
1233	 * The list of allocdirects must be kept sorted in ascending
1234	 * order so that the rollback routines can quickly determine the
1235	 * first uncommitted block (the size of the file stored on disk
1236	 * ends at the end of the lowest committed fragment, or if there
1237	 * are no fragments, at the end of the highest committed block).
1238	 * Since files generally grow, the typical case is that the new
1239	 * block is to be added at the end of the list. We speed this
1240	 * special case by checking against the last allocdirect in the
1241	 * list before laboriously traversing the list looking for the
1242	 * insertion point.
1243	 */
1244	adphead = &inodedep->id_newinoupdt;
1245	oldadp = TAILQ_LAST(adphead, allocdirectlst);
1246	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1247		/* insert at end of list */
1248		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1249		if (oldadp != NULL && oldadp->ad_lbn == lbn)
1250			allocdirect_merge(adphead, adp, oldadp);
1251		FREE_LOCK(&lk);
1252		return;
1253	}
1254	for (oldadp = TAILQ_FIRST(adphead); oldadp;
1255	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
1256		if (oldadp->ad_lbn >= lbn)
1257			break;
1258	}
1259	if (oldadp == NULL)
1260		panic("softdep_setup_allocdirect: lost entry");
1261	/* insert in middle of list */
1262	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1263	if (oldadp->ad_lbn == lbn)
1264		allocdirect_merge(adphead, adp, oldadp);
1265	FREE_LOCK(&lk);
1266}
1267
1268/*
1269 * Replace an old allocdirect dependency with a newer one.
1270 * This routine must be called with splbio interrupts blocked.
1271 */
1272static void
1273allocdirect_merge(adphead, newadp, oldadp)
1274	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
1275	struct allocdirect *newadp;	/* allocdirect being added */
1276	struct allocdirect *oldadp;	/* existing allocdirect being checked */
1277{
1278	struct freefrag *freefrag;
1279
1280#ifdef DEBUG
1281	if (lk.lkt_held == -1)
1282		panic("allocdirect_merge: lock not held");
1283#endif
1284	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1285	    newadp->ad_oldsize != oldadp->ad_newsize ||
1286	    newadp->ad_lbn >= NDADDR)
1287		panic("allocdirect_check: old %d != new %d || lbn %ld >= %d",
1288		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1289		    NDADDR);
1290	newadp->ad_oldblkno = oldadp->ad_oldblkno;
1291	newadp->ad_oldsize = oldadp->ad_oldsize;
1292	/*
1293	 * If the old dependency had a fragment to free or had never
1294	 * previously had a block allocated, then the new dependency
1295	 * can immediately post its freefrag and adopt the old freefrag.
1296	 * This action is done by swapping the freefrag dependencies.
1297	 * The new dependency gains the old one's freefrag, and the
1298	 * old one gets the new one and then immediately puts it on
1299	 * the worklist when it is freed by free_allocdirect. It is
1300	 * not possible to do this swap when the old dependency had a
1301	 * non-zero size but no previous fragment to free. This condition
1302	 * arises when the new block is an extension of the old block.
1303	 * Here, the first part of the fragment allocated to the new
1304	 * dependency is part of the block currently claimed on disk by
1305	 * the old dependency, so cannot legitimately be freed until the
1306	 * conditions for the new dependency are fulfilled.
1307	 */
1308	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1309		freefrag = newadp->ad_freefrag;
1310		newadp->ad_freefrag = oldadp->ad_freefrag;
1311		oldadp->ad_freefrag = freefrag;
1312	}
1313	free_allocdirect(adphead, oldadp, 0);
1314}
1315
1316/*
1317 * Allocate a new freefrag structure if needed.
1318 */
1319static struct freefrag *
1320newfreefrag(ip, blkno, size)
1321	struct inode *ip;
1322	ufs_daddr_t blkno;
1323	long size;
1324{
1325	struct freefrag *freefrag;
1326	struct fs *fs;
1327
1328	if (blkno == 0)
1329		return (NULL);
1330	fs = ip->i_fs;
1331	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1332		panic("newfreefrag: frag size");
1333	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1334		M_FREEFRAG, M_WAITOK);
1335	freefrag->ff_list.wk_type = D_FREEFRAG;
1336	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
1337	freefrag->ff_inum = ip->i_number;
1338	freefrag->ff_fs = fs;
1339	freefrag->ff_devvp = ip->i_devvp;
1340	freefrag->ff_blkno = blkno;
1341	freefrag->ff_fragsize = size;
1342	return (freefrag);
1343}
1344
1345/*
1346 * This workitem de-allocates fragments that were replaced during
1347 * file block allocation.
1348 */
1349static void
1350handle_workitem_freefrag(freefrag)
1351	struct freefrag *freefrag;
1352{
1353	struct inode tip;
1354
1355	tip.i_fs = freefrag->ff_fs;
1356	tip.i_devvp = freefrag->ff_devvp;
1357	tip.i_dev = freefrag->ff_devvp->v_rdev;
1358	tip.i_number = freefrag->ff_inum;
1359	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
1360	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1361	FREE(freefrag, M_FREEFRAG);
1362}
1363
1364/*
1365 * Indirect block allocation dependencies.
1366 *
1367 * The same dependencies that exist for a direct block also exist when
1368 * a new block is allocated and pointed to by an entry in a block of
1369 * indirect pointers. The undo/redo states described above are also
1370 * used here. Because an indirect block contains many pointers that
1371 * may have dependencies, a second copy of the entire in-memory indirect
1372 * block is kept. The buffer cache copy is always completely up-to-date.
1373 * The second copy, which is used only as a source for disk writes,
1374 * contains only the safe pointers (i.e., those that have no remaining
1375 * update dependencies). The second copy is freed when all pointers
1376 * are safe. The cache is not allowed to replace indirect blocks with
1377 * pending update dependencies. If a buffer containing an indirect
1378 * block with dependencies is written, these routines will mark it
1379 * dirty again. It can only be successfully written once all the
1380 * dependencies are removed. The ffs_fsync routine in conjunction with
1381 * softdep_sync_metadata work together to get all the dependencies
1382 * removed so that a file can be successfully written to disk. Three
1383 * procedures are used when setting up indirect block pointer
1384 * dependencies. The division is necessary because of the organization
1385 * of the "balloc" routine and because of the distinction between file
1386 * pages and file metadata blocks.
1387 */
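
/*
 * Concretely (a sketch of the scheme described above): when entry
 * "ptrno" of an indirect block gains a pointer that is not yet safe,
 * the buffer cache copy keeps the new value while the save buffer is
 * rolled back to the old one, as setup_allocindir_phase2 does below:
 *
 *	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[ptrno] = oldblkno;
 */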
1388
1389/*
1390 * Allocate a new allocindir structure.
1391 */
1392static struct allocindir *
1393newallocindir(ip, ptrno, newblkno, oldblkno)
1394	struct inode *ip;	/* inode for file being extended */
1395	int ptrno;		/* offset of pointer in indirect block */
1396	ufs_daddr_t newblkno;	/* disk block number being added */
1397	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
1398{
1399	struct allocindir *aip;
1400
1401	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1402		M_ALLOCINDIR, M_WAITOK);
1403	bzero(aip, sizeof(struct allocindir));
1404	aip->ai_list.wk_type = D_ALLOCINDIR;
1405	aip->ai_state = ATTACHED;
1406	aip->ai_offset = ptrno;
1407	aip->ai_newblkno = newblkno;
1408	aip->ai_oldblkno = oldblkno;
1409	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1410	return (aip);
1411}
1412
1413/*
1414 * Called just before setting an indirect block pointer
1415 * to a newly allocated file page.
1416 */
1417void
1418softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1419	struct inode *ip;	/* inode for file being extended */
1420	ufs_lbn_t lbn;		/* allocated block number within file */
1421	struct buf *bp;		/* buffer with indirect blk referencing page */
1422	int ptrno;		/* offset of pointer in indirect block */
1423	ufs_daddr_t newblkno;	/* disk block number being added */
1424	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
1425	struct buf *nbp;	/* buffer holding allocated page */
1426{
1427	struct allocindir *aip;
1428	struct pagedep *pagedep;
1429
1430	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1431	ACQUIRE_LOCK(&lk);
1432	/*
1433	 * If we are allocating a directory page, then we must
1434	 * allocate an associated pagedep to track additions and
1435	 * deletions.
1436	 */
1437	if ((ip->i_mode & IFMT) == IFDIR &&
1438	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1439		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1440	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1441	FREE_LOCK(&lk);
1442	setup_allocindir_phase2(bp, ip, aip);
1443}
1444
1445/*
1446 * Called just before setting an indirect block pointer to a
1447 * newly allocated indirect block.
1448 */
1449void
1450softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1451	struct buf *nbp;	/* newly allocated indirect block */
1452	struct inode *ip;	/* inode for file being extended */
1453	struct buf *bp;		/* indirect block referencing allocated block */
1454	int ptrno;		/* offset of pointer in indirect block */
1455	ufs_daddr_t newblkno;	/* disk block number being added */
1456{
1457	struct allocindir *aip;
1458
1459	aip = newallocindir(ip, ptrno, newblkno, 0);
1460	ACQUIRE_LOCK(&lk);
1461	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1462	FREE_LOCK(&lk);
1463	setup_allocindir_phase2(bp, ip, aip);
1464}
1465
1466/*
1467 * Called to finish the allocation of the "aip" allocated
1468 * by one of the two routines above.
1469 */
1470static void
1471setup_allocindir_phase2(bp, ip, aip)
1472	struct buf *bp;		/* in-memory copy of the indirect block */
1473	struct inode *ip;	/* inode for file being extended */
1474	struct allocindir *aip;	/* allocindir allocated by the above routines */
1475{
1476	struct worklist *wk;
1477	struct indirdep *indirdep, *newindirdep;
1478	struct bmsafemap *bmsafemap;
1479	struct allocindir *oldaip;
1480	struct freefrag *freefrag;
1481	struct newblk *newblk;
1482
1483	if (bp->b_lblkno >= 0)
1484		panic("setup_allocindir_phase2: not indir blk");
1485	for (indirdep = NULL, newindirdep = NULL; ; ) {
1486		ACQUIRE_LOCK(&lk);
1487		for (wk = LIST_FIRST(&bp->b_dep); wk;
1488		     wk = LIST_NEXT(wk, wk_list)) {
1489			if (wk->wk_type != D_INDIRDEP)
1490				continue;
1491			indirdep = WK_INDIRDEP(wk);
1492			break;
1493		}
1494		if (indirdep == NULL && newindirdep) {
1495			indirdep = newindirdep;
1496			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1497			newindirdep = NULL;
1498		}
1499		FREE_LOCK(&lk);
1500		if (indirdep) {
1501			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1502			    &newblk) == 0)
1503				panic("setup_allocindir: lost block");
1504			ACQUIRE_LOCK(&lk);
1505			if (newblk->nb_state == DEPCOMPLETE) {
1506				aip->ai_state |= DEPCOMPLETE;
1507				aip->ai_buf = NULL;
1508			} else {
1509				bmsafemap = newblk->nb_bmsafemap;
1510				aip->ai_buf = bmsafemap->sm_buf;
1511				LIST_REMOVE(newblk, nb_deps);
1512				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1513				    aip, ai_deps);
1514			}
1515			LIST_REMOVE(newblk, nb_hash);
1516			FREE(newblk, M_NEWBLK);
1517			aip->ai_indirdep = indirdep;
1518			/*
1519			 * Check to see if there is an existing dependency
1520			 * for this block. If there is, merge the old
1521			 * dependency into the new one.
1522			 */
1523			if (aip->ai_oldblkno == 0)
1524				oldaip = NULL;
1525			else
1526				for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd);
1527				    oldaip; oldaip = LIST_NEXT(oldaip, ai_next))
1528					if (oldaip->ai_offset == aip->ai_offset)
1529						break;
1530			if (oldaip != NULL) {
1531				if (oldaip->ai_newblkno != aip->ai_oldblkno)
1532					panic("setup_allocindir_phase2: blkno");
1533				aip->ai_oldblkno = oldaip->ai_oldblkno;
1534				freefrag = oldaip->ai_freefrag;
1535				oldaip->ai_freefrag = aip->ai_freefrag;
1536				aip->ai_freefrag = freefrag;
1537				free_allocindir(oldaip, NULL);
1538			}
1539			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1540			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
1541			    [aip->ai_offset] = aip->ai_oldblkno;
1542			FREE_LOCK(&lk);
1543		}
1544		if (newindirdep) {
1545			if (newindirdep->ir_savebp != NULL)
1546				brelse(newindirdep->ir_savebp);
1547			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1548		}
1549		if (indirdep)
1550			break;
1551		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1552			M_INDIRDEP, M_WAITOK);
1553		newindirdep->ir_list.wk_type = D_INDIRDEP;
1554		newindirdep->ir_state = ATTACHED;
1555		LIST_INIT(&newindirdep->ir_deplisthd);
1556		LIST_INIT(&newindirdep->ir_donehd);
1557		if (bp->b_blkno == bp->b_lblkno) {
1558			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1559				NULL, NULL);
1560		}
1561		newindirdep->ir_savebp =
1562		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1563		BUF_KERNPROC(newindirdep->ir_savebp);
1564		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1565	}
1566}
1567
1568/*
1569 * Block de-allocation dependencies.
1570 *
1571 * When blocks are de-allocated, the on-disk pointers must be nullified before
1572 * the blocks are made available for use by other files.  (The true
1573 * requirement is that old pointers must be nullified before new on-disk
1574 * pointers are set.  We chose this slightly more stringent requirement to
1575 * reduce complexity.) Our implementation handles this dependency by updating
1576 * the inode (or indirect block) appropriately but delaying the actual block
1577 * de-allocation (i.e., freemap and free space count manipulation) until
1578 * after the updated versions reach stable storage.  After the disk is
1579 * updated, the blocks can be safely de-allocated whenever it is convenient.
1580 * This implementation handles only the common case of reducing a file's
1581 * length to zero. Other cases are handled by the conventional synchronous
1582 * write approach.
1583 *
1584 * The ffs implementation with which we worked double-checks
1585 * the state of the block pointers and file size as it reduces
1586 * a file's length.  Some of this code is replicated here in our
1587 * soft updates implementation.  The freeblks->fb_chkcnt field is
1588 * used to transfer a part of this information to the procedure
1589 * that eventually de-allocates the blocks.
1590 *
1591 * This routine should be called from the routine that shortens
1592 * a file's length, before the inode's size or block pointers
1593 * are modified. It will save the block pointer information for
1594 * later release and zero the inode so that the calling routine
1595 * can release it.
1596 */
1597void
1598softdep_setup_freeblocks(ip, length)
1599	struct inode *ip;	/* The inode whose length is to be reduced */
1600	off_t length;		/* The new length for the file */
1601{
1602	struct freeblks *freeblks;
1603	struct inodedep *inodedep;
1604	struct allocdirect *adp;
1605	struct vnode *vp;
1606	struct buf *bp;
1607	struct fs *fs;
1608	int i, error;
1609
1610	fs = ip->i_fs;
1611	if (length != 0)
1612		panic("softdep_setup_freeblocks: non-zero length");
1613	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1614		M_FREEBLKS, M_WAITOK);
1615	bzero(freeblks, sizeof(struct freeblks));
1616	freeblks->fb_list.wk_type = D_FREEBLKS;
1617	freeblks->fb_uid = ip->i_uid;
1618	freeblks->fb_previousinum = ip->i_number;
1619	freeblks->fb_devvp = ip->i_devvp;
1620	freeblks->fb_fs = fs;
1621	freeblks->fb_oldsize = ip->i_size;
1622	freeblks->fb_newsize = length;
1623	freeblks->fb_chkcnt = ip->i_blocks;
1624	for (i = 0; i < NDADDR; i++) {
1625		freeblks->fb_dblks[i] = ip->i_db[i];
1626		ip->i_db[i] = 0;
1627	}
1628	for (i = 0; i < NIADDR; i++) {
1629		freeblks->fb_iblks[i] = ip->i_ib[i];
1630		ip->i_ib[i] = 0;
1631	}
1632	ip->i_blocks = 0;
1633	ip->i_size = 0;
1634	/*
1635	 * Push the zero'ed inode to its disk buffer so that we are free
1636	 * to delete its dependencies below. Once the dependencies are gone
1637	 * the buffer can be safely released.
1638	 */
1639	if ((error = bread(ip->i_devvp,
1640	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1641	    (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1642		softdep_error("softdep_setup_freeblocks", error);
1643	*((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1644	    ip->i_din;
1645	/*
1646	 * Find and eliminate any inode dependencies.
1647	 */
1648	ACQUIRE_LOCK(&lk);
1649	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1650	if ((inodedep->id_state & IOSTARTED) != 0)
1651		panic("softdep_setup_freeblocks: inode busy");
1652	/*
1653	 * Because the file length has been truncated to zero, any
1654	 * pending block allocation dependency structures associated
1655	 * with this inode are obsolete and can simply be de-allocated.
1656	 * We must first merge the two dependency lists to get rid of
1657	 * any duplicate freefrag structures, then purge the merged list.
1658	 */
1659	merge_inode_lists(inodedep);
1660	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1661		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
1662	FREE_LOCK(&lk);
1663	bdwrite(bp);
1664	/*
1665	 * We must wait for any I/O in progress to finish so that
1666	 * all potential buffers on the dirty list will be visible.
1667	 * Once they are all there, walk the list and get rid of
1668	 * any dependencies.
1669	 */
1670	vp = ITOV(ip);
1671	ACQUIRE_LOCK(&lk);
1672	drain_output(vp, 1);
1673	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1674		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1675		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1676		deallocate_dependencies(bp, inodedep);
1677		bp->b_flags |= B_INVAL | B_NOCACHE;
1678		FREE_LOCK(&lk);
1679		brelse(bp);
1680		ACQUIRE_LOCK(&lk);
1681	}
1682	/*
1683	 * Add the freeblks structure to the list of operations that
1684	 * must await the zero'ed inode being written to disk. If we
1685	 * still have a bitmap dependency, then the inode has never been
1686	 * written to disk, so we can process the freeblks immediately.
1687	 * If the inodedep does not exist, then the zero'ed inode has
1688	 * been written and we can also proceed.
1689	 */
1690	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0 ||
1691	    free_inodedep(inodedep) ||
1692	    (inodedep->id_state & DEPCOMPLETE) == 0) {
1693		FREE_LOCK(&lk);
1694		handle_workitem_freeblocks(freeblks);
1695	} else {
1696		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1697		FREE_LOCK(&lk);
1698	}
1699}
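
/*
 * Editor's illustrative sketch (not part of the original source): the
 * capture-then-defer pattern used by softdep_setup_freeblocks above,
 * reduced to plain C with hypothetical simplified types.  The saved
 * pointers are only released after the zero'ed inode has reached
 * stable storage.
 */
#if 0
#define EX_NDADDR	12		/* direct pointers, as in ufs */

struct example_inode {
	int	db[EX_NDADDR];		/* direct block pointers */
};

struct example_freeblks {
	int	dblks[EX_NDADDR];	/* captured for deferred release */
};

static void
example_capture_blocks(struct example_inode *ip, struct example_freeblks *fb)
{
	int i;

	/*
	 * Save each pointer for deferred release, then nullify it.
	 * The caller now writes the zero'ed inode; the blocks noted
	 * in fb are freed only after that write completes.
	 */
	for (i = 0; i < EX_NDADDR; i++) {
		fb->dblks[i] = ip->db[i];
		ip->db[i] = 0;
	}
}
#endif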
1700
1701/*
1702 * Reclaim any dependency structures from a buffer that is about to
1703 * be reallocated to a new vnode. The buffer must be locked, thus,
1704 * no I/O completion operations can occur while we are manipulating
1705 * its associated dependencies. The mutex is held so that other I/O's
1706 * associated with related dependencies do not occur.
1707 */
1708static void
1709deallocate_dependencies(bp, inodedep)
1710	struct buf *bp;
1711	struct inodedep *inodedep;
1712{
1713	struct worklist *wk;
1714	struct indirdep *indirdep;
1715	struct allocindir *aip;
1716	struct pagedep *pagedep;
1717	struct dirrem *dirrem;
1718	struct diradd *dap;
1719	int i;
1720
1721	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1722		switch (wk->wk_type) {
1723
1724		case D_INDIRDEP:
1725			indirdep = WK_INDIRDEP(wk);
1726			/*
1727			 * None of the indirect pointers will ever be visible,
1728			 * so they can simply be tossed. GOINGAWAY ensures
1729			 * that allocated pointers will be saved in the buffer
1730			 * cache until they are freed. Note that they will
1731			 * only be able to be found by their physical address
1732			 * since the inode mapping the logical address will
1733			 * be gone. The save buffer used for the safe copy
1734			 * was allocated in setup_allocindir_phase2 using
1735			 * the physical address so it could be used for this
1736			 * purpose. Hence we swap the safe copy with the real
1737			 * copy, allowing the safe copy to be freed and holding
1738			 * on to the real copy for later use in indir_trunc.
1739			 */
1740			if (indirdep->ir_state & GOINGAWAY)
1741				panic("deallocate_dependencies: already gone");
1742			indirdep->ir_state |= GOINGAWAY;
1743			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1744				free_allocindir(aip, inodedep);
1745			if (bp->b_lblkno >= 0 ||
1746			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
1747				panic("deallocate_dependencies: not indir");
1748			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1749			    bp->b_bcount);
1750			WORKLIST_REMOVE(wk);
1751			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1752			continue;
1753
1754		case D_PAGEDEP:
1755			pagedep = WK_PAGEDEP(wk);
1756			/*
1757			 * None of the directory additions will ever be
1758			 * visible, so they can simply be tossed.
1759			 */
1760			for (i = 0; i < DAHASHSZ; i++)
1761				while ((dap =
1762				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
1763					free_diradd(dap);
1764			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1765				free_diradd(dap);
1766			/*
1767			 * Copy any directory remove dependencies to the list
1768			 * to be processed after the zero'ed inode is written.
1769			 * If the inode has already been written, then they
1770			 * can be dumped directly onto the work list.
1771			 */
1772			while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
1774				LIST_REMOVE(dirrem, dm_next);
1775				dirrem->dm_dirinum = pagedep->pd_ino;
1776				if (inodedep == NULL ||
1777				    (inodedep->id_state & ALLCOMPLETE) ==
1778				     ALLCOMPLETE)
1779					add_to_worklist(&dirrem->dm_list);
1780				else
1781					WORKLIST_INSERT(&inodedep->id_bufwait,
1782					    &dirrem->dm_list);
1783			}
1784			WORKLIST_REMOVE(&pagedep->pd_list);
1785			LIST_REMOVE(pagedep, pd_hash);
1786			WORKITEM_FREE(pagedep, D_PAGEDEP);
1787			continue;
1788
1789		case D_ALLOCINDIR:
1790			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1791			continue;
1792
1793		case D_ALLOCDIRECT:
1794		case D_INODEDEP:
1795			panic("deallocate_dependencies: Unexpected type %s",
1796			    TYPENAME(wk->wk_type));
1797			/* NOTREACHED */
1798
1799		default:
1800			panic("deallocate_dependencies: Unknown type %s",
1801			    TYPENAME(wk->wk_type));
1802			/* NOTREACHED */
1803		}
1804	}
1805}
1806
1807/*
1808 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1809 * This routine must be called with splbio interrupts blocked.
1810 */
1811static void
1812free_allocdirect(adphead, adp, delay)
1813	struct allocdirectlst *adphead;
1814	struct allocdirect *adp;
1815	int delay;
1816{
1817
1818#ifdef DEBUG
1819	if (lk.lkt_held == -1)
1820		panic("free_allocdirect: lock not held");
1821#endif
1822	if ((adp->ad_state & DEPCOMPLETE) == 0)
1823		LIST_REMOVE(adp, ad_deps);
1824	TAILQ_REMOVE(adphead, adp, ad_next);
1825	if ((adp->ad_state & COMPLETE) == 0)
1826		WORKLIST_REMOVE(&adp->ad_list);
1827	if (adp->ad_freefrag != NULL) {
1828		if (delay)
1829			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1830			    &adp->ad_freefrag->ff_list);
1831		else
1832			add_to_worklist(&adp->ad_freefrag->ff_list);
1833	}
1834	WORKITEM_FREE(adp, D_ALLOCDIRECT);
1835}
1836
1837/*
1838 * Prepare an inode to be freed. The actual free operation is not
1839 * done until the zero'ed inode has been written to disk.
1840 */
1841void
1842softdep_freefile(pvp, ino, mode)
1843	struct vnode *pvp;
1844	ino_t ino;
1845	int mode;
1846{
1847	struct inode *ip = VTOI(pvp);
1848	struct inodedep *inodedep;
1849	struct freefile *freefile;
1850
1851	/*
1852	 * This sets up the inode de-allocation dependency.
1853	 */
1854	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1855		M_FREEFILE, M_WAITOK);
1856	freefile->fx_list.wk_type = D_FREEFILE;
1857	freefile->fx_list.wk_state = 0;
1858	freefile->fx_mode = mode;
1859	freefile->fx_oldinum = ino;
1860	freefile->fx_devvp = ip->i_devvp;
1861	freefile->fx_fs = ip->i_fs;
1862
1863	/*
1864	 * If the inodedep does not exist, then the zero'ed inode has
1865	 * been written to disk. If the allocated inode has never been
1866	 * written to disk, then the on-disk inode is zero'ed. In either
1867	 * case we can free the file immediately.
1868	 */
1869	ACQUIRE_LOCK(&lk);
1870	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
1871	    check_inode_unwritten(inodedep)) {
1872		FREE_LOCK(&lk);
1873		handle_workitem_freefile(freefile);
1874		return;
1875	}
1876	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
1877	FREE_LOCK(&lk);
1878}
1879
1880/*
1881 * Check to see if an inode has never been written to disk. If
1882 * so free the inodedep and return success, otherwise return failure.
1883 * This routine must be called with splbio interrupts blocked.
1884 *
1885 * If we still have a bitmap dependency, then the inode has never
1886 * been written to disk. Drop the dependency as it is no longer
1887 * necessary since the inode is being deallocated. We set the
1888 * ALLCOMPLETE flags since the bitmap now properly shows that the
1889 * inode is not allocated. Even if the inode is actively being
1890 * written, it has been rolled back to its zero'ed state, so we
1891 * are ensured that a zero inode is what is on the disk. For short
1892 * lived files, this change will usually result in removing all the
1893 * dependencies from the inode so that it can be freed immediately.
1894 */
1895static int
1896check_inode_unwritten(inodedep)
1897	struct inodedep *inodedep;
1898{
1899
1900	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
1901	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
1902	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
1903	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
1904	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
1905	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
1906	    inodedep->id_nlinkdelta != 0)
1907		return (0);
1908	inodedep->id_state |= ALLCOMPLETE;
1909	LIST_REMOVE(inodedep, id_deps);
1910	inodedep->id_buf = NULL;
1911	WORKLIST_REMOVE(&inodedep->id_list);
1912	if (inodedep->id_savedino != NULL) {
1913		FREE(inodedep->id_savedino, M_INODEDEP);
1914		inodedep->id_savedino = NULL;
1915	}
1916	if (free_inodedep(inodedep) == 0)
1917		panic("check_inode_unwritten: busy inode");
1918	return (1);
1919}
1920
1921/*
1922 * Try to free an inodedep structure. Return 1 if it could be freed.
1923 */
1924static int
1925free_inodedep(inodedep)
1926	struct inodedep *inodedep;
1927{
1928
1929	if ((inodedep->id_state & ONWORKLIST) != 0 ||
1930	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
1931	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
1932	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
1933	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
1934	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
1935	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
1936	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
1937		return (0);
1938	LIST_REMOVE(inodedep, id_hash);
1939	WORKITEM_FREE(inodedep, D_INODEDEP);
1940	num_inodedep -= 1;
1941	return (1);
1942}
1943
1944/*
1945 * This workitem routine performs the block de-allocation.
1946 * The workitem is added to the pending list after the updated
1947 * inode block has been written to disk.  As mentioned above,
1948 * checks regarding the number of blocks de-allocated (compared
1949 * to the number of blocks allocated for the file) are also
1950 * performed in this function.
1951 */
1952static void
1953handle_workitem_freeblocks(freeblks)
1954	struct freeblks *freeblks;
1955{
1956	struct inode tip;
1957	ufs_daddr_t bn;
1958	struct fs *fs;
1959	int i, level, bsize;
1960	long nblocks, blocksreleased = 0;
1961	int error, allerror = 0;
1962	ufs_lbn_t baselbns[NIADDR], tmpval;
1963
1964	tip.i_number = freeblks->fb_previousinum;
1965	tip.i_devvp = freeblks->fb_devvp;
1966	tip.i_dev = freeblks->fb_devvp->v_rdev;
1967	tip.i_fs = freeblks->fb_fs;
1968	tip.i_size = freeblks->fb_oldsize;
1969	tip.i_uid = freeblks->fb_uid;
1970	fs = freeblks->fb_fs;
1971	tmpval = 1;
1972	baselbns[0] = NDADDR;
1973	for (i = 1; i < NIADDR; i++) {
1974		tmpval *= NINDIR(fs);
1975		baselbns[i] = baselbns[i - 1] + tmpval;
1976	}
1977	nblocks = btodb(fs->fs_bsize);
1978	blocksreleased = 0;
1979	/*
1980	 * Indirect blocks first.
1981	 */
1982	for (level = (NIADDR - 1); level >= 0; level--) {
1983		if ((bn = freeblks->fb_iblks[level]) == 0)
1984			continue;
1985		if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
1986		    baselbns[level], &blocksreleased)) != 0)
1987			allerror = error;
1988		ffs_blkfree(&tip, bn, fs->fs_bsize);
1989		blocksreleased += nblocks;
1990	}
1991	/*
1992	 * All direct blocks or frags.
1993	 */
1994	for (i = (NDADDR - 1); i >= 0; i--) {
1995		if ((bn = freeblks->fb_dblks[i]) == 0)
1996			continue;
1997		bsize = blksize(fs, &tip, i);
1998		ffs_blkfree(&tip, bn, bsize);
1999		blocksreleased += btodb(bsize);
2000	}
2001
2002#ifdef DIAGNOSTIC
2003	if (freeblks->fb_chkcnt != blocksreleased)
2004		panic("handle_workitem_freeblocks: block count");
2005	if (allerror)
2006		softdep_error("handle_workitem_freeblks", allerror);
2007#endif /* DIAGNOSTIC */
2008	WORKITEM_FREE(freeblks, D_FREEBLKS);
2009}
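
/*
 * Editor's worked example (not part of the original source): the
 * baselbns[] values computed above are the first logical block
 * numbers mapped by each level of indirection.  For a hypothetical
 * filesystem with 12 direct pointers and NINDIR == 2048 (8K blocks,
 * 4-byte pointers) the progression is:
 *
 *	baselbns[0] = 12			 (single indirect)
 *	baselbns[1] = 12 + 2048 = 2060		 (double indirect)
 *	baselbns[2] = 2060 + 2048*2048 = 4196364 (triple indirect)
 */
#if 0
static long
example_baselbns(void)
{
	long baselbns[3], tmpval;
	int i;

	tmpval = 1;
	baselbns[0] = 12;			/* NDADDR */
	for (i = 1; i < 3; i++) {		/* NIADDR levels */
		tmpval *= 2048;			/* NINDIR(fs) */
		baselbns[i] = baselbns[i - 1] + tmpval;
	}
	return (baselbns[2]);			/* 4196364 */
}
#endif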
2010
2011/*
2012 * Release blocks associated with the inode ip and stored in the indirect
2013 * block dbn. If level is greater than SINGLE, the block is an indirect block
2014 * block dbn. If level is greater than zero, the block is an indirect block
2015 * and recursive calls to indir_trunc must be used to cleanse other indirect
2016 */
2017static int
2018indir_trunc(ip, dbn, level, lbn, countp)
2019	struct inode *ip;
2020	ufs_daddr_t dbn;
2021	int level;
2022	ufs_lbn_t lbn;
2023	long *countp;
2024{
2025	struct buf *bp;
2026	ufs_daddr_t *bap;
2027	ufs_daddr_t nb;
2028	struct fs *fs;
2029	struct worklist *wk;
2030	struct indirdep *indirdep;
2031	int i, lbnadd, nblocks;
2032	int error, allerror = 0;
2033
2034	fs = ip->i_fs;
2035	lbnadd = 1;
2036	for (i = level; i > 0; i--)
2037		lbnadd *= NINDIR(fs);
2038	/*
2039	 * Get buffer of block pointers to be freed. This routine is not
2040	 * called until the zero'ed inode has been written, so it is safe
2041	 * to free blocks as they are encountered. Because the inode has
2042	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2043	 * have to use the on-disk address and the block device for the
2044	 * filesystem to look them up. If the file was deleted before its
2045	 * indirect blocks were all written to disk, the routine that set
2046	 * us up (deallocate_dependencies) will have arranged to leave
2047	 * a complete copy of the indirect block in memory for our use.
2048	 * Otherwise we have to read the blocks in from the disk.
2049	 */
2050	ACQUIRE_LOCK(&lk);
2051	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2052	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2053		if (wk->wk_type != D_INDIRDEP ||
2054		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2055		    (indirdep->ir_state & GOINGAWAY) == 0)
2056			panic("indir_trunc: lost indirdep");
2057		WORKLIST_REMOVE(wk);
2058		WORKITEM_FREE(indirdep, D_INDIRDEP);
2059		if (LIST_FIRST(&bp->b_dep) != NULL)
2060			panic("indir_trunc: dangling dep");
2061		FREE_LOCK(&lk);
2062	} else {
2063		FREE_LOCK(&lk);
2064		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2065		if (error)
2066			return (error);
2067	}
2068	/*
2069	 * Recursively free indirect blocks.
2070	 */
2071	bap = (ufs_daddr_t *)bp->b_data;
2072	nblocks = btodb(fs->fs_bsize);
2073	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2074		if ((nb = bap[i]) == 0)
2075			continue;
2076		if (level != 0) {
2077			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2078			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2079				allerror = error;
2080		}
2081		ffs_blkfree(ip, nb, fs->fs_bsize);
2082		*countp += nblocks;
2083	}
2084	bp->b_flags |= B_INVAL | B_NOCACHE;
2085	brelse(bp);
2086	return (allerror);
2087}
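
/*
 * Editor's illustrative sketch (not part of the original source): the
 * recursion pattern of indir_trunc reduced to plain C.  An indirect
 * block at level N holds pointers to level N-1 blocks; level 0
 * entries point at data blocks.  Each child is cleansed before its
 * pointer is freed; the caller frees the block it passed in.  All
 * names here are hypothetical stand-ins.
 */
#if 0
#define EX_NINDIR	2048

struct example_disk;			/* opaque block device stand-in */

extern int *example_read_block(struct example_disk *, int blkno);
extern void example_free_block(struct example_disk *, int blkno);

static void
example_indir_trunc(struct example_disk *dsk, int blkno, int level)
{
	int *bap, i;

	bap = example_read_block(dsk, blkno);
	for (i = EX_NINDIR - 1; i >= 0; i--) {
		if (bap[i] == 0)
			continue;
		if (level > 0)
			example_indir_trunc(dsk, bap[i], level - 1);
		example_free_block(dsk, bap[i]);
	}
}
#endif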
2088
2089/*
2090 * Free an allocindir.
2091 * This routine must be called with splbio interrupts blocked.
2092 */
2093static void
2094free_allocindir(aip, inodedep)
2095	struct allocindir *aip;
2096	struct inodedep *inodedep;
2097{
2098	struct freefrag *freefrag;
2099
2100#ifdef DEBUG
2101	if (lk.lkt_held == -1)
2102		panic("free_allocindir: lock not held");
2103#endif
2104	if ((aip->ai_state & DEPCOMPLETE) == 0)
2105		LIST_REMOVE(aip, ai_deps);
2106	if (aip->ai_state & ONWORKLIST)
2107		WORKLIST_REMOVE(&aip->ai_list);
2108	LIST_REMOVE(aip, ai_next);
2109	if ((freefrag = aip->ai_freefrag) != NULL) {
2110		if (inodedep == NULL)
2111			add_to_worklist(&freefrag->ff_list);
2112		else
2113			WORKLIST_INSERT(&inodedep->id_bufwait,
2114			    &freefrag->ff_list);
2115	}
2116	WORKITEM_FREE(aip, D_ALLOCINDIR);
2117}
2118
2119/*
2120 * Directory entry addition dependencies.
2121 *
2122 * When adding a new directory entry, the inode (with its incremented link
2123 * count) must be written to disk before the directory entry's pointer to it.
2124 * Also, if the inode is newly allocated, the corresponding freemap must be
2125 * updated (on disk) before the directory entry's pointer. These requirements
2126 * are met via undo/redo on the directory entry's pointer, which consists
2127 * simply of the inode number.
2128 *
2129 * As directory entries are added and deleted, the free space within a
2130 * directory block can become fragmented.  The ufs file system will compact
2131 * a fragmented directory block to make space for a new entry. When this
2132 * occurs, the offsets of previously added entries change. Any "diradd"
2133 * dependency structures corresponding to these entries must be updated with
2134 * the new offsets.
2135 */
2136
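/*
 * Editor's illustrative sketch (not part of the original source): the
 * undo/redo applied to a directory entry's inode number.  While the
 * directory block must be written before the new inode is safely on
 * disk, the entry is rolled back (to zero for a create, or to the old
 * number for a change) and restored once the inode dependency is
 * satisfied.  Simplified, hypothetical types.
 */
#if 0
struct example_direct {
	int	d_ino;			/* inode number in the entry */
};

static void
example_undo_entry(struct example_direct *ep, int rollback_ino)
{
	/* Before the block is written: hide the uncommitted pointer. */
	ep->d_ino = rollback_ino;	/* 0, or the old inum for DIRCHG */
}

static void
example_redo_entry(struct example_direct *ep, int new_ino)
{
	/*
	 * After the write completes: restore the up-to-date pointer and
	 * (in the real code) redirty the buffer so the true value is
	 * eventually rewritten.
	 */
	ep->d_ino = new_ino;
}
#endif
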
2137/*
2138 * This routine is called after the in-memory inode's link
2139 * count has been incremented, but before the directory entry's
2140 * pointer to the inode has been set.
2141 */
2142void
2143softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2144	struct buf *bp;		/* buffer containing directory block */
2145	struct inode *dp;	/* inode for directory */
2146	off_t diroffset;	/* offset of new entry in directory */
2147	long newinum;		/* inode referenced by new directory entry */
2148	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2149{
2150	int offset;		/* offset of new entry within directory block */
2151	ufs_lbn_t lbn;		/* block in directory containing new entry */
2152	struct fs *fs;
2153	struct diradd *dap;
2154	struct pagedep *pagedep;
2155	struct inodedep *inodedep;
2156	struct mkdir *mkdir1, *mkdir2;
2157
2158	/*
2159	 * Whiteouts have no dependencies.
2160	 */
2161	if (newinum == WINO) {
2162		if (newdirbp != NULL)
2163			bdwrite(newdirbp);
2164		return;
2165	}
2166
2167	fs = dp->i_fs;
2168	lbn = lblkno(fs, diroffset);
2169	offset = blkoff(fs, diroffset);
2170	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
2171	bzero(dap, sizeof(struct diradd));
2172	dap->da_list.wk_type = D_DIRADD;
2173	dap->da_offset = offset;
2174	dap->da_newinum = newinum;
2175	dap->da_state = ATTACHED;
2176	if (newdirbp == NULL) {
2177		dap->da_state |= DEPCOMPLETE;
2178		ACQUIRE_LOCK(&lk);
2179	} else {
2180		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2181		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2182		    M_WAITOK);
2183		mkdir1->md_list.wk_type = D_MKDIR;
2184		mkdir1->md_state = MKDIR_BODY;
2185		mkdir1->md_diradd = dap;
2186		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2187		    M_WAITOK);
2188		mkdir2->md_list.wk_type = D_MKDIR;
2189		mkdir2->md_state = MKDIR_PARENT;
2190		mkdir2->md_diradd = dap;
2191		/*
2192		 * Dependency on "." and ".." being written to disk.
2193		 */
2194		mkdir1->md_buf = newdirbp;
2195		ACQUIRE_LOCK(&lk);
2196		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2197		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2198		FREE_LOCK(&lk);
2199		bdwrite(newdirbp);
2200		/*
2201		 * Dependency on link count increase for parent directory
2202		 */
2203		ACQUIRE_LOCK(&lk);
2204		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2205		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2206			dap->da_state &= ~MKDIR_PARENT;
2207			WORKITEM_FREE(mkdir2, D_MKDIR);
2208		} else {
2209			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2210			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2211		}
2212	}
2213	/*
2214	 * Link into parent directory pagedep to await its being written.
2215	 */
2216	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2217		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2218	dap->da_pagedep = pagedep;
2219	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2220	    da_pdlist);
2221	/*
2222	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2223	 * is not yet written. If it is written, do the post-inode write
2224	 * processing to put it on the id_pendinghd list.
2225	 */
2226	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2227	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2228		diradd_inode_written(dap, inodedep);
2229	else
2230		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2231	FREE_LOCK(&lk);
2232}
2233
2234/*
2235 * This procedure is called to change the offset of a directory
2236 * entry when compacting a directory block, which must be owned
2237 * exclusively by the caller. Note that the actual entry movement
2238 * must be done in this procedure to ensure that no I/O completions
2239 * occur while the move is in progress.
2240 */
2241void
2242softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2243	struct inode *dp;	/* inode for directory */
2244	caddr_t base;		/* address of dp->i_offset */
2245	caddr_t oldloc;		/* address of old directory location */
2246	caddr_t newloc;		/* address of new directory location */
2247	int entrysize;		/* size of directory entry */
2248{
2249	int offset, oldoffset, newoffset;
2250	struct pagedep *pagedep;
2251	struct diradd *dap;
2252	ufs_lbn_t lbn;
2253
2254	ACQUIRE_LOCK(&lk);
2255	lbn = lblkno(dp->i_fs, dp->i_offset);
2256	offset = blkoff(dp->i_fs, dp->i_offset);
2257	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2258		goto done;
2259	oldoffset = offset + (oldloc - base);
2260	newoffset = offset + (newloc - base);
2261	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
2262	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2263		if (dap->da_offset != oldoffset)
2264			continue;
2265		dap->da_offset = newoffset;
2266		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2267			break;
2268		LIST_REMOVE(dap, da_pdlist);
2269		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2270		    dap, da_pdlist);
2271		break;
2272	}
2273	if (dap == NULL) {
2274		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2275		     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2276			if (dap->da_offset == oldoffset) {
2277				dap->da_offset = newoffset;
2278				break;
2279			}
2280		}
2281	}
2282done:
2283	bcopy(oldloc, newloc, entrysize);
2284	FREE_LOCK(&lk);
2285}
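
/*
 * Editor's illustrative sketch (not part of the original source): when
 * compaction slides an entry from oldoffset to newoffset, a pending
 * diradd keyed on the old offset must follow it, possibly into a
 * different hash bucket.  The hash here is a hypothetical stand-in
 * for DIRADDHASH.
 */
#if 0
#define EX_DAHASHSZ		6
#define EX_DIRADDHASH(off)	(((off) >> 2) % EX_DAHASHSZ)

static int
example_move_diradd(int *da_offset, int oldoffset, int newoffset)
{
	if (*da_offset != oldoffset)
		return (0);
	*da_offset = newoffset;
	/*
	 * Nonzero means the real code must also move the diradd with
	 * LIST_REMOVE and LIST_INSERT_HEAD into the new bucket.
	 */
	return (EX_DIRADDHASH(newoffset) != EX_DIRADDHASH(oldoffset));
}
#endif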
2286
2287/*
2288 * Free a diradd dependency structure. This routine must be called
2289 * with splbio interrupts blocked.
2290 */
2291static void
2292free_diradd(dap)
2293	struct diradd *dap;
2294{
2295	struct dirrem *dirrem;
2296	struct pagedep *pagedep;
2297	struct inodedep *inodedep;
2298	struct mkdir *mkdir, *nextmd;
2299
2300#ifdef DEBUG
2301	if (lk.lkt_held == -1)
2302		panic("free_diradd: lock not held");
2303#endif
2304	WORKLIST_REMOVE(&dap->da_list);
2305	LIST_REMOVE(dap, da_pdlist);
2306	if ((dap->da_state & DIRCHG) == 0) {
2307		pagedep = dap->da_pagedep;
2308	} else {
2309		dirrem = dap->da_previous;
2310		pagedep = dirrem->dm_pagedep;
2311		dirrem->dm_dirinum = pagedep->pd_ino;
2312		add_to_worklist(&dirrem->dm_list);
2313	}
2314	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2315	    0, &inodedep) != 0)
2316		(void) free_inodedep(inodedep);
2317	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2318		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2319			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2320			if (mkdir->md_diradd != dap)
2321				continue;
2322			dap->da_state &= ~mkdir->md_state;
2323			WORKLIST_REMOVE(&mkdir->md_list);
2324			LIST_REMOVE(mkdir, md_mkdirs);
2325			WORKITEM_FREE(mkdir, D_MKDIR);
2326		}
2327		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
2328			panic("free_diradd: unfound ref");
2329	}
2330	WORKITEM_FREE(dap, D_DIRADD);
2331}
2332
2333/*
2334 * Directory entry removal dependencies.
2335 *
2336 * When removing a directory entry, the entry's inode pointer must be
2337 * zero'ed on disk before the corresponding inode's link count is decremented
2338 * (possibly freeing the inode for re-use). This dependency is handled by
2339 * updating the directory entry but delaying the inode count reduction until
2340 * after the directory block has been written to disk. After this point, the
2341 * inode count can be decremented whenever it is convenient.
2342 */
2343
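/*
 * Editor's illustrative sketch (not part of the original source): the
 * removal ordering described above, in miniature.  The on-disk entry
 * must read zero before the inode's link count is decremented;
 * otherwise a crash could leave a directory entry naming a freed (or
 * reused) inode.  The write helper is hypothetical and stands in for
 * the deferred, dependency-driven write.
 */
#if 0
struct example_dirent {
	int	d_ino;
};

extern void example_commit_dirblock(struct example_dirent *ep);

static void
example_remove_entry(struct example_dirent *ep, int *nlink)
{
	ep->d_ino = 0;			/* 1: clear the on-disk pointer */
	example_commit_dirblock(ep);	/* 2: get the zero to disk */
	(*nlink)--;			/* 3: now safe to drop the link */
}
#endif
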
2344/*
2345 * This routine should be called immediately after removing
2346 * a directory entry.  The inode's link count should not be
2347 * decremented by the calling procedure -- the soft updates
2348 * code will do this task when it is safe.
2349 */
2350void
2351softdep_setup_remove(bp, dp, ip, isrmdir)
2352	struct buf *bp;		/* buffer containing directory block */
2353	struct inode *dp;	/* inode for the directory being modified */
2354	struct inode *ip;	/* inode for directory entry being removed */
2355	int isrmdir;		/* indicates if doing RMDIR */
2356{
2357	struct dirrem *dirrem, *prevdirrem;
2358
2359	/*
2360	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2361	 */
2362	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2363
2364	/*
2365	 * If the COMPLETE flag is clear, then there were no active
2366	 * entries and we want to roll back to a zeroed entry until
2367	 * the new inode is committed to disk. If the COMPLETE flag is
2368	 * set, then we have deleted an entry that never made it to
2369	 * disk. If the entry we deleted resulted from a name change,
2370	 * then the old name still resides on disk. We cannot delete
2371	 * its inode (returned to us in prevdirrem) until the zeroed
2372	 * directory entry gets to disk. The new inode has never been
2373	 * referenced on the disk, so can be deleted immediately.
2374	 */
2375	if ((dirrem->dm_state & COMPLETE) == 0) {
2376		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2377		    dm_next);
2378		FREE_LOCK(&lk);
2379	} else {
2380		if (prevdirrem != NULL)
2381			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2382			    prevdirrem, dm_next);
2383		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2384		FREE_LOCK(&lk);
2385		handle_workitem_remove(dirrem);
2386	}
2387}
2388
2389/*
2390 * Allocate a new dirrem if appropriate and return it along with
2391 * its associated pagedep. Called without a lock, returns with lock.
2392 */
2393static long num_dirrem;		/* number of dirrem allocated */
2394static struct dirrem *
2395newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2396	struct buf *bp;		/* buffer containing directory block */
2397	struct inode *dp;	/* inode for the directory being modified */
2398	struct inode *ip;	/* inode for directory entry being removed */
2399	int isrmdir;		/* indicates if doing RMDIR */
2400	struct dirrem **prevdirremp; /* previously referenced inode, if any */
2401{
2402	int offset;
2403	ufs_lbn_t lbn;
2404	struct diradd *dap;
2405	struct dirrem *dirrem;
2406	struct pagedep *pagedep;
2407
2408	/*
2409	 * Whiteouts have no deletion dependencies.
2410	 */
2411	if (ip == NULL)
2412		panic("newdirrem: whiteout");
2413	/*
2414	 * If we are over our limit, try to improve the situation.
2415	 * Limiting the number of dirrem structures will also limit
2416	 * the number of freefile and freeblks structures.
2417	 */
2418	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
2419		(void) request_cleanup(FLUSH_REMOVE, 0);
2420	num_dirrem += 1;
2421	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2422		M_DIRREM, M_WAITOK);
2423	bzero(dirrem, sizeof(struct dirrem));
2424	dirrem->dm_list.wk_type = D_DIRREM;
2425	dirrem->dm_state = isrmdir ? RMDIR : 0;
2426	dirrem->dm_mnt = ITOV(ip)->v_mount;
2427	dirrem->dm_oldinum = ip->i_number;
2428	*prevdirremp = NULL;
2429
2430	ACQUIRE_LOCK(&lk);
2431	lbn = lblkno(dp->i_fs, dp->i_offset);
2432	offset = blkoff(dp->i_fs, dp->i_offset);
2433	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2434		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2435	dirrem->dm_pagedep = pagedep;
2436	/*
2437	 * Check for a diradd dependency for the same directory entry.
2438	 * If present, then both dependencies become obsolete and can
2439	 * be de-allocated. Check for an entry on both the pd_diraddhd
2440	 * list and the pd_pendinghd list.
2441	 */
2442	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
2443	     dap; dap = LIST_NEXT(dap, da_pdlist))
2444		if (dap->da_offset == offset)
2445			break;
2446	if (dap == NULL) {
2447		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2448		     dap; dap = LIST_NEXT(dap, da_pdlist))
2449			if (dap->da_offset == offset)
2450				break;
2451		if (dap == NULL)
2452			return (dirrem);
2453	}
2454	/*
2455	 * Must be ATTACHED at this point.
2456	 */
2457	if ((dap->da_state & ATTACHED) == 0)
2458		panic("newdirrem: not ATTACHED");
2459	if (dap->da_newinum != ip->i_number)
2460		panic("newdirrem: inum %d should be %d",
2461		    ip->i_number, dap->da_newinum);
2462	/*
2463	 * If we are deleting a changed name that never made it to disk,
2464	 * then return the dirrem describing the previous inode (which
2465	 * represents the inode currently referenced from this entry on disk).
2466	 */
2467	if ((dap->da_state & DIRCHG) != 0) {
2468		*prevdirremp = dap->da_previous;
2469		dap->da_state &= ~DIRCHG;
2470		dap->da_pagedep = pagedep;
2471	}
2472	/*
2473	 * We are deleting an entry that never made it to disk.
2474	 * Mark it COMPLETE so we can delete its inode immediately.
2475	 */
2476	dirrem->dm_state |= COMPLETE;
2477	free_diradd(dap);
2478	return (dirrem);
2479}
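
/*
 * Editor's illustrative sketch (not part of the original source): the
 * cancellation performed above.  When a remove finds a pending add
 * for the same entry that never reached the disk, both dependencies
 * are obsolete: the add is freed and the remove is marked complete so
 * the inode can be deleted at once.  Hypothetical simplified fields.
 */
#if 0
struct example_dep {
	int	offset;			/* entry offset in the block */
	int	complete;		/* safe to act immediately */
};

static void
example_cancel_pending_add(struct example_dep *add, struct example_dep *rem)
{
	if (add != NULL && add->offset == rem->offset) {
		/*
		 * The entry was never written; the remove may proceed
		 * immediately (the real code calls free_diradd here).
		 */
		rem->complete = 1;
	}
}
#endif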
2480
2481/*
2482 * Directory entry change dependencies.
2483 *
2484 * Changing an existing directory entry requires that an add operation
2485 * be completed first followed by a deletion. The semantics for the addition
2486 * are identical to the description of adding a new entry above except
2487 * that the rollback is to the old inode number rather than zero. Once
2488 * the addition dependency is completed, the removal is done as described
2489 * in the removal routine above.
2490 */
2491
2492/*
2493 * This routine should be called immediately after changing
2494 * a directory entry.  The inode's link count should not be
2495 * decremented by the calling procedure -- the soft updates
2496 * code will perform this task when it is safe.
2497 */
2498void
2499softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2500	struct buf *bp;		/* buffer containing directory block */
2501	struct inode *dp;	/* inode for the directory being modified */
2502	struct inode *ip;	/* inode for directory entry being removed */
2503	long newinum;		/* new inode number for changed entry */
2504	int isrmdir;		/* indicates if doing RMDIR */
2505{
2506	int offset;
2507	struct diradd *dap = NULL;
2508	struct dirrem *dirrem, *prevdirrem;
2509	struct pagedep *pagedep;
2510	struct inodedep *inodedep;
2511
2512	offset = blkoff(dp->i_fs, dp->i_offset);
2513
2514	/*
2515	 * Whiteouts do not need diradd dependencies.
2516	 */
2517	if (newinum != WINO) {
2518		MALLOC(dap, struct diradd *, sizeof(struct diradd),
2519		    M_DIRADD, M_WAITOK);
2520		bzero(dap, sizeof(struct diradd));
2521		dap->da_list.wk_type = D_DIRADD;
2522		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2523		dap->da_offset = offset;
2524		dap->da_newinum = newinum;
2525	}
2526
2527	/*
2528	 * Allocate a new dirrem and ACQUIRE_LOCK.
2529	 */
2530	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2531	pagedep = dirrem->dm_pagedep;
2532	/*
2533	 * The possible values for isrmdir:
2534	 *	0 - non-directory file rename
2535	 *	1 - directory rename within same directory
2536	 *   inum - directory rename to new directory of given inode number
2537	 * When renaming to a new directory, we are both deleting and
2538	 * creating a new directory entry, so the link count on the new
2539	 * directory should not change. Thus we do not need the followup
2540	 * dirrem which is usually done in handle_workitem_remove. We set
2541	 * the DIRCHG flag to tell handle_workitem_remove to skip the
2542	 * followup dirrem.
2543	 */
2544	if (isrmdir > 1)
2545		dirrem->dm_state |= DIRCHG;
2546
2547	/*
2548	 * Whiteouts have no additional dependencies,
2549	 * so just put the dirrem on the correct list.
2550	 */
2551	if (newinum == WINO) {
2552		if ((dirrem->dm_state & COMPLETE) == 0) {
2553			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2554			    dm_next);
2555		} else {
2556			dirrem->dm_dirinum = pagedep->pd_ino;
2557			add_to_worklist(&dirrem->dm_list);
2558		}
2559		FREE_LOCK(&lk);
2560		return;
2561	}
2562
2563	/*
2564	 * If the COMPLETE flag is clear, then there were no active
2565	 * entries and we want to roll back to the previous inode until
2566	 * the new inode is committed to disk. If the COMPLETE flag is
2567	 * set, then we have deleted an entry that never made it to disk.
2568	 * If the entry we deleted resulted from a name change, then the old
2569	 * inode reference still resides on disk. Any rollback that we do
2570	 * needs to be to that old inode (returned to us in prevdirrem). If
2571	 * the entry we deleted resulted from a create, then there is
2572	 * no entry on the disk, so we want to roll back to zero rather
2573	 * than the uncommitted inode. In either of the COMPLETE cases we
2574	 * want to immediately free the unwritten and unreferenced inode.
2575	 */
2576	if ((dirrem->dm_state & COMPLETE) == 0) {
2577		dap->da_previous = dirrem;
2578	} else {
2579		if (prevdirrem != NULL) {
2580			dap->da_previous = prevdirrem;
2581		} else {
2582			dap->da_state &= ~DIRCHG;
2583			dap->da_pagedep = pagedep;
2584		}
2585		dirrem->dm_dirinum = pagedep->pd_ino;
2586		add_to_worklist(&dirrem->dm_list);
2587	}
2588	/*
2589	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2590	 * is not yet written. If it is written, do the post-inode write
2591	 * processing to put it on the id_pendinghd list.
2592	 */
2593	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2594	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2595		dap->da_state |= COMPLETE;
2596		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2597		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2598	} else {
2599		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2600		    dap, da_pdlist);
2601		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2602	}
2603	FREE_LOCK(&lk);
2604}
2605
2606/*
2607 * Called whenever the link count on an inode is changed.
2608 * It creates an inode dependency so that the new reference(s)
2609 * to the inode cannot be committed to disk until the updated
2610 * inode has been written.
2611 */
2612void
2613softdep_change_linkcnt(ip)
2614	struct inode *ip;	/* the inode with the increased link count */
2615{
2616	struct inodedep *inodedep;
2617
2618	ACQUIRE_LOCK(&lk);
2619	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2620	if (ip->i_nlink < ip->i_effnlink)
2621		panic("softdep_change_linkcnt: bad delta");
2622	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2623	FREE_LOCK(&lk);
2624}
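
/*
 * Editor's worked example (not part of the original source): the
 * bookkeeping above.  i_effnlink is the in-memory (effective) link
 * count; i_nlink is the count most recently scheduled for disk and
 * may lag behind while dependencies settle.  id_nlinkdelta records
 * the gap, e.g. after one unlink whose directory block has not yet
 * been written: i_nlink = 2, i_effnlink = 1, delta = 1.
 */
#if 0
static int
example_nlinkdelta(int i_nlink, int i_effnlink)
{
	/* The panic above fires if this would go negative. */
	return (i_nlink - i_effnlink);
}
#endif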
2625
2626/*
2627 * This workitem decrements the inode's link count.
2628 * If the link count reaches zero, the file is removed.
2629 */
2630static void
2631handle_workitem_remove(dirrem)
2632	struct dirrem *dirrem;
2633{
2634	struct proc *p = CURPROC;	/* XXX */
2635	struct inodedep *inodedep;
2636	struct vnode *vp;
2637	struct inode *ip;
2638	ino_t oldinum;
2639	int error;
2640
2641	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2642		softdep_error("handle_workitem_remove: vget", error);
2643		return;
2644	}
2645	ip = VTOI(vp);
2646	ACQUIRE_LOCK(&lk);
2647	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
2648		panic("handle_workitem_remove: lost inodedep");
2649	/*
2650	 * Normal file deletion.
2651	 */
2652	if ((dirrem->dm_state & RMDIR) == 0) {
2653		ip->i_nlink--;
2654		ip->i_flag |= IN_CHANGE;
2655		if (ip->i_nlink < ip->i_effnlink)
2656			panic("handle_workitem_remove: bad file delta");
2657		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2658		FREE_LOCK(&lk);
2659		vput(vp);
2660		num_dirrem -= 1;
2661		WORKITEM_FREE(dirrem, D_DIRREM);
2662		return;
2663	}
2664	/*
2665	 * Directory deletion. Decrement reference count for both the
2666	 * just deleted parent directory entry and the reference for ".".
2667	 * Next truncate the directory to length zero. When the
2668	 * truncation completes, arrange to have the reference count on
2669	 * the parent decremented to account for the loss of "..".
2670	 */
2671	ip->i_nlink -= 2;
2672	ip->i_flag |= IN_CHANGE;
2673	if (ip->i_nlink < ip->i_effnlink)
2674		panic("handle_workitem_remove: bad dir delta");
2675	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2676	FREE_LOCK(&lk);
2677	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2678		softdep_error("handle_workitem_remove: truncate", error);
2679	/*
2680	 * Rename a directory to a new parent. Since we are both deleting
2681	 * and creating a new directory entry, the link count on the new
2682	 * directory should not change. Thus we skip the followup dirrem.
2683	 */
2684	if (dirrem->dm_state & DIRCHG) {
2685		vput(vp);
2686		num_dirrem -= 1;
2687		WORKITEM_FREE(dirrem, D_DIRREM);
2688		return;
2689	}
2690	/*
2691	 * If the inodedep does not exist, then the zero'ed inode has
2692	 * been written to disk. If the allocated inode has never been
2693	 * written to disk, then the on-disk inode is zero'ed. In either
2694	 * case we can remove the file immediately.
2695	 */
2696	ACQUIRE_LOCK(&lk);
2697	dirrem->dm_state = 0;
2698	oldinum = dirrem->dm_oldinum;
2699	dirrem->dm_oldinum = dirrem->dm_dirinum;
2700	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
2701	    check_inode_unwritten(inodedep)) {
2702		FREE_LOCK(&lk);
2703		vput(vp);
2704		handle_workitem_remove(dirrem);
2705		return;
2706	}
2707	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2708	FREE_LOCK(&lk);
2709	vput(vp);
2710}
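
/*
 * Editor's worked example (not part of the original source): the link
 * accounting for rmdir done above.  Two of the victim's links (the
 * parent's entry and the victim's own ".") are dropped together; the
 * parent's link from the victim's ".." is dropped only after the
 * truncated victim reaches disk.  Hypothetical tallies:
 */
#if 0
static int
example_rmdir_links(void)
{
	int victim_nlink = 2;	/* "." plus the parent's entry */
	int parent_nlink = 3;	/* ".", its own parent entry, victim ".." */

	victim_nlink -= 2;	/* entry and "." dropped together */
	/* ... victim truncated to length zero and written ... */
	parent_nlink -= 1;	/* deferred loss of the victim's ".." */
	return (victim_nlink + parent_nlink);	/* 0 + 2 */
}
#endif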
2711
2712/*
2713 * Inode de-allocation dependencies.
2714 *
2715 * When an inode's link count is reduced to zero, it can be de-allocated. We
2716 * found it convenient to postpone de-allocation until after the inode is
2717 * written to disk with its new link count (zero).  At this point, all of the
2718 * on-disk inode's block pointers are nullified and, with careful dependency
2719 * list ordering, all dependencies related to the inode will be satisfied and
2720 * the corresponding dependency structures de-allocated.  So, if/when the
2721 * inode is reused, there will be no mixing of old dependencies with new
2722 * ones.  This artificial dependency is set up by the block de-allocation
2723 * procedure above (softdep_setup_freeblocks) and completed by the
2724 * following procedure.
2725 */
2726static void
2727handle_workitem_freefile(freefile)
2728	struct freefile *freefile;
2729{
2730	struct vnode vp;
2731	struct inode tip;
2732	struct inodedep *idp;
2733	int error;
2734
2735#ifdef DEBUG
2736	ACQUIRE_LOCK(&lk);
2737	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
2738		panic("handle_workitem_freefile: inodedep survived");
2739	FREE_LOCK(&lk);
2740#endif
2741	tip.i_devvp = freefile->fx_devvp;
2742	tip.i_dev = freefile->fx_devvp->v_rdev;
2743	tip.i_fs = freefile->fx_fs;
2744	vp.v_data = &tip;
2745	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2746		softdep_error("handle_workitem_freefile", error);
2747	WORKITEM_FREE(freefile, D_FREEFILE);
2748}
2749
2750/*
2751 * Disk writes.
2752 *
2753 * The dependency structures constructed above are most actively used when file
2754 * system blocks are written to disk.  No constraints are placed on when a
2755 * block can be written, but unsatisfied update dependencies are made safe by
2756 * modifying (or replacing) the source memory for the duration of the disk
2757 * write.  When the disk write completes, the memory block is again brought
2758 * up-to-date.
2759 *
2760 * In-core inode structure reclamation.
2761 *
2762 * Because there are a finite number of "in-core" inode structures, they are
2763 * reused regularly.  By transferring all inode-related dependencies to the
2764 * in-memory inode block and indexing them separately (via "inodedep"s), we
2765 * can allow "in-core" inode structures to be reused at any time and avoid
2766 * any increase in contention.
2767 *
2768 * Called just before entering the device driver to initiate a new disk I/O.
2769 * The buffer must be locked, thus, no I/O completion operations can occur
2770 * while we are manipulating its associated dependencies.
2771 */
2772static void
2773softdep_disk_io_initiation(bp)
2774	struct buf *bp;		/* structure describing disk write to occur */
2775{
2776	struct worklist *wk, *nextwk;
2777	struct indirdep *indirdep;
2778
2779	/*
2780	 * We only care about write operations. There should never
2781	 * be dependencies for reads.
2782	 */
2783	if (bp->b_iocmd == BIO_READ)
2784		panic("softdep_disk_io_initiation: read");
2785	/*
2786	 * Do any necessary pre-I/O processing.
2787	 */
2788	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2789		nextwk = LIST_NEXT(wk, wk_list);
2790		switch (wk->wk_type) {
2791
2792		case D_PAGEDEP:
2793			initiate_write_filepage(WK_PAGEDEP(wk), bp);
2794			continue;
2795
2796		case D_INODEDEP:
2797			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2798			continue;
2799
2800		case D_INDIRDEP:
2801			indirdep = WK_INDIRDEP(wk);
2802			if (indirdep->ir_state & GOINGAWAY)
2803				panic("disk_io_initiation: indirdep gone");
2804			/*
2805			 * If there are no remaining dependencies, this
2806			 * will be writing the real pointers, so the
2807			 * dependency can be freed.
2808			 */
2809			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2810				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2811				brelse(indirdep->ir_savebp);
2812				/* inline expand WORKLIST_REMOVE(wk); */
2813				wk->wk_state &= ~ONWORKLIST;
2814				LIST_REMOVE(wk, wk_list);
2815				WORKITEM_FREE(indirdep, D_INDIRDEP);
2816				continue;
2817			}
2818			/*
2819			 * Replace up-to-date version with safe version.
2820			 */
2821			ACQUIRE_LOCK(&lk);
2822			indirdep->ir_state &= ~ATTACHED;
2823			indirdep->ir_state |= UNDONE;
2824			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2825			    M_INDIRDEP, M_WAITOK);
2826			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2827			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2828			    bp->b_bcount);
2829			FREE_LOCK(&lk);
2830			continue;
2831
2832		case D_MKDIR:
2833		case D_BMSAFEMAP:
2834		case D_ALLOCDIRECT:
2835		case D_ALLOCINDIR:
2836			continue;
2837
2838		default:
2839			panic("handle_disk_io_initiation: Unexpected type %s",
2840			    TYPENAME(wk->wk_type));
2841			/* NOTREACHED */
2842		}
2843	}
2844}
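
/*
 * Editor's illustrative sketch (not part of the original source): the
 * substitution performed for D_INDIRDEP above.  While the write is in
 * flight the buffer carries the safe (old) pointers; the up-to-date
 * copy is parked aside and copied back by the completion handler,
 * which then redirties the buffer.  memcpy stands in for bcopy and
 * all names are hypothetical.
 */
#if 0
#include <stdlib.h>
#include <string.h>

struct example_iobuf {
	char	*data;			/* contents handed to the driver */
	char	*safe;			/* old, dependency-free pointers */
	char	*saved;			/* up-to-date copy, restored later */
	long	bcount;
};

static void
example_io_start(struct example_iobuf *bp)
{
	bp->saved = malloc((size_t)bp->bcount);
	memcpy(bp->saved, bp->data, (size_t)bp->bcount); /* save new */
	memcpy(bp->data, bp->safe, (size_t)bp->bcount);	 /* write old */
}

static void
example_io_done(struct example_iobuf *bp)
{
	memcpy(bp->data, bp->saved, (size_t)bp->bcount); /* restore new */
	free(bp->saved);
	bp->saved = NULL;
	/* The real code now calls bdirty(bp) so the new pointers are
	   eventually written as well. */
}
#endif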
2845
2846/*
2847 * Called from within the procedure above to deal with unsatisfied
2848 * allocation dependencies in a directory. The buffer must be locked,
2849 * thus, no I/O completion operations can occur while we are
2850 * manipulating its associated dependencies.
2851 */
2852static void
2853initiate_write_filepage(pagedep, bp)
2854	struct pagedep *pagedep;
2855	struct buf *bp;
2856{
2857	struct diradd *dap;
2858	struct direct *ep;
2859	int i;
2860
2861	if (pagedep->pd_state & IOSTARTED) {
2862		/*
2863		 * This can only happen if there is a driver that does not
2864		 * understand chaining. Here biodone will reissue the call
2865		 * to strategy for the incomplete buffers.
2866		 */
2867		printf("initiate_write_filepage: already started\n");
2868		return;
2869	}
2870	pagedep->pd_state |= IOSTARTED;
2871	ACQUIRE_LOCK(&lk);
2872	for (i = 0; i < DAHASHSZ; i++) {
2873		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
2874		     dap = LIST_NEXT(dap, da_pdlist)) {
2875			ep = (struct direct *)
2876			    ((char *)bp->b_data + dap->da_offset);
2877			if (ep->d_ino != dap->da_newinum)
2878				panic("%s: dir inum %d != new %d",
2879				    "initiate_write_filepage",
2880				    ep->d_ino, dap->da_newinum);
2881			if (dap->da_state & DIRCHG)
2882				ep->d_ino = dap->da_previous->dm_oldinum;
2883			else
2884				ep->d_ino = 0;
2885			dap->da_state &= ~ATTACHED;
2886			dap->da_state |= UNDONE;
2887		}
2888	}
2889	FREE_LOCK(&lk);
2890}
2891
2892/*
2893 * Called from within the procedure above to deal with unsatisfied
2894 * allocation dependencies in an inodeblock. The buffer must be
2895 * locked, thus, no I/O completion operations can occur while we
2896 * are manipulating its associated dependencies.
2897 */
2898static void
2899initiate_write_inodeblock(inodedep, bp)
2900	struct inodedep *inodedep;
2901	struct buf *bp;			/* The inode block */
2902{
2903	struct allocdirect *adp, *lastadp;
2904	struct dinode *dp;
2905	struct fs *fs;
2906	ufs_lbn_t prevlbn = 0;
2907	int i, deplist;
2908
2909	if (inodedep->id_state & IOSTARTED)
2910		panic("initiate_write_inodeblock: already started");
2911	inodedep->id_state |= IOSTARTED;
2912	fs = inodedep->id_fs;
2913	dp = (struct dinode *)bp->b_data +
2914	    ino_to_fsbo(fs, inodedep->id_ino);
2915	/*
2916	 * If the bitmap is not yet written, then the allocated
2917	 * inode cannot be written to disk.
2918	 */
2919	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2920		if (inodedep->id_savedino != NULL)
2921			panic("initiate_write_inodeblock: already doing I/O");
2922		MALLOC(inodedep->id_savedino, struct dinode *,
2923		    sizeof(struct dinode), M_INODEDEP, M_WAITOK);
2924		*inodedep->id_savedino = *dp;
2925		bzero((caddr_t)dp, sizeof(struct dinode));
2926		return;
2927	}
2928	/*
2929	 * If no dependencies, then there is nothing to roll back.
2930	 */
2931	inodedep->id_savedsize = dp->di_size;
2932	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
2933		return;
2934	/*
2935	 * Set the dependencies to busy.
2936	 */
2937	ACQUIRE_LOCK(&lk);
2938	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2939	     adp = TAILQ_NEXT(adp, ad_next)) {
2940#ifdef DIAGNOSTIC
2941		if (deplist != 0 && prevlbn >= adp->ad_lbn)
2942			panic("softdep_write_inodeblock: lbn order");
2943		prevlbn = adp->ad_lbn;
2944		if (adp->ad_lbn < NDADDR &&
2945		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
2946			panic("%s: direct pointer #%ld mismatch %d != %d",
2947			    "softdep_write_inodeblock", adp->ad_lbn,
2948			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
2949		if (adp->ad_lbn >= NDADDR &&
2950		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
2951			panic("%s: indirect pointer #%ld mismatch %d != %d",
2952			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
2953			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
2954		deplist |= 1 << adp->ad_lbn;
2955		if ((adp->ad_state & ATTACHED) == 0)
2956			panic("softdep_write_inodeblock: Unknown state 0x%x",
2957			    adp->ad_state);
2958#endif /* DIAGNOSTIC */
2959		adp->ad_state &= ~ATTACHED;
2960		adp->ad_state |= UNDONE;
2961	}
2962	/*
2963	 * The on-disk inode cannot claim to be any larger than the last
2964	 * fragment that has been written. Otherwise, the on-disk inode
2965	 * might have fragments that were not the last block in the file,
2966	 * which would corrupt the filesystem.
2967	 */
2968	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2969	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
2970		if (adp->ad_lbn >= NDADDR)
2971			break;
2972		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
2973		/* keep going until hitting a rollback to a frag */
2974		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
2975			continue;
2976		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
2977		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
2978#ifdef DIAGNOSTIC
2979			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
2980				panic("softdep_write_inodeblock: lost dep1");
2981#endif /* DIAGNOSTIC */
2982			dp->di_db[i] = 0;
2983		}
2984		for (i = 0; i < NIADDR; i++) {
2985#ifdef DIAGNOSTIC
2986			if (dp->di_ib[i] != 0 &&
2987			    (deplist & ((1 << NDADDR) << i)) == 0)
2988				panic("softdep_write_inodeblock: lost dep2");
2989#endif /* DIAGNOSTIC */
2990			dp->di_ib[i] = 0;
2991		}
2992		FREE_LOCK(&lk);
2993		return;
2994	}
2995	/*
2996	 * If we have zero'ed out the last allocated block of the file,
2997	 * roll back the size to the last currently allocated block.
2998	 * We know that this last allocated block is full-sized, as
2999	 * we already checked for fragments in the loop above.
3000	 */
3001	if (lastadp != NULL &&
3002	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3003		for (i = lastadp->ad_lbn; i >= 0; i--)
3004			if (dp->di_db[i] != 0)
3005				break;
3006		dp->di_size = (i + 1) * fs->fs_bsize;
3007	}
3008	/*
3009	 * The only dependencies are for indirect blocks.
3010	 *
3011	 * The file size for indirect block additions is not guaranteed.
3012	 * Such a guarantee would be non-trivial to achieve. The conventional
3013	 * synchronous write implementation also does not make this guarantee.
3014	 * Fsck should catch and fix discrepancies. Arguably, the file size
3015	 * can be over-estimated without destroying integrity when the file
3016	 * moves into the indirect blocks (i.e., is large). If we want to
3017	 * postpone fsck, we are stuck with this argument.
3018	 */
3019	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3020		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3021	FREE_LOCK(&lk);
3022}
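
/*
 * Editor's worked example (not part of the original source): the size
 * rollback above.  If the last block the on-disk inode may safely
 * claim is lbn 4 and the blocks are a hypothetical 8K, a pending size
 * of 44000 is clipped to (4 + 1) * 8192 = 40960 for the duration of
 * the write.
 */
#if 0
static long
example_rollback_size(long cur_size, long last_safe_lbn, long bsize)
{
	long safe_size = (last_safe_lbn + 1) * bsize;

	return (cur_size > safe_size ? safe_size : cur_size);
}
#endif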
3023
3024/*
3025 * This routine is called during the completion interrupt
3026 * service routine for a disk write (from the procedure called
3027 * by the device driver to inform the file system caches of
3028 * a request completion).  It should be called early in this
3029 * procedure, before the block is made available to other
3030 * processes or other routines are called.
3031 */
3032static void
3033softdep_disk_write_complete(bp)
3034	struct buf *bp;		/* describes the completed disk write */
3035{
3036	struct worklist *wk;
3037	struct workhead reattach;
3038	struct newblk *newblk;
3039	struct allocindir *aip;
3040	struct allocdirect *adp;
3041	struct indirdep *indirdep;
3042	struct inodedep *inodedep;
3043	struct bmsafemap *bmsafemap;
3044
3045#ifdef DEBUG
3046	if (lk.lkt_held != -1)
3047		panic("softdep_disk_write_complete: lock is held");
3048	lk.lkt_held = -2;
3049#endif
3050	LIST_INIT(&reattach);
3051	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3052		WORKLIST_REMOVE(wk);
3053		switch (wk->wk_type) {
3054
3055		case D_PAGEDEP:
3056			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3057				WORKLIST_INSERT(&reattach, wk);
3058			continue;
3059
3060		case D_INODEDEP:
3061			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3062				WORKLIST_INSERT(&reattach, wk);
3063			continue;
3064
3065		case D_BMSAFEMAP:
3066			bmsafemap = WK_BMSAFEMAP(wk);
3067			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3068				newblk->nb_state |= DEPCOMPLETE;
3069				newblk->nb_bmsafemap = NULL;
3070				LIST_REMOVE(newblk, nb_deps);
3071			}
3072			while ((adp =
3073			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3074				adp->ad_state |= DEPCOMPLETE;
3075				adp->ad_buf = NULL;
3076				LIST_REMOVE(adp, ad_deps);
3077				handle_allocdirect_partdone(adp);
3078			}
3079			while ((aip =
3080			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3081				aip->ai_state |= DEPCOMPLETE;
3082				aip->ai_buf = NULL;
3083				LIST_REMOVE(aip, ai_deps);
3084				handle_allocindir_partdone(aip);
3085			}
3086			while ((inodedep =
3087			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3088				inodedep->id_state |= DEPCOMPLETE;
3089				LIST_REMOVE(inodedep, id_deps);
3090				inodedep->id_buf = NULL;
3091			}
3092			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3093			continue;
3094
3095		case D_MKDIR:
3096			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3097			continue;
3098
3099		case D_ALLOCDIRECT:
3100			adp = WK_ALLOCDIRECT(wk);
3101			adp->ad_state |= COMPLETE;
3102			handle_allocdirect_partdone(adp);
3103			continue;
3104
3105		case D_ALLOCINDIR:
3106			aip = WK_ALLOCINDIR(wk);
3107			aip->ai_state |= COMPLETE;
3108			handle_allocindir_partdone(aip);
3109			continue;
3110
3111		case D_INDIRDEP:
3112			indirdep = WK_INDIRDEP(wk);
3113			if (indirdep->ir_state & GOINGAWAY)
3114				panic("disk_write_complete: indirdep gone");
3115			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3116			FREE(indirdep->ir_saveddata, M_INDIRDEP);
3117			indirdep->ir_saveddata = 0;
3118			indirdep->ir_state &= ~UNDONE;
3119			indirdep->ir_state |= ATTACHED;
3120			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3121				handle_allocindir_partdone(aip);
3122				if (aip == LIST_FIRST(&indirdep->ir_donehd))
3123					panic("disk_write_complete: not gone");
3124			}
3125			WORKLIST_INSERT(&reattach, wk);
3126			if ((bp->b_flags & B_DELWRI) == 0)
3127				stat_indir_blk_ptrs++;
3128			bdirty(bp);
3129			continue;
3130
3131		default:
3132			panic("handle_disk_write_complete: Unknown type %s",
3133			    TYPENAME(wk->wk_type));
3134			/* NOTREACHED */
3135		}
3136	}
3137	/*
3138	 * Reattach any requests that must be redone.
3139	 */
3140	while ((wk = LIST_FIRST(&reattach)) != NULL) {
3141		WORKLIST_REMOVE(wk);
3142		WORKLIST_INSERT(&bp->b_dep, wk);
3143	}
3144#ifdef DEBUG
3145	if (lk.lkt_held != -2)
3146		panic("softdep_disk_write_complete: lock lost");
3147	lk.lkt_held = -1;
3148#endif
3149}
3150
3151/*
3152 * Called from within softdep_disk_write_complete above. Note that
3153 * this routine is always called from interrupt level with further
3154 * splbio interrupts blocked.
3155 */
3156static void
3157handle_allocdirect_partdone(adp)
3158	struct allocdirect *adp;	/* the completed allocdirect */
3159{
3160	struct allocdirect *listadp;
3161	struct inodedep *inodedep;
3162	long bsize;
3163
3164	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3165		return;
3166	if (adp->ad_buf != NULL)
3167		panic("handle_allocdirect_partdone: dangling dep");
3168	/*
3169	 * The on-disk inode cannot claim to be any larger than the last
3170	 * fragment that has been written. Otherwise, the on-disk inode
3171	 * might have fragments that were not the last block in the file,
3172	 * which would corrupt the filesystem. Thus, we cannot free any
3173	 * allocdirects after one whose ad_oldblkno claims a fragment as
3174	 * these blocks must be rolled back to zero before writing the inode.
3175	 * We check the currently active set of allocdirects in id_inoupdt.
3176	 */
3177	inodedep = adp->ad_inodedep;
3178	bsize = inodedep->id_fs->fs_bsize;
3179	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp;
3180	     listadp = TAILQ_NEXT(listadp, ad_next)) {
3181		/* found our block */
3182		if (listadp == adp)
3183			break;
3184		/* continue if ad_oldblkno is not a fragment */
3185		if (listadp->ad_oldsize == 0 ||
3186		    listadp->ad_oldsize == bsize)
3187			continue;
3188		/* hit a fragment */
3189		return;
3190	}
3191	/*
3192	 * If we have reached the end of the current list without
3193	 * finding the just finished dependency, then it must be
3194	 * on the future dependency list. Future dependencies cannot
3195	 * be freed until they are moved to the current list.
3196	 */
3197	if (listadp == NULL) {
3198#ifdef DEBUG
3199		for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp;
3200		     listadp = TAILQ_NEXT(listadp, ad_next))
3201			/* found our block */
3202			if (listadp == adp)
3203				break;
3204		if (listadp == NULL)
3205			panic("handle_allocdirect_partdone: lost dep");
3206#endif /* DEBUG */
3207		return;
3208	}
3209	/*
3210	 * If we have found the just finished dependency, then free
3211	 * it along with anything that follows it that is complete.
3212	 */
3213	for (; adp; adp = listadp) {
3214		listadp = TAILQ_NEXT(adp, ad_next);
3215		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3216			return;
3217		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3218	}
3219}
3220
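/*
 * Illustrative sketch (editorial addition, not part of this revision): a
 * minimal userland model of the scan above. It walks an ordered list of
 * hypothetical allocation records and stops at the first entry whose old
 * size is a partial fragment, since freeing anything beyond that point
 * could let the on-disk inode claim a fragment that is not the last block
 * of the file. All names here (alloc_rec, old_size, BSIZE) are invented
 * for illustration; the block is compiled out with #if 0.
 */
#if 0
#include <stdio.h>

#define BSIZE	8192			/* assumed full block size */

struct alloc_rec {
	long	old_size;		/* size the block had on disk */
	int	complete;		/* analogous to ALLCOMPLETE */
};

/* Count the leading records that may be freed without exposing a fragment. */
static int
freeable_prefix(const struct alloc_rec *list, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (!list[i].complete)
			break;		/* cannot free past an incomplete one */
		if (list[i].old_size != 0 && list[i].old_size != BSIZE)
			break;		/* cannot free past a fragment */
	}
	return (i);			/* records [0, i) are safe to free */
}

int
main(void)
{
	struct alloc_rec l[] = {
		{ 0, 1 }, { BSIZE, 1 }, { 4096, 1 }, { BSIZE, 1 },
	};

	printf("%d records freeable\n", freeable_prefix(l, 4));	/* 2 */
	return (0);
}
#endif
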
3221/*
3222 * Called from within softdep_disk_write_complete above. Note that
3223 * this routine is always called from interrupt level with further
3224 * splbio interrupts blocked.
3225 */
3226static void
3227handle_allocindir_partdone(aip)
3228	struct allocindir *aip;		/* the completed allocindir */
3229{
3230	struct indirdep *indirdep;
3231
3232	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3233		return;
3234	if (aip->ai_buf != NULL)
3235		panic("handle_allocindir_partdone: dangling dependency");
3236	indirdep = aip->ai_indirdep;
3237	if (indirdep->ir_state & UNDONE) {
3238		LIST_REMOVE(aip, ai_next);
3239		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3240		return;
3241	}
3242	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3243	    aip->ai_newblkno;
3244	LIST_REMOVE(aip, ai_next);
3245	if (aip->ai_freefrag != NULL)
3246		add_to_worklist(&aip->ai_freefrag->ff_list);
3247	WORKITEM_FREE(aip, D_ALLOCINDIR);
3248}
3249
3250/*
3251 * Called from within softdep_disk_write_complete above to restore
3252 * in-memory inode block contents to their most up-to-date state. Note
3253 * that this routine is always called from interrupt level with further
3254 * splbio interrupts blocked.
3255 */
3256static int
3257handle_written_inodeblock(inodedep, bp)
3258	struct inodedep *inodedep;
3259	struct buf *bp;		/* buffer containing the inode block */
3260{
3261	struct worklist *wk, *filefree;
3262	struct allocdirect *adp, *nextadp;
3263	struct dinode *dp;
3264	int hadchanges;
3265
3266	if ((inodedep->id_state & IOSTARTED) == 0)
3267		panic("handle_written_inodeblock: not started");
3268	inodedep->id_state &= ~IOSTARTED;
3269	inodedep->id_state |= COMPLETE;
3270	dp = (struct dinode *)bp->b_data +
3271	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3272	/*
3273	 * If we had to rollback the inode allocation because of
3274	 * bitmaps being incomplete, then simply restore it.
3275	 * Keep the block dirty so that it will not be reclaimed until
3276	 * all associated dependencies have been cleared and the
3277	 * corresponding updates written to disk.
3278	 */
3279	if (inodedep->id_savedino != NULL) {
3280		*dp = *inodedep->id_savedino;
3281		FREE(inodedep->id_savedino, M_INODEDEP);
3282		inodedep->id_savedino = NULL;
3283		if ((bp->b_flags & B_DELWRI) == 0)
3284			stat_inode_bitmap++;
3285		bdirty(bp);
3286		return (1);
3287	}
3288	/*
3289	 * Roll forward anything that had to be rolled back before
3290	 * the inode could be updated.
3291	 */
3292	hadchanges = 0;
3293	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3294		nextadp = TAILQ_NEXT(adp, ad_next);
3295		if (adp->ad_state & ATTACHED)
3296			panic("handle_written_inodeblock: new entry");
3297		if (adp->ad_lbn < NDADDR) {
3298			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
3299				panic("%s: %s #%ld mismatch %d != %d",
3300				    "handle_written_inodeblock",
3301				    "direct pointer", adp->ad_lbn,
3302				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3303			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3304		} else {
3305			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
3306				panic("%s: %s #%ld allocated as %d",
3307				    "handle_written_inodeblock",
3308				    "indirect pointer", adp->ad_lbn - NDADDR,
3309				    dp->di_ib[adp->ad_lbn - NDADDR]);
3310			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3311		}
3312		adp->ad_state &= ~UNDONE;
3313		adp->ad_state |= ATTACHED;
3314		hadchanges = 1;
3315	}
3316	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3317		stat_direct_blk_ptrs++;
3318	/*
3319	 * Reset the file size to its most up-to-date value.
3320	 */
3321	if (inodedep->id_savedsize == -1)
3322		panic("handle_written_inodeblock: bad size");
3323	if (dp->di_size != inodedep->id_savedsize) {
3324		dp->di_size = inodedep->id_savedsize;
3325		hadchanges = 1;
3326	}
3327	inodedep->id_savedsize = -1;
3328	/*
3329	 * If there were any rollbacks in the inode block, then it must be
3330	 * marked dirty so that it will eventually get written back in
3331	 * its correct form.
3332	 */
3333	if (hadchanges)
3334		bdirty(bp);
3335	/*
3336	 * Process any allocdirects that completed during the update.
3337	 */
3338	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3339		handle_allocdirect_partdone(adp);
3340	/*
3341	 * Process deallocations that were held pending until the
3342	 * inode had been written to disk. Freeing of the inode
3343	 * is delayed until after all blocks have been freed to
3344	 * avoid creation of new <vfsid, inum, lbn> triples
3345	 * before the old ones have been deleted.
3346	 */
3347	filefree = NULL;
3348	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3349		WORKLIST_REMOVE(wk);
3350		switch (wk->wk_type) {
3351
3352		case D_FREEFILE:
3353			/*
3354			 * We defer adding filefree to the worklist until
3355			 * all other additions have been made to ensure
3356			 * that it will be done after all the old blocks
3357			 * have been freed.
3358			 */
3359			if (filefree != NULL)
3360				panic("handle_written_inodeblock: filefree");
3361			filefree = wk;
3362			continue;
3363
3364		case D_MKDIR:
3365			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3366			continue;
3367
3368		case D_DIRADD:
3369			diradd_inode_written(WK_DIRADD(wk), inodedep);
3370			continue;
3371
3372		case D_FREEBLKS:
3373		case D_FREEFRAG:
3374		case D_DIRREM:
3375			add_to_worklist(wk);
3376			continue;
3377
3378		default:
3379			panic("handle_written_inodeblock: Unknown type %s",
3380			    TYPENAME(wk->wk_type));
3381			/* NOTREACHED */
3382		}
3383	}
3384	if (filefree != NULL) {
3385		if (free_inodedep(inodedep) == 0)
3386			panic("handle_written_inodeblock: live inodedep");
3387		add_to_worklist(filefree);
3388		return (0);
3389	}
3390
3391	/*
3392	 * If no outstanding dependencies, free it.
3393	 */
3394	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3395		return (0);
3396	return (hadchanges);
3397}
3398
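/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * roll-forward step above in miniature. Before the inode block went to
 * disk, unsafe block pointers were rolled back to their old values; once
 * the write completes, they are rolled forward again and the buffer is
 * redirtied. The types and names below (dep, oldblk, newblk) are invented;
 * the block is compiled out with #if 0.
 */
#if 0
#include <stdio.h>

#define NPTR	4

struct dep {
	int	slot;		/* which pointer in the "inode" */
	int	oldblk;		/* value that was safe to have on disk */
	int	newblk;		/* value that depended on other writes */
};

/* Roll every dependent pointer forward; report whether anything changed. */
static int
roll_forward(int ptrs[NPTR], const struct dep *deps, int ndeps)
{
	int i, hadchanges = 0;

	for (i = 0; i < ndeps; i++) {
		if (ptrs[deps[i].slot] != deps[i].oldblk)
			return (-1);	/* mismatch would panic in the kernel */
		ptrs[deps[i].slot] = deps[i].newblk;
		hadchanges = 1;
	}
	return (hadchanges);	/* caller redirties the buffer if nonzero */
}

int
main(void)
{
	int ptrs[NPTR] = { 11, 0, 0, 0 };
	struct dep d[] = { { 1, 0, 22 }, { 2, 0, 33 } };
	int hadchanges = roll_forward(ptrs, d, 2);

	printf("hadchanges = %d, ptrs[1] = %d\n", hadchanges, ptrs[1]);
	return (0);
}
#endif
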
3399/*
3400 * Process a diradd entry after its dependent inode has been written.
3401 * This routine must be called with splbio interrupts blocked.
3402 */
3403static void
3404diradd_inode_written(dap, inodedep)
3405	struct diradd *dap;
3406	struct inodedep *inodedep;
3407{
3408	struct pagedep *pagedep;
3409
3410	dap->da_state |= COMPLETE;
3411	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3412		if (dap->da_state & DIRCHG)
3413			pagedep = dap->da_previous->dm_pagedep;
3414		else
3415			pagedep = dap->da_pagedep;
3416		LIST_REMOVE(dap, da_pdlist);
3417		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3418	}
3419	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3420}
3421
3422/*
3423 * Handle the completion of a mkdir dependency.
3424 */
3425static void
3426handle_written_mkdir(mkdir, type)
3427	struct mkdir *mkdir;
3428	int type;
3429{
3430	struct diradd *dap;
3431	struct pagedep *pagedep;
3432
3433	if (mkdir->md_state != type)
3434		panic("handle_written_mkdir: bad type");
3435	dap = mkdir->md_diradd;
3436	dap->da_state &= ~type;
3437	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3438		dap->da_state |= DEPCOMPLETE;
3439	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3440		if (dap->da_state & DIRCHG)
3441			pagedep = dap->da_previous->dm_pagedep;
3442		else
3443			pagedep = dap->da_pagedep;
3444		LIST_REMOVE(dap, da_pdlist);
3445		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3446	}
3447	LIST_REMOVE(mkdir, md_mkdirs);
3448	WORKITEM_FREE(mkdir, D_MKDIR);
3449}
3450
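/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * MKDIR_PARENT / MKDIR_BODY bookkeeping above reduced to bit operations.
 * A new directory's entry becomes DEPCOMPLETE only after both the parent
 * directory and the new directory's own "." and ".." block have been
 * written, in either order. The SK_* bit values are invented; the block
 * is compiled out with #if 0.
 */
#if 0
#include <stdio.h>

#define SK_MKDIR_PARENT	0x01	/* parent directory must be written */
#define SK_MKDIR_BODY	0x02	/* "." and ".." block must be written */
#define SK_DEPCOMPLETE	0x04	/* everything the entry depends on is done */

static unsigned
mkdir_written(unsigned state, unsigned which)
{
	state &= ~which;
	if ((state & (SK_MKDIR_PARENT | SK_MKDIR_BODY)) == 0)
		state |= SK_DEPCOMPLETE;
	return (state);
}

int
main(void)
{
	unsigned s = SK_MKDIR_PARENT | SK_MKDIR_BODY;

	s = mkdir_written(s, SK_MKDIR_BODY);	/* still incomplete */
	s = mkdir_written(s, SK_MKDIR_PARENT);	/* now DEPCOMPLETE */
	printf("state = %#x\n", s);		/* prints 0x4 */
	return (0);
}
#endif
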
3451/*
3452 * Called from within softdep_disk_write_complete above.
3453 * A write operation was just completed. Removed inodes can
3454 * now be freed and associated block pointers may be committed.
3455 * Note that this routine is always called from interrupt level
3456 * with further splbio interrupts blocked.
3457 */
3458static int
3459handle_written_filepage(pagedep, bp)
3460	struct pagedep *pagedep;
3461	struct buf *bp;		/* buffer containing the written page */
3462{
3463	struct dirrem *dirrem;
3464	struct diradd *dap, *nextdap;
3465	struct direct *ep;
3466	int i, chgs;
3467
3468	if ((pagedep->pd_state & IOSTARTED) == 0)
3469		panic("handle_written_filepage: not started");
3470	pagedep->pd_state &= ~IOSTARTED;
3471	/*
3472	 * Process any directory removals that have been committed.
3473	 */
3474	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3475		LIST_REMOVE(dirrem, dm_next);
3476		dirrem->dm_dirinum = pagedep->pd_ino;
3477		add_to_worklist(&dirrem->dm_list);
3478	}
3479	/*
3480	 * Free any directory additions that have been committed.
3481	 */
3482	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3483		free_diradd(dap);
3484	/*
3485	 * Uncommitted directory entries must be restored.
3486	 */
3487	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3488		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3489		     dap = nextdap) {
3490			nextdap = LIST_NEXT(dap, da_pdlist);
3491			if (dap->da_state & ATTACHED)
3492				panic("handle_written_filepage: attached");
3493			ep = (struct direct *)
3494			    ((char *)bp->b_data + dap->da_offset);
3495			ep->d_ino = dap->da_newinum;
3496			dap->da_state &= ~UNDONE;
3497			dap->da_state |= ATTACHED;
3498			chgs = 1;
3499			/*
3500			 * If the inode referenced by the directory has
3501			 * been written out, then the dependency can be
3502			 * moved to the pending list.
3503			 */
3504			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3505				LIST_REMOVE(dap, da_pdlist);
3506				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3507				    da_pdlist);
3508			}
3509		}
3510	}
3511	/*
3512	 * If there were any rollbacks in the directory, then it must be
3513	 * marked dirty so that it will eventually get written back in
3514	 * its correct form.
3515	 */
3516	if (chgs) {
3517		if ((bp->b_flags & B_DELWRI) == 0)
3518			stat_dir_entry++;
3519		bdirty(bp);
3520	}
3521	/*
3522	 * If no dependencies remain, the pagedep will be freed.
3523	 * Otherwise it will remain to update the page before it
3524	 * is written back to disk.
3525	 */
3526	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3527		for (i = 0; i < DAHASHSZ; i++)
3528			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3529				break;
3530		if (i == DAHASHSZ) {
3531			LIST_REMOVE(pagedep, pd_hash);
3532			WORKITEM_FREE(pagedep, D_PAGEDEP);
3533			return (0);
3534		}
3535	}
3536	return (1);
3537}
3538
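/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * directory-entry roll-forward above in miniature. Entries whose target
 * inode is not yet safely on disk go out with inode number zero; once the
 * page write finishes, the real inode numbers are put back and the page
 * is redirtied. The sk_dirent type is invented; the block is compiled out
 * with #if 0.
 */
#if 0
#include <stdio.h>

struct sk_dirent {
	unsigned ino;		/* number currently in the in-memory page */
	unsigned newinum;	/* number the entry should finally carry */
	int	committed;	/* nonzero once newinum is safe on disk */
};

/* Returns nonzero if any entry was restored (page must be redirtied). */
static int
page_write_done(struct sk_dirent *ents, int n)
{
	int i, chgs = 0;

	for (i = 0; i < n; i++) {
		if (ents[i].committed)
			continue;
		ents[i].ino = ents[i].newinum;	/* roll forward */
		chgs = 1;
	}
	return (chgs);
}

int
main(void)
{
	struct sk_dirent e[] = { { 0, 42, 0 }, { 7, 7, 1 } };
	int chgs = page_write_done(e, 2);

	printf("chgs = %d, e[0].ino = %u\n", chgs, e[0].ino);	/* 1, 42 */
	return (0);
}
#endif
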
3539/*
3540 * Writing back in-core inode structures.
3541 *
3542 * The file system only accesses an inode's contents when it occupies an
3543 * "in-core" inode structure.  These "in-core" structures are separate from
3544 * the page frames used to cache inode blocks.  Only the latter are
3545 * transferred to/from the disk.  So, when the updated contents of the
3546 * "in-core" inode structure are copied to the corresponding in-memory inode
3547 * block, the dependencies are also transferred.  The following procedure is
3548 * called when copying a dirty "in-core" inode to a cached inode block.
3549 */
3550
3551/*
3552 * Called when an inode is loaded from disk. If the effective link count
3553 * differed from the actual link count when it was last flushed, then we
3554 * need to ensure that the correct effective link count is put back.
3555 */
3556void
3557softdep_load_inodeblock(ip)
3558	struct inode *ip;	/* the "in_core" copy of the inode */
3559{
3560	struct inodedep *inodedep;
3561
3562	/*
3563	 * Check for alternate nlink count.
3564	 */
3565	ip->i_effnlink = ip->i_nlink;
3566	ACQUIRE_LOCK(&lk);
3567	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3568		FREE_LOCK(&lk);
3569		return;
3570	}
3571	ip->i_effnlink -= inodedep->id_nlinkdelta;
3572	FREE_LOCK(&lk);
3573}
3574
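/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * link-count adjustment above as plain arithmetic. The on-disk link count
 * may still include removals whose directory entries have not been
 * committed, so the effective count seen by the rest of the system is the
 * on-disk count minus the tracked delta. The values are invented; the
 * block is compiled out with #if 0.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	int nlink = 3;		/* as read from the inode block on disk */
	int nlinkdelta = 2;	/* uncommitted removals tracked in memory */
	int effnlink = nlink - nlinkdelta;

	printf("effective link count = %d\n", effnlink);	/* 1 */
	return (0);
}
#endif
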
3575/*
3576 * This routine is called just before the "in-core" inode
3577 * information is to be copied to the in-memory inode block.
3578 * Recall that an inode block contains several inodes. If
3579 * the waitfor flag is set, then the dependencies will be
3580 * cleared so that the update can always be made. Note that
3581 * the buffer is locked when this routine is called, so we
3582 * will never be in the middle of writing the inode block
3583 * to disk.
3584 */
3585void
3586softdep_update_inodeblock(ip, bp, waitfor)
3587	struct inode *ip;	/* the "in_core" copy of the inode */
3588	struct buf *bp;		/* the buffer containing the inode block */
3589	int waitfor;		/* nonzero => update must be allowed */
3590{
3591	struct inodedep *inodedep;
3592	struct worklist *wk;
3593	int error, gotit;
3594
3595	/*
3596	 * If the effective link count is not equal to the actual link
3597	 * count, then we must track the difference in an inodedep while
3598	 * the inode is (potentially) tossed out of the cache. Otherwise,
3599	 * if there is no existing inodedep, then there are no dependencies
3600	 * to track.
3601	 */
3602	ACQUIRE_LOCK(&lk);
3603	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3604		if (ip->i_effnlink != ip->i_nlink)
3605			panic("softdep_update_inodeblock: bad link count");
3606		FREE_LOCK(&lk);
3607		return;
3608	}
3609	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
3610		panic("softdep_update_inodeblock: bad delta");
3611	/*
3612	 * Changes have been initiated. Anything depending on these
3613	 * changes cannot occur until this inode has been written.
3614	 */
3615	inodedep->id_state &= ~COMPLETE;
3616	if ((inodedep->id_state & ONWORKLIST) == 0)
3617		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3618	/*
3619	 * Any new dependencies associated with the incore inode must
3620	 * now be moved to the list associated with the buffer holding
3621	 * the in-memory copy of the inode. Once merged, process any
3622	 * allocdirects that are completed by the merger.
3623	 */
3624	merge_inode_lists(inodedep);
3625	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3626		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3627	/*
3628	 * Now that the inode has been pushed into the buffer, the
3629	 * operations dependent on the inode being written to disk
3630	 * can be moved to the id_bufwait so that they will be
3631	 * processed when the buffer I/O completes.
3632	 */
3633	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3634		WORKLIST_REMOVE(wk);
3635		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3636	}
3637	/*
3638	 * Newly allocated inodes cannot be written until the bitmap
3639	 * that allocates them has been written (indicated by
3640	 * DEPCOMPLETE being set in id_state). If we are doing a
3641	 * forced sync (e.g., an fsync on a file), we force the bitmap
3642	 * to be written so that the update can be done.
3643	 */
3644	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3645		FREE_LOCK(&lk);
3646		return;
3647	}
3648	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3649	FREE_LOCK(&lk);
3650	if (gotit &&
3651	    (error = BUF_WRITE(inodedep->id_buf)) != 0)
3652		softdep_error("softdep_update_inodeblock: bwrite", error);
3653	if ((inodedep->id_state & DEPCOMPLETE) == 0)
3654		panic("softdep_update_inodeblock: update failed");
3655}
3656
3657/*
3658 * Merge the new inode dependency list (id_newinoupdt) into the old
3659 * inode dependency list (id_inoupdt). This routine must be called
3660 * with splbio interrupts blocked.
3661 */
3662static void
3663merge_inode_lists(inodedep)
3664	struct inodedep *inodedep;
3665{
3666	struct allocdirect *listadp, *newadp;
3667
3668	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3669	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3670		if (listadp->ad_lbn < newadp->ad_lbn) {
3671			listadp = TAILQ_NEXT(listadp, ad_next);
3672			continue;
3673		}
3674		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3675		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3676		if (listadp->ad_lbn == newadp->ad_lbn) {
3677			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3678			    listadp);
3679			listadp = newadp;
3680		}
3681		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3682	}
3683	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3684		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3685		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3686	}
3687}
3688
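/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * merge above modeled with sorted integer arrays standing in for the two
 * dependency lists. Entries from the "new" list are spliced in logical
 * block order; when both lists carry an entry for the same block, the two
 * are merged (here the new one simply supersedes the old). The block is
 * compiled out with #if 0.
 */
#if 0
#include <stdio.h>

/* Merge new[] into old[], both ascending; returns the resulting length. */
static int
merge_sorted(int *out, const int *old, int nold, const int *new, int nnew)
{
	int i = 0, j = 0, n = 0;

	while (i < nold && j < nnew) {
		if (old[i] < new[j])
			out[n++] = old[i++];
		else if (new[j] < old[i])
			out[n++] = new[j++];
		else {			/* same block: keep one merged entry */
			out[n++] = new[j++];
			i++;
		}
	}
	while (i < nold)
		out[n++] = old[i++];
	while (j < nnew)
		out[n++] = new[j++];
	return (n);
}

int
main(void)
{
	int old[] = { 0, 2, 5 }, new[] = { 1, 2, 7 }, out[6];
	int i, n = merge_sorted(out, old, 3, new, 3);

	for (i = 0; i < n; i++)
		printf("%d ", out[i]);		/* prints: 0 1 2 5 7 */
	printf("\n");
	return (0);
}
#endif
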
3689/*
3690 * If we are doing an fsync, then we must ensure that any directory
3691 * entries for the inode have been written after the inode gets to disk.
3692 */
3693static int
3694softdep_fsync(vp)
3695	struct vnode *vp;	/* the vnode of the file to be fsync'ed */
3696{
3697	struct inodedep *inodedep;
3698	struct pagedep *pagedep;
3699	struct worklist *wk;
3700	struct diradd *dap;
3701	struct mount *mnt;
3702	struct vnode *pvp;
3703	struct inode *ip;
3704	struct buf *bp;
3705	struct fs *fs;
3706	struct proc *p = CURPROC;		/* XXX */
3707	int error, flushparent;
3708	ino_t parentino;
3709	ufs_lbn_t lbn;
3710
3711	ip = VTOI(vp);
3712	fs = ip->i_fs;
3713	ACQUIRE_LOCK(&lk);
3714	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3715		FREE_LOCK(&lk);
3716		return (0);
3717	}
3718	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3719	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3720	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3721	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
3722		panic("softdep_fsync: pending ops");
3723	for (error = 0, flushparent = 0; ; ) {
3724		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3725			break;
3726		if (wk->wk_type != D_DIRADD)
3727			panic("softdep_fsync: Unexpected type %s",
3728			    TYPENAME(wk->wk_type));
3729		dap = WK_DIRADD(wk);
3730		/*
3731		 * Flush our parent if this directory entry
3732		 * has a MKDIR_PARENT dependency.
3733		 */
3734		if (dap->da_state & DIRCHG)
3735			pagedep = dap->da_previous->dm_pagedep;
3736		else
3737			pagedep = dap->da_pagedep;
3738		mnt = pagedep->pd_mnt;
3739		parentino = pagedep->pd_ino;
3740		lbn = pagedep->pd_lbn;
3741		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
3742			panic("softdep_fsync: dirty");
3743		flushparent = dap->da_state & MKDIR_PARENT;
3744		/*
3745		 * If we are being fsync'ed as part of vgone'ing this vnode,
3746		 * then we will not be able to release and recover the
3747		 * vnode below, so we just have to give up on writing its
3748		 * directory entry out. It will eventually be written, just
3749		 * not now, but then the user was not asking to have it
3750		 * written, so we are not breaking any promises.
3751		 */
3752		if (vp->v_flag & VXLOCK)
3753			break;
3754		/*
3755		 * We prevent deadlock by always fetching inodes from the
3756		 * root, moving down the directory tree. Thus, when fetching
3757		 * our parent directory, we must unlock ourselves before
3758		 * requesting the lock on our parent. See the comment in
3759		 * ufs_lookup for details on possible races.
3760		 */
3761		FREE_LOCK(&lk);
3762		VOP_UNLOCK(vp, 0, p);
3763		error = VFS_VGET(mnt, parentino, &pvp);
3764		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3765		if (error != 0)
3766			return (error);
3767		if (flushparent) {
3768			if ((error = UFS_UPDATE(pvp, 1)) != 0) {
3769				vput(pvp);
3770				return (error);
3771			}
3772		}
3773		/*
3774		 * Flush directory page containing the inode's name.
3775		 */
3776		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3777		    &bp);
3778		if (error == 0)
3779			error = BUF_WRITE(bp);
3780		vput(pvp);
3781		if (error != 0)
3782			return (error);
3783		ACQUIRE_LOCK(&lk);
3784		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
3785			break;
3786	}
3787	FREE_LOCK(&lk);
3788	return (0);
3789}
3790
3791/*
3792 * Flush all the dirty bitmaps associated with the block device
3793 * before flushing the rest of the dirty blocks so as to reduce
3794 * the number of dependencies that will have to be rolled back.
3795 */
3796void
3797softdep_fsync_mountdev(vp)
3798	struct vnode *vp;
3799{
3800	struct buf *bp, *nbp;
3801	struct worklist *wk;
3802
3803	if (!vn_isdisk(vp, NULL))
3804		panic("softdep_fsync_mountdev: vnode not a disk");
3805	ACQUIRE_LOCK(&lk);
3806	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
3807		nbp = TAILQ_NEXT(bp, b_vnbufs);
3808		/*
3809		 * If it is already scheduled, skip to the next buffer.
3810		 */
3811		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
3812			continue;
3813		if ((bp->b_flags & B_DELWRI) == 0)
3814			panic("softdep_fsync_mountdev: not dirty");
3815		/*
3816		 * We are only interested in bitmaps with outstanding
3817		 * dependencies.
3818		 */
3819		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
3820		    wk->wk_type != D_BMSAFEMAP ||
3821		    (bp->b_xflags & BX_BKGRDINPROG)) {
3822			BUF_UNLOCK(bp);
3823			continue;
3824		}
3825		bremfree(bp);
3826		FREE_LOCK(&lk);
3827		(void) bawrite(bp);
3828		ACQUIRE_LOCK(&lk);
3829		/*
3830		 * Since we may have slept during the I/O, we need
3831		 * to start from a known point.
3832		 */
3833		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3834	}
3835	drain_output(vp, 1);
3836	FREE_LOCK(&lk);
3837}
3838
3839/*
3840 * This routine is called when we are trying to synchronously flush a
3841 * file. This routine must eliminate any filesystem metadata dependencies
3842 * so that the syncing routine can succeed by pushing the dirty blocks
3843 * associated with the file. If any I/O errors occur, they are returned.
3844 */
3845int
3846softdep_sync_metadata(ap)
3847	struct vop_fsync_args /* {
3848		struct vnode *a_vp;
3849		struct ucred *a_cred;
3850		int a_waitfor;
3851		struct proc *a_p;
3852	} */ *ap;
3853{
3854	struct vnode *vp = ap->a_vp;
3855	struct pagedep *pagedep;
3856	struct allocdirect *adp;
3857	struct allocindir *aip;
3858	struct buf *bp, *nbp;
3859	struct worklist *wk;
3860	int i, error, waitfor;
3861
3862	/*
3863	 * Check whether this vnode is involved in a filesystem
3864	 * that is doing soft dependency processing.
3865	 */
3866	if (!vn_isdisk(vp, NULL)) {
3867		if (!DOINGSOFTDEP(vp))
3868			return (0);
3869	} else
3870		if (vp->v_specmountpoint == NULL ||
3871		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
3872			return (0);
3873	/*
3874	 * Ensure that any direct block dependencies have been cleared.
3875	 */
3876	ACQUIRE_LOCK(&lk);
3877	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
3878		FREE_LOCK(&lk);
3879		return (error);
3880	}
3881	/*
3882	 * For most files, the only metadata dependencies are the
3883	 * cylinder group maps that allocate their inode or blocks.
3884	 * The block allocation dependencies can be found by traversing
3885	 * the dependency lists for any buffers that remain on their
3886	 * dirty buffer list. The inode allocation dependency will
3887	 * be resolved when the inode is updated with MNT_WAIT.
3888	 * This work is done in two passes. The first pass grabs most
3889	 * of the buffers and begins asynchronously writing them. The
3890	 * only way to wait for these asynchronous writes is to sleep
3891	 * on the filesystem vnode which may stay busy for a long time
3892	 * if the filesystem is active. So, instead, we make a second
3893	 * pass over the dependencies blocking on each write. In the
3894	 * usual case we will be blocking against a write that we
3895	 * initiated, so when it is done the dependency will have been
3896	 * resolved. Thus the second pass is expected to end quickly.
3897	 */
3898	waitfor = MNT_NOWAIT;
3899top:
3900	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
3901		FREE_LOCK(&lk);
3902		return (0);
3903	}
3904	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3905loop:
3906	/*
3907	 * As we hold the buffer locked, none of its dependencies
3908	 * will disappear.
3909	 */
3910	for (wk = LIST_FIRST(&bp->b_dep); wk;
3911	     wk = LIST_NEXT(wk, wk_list)) {
3912		switch (wk->wk_type) {
3913
3914		case D_ALLOCDIRECT:
3915			adp = WK_ALLOCDIRECT(wk);
3916			if (adp->ad_state & DEPCOMPLETE)
3917				break;
3918			nbp = adp->ad_buf;
3919			if (getdirtybuf(&nbp, waitfor) == 0)
3920				break;
3921			FREE_LOCK(&lk);
3922			if (waitfor == MNT_NOWAIT) {
3923				bawrite(nbp);
3924			} else if ((error = BUF_WRITE(nbp)) != 0) {
3925				bawrite(bp);
3926				return (error);
3927			}
3928			ACQUIRE_LOCK(&lk);
3929			break;
3930
3931		case D_ALLOCINDIR:
3932			aip = WK_ALLOCINDIR(wk);
3933			if (aip->ai_state & DEPCOMPLETE)
3934				break;
3935			nbp = aip->ai_buf;
3936			if (getdirtybuf(&nbp, waitfor) == 0)
3937				break;
3938			FREE_LOCK(&lk);
3939			if (waitfor == MNT_NOWAIT) {
3940				bawrite(nbp);
3941			} else if ((error = BUF_WRITE(nbp)) != 0) {
3942				bawrite(bp);
3943				return (error);
3944			}
3945			ACQUIRE_LOCK(&lk);
3946			break;
3947
3948		case D_INDIRDEP:
3949		restart:
3950			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
3951			     aip; aip = LIST_NEXT(aip, ai_next)) {
3952				if (aip->ai_state & DEPCOMPLETE)
3953					continue;
3954				nbp = aip->ai_buf;
3955				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
3956					goto restart;
3957				FREE_LOCK(&lk);
3958				if ((error = BUF_WRITE(nbp)) != 0) {
3959					bawrite(bp);
3960					return (error);
3961				}
3962				ACQUIRE_LOCK(&lk);
3963				goto restart;
3964			}
3965			break;
3966
3967		case D_INODEDEP:
3968			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
3969			    WK_INODEDEP(wk)->id_ino)) != 0) {
3970				FREE_LOCK(&lk);
3971				bawrite(bp);
3972				return (error);
3973			}
3974			break;
3975
3976		case D_PAGEDEP:
3977			/*
3978			 * We are trying to sync a directory that may
3979			 * have dependencies on both its own metadata
3980			 * and/or dependencies on the inodes of any
3981			 * recently allocated files. We walk its diradd
3982			 * lists pushing out the associated inode.
3983			 */
3984			pagedep = WK_PAGEDEP(wk);
3985			for (i = 0; i < DAHASHSZ; i++) {
3986				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
3987					continue;
3988				if ((error =
3989				    flush_pagedep_deps(vp, pagedep->pd_mnt,
3990						&pagedep->pd_diraddhd[i]))) {
3991					FREE_LOCK(&lk);
3992					bawrite(bp);
3993					return (error);
3994				}
3995			}
3996			break;
3997
3998		case D_MKDIR:
3999			/*
4000			 * This case should never happen if the vnode has
4001			 * been properly sync'ed. However, if this function
4002			 * is used at a place where the vnode has not yet
4003			 * been sync'ed, this dependency can show up. So,
4004			 * rather than panic, just flush it.
4005			 */
4006			nbp = WK_MKDIR(wk)->md_buf;
4007			if (getdirtybuf(&nbp, waitfor) == 0)
4008				break;
4009			FREE_LOCK(&lk);
4010			if (waitfor == MNT_NOWAIT) {
4011				bawrite(nbp);
4012			} else if ((error = BUF_WRITE(nbp)) != 0) {
4013				bawrite(bp);
4014				return (error);
4015			}
4016			ACQUIRE_LOCK(&lk);
4017			break;
4018
4019		case D_BMSAFEMAP:
4020			/*
4021			 * This case should never happen if the vnode has
4022			 * been properly sync'ed. However, if this function
4023			 * is used at a place where the vnode has not yet
4024			 * been sync'ed, this dependency can show up. So,
4025			 * rather than panic, just flush it.
4026			 */
4027			nbp = WK_BMSAFEMAP(wk)->sm_buf;
4028			if (getdirtybuf(&nbp, waitfor) == 0)
4029				break;
4030			FREE_LOCK(&lk);
4031			if (waitfor == MNT_NOWAIT) {
4032				bawrite(nbp);
4033			} else if ((error = BUF_WRITE(nbp)) != 0) {
4034				bawrite(bp);
4035				return (error);
4036			}
4037			ACQUIRE_LOCK(&lk);
4038			break;
4039
4040		default:
4041			panic("softdep_sync_metadata: Unknown type %s",
4042			    TYPENAME(wk->wk_type));
4043			/* NOTREACHED */
4044		}
4045	}
4046	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4047	nbp = TAILQ_NEXT(bp, b_vnbufs);
4048	FREE_LOCK(&lk);
4049	bawrite(bp);
4050	ACQUIRE_LOCK(&lk);
4051	if (nbp != NULL) {
4052		bp = nbp;
4053		goto loop;
4054	}
4055	/*
4056	 * We must wait for any I/O in progress to finish so that
4057	 * all potential buffers on the dirty list will be visible.
4058	 * Once they are all there, proceed with the second pass
4059	 * which will wait for the I/O as per above.
4060	 */
4061	drain_output(vp, 1);
4062	/*
4063	 * The brief unlock is to allow any pent-up dependency
4064	 * processing to be done.
4065	 */
4066	if (waitfor == MNT_NOWAIT) {
4067		waitfor = MNT_WAIT;
4068		FREE_LOCK(&lk);
4069		ACQUIRE_LOCK(&lk);
4070		goto top;
4071	}
4072
4073	/*
4074	 * If we have managed to get rid of all the dirty buffers,
4075	 * then we are done. For certain directories and block
4076	 * devices, we may need to do further work.
4077	 */
4078	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4079		FREE_LOCK(&lk);
4080		return (0);
4081	}
4082
4083	FREE_LOCK(&lk);
4084	/*
4085	 * If we are trying to sync a block device, some of its buffers may
4086	 * contain metadata that cannot be written until the contents of some
4087	 * partially written files have been written to disk. The only easy
4088	 * way to accomplish this is to sync the entire filesystem (luckily
4089	 * this happens rarely).
4090	 */
4091	if (vn_isdisk(vp, NULL) &&
4092	    vp->v_specmountpoint && !VOP_ISLOCKED(vp, NULL) &&
4093	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
4094	     ap->a_p)) != 0)
4095		return (error);
4096	return (0);
4097}
4098
4099/*
4100 * Flush the dependencies associated with an inodedep.
4101 * Called with splbio blocked.
4102 */
4103static int
4104flush_inodedep_deps(fs, ino)
4105	struct fs *fs;
4106	ino_t ino;
4107{
4108	struct inodedep *inodedep;
4109	struct allocdirect *adp;
4110	int error, waitfor;
4111	struct buf *bp;
4112
4113	/*
4114	 * This work is done in two passes. The first pass grabs most
4115	 * of the buffers and begins asynchronously writing them. The
4116	 * only way to wait for these asynchronous writes is to sleep
4117	 * on the filesystem vnode which may stay busy for a long time
4118	 * if the filesystem is active. So, instead, we make a second
4119	 * pass over the dependencies blocking on each write. In the
4120	 * usual case we will be blocking against a write that we
4121	 * initiated, so when it is done the dependency will have been
4122	 * resolved. Thus the second pass is expected to end quickly.
4123	 * We give a brief window at the top of the loop to allow
4124	 * any pending I/O to complete.
4125	 */
4126	for (waitfor = MNT_NOWAIT; ; ) {
4127		FREE_LOCK(&lk);
4128		ACQUIRE_LOCK(&lk);
4129		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4130			return (0);
4131		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4132		     adp = TAILQ_NEXT(adp, ad_next)) {
4133			if (adp->ad_state & DEPCOMPLETE)
4134				continue;
4135			bp = adp->ad_buf;
4136			if (getdirtybuf(&bp, waitfor) == 0) {
4137				if (waitfor == MNT_NOWAIT)
4138					continue;
4139				break;
4140			}
4141			FREE_LOCK(&lk);
4142			if (waitfor == MNT_NOWAIT) {
4143				bawrite(bp);
4144			} else if ((error = BUF_WRITE(bp)) != 0) {
4145				ACQUIRE_LOCK(&lk);
4146				return (error);
4147			}
4148			ACQUIRE_LOCK(&lk);
4149			break;
4150		}
4151		if (adp != NULL)
4152			continue;
4153		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
4154		     adp = TAILQ_NEXT(adp, ad_next)) {
4155			if (adp->ad_state & DEPCOMPLETE)
4156				continue;
4157			bp = adp->ad_buf;
4158			if (getdirtybuf(&bp, waitfor) == 0) {
4159				if (waitfor == MNT_NOWAIT)
4160					continue;
4161				break;
4162			}
4163			FREE_LOCK(&lk);
4164			if (waitfor == MNT_NOWAIT) {
4165				bawrite(bp);
4166			} else if ((error = BUF_WRITE(bp)) != 0) {
4167				ACQUIRE_LOCK(&lk);
4168				return (error);
4169			}
4170			ACQUIRE_LOCK(&lk);
4171			break;
4172		}
4173		if (adp != NULL)
4174			continue;
4175		/*
4176		 * If this was pass 2, we are done; otherwise, do pass 2.
4177		 */
4178		if (waitfor == MNT_WAIT)
4179			break;
4180		waitfor = MNT_WAIT;
4181	}
4182	/*
4183	 * Try freeing inodedep in case all dependencies have been removed.
4184	 */
4185	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4186		(void) free_inodedep(inodedep);
4187	return (0);
4188}
4189
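/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * two-pass strategy used above and in softdep_sync_metadata. Pass one
 * starts an asynchronous write for every dirty dependency; pass two walks
 * the same set again, this time waiting, so it normally just observes
 * completions of writes pass one already started. The buffer model is
 * invented; the block is compiled out with #if 0.
 */
#if 0
#include <stdio.h>

enum { SK_NOWAIT, SK_WAIT };

struct skbuf {
	int	dirty;
};

static void
write_buf(struct skbuf *b, int waitfor)
{
	/* A real kernel would only start the I/O in the SK_NOWAIT case. */
	b->dirty = 0;
	printf("%s write\n", waitfor == SK_WAIT ? "sync" : "async");
}

static void
flush_deps(struct skbuf *bufs, int n)
{
	int i, waitfor;

	for (waitfor = SK_NOWAIT; ; waitfor = SK_WAIT) {
		for (i = 0; i < n; i++)
			if (bufs[i].dirty)
				write_buf(&bufs[i], waitfor);
		if (waitfor == SK_WAIT)
			break;			/* pass two finished */
	}
}

int
main(void)
{
	struct skbuf b[] = { { 1 }, { 1 } };

	flush_deps(b, 2);
	return (0);
}
#endif
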
4190/*
4191 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4192 * Called with splbio blocked.
4193 */
4194static int
4195flush_pagedep_deps(pvp, mp, diraddhdp)
4196	struct vnode *pvp;
4197	struct mount *mp;
4198	struct diraddhd *diraddhdp;
4199{
4200	struct proc *p = CURPROC;	/* XXX */
4201	struct inodedep *inodedep;
4202	struct ufsmount *ump;
4203	struct diradd *dap;
4204	struct vnode *vp;
4205	int gotit, error = 0;
4206	struct buf *bp;
4207	ino_t inum;
4208
4209	ump = VFSTOUFS(mp);
4210	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4211		/*
4212		 * Flush ourselves if this directory entry
4213		 * has a MKDIR_PARENT dependency.
4214		 */
4215		if (dap->da_state & MKDIR_PARENT) {
4216			FREE_LOCK(&lk);
4217			if ((error = UFS_UPDATE(pvp, 1)) != 0)
4218				break;
4219			ACQUIRE_LOCK(&lk);
4220			/*
4221			 * If that cleared dependencies, go on to next.
4222			 */
4223			if (dap != LIST_FIRST(diraddhdp))
4224				continue;
4225			if (dap->da_state & MKDIR_PARENT)
4226				panic("flush_pagedep_deps: MKDIR_PARENT");
4227		}
4228		/*
4229		 * A newly allocated directory must have its "." and
4230		 * ".." entries written out before its name can be
4231		 * committed in its parent. We do not want or need
4232		 * the full semantics of a synchronous VOP_FSYNC as
4233		 * that may end up here again, once for each directory
4234		 * level in the filesystem. Instead, we push the blocks
4235		 * and wait for them to clear. We have to fsync twice
4236		 * because the first call may choose to defer blocks
4237		 * that still have dependencies, but deferral will
4238		 * happen at most once.
4239		 */
4240		inum = dap->da_newinum;
4241		if (dap->da_state & MKDIR_BODY) {
4242			FREE_LOCK(&lk);
4243			if ((error = VFS_VGET(mp, inum, &vp)) != 0)
4244				break;
4245			if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
4246			    (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
4247				vput(vp);
4248				break;
4249			}
4250			drain_output(vp, 0);
4251			vput(vp);
4252			ACQUIRE_LOCK(&lk);
4253			/*
4254			 * If that cleared dependencies, go on to next.
4255			 */
4256			if (dap != LIST_FIRST(diraddhdp))
4257				continue;
4258			if (dap->da_state & MKDIR_BODY)
4259				panic("flush_pagedep_deps: MKDIR_BODY");
4260		}
4261		/*
4262		 * Flush the inode on which the directory entry depends.
4263		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
4264		 * the only remaining dependency is that the updated inode
4265		 * count must get pushed to disk. The inode has already
4266		 * been pushed into its inode buffer (via VOP_UPDATE) at
4267		 * the time of the reference count change. So we need only
4268		 * locate that buffer, ensure that there will be no rollback
4269		 * caused by a bitmap dependency, then write the inode buffer.
4270		 */
4271		if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0)
4272			panic("flush_pagedep_deps: lost inode");
4273		/*
4274		 * If the inode still has bitmap dependencies,
4275		 * push them to disk.
4276		 */
4277		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4278			gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4279			FREE_LOCK(&lk);
4280			if (gotit &&
4281			    (error = BUF_WRITE(inodedep->id_buf)) != 0)
4282				break;
4283			ACQUIRE_LOCK(&lk);
4284			if (dap != LIST_FIRST(diraddhdp))
4285				continue;
4286		}
4287		/*
4288		 * If the inode is still sitting in a buffer waiting
4289		 * to be written, push it to disk.
4290		 */
4291		FREE_LOCK(&lk);
4292		if ((error = bread(ump->um_devvp,
4293		    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4294		    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4295			break;
4296		if ((error = BUF_WRITE(bp)) != 0)
4297			break;
4298		ACQUIRE_LOCK(&lk);
4299		/*
4300		 * If we have failed to get rid of all the dependencies,
4301		 * then something is seriously wrong.
4302		 */
4303		if (dap == LIST_FIRST(diraddhdp))
4304			panic("flush_pagedep_deps: flush failed");
4305	}
4306	if (error)
4307		ACQUIRE_LOCK(&lk);
4308	return (error);
4309}
4310
4311/*
4312 * A large burst of file addition or deletion activity can drive the
4313 * memory load excessively high. Therefore we deliberately slow things
4314 * down and speed up the I/O processing if we find ourselves with too
4315 * many dependencies in progress.
4316 */
4317static int
4318request_cleanup(resource, islocked)
4319	int resource;
4320	int islocked;
4321{
4322	struct callout_handle handle;
4323	struct proc *p = CURPROC;
4324
4325	/*
4326	 * We never hold up the filesystem syncer process.
4327	 */
4328	if (p == filesys_syncer)
4329		return (0);
4330	/*
4331	 * If we are resource constrained on inode dependencies, try
4332	 * flushing some dirty inodes. Otherwise, we are constrained
4333	 * by file deletions, so try accelerating flushes of directories
4334	 * with removal dependencies. We would like to do the cleanup
4335	 * here, but we probably hold an inode locked at this point and
4336	 * that might deadlock against one that we try to clean. So,
4337	 * the best that we can do is request the syncer daemon to do
4338	 * the cleanup for us.
4339	 */
4340	switch (resource) {
4341
4342	case FLUSH_INODES:
4343		stat_ino_limit_push += 1;
4344		req_clear_inodedeps = 1;
4345		break;
4346
4347	case FLUSH_REMOVE:
4348		stat_blk_limit_push += 1;
4349		req_clear_remove = 1;
4350		break;
4351
4352	default:
4353		panic("request_cleanup: unknown type");
4354	}
4355	/*
4356	 * Hopefully the syncer daemon will catch up and awaken us.
4357	 * We wait at most tickdelay ticks before proceeding in any case.
4358	 */
4359	if (islocked == 0)
4360		ACQUIRE_LOCK(&lk);
4361	if (proc_waiting == 0) {
4362		proc_waiting = 1;
4363		handle = timeout(pause_timer, NULL,
4364		    tickdelay > 2 ? tickdelay : 2);
4365	}
4366	FREE_LOCK_INTERLOCKED(&lk);
4367	(void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
4368	ACQUIRE_LOCK_INTERLOCKED(&lk);
4369	if (proc_waiting) {
4370		untimeout(pause_timer, NULL, handle);
4371		proc_waiting = 0;
4372	} else {
4373		switch (resource) {
4374
4375		case FLUSH_INODES:
4376			stat_ino_limit_hit += 1;
4377			break;
4378
4379		case FLUSH_REMOVE:
4380			stat_blk_limit_hit += 1;
4381			break;
4382		}
4383	}
4384	if (islocked == 0)
4385		FREE_LOCK(&lk);
4386	return (1);
4387}
4388
4389/*
4390 * Awaken processes pausing in request_cleanup and clear proc_waiting
4391 * to indicate that there is no longer a timer running.
4392 */
4393void
4394pause_timer(arg)
4395	void *arg;
4396{
4397
4398	proc_waiting = 0;
4399	wakeup(&proc_waiting);
4400}
4401
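/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * bounded pause in request_cleanup approximated with POSIX threads. The
 * requester sleeps until either the (hypothetical) cleanup daemon signals
 * progress or the timeout fires, whichever comes first, mirroring the
 * timeout()/tsleep()/untimeout() sequence above. The block is compiled
 * out with #if 0 and would need -lpthread if built.
 */
#if 0
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cleaned = PTHREAD_COND_INITIALIZER;
static int waiting;

/* Called by the cleanup daemon when it has made progress. */
static void
cleanup_done(void)
{
	pthread_mutex_lock(&lock);
	waiting = 0;
	pthread_cond_signal(&cleaned);
	pthread_mutex_unlock(&lock);
}

/* Pause for at most 'ms' milliseconds, or less if awakened early. */
static void
pause_for_cleanup(long ms)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += ms / 1000;
	ts.tv_nsec += (ms % 1000) * 1000000L;
	if (ts.tv_nsec >= 1000000000L) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000L;
	}
	pthread_mutex_lock(&lock);
	waiting = 1;
	while (waiting && pthread_cond_timedwait(&cleaned, &lock, &ts) == 0)
		;
	waiting = 0;
	pthread_mutex_unlock(&lock);
}

int
main(void)
{
	pause_for_cleanup(10);	/* returns after at most ~10ms here */
	(void)cleanup_done;	/* unused in this single-threaded demo */
	printf("resumed\n");
	return (0);
}
#endif
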
4402/*
4403 * Flush out a directory with at least one removal dependency in an effort to
4404 * reduce the number of dirrem, freefile, and freeblks dependency structures.
4405 */
4406static void
4407clear_remove(p)
4408	struct proc *p;
4409{
4410	struct pagedep_hashhead *pagedephd;
4411	struct pagedep *pagedep;
4412	static int next = 0;
4413	struct mount *mp;
4414	struct vnode *vp;
4415	int error, cnt;
4416	ino_t ino;
4417
4418	ACQUIRE_LOCK(&lk);
4419	for (cnt = 0; cnt < pagedep_hash; cnt++) {
4420		pagedephd = &pagedep_hashtbl[next++];
4421		if (next >= pagedep_hash)
4422			next = 0;
4423		for (pagedep = LIST_FIRST(pagedephd); pagedep;
4424		     pagedep = LIST_NEXT(pagedep, pd_hash)) {
4425			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4426				continue;
4427			mp = pagedep->pd_mnt;
4428			ino = pagedep->pd_ino;
4429			FREE_LOCK(&lk);
4430			if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4431				softdep_error("clear_remove: vget", error);
4432				return;
4433			}
4434			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4435				softdep_error("clear_remove: fsync", error);
4436			drain_output(vp, 0);
4437			vput(vp);
4438			return;
4439		}
4440	}
4441	FREE_LOCK(&lk);
4442}
4443
4444/*
4445 * Clear out a block of dirty inodes in an effort to reduce
4446 * the number of inodedep dependency structures.
4447 */
4448static void
4449clear_inodedeps(p)
4450	struct proc *p;
4451{
4452	struct inodedep_hashhead *inodedephd;
4453	struct inodedep *inodedep;
4454	static int next = 0;
4455	struct mount *mp;
4456	struct vnode *vp;
4457	struct fs *fs;
4458	int error, cnt;
4459	ino_t firstino, lastino, ino;
4460
4461	ACQUIRE_LOCK(&lk);
4462	/*
4463	 * Pick an inode dependency to be cleared, cycling round-robin
4464	 * through the hash chains. We will then gather up all the inodes
4465	 * in its block that have dependencies and flush them out.
4466	 */
4467	for (cnt = 0; cnt < inodedep_hash; cnt++) {
4468		inodedephd = &inodedep_hashtbl[next++];
4469		if (next >= inodedep_hash)
4470			next = 0;
4471		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4472			break;
4473	}
4474	/*
4475	 * Ugly code to find mount point given pointer to superblock.
4476	 */
4477	fs = inodedep->id_fs;
4478	TAILQ_FOREACH(mp, &mountlist, mnt_list)
4479		if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
4480			break;
4481	/*
4482	 * Find the last inode in the block with dependencies.
4483	 */
4484	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4485	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4486		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4487			break;
4488	/*
4489	 * Asynchronously push all but the last inode with dependencies.
4490	 * Synchronously push the last inode with dependencies to ensure
4491	 * that the inode block gets written to free up the inodedeps.
4492	 */
4493	for (ino = firstino; ino <= lastino; ino++) {
4494		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4495			continue;
4496		FREE_LOCK(&lk);
4497		if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4498			softdep_error("clear_inodedeps: vget", error);
4499			return;
4500		}
4501		if (ino == lastino) {
4502			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
4503				softdep_error("clear_inodedeps: fsync1", error);
4504		} else {
4505			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4506				softdep_error("clear_inodedeps: fsync2", error);
4507			drain_output(vp, 0);
4508		}
4509		vput(vp);
4510		ACQUIRE_LOCK(&lk);
4511	}
4512	FREE_LOCK(&lk);
4513}
4514
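/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * rounding arithmetic above. With INOPB (inodes per block) a power of two,
 * masking an inode number with ~(INOPB - 1) gives the first inode of its
 * block, and the last is firstino + INOPB - 1. The values are invented;
 * the block is compiled out with #if 0.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned inopb = 64;			 /* assumed inodes per block */
	unsigned ino = 1000;
	unsigned firstino = ino & ~(inopb - 1);	 /* 960 */
	unsigned lastino = firstino + inopb - 1; /* 1023 */

	printf("inode %u lives in [%u, %u]\n", ino, firstino, lastino);
	return (0);
}
#endif
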
4515/*
4516 * Function to determine if the buffer has outstanding dependencies
4517 * that will cause a roll-back if the buffer is written. If wantcount
4518 * is set, return the number of dependencies; otherwise just yes or no.
4519 */
4520static int
4521softdep_count_dependencies(bp, wantcount)
4522	struct buf *bp;
4523	int wantcount;
4524{
4525	struct worklist *wk;
4526	struct inodedep *inodedep;
4527	struct indirdep *indirdep;
4528	struct allocindir *aip;
4529	struct pagedep *pagedep;
4530	struct diradd *dap;
4531	int i, retval;
4532
4533	retval = 0;
4534	ACQUIRE_LOCK(&lk);
4535	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) {
4536		switch (wk->wk_type) {
4537
4538		case D_INODEDEP:
4539			inodedep = WK_INODEDEP(wk);
4540			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4541				/* bitmap allocation dependency */
4542				retval += 1;
4543				if (!wantcount)
4544					goto out;
4545			}
4546			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4547				/* direct block pointer dependency */
4548				retval += 1;
4549				if (!wantcount)
4550					goto out;
4551			}
4552			continue;
4553
4554		case D_INDIRDEP:
4555			indirdep = WK_INDIRDEP(wk);
4556			for (aip = LIST_FIRST(&indirdep->ir_deplisthd);
4557			     aip; aip = LIST_NEXT(aip, ai_next)) {
4558				/* indirect block pointer dependency */
4559				retval += 1;
4560				if (!wantcount)
4561					goto out;
4562			}
4563			continue;
4564
4565		case D_PAGEDEP:
4566			pagedep = WK_PAGEDEP(wk);
4567			for (i = 0; i < DAHASHSZ; i++) {
4568				for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]);
4569				     dap; dap = LIST_NEXT(dap, da_pdlist)) {
4570					/* directory entry dependency */
4571					retval += 1;
4572					if (!wantcount)
4573						goto out;
4574				}
4575			}
4576			continue;
4577
4578		case D_BMSAFEMAP:
4579		case D_ALLOCDIRECT:
4580		case D_ALLOCINDIR:
4581		case D_MKDIR:
4582			/* never a dependency on these blocks */
4583			continue;
4584
4585		default:
4586			panic("softdep_count_dependencies: Unexpected type %s",
4587			    TYPENAME(wk->wk_type));
4588			/* NOTREACHED */
4589		}
4590	}
4591out:
4592	FREE_LOCK(&lk);
4593	return (retval);
4594}
4595
4596/*
4597 * Acquire exclusive access to a buffer.
4598 * Must be called with splbio blocked.
4599 * Return 1 if buffer was acquired.
4600 */
4601static int
4602getdirtybuf(bpp, waitfor)
4603	struct buf **bpp;
4604	int waitfor;
4605{
4606	struct buf *bp;
4607
4608	for (;;) {
4609		if ((bp = *bpp) == NULL)
4610			return (0);
4611		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
4612			if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
4613				break;
4614			BUF_UNLOCK(bp);
4615			if (waitfor != MNT_WAIT)
4616				return (0);
4617			bp->b_xflags |= BX_BKGRDWAIT;
4618			FREE_LOCK_INTERLOCKED(&lk);
4619			tsleep(&bp->b_xflags, PRIBIO, "getbuf", 0);
4620			ACQUIRE_LOCK_INTERLOCKED(&lk);
4621			continue;
4622		}
4623		if (waitfor != MNT_WAIT)
4624			return (0);
4625		FREE_LOCK_INTERLOCKED(&lk);
4626		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK)
4627			panic("getdirtybuf: inconsistent lock");
4628		ACQUIRE_LOCK_INTERLOCKED(&lk);
4629	}
4630	if ((bp->b_flags & B_DELWRI) == 0) {
4631		BUF_UNLOCK(bp);
4632		return (0);
4633	}
4634	bremfree(bp);
4635	return (1);
4636}
4637
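/*
 * Illustrative sketch (editorial addition, not part of this revision): the
 * try-then-sleep acquisition above approximated with POSIX mutexes. The
 * fast path is a trylock; a caller willing to wait blocks for the lock and
 * must revalidate the buffer's state afterwards, since it may have been
 * cleaned while the caller slept. The skbuf type is invented; the block is
 * compiled out with #if 0 and would need -lpthread if built.
 */
#if 0
#include <pthread.h>

struct skbuf {
	pthread_mutex_t	lock;
	int		dirty;
};

/* Returns 1 with the buffer locked if it is still dirty, else 0. */
static int
get_dirty_buf(struct skbuf *b, int willwait)
{
	if (pthread_mutex_trylock(&b->lock) != 0) {
		if (!willwait)
			return (0);
		pthread_mutex_lock(&b->lock);	/* sleep until available */
	}
	if (!b->dirty) {			/* revalidate after waiting */
		pthread_mutex_unlock(&b->lock);
		return (0);
	}
	return (1);
}

int
main(void)
{
	struct skbuf b = { PTHREAD_MUTEX_INITIALIZER, 1 };

	return (get_dirty_buf(&b, 1) ? 0 : 1);
}
#endif
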
4638/*
4639 * Wait for pending output on a vnode to complete.
4640 * Must be called with vnode locked.
4641 */
4642static void
4643drain_output(vp, islocked)
4644	struct vnode *vp;
4645	int islocked;
4646{
4647
4648	if (!islocked)
4649		ACQUIRE_LOCK(&lk);
4650	while (vp->v_numoutput) {
4651		vp->v_flag |= VBWAIT;
4652		FREE_LOCK_INTERLOCKED(&lk);
4653		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0);
4654		ACQUIRE_LOCK_INTERLOCKED(&lk);
4655	}
4656	if (!islocked)
4657		FREE_LOCK(&lk);
4658}
4659
4660/*
4661 * Called whenever a buffer that is being invalidated or reallocated
4662 * contains dependencies. This should only happen if an I/O error has
4663 * occurred. The routine is called with the buffer locked.
4664 */
4665static void
4666softdep_deallocate_dependencies(bp)
4667	struct buf *bp;
4668{
4669
4670	if ((bp->b_ioflags & BIO_ERROR) == 0)
4671		panic("softdep_deallocate_dependencies: dangling deps");
4672	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4673	panic("softdep_deallocate_dependencies: unrecovered I/O error");
4674}
4675
4676/*
4677 * Function to handle asynchronous write errors in the filesystem.
4678 */
4679void
4680softdep_error(func, error)
4681	char *func;
4682	int error;
4683{
4684
4685	/* XXX should do something better! */
4686	printf("%s: got error %d while accessing filesystem\n", func, error);
4687}
4688