ffs_softdep.c revision 56208
/*
 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * The following are the copyrights and redistribution conditions that
 * apply to this copy of the soft update software. For a license
 * to use, redistribute or sell the soft update software under
 * conditions other than those described here, please contact the
 * author at one of the following addresses:
 *
 *	Marshall Kirk McKusick		mckusick@mckusick.com
 *	1614 Oxford Street		+1-510-843-9542
 *	Berkeley, CA 94709-1608
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. None of the names of McKusick, Ganger, Patt, or the University of
 *    Michigan may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 * 4. Redistributions in any form must be accompanied by information on
 *    how to obtain complete source code for any accompanying software
 *    that uses this software. This source code must either be included
 *    in the distribution or be available for no more than the cost of
 *    distribution plus a nominal fee, and must be freely redistributable
 *    under reasonable conditions. For an executable file, complete
 *    source code means the source code for all modules it contains.
 *    It does not mean source code for modules or files that typically
 *    accompany the operating system on which the executable file runs,
 *    e.g., standard library modules or system header files.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.55 (McKusick) 1/17/00
 * $FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 56208 2000-01-18 01:30:03Z mckusick $
 */

/*
 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
 */
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

/*
 * These definitions need to be adapted to the system to which
 * this file is being ported.
 */
/*
 * malloc types defined for the softdep system.
 */
MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");

#define	D_PAGEDEP	0
#define	D_INODEDEP	1
#define	D_NEWBLK	2
#define	D_BMSAFEMAP	3
#define	D_ALLOCDIRECT	4
#define	D_INDIRDEP	5
#define	D_ALLOCINDIR	6
#define	D_FREEFRAG	7
#define	D_FREEBLKS	8
#define	D_FREEFILE	9
#define	D_DIRADD	10
#define	D_MKDIR		11
#define	D_DIRREM	12
#define D_LAST		D_DIRREM

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_NEWBLK,
	M_BMSAFEMAP,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM
};

#define DtoM(type) (memtype[type])

/*
 * Names of malloc types.
 */
#define TYPENAME(type)  \
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
#define CURPROC curproc
/*
 * End system adaptation definitions.
 */

/*
 * Internal function prototypes.
 */
static	void softdep_error __P((char *, int));
static	void drain_output __P((struct vnode *, int));
static	int getdirtybuf __P((struct buf **, int));
static	void clear_remove __P((struct proc *));
static	void clear_inodedeps __P((struct proc *));
static	int flush_pagedep_deps __P((struct vnode *, struct mount *,
	    struct diraddhd *));
static	int flush_inodedep_deps __P((struct fs *, ino_t));
static	int handle_written_filepage __P((struct pagedep *, struct buf *));
static  void diradd_inode_written __P((struct diradd *, struct inodedep *));
static	int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_allocdirect_partdone __P((struct allocdirect *));
static	void handle_allocindir_partdone __P((struct allocindir *));
static	void initiate_write_filepage __P((struct pagedep *, struct buf *));
static	void handle_written_mkdir __P((struct mkdir *, int));
static	void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_workitem_freefile __P((struct freefile *));
static	void handle_workitem_remove __P((struct dirrem *));
static	struct dirrem *newdirrem __P((struct buf *, struct inode *,
	    struct inode *, int, struct dirrem **));
static	void free_diradd __P((struct diradd *));
static	void free_allocindir __P((struct allocindir *, struct inodedep *));
static	int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	    long *));
static	void deallocate_dependencies __P((struct buf *, struct inodedep *));
static	void free_allocdirect __P((struct allocdirectlst *,
	    struct allocdirect *, int));
static	int free_inodedep __P((struct inodedep *));
static	void handle_workitem_freeblocks __P((struct freeblks *));
static	void merge_inode_lists __P((struct inodedep *));
static	void setup_allocindir_phase2 __P((struct buf *, struct inode *,
	    struct allocindir *));
static	struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
	    ufs_daddr_t));
static	void handle_workitem_freefrag __P((struct freefrag *));
static	struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
static	void allocdirect_merge __P((struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *));
static	struct bmsafemap *bmsafemap_lookup __P((struct buf *));
static	int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	    struct newblk **));
static	int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	    struct pagedep **));
static	void pause_timer __P((void *));
static	int request_cleanup __P((int, int));
static	void add_to_worklist __P((struct worklist *));

/*
 * Exported softdep operations.
 */
static	void softdep_disk_io_initiation __P((struct buf *));
static	void softdep_disk_write_complete __P((struct buf *));
static	void softdep_deallocate_dependencies __P((struct buf *));
static	int softdep_fsync __P((struct vnode *));
static	int softdep_process_worklist __P((struct mount *));
static	void softdep_move_dependencies __P((struct buf *, struct buf *));
static	int softdep_count_dependencies __P((struct buf *bp, int));

struct bio_ops bioops = {
	softdep_disk_io_initiation,		/* io_start */
	softdep_disk_write_complete,		/* io_complete */
	softdep_deallocate_dependencies,	/* io_deallocate */
	softdep_fsync,				/* io_fsync */
	softdep_process_worklist,		/* io_sync */
	softdep_move_dependencies,		/* io_movedeps */
	softdep_count_dependencies,		/* io_countdeps */
};

/*
 * Locking primitives.
 *
 * For a uniprocessor, all we need to do is protect against disk
 * interrupts. For a multiprocessor, this lock would have to be
 * a mutex. A single mutex is used throughout this file, though
 * finer grain locking could be used if contention warranted it.
 *
 * For a multiprocessor, the sleep call would accept a lock and
 * release it after the sleep processing was complete. In a uniprocessor
 * implementation there is no such interlock, so we simply mark
 * the places where it needs to be done with the `interlocked' form
 * of the lock calls. Since the uniprocessor sleep already interlocks
 * the spl, there is nothing that really needs to be done.
 */
#ifndef /* NOT */ DEBUG
static struct lockit {
	int	lkt_spl;
} lk = { 0 };
#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
#define ACQUIRE_LOCK_INTERLOCKED(lk)
#define FREE_LOCK_INTERLOCKED(lk)

#else /* DEBUG */
static struct lockit {
	int	lkt_spl;
	pid_t	lkt_held;
} lk = { 0, -1 };
static int lockcnt;

static	void acquire_lock __P((struct lockit *));
static	void free_lock __P((struct lockit *));
static	void acquire_lock_interlocked __P((struct lockit *));
static	void free_lock_interlocked __P((struct lockit *));

#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
#define FREE_LOCK(lk)			free_lock(lk)
#define ACQUIRE_LOCK_INTERLOCKED(lk)	acquire_lock_interlocked(lk)
#define FREE_LOCK_INTERLOCKED(lk)	free_lock_interlocked(lk)

static void
acquire_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1) {
		if (lk->lkt_held == CURPROC->p_pid)
			panic("softdep_lock: locking against myself");
		else
			panic("softdep_lock: lock held by %d", lk->lkt_held);
	}
	lk->lkt_spl = splbio();
	lk->lkt_held = CURPROC->p_pid;
	lockcnt++;
}

static void
free_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock: lock not held");
	lk->lkt_held = -1;
	splx(lk->lkt_spl);
}

static void
acquire_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1) {
		if (lk->lkt_held == CURPROC->p_pid)
			panic("softdep_lock_interlocked: locking against self");
		else
			panic("softdep_lock_interlocked: lock held by %d",
			    lk->lkt_held);
	}
	lk->lkt_held = CURPROC->p_pid;
	lockcnt++;
}

static void
free_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock_interlocked: lock not held");
	lk->lkt_held = -1;
}
#endif /* DEBUG */
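
/*
 * Example of the locking discipline used throughout this file (an
 * illustrative sketch, not a routine of its own): the lock is held
 * around all manipulation of dependency structures and is dropped
 * before calling anything that may sleep or initiate I/O.
 *
 *	ACQUIRE_LOCK(&lk);
 *	... look up and modify dependency structures ...
 *	FREE_LOCK(&lk);
 *	... bread/bdwrite or other operations that may sleep ...
 */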

/*
 * Place holder for real semaphores.
 */
struct sema {
	int	value;
	pid_t	holder;
	char	*name;
	int	prio;
	int	timo;
};
static	void sema_init __P((struct sema *, char *, int, int));
static	int sema_get __P((struct sema *, struct lockit *));
static	void sema_release __P((struct sema *));

static void
sema_init(semap, name, prio, timo)
	struct sema *semap;
	char *name;
	int prio, timo;
{

	semap->holder = -1;
	semap->value = 0;
	semap->name = name;
	semap->prio = prio;
	semap->timo = timo;
}

static int
sema_get(semap, interlock)
	struct sema *semap;
	struct lockit *interlock;
{

	if (semap->value++ > 0) {
		if (interlock != NULL)
			FREE_LOCK_INTERLOCKED(interlock);
		tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
		if (interlock != NULL) {
			ACQUIRE_LOCK_INTERLOCKED(interlock);
			FREE_LOCK(interlock);
		}
		return (0);
	}
	semap->holder = CURPROC->p_pid;
	if (interlock != NULL)
		FREE_LOCK(interlock);
	return (1);
}

static void
sema_release(semap)
	struct sema *semap;
{

	if (semap->value <= 0 || semap->holder != CURPROC->p_pid)
		panic("sema_release: not held");
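	/*
	 * If others were waiting, collapse the count to zero and wake
	 * them all; each waiter returns 0 from sema_get and retries its
	 * operation from the top.
	 */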
	if (--semap->value > 0) {
		semap->value = 0;
		wakeup(semap);
	}
	semap->holder = -1;
}
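
/*
 * A minimal sketch of how these semaphores are used by the lookup
 * routines below to serialize creation of a new structure while the
 * lock is dropped for a MALLOC that may sleep ("foo" stands for one
 * of pagedep, inodedep, or newblk):
 *
 *	top:
 *		... search the hash chain; if found, return the entry ...
 *		if (sema_get(&foo_in_progress, &lk) == 0) {
 *			ACQUIRE_LOCK(&lk);
 *			goto top;	(lost the race, so search again)
 *		}
 *		... MALLOC and initialize the new entry ...
 *		ACQUIRE_LOCK(&lk);
 *		... insert the entry into the hash chain ...
 *		sema_release(&foo_in_progress);
 */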

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))

#else /* DEBUG */
static	void worklist_insert __P((struct workhead *, struct worklist *));
static	void worklist_remove __P((struct worklist *));
static	void workitem_free __P((struct worklist *, int));

#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)

static void
worklist_insert(head, item)
	struct workhead *head;
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_insert: lock not held");
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: already on list");
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item)
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_remove: lock not held");
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: not on list");
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{

	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: still on list");
	if (item->wk_type != type)
		panic("workitem_free: type mismatch");
	FREE(item, DtoM(type));
}
#endif /* DEBUG */
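
/*
 * Typical life cycle of a work item, as an illustrative sketch using
 * an allocdirect: the item is attached to the buffer whose write it
 * depends upon, removed when that write completes, and freed once its
 * dependencies have been satisfied.
 *
 *	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 *	... the disk write of bp completes ...
 *	WORKLIST_REMOVE(&adp->ad_list);
 *	WORKITEM_FREE(adp, D_ALLOCDIRECT);
 */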

/*
 * Workitem queue management
 */
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;
static int max_softdeps;	/* maximum number of structs before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static struct proc *filesys_syncer; /* proc of filesystem syncer process */
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
#define FLUSH_INODES	1
static int req_clear_remove;	/* syncer process flush some freeblks */
#define FLUSH_REMOVE	2
/*
 * runtime statistics
 */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
#endif /* DEBUG */

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{
	static struct worklist *worklist_tail;

	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	worklist_tail = wk;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that everything is done in the order in which it
 * appears in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 */
static int
softdep_process_worklist(matchmnt)
	struct mount *matchmnt;
{
	struct proc *p = CURPROC;
	struct worklist *wk;
	struct fs *matchfs;
	int matchcnt, loopcount;

	/*
	 * Record the process identifier of our caller so that we can give
	 * this process preferential treatment in request_cleanup below.
	 */
	filesys_syncer = p;
	matchcnt = 0;
	matchfs = NULL;
	if (matchmnt != NULL)
		matchfs = VFSTOUFS(matchmnt)->um_fs;
	/*
	 * There is no danger of having multiple processes run this
	 * code. It is single threaded solely so that softdep_flushfiles
	 * (below) can get an accurate count of the number of items
	 * related to its mount point that are in the list.
	 */
	if (softdep_worklist_busy && matchmnt == NULL)
		return (-1);
	/*
	 * If requested, try removing inode or removal dependencies.
	 */
	if (req_clear_inodedeps) {
		clear_inodedeps(p);
		req_clear_inodedeps = 0;
		wakeup(&proc_waiting);
	}
	if (req_clear_remove) {
		clear_remove(p);
		req_clear_remove = 0;
		wakeup(&proc_waiting);
	}
	ACQUIRE_LOCK(&lk);
	loopcount = 1;
	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
		WORKLIST_REMOVE(wk);
		FREE_LOCK(&lk);
		switch (wk->wk_type) {

		case D_DIRREM:
			/* removal of a directory entry */
			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
				matchcnt += 1;
			handle_workitem_remove(WK_DIRREM(wk));
			break;

		case D_FREEBLKS:
			/* releasing blocks and/or fragments from a file */
			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freeblocks(WK_FREEBLKS(wk));
			break;

		case D_FREEFRAG:
			/* releasing a fragment when replaced as a file grows */
			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefrag(WK_FREEFRAG(wk));
			break;

		case D_FREEFILE:
			/* releasing an inode when its link count drops to 0 */
			if (WK_FREEFILE(wk)->fx_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefile(WK_FREEFILE(wk));
			break;

		default:
			panic("%s_process_worklist: Unknown type %s",
			    "softdep", TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		if (softdep_worklist_busy && matchmnt == NULL)
			return (-1);
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(p);
			req_clear_inodedeps = 0;
			wakeup(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(p);
			req_clear_remove = 0;
			wakeup(&proc_waiting);
		}
		/*
		 * We do not generally want to stop for buffer space, but if
		 * we are really being a buffer hog, we will stop and wait.
		 */
		if (loopcount++ % 128 == 0)
			bwillwrite();
		ACQUIRE_LOCK(&lk);
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Move dependencies from one buffer to another.
 */
static void
softdep_move_dependencies(oldbp, newbp)
	struct buf *oldbp;
	struct buf *newbp;
{
	struct worklist *wk, *wktail;

	if (LIST_FIRST(&newbp->b_dep) != NULL)
		panic("softdep_move_dependencies: need merge code");
	wktail = 0;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
		LIST_REMOVE(wk, wk_list);
		if (wktail == 0)
			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
		else
			LIST_INSERT_AFTER(wktail, wk, wk_list);
		wktail = wk;
	}
	FREE_LOCK(&lk);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushfiles(oldmnt, flags, p)
	struct mount *oldmnt;
	int flags;
	struct proc *p;
{
	struct vnode *devvp;
	int error, loopcnt;

	/*
	 * Await our turn to clear out the queue.
	 */
	while (softdep_worklist_busy)
		tsleep(&lbolt, PRIBIO, "softflush", 0);
	softdep_worklist_busy = 1;
	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
		softdep_worklist_busy = 0;
		return (error);
	}
	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	devvp = VFSTOUFS(oldmnt)->um_devvp;
	for (loopcnt = 10; loopcnt > 0; ) {
		if (softdep_process_worklist(oldmnt) == 0) {
			loopcnt--;
			/*
			 * Do another flush in case any vnodes were brought in
			 * as part of the cleanup operations.
			 */
			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
				break;
			/*
			 * If we still found nothing to do, we are really done.
			 */
			if (softdep_process_worklist(oldmnt) == 0)
				break;
		}
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
		VOP_UNLOCK(devvp, 0, p);
		if (error)
			break;
	}
	softdep_worklist_busy = 0;
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
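
/*
 * A minimal sketch of how the lookup routines are used (the callers
 * below are the real instances). Each returns 1 if the entry was
 * found and 0 otherwise, so with DEPALLOC set, a return of 0 means
 * that the entry was newly allocated:
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(fs, inum, DEPALLOC, &inodedep) == 0)
 *		... this is the first dependency for the inode ...
 *	FREE_LOCK(&lk);
 */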

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])
static struct sema pagedep_in_progress;

/*
 * Look up a pagedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(ip, lbn, flags, pagedeppp)
	struct inode *ip;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	struct mount *mp;
	int i;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("pagedep_lookup: lock not held");
#endif
	mp = ITOV(ip)->v_mount;
	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
top:
	for (pagedep = LIST_FIRST(pagedephd); pagedep;
	     pagedep = LIST_NEXT(pagedep, pd_hash))
		if (ip->i_number == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_mnt)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*pagedeppp = NULL;
		return (0);
	}
	if (sema_get(&pagedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
		M_WAITOK);
	bzero(pagedep, sizeof(struct pagedep));
	pagedep->pd_list.wk_type = D_PAGEDEP;
	pagedep->pd_mnt = mp;
	pagedep->pd_ino = ip->i_number;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	sema_release(&pagedep_in_progress);
	*pagedeppp = pagedep;
	return (0);
}

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
static long	num_inodedep;	/* number of inodedep allocated */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;

/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(fs, inum, flags, inodedeppp)
	struct fs *fs;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	int firsttry;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("inodedep_lookup: lock not held");
#endif
	firsttry = 1;
	inodedephd = INODEDEP_HASH(fs, inum);
top:
	for (inodedep = LIST_FIRST(inodedephd); inodedep;
	     inodedep = LIST_NEXT(inodedep, id_hash))
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*inodedeppp = NULL;
		return (0);
	}
	/*
	 * If we are over our limit, try to improve the situation.
	 */
	if (num_inodedep > max_softdeps && firsttry && speedup_syncer() == 0 &&
	    request_cleanup(FLUSH_INODES, 1)) {
		firsttry = 0;
		goto top;
	}
	if (sema_get(&inodedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	num_inodedep += 1;
	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
		M_INODEDEP, M_WAITOK);
	inodedep->id_list.wk_type = D_INODEDEP;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_buf = NULL;
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	sema_release(&inodedep_in_progress);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static struct sema newblk_in_progress;

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(fs, newblkno, flags, newblkpp)
	struct fs *fs;
	ufs_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(fs, newblkno);
top:
	for (newblk = LIST_FIRST(newblkhd); newblk;
	     newblk = LIST_NEXT(newblk, nb_hash))
		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
			break;
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*newblkpp = NULL;
		return (0);
	}
	if (sema_get(&newblk_in_progress, 0) == 0)
		goto top;
	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
		M_NEWBLK, M_WAITOK);
	newblk->nb_state = 0;
	newblk->nb_fs = fs;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	sema_release(&newblk_in_progress);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during filesystem initialization before
 * mounting any file systems.
 */
void
softdep_initialize()
{

	LIST_INIT(&mkdirlisthd);
	LIST_INIT(&softdep_workitem_pending);
	max_softdeps = desiredvnodes * 8;
	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	    &pagedep_hash);
	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
}

/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum cstotal;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	mp->mnt_flag &= ~MNT_ASYNC;
	mp->mnt_flag |= MNT_SOFTDEP;
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync, so we have
	 * to scan the cylinder groups and recalculate them.
	 */
	if (fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("ffs_mountfs: superblock updated for soft updates\n");
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a file system
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs file system maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not.  (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */
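
/*
 * For example, when a new block is allocated for a file, both the
 * cylinder group map showing the block as in use and the contents of
 * the block itself must reach the disk before the on-disk inode (or
 * indirect block) pointer that references it. The bmsafemap and newblk
 * structures below track the first requirement; the allocdirect and
 * allocindir structures described later track the second.
 */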

/*
 * Called just after updating the cylinder group block to allocate an inode.
 */
void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;		/* buffer for cylgroup block with inode map */
	struct inode *ip;	/* inode related to allocation */
	ino_t newinum;		/* new inode number being allocated */
{
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated inode.
	 * Panic if it already exists as something is seriously wrong.
	 * Otherwise add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
		panic("softdep_setup_inomapdep: found inode");
	inodedep->id_buf = bp;
	inodedep->id_state &= ~DEPCOMPLETE;
	bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

/*
 * Called just after updating the cylinder group block to
 * allocate block or fragment.
 */
void
softdep_setup_blkmapdep(bp, fs, newblkno)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct fs *fs;		/* filesystem doing allocation */
	ufs_daddr_t newblkno;	/* number of newly allocated block */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
		panic("softdep_setup_blkmapdep: found block");
	ACQUIRE_LOCK(&lk);
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	FREE_LOCK(&lk);
}

/*
 * Find the bmsafemap associated with a cylinder group buffer.
 * If none exists, create one. The buffer must be locked when
 * this routine is called and this routine must be called with
 * splbio interrupts blocked.
 */
static struct bmsafemap *
bmsafemap_lookup(bp)
	struct buf *bp;
{
	struct bmsafemap *bmsafemap;
	struct worklist *wk;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("bmsafemap_lookup: lock not held");
#endif
	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list))
		if (wk->wk_type == D_BMSAFEMAP)
			return (WK_BMSAFEMAP(wk));
	FREE_LOCK(&lk);
	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
		M_BMSAFEMAP, M_WAITOK);
	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	bmsafemap->sm_list.wk_state = 0;
	bmsafemap->sm_buf = bp;
	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	LIST_INIT(&bmsafemap->sm_allocindirhd);
	LIST_INIT(&bmsafemap->sm_inodedephd);
	LIST_INIT(&bmsafemap->sm_newblkhd);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	return (bmsafemap);
}

/*
 * Direct block allocation dependencies.
 *
 * When a new block is allocated, the corresponding disk locations must be
 * initialized (with zeros or new data) before the on-disk inode points to
 * them.  Also, the freemap from which the block was allocated must be
 * updated (on disk) before the inode's pointer. These two dependencies are
 * independent of each other and are needed for all file blocks and indirect
 * blocks that are pointed to directly by the inode.  Just before the
 * "in-core" version of the inode is updated with a newly allocated block
 * number, a procedure (below) is called to setup allocation dependency
 * structures.  These structures are removed when the corresponding
 * dependencies are satisfied or when the block allocation becomes obsolete
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
 * fragment that gets upgraded).  All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete).  The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains.  This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
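
/*
 * A minimal sketch of the expected call sequence from the block
 * allocation code (argument names are illustrative), made just before
 * the new block number is stored in the in-core inode:
 *
 *	bp = getblk(...);		(buffer for the new block)
 *	softdep_setup_allocdirect(ip, lbn, newb, oldb, nsize, osize, bp);
 *	ip->i_db[lbn] = newb;
 */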
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_WAITOK);
	bzero(adp, sizeof(struct allocdirect));
	adp->ad_list.wk_type = D_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	adp->ad_inodedep = inodedep;

	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	for (oldadp = TAILQ_FIRST(adphead); oldadp;
	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("allocdirect_merge: lock not held");
#endif
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
		    NDADDR);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_allocdirect. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		freefrag = newadp->ad_freefrag;
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	free_allocdirect(adphead, oldadp, 0);
}

/*
 * Allocate a new freefrag structure if needed.
 */
static struct freefrag *
newfreefrag(ip, blkno, size)
	struct inode *ip;
	ufs_daddr_t blkno;
	long size;
{
	struct freefrag *freefrag;
	struct fs *fs;

	if (blkno == 0)
		return (NULL);
	fs = ip->i_fs;
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
		M_FREEFRAG, M_WAITOK);
	freefrag->ff_list.wk_type = D_FREEFRAG;
	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_fs = fs;
	freefrag->ff_devvp = ip->i_devvp;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;
	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct inode tip;

	tip.i_fs = freefrag->ff_fs;
	tip.i_devvp = freefrag->ff_devvp;
	tip.i_dev = freefrag->ff_devvp->v_rdev;
	tip.i_number = freefrag->ff_inum;
	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	FREE(freefrag, M_FREEFRAG);
}

/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here. Because an indirect block contains many pointers that
 * may have dependencies, a second copy of the entire in-memory indirect
 * block is kept. The buffer cache copy is always completely up-to-date.
 * The second copy, which is used only as a source for disk writes,
 * contains only the safe pointers (i.e., those that have no remaining
 * update dependencies). The second copy is freed when all pointers
 * are safe. The cache is not allowed to replace indirect blocks with
 * pending update dependencies. If a buffer containing an indirect
 * block with dependencies is written, these routines will mark it
 * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine in conjunction with
 * softdep_sync_metadata work together to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
 * procedures are used when setting up indirect block pointer
 * dependencies. The division is necessary because of the organization
 * of the "balloc" routine and because of the distinction between file
 * pages and file metadata blocks.
 */
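
/*
 * A sketch of the safe-copy mechanism (illustrative; the substitution
 * is performed by the disk I/O initiation routine): when an indirect
 * block with pending dependencies is about to be written, the buffer
 * contents are replaced by the saved copy that holds only the safe
 * pointers:
 *
 *	bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount);
 */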

/*
 * Allocate a new allocindir structure.
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
{
	struct allocindir *aip;

	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
		M_ALLOCINDIR, M_WAITOK);
	bzero(aip, sizeof(struct allocindir));
	aip->ai_list.wk_type = D_ALLOCINDIR;
	aip->ai_state = ATTACHED;
	aip->ai_offset = ptrno;
	aip->ai_newblkno = newblkno;
	aip->ai_oldblkno = oldblkno;
	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
 */
void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;	/* inode for file being extended */
	ufs_lbn_t lbn;		/* allocated block number within file */
	struct buf *bp;		/* buffer with indirect blk referencing page */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
	struct buf *nbp;	/* buffer holding allocated page */
{
	struct allocindir *aip;
	struct pagedep *pagedep;

	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	ACQUIRE_LOCK(&lk);
	/*
	 * If we are allocating a directory page, then we must
	 * allocate an associated pagedep to track additions and
	 * deletions.
	 */
	if ((ip->i_mode & IFMT) == IFDIR &&
	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called just before setting an indirect block pointer to a
 * newly allocated indirect block.
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
{
	struct allocindir *aip;

	aip = newallocindir(ip, ptrno, newblkno, 0);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
 */
static void
setup_allocindir_phase2(bp, ip, aip)
	struct buf *bp;		/* in-memory copy of the indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct allocindir *aip;	/* allocindir allocated by the above routines */
{
	struct worklist *wk;
	struct indirdep *indirdep, *newindirdep;
	struct bmsafemap *bmsafemap;
	struct allocindir *oldaip;
	struct freefrag *freefrag;
	struct newblk *newblk;

	if (bp->b_lblkno >= 0)
		panic("setup_allocindir_phase2: not indir blk");
	for (indirdep = NULL, newindirdep = NULL; ; ) {
		ACQUIRE_LOCK(&lk);
		for (wk = LIST_FIRST(&bp->b_dep); wk;
		     wk = LIST_NEXT(wk, wk_list)) {
			if (wk->wk_type != D_INDIRDEP)
				continue;
			indirdep = WK_INDIRDEP(wk);
			break;
		}
		if (indirdep == NULL && newindirdep) {
			indirdep = newindirdep;
			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
			newindirdep = NULL;
		}
		FREE_LOCK(&lk);
		if (indirdep) {
			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
			    &newblk) == 0)
				panic("setup_allocindir: lost block");
			ACQUIRE_LOCK(&lk);
			if (newblk->nb_state == DEPCOMPLETE) {
				aip->ai_state |= DEPCOMPLETE;
				aip->ai_buf = NULL;
			} else {
				bmsafemap = newblk->nb_bmsafemap;
				aip->ai_buf = bmsafemap->sm_buf;
				LIST_REMOVE(newblk, nb_deps);
				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
				    aip, ai_deps);
			}
			LIST_REMOVE(newblk, nb_hash);
			FREE(newblk, M_NEWBLK);
			aip->ai_indirdep = indirdep;
			/*
			 * Check to see if there is an existing dependency
			 * for this block. If there is, merge the old
			 * dependency into the new one.
			 */
			if (aip->ai_oldblkno == 0)
				oldaip = NULL;
			else
				for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd);
				    oldaip; oldaip = LIST_NEXT(oldaip, ai_next))
					if (oldaip->ai_offset == aip->ai_offset)
						break;
			if (oldaip != NULL) {
				if (oldaip->ai_newblkno != aip->ai_oldblkno)
					panic("setup_allocindir_phase2: blkno");
				aip->ai_oldblkno = oldaip->ai_oldblkno;
				freefrag = oldaip->ai_freefrag;
				oldaip->ai_freefrag = aip->ai_freefrag;
				aip->ai_freefrag = freefrag;
				free_allocindir(oldaip, NULL);
			}
			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
			    [aip->ai_offset] = aip->ai_oldblkno;
			FREE_LOCK(&lk);
		}
		if (newindirdep) {
			if (indirdep->ir_savebp != NULL)
				brelse(newindirdep->ir_savebp);
			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
		}
		if (indirdep)
			break;
		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
			M_INDIRDEP, M_WAITOK);
		newindirdep->ir_list.wk_type = D_INDIRDEP;
		newindirdep->ir_state = ATTACHED;
		LIST_INIT(&newindirdep->ir_deplisthd);
		LIST_INIT(&newindirdep->ir_donehd);
		if (bp->b_blkno == bp->b_lblkno) {
			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
				NULL, NULL);
		}
		newindirdep->ir_savebp =
		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
		BUF_KERNPROC(newindirdep->ir_savebp);
		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
	}
}

/*
 * Block de-allocation dependencies.
 *
 * When blocks are de-allocated, the on-disk pointers must be nullified before
 * the blocks are made available for use by other files.  (The true
 * requirement is that old pointers must be nullified before new on-disk
 * pointers are set.  We chose this slightly more stringent requirement to
 * reduce complexity.) Our implementation handles this dependency by updating
 * the inode (or indirect block) appropriately but delaying the actual block
 * de-allocation (i.e., freemap and free space count manipulation) until
 * after the updated versions reach stable storage.  After the disk is
 * updated, the blocks can be safely de-allocated whenever it is convenient.
 * This implementation handles only the common case of reducing a file's
 * length to zero. Other cases are handled by the conventional synchronous
 * write approach.
 *
 * The ffs implementation with which we worked double-checks
 * the state of the block pointers and file size as it reduces
 * a file's length.  Some of this code is replicated here in our
 * soft updates implementation.  The freeblks->fb_chkcnt field is
 * used to transfer a part of this information to the procedure
 * that eventually de-allocates the blocks.
 *
 * This routine should be called from the routine that shortens
 * a file's length, before the inode's size or block pointers
 * are modified. It will save the block pointer information for
 * later release and zero the inode so that the calling routine
 * can release it.
 */
1596void
1597softdep_setup_freeblocks(ip, length)
1598	struct inode *ip;	/* The inode whose length is to be reduced */
1599	off_t length;		/* The new length for the file */
1600{
1601	struct freeblks *freeblks;
1602	struct inodedep *inodedep;
1603	struct allocdirect *adp;
1604	struct vnode *vp;
1605	struct buf *bp;
1606	struct fs *fs;
1607	int i, error;
1608
1609	fs = ip->i_fs;
1610	if (length != 0)
1611		panic("softdep_setup_freeblocks: non-zero length");
1612	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1613		M_FREEBLKS, M_WAITOK);
1614	bzero(freeblks, sizeof(struct freeblks));
1615	freeblks->fb_list.wk_type = D_FREEBLKS;
1616	freeblks->fb_uid = ip->i_uid;
1617	freeblks->fb_previousinum = ip->i_number;
1618	freeblks->fb_devvp = ip->i_devvp;
1619	freeblks->fb_fs = fs;
1620	freeblks->fb_oldsize = ip->i_size;
1621	freeblks->fb_newsize = length;
1622	freeblks->fb_chkcnt = ip->i_blocks;
1623	for (i = 0; i < NDADDR; i++) {
1624		freeblks->fb_dblks[i] = ip->i_db[i];
1625		ip->i_db[i] = 0;
1626	}
1627	for (i = 0; i < NIADDR; i++) {
1628		freeblks->fb_iblks[i] = ip->i_ib[i];
1629		ip->i_ib[i] = 0;
1630	}
1631	ip->i_blocks = 0;
1632	ip->i_size = 0;
1633	/*
1634	 * Push the zero'ed inode to its disk buffer so that we are free
1635	 * to delete its dependencies below. Once the dependencies are gone
1636	 * the buffer can be safely released.
1637	 */
1638	if ((error = bread(ip->i_devvp,
1639	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1640	    (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1641		softdep_error("softdep_setup_freeblocks", error);
1642	*((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
1643	    ip->i_din;
1644	/*
1645	 * Find and eliminate any inode dependencies.
1646	 */
1647	ACQUIRE_LOCK(&lk);
1648	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1649	if ((inodedep->id_state & IOSTARTED) != 0)
1650		panic("softdep_setup_freeblocks: inode busy");
1651	/*
1652	 * Because the file length has been truncated to zero, any
1653	 * pending block allocation dependency structures associated
1654	 * with this inode are obsolete and can simply be de-allocated.
1655	 * We must first merge the two dependency lists to get rid of
1656	 * any duplicate freefrag structures, then purge the merged list.
1657	 */
1658	merge_inode_lists(inodedep);
1659	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1660		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
1661	FREE_LOCK(&lk);
1662	bdwrite(bp);
1663	/*
1664	 * We must wait for any I/O in progress to finish so that
1665	 * all potential buffers on the dirty list will be visible.
1666	 * Once they are all there, walk the list and get rid of
1667	 * any dependencies.
1668	 */
1669	vp = ITOV(ip);
1670	ACQUIRE_LOCK(&lk);
1671	drain_output(vp, 1);
1672	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1673		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1674		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1675		deallocate_dependencies(bp, inodedep);
1676		bp->b_flags |= B_INVAL | B_NOCACHE;
1677		FREE_LOCK(&lk);
1678		brelse(bp);
1679		ACQUIRE_LOCK(&lk);
1680	}
1681	/*
1682	 * Add the freeblks structure to the list of operations that
1683	 * must await the zero'ed inode being written to disk. If we
1684	 * still have a bitmap dependency, then the inode has never been
1685	 * written to disk, so we can process the freeblks immediately.
1686	 * If the inodedep does not exist, then the zero'ed inode has
1687	 * been written and we can also proceed.
1688	 */
1689	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0 ||
1690	    free_inodedep(inodedep) ||
1691	    (inodedep->id_state & DEPCOMPLETE) == 0) {
1692		FREE_LOCK(&lk);
1693		handle_workitem_freeblocks(freeblks);
1694	} else {
1695		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1696		FREE_LOCK(&lk);
1697	}
1698}
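
/*
 * For illustration, a minimal sketch (not the actual ffs_truncate code;
 * the helper name is hypothetical) of the calling convention the routine
 * above expects: it must be called before the in-core size and block
 * pointers are changed, so the old pointers can be captured for the
 * deferred free:
 *
 *	static int
 *	example_truncate_to_zero(struct vnode *vp)
 *	{
 *		struct inode *ip = VTOI(vp);
 *
 *		softdep_setup_freeblocks(ip, (off_t)0);
 *		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 *		return (UFS_UPDATE(vp, 0));
 *	}
 */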
1699
1700/*
1701 * Reclaim any dependency structures from a buffer that is about to
1702 * be reallocated to a new vnode. The buffer must be locked, thus,
1703 * no I/O completion operations can occur while we are manipulating
1704 * its associated dependencies. The mutex is held so that other I/O's
1705 * associated with related dependencies do not occur.
1706 */
1707static void
1708deallocate_dependencies(bp, inodedep)
1709	struct buf *bp;
1710	struct inodedep *inodedep;
1711{
1712	struct worklist *wk;
1713	struct indirdep *indirdep;
1714	struct allocindir *aip;
1715	struct pagedep *pagedep;
1716	struct dirrem *dirrem;
1717	struct diradd *dap;
1718	int i;
1719
1720	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1721		switch (wk->wk_type) {
1722
1723		case D_INDIRDEP:
1724			indirdep = WK_INDIRDEP(wk);
1725			/*
1726			 * None of the indirect pointers will ever be visible,
1727			 * so they can simply be tossed. GOINGAWAY ensures
1728			 * that allocated pointers will be saved in the buffer
1729			 * cache until they are freed. Note that they will
1730			 * only be able to be found by their physical address
1731			 * since the inode mapping the logical address will
1732			 * be gone. The save buffer used for the safe copy
1733			 * was allocated in setup_allocindir_phase2 using
1734			 * the physical address so it could be used for this
1735			 * purpose. Hence we swap the safe copy with the real
1736			 * copy, allowing the safe copy to be freed and holding
1737			 * on to the real copy for later use in indir_trunc.
1738			 */
1739			if (indirdep->ir_state & GOINGAWAY)
1740				panic("deallocate_dependencies: already gone");
1741			indirdep->ir_state |= GOINGAWAY;
1742			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1743				free_allocindir(aip, inodedep);
1744			if (bp->b_lblkno >= 0 ||
1745			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
1746				panic("deallocate_dependencies: not indir");
1747			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1748			    bp->b_bcount);
1749			WORKLIST_REMOVE(wk);
1750			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1751			continue;
1752
1753		case D_PAGEDEP:
1754			pagedep = WK_PAGEDEP(wk);
1755			/*
1756			 * None of the directory additions will ever be
1757			 * visible, so they can simply be tossed.
1758			 */
1759			for (i = 0; i < DAHASHSZ; i++)
1760				while ((dap =
1761				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
1762					free_diradd(dap);
1763			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1764				free_diradd(dap);
1765			/*
1766			 * Copy any directory remove dependencies to the list
1767			 * to be processed after the zero'ed inode is written.
1768			 * If the inode has already been written, then they
1769			 * can be dumped directly onto the work list.
1770			 */
1771			while ((dirrem =
1772			    LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
1773				LIST_REMOVE(dirrem, dm_next);
1774				dirrem->dm_dirinum = pagedep->pd_ino;
1775				if (inodedep == NULL ||
1776				    (inodedep->id_state & ALLCOMPLETE) ==
1777				     ALLCOMPLETE)
1778					add_to_worklist(&dirrem->dm_list);
1779				else
1780					WORKLIST_INSERT(&inodedep->id_bufwait,
1781					    &dirrem->dm_list);
1782			}
1783			WORKLIST_REMOVE(&pagedep->pd_list);
1784			LIST_REMOVE(pagedep, pd_hash);
1785			WORKITEM_FREE(pagedep, D_PAGEDEP);
1786			continue;
1787
1788		case D_ALLOCINDIR:
1789			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1790			continue;
1791
1792		case D_ALLOCDIRECT:
1793		case D_INODEDEP:
1794			panic("deallocate_dependencies: Unexpected type %s",
1795			    TYPENAME(wk->wk_type));
1796			/* NOTREACHED */
1797
1798		default:
1799			panic("deallocate_dependencies: Unknown type %s",
1800			    TYPENAME(wk->wk_type));
1801			/* NOTREACHED */
1802		}
1803	}
1804}
1805
1806/*
1807 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1808 * This routine must be called with splbio interrupts blocked.
1809 */
1810static void
1811free_allocdirect(adphead, adp, delay)
1812	struct allocdirectlst *adphead;
1813	struct allocdirect *adp;
1814	int delay;
1815{
1816
1817#ifdef DEBUG
1818	if (lk.lkt_held == -1)
1819		panic("free_allocdirect: lock not held");
1820#endif
1821	if ((adp->ad_state & DEPCOMPLETE) == 0)
1822		LIST_REMOVE(adp, ad_deps);
1823	TAILQ_REMOVE(adphead, adp, ad_next);
1824	if ((adp->ad_state & COMPLETE) == 0)
1825		WORKLIST_REMOVE(&adp->ad_list);
1826	if (adp->ad_freefrag != NULL) {
1827		if (delay)
1828			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
1829			    &adp->ad_freefrag->ff_list);
1830		else
1831			add_to_worklist(&adp->ad_freefrag->ff_list);
1832	}
1833	WORKITEM_FREE(adp, D_ALLOCDIRECT);
1834}
1835
1836/*
1837 * Prepare an inode to be freed. The actual free operation is not
1838 * done until the zero'ed inode has been written to disk.
1839 */
1840void
1841softdep_freefile(pvp, ino, mode)
1842	struct vnode *pvp;
1843	ino_t ino;
1844	int mode;
1845{
1846	struct inode *ip = VTOI(pvp);
1847	struct inodedep *inodedep;
1848	struct freefile *freefile;
1849
1850	/*
1851	 * This sets up the inode de-allocation dependency.
1852	 */
1853	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1854		M_FREEFILE, M_WAITOK);
1855	freefile->fx_list.wk_type = D_FREEFILE;
1856	freefile->fx_list.wk_state = 0;
1857	freefile->fx_mode = mode;
1858	freefile->fx_oldinum = ino;
1859	freefile->fx_devvp = ip->i_devvp;
1860	freefile->fx_fs = ip->i_fs;
1861
1862	/*
1863	 * If the inodedep does not exist, then the zero'ed inode has
1864	 * been written to disk and we can free the file immediately.
1865	 */
1866	ACQUIRE_LOCK(&lk);
1867	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
1868		FREE_LOCK(&lk);
1869		handle_workitem_freefile(freefile);
1870		return;
1871	}
1872
1873	/*
1874	 * If we still have a bitmap dependency, then the inode has never
1875	 * been written to disk. Drop the dependency as it is no longer
1876	 * necessary since the inode is being deallocated. We set the
1877	 * ALLCOMPLETE flags since the bitmap now properly shows that the
1878	 * inode is not allocated. Even if the inode is actively being
1879	 * written, it has been rolled back to its zero'ed state, so we
1880	 * are ensured that a zero inode is what is on the disk. For short
1881	 * lived files, this change will usually result in removing all the
1882	 * dependencies from the inode so that it can be freed immediately.
1883	 */
1884	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
1885		inodedep->id_state |= ALLCOMPLETE;
1886		LIST_REMOVE(inodedep, id_deps);
1887		inodedep->id_buf = NULL;
1888		WORKLIST_REMOVE(&inodedep->id_list);
1889	}
1890	if (free_inodedep(inodedep) == 0) {
1891		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
1892		FREE_LOCK(&lk);
1893	} else {
1894		FREE_LOCK(&lk);
1895		handle_workitem_freefile(freefile);
1896	}
1897}
1898
1899/*
1900 * Try to free an inodedep structure. Return 1 if it could be freed.
1901 */
1902static int
1903free_inodedep(inodedep)
1904	struct inodedep *inodedep;
1905{
1906
1907	if ((inodedep->id_state & ONWORKLIST) != 0 ||
1908	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
1909	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
1910	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
1911	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
1912	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
1913	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
1914	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
1915		return (0);
1916	LIST_REMOVE(inodedep, id_hash);
1917	WORKITEM_FREE(inodedep, D_INODEDEP);
1918	num_inodedep -= 1;
1919	return (1);
1920}
1921
1922/*
1923 * This workitem routine performs the block de-allocation.
1924 * The workitem is added to the pending list after the updated
1925 * inode block has been written to disk.  As mentioned above,
1926 * checks regarding the number of blocks de-allocated (compared
1927 * to the number of blocks allocated for the file) are also
1928 * performed in this function.
1929 */
1930static void
1931handle_workitem_freeblocks(freeblks)
1932	struct freeblks *freeblks;
1933{
1934	struct inode tip;
1935	ufs_daddr_t bn;
1936	struct fs *fs;
1937	int i, level, bsize;
1938	long nblocks, blocksreleased = 0;
1939	int error, allerror = 0;
1940	ufs_lbn_t baselbns[NIADDR], tmpval;
1941
1942	tip.i_number = freeblks->fb_previousinum;
1943	tip.i_devvp = freeblks->fb_devvp;
1944	tip.i_dev = freeblks->fb_devvp->v_rdev;
1945	tip.i_fs = freeblks->fb_fs;
1946	tip.i_size = freeblks->fb_oldsize;
1947	tip.i_uid = freeblks->fb_uid;
1948	fs = freeblks->fb_fs;
1949	tmpval = 1;
1950	baselbns[0] = NDADDR;
1951	for (i = 1; i < NIADDR; i++) {
1952		tmpval *= NINDIR(fs);
1953		baselbns[i] = baselbns[i - 1] + tmpval;
1954	}
1955	nblocks = btodb(fs->fs_bsize);
1956	blocksreleased = 0;
1957	/*
1958	 * Indirect blocks first.
1959	 */
1960	for (level = (NIADDR - 1); level >= 0; level--) {
1961		if ((bn = freeblks->fb_iblks[level]) == 0)
1962			continue;
1963		if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
1964		    baselbns[level], &blocksreleased)) != 0)
1965			allerror = error;
1966		ffs_blkfree(&tip, bn, fs->fs_bsize);
1967		blocksreleased += nblocks;
1968	}
1969	/*
1970	 * All direct blocks or frags.
1971	 */
1972	for (i = (NDADDR - 1); i >= 0; i--) {
1973		if ((bn = freeblks->fb_dblks[i]) == 0)
1974			continue;
1975		bsize = blksize(fs, &tip, i);
1976		ffs_blkfree(&tip, bn, bsize);
1977		blocksreleased += btodb(bsize);
1978	}
1979
1980#ifdef DIAGNOSTIC
1981	if (freeblks->fb_chkcnt != blocksreleased)
1982		panic("handle_workitem_freeblocks: block count");
1983	if (allerror)
1984		softdep_error("handle_workitem_freeblocks", allerror);
1985#endif /* DIAGNOSTIC */
1986	WORKITEM_FREE(freeblks, D_FREEBLKS);
1987}
1988
1989/*
1990 * Release blocks associated with the inode ip and stored in the indirect
1991 * block dbn. If level is greater than SINGLE, the block is an indirect block
1992 * and recursive calls to indirtrunc must be used to cleanse other indirect
1993 * blocks.
1994 */
1995static int
1996indir_trunc(ip, dbn, level, lbn, countp)
1997	struct inode *ip;
1998	ufs_daddr_t dbn;
1999	int level;
2000	ufs_lbn_t lbn;
2001	long *countp;
2002{
2003	struct buf *bp;
2004	ufs_daddr_t *bap;
2005	ufs_daddr_t nb;
2006	struct fs *fs;
2007	struct worklist *wk;
2008	struct indirdep *indirdep;
2009	int i, lbnadd, nblocks;
2010	int error, allerror = 0;
2011
2012	fs = ip->i_fs;
2013	lbnadd = 1;
2014	for (i = level; i > 0; i--)
2015		lbnadd *= NINDIR(fs);
2016	/*
2017	 * Get buffer of block pointers to be freed. This routine is not
2018	 * called until the zero'ed inode has been written, so it is safe
2019	 * to free blocks as they are encountered. Because the inode has
2020	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2021	 * have to use the on-disk address and the block device for the
2022	 * filesystem to look them up. If the file was deleted before its
2023	 * indirect blocks were all written to disk, the routine that set
2024	 * us up (deallocate_dependencies) will have arranged to leave
2025	 * a complete copy of the indirect block in memory for our use.
2026	 * Otherwise we have to read the blocks in from the disk.
2027	 */
2028	ACQUIRE_LOCK(&lk);
2029	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2030	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2031		if (wk->wk_type != D_INDIRDEP ||
2032		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2033		    (indirdep->ir_state & GOINGAWAY) == 0)
2034			panic("indir_trunc: lost indirdep");
2035		WORKLIST_REMOVE(wk);
2036		WORKITEM_FREE(indirdep, D_INDIRDEP);
2037		if (LIST_FIRST(&bp->b_dep) != NULL)
2038			panic("indir_trunc: dangling dep");
2039		FREE_LOCK(&lk);
2040	} else {
2041		FREE_LOCK(&lk);
2042		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2043		if (error)
2044			return (error);
2045	}
2046	/*
2047	 * Recursively free indirect blocks.
2048	 */
2049	bap = (ufs_daddr_t *)bp->b_data;
2050	nblocks = btodb(fs->fs_bsize);
2051	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2052		if ((nb = bap[i]) == 0)
2053			continue;
2054		if (level != 0) {
2055			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2056			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2057				allerror = error;
2058		}
2059		ffs_blkfree(ip, nb, fs->fs_bsize);
2060		*countp += nblocks;
2061	}
2062	bp->b_flags |= B_INVAL | B_NOCACHE;
2063	brelse(bp);
2064	return (allerror);
2065}
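
/*
 * A worked example of the lbn arithmetic above, assuming an 8K-block
 * file system with 4-byte (ufs_daddr_t) pointers, so NINDIR(fs) == 2048:
 * at level 0 the buffer holds data-block pointers, lbnadd == 1, and
 * entry i maps lbn + i; at level 1 (a double indirect block) lbnadd ==
 * 2048 and entry i points to a single indirect block covering the 2048
 * lbns starting at lbn + i * 2048. In general, lbnadd is NINDIR(fs)
 * raised to the power level.
 */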
2066
2067/*
2068 * Free an allocindir.
2069 * This routine must be called with splbio interrupts blocked.
2070 */
2071static void
2072free_allocindir(aip, inodedep)
2073	struct allocindir *aip;
2074	struct inodedep *inodedep;
2075{
2076	struct freefrag *freefrag;
2077
2078#ifdef DEBUG
2079	if (lk.lkt_held == -1)
2080		panic("free_allocindir: lock not held");
2081#endif
2082	if ((aip->ai_state & DEPCOMPLETE) == 0)
2083		LIST_REMOVE(aip, ai_deps);
2084	if (aip->ai_state & ONWORKLIST)
2085		WORKLIST_REMOVE(&aip->ai_list);
2086	LIST_REMOVE(aip, ai_next);
2087	if ((freefrag = aip->ai_freefrag) != NULL) {
2088		if (inodedep == NULL)
2089			add_to_worklist(&freefrag->ff_list);
2090		else
2091			WORKLIST_INSERT(&inodedep->id_bufwait,
2092			    &freefrag->ff_list);
2093	}
2094	WORKITEM_FREE(aip, D_ALLOCINDIR);
2095}
2096
2097/*
2098 * Directory entry addition dependencies.
2099 *
2100 * When adding a new directory entry, the inode (with its incremented link
2101 * count) must be written to disk before the directory entry's pointer to it.
2102 * Also, if the inode is newly allocated, the corresponding freemap must be
2103 * updated (on disk) before the directory entry's pointer. These requirements
2104 * are met via undo/redo on the directory entry's pointer, which consists
2105 * simply of the inode number.
2106 *
2107 * As directory entries are added and deleted, the free space within a
2108 * directory block can become fragmented.  The ufs file system will compact
2109 * a fragmented directory block to make space for a new entry. When this
2110 * occurs, the offsets of previously added entries change. Any "diradd"
2111 * dependency structures corresponding to these entries must be updated with
2112 * the new offsets.
2113 */
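
/*
 * To make the undo/redo technique concrete, a minimal sketch of the
 * rollback applied to a new entry "ep" tracked by a diradd "dap" (the
 * real work is done in initiate_write_filepage and
 * handle_written_filepage below):
 *
 *	Undo, before the directory block is written:
 *		ep->d_ino = 0;
 *		dap->da_state &= ~ATTACHED;
 *		dap->da_state |= UNDONE;
 *
 *	Redo, once the write has completed:
 *		ep->d_ino = dap->da_newinum;
 *		dap->da_state &= ~UNDONE;
 *		dap->da_state |= ATTACHED;
 */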
2114
2115/*
2116 * This routine is called after the in-memory inode's link
2117 * count has been incremented, but before the directory entry's
2118 * pointer to the inode has been set.
2119 */
2120void
2121softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
2122	struct buf *bp;		/* buffer containing directory block */
2123	struct inode *dp;	/* inode for directory */
2124	off_t diroffset;	/* offset of new entry in directory */
2125	long newinum;		/* inode referenced by new directory entry */
2126	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2127{
2128	int offset;		/* offset of new entry within directory block */
2129	ufs_lbn_t lbn;		/* block in directory containing new entry */
2130	struct fs *fs;
2131	struct diradd *dap;
2132	struct pagedep *pagedep;
2133	struct inodedep *inodedep;
2134	struct mkdir *mkdir1, *mkdir2;
2135
2136	/*
2137	 * Whiteouts have no dependencies.
2138	 */
2139	if (newinum == WINO) {
2140		if (newdirbp != NULL)
2141			bdwrite(newdirbp);
2142		return;
2143	}
2144
2145	fs = dp->i_fs;
2146	lbn = lblkno(fs, diroffset);
2147	offset = blkoff(fs, diroffset);
2148	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
2149	bzero(dap, sizeof(struct diradd));
2150	dap->da_list.wk_type = D_DIRADD;
2151	dap->da_offset = offset;
2152	dap->da_newinum = newinum;
2153	dap->da_state = ATTACHED;
2154	if (newdirbp == NULL) {
2155		dap->da_state |= DEPCOMPLETE;
2156		ACQUIRE_LOCK(&lk);
2157	} else {
2158		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2159		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2160		    M_WAITOK);
2161		mkdir1->md_list.wk_type = D_MKDIR;
2162		mkdir1->md_state = MKDIR_BODY;
2163		mkdir1->md_diradd = dap;
2164		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2165		    M_WAITOK);
2166		mkdir2->md_list.wk_type = D_MKDIR;
2167		mkdir2->md_state = MKDIR_PARENT;
2168		mkdir2->md_diradd = dap;
2169		/*
2170		 * Dependency on "." and ".." being written to disk.
2171		 */
2172		mkdir1->md_buf = newdirbp;
2173		ACQUIRE_LOCK(&lk);
2174		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2175		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2176		FREE_LOCK(&lk);
2177		bdwrite(newdirbp);
2178		/*
2179		 * Dependency on link count increase for parent directory
2180		 */
2181		ACQUIRE_LOCK(&lk);
2182		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
2183		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2184			dap->da_state &= ~MKDIR_PARENT;
2185			WORKITEM_FREE(mkdir2, D_MKDIR);
2186		} else {
2187			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2188			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2189		}
2190	}
2191	/*
2192	 * Link into parent directory pagedep to await its being written.
2193	 */
2194	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2195		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2196	dap->da_pagedep = pagedep;
2197	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2198	    da_pdlist);
2199	/*
2200	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2201	 * is not yet written. If it is written, do the post-inode write
2202	 * processing to put it on the id_pendinghd list.
2203	 */
2204	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2205	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2206		diradd_inode_written(dap, inodedep);
2207	else
2208		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2209	FREE_LOCK(&lk);
2210}
2211
2212/*
2213 * This procedure is called to change the offset of a directory
2214 * entry when compacting a directory block which must be owned
2215 * exclusively by the caller. Note that the actual entry movement
2216 * must be done in this procedure to ensure that no I/O completions
2217 * occur while the move is in progress.
2218 */
2219void
2220softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2221	struct inode *dp;	/* inode for directory */
2222	caddr_t base;		/* address of dp->i_offset */
2223	caddr_t oldloc;		/* address of old directory location */
2224	caddr_t newloc;		/* address of new directory location */
2225	int entrysize;		/* size of directory entry */
2226{
2227	int offset, oldoffset, newoffset;
2228	struct pagedep *pagedep;
2229	struct diradd *dap;
2230	ufs_lbn_t lbn;
2231
2232	ACQUIRE_LOCK(&lk);
2233	lbn = lblkno(dp->i_fs, dp->i_offset);
2234	offset = blkoff(dp->i_fs, dp->i_offset);
2235	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2236		goto done;
2237	oldoffset = offset + (oldloc - base);
2238	newoffset = offset + (newloc - base);
2239	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
2240	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2241		if (dap->da_offset != oldoffset)
2242			continue;
2243		dap->da_offset = newoffset;
2244		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2245			break;
2246		LIST_REMOVE(dap, da_pdlist);
2247		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2248		    dap, da_pdlist);
2249		break;
2250	}
2251	if (dap == NULL) {
2252		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2253		     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2254			if (dap->da_offset == oldoffset) {
2255				dap->da_offset = newoffset;
2256				break;
2257			}
2258		}
2259	}
2260done:
2261	bcopy(oldloc, newloc, entrysize);
2262	FREE_LOCK(&lk);
2263}
2264
2265/*
2266 * Free a diradd dependency structure. This routine must be called
2267 * with splbio interrupts blocked.
2268 */
2269static void
2270free_diradd(dap)
2271	struct diradd *dap;
2272{
2273	struct dirrem *dirrem;
2274	struct pagedep *pagedep;
2275	struct inodedep *inodedep;
2276	struct mkdir *mkdir, *nextmd;
2277
2278#ifdef DEBUG
2279	if (lk.lkt_held == -1)
2280		panic("free_diradd: lock not held");
2281#endif
2282	WORKLIST_REMOVE(&dap->da_list);
2283	LIST_REMOVE(dap, da_pdlist);
2284	if ((dap->da_state & DIRCHG) == 0) {
2285		pagedep = dap->da_pagedep;
2286	} else {
2287		dirrem = dap->da_previous;
2288		pagedep = dirrem->dm_pagedep;
2289		dirrem->dm_dirinum = pagedep->pd_ino;
2290		add_to_worklist(&dirrem->dm_list);
2291	}
2292	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2293	    0, &inodedep) != 0)
2294		(void) free_inodedep(inodedep);
2295	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2296		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2297			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2298			if (mkdir->md_diradd != dap)
2299				continue;
2300			dap->da_state &= ~mkdir->md_state;
2301			WORKLIST_REMOVE(&mkdir->md_list);
2302			LIST_REMOVE(mkdir, md_mkdirs);
2303			WORKITEM_FREE(mkdir, D_MKDIR);
2304		}
2305		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
2306			panic("free_diradd: unfound ref");
2307	}
2308	WORKITEM_FREE(dap, D_DIRADD);
2309}
2310
2311/*
2312 * Directory entry removal dependencies.
2313 *
2314 * When removing a directory entry, the entry's inode pointer must be
2315 * zero'ed on disk before the corresponding inode's link count is decremented
2316 * (possibly freeing the inode for re-use). This dependency is handled by
2317 * updating the directory entry but delaying the inode count reduction until
2318 * after the directory block has been written to disk. After this point, the
2319 * inode count can be decremented whenever it is convenient.
2320 */
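
/*
 * Schematically, for a single entry "ep" referring to inode "ip", the
 * code below enforces the ordering:
 *
 *	ep->d_ino = 0;		entry cleared in the directory block
 *	... the directory block reaches stable storage ...
 *	ip->i_nlink--;		link count reduced only afterwards, by
 *				handle_workitem_remove below
 */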
2321
2322/*
2323 * This routine should be called immediately after removing
2324 * a directory entry.  The inode's link count should not be
2325 * decremented by the calling procedure -- the soft updates
2326 * code will do this task when it is safe.
2327 */
2328void
2329softdep_setup_remove(bp, dp, ip, isrmdir)
2330	struct buf *bp;		/* buffer containing directory block */
2331	struct inode *dp;	/* inode for the directory being modified */
2332	struct inode *ip;	/* inode for directory entry being removed */
2333	int isrmdir;		/* indicates if doing RMDIR */
2334{
2335	struct dirrem *dirrem, *prevdirrem;
2336
2337	/*
2338	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2339	 */
2340	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2341
2342	/*
2343	 * If the COMPLETE flag is clear, then there were no active
2344	 * entries and we want to roll back to a zeroed entry until
2345	 * the new inode is committed to disk. If the COMPLETE flag is
2346	 * set then we have deleted an entry that never made it to
2347	 * disk. If the entry we deleted resulted from a name change,
2348	 * then the old name still resides on disk. We cannot delete
2349	 * its inode (returned to us in prevdirrem) until the zeroed
2350	 * directory entry gets to disk. The new inode has never been
2351	 * referenced on the disk, so it can be deleted immediately.
2352	 */
2353	if ((dirrem->dm_state & COMPLETE) == 0) {
2354		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2355		    dm_next);
2356		FREE_LOCK(&lk);
2357	} else {
2358		if (prevdirrem != NULL)
2359			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2360			    prevdirrem, dm_next);
2361		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2362		FREE_LOCK(&lk);
2363		handle_workitem_remove(dirrem);
2364	}
2365}
2366
2367/*
2368 * Allocate a new dirrem if appropriate and return it along with
2369 * its associated pagedep. Called without a lock, returns with lock.
2370 */
2371static long num_dirrem;		/* number of dirrem allocated */
2372static struct dirrem *
2373newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2374	struct buf *bp;		/* buffer containing directory block */
2375	struct inode *dp;	/* inode for the directory being modified */
2376	struct inode *ip;	/* inode for directory entry being removed */
2377	int isrmdir;		/* indicates if doing RMDIR */
2378	struct dirrem **prevdirremp; /* previously referenced inode, if any */
2379{
2380	int offset;
2381	ufs_lbn_t lbn;
2382	struct diradd *dap;
2383	struct dirrem *dirrem;
2384	struct pagedep *pagedep;
2385
2386	/*
2387	 * Whiteouts have no deletion dependencies.
2388	 */
2389	if (ip == NULL)
2390		panic("newdirrem: whiteout");
2391	/*
2392	 * If we are over our limit, try to improve the situation.
2393	 * Limiting the number of dirrem structures will also limit
2394	 * the number of freefile and freeblks structures.
2395	 */
2396	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
2397		(void) request_cleanup(FLUSH_REMOVE, 0);
2398	num_dirrem += 1;
2399	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2400		M_DIRREM, M_WAITOK);
2401	bzero(dirrem, sizeof(struct dirrem));
2402	dirrem->dm_list.wk_type = D_DIRREM;
2403	dirrem->dm_state = isrmdir ? RMDIR : 0;
2404	dirrem->dm_mnt = ITOV(ip)->v_mount;
2405	dirrem->dm_oldinum = ip->i_number;
2406	*prevdirremp = NULL;
2407
2408	ACQUIRE_LOCK(&lk);
2409	lbn = lblkno(dp->i_fs, dp->i_offset);
2410	offset = blkoff(dp->i_fs, dp->i_offset);
2411	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2412		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2413	dirrem->dm_pagedep = pagedep;
2414	/*
2415	 * Check for a diradd dependency for the same directory entry.
2416	 * If present, then both dependencies become obsolete and can
2417	 * be de-allocated. Check for an entry on both the pd_diraddhd
2418	 * list and the pd_pendinghd list.
2419	 */
2420	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
2421	     dap; dap = LIST_NEXT(dap, da_pdlist))
2422		if (dap->da_offset == offset)
2423			break;
2424	if (dap == NULL) {
2425		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
2426		     dap; dap = LIST_NEXT(dap, da_pdlist))
2427			if (dap->da_offset == offset)
2428				break;
2429		if (dap == NULL)
2430			return (dirrem);
2431	}
2432	/*
2433	 * Must be ATTACHED at this point.
2434	 */
2435	if ((dap->da_state & ATTACHED) == 0)
2436		panic("newdirrem: not ATTACHED");
2437	if (dap->da_newinum != ip->i_number)
2438		panic("newdirrem: inum %d should be %d",
2439		    ip->i_number, dap->da_newinum);
2440	/*
2441	 * If we are deleting a changed name that never made it to disk,
2442	 * then return the dirrem describing the previous inode (which
2443	 * represents the inode currently referenced from this entry on disk).
2444	 */
2445	if ((dap->da_state & DIRCHG) != 0) {
2446		*prevdirremp = dap->da_previous;
2447		dap->da_state &= ~DIRCHG;
2448		dap->da_pagedep = pagedep;
2449	}
2450	/*
2451	 * We are deleting an entry that never made it to disk.
2452	 * Mark it COMPLETE so we can delete its inode immediately.
2453	 */
2454	dirrem->dm_state |= COMPLETE;
2455	free_diradd(dap);
2456	return (dirrem);
2457}
2458
2459/*
2460 * Directory entry change dependencies.
2461 *
2462 * Changing an existing directory entry requires that an add operation
2463 * be completed first followed by a deletion. The semantics for the addition
2464 * are identical to the description of adding a new entry above except
2465 * that the rollback is to the old inode number rather than zero. Once
2466 * the addition dependency is completed, the removal is done as described
2467 * in the removal routine above.
2468 */
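
/*
 * The difference from a plain addition shows up in the rollback value
 * used by initiate_write_filepage below: a changed entry is rolled back
 * to the previously committed inode number rather than to zero:
 *
 *	if (dap->da_state & DIRCHG)
 *		ep->d_ino = dap->da_previous->dm_oldinum;
 *	else
 *		ep->d_ino = 0;
 */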
2469
2470/*
2471 * This routine should be called immediately after changing
2472 * a directory entry.  The inode's link count should not be
2473 * decremented by the calling procedure -- the soft updates
2474 * code will perform this task when it is safe.
2475 */
2476void
2477softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2478	struct buf *bp;		/* buffer containing directory block */
2479	struct inode *dp;	/* inode for the directory being modified */
2480	struct inode *ip;	/* inode for directory entry being removed */
2481	long newinum;		/* new inode number for changed entry */
2482	int isrmdir;		/* indicates if doing RMDIR */
2483{
2484	int offset;
2485	struct diradd *dap = NULL;
2486	struct dirrem *dirrem, *prevdirrem;
2487	struct pagedep *pagedep;
2488	struct inodedep *inodedep;
2489
2490	offset = blkoff(dp->i_fs, dp->i_offset);
2491
2492	/*
2493	 * Whiteouts do not need diradd dependencies.
2494	 */
2495	if (newinum != WINO) {
2496		MALLOC(dap, struct diradd *, sizeof(struct diradd),
2497		    M_DIRADD, M_WAITOK);
2498		bzero(dap, sizeof(struct diradd));
2499		dap->da_list.wk_type = D_DIRADD;
2500		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2501		dap->da_offset = offset;
2502		dap->da_newinum = newinum;
2503	}
2504
2505	/*
2506	 * Allocate a new dirrem and ACQUIRE_LOCK.
2507	 */
2508	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2509	pagedep = dirrem->dm_pagedep;
2510	/*
2511	 * The possible values for isrmdir:
2512	 *	0 - non-directory file rename
2513	 *	1 - directory rename within same directory
2514	 *   inum - directory rename to new directory of given inode number
2515	 * When renaming to a new directory, we are both deleting and
2516	 * creating a new directory entry, so the link count on the new
2517	 * directory should not change. Thus we do not need the followup
2518	 * dirrem which is usually done in handle_workitem_remove. We set
2519	 * the DIRCHG flag to tell handle_workitem_remove to skip the
2520	 * followup dirrem.
2521	 */
2522	if (isrmdir > 1)
2523		dirrem->dm_state |= DIRCHG;
2524
2525	/*
2526	 * Whiteouts have no additional dependencies,
2527	 * so just put the dirrem on the correct list.
2528	 */
2529	if (newinum == WINO) {
2530		if ((dirrem->dm_state & COMPLETE) == 0) {
2531			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2532			    dm_next);
2533		} else {
2534			dirrem->dm_dirinum = pagedep->pd_ino;
2535			add_to_worklist(&dirrem->dm_list);
2536		}
2537		FREE_LOCK(&lk);
2538		return;
2539	}
2540
2541	/*
2542	 * If the COMPLETE flag is clear, then there were no active
2543	 * entries and we want to roll back to the previous inode until
2544	 * the new inode is committed to disk. If the COMPLETE flag is
2545	 * set, then we have deleted an entry that never made it to disk.
2546	 * If the entry we deleted resulted from a name change, then the old
2547	 * inode reference still resides on disk. Any rollback that we do
2548	 * needs to be to that old inode (returned to us in prevdirrem). If
2549	 * the entry we deleted resulted from a create, then there is
2550	 * no entry on the disk, so we want to roll back to zero rather
2551	 * than the uncommitted inode. In either of the COMPLETE cases we
2552	 * want to immediately free the unwritten and unreferenced inode.
2553	 */
2554	if ((dirrem->dm_state & COMPLETE) == 0) {
2555		dap->da_previous = dirrem;
2556	} else {
2557		if (prevdirrem != NULL) {
2558			dap->da_previous = prevdirrem;
2559		} else {
2560			dap->da_state &= ~DIRCHG;
2561			dap->da_pagedep = pagedep;
2562		}
2563		dirrem->dm_dirinum = pagedep->pd_ino;
2564		add_to_worklist(&dirrem->dm_list);
2565	}
2566	/*
2567	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2568	 * is not yet written. If it is written, do the post-inode write
2569	 * processing to put it on the id_pendinghd list.
2570	 */
2571	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2572	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2573		dap->da_state |= COMPLETE;
2574		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2575		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2576	} else {
2577		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2578		    dap, da_pdlist);
2579		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2580	}
2581	FREE_LOCK(&lk);
2582}
2583
2584/*
2585 * Called whenever the link count on an inode is changed.
2586 * It creates an inode dependency so that the new reference(s)
2587 * to the inode cannot be committed to disk until the updated
2588 * inode has been written.
2589 */
2590void
2591softdep_change_linkcnt(ip)
2592	struct inode *ip;	/* the inode with the increased link count */
2593{
2594	struct inodedep *inodedep;
2595
2596	ACQUIRE_LOCK(&lk);
2597	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2598	if (ip->i_nlink < ip->i_effnlink)
2599		panic("softdep_change_linkcnt: bad delta");
2600	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2601	FREE_LOCK(&lk);
2602}
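
/*
 * For example, when unlink(2) removes one of two names for a file, the
 * effective link count i_effnlink drops to 1 immediately while the
 * on-disk count i_nlink remains 2 until the cleared directory entry has
 * been written; the routine above then records
 *
 *	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;	(== 1)
 *
 * so that the pending decrement is applied only when it is safe.
 */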
2603
2604/*
2605 * This workitem decrements the inode's link count.
2606 * If the link count reaches zero, the file is removed.
2607 */
2608static void
2609handle_workitem_remove(dirrem)
2610	struct dirrem *dirrem;
2611{
2612	struct proc *p = CURPROC;	/* XXX */
2613	struct inodedep *inodedep;
2614	struct vnode *vp;
2615	struct inode *ip;
2616	ino_t oldinum;
2617	int error;
2618
2619	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2620		softdep_error("handle_workitem_remove: vget", error);
2621		return;
2622	}
2623	ip = VTOI(vp);
2624	ACQUIRE_LOCK(&lk);
2625	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
2626		panic("handle_workitem_remove: lost inodedep");
2627	/*
2628	 * Normal file deletion.
2629	 */
2630	if ((dirrem->dm_state & RMDIR) == 0) {
2631		ip->i_nlink--;
2632		ip->i_flag |= IN_CHANGE;
2633		if (ip->i_nlink < ip->i_effnlink)
2634			panic("handle_workitem_remove: bad file delta");
2635		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2636		FREE_LOCK(&lk);
2637		vput(vp);
2638		num_dirrem -= 1;
2639		WORKITEM_FREE(dirrem, D_DIRREM);
2640		return;
2641	}
2642	/*
2643	 * Directory deletion. Decrement reference count for both the
2644	 * just deleted parent directory entry and the reference for ".".
2645	 * Next truncate the directory to length zero. When the
2646	 * truncation completes, arrange to have the reference count on
2647	 * the parent decremented to account for the loss of "..".
2648	 */
2649	ip->i_nlink -= 2;
2650	ip->i_flag |= IN_CHANGE;
2651	if (ip->i_nlink < ip->i_effnlink)
2652		panic("handle_workitem_remove: bad dir delta");
2653	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
2654	FREE_LOCK(&lk);
2655	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2656		softdep_error("handle_workitem_remove: truncate", error);
2657	/*
2658	 * Rename a directory to a new parent. Since we are both deleting
2659	 * and creating a new directory entry, the link count on the new
2660	 * directory should not change. Thus we skip the followup dirrem.
2661	 */
2662	if (dirrem->dm_state & DIRCHG) {
2663		vput(vp);
2664		num_dirrem -= 1;
2665		WORKITEM_FREE(dirrem, D_DIRREM);
2666		return;
2667	}
2668	/*
2669	 * If there is no inode dependency then we can free immediately.
2670	 * If we still have a bitmap dependency, then the inode has never
2671	 * been written to disk. Drop the dependency as it is no longer
2672	 * necessary since the inode is being deallocated. We set the
2673	 * ALLCOMPLETE flags since the bitmap now properly shows that the
2674	 * inode is not allocated. Even if the inode is actively being
2675	 * written, it has been rolled back to its zero'ed state, so we
2676	 * are ensured that a zero inode is what is on the disk. For short
2677	 * lived files, this change will usually result in removing all the
2678	 * dependencies from the inode so that it can be freed immediately.
2679	 */
2680	ACQUIRE_LOCK(&lk);
2681	dirrem->dm_state = 0;
2682	oldinum = dirrem->dm_oldinum;
2683	dirrem->dm_oldinum = dirrem->dm_dirinum;
2684	if ((inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep)) == 0)
2685		goto out;
2686	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2687		inodedep->id_state |= ALLCOMPLETE;
2688		LIST_REMOVE(inodedep, id_deps);
2689		inodedep->id_buf = NULL;
2690		WORKLIST_REMOVE(&inodedep->id_list);
2691	}
2692	if (free_inodedep(inodedep) == 0) {
2693		WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2694		FREE_LOCK(&lk);
2695		vput(vp);
2696		return;
2697	}
2698out:
2699	FREE_LOCK(&lk);
2700	vput(vp);
2701	handle_workitem_remove(dirrem);
2702}
2703
2704/*
2705 * Inode de-allocation dependencies.
2706 *
2707 * When an inode's link count is reduced to zero, it can be de-allocated. We
2708 * found it convenient to postpone de-allocation until after the inode is
2709 * written to disk with its new link count (zero).  At this point, all of the
2710 * on-disk inode's block pointers are nullified and, with careful dependency
2711 * list ordering, all dependencies related to the inode will be satisfied and
2712 * the corresponding dependency structures de-allocated.  So, if/when the
2713 * inode is reused, there will be no mixing of old dependencies with new
2714 * ones.  This artificial dependency is set up by the block de-allocation
2715 * procedure above (softdep_setup_freeblocks) and completed by the
2716 * following procedure.
2717 */
2718static void
2719handle_workitem_freefile(freefile)
2720	struct freefile *freefile;
2721{
2722	struct vnode vp;
2723	struct inode tip;
2724	struct inodedep *idp;
2725	int error;
2726
2727#ifdef DEBUG
2728	ACQUIRE_LOCK(&lk);
2729	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
2730		panic("handle_workitem_freefile: inodedep survived");
2731	FREE_LOCK(&lk);
2732#endif
2733	tip.i_devvp = freefile->fx_devvp;
2734	tip.i_dev = freefile->fx_devvp->v_rdev;
2735	tip.i_fs = freefile->fx_fs;
2736	vp.v_data = &tip;
2737	if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
2738		softdep_error("handle_workitem_freefile", error);
2739	WORKITEM_FREE(freefile, D_FREEFILE);
2740}
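
/*
 * Schematically, the life cycle of the artificial dependency described
 * above (a sketch; the calls are made from the truncation and inode
 * free paths, not in one routine):
 *
 *	softdep_setup_freeblocks(ip, (off_t)0);	   save and zero pointers
 *	softdep_freefile(pvp, ino, mode);	   queue the freefile item
 *	... the zero'ed inode block reaches the disk ...
 *	handle_workitem_freefile(freefile);	   ffs_freefile() runs
 */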
2741
2742/*
2743 * Disk writes.
2744 *
2745 * The dependency structures constructed above are most actively used when file
2746 * system blocks are written to disk.  No constraints are placed on when a
2747 * block can be written, but unsatisfied update dependencies are made safe by
2748 * modifying (or replacing) the source memory for the duration of the disk
2749 * write.  When the disk write completes, the memory block is again brought
2750 * up-to-date.
2751 *
2752 * In-core inode structure reclamation.
2753 *
2754 * Because there are a finite number of "in-core" inode structures, they are
2755 * reused regularly.  By transferring all inode-related dependencies to the
2756 * in-memory inode block and indexing them separately (via "inodedep"s), we
2757 * can allow "in-core" inode structures to be reused at any time and avoid
2758 * any increase in contention.
2759 *
2760 * Called just before entering the device driver to initiate a new disk I/O.
2761 * The buffer must be locked, thus, no I/O completion operations can occur
2762 * while we are manipulating its associated dependencies.
2763 */
2764static void
2765softdep_disk_io_initiation(bp)
2766	struct buf *bp;		/* structure describing disk write to occur */
2767{
2768	struct worklist *wk, *nextwk;
2769	struct indirdep *indirdep;
2770
2771	/*
2772	 * We only care about write operations. There should never
2773	 * be dependencies for reads.
2774	 */
2775	if (bp->b_flags & B_READ)
2776		panic("softdep_disk_io_initiation: read");
2777	/*
2778	 * Do any necessary pre-I/O processing.
2779	 */
2780	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2781		nextwk = LIST_NEXT(wk, wk_list);
2782		switch (wk->wk_type) {
2783
2784		case D_PAGEDEP:
2785			initiate_write_filepage(WK_PAGEDEP(wk), bp);
2786			continue;
2787
2788		case D_INODEDEP:
2789			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2790			continue;
2791
2792		case D_INDIRDEP:
2793			indirdep = WK_INDIRDEP(wk);
2794			if (indirdep->ir_state & GOINGAWAY)
2795				panic("disk_io_initiation: indirdep gone");
2796			/*
2797			 * If there are no remaining dependencies, this
2798			 * will be writing the real pointers, so the
2799			 * dependency can be freed.
2800			 */
2801			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2802				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2803				brelse(indirdep->ir_savebp);
2804				/* inline expand WORKLIST_REMOVE(wk); */
2805				wk->wk_state &= ~ONWORKLIST;
2806				LIST_REMOVE(wk, wk_list);
2807				WORKITEM_FREE(indirdep, D_INDIRDEP);
2808				continue;
2809			}
2810			/*
2811			 * Replace up-to-date version with safe version.
2812			 */
2813			ACQUIRE_LOCK(&lk);
2814			indirdep->ir_state &= ~ATTACHED;
2815			indirdep->ir_state |= UNDONE;
2816			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2817			    M_INDIRDEP, M_WAITOK);
2818			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2819			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2820			    bp->b_bcount);
2821			FREE_LOCK(&lk);
2822			continue;
2823
2824		case D_MKDIR:
2825		case D_BMSAFEMAP:
2826		case D_ALLOCDIRECT:
2827		case D_ALLOCINDIR:
2828			continue;
2829
2830		default:
2831			panic("handle_disk_io_initiation: Unexpected type %s",
2832			    TYPENAME(wk->wk_type));
2833			/* NOTREACHED */
2834		}
2835	}
2836}
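
/*
 * The indirect block handling above pairs with its counterpart in
 * softdep_disk_write_complete below; schematically, for one write of an
 * indirect block that still has unsatisfied dependencies:
 *
 *	At initiation, park the up-to-date pointers and write the safe copy:
 *		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
 *		bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount);
 *
 *	At completion, restore the up-to-date pointers and redirty:
 *		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
 *		bdirty(bp);
 */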
2837
2838/*
2839 * Called from within the procedure above to deal with unsatisfied
2840 * allocation dependencies in a directory. The buffer must be locked,
2841 * thus, no I/O completion operations can occur while we are
2842 * manipulating its associated dependencies.
2843 */
2844static void
2845initiate_write_filepage(pagedep, bp)
2846	struct pagedep *pagedep;
2847	struct buf *bp;
2848{
2849	struct diradd *dap;
2850	struct direct *ep;
2851	int i;
2852
2853	if (pagedep->pd_state & IOSTARTED) {
2854		/*
2855		 * This can only happen if there is a driver that does not
2856		 * understand chaining. Here biodone will reissue the call
2857		 * to strategy for the incomplete buffers.
2858		 */
2859		printf("initiate_write_filepage: already started\n");
2860		return;
2861	}
2862	pagedep->pd_state |= IOSTARTED;
2863	ACQUIRE_LOCK(&lk);
2864	for (i = 0; i < DAHASHSZ; i++) {
2865		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
2866		     dap = LIST_NEXT(dap, da_pdlist)) {
2867			ep = (struct direct *)
2868			    ((char *)bp->b_data + dap->da_offset);
2869			if (ep->d_ino != dap->da_newinum)
2870				panic("%s: dir inum %d != new %d",
2871				    "initiate_write_filepage",
2872				    ep->d_ino, dap->da_newinum);
2873			if (dap->da_state & DIRCHG)
2874				ep->d_ino = dap->da_previous->dm_oldinum;
2875			else
2876				ep->d_ino = 0;
2877			dap->da_state &= ~ATTACHED;
2878			dap->da_state |= UNDONE;
2879		}
2880	}
2881	FREE_LOCK(&lk);
2882}
2883
2884/*
2885 * Called from within the procedure above to deal with unsatisfied
2886 * allocation dependencies in an inodeblock. The buffer must be
2887 * locked, thus, no I/O completion operations can occur while we
2888 * are manipulating its associated dependencies.
2889 */
2890static void
2891initiate_write_inodeblock(inodedep, bp)
2892	struct inodedep *inodedep;
2893	struct buf *bp;			/* The inode block */
2894{
2895	struct allocdirect *adp, *lastadp;
2896	struct dinode *dp;
2897	struct fs *fs;
2898	ufs_lbn_t prevlbn = 0;
2899	int i, deplist;
2900
2901	if (inodedep->id_state & IOSTARTED)
2902		panic("initiate_write_inodeblock: already started");
2903	inodedep->id_state |= IOSTARTED;
2904	fs = inodedep->id_fs;
2905	dp = (struct dinode *)bp->b_data +
2906	    ino_to_fsbo(fs, inodedep->id_ino);
2907	/*
2908	 * If the bitmap is not yet written, then the allocated
2909	 * inode cannot be written to disk.
2910	 */
2911	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2912		if (inodedep->id_savedino != NULL)
2913			panic("initiate_write_inodeblock: already doing I/O");
2914		MALLOC(inodedep->id_savedino, struct dinode *,
2915		    sizeof(struct dinode), M_INODEDEP, M_WAITOK);
2916		*inodedep->id_savedino = *dp;
2917		bzero((caddr_t)dp, sizeof(struct dinode));
2918		return;
2919	}
2920	/*
2921	 * If no dependencies, then there is nothing to roll back.
2922	 */
2923	inodedep->id_savedsize = dp->di_size;
2924	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
2925		return;
2926	/*
2927	 * Set the dependencies to busy.
2928	 */
2929	ACQUIRE_LOCK(&lk);
2930	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2931	     adp = TAILQ_NEXT(adp, ad_next)) {
2932#ifdef DIAGNOSTIC
2933		if (deplist != 0 && prevlbn >= adp->ad_lbn)
2934			panic("softdep_write_inodeblock: lbn order");
2935		prevlbn = adp->ad_lbn;
2936		if (adp->ad_lbn < NDADDR &&
2937		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
2938			panic("%s: direct pointer #%ld mismatch %d != %d",
2939			    "softdep_write_inodeblock", adp->ad_lbn,
2940			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
2941		if (adp->ad_lbn >= NDADDR &&
2942		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
2943			panic("%s: indirect pointer #%ld mismatch %d != %d",
2944			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
2945			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
2946		deplist |= 1 << adp->ad_lbn;
2947		if ((adp->ad_state & ATTACHED) == 0)
2948			panic("softdep_write_inodeblock: Unknown state 0x%x",
2949			    adp->ad_state);
2950#endif /* DIAGNOSTIC */
2951		adp->ad_state &= ~ATTACHED;
2952		adp->ad_state |= UNDONE;
2953	}
2954	/*
2955	 * The on-disk inode cannot claim to be any larger than the last
2956	 * fragment that has been written. Otherwise, the on-disk inode
2957	 * might have fragments that were not the last block in the file
2958	 * which would corrupt the filesystem.
2959	 */
2960	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2961	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
2962		if (adp->ad_lbn >= NDADDR)
2963			break;
2964		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
2965		/* keep going until hitting a rollback to a frag */
2966		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
2967			continue;
2968		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
2969		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
2970#ifdef DIAGNOSTIC
2971			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
2972				panic("softdep_write_inodeblock: lost dep1");
2973#endif /* DIAGNOSTIC */
2974			dp->di_db[i] = 0;
2975		}
2976		for (i = 0; i < NIADDR; i++) {
2977#ifdef DIAGNOSTIC
2978			if (dp->di_ib[i] != 0 &&
2979			    (deplist & ((1 << NDADDR) << i)) == 0)
2980				panic("softdep_write_inodeblock: lost dep2");
2981#endif /* DIAGNOSTIC */
2982			dp->di_ib[i] = 0;
2983		}
2984		FREE_LOCK(&lk);
2985		return;
2986	}
2987	/*
2988	 * If we have zero'ed out the last allocated block of the file,
2989	 * roll back the size to the last currently allocated block.
2990	 * We know that this last allocated block is full-sized, as
2991	 * we already checked for fragments in the loop above.
2992	 */
2993	if (lastadp != NULL &&
2994	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
2995		for (i = lastadp->ad_lbn; i >= 0; i--)
2996			if (dp->di_db[i] != 0)
2997				break;
2998		dp->di_size = (i + 1) * fs->fs_bsize;
2999	}
3000	/*
3001	 * The only dependencies are for indirect blocks.
3002	 *
3003	 * The file size for indirect block additions is not guaranteed.
3004	 * Such a guarantee would be non-trivial to achieve. The conventional
3005	 * synchronous write implementation also does not make this guarantee.
3006	 * Fsck should catch and fix discrepancies. Arguably, the file size
3007	 * can be over-estimated without destroying integrity when the file
3008	 * moves into the indirect blocks (i.e., is large). If we want to
3009	 * postpone fsck, we are stuck with this argument.
3010	 */
3011	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3012		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3013	FREE_LOCK(&lk);
3014}
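
/*
 * A worked example of the size rollback above, assuming fs_bsize is
 * 8192: if the last allocdirect that may safely be exposed describes a
 * 4096-byte fragment at lbn 12, the copy of the inode sent to disk
 * carries
 *
 *	dp->di_size = 8192 * 12 + 4096;		(== 102400)
 *
 * with every direct and indirect pointer past lbn 12 rolled back to
 * zero, so the on-disk inode never claims a fragment that is not the
 * last block of the file.
 */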
3015
3016/*
3017 * This routine is called during the completion interrupt
3018 * service routine for a disk write (from the procedure called
3019 * by the device driver to inform the file system caches of
3020 * a request completion).  It should be called early in this
3021 * procedure, before the block is made available to other
3022 * processes or other routines are called.
3023 */
3024static void
3025softdep_disk_write_complete(bp)
3026	struct buf *bp;		/* describes the completed disk write */
3027{
3028	struct worklist *wk;
3029	struct workhead reattach;
3030	struct newblk *newblk;
3031	struct allocindir *aip;
3032	struct allocdirect *adp;
3033	struct indirdep *indirdep;
3034	struct inodedep *inodedep;
3035	struct bmsafemap *bmsafemap;
3036
3037#ifdef DEBUG
3038	if (lk.lkt_held != -1)
3039		panic("softdep_disk_write_complete: lock is held");
3040	lk.lkt_held = -2;
3041#endif
3042	LIST_INIT(&reattach);
3043	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3044		WORKLIST_REMOVE(wk);
3045		switch (wk->wk_type) {
3046
3047		case D_PAGEDEP:
3048			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3049				WORKLIST_INSERT(&reattach, wk);
3050			continue;
3051
3052		case D_INODEDEP:
3053			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3054				WORKLIST_INSERT(&reattach, wk);
3055			continue;
3056
3057		case D_BMSAFEMAP:
3058			bmsafemap = WK_BMSAFEMAP(wk);
3059			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3060				newblk->nb_state |= DEPCOMPLETE;
3061				newblk->nb_bmsafemap = NULL;
3062				LIST_REMOVE(newblk, nb_deps);
3063			}
3064			while ((adp =
3065			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3066				adp->ad_state |= DEPCOMPLETE;
3067				adp->ad_buf = NULL;
3068				LIST_REMOVE(adp, ad_deps);
3069				handle_allocdirect_partdone(adp);
3070			}
3071			while ((aip =
3072			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3073				aip->ai_state |= DEPCOMPLETE;
3074				aip->ai_buf = NULL;
3075				LIST_REMOVE(aip, ai_deps);
3076				handle_allocindir_partdone(aip);
3077			}
3078			while ((inodedep =
3079			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3080				inodedep->id_state |= DEPCOMPLETE;
3081				LIST_REMOVE(inodedep, id_deps);
3082				inodedep->id_buf = NULL;
3083			}
3084			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3085			continue;
3086
3087		case D_MKDIR:
3088			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3089			continue;
3090
3091		case D_ALLOCDIRECT:
3092			adp = WK_ALLOCDIRECT(wk);
3093			adp->ad_state |= COMPLETE;
3094			handle_allocdirect_partdone(adp);
3095			continue;
3096
3097		case D_ALLOCINDIR:
3098			aip = WK_ALLOCINDIR(wk);
3099			aip->ai_state |= COMPLETE;
3100			handle_allocindir_partdone(aip);
3101			continue;
3102
3103		case D_INDIRDEP:
3104			indirdep = WK_INDIRDEP(wk);
3105			if (indirdep->ir_state & GOINGAWAY)
3106				panic("disk_write_complete: indirdep gone");
3107			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3108			FREE(indirdep->ir_saveddata, M_INDIRDEP);
3109			indirdep->ir_saveddata = 0;
3110			indirdep->ir_state &= ~UNDONE;
3111			indirdep->ir_state |= ATTACHED;
3112			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3113				handle_allocindir_partdone(aip);
3114				if (aip == LIST_FIRST(&indirdep->ir_donehd))
3115					panic("disk_write_complete: not gone");
3116			}
3117			WORKLIST_INSERT(&reattach, wk);
3118			if ((bp->b_flags & B_DELWRI) == 0)
3119				stat_indir_blk_ptrs++;
3120			bdirty(bp);
3121			continue;
3122
3123		default:
3124			panic("handle_disk_write_complete: Unknown type %s",
3125			    TYPENAME(wk->wk_type));
3126			/* NOTREACHED */
3127		}
3128	}
3129	/*
3130	 * Reattach any requests that must be redone.
3131	 */
3132	while ((wk = LIST_FIRST(&reattach)) != NULL) {
3133		WORKLIST_REMOVE(wk);
3134		WORKLIST_INSERT(&bp->b_dep, wk);
3135	}
3136#ifdef DEBUG
3137	if (lk.lkt_held != -2)
3138		panic("softdep_disk_write_complete: lock lost");
3139	lk.lkt_held = -1;
3140#endif
3141}
3142
3143/*
3144 * Called from within softdep_disk_write_complete above. Note that
3145 * this routine is always called from interrupt level with further
3146 * splbio interrupts blocked.
3147 */
3148static void
3149handle_allocdirect_partdone(adp)
3150	struct allocdirect *adp;	/* the completed allocdirect */
3151{
3152	struct allocdirect *listadp;
3153	struct inodedep *inodedep;
3154	long bsize;
3155
3156	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3157		return;
3158	if (adp->ad_buf != NULL)
3159		panic("handle_allocdirect_partdone: dangling dep");
	/*
	 * The on-disk inode cannot claim to be any larger than the last
	 * fragment that has been written. Otherwise, the on-disk inode
	 * might claim fragments that are not the last block in the file,
	 * which would corrupt the filesystem. Thus, we cannot free any
	 * allocdirects after one whose ad_oldblkno claims a fragment, as
	 * these blocks must be rolled back to zero before writing the inode.
	 * We check the currently active set of allocdirects in id_inoupdt.
	 */
3169	inodedep = adp->ad_inodedep;
3170	bsize = inodedep->id_fs->fs_bsize;
3171	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp;
3172	     listadp = TAILQ_NEXT(listadp, ad_next)) {
3173		/* found our block */
3174		if (listadp == adp)
3175			break;
		/* continue if the old block is not a fragment */
3177		if (listadp->ad_oldsize == 0 ||
3178		    listadp->ad_oldsize == bsize)
3179			continue;
3180		/* hit a fragment */
3181		return;
3182	}
3183	/*
3184	 * If we have reached the end of the current list without
3185	 * finding the just finished dependency, then it must be
3186	 * on the future dependency list. Future dependencies cannot
3187	 * be freed until they are moved to the current list.
3188	 */
3189	if (listadp == NULL) {
3190#ifdef DEBUG
3191		for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp;
3192		     listadp = TAILQ_NEXT(listadp, ad_next))
3193			/* found our block */
3194			if (listadp == adp)
3195				break;
3196		if (listadp == NULL)
3197			panic("handle_allocdirect_partdone: lost dep");
3198#endif /* DEBUG */
3199		return;
3200	}
3201	/*
3202	 * If we have found the just finished dependency, then free
3203	 * it along with anything that follows it that is complete.
3204	 */
3205	for (; adp; adp = listadp) {
3206		listadp = TAILQ_NEXT(adp, ad_next);
3207		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3208			return;
3209		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3210	}
3211}
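
/*
 * A worked example of the fragment rule above (an illustrative sketch,
 * assuming an 8K-block/1K-fragment filesystem): suppose id_inoupdt
 * holds allocdirects for logical blocks 0, 1, and 2, and block 1's
 * allocdirect has ad_oldsize == 3072 (three fragments, neither zero
 * nor fs_bsize). Even if block 2's allocdirect is ALLCOMPLETE, the
 * scan above stops at block 1 and frees nothing: block 1 must first
 * be rolled back to its old fragment before the inode can safely go
 * to disk, so the inode may not yet claim the blocks beyond it.
 */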
3212
3213/*
3214 * Called from within softdep_disk_write_complete above. Note that
3215 * this routine is always called from interrupt level with further
3216 * splbio interrupts blocked.
3217 */
3218static void
3219handle_allocindir_partdone(aip)
3220	struct allocindir *aip;		/* the completed allocindir */
3221{
3222	struct indirdep *indirdep;
3223
3224	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3225		return;
3226	if (aip->ai_buf != NULL)
3227		panic("handle_allocindir_partdone: dangling dependency");
3228	indirdep = aip->ai_indirdep;
3229	if (indirdep->ir_state & UNDONE) {
3230		LIST_REMOVE(aip, ai_next);
3231		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3232		return;
3233	}
3234	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3235	    aip->ai_newblkno;
3236	LIST_REMOVE(aip, ai_next);
3237	if (aip->ai_freefrag != NULL)
3238		add_to_worklist(&aip->ai_freefrag->ff_list);
3239	WORKITEM_FREE(aip, D_ALLOCINDIR);
3240}
3241
3242/*
3243 * Called from within softdep_disk_write_complete above to restore
3244 * in-memory inode block contents to their most up-to-date state. Note
3245 * that this routine is always called from interrupt level with further
3246 * splbio interrupts blocked.
3247 */
3248static int
3249handle_written_inodeblock(inodedep, bp)
3250	struct inodedep *inodedep;
3251	struct buf *bp;		/* buffer containing the inode block */
3252{
3253	struct worklist *wk, *filefree;
3254	struct allocdirect *adp, *nextadp;
3255	struct dinode *dp;
3256	int hadchanges;
3257
3258	if ((inodedep->id_state & IOSTARTED) == 0)
3259		panic("handle_written_inodeblock: not started");
3260	inodedep->id_state &= ~IOSTARTED;
3261	inodedep->id_state |= COMPLETE;
3262	dp = (struct dinode *)bp->b_data +
3263	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3264	/*
3265	 * If we had to rollback the inode allocation because of
3266	 * bitmaps being incomplete, then simply restore it.
3267	 * Keep the block dirty so that it will not be reclaimed until
3268	 * all associated dependencies have been cleared and the
3269	 * corresponding updates written to disk.
3270	 */
3271	if (inodedep->id_savedino != NULL) {
3272		*dp = *inodedep->id_savedino;
3273		FREE(inodedep->id_savedino, M_INODEDEP);
3274		inodedep->id_savedino = NULL;
3275		if ((bp->b_flags & B_DELWRI) == 0)
3276			stat_inode_bitmap++;
3277		bdirty(bp);
3278		return (1);
3279	}
3280	/*
3281	 * Roll forward anything that had to be rolled back before
3282	 * the inode could be updated.
3283	 */
3284	hadchanges = 0;
3285	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3286		nextadp = TAILQ_NEXT(adp, ad_next);
3287		if (adp->ad_state & ATTACHED)
3288			panic("handle_written_inodeblock: new entry");
3289		if (adp->ad_lbn < NDADDR) {
3290			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
3291				panic("%s: %s #%ld mismatch %d != %d",
3292				    "handle_written_inodeblock",
3293				    "direct pointer", adp->ad_lbn,
3294				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3295			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3296		} else {
3297			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
3298				panic("%s: %s #%ld allocated as %d",
3299				    "handle_written_inodeblock",
3300				    "indirect pointer", adp->ad_lbn - NDADDR,
3301				    dp->di_ib[adp->ad_lbn - NDADDR]);
3302			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3303		}
3304		adp->ad_state &= ~UNDONE;
3305		adp->ad_state |= ATTACHED;
3306		hadchanges = 1;
3307	}
3308	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3309		stat_direct_blk_ptrs++;
3310	/*
3311	 * Reset the file size to its most up-to-date value.
3312	 */
3313	if (inodedep->id_savedsize == -1)
3314		panic("handle_written_inodeblock: bad size");
3315	if (dp->di_size != inodedep->id_savedsize) {
3316		dp->di_size = inodedep->id_savedsize;
3317		hadchanges = 1;
3318	}
3319	inodedep->id_savedsize = -1;
3320	/*
3321	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
3323	 * its correct form.
3324	 */
3325	if (hadchanges)
3326		bdirty(bp);
3327	/*
3328	 * Process any allocdirects that completed during the update.
3329	 */
3330	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3331		handle_allocdirect_partdone(adp);
3332	/*
3333	 * Process deallocations that were held pending until the
3334	 * inode had been written to disk. Freeing of the inode
3335	 * is delayed until after all blocks have been freed to
3336	 * avoid creation of new <vfsid, inum, lbn> triples
3337	 * before the old ones have been deleted.
3338	 */
3339	filefree = NULL;
3340	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3341		WORKLIST_REMOVE(wk);
3342		switch (wk->wk_type) {
3343
3344		case D_FREEFILE:
3345			/*
3346			 * We defer adding filefree to the worklist until
3347			 * all other additions have been made to ensure
3348			 * that it will be done after all the old blocks
3349			 * have been freed.
3350			 */
3351			if (filefree != NULL)
3352				panic("handle_written_inodeblock: filefree");
3353			filefree = wk;
3354			continue;
3355
3356		case D_MKDIR:
3357			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3358			continue;
3359
3360		case D_DIRADD:
3361			diradd_inode_written(WK_DIRADD(wk), inodedep);
3362			continue;
3363
3364		case D_FREEBLKS:
3365		case D_FREEFRAG:
3366		case D_DIRREM:
3367			add_to_worklist(wk);
3368			continue;
3369
3370		default:
3371			panic("handle_written_inodeblock: Unknown type %s",
3372			    TYPENAME(wk->wk_type));
3373			/* NOTREACHED */
3374		}
3375	}
3376	if (filefree != NULL) {
3377		if (free_inodedep(inodedep) == 0)
3378			panic("handle_written_inodeblock: live inodedep");
3379		add_to_worklist(filefree);
3380		return (0);
3381	}
3382
3383	/*
3384	 * If no outstanding dependencies, free it.
3385	 */
	if (free_inodedep(inodedep) ||
	    TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3387		return (0);
3388	return (hadchanges);
3389}
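
/*
 * A worked example of the roll-forward above (illustrative only):
 * suppose block 3 of a file moved from disk block 1000 (ad_oldblkno)
 * to 2000 (ad_newblkno), but the bitmap crediting 2000 had not yet
 * been written. The write-initiation code rolled di_db[3] back to
 * 1000 before the inode block went to disk; here, once that write
 * completes, di_db[3] is set forward to 2000 and the buffer is
 * redirtied so the up-to-date pointer eventually reaches the disk.
 */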
3390
3391/*
3392 * Process a diradd entry after its dependent inode has been written.
3393 * This routine must be called with splbio interrupts blocked.
3394 */
3395static void
3396diradd_inode_written(dap, inodedep)
3397	struct diradd *dap;
3398	struct inodedep *inodedep;
3399{
3400	struct pagedep *pagedep;
3401
3402	dap->da_state |= COMPLETE;
3403	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3404		if (dap->da_state & DIRCHG)
3405			pagedep = dap->da_previous->dm_pagedep;
3406		else
3407			pagedep = dap->da_pagedep;
3408		LIST_REMOVE(dap, da_pdlist);
3409		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3410	}
3411	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3412}
3413
3414/*
3415 * Handle the completion of a mkdir dependency.
3416 */
3417static void
3418handle_written_mkdir(mkdir, type)
3419	struct mkdir *mkdir;
3420	int type;
3421{
3422	struct diradd *dap;
3423	struct pagedep *pagedep;
3424
3425	if (mkdir->md_state != type)
3426		panic("handle_written_mkdir: bad type");
3427	dap = mkdir->md_diradd;
3428	dap->da_state &= ~type;
3429	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3430		dap->da_state |= DEPCOMPLETE;
3431	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3432		if (dap->da_state & DIRCHG)
3433			pagedep = dap->da_previous->dm_pagedep;
3434		else
3435			pagedep = dap->da_pagedep;
3436		LIST_REMOVE(dap, da_pdlist);
3437		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3438	}
3439	LIST_REMOVE(mkdir, md_mkdirs);
3440	WORKITEM_FREE(mkdir, D_MKDIR);
3441}
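
/*
 * A worked example of the two mkdir dependencies (illustrative only):
 * a new directory's diradd starts with both MKDIR_PARENT and
 * MKDIR_BODY set. Writing the buffer holding its "." and ".."
 * entries clears MKDIR_BODY (via softdep_disk_write_complete above);
 * writing the parent directory's inode clears MKDIR_PARENT (via
 * handle_written_inodeblock). Only when both bits are gone is
 * DEPCOMPLETE set, allowing the new name to be committed in the
 * parent directory.
 */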
3442
3443/*
3444 * Called from within softdep_disk_write_complete above.
3445 * A write operation was just completed. Removed inodes can
3446 * now be freed and associated block pointers may be committed.
3447 * Note that this routine is always called from interrupt level
3448 * with further splbio interrupts blocked.
3449 */
3450static int
3451handle_written_filepage(pagedep, bp)
3452	struct pagedep *pagedep;
3453	struct buf *bp;		/* buffer containing the written page */
3454{
3455	struct dirrem *dirrem;
3456	struct diradd *dap, *nextdap;
3457	struct direct *ep;
3458	int i, chgs;
3459
3460	if ((pagedep->pd_state & IOSTARTED) == 0)
3461		panic("handle_written_filepage: not started");
3462	pagedep->pd_state &= ~IOSTARTED;
3463	/*
3464	 * Process any directory removals that have been committed.
3465	 */
3466	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3467		LIST_REMOVE(dirrem, dm_next);
3468		dirrem->dm_dirinum = pagedep->pd_ino;
3469		add_to_worklist(&dirrem->dm_list);
3470	}
3471	/*
3472	 * Free any directory additions that have been committed.
3473	 */
3474	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3475		free_diradd(dap);
3476	/*
3477	 * Uncommitted directory entries must be restored.
3478	 */
3479	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3480		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3481		     dap = nextdap) {
3482			nextdap = LIST_NEXT(dap, da_pdlist);
3483			if (dap->da_state & ATTACHED)
3484				panic("handle_written_filepage: attached");
3485			ep = (struct direct *)
3486			    ((char *)bp->b_data + dap->da_offset);
3487			ep->d_ino = dap->da_newinum;
3488			dap->da_state &= ~UNDONE;
3489			dap->da_state |= ATTACHED;
3490			chgs = 1;
3491			/*
3492			 * If the inode referenced by the directory has
3493			 * been written out, then the dependency can be
3494			 * moved to the pending list.
3495			 */
3496			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3497				LIST_REMOVE(dap, da_pdlist);
3498				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3499				    da_pdlist);
3500			}
3501		}
3502	}
3503	/*
3504	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
3506	 * its correct form.
3507	 */
3508	if (chgs) {
3509		if ((bp->b_flags & B_DELWRI) == 0)
3510			stat_dir_entry++;
3511		bdirty(bp);
3512	}
3513	/*
3514	 * If no dependencies remain, the pagedep will be freed.
3515	 * Otherwise it will remain to update the page before it
3516	 * is written back to disk.
3517	 */
	if (LIST_FIRST(&pagedep->pd_pendinghd) == NULL) {
3519		for (i = 0; i < DAHASHSZ; i++)
3520			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3521				break;
3522		if (i == DAHASHSZ) {
3523			LIST_REMOVE(pagedep, pd_hash);
3524			WORKITEM_FREE(pagedep, D_PAGEDEP);
3525			return (0);
3526		}
3527	}
3528	return (1);
3529}
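
/*
 * A worked example of the entry rollback above (an illustrative
 * sketch): a new directory entry whose inode has not yet been written
 * goes to disk with d_ino rolled back to zero, so a crash cannot
 * leave a name pointing at an unallocated inode. When this page
 * write completes, d_ino is restored to da_newinum, the page is
 * redirtied, and the entry reaches the disk in its committed form on
 * a later write.
 */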
3530
3531/*
3532 * Writing back in-core inode structures.
3533 *
3534 * The file system only accesses an inode's contents when it occupies an
3535 * "in-core" inode structure.  These "in-core" structures are separate from
3536 * the page frames used to cache inode blocks.  Only the latter are
3537 * transferred to/from the disk.  So, when the updated contents of the
3538 * "in-core" inode structure are copied to the corresponding in-memory inode
3539 * block, the dependencies are also transferred.  The following procedure is
3540 * called when copying a dirty "in-core" inode to a cached inode block.
3541 */
3542
3543/*
3544 * Called when an inode is loaded from disk. If the effective link count
3545 * differed from the actual link count when it was last flushed, then we
3546 * need to ensure that the correct effective link count is put back.
3547 */
3548void
3549softdep_load_inodeblock(ip)
3550	struct inode *ip;	/* the "in_core" copy of the inode */
3551{
3552	struct inodedep *inodedep;
3553
3554	/*
3555	 * Check for alternate nlink count.
3556	 */
3557	ip->i_effnlink = ip->i_nlink;
3558	ACQUIRE_LOCK(&lk);
3559	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3560		FREE_LOCK(&lk);
3561		return;
3562	}
3563	ip->i_effnlink -= inodedep->id_nlinkdelta;
3564	FREE_LOCK(&lk);
3565}
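
/*
 * A worked example of the link-count adjustment above (illustrative
 * only): if a file with i_nlink == 2 has one removal pending, its
 * inodedep carries id_nlinkdelta == 1. Should the inode be reloaded
 * from disk (where the link count is still 2), the code above
 * computes i_effnlink = 2 - 1 = 1, so the in-core inode again
 * reflects the not-yet-committed removal.
 */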
3566
3567/*
3568 * This routine is called just before the "in-core" inode
3569 * information is to be copied to the in-memory inode block.
3570 * Recall that an inode block contains several inodes. If
 * the waitfor flag is set, then the dependencies will be
3572 * cleared so that the update can always be made. Note that
3573 * the buffer is locked when this routine is called, so we
3574 * will never be in the middle of writing the inode block
3575 * to disk.
3576 */
3577void
3578softdep_update_inodeblock(ip, bp, waitfor)
3579	struct inode *ip;	/* the "in_core" copy of the inode */
3580	struct buf *bp;		/* the buffer containing the inode block */
3581	int waitfor;		/* nonzero => update must be allowed */
3582{
3583	struct inodedep *inodedep;
3584	struct worklist *wk;
3585	int error, gotit;
3586
3587	/*
3588	 * If the effective link count is not equal to the actual link
3589	 * count, then we must track the difference in an inodedep while
3590	 * the inode is (potentially) tossed out of the cache. Otherwise,
3591	 * if there is no existing inodedep, then there are no dependencies
3592	 * to track.
3593	 */
3594	ACQUIRE_LOCK(&lk);
3595	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3596		if (ip->i_effnlink != ip->i_nlink)
3597			panic("softdep_update_inodeblock: bad link count");
3598		FREE_LOCK(&lk);
3599		return;
3600	}
3601	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
3602		panic("softdep_update_inodeblock: bad delta");
3603	/*
3604	 * Changes have been initiated. Anything depending on these
3605	 * changes cannot occur until this inode has been written.
3606	 */
3607	inodedep->id_state &= ~COMPLETE;
3608	if ((inodedep->id_state & ONWORKLIST) == 0)
3609		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3610	/*
3611	 * Any new dependencies associated with the incore inode must
3612	 * now be moved to the list associated with the buffer holding
	 * the in-memory copy of the inode. Once merged, process any
3614	 * allocdirects that are completed by the merger.
3615	 */
3616	merge_inode_lists(inodedep);
3617	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3618		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3619	/*
3620	 * Now that the inode has been pushed into the buffer, the
3621	 * operations dependent on the inode being written to disk
3622	 * can be moved to the id_bufwait so that they will be
3623	 * processed when the buffer I/O completes.
3624	 */
3625	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3626		WORKLIST_REMOVE(wk);
3627		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3628	}
3629	/*
	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them has been written (indicated by
3632	 * DEPCOMPLETE being set in id_state). If we are doing a
3633	 * forced sync (e.g., an fsync on a file), we force the bitmap
3634	 * to be written so that the update can be done.
3635	 */
3636	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3637		FREE_LOCK(&lk);
3638		return;
3639	}
3640	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3641	FREE_LOCK(&lk);
3642	if (gotit &&
3643	    (error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0)
3644		softdep_error("softdep_update_inodeblock: bwrite", error);
3645	if ((inodedep->id_state & DEPCOMPLETE) == 0)
3646		panic("softdep_update_inodeblock: update failed");
3647}
3648
3649/*
3650 * Merge the new inode dependency list (id_newinoupdt) into the old
3651 * inode dependency list (id_inoupdt). This routine must be called
3652 * with splbio interrupts blocked.
3653 */
3654static void
3655merge_inode_lists(inodedep)
3656	struct inodedep *inodedep;
3657{
3658	struct allocdirect *listadp, *newadp;
3659
3660	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3661	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3662		if (listadp->ad_lbn < newadp->ad_lbn) {
3663			listadp = TAILQ_NEXT(listadp, ad_next);
3664			continue;
3665		}
3666		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3667		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3668		if (listadp->ad_lbn == newadp->ad_lbn) {
3669			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3670			    listadp);
3671			listadp = newadp;
3672		}
3673		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3674	}
3675	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3676		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3677		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3678	}
3679}
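
/*
 * A worked example of the merge above (illustrative only): with
 * id_inoupdt holding allocdirects for lbns {1, 4, 7} and
 * id_newinoupdt holding {2, 4, 9}, the loop inserts 2 before 4,
 * resolves the duplicate lbn 4 with allocdirect_merge(), and the
 * trailing loop appends 9, leaving id_inoupdt == {1, 2, 4, 7, 9}
 * and id_newinoupdt empty, still sorted by logical block number.
 */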
3680
3681/*
3682 * If we are doing an fsync, then we must ensure that any directory
3683 * entries for the inode have been written after the inode gets to disk.
3684 */
3685static int
3686softdep_fsync(vp)
3687	struct vnode *vp;	/* the "in_core" copy of the inode */
3688{
3689	struct inodedep *inodedep;
3690	struct pagedep *pagedep;
3691	struct worklist *wk;
3692	struct diradd *dap;
3693	struct mount *mnt;
3694	struct vnode *pvp;
3695	struct inode *ip;
3696	struct buf *bp;
3697	struct fs *fs;
3698	struct proc *p = CURPROC;		/* XXX */
3699	int error, flushparent;
3700	ino_t parentino;
3701	ufs_lbn_t lbn;
3702
3703	ip = VTOI(vp);
3704	fs = ip->i_fs;
3705	ACQUIRE_LOCK(&lk);
3706	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3707		FREE_LOCK(&lk);
3708		return (0);
3709	}
3710	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3711	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3712	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3713	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
3714		panic("softdep_fsync: pending ops");
3715	for (error = 0, flushparent = 0; ; ) {
3716		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3717			break;
3718		if (wk->wk_type != D_DIRADD)
3719			panic("softdep_fsync: Unexpected type %s",
3720			    TYPENAME(wk->wk_type));
3721		dap = WK_DIRADD(wk);
3722		/*
3723		 * Flush our parent if this directory entry
3724		 * has a MKDIR_PARENT dependency.
3725		 */
3726		if (dap->da_state & DIRCHG)
3727			pagedep = dap->da_previous->dm_pagedep;
3728		else
3729			pagedep = dap->da_pagedep;
3730		mnt = pagedep->pd_mnt;
3731		parentino = pagedep->pd_ino;
3732		lbn = pagedep->pd_lbn;
3733		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
3734			panic("softdep_fsync: dirty");
3735		flushparent = dap->da_state & MKDIR_PARENT;
3736		/*
3737		 * If we are being fsync'ed as part of vgone'ing this vnode,
3738		 * then we will not be able to release and recover the
3739		 * vnode below, so we just have to give up on writing its
3740		 * directory entry out. It will eventually be written, just
3741		 * not now, but then the user was not asking to have it
3742		 * written, so we are not breaking any promises.
3743		 */
3744		if (vp->v_flag & VXLOCK)
3745			break;
3746		/*
3747		 * We prevent deadlock by always fetching inodes from the
3748		 * root, moving down the directory tree. Thus, when fetching
3749		 * our parent directory, we must unlock ourselves before
3750		 * requesting the lock on our parent. See the comment in
3751		 * ufs_lookup for details on possible races.
3752		 */
3753		FREE_LOCK(&lk);
3754		VOP_UNLOCK(vp, 0, p);
3755		error = VFS_VGET(mnt, parentino, &pvp);
3756		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3757		if (error != 0)
3758			return (error);
3759		if (flushparent) {
3760			if ((error = UFS_UPDATE(pvp, 1)) != 0) {
3761				vput(pvp);
3762				return (error);
3763			}
3764		}
3765		/*
3766		 * Flush directory page containing the inode's name.
3767		 */
3768		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3769		    &bp);
3770		if (error == 0)
3771			error = VOP_BWRITE(bp->b_vp, bp);
3772		vput(pvp);
3773		if (error != 0)
3774			return (error);
3775		ACQUIRE_LOCK(&lk);
3776		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
3777			break;
3778	}
3779	FREE_LOCK(&lk);
3780	return (0);
3781}
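
/*
 * A sketch of the lock ordering used above (illustrative only):
 * vnode locks are taken from the root downward, so to lock the
 * parent while holding the child we must release the child first:
 *
 *	VOP_UNLOCK(vp, 0, p);				(drop child)
 *	error = VFS_VGET(mnt, parentino, &pvp);		(lock parent)
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);	(relock child)
 *
 * Because the child was briefly unlocked, id_pendinghd must be
 * re-examined afterward; dependencies may have been completed or
 * changed while the lock was dropped.
 */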
3782
3783/*
3784 * Flush all the dirty bitmaps associated with the block device
3785 * before flushing the rest of the dirty blocks so as to reduce
3786 * the number of dependencies that will have to be rolled back.
3787 */
3788void
3789softdep_fsync_mountdev(vp)
3790	struct vnode *vp;
3791{
3792	struct buf *bp, *nbp;
3793	struct worklist *wk;
3794
3795	if (!vn_isdisk(vp, NULL))
3796		panic("softdep_fsync_mountdev: vnode not a disk");
3797	ACQUIRE_LOCK(&lk);
3798	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
3799		nbp = TAILQ_NEXT(bp, b_vnbufs);
3800		/*
3801		 * If it is already scheduled, skip to the next buffer.
3802		 */
3803		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
3804			continue;
3805		if ((bp->b_flags & B_DELWRI) == 0)
3806			panic("softdep_fsync_mountdev: not dirty");
3807		/*
3808		 * We are only interested in bitmaps with outstanding
3809		 * dependencies.
3810		 */
3811		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
3812		    wk->wk_type != D_BMSAFEMAP) {
3813			BUF_UNLOCK(bp);
3814			continue;
3815		}
3816		bremfree(bp);
3817		FREE_LOCK(&lk);
3818		(void) bawrite(bp);
3819		ACQUIRE_LOCK(&lk);
3820		/*
3821		 * Since we may have slept during the I/O, we need
3822		 * to start from a known point.
3823		 */
3824		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3825	}
3826	drain_output(vp, 1);
3827	FREE_LOCK(&lk);
3828}
3829
3830/*
3831 * This routine is called when we are trying to synchronously flush a
3832 * file. This routine must eliminate any filesystem metadata dependencies
3833 * so that the syncing routine can succeed by pushing the dirty blocks
3834 * associated with the file. If any I/O errors occur, they are returned.
3835 */
3836int
3837softdep_sync_metadata(ap)
3838	struct vop_fsync_args /* {
3839		struct vnode *a_vp;
3840		struct ucred *a_cred;
3841		int a_waitfor;
3842		struct proc *a_p;
3843	} */ *ap;
3844{
3845	struct vnode *vp = ap->a_vp;
3846	struct pagedep *pagedep;
3847	struct allocdirect *adp;
3848	struct allocindir *aip;
3849	struct buf *bp, *nbp;
3850	struct worklist *wk;
3851	int i, error, waitfor;
3852
3853	/*
3854	 * Check whether this vnode is involved in a filesystem
3855	 * that is doing soft dependency processing.
3856	 */
3857	if (!vn_isdisk(vp, NULL)) {
3858		if (!DOINGSOFTDEP(vp))
3859			return (0);
3860	} else
3861		if (vp->v_specmountpoint == NULL ||
3862		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
3863			return (0);
3864	/*
3865	 * Ensure that any direct block dependencies have been cleared.
3866	 */
3867	ACQUIRE_LOCK(&lk);
3868	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
3869		FREE_LOCK(&lk);
3870		return (error);
3871	}
3872	/*
3873	 * For most files, the only metadata dependencies are the
3874	 * cylinder group maps that allocate their inode or blocks.
3875	 * The block allocation dependencies can be found by traversing
3876	 * the dependency lists for any buffers that remain on their
3877	 * dirty buffer list. The inode allocation dependency will
3878	 * be resolved when the inode is updated with MNT_WAIT.
3879	 * This work is done in two passes. The first pass grabs most
3880	 * of the buffers and begins asynchronously writing them. The
3881	 * only way to wait for these asynchronous writes is to sleep
3882	 * on the filesystem vnode which may stay busy for a long time
3883	 * if the filesystem is active. So, instead, we make a second
3884	 * pass over the dependencies blocking on each write. In the
3885	 * usual case we will be blocking against a write that we
3886	 * initiated, so when it is done the dependency will have been
3887	 * resolved. Thus the second pass is expected to end quickly.
3888	 */
3889	waitfor = MNT_NOWAIT;
3890top:
3891	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
3892		FREE_LOCK(&lk);
3893		return (0);
3894	}
3895	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3896loop:
3897	/*
3898	 * As we hold the buffer locked, none of its dependencies
3899	 * will disappear.
3900	 */
3901	for (wk = LIST_FIRST(&bp->b_dep); wk;
3902	     wk = LIST_NEXT(wk, wk_list)) {
3903		switch (wk->wk_type) {
3904
3905		case D_ALLOCDIRECT:
3906			adp = WK_ALLOCDIRECT(wk);
3907			if (adp->ad_state & DEPCOMPLETE)
3908				break;
3909			nbp = adp->ad_buf;
3910			if (getdirtybuf(&nbp, waitfor) == 0)
3911				break;
3912			FREE_LOCK(&lk);
3913			if (waitfor == MNT_NOWAIT) {
3914				bawrite(nbp);
3915			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3916				bawrite(bp);
3917				return (error);
3918			}
3919			ACQUIRE_LOCK(&lk);
3920			break;
3921
3922		case D_ALLOCINDIR:
3923			aip = WK_ALLOCINDIR(wk);
3924			if (aip->ai_state & DEPCOMPLETE)
3925				break;
3926			nbp = aip->ai_buf;
3927			if (getdirtybuf(&nbp, waitfor) == 0)
3928				break;
3929			FREE_LOCK(&lk);
3930			if (waitfor == MNT_NOWAIT) {
3931				bawrite(nbp);
3932			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3933				bawrite(bp);
3934				return (error);
3935			}
3936			ACQUIRE_LOCK(&lk);
3937			break;
3938
3939		case D_INDIRDEP:
3940		restart:
3941			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
3942			     aip; aip = LIST_NEXT(aip, ai_next)) {
3943				if (aip->ai_state & DEPCOMPLETE)
3944					continue;
3945				nbp = aip->ai_buf;
3946				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
3947					goto restart;
3948				FREE_LOCK(&lk);
3949				if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3950					bawrite(bp);
3951					return (error);
3952				}
3953				ACQUIRE_LOCK(&lk);
3954				goto restart;
3955			}
3956			break;
3957
3958		case D_INODEDEP:
3959			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
3960			    WK_INODEDEP(wk)->id_ino)) != 0) {
3961				FREE_LOCK(&lk);
3962				bawrite(bp);
3963				return (error);
3964			}
3965			break;
3966
3967		case D_PAGEDEP:
3968			/*
3969			 * We are trying to sync a directory that may
3970			 * have dependencies on both its own metadata
3971			 * and/or dependencies on the inodes of any
3972			 * recently allocated files. We walk its diradd
3973			 * lists pushing out the associated inode.
3974			 */
3975			pagedep = WK_PAGEDEP(wk);
3976			for (i = 0; i < DAHASHSZ; i++) {
				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL)
3978					continue;
3979				if ((error =
3980				    flush_pagedep_deps(vp, pagedep->pd_mnt,
3981						&pagedep->pd_diraddhd[i]))) {
3982					FREE_LOCK(&lk);
3983					bawrite(bp);
3984					return (error);
3985				}
3986			}
3987			break;
3988
3989		case D_MKDIR:
3990			/*
3991			 * This case should never happen if the vnode has
3992			 * been properly sync'ed. However, if this function
3993			 * is used at a place where the vnode has not yet
3994			 * been sync'ed, this dependency can show up. So,
3995			 * rather than panic, just flush it.
3996			 */
3997			nbp = WK_MKDIR(wk)->md_buf;
3998			if (getdirtybuf(&nbp, waitfor) == 0)
3999				break;
4000			FREE_LOCK(&lk);
4001			if (waitfor == MNT_NOWAIT) {
4002				bawrite(nbp);
4003			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4004				bawrite(bp);
4005				return (error);
4006			}
4007			ACQUIRE_LOCK(&lk);
4008			break;
4009
4010		case D_BMSAFEMAP:
4011			/*
4012			 * This case should never happen if the vnode has
4013			 * been properly sync'ed. However, if this function
4014			 * is used at a place where the vnode has not yet
4015			 * been sync'ed, this dependency can show up. So,
4016			 * rather than panic, just flush it.
4017			 */
4018			nbp = WK_BMSAFEMAP(wk)->sm_buf;
4019			if (getdirtybuf(&nbp, waitfor) == 0)
4020				break;
4021			FREE_LOCK(&lk);
4022			if (waitfor == MNT_NOWAIT) {
4023				bawrite(nbp);
4024			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4025				bawrite(bp);
4026				return (error);
4027			}
4028			ACQUIRE_LOCK(&lk);
4029			break;
4030
4031		default:
4032			panic("softdep_sync_metadata: Unknown type %s",
4033			    TYPENAME(wk->wk_type));
4034			/* NOTREACHED */
4035		}
4036	}
4037	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4038	nbp = TAILQ_NEXT(bp, b_vnbufs);
4039	FREE_LOCK(&lk);
4040	bawrite(bp);
4041	ACQUIRE_LOCK(&lk);
4042	if (nbp != NULL) {
4043		bp = nbp;
4044		goto loop;
4045	}
4046	/*
4047	 * We must wait for any I/O in progress to finish so that
4048	 * all potential buffers on the dirty list will be visible.
4049	 * Once they are all there, proceed with the second pass
4050	 * which will wait for the I/O as per above.
4051	 */
4052	drain_output(vp, 1);
4053	/*
	 * The brief unlock is to allow any pent-up dependency
	 * processing to be done.
4056	 */
4057	if (waitfor == MNT_NOWAIT) {
4058		waitfor = MNT_WAIT;
4059		FREE_LOCK(&lk);
4060		ACQUIRE_LOCK(&lk);
4061		goto top;
4062	}
4063
4064	/*
4065	 * If we have managed to get rid of all the dirty buffers,
4066	 * then we are done. For certain directories and block
4067	 * devices, we may need to do further work.
4068	 */
4069	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4070		FREE_LOCK(&lk);
4071		return (0);
4072	}
4073
4074	FREE_LOCK(&lk);
4075	/*
4076	 * If we are trying to sync a block device, some of its buffers may
4077	 * contain metadata that cannot be written until the contents of some
4078	 * partially written files have been written to disk. The only easy
4079	 * way to accomplish this is to sync the entire filesystem (luckily
4080	 * this happens rarely).
4081	 */
4082	if (vn_isdisk(vp, NULL) &&
4083	    vp->v_specmountpoint && !VOP_ISLOCKED(vp, NULL) &&
4084	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
4085	     ap->a_p)) != 0)
4086		return (error);
4087	return (0);
4088}
4089
4090/*
4091 * Flush the dependencies associated with an inodedep.
4092 * Called with splbio blocked.
4093 */
4094static int
4095flush_inodedep_deps(fs, ino)
4096	struct fs *fs;
4097	ino_t ino;
4098{
4099	struct inodedep *inodedep;
4100	struct allocdirect *adp;
4101	int error, waitfor;
4102	struct buf *bp;
4103
4104	/*
4105	 * This work is done in two passes. The first pass grabs most
4106	 * of the buffers and begins asynchronously writing them. The
4107	 * only way to wait for these asynchronous writes is to sleep
4108	 * on the filesystem vnode which may stay busy for a long time
4109	 * if the filesystem is active. So, instead, we make a second
4110	 * pass over the dependencies blocking on each write. In the
4111	 * usual case we will be blocking against a write that we
4112	 * initiated, so when it is done the dependency will have been
4113	 * resolved. Thus the second pass is expected to end quickly.
4114	 * We give a brief window at the top of the loop to allow
4115	 * any pending I/O to complete.
4116	 */
4117	for (waitfor = MNT_NOWAIT; ; ) {
4118		FREE_LOCK(&lk);
4119		ACQUIRE_LOCK(&lk);
4120		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4121			return (0);
4122		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4123		     adp = TAILQ_NEXT(adp, ad_next)) {
4124			if (adp->ad_state & DEPCOMPLETE)
4125				continue;
4126			bp = adp->ad_buf;
4127			if (getdirtybuf(&bp, waitfor) == 0) {
4128				if (waitfor == MNT_NOWAIT)
4129					continue;
4130				break;
4131			}
4132			FREE_LOCK(&lk);
4133			if (waitfor == MNT_NOWAIT) {
4134				bawrite(bp);
4135			} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
4136				ACQUIRE_LOCK(&lk);
4137				return (error);
4138			}
4139			ACQUIRE_LOCK(&lk);
4140			break;
4141		}
4142		if (adp != NULL)
4143			continue;
4144		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
4145		     adp = TAILQ_NEXT(adp, ad_next)) {
4146			if (adp->ad_state & DEPCOMPLETE)
4147				continue;
4148			bp = adp->ad_buf;
4149			if (getdirtybuf(&bp, waitfor) == 0) {
4150				if (waitfor == MNT_NOWAIT)
4151					continue;
4152				break;
4153			}
4154			FREE_LOCK(&lk);
4155			if (waitfor == MNT_NOWAIT) {
4156				bawrite(bp);
4157			} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
4158				ACQUIRE_LOCK(&lk);
4159				return (error);
4160			}
4161			ACQUIRE_LOCK(&lk);
4162			break;
4163		}
4164		if (adp != NULL)
4165			continue;
4166		/*
		 * If this was pass 2, we are done; otherwise, do pass 2.
4168		 */
4169		if (waitfor == MNT_WAIT)
4170			break;
4171		waitfor = MNT_WAIT;
4172	}
4173	/*
4174	 * Try freeing inodedep in case all dependencies have been removed.
4175	 */
4176	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4177		(void) free_inodedep(inodedep);
4178	return (0);
4179}
4180
4181/*
4182 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4183 * Called with splbio blocked.
4184 */
4185static int
4186flush_pagedep_deps(pvp, mp, diraddhdp)
4187	struct vnode *pvp;
4188	struct mount *mp;
4189	struct diraddhd *diraddhdp;
4190{
4191	struct proc *p = CURPROC;	/* XXX */
4192	struct inodedep *inodedep;
4193	struct ufsmount *ump;
4194	struct diradd *dap;
4195	struct vnode *vp;
4196	int gotit, error = 0;
4197	struct buf *bp;
4198	ino_t inum;
4199
4200	ump = VFSTOUFS(mp);
4201	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4202		/*
4203		 * Flush ourselves if this directory entry
4204		 * has a MKDIR_PARENT dependency.
4205		 */
4206		if (dap->da_state & MKDIR_PARENT) {
4207			FREE_LOCK(&lk);
4208			if ((error = UFS_UPDATE(pvp, 1)) != 0)
4209				break;
4210			ACQUIRE_LOCK(&lk);
4211			/*
4212			 * If that cleared dependencies, go on to next.
4213			 */
4214			if (dap != LIST_FIRST(diraddhdp))
4215				continue;
4216			if (dap->da_state & MKDIR_PARENT)
4217				panic("flush_pagedep_deps: MKDIR_PARENT");
4218		}
4219		/*
4220		 * A newly allocated directory must have its "." and
4221		 * ".." entries written out before its name can be
4222		 * committed in its parent. We do not want or need
4223		 * the full semantics of a synchronous VOP_FSYNC as
4224		 * that may end up here again, once for each directory
4225		 * level in the filesystem. Instead, we push the blocks
4226		 * and wait for them to clear. We have to fsync twice
4227		 * because the first call may choose to defer blocks
4228		 * that still have dependencies, but deferral will
4229		 * happen at most once.
4230		 */
4231		inum = dap->da_newinum;
4232		if (dap->da_state & MKDIR_BODY) {
4233			FREE_LOCK(&lk);
4234			if ((error = VFS_VGET(mp, inum, &vp)) != 0)
4235				break;
			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
			    (error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
4238				vput(vp);
4239				break;
4240			}
4241			drain_output(vp, 0);
4242			vput(vp);
4243			ACQUIRE_LOCK(&lk);
4244			/*
4245			 * If that cleared dependencies, go on to next.
4246			 */
4247			if (dap != LIST_FIRST(diraddhdp))
4248				continue;
4249			if (dap->da_state & MKDIR_BODY)
4250				panic("flush_pagedep_deps: MKDIR_BODY");
4251		}
4252		/*
4253		 * Flush the inode on which the directory entry depends.
4254		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
4255		 * the only remaining dependency is that the updated inode
4256		 * count must get pushed to disk. The inode has already
4257		 * been pushed into its inode buffer (via VOP_UPDATE) at
4258		 * the time of the reference count change. So we need only
4259		 * locate that buffer, ensure that there will be no rollback
4260		 * caused by a bitmap dependency, then write the inode buffer.
4261		 */
4262		if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0)
4263			panic("flush_pagedep_deps: lost inode");
4264		/*
4265		 * If the inode still has bitmap dependencies,
4266		 * push them to disk.
4267		 */
4268		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4269			gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4270			FREE_LOCK(&lk);
4271			if (gotit &&
4272			    (error = VOP_BWRITE(inodedep->id_buf->b_vp,
4273			     inodedep->id_buf)) != 0)
4274				break;
4275			ACQUIRE_LOCK(&lk);
4276			if (dap != LIST_FIRST(diraddhdp))
4277				continue;
4278		}
4279		/*
4280		 * If the inode is still sitting in a buffer waiting
4281		 * to be written, push it to disk.
4282		 */
4283		FREE_LOCK(&lk);
4284		if ((error = bread(ump->um_devvp,
4285		    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4286		    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4287			break;
4288		if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0)
4289			break;
4290		ACQUIRE_LOCK(&lk);
4291		/*
4292		 * If we have failed to get rid of all the dependencies
4293		 * then something is seriously wrong.
4294		 */
4295		if (dap == LIST_FIRST(diraddhdp))
4296			panic("flush_pagedep_deps: flush failed");
4297	}
4298	if (error)
4299		ACQUIRE_LOCK(&lk);
4300	return (error);
4301}
4302
4303/*
4304 * A large burst of file addition or deletion activity can drive the
4305 * memory load excessively high. Therefore we deliberately slow things
4306 * down and speed up the I/O processing if we find ourselves with too
4307 * many dependencies in progress.
4308 */
4309static int
4310request_cleanup(resource, islocked)
4311	int resource;
4312	int islocked;
4313{
4314	struct callout_handle handle;
4315	struct proc *p = CURPROC;
4316
4317	/*
4318	 * We never hold up the filesystem syncer process.
4319	 */
4320	if (p == filesys_syncer)
4321		return (0);
4322	/*
4323	 * If we are resource constrained on inode dependencies, try
4324	 * flushing some dirty inodes. Otherwise, we are constrained
4325	 * by file deletions, so try accelerating flushes of directories
4326	 * with removal dependencies. We would like to do the cleanup
4327	 * here, but we probably hold an inode locked at this point and
4328	 * that might deadlock against one that we try to clean. So,
4329	 * the best that we can do is request the syncer daemon to do
4330	 * the cleanup for us.
4331	 */
4332	switch (resource) {
4333
4334	case FLUSH_INODES:
4335		stat_ino_limit_push += 1;
4336		req_clear_inodedeps = 1;
4337		break;
4338
4339	case FLUSH_REMOVE:
4340		stat_blk_limit_push += 1;
4341		req_clear_remove = 1;
4342		break;
4343
4344	default:
4345		panic("request_cleanup: unknown type");
4346	}
4347	/*
4348	 * Hopefully the syncer daemon will catch up and awaken us.
	 * We wait at most tickdelay ticks before proceeding in any case.
4350	 */
4351	if (islocked == 0)
4352		ACQUIRE_LOCK(&lk);
4353	if (proc_waiting == 0) {
4354		proc_waiting = 1;
4355		handle = timeout(pause_timer, NULL,
4356		    tickdelay > 2 ? tickdelay : 2);
4357	}
4358	FREE_LOCK_INTERLOCKED(&lk);
4359	(void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
4360	ACQUIRE_LOCK_INTERLOCKED(&lk);
4361	if (proc_waiting) {
4362		untimeout(pause_timer, NULL, handle);
4363		proc_waiting = 0;
4364	} else {
4365		switch (resource) {
4366
4367		case FLUSH_INODES:
4368			stat_ino_limit_hit += 1;
4369			break;
4370
4371		case FLUSH_REMOVE:
4372			stat_blk_limit_hit += 1;
4373			break;
4374		}
4375	}
4376	if (islocked == 0)
4377		FREE_LOCK(&lk);
4378	return (1);
4379}
4380
4381/*
4382 * Awaken processes pausing in request_cleanup and clear proc_waiting
4383 * to indicate that there is no longer a timer running.
4384 */
4385void
4386pause_timer(arg)
4387	void *arg;
4388{
4389
4390	proc_waiting = 0;
4391	wakeup(&proc_waiting);
4392}
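
/*
 * A sketch of the proc_waiting protocol above (illustrative only):
 * request_cleanup() arms a timeout and sleeps. It may be awakened
 * either by the syncer daemon, which leaves proc_waiting set so the
 * timer must be disarmed with untimeout(), or by pause_timer()
 * itself, which has already cleared proc_waiting, indicating that
 * the full tickdelay elapsed and that the corresponding limit-hit
 * statistic should be charged.
 */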
4393
4394/*
4395 * Flush out a directory with at least one removal dependency in an effort to
4396 * reduce the number of dirrem, freefile, and freeblks dependency structures.
4397 */
4398static void
4399clear_remove(p)
4400	struct proc *p;
4401{
4402	struct pagedep_hashhead *pagedephd;
4403	struct pagedep *pagedep;
4404	static int next = 0;
4405	struct mount *mp;
4406	struct vnode *vp;
4407	int error, cnt;
4408	ino_t ino;
4409
4410	ACQUIRE_LOCK(&lk);
4411	for (cnt = 0; cnt < pagedep_hash; cnt++) {
4412		pagedephd = &pagedep_hashtbl[next++];
4413		if (next >= pagedep_hash)
4414			next = 0;
4415		for (pagedep = LIST_FIRST(pagedephd); pagedep;
4416		     pagedep = LIST_NEXT(pagedep, pd_hash)) {
4417			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4418				continue;
4419			mp = pagedep->pd_mnt;
4420			ino = pagedep->pd_ino;
4421			FREE_LOCK(&lk);
4422			if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4423				softdep_error("clear_remove: vget", error);
4424				return;
4425			}
4426			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4427				softdep_error("clear_remove: fsync", error);
4428			drain_output(vp, 0);
4429			vput(vp);
4430			return;
4431		}
4432	}
4433	FREE_LOCK(&lk);
4434}
4435
4436/*
4437 * Clear out a block of dirty inodes in an effort to reduce
4438 * the number of inodedep dependency structures.
4439 */
4440static void
4441clear_inodedeps(p)
4442	struct proc *p;
4443{
4444	struct inodedep_hashhead *inodedephd;
4445	struct inodedep *inodedep;
4446	static int next = 0;
4447	struct mount *mp;
4448	struct vnode *vp;
4449	struct fs *fs;
4450	int error, cnt;
4451	ino_t firstino, lastino, ino;
4452
4453	ACQUIRE_LOCK(&lk);
4454	/*
	 * Pick an inode dependency to be cleared, cycling round-robin
	 * through the hash buckets. We will then gather up all the
	 * inodes in its block that have dependencies and flush them out.
4458	 */
4459	for (cnt = 0; cnt < inodedep_hash; cnt++) {
4460		inodedephd = &inodedep_hashtbl[next++];
4461		if (next >= inodedep_hash)
4462			next = 0;
4463		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
4464			break;
	}
	if (inodedep == NULL) {
		/* No inode dependencies remain to be cleared. */
		FREE_LOCK(&lk);
		return;
	}
4466	/*
4467	 * Ugly code to find mount point given pointer to superblock.
4468	 */
4469	fs = inodedep->id_fs;
4470	TAILQ_FOREACH(mp, &mountlist, mnt_list)
4471		if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
4472			break;
4473	/*
4474	 * Find the last inode in the block with dependencies.
4475	 */
4476	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
4477	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
4478		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
4479			break;
4480	/*
4481	 * Asynchronously push all but the last inode with dependencies.
4482	 * Synchronously push the last inode with dependencies to ensure
4483	 * that the inode block gets written to free up the inodedeps.
4484	 */
4485	for (ino = firstino; ino <= lastino; ino++) {
4486		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4487			continue;
4488		FREE_LOCK(&lk);
4489		if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4490			softdep_error("clear_inodedeps: vget", error);
4491			return;
4492		}
4493		if (ino == lastino) {
4494			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
4495				softdep_error("clear_inodedeps: fsync1", error);
4496		} else {
4497			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4498				softdep_error("clear_inodedeps: fsync2", error);
4499			drain_output(vp, 0);
4500		}
4501		vput(vp);
4502		ACQUIRE_LOCK(&lk);
4503	}
4504	FREE_LOCK(&lk);
4505}
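
/*
 * A worked example of the inode-block arithmetic above (illustrative
 * only, assuming INOPB(fs) == 64): for id_ino == 200,
 * firstino = 200 & ~63 = 192, so the block spans inodes 192..255;
 * lastino scans down from 255 to the last inode in the block that
 * still has an inodedep, and only that final inode is flushed with
 * MNT_WAIT to force the shared inode block to disk.
 */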
4506
4507/*
4508 * Function to determine if the buffer has outstanding dependencies
4509 * that will cause a roll-back if the buffer is written. If wantcount
4510 * is set, return number of dependencies, otherwise just yes or no.
4511 */
4512static int
4513softdep_count_dependencies(bp, wantcount)
4514	struct buf *bp;
4515	int wantcount;
4516{
4517	struct worklist *wk;
4518	struct inodedep *inodedep;
4519	struct indirdep *indirdep;
4520	struct allocindir *aip;
4521	struct pagedep *pagedep;
4522	struct diradd *dap;
4523	int i, retval;
4524
4525	retval = 0;
4526	ACQUIRE_LOCK(&lk);
4527	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) {
4528		switch (wk->wk_type) {
4529
4530		case D_INODEDEP:
4531			inodedep = WK_INODEDEP(wk);
4532			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4533				/* bitmap allocation dependency */
4534				retval += 1;
4535				if (!wantcount)
4536					goto out;
4537			}
4538			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
4539				/* direct block pointer dependency */
4540				retval += 1;
4541				if (!wantcount)
4542					goto out;
4543			}
4544			continue;
4545
4546		case D_INDIRDEP:
4547			indirdep = WK_INDIRDEP(wk);
4548			for (aip = LIST_FIRST(&indirdep->ir_deplisthd);
4549			     aip; aip = LIST_NEXT(aip, ai_next)) {
4550				/* indirect block pointer dependency */
4551				retval += 1;
4552				if (!wantcount)
4553					goto out;
4554			}
4555			continue;
4556
4557		case D_PAGEDEP:
4558			pagedep = WK_PAGEDEP(wk);
4559			for (i = 0; i < DAHASHSZ; i++) {
4560				for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]);
4561				     dap; dap = LIST_NEXT(dap, da_pdlist)) {
4562					/* directory entry dependency */
4563					retval += 1;
4564					if (!wantcount)
4565						goto out;
4566				}
4567			}
4568			continue;
4569
4570		case D_BMSAFEMAP:
4571		case D_ALLOCDIRECT:
4572		case D_ALLOCINDIR:
4573		case D_MKDIR:
4574			/* never a dependency on these blocks */
4575			continue;
4576
4577		default:
			panic("softdep_count_dependencies: Unexpected type %s",
4579			    TYPENAME(wk->wk_type));
4580			/* NOTREACHED */
4581		}
4582	}
4583out:
4584	FREE_LOCK(&lk);
	return (retval);
4586}
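
/*
 * Typical use (a sketch, not a prescribed interface): code choosing
 * buffers to flush can prefer ones that will not roll back:
 *
 *	if (softdep_count_dependencies(bp, 0) == 0)
 *		(writing bp now yields its final contents)
 *	else
 *		(writing bp now would undo some pointers and the
 *		 buffer would be redirtied after the write)
 */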
4587
4588/*
4589 * Acquire exclusive access to a buffer.
4590 * Must be called with splbio blocked.
4591 * Return 1 if buffer was acquired.
4592 */
4593static int
4594getdirtybuf(bpp, waitfor)
4595	struct buf **bpp;
4596	int waitfor;
4597{
4598	struct buf *bp;
4599
4600	for (;;) {
4601		if ((bp = *bpp) == NULL)
4602			return (0);
4603		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
4604			if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
4605				break;
4606			BUF_UNLOCK(bp);
4607			if (waitfor != MNT_WAIT)
4608				return (0);
4609			bp->b_xflags |= BX_BKGRDWAIT;
4610			FREE_LOCK_INTERLOCKED(&lk);
4611			tsleep(&bp->b_xflags, PRIBIO, "getbuf", 0);
4612			ACQUIRE_LOCK_INTERLOCKED(&lk);
4613			continue;
4614		}
4615		if (waitfor != MNT_WAIT)
4616			return (0);
4617		FREE_LOCK_INTERLOCKED(&lk);
4618		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK)
4619			panic("getdirtybuf: inconsistent lock");
4620		ACQUIRE_LOCK_INTERLOCKED(&lk);
4621	}
4622	if ((bp->b_flags & B_DELWRI) == 0) {
4623		BUF_UNLOCK(bp);
4624		return (0);
4625	}
4626	bremfree(bp);
4627	return (1);
4628}
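
/*
 * Typical use (a sketch, assuming the caller holds the softdep lock
 * at splbio): acquire a dependency's buffer and push it to disk;
 * getdirtybuf() has already locked it and removed it from its queue:
 *
 *	bp = adp->ad_buf;
 *	if (getdirtybuf(&bp, MNT_WAIT)) {
 *		FREE_LOCK(&lk);
 *		bawrite(bp);
 *		ACQUIRE_LOCK(&lk);
 *	}
 */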
4629
4630/*
4631 * Wait for pending output on a vnode to complete.
4632 * Must be called with vnode locked.
4633 */
4634static void
4635drain_output(vp, islocked)
4636	struct vnode *vp;
4637	int islocked;
4638{
4639
4640	if (!islocked)
4641		ACQUIRE_LOCK(&lk);
4642	while (vp->v_numoutput) {
4643		vp->v_flag |= VBWAIT;
4644		FREE_LOCK_INTERLOCKED(&lk);
4645		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0);
4646		ACQUIRE_LOCK_INTERLOCKED(&lk);
4647	}
4648	if (!islocked)
4649		FREE_LOCK(&lk);
4650}
4651
4652/*
4653 * Called whenever a buffer that is being invalidated or reallocated
4654 * contains dependencies. This should only happen if an I/O error has
4655 * occurred. The routine is called with the buffer locked.
4656 */
4657static void
4658softdep_deallocate_dependencies(bp)
4659	struct buf *bp;
4660{
4661
4662	if ((bp->b_flags & B_ERROR) == 0)
4663		panic("softdep_deallocate_dependencies: dangling deps");
4664	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
4665	panic("softdep_deallocate_dependencies: unrecovered I/O error");
4666}
4667
4668/*
4669 * Function to handle asynchronous write errors in the filesystem.
4670 */
4671void
4672softdep_error(func, error)
4673	char *func;
4674	int error;
4675{
4676
4677	/* XXX should do something better! */
4678	printf("%s: got error %d while accessing filesystem\n", func, error);
4679}
4680