ffs_softdep.c revision 98542
1/*
2 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * The soft updates code is derived from the appendix of a University
5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6 * "Soft Updates: A Solution to the Metadata Update Problem in File
7 * Systems", CSE-TR-254-95, August 1995).
8 *
9 * Further information about soft updates can be obtained from:
10 *
11 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
12 *	1614 Oxford Street		mckusick@mckusick.com
13 *	Berkeley, CA 94709-1608		+1-510-843-9542
14 *	USA
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 *
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 *
26 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
27 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
30 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 98542 2002-06-21 06:18:05Z mckusick $");
43
44/*
45 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
46 */
47#ifndef DIAGNOSTIC
48#define DIAGNOSTIC
49#endif
50#ifndef DEBUG
51#define DEBUG
52#endif
53
54#include <sys/param.h>
55#include <sys/kernel.h>
56#include <sys/systm.h>
57#include <sys/stdint.h>
58#include <sys/bio.h>
59#include <sys/buf.h>
60#include <sys/malloc.h>
61#include <sys/mount.h>
62#include <sys/proc.h>
63#include <sys/stat.h>
64#include <sys/syslog.h>
65#include <sys/vnode.h>
66#include <sys/conf.h>
67#include <ufs/ufs/dir.h>
68#include <ufs/ufs/extattr.h>
69#include <ufs/ufs/quota.h>
70#include <ufs/ufs/inode.h>
71#include <ufs/ufs/ufsmount.h>
72#include <ufs/ffs/fs.h>
73#include <ufs/ffs/softdep.h>
74#include <ufs/ffs/ffs_extern.h>
75#include <ufs/ufs/ufs_extern.h>
76
77/*
78 * These definitions need to be adapted to the system to which
79 * this file is being ported.
80 */
81/*
82 * malloc types defined for the softdep system.
83 */
84static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
85static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
86static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
87static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
88static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
89static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
90static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
91static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
92static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
93static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
94static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
95static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
96static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
97static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
98
99#define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)
100
101#define	D_PAGEDEP	0
102#define	D_INODEDEP	1
103#define	D_NEWBLK	2
104#define	D_BMSAFEMAP	3
105#define	D_ALLOCDIRECT	4
106#define	D_INDIRDEP	5
107#define	D_ALLOCINDIR	6
108#define	D_FREEFRAG	7
109#define	D_FREEBLKS	8
110#define	D_FREEFILE	9
111#define	D_DIRADD	10
112#define	D_MKDIR		11
113#define	D_DIRREM	12
114#define	D_NEWDIRBLK	13
115#define	D_LAST		D_NEWDIRBLK
116
117/*
118 * translate from workitem type to memory type
119 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
120 */
121static struct malloc_type *memtype[] = {
122	M_PAGEDEP,
123	M_INODEDEP,
124	M_NEWBLK,
125	M_BMSAFEMAP,
126	M_ALLOCDIRECT,
127	M_INDIRDEP,
128	M_ALLOCINDIR,
129	M_FREEFRAG,
130	M_FREEBLKS,
131	M_FREEFILE,
132	M_DIRADD,
133	M_MKDIR,
134	M_DIRREM,
135	M_NEWDIRBLK
136};
137
138#define DtoM(type) (memtype[type])
139
140/*
141 * Names of malloc types.
142 */
143#define TYPENAME(type)  \
144	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
145/*
146 * End system adaptation definitions.
147 */
148
149/*
150 * Internal function prototypes.
151 */
152static	void softdep_error(char *, int);
153static	void drain_output(struct vnode *, int);
154static	int getdirtybuf(struct buf **, int);
155static	void clear_remove(struct thread *);
156static	void clear_inodedeps(struct thread *);
157static	int flush_pagedep_deps(struct vnode *, struct mount *,
158	    struct diraddhd *);
159static	int flush_inodedep_deps(struct fs *, ino_t);
160static	int handle_written_filepage(struct pagedep *, struct buf *);
161static  void diradd_inode_written(struct diradd *, struct inodedep *);
162static	int handle_written_inodeblock(struct inodedep *, struct buf *);
163static	void handle_allocdirect_partdone(struct allocdirect *);
164static	void handle_allocindir_partdone(struct allocindir *);
165static	void initiate_write_filepage(struct pagedep *, struct buf *);
166static	void handle_written_mkdir(struct mkdir *, int);
167static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
168static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
169static	void handle_workitem_freefile(struct freefile *);
170static	void handle_workitem_remove(struct dirrem *, struct vnode *);
171static	struct dirrem *newdirrem(struct buf *, struct inode *,
172	    struct inode *, int, struct dirrem **);
173static	void free_diradd(struct diradd *);
174static	void free_allocindir(struct allocindir *, struct inodedep *);
175static	void free_newdirblk(struct newdirblk *);
176static	int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
177	    ufs2_daddr_t *);
178static	void deallocate_dependencies(struct buf *, struct inodedep *);
179static	void free_allocdirect(struct allocdirectlst *,
180	    struct allocdirect *, int);
181static	int check_inode_unwritten(struct inodedep *);
182static	int free_inodedep(struct inodedep *);
183static	void handle_workitem_freeblocks(struct freeblks *, int);
184static	void merge_inode_lists(struct inodedep *);
185static	void setup_allocindir_phase2(struct buf *, struct inode *,
186	    struct allocindir *);
187static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
188	    ufs2_daddr_t);
189static	void handle_workitem_freefrag(struct freefrag *);
190static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
191static	void allocdirect_merge(struct allocdirectlst *,
192	    struct allocdirect *, struct allocdirect *);
193static	struct bmsafemap *bmsafemap_lookup(struct buf *);
194static	int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
195static	int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
196static	int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
197static	void pause_timer(void *);
198static	int request_cleanup(int, int);
199static	int process_worklist_item(struct mount *, int);
200static	void add_to_worklist(struct worklist *);
201
202/*
203 * Exported softdep operations.
204 */
205static	void softdep_disk_io_initiation(struct buf *);
206static	void softdep_disk_write_complete(struct buf *);
207static	void softdep_deallocate_dependencies(struct buf *);
208static	void softdep_move_dependencies(struct buf *, struct buf *);
209static	int softdep_count_dependencies(struct buf *bp, int);
210
211/*
212 * Locking primitives.
213 *
214 * For a uniprocessor, all we need to do is protect against disk
215 * interrupts. For a multiprocessor, this lock would have to be
216 * a mutex. A single mutex is used throughout this file, though
217 * finer grain locking could be used if contention warranted it.
218 *
219 * For a multiprocessor, the sleep call would accept a lock and
220 * release it after the sleep processing was complete. In a uniprocessor
221 * implementation there is no such interlock, so we simply mark
222 * the places where it needs to be done with the `interlocked' form
223 * of the lock calls. Since the uniprocessor sleep already interlocks
224 * the spl, there is nothing that really needs to be done.
225 */
226#ifndef /* NOT */ DEBUG
227static struct lockit {
228	int	lkt_spl;
229} lk = { 0 };
230#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
231#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
232
233#else /* DEBUG */
234#define NOHOLDER	((struct thread *)-1)
235#define SPECIAL_FLAG	((struct thread *)-2)
236static struct lockit {
237	int	lkt_spl;
238	struct	thread *lkt_held;
239} lk = { 0, NOHOLDER };
240static int lockcnt;
241
242static	void acquire_lock(struct lockit *);
243static	void free_lock(struct lockit *);
244void	softdep_panic(char *);
245
246#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
247#define FREE_LOCK(lk)			free_lock(lk)
248
249static void
250acquire_lock(lk)
251	struct lockit *lk;
252{
253	struct thread *holder;
254
255	if (lk->lkt_held != NOHOLDER) {
256		holder = lk->lkt_held;
257		FREE_LOCK(lk);
258		if (holder == curthread)
259			panic("softdep_lock: locking against myself");
260		else
261			panic("softdep_lock: lock held by %p", holder);
262	}
263	lk->lkt_spl = splbio();
264	lk->lkt_held = curthread;
265	lockcnt++;
266}
267
268static void
269free_lock(lk)
270	struct lockit *lk;
271{
272
273	if (lk->lkt_held == NOHOLDER)
274		panic("softdep_unlock: lock not held");
275	lk->lkt_held = NOHOLDER;
276	splx(lk->lkt_spl);
277}
278
279/*
280 * Function to release soft updates lock and panic.
281 */
282void
283softdep_panic(msg)
284	char *msg;
285{
286
287	if (lk.lkt_held != NOHOLDER)
288		FREE_LOCK(&lk);
289	panic(msg);
290}
291#endif /* DEBUG */
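/*
 * Illustrative sketch (not part of the original file): the locking pattern
 * the rest of this file follows when manipulating dependency structures.
 * Error paths drop the lock before calling panic() so that the message can
 * be reported cleanly.
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (invariant_violated) {
 *		FREE_LOCK(&lk);
 *		panic("example: invariant violated");
 *	}
 *	... modify dependency lists ...
 *	FREE_LOCK(&lk);
 */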
292
293static	int interlocked_sleep(struct lockit *, int, void *, int,
294	    const char *, int);
295
296/*
297 * When going to sleep, we must save our SPL so that it does
298 * not get lost if some other process uses the lock while we
299 * are sleeping. We restore it after we have slept. This routine
300 * wraps the interlocking with functions that sleep. The list
301 * below enumerates the available set of operations.
302 */
303#define	UNKNOWN		0
304#define	SLEEP		1
305#define	LOCKBUF		2
306
307static int
308interlocked_sleep(lk, op, ident, flags, wmesg, timo)
309	struct lockit *lk;
310	int op;
311	void *ident;
312	int flags;
313	const char *wmesg;
314	int timo;
315{
316	struct thread *holder;
317	int s, retval;
318
319	s = lk->lkt_spl;
320#	ifdef DEBUG
321	if (lk->lkt_held == NOHOLDER)
322		panic("interlocked_sleep: lock not held");
323	lk->lkt_held = NOHOLDER;
324#	endif /* DEBUG */
325	switch (op) {
326	case SLEEP:
327		retval = tsleep(ident, flags, wmesg, timo);
328		break;
329	case LOCKBUF:
330		retval = BUF_LOCK((struct buf *)ident, flags);
331		break;
332	default:
333		panic("interlocked_sleep: unknown operation");
334	}
335#	ifdef DEBUG
336	if (lk->lkt_held != NOHOLDER) {
337		holder = lk->lkt_held;
338		FREE_LOCK(lk);
339		if (holder == curthread)
340			panic("interlocked_sleep: locking against self");
341		else
342			panic("interlocked_sleep: lock held by %p", holder);
343	}
344	lk->lkt_held = curthread;
345	lockcnt++;
346#	endif /* DEBUG */
347	lk->lkt_spl = s;
348	return (retval);
349}
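/*
 * Illustrative sketch (not part of the original file): any sleep done while
 * the soft updates lock is held must go through interlocked_sleep() so that
 * the saved spl (and, under DEBUG, the recorded holder) is carried across
 * the sleep correctly.  sema_get() below is the canonical caller; the shape
 * of such a wait is roughly
 *
 *	ACQUIRE_LOCK(&lk);
 *	while (resource_busy)
 *		interlocked_sleep(&lk, SLEEP, &resource_busy, PRIBIO,
 *		    "example", 0);
 *	FREE_LOCK(&lk);
 *
 * where "resource_busy" is a hypothetical wait channel.
 */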
350
351/*
352 * Placeholder for real semaphores.
353 */
354struct sema {
355	int	value;
356	struct	thread *holder;
357	char	*name;
358	int	prio;
359	int	timo;
360};
361static	void sema_init(struct sema *, char *, int, int);
362static	int sema_get(struct sema *, struct lockit *);
363static	void sema_release(struct sema *);
364
365static void
366sema_init(semap, name, prio, timo)
367	struct sema *semap;
368	char *name;
369	int prio, timo;
370{
371
372	semap->holder = NOHOLDER;
373	semap->value = 0;
374	semap->name = name;
375	semap->prio = prio;
376	semap->timo = timo;
377}
378
379static int
380sema_get(semap, interlock)
381	struct sema *semap;
382	struct lockit *interlock;
383{
384
385	if (semap->value++ > 0) {
386		if (interlock != NULL) {
387			interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
388			    semap->prio, semap->name, semap->timo);
389			FREE_LOCK(interlock);
390		} else {
391			tsleep((caddr_t)semap, semap->prio, semap->name,
392			    semap->timo);
393		}
394		return (0);
395	}
396	semap->holder = curthread;
397	if (interlock != NULL)
398		FREE_LOCK(interlock);
399	return (1);
400}
401
402static void
403sema_release(semap)
404	struct sema *semap;
405{
406
407	if (semap->value <= 0 || semap->holder != curthread) {
408		if (lk.lkt_held != NOHOLDER)
409			FREE_LOCK(&lk);
410		panic("sema_release: not held");
411	}
412	if (--semap->value > 0) {
413		semap->value = 0;
414		wakeup(semap);
415	}
416	semap->holder = NOHOLDER;
417}
418
419/*
420 * Worklist queue management.
421 * These routines require that the lock be held.
422 */
423#ifndef /* NOT */ DEBUG
424#define WORKLIST_INSERT(head, item) do {	\
425	(item)->wk_state |= ONWORKLIST;		\
426	LIST_INSERT_HEAD(head, item, wk_list);	\
427} while (0)
428#define WORKLIST_REMOVE(item) do {		\
429	(item)->wk_state &= ~ONWORKLIST;	\
430	LIST_REMOVE(item, wk_list);		\
431} while (0)
432#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
433
434#else /* DEBUG */
435static	void worklist_insert(struct workhead *, struct worklist *);
436static	void worklist_remove(struct worklist *);
437static	void workitem_free(struct worklist *, int);
438
439#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
440#define WORKLIST_REMOVE(item) worklist_remove(item)
441#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
442
443static void
444worklist_insert(head, item)
445	struct workhead *head;
446	struct worklist *item;
447{
448
449	if (lk.lkt_held == NOHOLDER)
450		panic("worklist_insert: lock not held");
451	if (item->wk_state & ONWORKLIST) {
452		FREE_LOCK(&lk);
453		panic("worklist_insert: already on list");
454	}
455	item->wk_state |= ONWORKLIST;
456	LIST_INSERT_HEAD(head, item, wk_list);
457}
458
459static void
460worklist_remove(item)
461	struct worklist *item;
462{
463
464	if (lk.lkt_held == NOHOLDER)
465		panic("worklist_remove: lock not held");
466	if ((item->wk_state & ONWORKLIST) == 0) {
467		FREE_LOCK(&lk);
468		panic("worklist_remove: not on list");
469	}
470	item->wk_state &= ~ONWORKLIST;
471	LIST_REMOVE(item, wk_list);
472}
473
474static void
475workitem_free(item, type)
476	struct worklist *item;
477	int type;
478{
479
480	if (item->wk_state & ONWORKLIST) {
481		if (lk.lkt_held != NOHOLDER)
482			FREE_LOCK(&lk);
483		panic("workitem_free: still on list");
484	}
485	if (item->wk_type != type) {
486		if (lk.lkt_held != NOHOLDER)
487			FREE_LOCK(&lk);
488		panic("workitem_free: type mismatch");
489	}
490	FREE(item, DtoM(type));
491}
492#endif /* DEBUG */
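/*
 * Illustrative sketch (not part of the original file): a work item spends
 * its life attached to some list, typically the b_dep list of the buffer
 * whose write will satisfy the dependency, and is detached and freed under
 * the soft updates lock.  Using a pagedep as the example:
 *
 *	ACQUIRE_LOCK(&lk);
 *	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 *	...
 *	WORKLIST_REMOVE(&pagedep->pd_list);
 *	WORKITEM_FREE(pagedep, D_PAGEDEP);
 *	FREE_LOCK(&lk);
 */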
493
494/*
495 * Workitem queue management
496 */
497static struct workhead softdep_workitem_pending;
498static int num_on_worklist;	/* number of worklist items to be processed */
499static int softdep_worklist_busy; /* -1 => unmount/flush in progress */
500static int softdep_worklist_req; /* serialized waiters */
501static int max_softdeps;	/* maximum number of structs before slowdown */
502static int tickdelay = 2;	/* number of ticks to pause during slowdown */
503static int proc_waiting;	/* tracks whether we have a timeout posted */
504static int *stat_countp;	/* statistic to count in proc_waiting timeout */
505static struct callout_handle handle; /* handle on posted proc_waiting timeout */
506static struct thread *filesys_syncer; /* proc of filesystem syncer process */
507static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
508#define FLUSH_INODES		1
509static int req_clear_remove;	/* syncer process flush some freeblks */
510#define FLUSH_REMOVE		2
511#define FLUSH_REMOVE_WAIT	3
512/*
513 * runtime statistics
514 */
515static int stat_worklist_push;	/* number of worklist cleanups */
516static int stat_blk_limit_push;	/* number of times block limit neared */
517static int stat_ino_limit_push;	/* number of times inode limit neared */
518static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
519static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
520static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
521static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
522static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
523static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
524static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
525#ifdef DEBUG
526#include <vm/vm.h>
527#include <sys/sysctl.h>
528SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
529SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
530SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
531SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
532SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
533SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
534SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
535SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
536SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
537SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
538SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
539SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
540#endif /* DEBUG */
541
542/*
543 * Add an item to the end of the work queue.
544 * This routine requires that the lock be held.
545 * This is the only routine that adds items to the list.
546 * The following routine is the only one that removes items
547 * and does so in order from first to last.
548 */
549static void
550add_to_worklist(wk)
551	struct worklist *wk;
552{
553	static struct worklist *worklist_tail;
554
555	if (wk->wk_state & ONWORKLIST) {
556		if (lk.lkt_held != NOHOLDER)
557			FREE_LOCK(&lk);
558		panic("add_to_worklist: already on list");
559	}
560	wk->wk_state |= ONWORKLIST;
561	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
562		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
563	else
564		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
565	worklist_tail = wk;
566	num_on_worklist += 1;
567}
568
569/*
570 * Process that runs once per second to handle items in the background queue.
571 *
572 * Note that we ensure that items are processed in the order in which they
573 * appear in the queue. The code below depends on this property to ensure
574 * that blocks of a file are freed before the inode itself is freed. This
575 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
576 * until all the old ones have been purged from the dependency lists.
577 */
578int
579softdep_process_worklist(matchmnt)
580	struct mount *matchmnt;
581{
582	struct thread *td = curthread;
583	int cnt, matchcnt, loopcount;
584	long starttime;
585
586	/*
587	 * Record the thread of our caller so that we can give it
588	 * preferential treatment in request_cleanup below.
589	 */
590	filesys_syncer = td;
591	matchcnt = 0;
592
593	/*
594	 * There is no danger of having multiple processes run this
595	 * code, but we have to single-thread it when softdep_flushfiles()
596	 * is in operation to get an accurate count of the number of items
597	 * related to its mount point that are in the list.
598	 */
599	if (matchmnt == NULL) {
600		if (softdep_worklist_busy < 0)
601			return(-1);
602		softdep_worklist_busy += 1;
603	}
604
605	/*
606	 * If requested, try removing inode or removal dependencies.
607	 */
608	if (req_clear_inodedeps) {
609		clear_inodedeps(td);
610		req_clear_inodedeps -= 1;
611		wakeup_one(&proc_waiting);
612	}
613	if (req_clear_remove) {
614		clear_remove(td);
615		req_clear_remove -= 1;
616		wakeup_one(&proc_waiting);
617	}
618	loopcount = 1;
619	starttime = time_second;
620	while (num_on_worklist > 0) {
621		if ((cnt = process_worklist_item(matchmnt, 0)) == -1)
622			break;
623		else
624			matchcnt += cnt;
625
626		/*
627		 * If a umount operation wants to run the worklist
628		 * accurately, abort.
629		 */
630		if (softdep_worklist_req && matchmnt == NULL) {
631			matchcnt = -1;
632			break;
633		}
634
635		/*
636		 * If requested, try removing inode or removal dependencies.
637		 */
638		if (req_clear_inodedeps) {
639			clear_inodedeps(td);
640			req_clear_inodedeps -= 1;
641			wakeup_one(&proc_waiting);
642		}
643		if (req_clear_remove) {
644			clear_remove(td);
645			req_clear_remove -= 1;
646			wakeup_one(&proc_waiting);
647		}
648		/*
649		 * We do not generally want to stop for buffer space, but if
650		 * we are really being a buffer hog, we will stop and wait.
651		 */
652		if (loopcount++ % 128 == 0)
653			bwillwrite();
654		/*
655		 * Never allow processing to run for more than one
656		 * second. Otherwise the other syncer tasks may get
657		 * excessively backlogged.
658		 */
659		if (starttime != time_second && matchmnt == NULL) {
660			matchcnt = -1;
661			break;
662		}
663	}
664	if (matchmnt == NULL) {
665		softdep_worklist_busy -= 1;
666		if (softdep_worklist_req && softdep_worklist_busy == 0)
667			wakeup(&softdep_worklist_req);
668	}
669	return (matchcnt);
670}
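/*
 * Illustrative sketch (not part of the original file): how the routine above
 * is expected to be driven.  The filesystem syncer (recorded in
 * filesys_syncer) is assumed to make a periodic pass with no mount point,
 * while flush and unmount paths such as softdep_flushworklist() below pass a
 * specific mount so that the return value counts only that filesystem's
 * items:
 *
 *	(void) softdep_process_worklist(NULL);		periodic background pass
 *	cnt = softdep_process_worklist(oldmnt);		count items for oldmnt
 */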
671
672/*
673 * Process one item on the worklist.
674 */
675static int
676process_worklist_item(matchmnt, flags)
677	struct mount *matchmnt;
678	int flags;
679{
680	struct worklist *wk;
681	struct mount *mp;
682	struct vnode *vp;
683	int matchcnt = 0;
684
685	ACQUIRE_LOCK(&lk);
686	/*
687	 * Normally we just process each item on the worklist in order.
688	 * However, if we are in a situation where we cannot lock any
689	 * inodes, we have to skip over any dirrem requests whose
690	 * vnodes are resident and locked.
691	 */
692	vp = NULL;
693	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
694		if (wk->wk_state & INPROGRESS)
695			continue;
696		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
697			break;
698		wk->wk_state |= INPROGRESS;
699		FREE_LOCK(&lk);
700		VFS_VGET(WK_DIRREM(wk)->dm_mnt, WK_DIRREM(wk)->dm_oldinum,
701		    LK_NOWAIT | LK_EXCLUSIVE, &vp);
702		ACQUIRE_LOCK(&lk);
703		wk->wk_state &= ~INPROGRESS;
704		if (vp != NULL)
705			break;
706	}
707	if (wk == 0) {
708		FREE_LOCK(&lk);
709		return (-1);
710	}
711	WORKLIST_REMOVE(wk);
712	num_on_worklist -= 1;
713	FREE_LOCK(&lk);
714	switch (wk->wk_type) {
715
716	case D_DIRREM:
717		/* removal of a directory entry */
718		mp = WK_DIRREM(wk)->dm_mnt;
719		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
720			panic("%s: dirrem on suspended filesystem",
721				"process_worklist_item");
722		if (mp == matchmnt)
723			matchcnt += 1;
724		handle_workitem_remove(WK_DIRREM(wk), vp);
725		break;
726
727	case D_FREEBLKS:
728		/* releasing blocks and/or fragments from a file */
729		mp = WK_FREEBLKS(wk)->fb_mnt;
730		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
731			panic("%s: freeblks on suspended filesystem",
732				"process_worklist_item");
733		if (mp == matchmnt)
734			matchcnt += 1;
735		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
736		break;
737
738	case D_FREEFRAG:
739		/* releasing a fragment when replaced as a file grows */
740		mp = WK_FREEFRAG(wk)->ff_mnt;
741		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
742			panic("%s: freefrag on suspended filesystem",
743				"process_worklist_item");
744		if (mp == matchmnt)
745			matchcnt += 1;
746		handle_workitem_freefrag(WK_FREEFRAG(wk));
747		break;
748
749	case D_FREEFILE:
750		/* releasing an inode when its link count drops to 0 */
751		mp = WK_FREEFILE(wk)->fx_mnt;
752		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
753			panic("%s: freefile on suspended filesystem",
754				"process_worklist_item");
755		if (mp == matchmnt)
756			matchcnt += 1;
757		handle_workitem_freefile(WK_FREEFILE(wk));
758		break;
759
760	default:
761		panic("%s_process_worklist: Unknown type %s",
762		    "softdep", TYPENAME(wk->wk_type));
763		/* NOTREACHED */
764	}
765	return (matchcnt);
766}
767
768/*
769 * Move dependencies from one buffer to another.
770 */
771static void
772softdep_move_dependencies(oldbp, newbp)
773	struct buf *oldbp;
774	struct buf *newbp;
775{
776	struct worklist *wk, *wktail;
777
778	if (LIST_FIRST(&newbp->b_dep) != NULL)
779		panic("softdep_move_dependencies: need merge code");
780	wktail = 0;
781	ACQUIRE_LOCK(&lk);
782	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
783		LIST_REMOVE(wk, wk_list);
784		if (wktail == 0)
785			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
786		else
787			LIST_INSERT_AFTER(wktail, wk, wk_list);
788		wktail = wk;
789	}
790	FREE_LOCK(&lk);
791}
792
793/*
794 * Purge the work list of all items associated with a particular mount point.
795 */
796int
797softdep_flushworklist(oldmnt, countp, td)
798	struct mount *oldmnt;
799	int *countp;
800	struct thread *td;
801{
802	struct vnode *devvp;
803	int count, error = 0;
804
805	/*
806	 * Await our turn to clear out the queue, then serialize access.
807	 */
808	while (softdep_worklist_busy) {
809		softdep_worklist_req += 1;
810		tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
811		softdep_worklist_req -= 1;
812	}
813	softdep_worklist_busy = -1;
814	/*
815	 * Alternately flush the block device associated with the mount
816	 * point and process any dependencies that the flushing
817	 * creates. We continue until no more worklist dependencies
818	 * are found.
819	 */
820	*countp = 0;
821	devvp = VFSTOUFS(oldmnt)->um_devvp;
822	while ((count = softdep_process_worklist(oldmnt)) > 0) {
823		*countp += count;
824		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
825		error = VOP_FSYNC(devvp, td->td_ucred, MNT_WAIT, td);
826		VOP_UNLOCK(devvp, 0, td);
827		if (error)
828			break;
829	}
830	softdep_worklist_busy = 0;
831	if (softdep_worklist_req)
832		wakeup(&softdep_worklist_req);
833	return (error);
834}
835
836/*
837 * Flush all vnodes and worklist items associated with a specified mount point.
838 */
839int
840softdep_flushfiles(oldmnt, flags, td)
841	struct mount *oldmnt;
842	int flags;
843	struct thread *td;
844{
845	int error, count, loopcnt;
846
847	error = 0;
848
849	/*
850	 * Alternately flush the vnodes associated with the mount
851	 * point and process any dependencies that the flushing
852	 * creates. In theory, this loop can happen at most twice,
853	 * but we give it a few extra just to be sure.
854	 */
855	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
856		/*
857		 * Do another flush in case any vnodes were brought in
858		 * as part of the cleanup operations.
859		 */
860		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
861			break;
862		if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
863		    count == 0)
864			break;
865	}
866	/*
867	 * If we are unmounting then it is an error to fail. If we
868	 * are simply trying to downgrade to read-only, then filesystem
869	 * activity can keep us busy forever, so we just fail with EBUSY.
870	 */
871	if (loopcnt == 0) {
872		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
873			panic("softdep_flushfiles: looping");
874		error = EBUSY;
875	}
876	return (error);
877}
878
879/*
880 * Structure hashing.
881 *
882 * There are three types of structures that can be looked up:
883 *	1) pagedep structures identified by mount point, inode number,
884 *	   and logical block.
885 *	2) inodedep structures identified by mount point and inode number.
886 *	3) newblk structures identified by mount point and
887 *	   physical block number.
888 *
889 * The "pagedep" and "inodedep" dependency structures are hashed
890 * separately from the file blocks and inodes to which they correspond.
891 * This separation helps when the in-memory copy of an inode or
892 * file block must be replaced. It also obviates the need to access
893 * an inode or file page when simply updating (or de-allocating)
894 * dependency structures. Lookup of newblk structures is needed to
895 * find newly allocated blocks when trying to associate them with
896 * their allocdirect or allocindir structure.
897 *
898 * The lookup routines optionally create and hash a new instance when
899 * an existing entry is not found.
900 */
901#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
902#define NODELAY		0x0002	/* cannot do background work */
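/*
 * Illustrative sketch (not part of the original file): the lookup-or-create
 * pattern used throughout this file.  With DEPALLOC set the lookup never
 * fails; for an inodedep a return value of 1 means an existing entry was
 * found and 0 means one was just created (and may need further setup):
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep) == 0)
 *		... first dependency for this inode ...
 *	FREE_LOCK(&lk);
 */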
903
904/*
905 * Structures and routines associated with pagedep caching.
906 */
907LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
908u_long	pagedep_hash;		/* size of hash table - 1 */
909#define	PAGEDEP_HASH(mp, inum, lbn) \
910	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
911	    pagedep_hash])
912static struct sema pagedep_in_progress;
913
914/*
915 * Look up a pagedep. Return 1 if found, 0 if not found or found
916 * when asked to allocate but not associated with any buffer.
917 * If not found, allocate if DEPALLOC flag is passed.
918 * Found or allocated entry is returned in pagedeppp.
919 * This routine must be called with splbio interrupts blocked.
920 */
921static int
922pagedep_lookup(ip, lbn, flags, pagedeppp)
923	struct inode *ip;
924	ufs_lbn_t lbn;
925	int flags;
926	struct pagedep **pagedeppp;
927{
928	struct pagedep *pagedep;
929	struct pagedep_hashhead *pagedephd;
930	struct mount *mp;
931	int i;
932
933#ifdef DEBUG
934	if (lk.lkt_held == NOHOLDER)
935		panic("pagedep_lookup: lock not held");
936#endif
937	mp = ITOV(ip)->v_mount;
938	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
939top:
940	LIST_FOREACH(pagedep, pagedephd, pd_hash)
941		if (ip->i_number == pagedep->pd_ino &&
942		    lbn == pagedep->pd_lbn &&
943		    mp == pagedep->pd_mnt)
944			break;
945	if (pagedep) {
946		*pagedeppp = pagedep;
947		if ((flags & DEPALLOC) != 0 &&
948		    (pagedep->pd_state & ONWORKLIST) == 0)
949			return (0);
950		return (1);
951	}
952	if ((flags & DEPALLOC) == 0) {
953		*pagedeppp = NULL;
954		return (0);
955	}
956	if (sema_get(&pagedep_in_progress, &lk) == 0) {
957		ACQUIRE_LOCK(&lk);
958		goto top;
959	}
960	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
961		M_SOFTDEP_FLAGS|M_ZERO);
962	pagedep->pd_list.wk_type = D_PAGEDEP;
963	pagedep->pd_mnt = mp;
964	pagedep->pd_ino = ip->i_number;
965	pagedep->pd_lbn = lbn;
966	LIST_INIT(&pagedep->pd_dirremhd);
967	LIST_INIT(&pagedep->pd_pendinghd);
968	for (i = 0; i < DAHASHSZ; i++)
969		LIST_INIT(&pagedep->pd_diraddhd[i]);
970	ACQUIRE_LOCK(&lk);
971	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
972	sema_release(&pagedep_in_progress);
973	*pagedeppp = pagedep;
974	return (0);
975}
976
977/*
978 * Structures and routines associated with inodedep caching.
979 */
980LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
981static u_long	inodedep_hash;	/* size of hash table - 1 */
982static long	num_inodedep;	/* number of inodedep allocated */
983#define	INODEDEP_HASH(fs, inum) \
984      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
985static struct sema inodedep_in_progress;
986
987/*
988 * Look up an inodedep. Return 1 if found, 0 if not found.
989 * If not found, allocate if DEPALLOC flag is passed.
990 * Found or allocated entry is returned in inodedeppp.
991 * This routine must be called with splbio interrupts blocked.
992 */
993static int
994inodedep_lookup(fs, inum, flags, inodedeppp)
995	struct fs *fs;
996	ino_t inum;
997	int flags;
998	struct inodedep **inodedeppp;
999{
1000	struct inodedep *inodedep;
1001	struct inodedep_hashhead *inodedephd;
1002	int firsttry;
1003
1004#ifdef DEBUG
1005	if (lk.lkt_held == NOHOLDER)
1006		panic("inodedep_lookup: lock not held");
1007#endif
1008	firsttry = 1;
1009	inodedephd = INODEDEP_HASH(fs, inum);
1010top:
1011	LIST_FOREACH(inodedep, inodedephd, id_hash)
1012		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1013			break;
1014	if (inodedep) {
1015		*inodedeppp = inodedep;
1016		return (1);
1017	}
1018	if ((flags & DEPALLOC) == 0) {
1019		*inodedeppp = NULL;
1020		return (0);
1021	}
1022	/*
1023	 * If we are over our limit, try to improve the situation.
1024	 */
1025	if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
1026	    request_cleanup(FLUSH_INODES, 1)) {
1027		firsttry = 0;
1028		goto top;
1029	}
1030	if (sema_get(&inodedep_in_progress, &lk) == 0) {
1031		ACQUIRE_LOCK(&lk);
1032		goto top;
1033	}
1034	num_inodedep += 1;
1035	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
1036		M_INODEDEP, M_SOFTDEP_FLAGS);
1037	inodedep->id_list.wk_type = D_INODEDEP;
1038	inodedep->id_fs = fs;
1039	inodedep->id_ino = inum;
1040	inodedep->id_state = ALLCOMPLETE;
1041	inodedep->id_nlinkdelta = 0;
1042	inodedep->id_savedino1 = NULL;
1043	inodedep->id_savedsize = -1;
1044	inodedep->id_buf = NULL;
1045	LIST_INIT(&inodedep->id_pendinghd);
1046	LIST_INIT(&inodedep->id_inowait);
1047	LIST_INIT(&inodedep->id_bufwait);
1048	TAILQ_INIT(&inodedep->id_inoupdt);
1049	TAILQ_INIT(&inodedep->id_newinoupdt);
1050	ACQUIRE_LOCK(&lk);
1051	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1052	sema_release(&inodedep_in_progress);
1053	*inodedeppp = inodedep;
1054	return (0);
1055}
1056
1057/*
1058 * Structures and routines associated with newblk caching.
1059 */
1060LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1061u_long	newblk_hash;		/* size of hash table - 1 */
1062#define	NEWBLK_HASH(fs, inum) \
1063	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1064static struct sema newblk_in_progress;
1065
1066/*
1067 * Look up a newblk. Return 1 if found, 0 if not found.
1068 * If not found, allocate if DEPALLOC flag is passed.
1069 * Found or allocated entry is returned in newblkpp.
1070 */
1071static int
1072newblk_lookup(fs, newblkno, flags, newblkpp)
1073	struct fs *fs;
1074	ufs2_daddr_t newblkno;
1075	int flags;
1076	struct newblk **newblkpp;
1077{
1078	struct newblk *newblk;
1079	struct newblk_hashhead *newblkhd;
1080
1081	newblkhd = NEWBLK_HASH(fs, newblkno);
1082top:
1083	LIST_FOREACH(newblk, newblkhd, nb_hash)
1084		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1085			break;
1086	if (newblk) {
1087		*newblkpp = newblk;
1088		return (1);
1089	}
1090	if ((flags & DEPALLOC) == 0) {
1091		*newblkpp = NULL;
1092		return (0);
1093	}
1094	if (sema_get(&newblk_in_progress, 0) == 0)
1095		goto top;
1096	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1097		M_NEWBLK, M_SOFTDEP_FLAGS);
1098	newblk->nb_state = 0;
1099	newblk->nb_fs = fs;
1100	newblk->nb_newblkno = newblkno;
1101	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1102	sema_release(&newblk_in_progress);
1103	*newblkpp = newblk;
1104	return (0);
1105}
1106
1107/*
1108 * Executed during filesystem initialization before
1109 * mounting any filesystems.
1110 */
1111void
1112softdep_initialize()
1113{
1114
1115	LIST_INIT(&mkdirlisthd);
1116	LIST_INIT(&softdep_workitem_pending);
1117	max_softdeps = desiredvnodes * 8;
1118	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1119	    &pagedep_hash);
1120	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1121	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1122	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1123	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1124	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1125
1126	/* initialise bioops hack */
1127	bioops.io_start = softdep_disk_io_initiation;
1128	bioops.io_complete = softdep_disk_write_complete;
1129	bioops.io_deallocate = softdep_deallocate_dependencies;
1130	bioops.io_movedeps = softdep_move_dependencies;
1131	bioops.io_countdeps = softdep_count_dependencies;
1132}
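/*
 * Illustrative sketch (assumption, not taken from this file): the buffer
 * cache is expected to dispatch through the bioops hooks installed above
 * via wrappers in <sys/buf.h>, believed to look roughly like
 *
 *	#define buf_start(bp) do {			\
 *		if (bioops.io_start)			\
 *			(*bioops.io_start)(bp);		\
 *	} while (0)
 *
 * so that a kernel running without soft updates pays only a pointer test.
 * The wrapper names and shape are from memory and may differ; only the
 * assignments above are authoritative.
 */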
1133
1134/*
1135 * Called at mount time to notify the dependency code that a
1136 * filesystem wishes to use it.
1137 */
1138int
1139softdep_mount(devvp, mp, fs, cred)
1140	struct vnode *devvp;
1141	struct mount *mp;
1142	struct fs *fs;
1143	struct ucred *cred;
1144{
1145	struct csum_total cstotal;
1146	struct cg *cgp;
1147	struct buf *bp;
1148	int error, cyl;
1149
1150	mp->mnt_flag &= ~MNT_ASYNC;
1151	mp->mnt_flag |= MNT_SOFTDEP;
1152	/*
1153	 * When doing soft updates, the counters in the
1154	 * superblock may have gotten out of sync, so we have
1155	 * to scan the cylinder groups and recalculate them.
1156	 */
1157	if (fs->fs_clean != 0)
1158		return (0);
1159	bzero(&cstotal, sizeof cstotal);
1160	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1161		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1162		    fs->fs_cgsize, cred, &bp)) != 0) {
1163			brelse(bp);
1164			return (error);
1165		}
1166		cgp = (struct cg *)bp->b_data;
1167		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1168		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1169		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1170		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1171		fs->fs_cs(fs, cyl) = cgp->cg_cs;
1172		brelse(bp);
1173	}
1174#ifdef DEBUG
1175	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1176		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1177#endif
1178	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1179	return (0);
1180}
1181
1182/*
1183 * Protecting the freemaps (or bitmaps).
1184 *
1185 * To eliminate the need to execute fsck before mounting a filesystem
1186 * after a power failure, one must (conservatively) guarantee that the
1187 * on-disk copy of the bitmaps never indicate that a live inode or block is
1188 * free.  So, when a block or inode is allocated, the bitmap should be
1189 * updated (on disk) before any new pointers.  When a block or inode is
1190 * freed, the bitmap should not be updated until all pointers have been
1191 * reset.  The latter dependency is handled by the delayed de-allocation
1192 * approach described below for block and inode de-allocation.  The former
1193 * dependency is handled by calling the following procedure when a block or
1194 * inode is allocated. When an inode is allocated an "inodedep" is created
1195 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1196 * Each "inodedep" is also inserted into the hash indexing structure so
1197 * that any additional link additions can be made dependent on the inode
1198 * allocation.
1199 *
1200 * The ufs filesystem maintains a number of free block counts (e.g., per
1201 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1202 * in addition to the bitmaps.  These counts are used to improve efficiency
1203 * during allocation and therefore must be consistent with the bitmaps.
1204 * There is no convenient way to guarantee post-crash consistency of these
1205 * counts with simple update ordering, for two main reasons: (1) The counts
1206 * and bitmaps for a single cylinder group block are not in the same disk
1207 * sector.  If a disk write is interrupted (e.g., by power failure), one may
1208 * be written and the other not.  (2) Some of the counts are located in the
1209 * superblock rather than the cylinder group block. So, we focus our soft
1210 * updates implementation on protecting the bitmaps. When mounting a
1211 * filesystem, we recompute the auxiliary counts from the bitmaps.
1212 */
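/*
 * Illustrative sketch (assumption, not taken from this file): the ordering
 * that the two routines below capture for an allocation.  The block
 * allocator in ffs_alloc.c is assumed to call softdep_setup_blkmapdep()
 * right after marking the block busy in the cylinder group buffer and
 * before its caller stores the new block number anywhere:
 *
 *	... mark block allocated in the cg map held in bp ...
 *	softdep_setup_blkmapdep(bp, fs, newblkno);
 *	bdwrite(bp);
 *
 * so that any pointer naming newblkno becomes dependent on the bitmap
 * write.  The inode-map case, softdep_setup_inomapdep(), is symmetric.
 */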
1213
1214/*
1215 * Called just after updating the cylinder group block to allocate an inode.
1216 */
1217void
1218softdep_setup_inomapdep(bp, ip, newinum)
1219	struct buf *bp;		/* buffer for cylgroup block with inode map */
1220	struct inode *ip;	/* inode related to allocation */
1221	ino_t newinum;		/* new inode number being allocated */
1222{
1223	struct inodedep *inodedep;
1224	struct bmsafemap *bmsafemap;
1225
1226	/*
1227	 * Create a dependency for the newly allocated inode.
1228	 * Panic if it already exists as something is seriously wrong.
1229	 * Otherwise add it to the dependency list for the buffer holding
1230	 * the cylinder group map from which it was allocated.
1231	 */
1232	ACQUIRE_LOCK(&lk);
1233	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
1234		FREE_LOCK(&lk);
1235		panic("softdep_setup_inomapdep: found inode");
1236	}
1237	inodedep->id_buf = bp;
1238	inodedep->id_state &= ~DEPCOMPLETE;
1239	bmsafemap = bmsafemap_lookup(bp);
1240	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1241	FREE_LOCK(&lk);
1242}
1243
1244/*
1245 * Called just after updating the cylinder group block to
1246 * allocate block or fragment.
1247 */
1248void
1249softdep_setup_blkmapdep(bp, fs, newblkno)
1250	struct buf *bp;		/* buffer for cylgroup block with block map */
1251	struct fs *fs;		/* filesystem doing allocation */
1252	ufs2_daddr_t newblkno;	/* number of newly allocated block */
1253{
1254	struct newblk *newblk;
1255	struct bmsafemap *bmsafemap;
1256
1257	/*
1258	 * Create a dependency for the newly allocated block.
1259	 * Add it to the dependency list for the buffer holding
1260	 * the cylinder group map from which it was allocated.
1261	 */
1262	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1263		panic("softdep_setup_blkmapdep: found block");
1264	ACQUIRE_LOCK(&lk);
1265	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1266	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1267	FREE_LOCK(&lk);
1268}
1269
1270/*
1271 * Find the bmsafemap associated with a cylinder group buffer.
1272 * If none exists, create one. The buffer must be locked when
1273 * this routine is called and this routine must be called with
1274 * splbio interrupts blocked.
1275 */
1276static struct bmsafemap *
1277bmsafemap_lookup(bp)
1278	struct buf *bp;
1279{
1280	struct bmsafemap *bmsafemap;
1281	struct worklist *wk;
1282
1283#ifdef DEBUG
1284	if (lk.lkt_held == NOHOLDER)
1285		panic("bmsafemap_lookup: lock not held");
1286#endif
1287	LIST_FOREACH(wk, &bp->b_dep, wk_list)
1288		if (wk->wk_type == D_BMSAFEMAP)
1289			return (WK_BMSAFEMAP(wk));
1290	FREE_LOCK(&lk);
1291	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1292		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1293	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1294	bmsafemap->sm_list.wk_state = 0;
1295	bmsafemap->sm_buf = bp;
1296	LIST_INIT(&bmsafemap->sm_allocdirecthd);
1297	LIST_INIT(&bmsafemap->sm_allocindirhd);
1298	LIST_INIT(&bmsafemap->sm_inodedephd);
1299	LIST_INIT(&bmsafemap->sm_newblkhd);
1300	ACQUIRE_LOCK(&lk);
1301	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1302	return (bmsafemap);
1303}
1304
1305/*
1306 * Direct block allocation dependencies.
1307 *
1308 * When a new block is allocated, the corresponding disk locations must be
1309 * initialized (with zeros or new data) before the on-disk inode points to
1310 * them.  Also, the freemap from which the block was allocated must be
1311 * updated (on disk) before the inode's pointer. These two dependencies are
1312 * independent of each other and are needed for all file blocks and indirect
1313 * blocks that are pointed to directly by the inode.  Just before the
1314 * "in-core" version of the inode is updated with a newly allocated block
1315 * number, a procedure (below) is called to setup allocation dependency
1316 * structures.  These structures are removed when the corresponding
1317 * dependencies are satisfied or when the block allocation becomes obsolete
1318 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1319 * fragment that gets upgraded).  All of these cases are handled in
1320 * procedures described later.
1321 *
1322 * When a file extension causes a fragment to be upgraded, either to a larger
1323 * fragment or to a full block, the on-disk location may change (if the
1324 * previous fragment could not simply be extended). In this case, the old
1325 * fragment must be de-allocated, but not until after the inode's pointer has
1326 * been updated. In most cases, this is handled by later procedures, which
1327 * will construct a "freefrag" structure to be added to the workitem queue
1328 * when the inode update is complete (or obsolete).  The main exception to
1329 * this is when an allocation occurs while a pending allocation dependency
1330 * (for the same block pointer) remains.  This case is handled in the main
1331 * allocation dependency setup procedure by immediately freeing the
1332 * unreferenced fragments.
1333 */
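/*
 * Illustrative sketch (assumption, not taken from this file): the expected
 * caller pattern in the block allocator (ffs_balloc), where the dependency
 * is set up just before the in-core inode is given the new block number, as
 * described above.  Variable names here are hypothetical:
 *
 *	error = ffs_alloc(ip, lbn, pref, nsize, cred, &newb);
 *	...
 *	softdep_setup_allocdirect(ip, lbn, newb, oldb, nsize, osize, bp);
 *	... store newb in the inode's direct block pointer for lbn ...
 *	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 */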
1334void
1335softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1336	struct inode *ip;	/* inode to which block is being added */
1337	ufs_lbn_t lbn;		/* block pointer within inode */
1338	ufs2_daddr_t newblkno;	/* disk block number being added */
1339	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
1340	long newsize;		/* size of new block */
1341	long oldsize;		/* size of old block */
1342	struct buf *bp;		/* bp for allocated block */
1343{
1344	struct allocdirect *adp, *oldadp;
1345	struct allocdirectlst *adphead;
1346	struct bmsafemap *bmsafemap;
1347	struct inodedep *inodedep;
1348	struct pagedep *pagedep;
1349	struct newblk *newblk;
1350
1351	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1352		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1353	adp->ad_list.wk_type = D_ALLOCDIRECT;
1354	adp->ad_lbn = lbn;
1355	adp->ad_newblkno = newblkno;
1356	adp->ad_oldblkno = oldblkno;
1357	adp->ad_newsize = newsize;
1358	adp->ad_oldsize = oldsize;
1359	adp->ad_state = ATTACHED;
1360	LIST_INIT(&adp->ad_newdirblk);
1361	if (newblkno == oldblkno)
1362		adp->ad_freefrag = NULL;
1363	else
1364		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1365
1366	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1367		panic("softdep_setup_allocdirect: lost block");
1368
1369	ACQUIRE_LOCK(&lk);
1370	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1371	adp->ad_inodedep = inodedep;
1372
1373	if (newblk->nb_state == DEPCOMPLETE) {
1374		adp->ad_state |= DEPCOMPLETE;
1375		adp->ad_buf = NULL;
1376	} else {
1377		bmsafemap = newblk->nb_bmsafemap;
1378		adp->ad_buf = bmsafemap->sm_buf;
1379		LIST_REMOVE(newblk, nb_deps);
1380		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1381	}
1382	LIST_REMOVE(newblk, nb_hash);
1383	FREE(newblk, M_NEWBLK);
1384
1385	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1386	if (lbn >= NDADDR) {
1387		/* allocating an indirect block */
1388		if (oldblkno != 0) {
1389			FREE_LOCK(&lk);
1390			panic("softdep_setup_allocdirect: non-zero indir");
1391		}
1392	} else {
1393		/*
1394		 * Allocating a direct block.
1395		 *
1396		 * If we are allocating a directory block, then we must
1397		 * allocate an associated pagedep to track additions and
1398		 * deletions.
1399		 */
1400		if ((ip->i_mode & IFMT) == IFDIR &&
1401		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1402			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1403	}
1404	/*
1405	 * The list of allocdirects must be kept in sorted and ascending
1406	 * order so that the rollback routines can quickly determine the
1407	 * first uncommitted block (the size of the file stored on disk
1408	 * ends at the end of the lowest committed fragment, or if there
1409	 * are no fragments, at the end of the highest committed block).
1410	 * Since files generally grow, the typical case is that the new
1411	 * block is to be added at the end of the list. We speed this
1412	 * special case by checking against the last allocdirect in the
1413	 * list before laboriously traversing the list looking for the
1414	 * insertion point.
1415	 */
1416	adphead = &inodedep->id_newinoupdt;
1417	oldadp = TAILQ_LAST(adphead, allocdirectlst);
1418	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1419		/* insert at end of list */
1420		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1421		if (oldadp != NULL && oldadp->ad_lbn == lbn)
1422			allocdirect_merge(adphead, adp, oldadp);
1423		FREE_LOCK(&lk);
1424		return;
1425	}
1426	TAILQ_FOREACH(oldadp, adphead, ad_next) {
1427		if (oldadp->ad_lbn >= lbn)
1428			break;
1429	}
1430	if (oldadp == NULL) {
1431		FREE_LOCK(&lk);
1432		panic("softdep_setup_allocdirect: lost entry");
1433	}
1434	/* insert in middle of list */
1435	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1436	if (oldadp->ad_lbn == lbn)
1437		allocdirect_merge(adphead, adp, oldadp);
1438	FREE_LOCK(&lk);
1439}
1440
1441/*
1442 * Replace an old allocdirect dependency with a newer one.
1443 * This routine must be called with splbio interrupts blocked.
1444 */
1445static void
1446allocdirect_merge(adphead, newadp, oldadp)
1447	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
1448	struct allocdirect *newadp;	/* allocdirect being added */
1449	struct allocdirect *oldadp;	/* existing allocdirect being checked */
1450{
1451	struct worklist *wk;
1452	struct freefrag *freefrag;
1453	struct newdirblk *newdirblk;
1454
1455#ifdef DEBUG
1456	if (lk.lkt_held == NOHOLDER)
1457		panic("allocdirect_merge: lock not held");
1458#endif
1459	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1460	    newadp->ad_oldsize != oldadp->ad_newsize ||
1461	    newadp->ad_lbn >= NDADDR) {
1462		FREE_LOCK(&lk);
1463		panic("%s %jd != new %jd || old size %ld != new %ld",
1464		    "allocdirect_merge: old blkno",
1465		    (intmax_t)newadp->ad_oldblkno,
1466		    (intmax_t)oldadp->ad_newblkno,
1467		    newadp->ad_oldsize, oldadp->ad_newsize);
1468	}
1469	newadp->ad_oldblkno = oldadp->ad_oldblkno;
1470	newadp->ad_oldsize = oldadp->ad_oldsize;
1471	/*
1472	 * If the old dependency had a fragment to free or had never
1473	 * previously had a block allocated, then the new dependency
1474	 * can immediately post its freefrag and adopt the old freefrag.
1475	 * This action is done by swapping the freefrag dependencies.
1476	 * The new dependency gains the old one's freefrag, and the
1477	 * old one gets the new one and then immediately puts it on
1478	 * the worklist when it is freed by free_allocdirect. It is
1479	 * not possible to do this swap when the old dependency had a
1480	 * non-zero size but no previous fragment to free. This condition
1481	 * arises when the new block is an extension of the old block.
1482	 * Here, the first part of the fragment allocated to the new
1483	 * dependency is part of the block currently claimed on disk by
1484	 * the old dependency, so cannot legitimately be freed until the
1485	 * conditions for the new dependency are fulfilled.
1486	 */
1487	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1488		freefrag = newadp->ad_freefrag;
1489		newadp->ad_freefrag = oldadp->ad_freefrag;
1490		oldadp->ad_freefrag = freefrag;
1491	}
1492	/*
1493	 * If we are tracking a new directory-block allocation,
1494	 * move it from the old allocdirect to the new allocdirect.
1495	 */
1496	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1497		newdirblk = WK_NEWDIRBLK(wk);
1498		WORKLIST_REMOVE(&newdirblk->db_list);
1499		if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
1500			panic("allocdirect_merge: extra newdirblk");
1501		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1502	}
1503	free_allocdirect(adphead, oldadp, 0);
1504}
1505
1506/*
1507 * Allocate a new freefrag structure if needed.
1508 */
1509static struct freefrag *
1510newfreefrag(ip, blkno, size)
1511	struct inode *ip;
1512	ufs2_daddr_t blkno;
1513	long size;
1514{
1515	struct freefrag *freefrag;
1516	struct fs *fs;
1517
1518	if (blkno == 0)
1519		return (NULL);
1520	fs = ip->i_fs;
1521	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1522		panic("newfreefrag: frag size");
1523	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1524		M_FREEFRAG, M_SOFTDEP_FLAGS);
1525	freefrag->ff_list.wk_type = D_FREEFRAG;
1526	freefrag->ff_state = 0;
1527	freefrag->ff_inum = ip->i_number;
1528	freefrag->ff_mnt = ITOV(ip)->v_mount;
1529	freefrag->ff_blkno = blkno;
1530	freefrag->ff_fragsize = size;
1531	return (freefrag);
1532}
1533
1534/*
1535 * This workitem de-allocates fragments that were replaced during
1536 * file block allocation.
1537 */
1538static void
1539handle_workitem_freefrag(freefrag)
1540	struct freefrag *freefrag;
1541{
1542	struct ufsmount *ump = VFSTOUFS(freefrag->ff_mnt);
1543
1544	ffs_blkfree(ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
1545	    freefrag->ff_fragsize, freefrag->ff_inum);
1546	FREE(freefrag, M_FREEFRAG);
1547}
1548
1549/*
1550 * Indirect block allocation dependencies.
1551 *
1552 * The same dependencies that exist for a direct block also exist when
1553 * a new block is allocated and pointed to by an entry in a block of
1554 * indirect pointers. The undo/redo states described above are also
1555 * used here. Because an indirect block contains many pointers that
1556 * may have dependencies, a second copy of the entire in-memory indirect
1557 * block is kept. The buffer cache copy is always completely up-to-date.
1558 * The second copy, which is used only as a source for disk writes,
1559 * contains only the safe pointers (i.e., those that have no remaining
1560 * update dependencies). The second copy is freed when all pointers
1561 * are safe. The cache is not allowed to replace indirect blocks with
1562 * pending update dependencies. If a buffer containing an indirect
1563 * block with dependencies is written, these routines will mark it
1564 * dirty again. It can only be successfully written once all the
1565 * dependencies are removed. The ffs_fsync routine in conjunction with
1566 * softdep_sync_metadata work together to get all the dependencies
1567 * removed so that a file can be successfully written to disk. Three
1568 * procedures are used when setting up indirect block pointer
1569 * dependencies. The division is necessary because of the organization
1570 * of the "balloc" routine and because of the distinction between file
1571 * pages and file metadata blocks.
1572 */
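/*
 * Illustrative sketch (assumption, not taken from this file): how the entry
 * points below divide the work.  When a file data page is hung off an
 * indirect block, softdep_setup_allocindir_page() is called before the
 * pointer is written into the in-core indirect block; when the new block is
 * itself an indirect block, softdep_setup_allocindir_meta() is used instead:
 *
 *	softdep_setup_allocindir_page(ip, lbn, ibp, ptrno, newb, oldb, nbp);
 *	bap[ptrno] = newb;
 *
 * (ibp is the buffer holding the indirect block, nbp the newly allocated
 * page, bap its in-core pointer array; all names here are hypothetical.)
 * Both entry points finish in setup_allocindir_phase2() below, which
 * attaches the allocindir to the indirect block's indirdep and records the
 * old pointer in the saved "safe copy" of the block.
 */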
1573
1574/*
1575 * Allocate a new allocindir structure.
1576 */
1577static struct allocindir *
1578newallocindir(ip, ptrno, newblkno, oldblkno)
1579	struct inode *ip;	/* inode for file being extended */
1580	int ptrno;		/* offset of pointer in indirect block */
1581	ufs2_daddr_t newblkno;	/* disk block number being added */
1582	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
1583{
1584	struct allocindir *aip;
1585
1586	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1587		M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1588	aip->ai_list.wk_type = D_ALLOCINDIR;
1589	aip->ai_state = ATTACHED;
1590	aip->ai_offset = ptrno;
1591	aip->ai_newblkno = newblkno;
1592	aip->ai_oldblkno = oldblkno;
1593	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1594	return (aip);
1595}
1596
1597/*
1598 * Called just before setting an indirect block pointer
1599 * to a newly allocated file page.
1600 */
1601void
1602softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1603	struct inode *ip;	/* inode for file being extended */
1604	ufs_lbn_t lbn;		/* allocated block number within file */
1605	struct buf *bp;		/* buffer with indirect blk referencing page */
1606	int ptrno;		/* offset of pointer in indirect block */
1607	ufs2_daddr_t newblkno;	/* disk block number being added */
1608	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
1609	struct buf *nbp;	/* buffer holding allocated page */
1610{
1611	struct allocindir *aip;
1612	struct pagedep *pagedep;
1613
1614	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1615	ACQUIRE_LOCK(&lk);
1616	/*
1617	 * If we are allocating a directory page, then we must
1618	 * allocate an associated pagedep to track additions and
1619	 * deletions.
1620	 */
1621	if ((ip->i_mode & IFMT) == IFDIR &&
1622	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1623		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1624	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1625	FREE_LOCK(&lk);
1626	setup_allocindir_phase2(bp, ip, aip);
1627}
1628
1629/*
1630 * Called just before setting an indirect block pointer to a
1631 * newly allocated indirect block.
1632 */
1633void
1634softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1635	struct buf *nbp;	/* newly allocated indirect block */
1636	struct inode *ip;	/* inode for file being extended */
1637	struct buf *bp;		/* indirect block referencing allocated block */
1638	int ptrno;		/* offset of pointer in indirect block */
1639	ufs2_daddr_t newblkno;	/* disk block number being added */
1640{
1641	struct allocindir *aip;
1642
1643	aip = newallocindir(ip, ptrno, newblkno, 0);
1644	ACQUIRE_LOCK(&lk);
1645	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1646	FREE_LOCK(&lk);
1647	setup_allocindir_phase2(bp, ip, aip);
1648}
1649
1650/*
1651 * Called to finish the allocation of the "aip" allocated
1652 * by one of the two routines above.
1653 */
1654static void
1655setup_allocindir_phase2(bp, ip, aip)
1656	struct buf *bp;		/* in-memory copy of the indirect block */
1657	struct inode *ip;	/* inode for file being extended */
1658	struct allocindir *aip;	/* allocindir allocated by the above routines */
1659{
1660	struct worklist *wk;
1661	struct indirdep *indirdep, *newindirdep;
1662	struct bmsafemap *bmsafemap;
1663	struct allocindir *oldaip;
1664	struct freefrag *freefrag;
1665	struct newblk *newblk;
1666	ufs2_daddr_t blkno;
1667
1668	if (bp->b_lblkno >= 0)
1669		panic("setup_allocindir_phase2: not indir blk");
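	/*
	 * Look for an existing indirdep attached to the buffer. If none
	 * is found, allocate one (which requires dropping the lock) and
	 * retry the lookup, as a dependency may have been attached while
	 * the lock was released.
	 */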
1670	for (indirdep = NULL, newindirdep = NULL; ; ) {
1671		ACQUIRE_LOCK(&lk);
1672		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1673			if (wk->wk_type != D_INDIRDEP)
1674				continue;
1675			indirdep = WK_INDIRDEP(wk);
1676			break;
1677		}
1678		if (indirdep == NULL && newindirdep) {
1679			indirdep = newindirdep;
1680			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1681			newindirdep = NULL;
1682		}
1683		FREE_LOCK(&lk);
1684		if (indirdep) {
1685			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1686			    &newblk) == 0)
1687				panic("setup_allocindir: lost block");
1688			ACQUIRE_LOCK(&lk);
1689			if (newblk->nb_state == DEPCOMPLETE) {
1690				aip->ai_state |= DEPCOMPLETE;
1691				aip->ai_buf = NULL;
1692			} else {
1693				bmsafemap = newblk->nb_bmsafemap;
1694				aip->ai_buf = bmsafemap->sm_buf;
1695				LIST_REMOVE(newblk, nb_deps);
1696				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1697				    aip, ai_deps);
1698			}
1699			LIST_REMOVE(newblk, nb_hash);
1700			FREE(newblk, M_NEWBLK);
1701			aip->ai_indirdep = indirdep;
1702			/*
1703			 * Check to see if there is an existing dependency
1704			 * for this block. If there is, merge the old
1705			 * dependency into the new one.
1706			 */
1707			if (aip->ai_oldblkno == 0)
1708				oldaip = NULL;
1709			else
1710
1711				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1712					if (oldaip->ai_offset == aip->ai_offset)
1713						break;
1714			freefrag = NULL;
1715			if (oldaip != NULL) {
1716				if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1717					FREE_LOCK(&lk);
1718					panic("setup_allocindir_phase2: blkno");
1719				}
1720				aip->ai_oldblkno = oldaip->ai_oldblkno;
1721				freefrag = aip->ai_freefrag;
1722				aip->ai_freefrag = oldaip->ai_freefrag;
1723				oldaip->ai_freefrag = NULL;
1724				free_allocindir(oldaip, NULL);
1725			}
1726			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1727			if (ip->i_ump->um_fstype == UFS1)
1728				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
1729				    [aip->ai_offset] = aip->ai_oldblkno;
1730			else
1731				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
1732				    [aip->ai_offset] = aip->ai_oldblkno;
1733			FREE_LOCK(&lk);
1734			if (freefrag != NULL)
1735				handle_workitem_freefrag(freefrag);
1736		}
1737		if (newindirdep) {
1738			if (indirdep->ir_savebp != NULL)
1739				brelse(newindirdep->ir_savebp);
1740			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1741		}
1742		if (indirdep)
1743			break;
1744		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1745			M_INDIRDEP, M_SOFTDEP_FLAGS);
1746		newindirdep->ir_list.wk_type = D_INDIRDEP;
1747		newindirdep->ir_state = ATTACHED;
1748		if (ip->i_ump->um_fstype == UFS1)
1749			newindirdep->ir_state |= UFS1FMT;
1750		LIST_INIT(&newindirdep->ir_deplisthd);
1751		LIST_INIT(&newindirdep->ir_donehd);
1752		if (bp->b_blkno == bp->b_lblkno) {
1753			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, NULL, NULL);
1754			bp->b_blkno = blkno;
1755		}
1756		newindirdep->ir_savebp =
1757		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1758		BUF_KERNPROC(newindirdep->ir_savebp);
1759		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1760	}
1761}
1762
1763/*
1764 * Block de-allocation dependencies.
1765 *
1766 * When blocks are de-allocated, the on-disk pointers must be nullified before
1767 * the blocks are made available for use by other files.  (The true
1768 * requirement is that old pointers must be nullified before new on-disk
1769 * pointers are set.  We chose this slightly more stringent requirement to
1770 * reduce complexity.) Our implementation handles this dependency by updating
1771 * the inode (or indirect block) appropriately but delaying the actual block
1772 * de-allocation (i.e., freemap and free space count manipulation) until
1773 * after the updated versions reach stable storage.  After the disk is
1774 * updated, the blocks can be safely de-allocated whenever it is convenient.
1775 * This implementation handles only the common case of reducing a file's
1776 * length to zero. Other cases are handled by the conventional synchronous
1777 * write approach.
1778 *
1779 * The ffs implementation with which we worked double-checks
1780 * the state of the block pointers and file size as it reduces
1781 * a file's length.  Some of this code is replicated here in our
1782 * soft updates implementation.  The freeblks->fb_chkcnt field is
1783 * used to transfer a part of this information to the procedure
1784 * that eventually de-allocates the blocks.
1785 *
1786 * This routine should be called from the routine that shortens
1787 * a file's length, before the inode's size or block pointers
1788 * are modified. It will save the block pointer information for
1789 * later release and zero the inode so that the calling routine
1790 * can release it.
1791 */
1792void
1793softdep_setup_freeblocks(ip, length)
1794	struct inode *ip;	/* The inode whose length is to be reduced */
1795	off_t length;		/* The new length for the file */
1796{
1797	struct freeblks *freeblks;
1798	struct inodedep *inodedep;
1799	struct allocdirect *adp;
1800	struct vnode *vp;
1801	struct buf *bp;
1802	struct fs *fs;
1803	int i, delay, error;
1804
1805	fs = ip->i_fs;
1806	if (length != 0)
1807		panic("softdep_setup_freeblocks: non-zero length");
1808	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1809		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
1810	freeblks->fb_list.wk_type = D_FREEBLKS;
1811	freeblks->fb_uid = ip->i_uid;
1812	freeblks->fb_previousinum = ip->i_number;
1813	freeblks->fb_devvp = ip->i_devvp;
1814	freeblks->fb_mnt = ITOV(ip)->v_mount;
1815	freeblks->fb_oldsize = ip->i_size;
1816	freeblks->fb_newsize = length;
1817	freeblks->fb_chkcnt = DIP(ip, i_blocks);
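	/*
	 * Save the direct and indirect block pointers, then clear them
	 * in the inode so that the caller sees a fully truncated file.
	 */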
1818	for (i = 0; i < NDADDR; i++) {
1819		freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
1820		DIP(ip, i_db[i]) = 0;
1821	}
1822	for (i = 0; i < NIADDR; i++) {
1823		freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
1824		DIP(ip, i_ib[i]) = 0;
1825	}
1826	DIP(ip, i_blocks) = 0;
1827	ip->i_size = 0;
1828	DIP(ip, i_size) = 0;
1829	/*
1830	 * If the file was removed, then the space being freed was
1831	 * accounted for then (see softdep_releasefile()). If the
1832	 * file is merely being truncated, then we account for it now.
1833	 */
1834	if ((ip->i_flag & IN_SPACECOUNTED) == 0)
1835		fs->fs_pendingblocks += freeblks->fb_chkcnt;
1836	/*
1837	 * Push the zero'ed inode to its disk buffer so that we are free
1838	 * to delete its dependencies below. Once the dependencies are gone
1839	 * the buffer can be safely released.
1840	 */
1841	if ((error = bread(ip->i_devvp,
1842	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1843	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
1844		brelse(bp);
1845		softdep_error("softdep_setup_freeblocks", error);
1846	}
1847	if (ip->i_ump->um_fstype == UFS1)
1848		*((struct ufs1_dinode *)bp->b_data +
1849		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
1850	else
1851		*((struct ufs2_dinode *)bp->b_data +
1852		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
1853	/*
1854	 * Find and eliminate any inode dependencies.
1855	 */
1856	ACQUIRE_LOCK(&lk);
1857	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1858	if ((inodedep->id_state & IOSTARTED) != 0) {
1859		FREE_LOCK(&lk);
1860		panic("softdep_setup_freeblocks: inode busy");
1861	}
1862	/*
1863	 * Add the freeblks structure to the list of operations that
1864	 * must await the zero'ed inode being written to disk. If we
1865	 * still have a bitmap dependency (delay == 0), then the inode
1866	 * has never been written to disk, so we can process the
1867	 * freeblks below once we have deleted the dependencies.
1868	 */
1869	delay = (inodedep->id_state & DEPCOMPLETE);
1870	if (delay)
1871		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1872	/*
1873	 * Because the file length has been truncated to zero, any
1874	 * pending block allocation dependency structures associated
1875	 * with this inode are obsolete and can simply be de-allocated.
1876	 * We must first merge the two dependency lists to get rid of
1877	 * any duplicate freefrag structures, then purge the merged list.
1878	 * If we still have a bitmap dependency, then the inode has never
1879	 * been written to disk, so we can free any fragments without delay.
1880	 */
1881	merge_inode_lists(inodedep);
1882	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
1883		free_allocdirect(&inodedep->id_inoupdt, adp, delay);
1884	FREE_LOCK(&lk);
1885	bdwrite(bp);
1886	/*
1887	 * We must wait for any I/O in progress to finish so that
1888	 * all potential buffers on the dirty list will be visible.
1889	 * Once they are all there, walk the list and get rid of
1890	 * any dependencies.
1891	 */
1892	vp = ITOV(ip);
1893	ACQUIRE_LOCK(&lk);
1894	drain_output(vp, 1);
1895	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
1896		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
1897		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
1898		deallocate_dependencies(bp, inodedep);
1899		bp->b_flags |= B_INVAL | B_NOCACHE;
1900		FREE_LOCK(&lk);
1901		brelse(bp);
1902		ACQUIRE_LOCK(&lk);
1903	}
1904	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
1905		(void) free_inodedep(inodedep);
1906	FREE_LOCK(&lk);
1907	/*
1908	 * If the inode has never been written to disk (delay == 0),
1909	 * then we can process the freeblks now that we have deleted
1910	 * the dependencies.
1911	 */
1912	if (!delay)
1913		handle_workitem_freeblocks(freeblks, 0);
1914}
1915
1916/*
1917 * Reclaim any dependency structures from a buffer that is about to
1918 * be reallocated to a new vnode. The buffer must be locked, thus,
1919 * no I/O completion operations can occur while we are manipulating
1920 * its associated dependencies. The mutex is held so that other I/O's
1921 * associated with related dependencies do not occur.
1922 */
1923static void
1924deallocate_dependencies(bp, inodedep)
1925	struct buf *bp;
1926	struct inodedep *inodedep;
1927{
1928	struct worklist *wk;
1929	struct indirdep *indirdep;
1930	struct allocindir *aip;
1931	struct pagedep *pagedep;
1932	struct dirrem *dirrem;
1933	struct diradd *dap;
1934	int i;
1935
1936	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1937		switch (wk->wk_type) {
1938
1939		case D_INDIRDEP:
1940			indirdep = WK_INDIRDEP(wk);
1941			/*
1942			 * None of the indirect pointers will ever be visible,
1943			 * so they can simply be tossed. GOINGAWAY ensures
1944			 * that allocated pointers will be saved in the buffer
1945			 * cache until they are freed. Note that they will
1946			 * only be able to be found by their physical address
1947			 * since the inode mapping the logical address will
1948			 * be gone. The save buffer used for the safe copy
1949			 * was allocated in setup_allocindir_phase2 using
1950			 * the physical address so it could be used for this
1951			 * purpose. Hence we swap the safe copy with the real
1952			 * copy, allowing the safe copy to be freed and holding
1953			 * on to the real copy for later use in indir_trunc.
1954			 */
1955			if (indirdep->ir_state & GOINGAWAY) {
1956				FREE_LOCK(&lk);
1957				panic("deallocate_dependencies: already gone");
1958			}
1959			indirdep->ir_state |= GOINGAWAY;
1960			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
1961				free_allocindir(aip, inodedep);
1962			if (bp->b_lblkno >= 0 ||
1963			    bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
1964				FREE_LOCK(&lk);
1965				panic("deallocate_dependencies: not indir");
1966			}
1967			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
1968			    bp->b_bcount);
1969			WORKLIST_REMOVE(wk);
1970			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
1971			continue;
1972
1973		case D_PAGEDEP:
1974			pagedep = WK_PAGEDEP(wk);
1975			/*
1976			 * None of the directory additions will ever be
1977			 * visible, so they can simply be tossed.
1978			 */
1979			for (i = 0; i < DAHASHSZ; i++)
1980				while ((dap =
1981				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
1982					free_diradd(dap);
1983			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
1984				free_diradd(dap);
1985			/*
1986			 * Copy any directory remove dependencies to the list
1987			 * to be processed after the zero'ed inode is written.
1988			 * If the inode has already been written, then they
1989			 * can be dumped directly onto the work list.
1990			 */
1991			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
1992				LIST_REMOVE(dirrem, dm_next);
1993				dirrem->dm_dirinum = pagedep->pd_ino;
1994				if (inodedep == NULL ||
1995				    (inodedep->id_state & ALLCOMPLETE) ==
1996				     ALLCOMPLETE)
1997					add_to_worklist(&dirrem->dm_list);
1998				else
1999					WORKLIST_INSERT(&inodedep->id_bufwait,
2000					    &dirrem->dm_list);
2001			}
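			/*
			 * If this pagedep is tracking a newly allocated
			 * directory block, find the associated newdirblk
			 * dependency on the inode's buffer wait list and
			 * release it as well.
			 */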
2002			if ((pagedep->pd_state & NEWBLOCK) != 0) {
2003				LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2004					if (wk->wk_type == D_NEWDIRBLK &&
2005					    WK_NEWDIRBLK(wk)->db_pagedep ==
2006					      pagedep)
2007						break;
2008				if (wk != NULL) {
2009					WORKLIST_REMOVE(wk);
2010					free_newdirblk(WK_NEWDIRBLK(wk));
2011				} else {
2012					FREE_LOCK(&lk);
2013					panic("deallocate_dependencies: "
2014					      "lost pagedep");
2015				}
2016			}
2017			WORKLIST_REMOVE(&pagedep->pd_list);
2018			LIST_REMOVE(pagedep, pd_hash);
2019			WORKITEM_FREE(pagedep, D_PAGEDEP);
2020			continue;
2021
2022		case D_ALLOCINDIR:
2023			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2024			continue;
2025
2026		case D_ALLOCDIRECT:
2027		case D_INODEDEP:
2028			FREE_LOCK(&lk);
2029			panic("deallocate_dependencies: Unexpected type %s",
2030			    TYPENAME(wk->wk_type));
2031			/* NOTREACHED */
2032
2033		default:
2034			FREE_LOCK(&lk);
2035			panic("deallocate_dependencies: Unknown type %s",
2036			    TYPENAME(wk->wk_type));
2037			/* NOTREACHED */
2038		}
2039	}
2040}
2041
2042/*
2043 * Free an allocdirect. Generate a new freefrag work request if appropriate.
2044 * This routine must be called with splbio interrupts blocked.
2045 */
2046static void
2047free_allocdirect(adphead, adp, delay)
2048	struct allocdirectlst *adphead;
2049	struct allocdirect *adp;
2050	int delay;
2051{
2052	struct newdirblk *newdirblk;
2053	struct worklist *wk;
2054
2055#ifdef DEBUG
2056	if (lk.lkt_held == NOHOLDER)
2057		panic("free_allocdirect: lock not held");
2058#endif
2059	if ((adp->ad_state & DEPCOMPLETE) == 0)
2060		LIST_REMOVE(adp, ad_deps);
2061	TAILQ_REMOVE(adphead, adp, ad_next);
2062	if ((adp->ad_state & COMPLETE) == 0)
2063		WORKLIST_REMOVE(&adp->ad_list);
2064	if (adp->ad_freefrag != NULL) {
2065		if (delay)
2066			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2067			    &adp->ad_freefrag->ff_list);
2068		else
2069			add_to_worklist(&adp->ad_freefrag->ff_list);
2070	}
2071	if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2072		newdirblk = WK_NEWDIRBLK(wk);
2073		WORKLIST_REMOVE(&newdirblk->db_list);
2074		if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
2075			panic("free_allocdirect: extra newdirblk");
2076		if (delay)
2077			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2078			    &newdirblk->db_list);
2079		else
2080			free_newdirblk(newdirblk);
2081	}
2082	WORKITEM_FREE(adp, D_ALLOCDIRECT);
2083}
2084
2085/*
2086 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2087 * This routine must be called with splbio interrupts blocked.
2088 */
2089static void
2090free_newdirblk(newdirblk)
2091	struct newdirblk *newdirblk;
2092{
2093	struct pagedep *pagedep;
2094	struct diradd *dap;
2095	int i;
2096
2097#ifdef DEBUG
2098	if (lk.lkt_held == NOHOLDER)
2099		panic("free_newdirblk: lock not held");
2100#endif
2101	/*
2102	 * If the pagedep is still linked onto the directory buffer
2103	 * dependency chain, then some of the entries on the
2104	 * pd_pendinghd list may not be committed to disk yet. In
2105	 * this case, we will simply clear the NEWBLOCK flag and
2106	 * let the pd_pendinghd list be processed when the pagedep
2107	 * is next written. If the pagedep is no longer on the buffer
2108	 * dependency chain, then all the entries on the pd_pendinghd
2109	 * list are committed to disk and we can free them here.
2110	 */
2111	pagedep = newdirblk->db_pagedep;
2112	pagedep->pd_state &= ~NEWBLOCK;
2113	if ((pagedep->pd_state & ONWORKLIST) == 0)
2114		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2115			free_diradd(dap);
2116	/*
2117	 * If no dependencies remain, the pagedep will be freed.
2118	 */
2119	for (i = 0; i < DAHASHSZ; i++)
2120		if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
2121			break;
2122	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2123		LIST_REMOVE(pagedep, pd_hash);
2124		WORKITEM_FREE(pagedep, D_PAGEDEP);
2125	}
2126	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2127}
2128
2129/*
2130 * Prepare an inode to be freed. The actual free operation is not
2131 * done until the zero'ed inode has been written to disk.
2132 */
2133void
2134softdep_freefile(pvp, ino, mode)
2135	struct vnode *pvp;
2136	ino_t ino;
2137	int mode;
2138{
2139	struct inode *ip = VTOI(pvp);
2140	struct inodedep *inodedep;
2141	struct freefile *freefile;
2142
2143	/*
2144	 * This sets up the inode de-allocation dependency.
2145	 */
2146	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
2147		M_FREEFILE, M_SOFTDEP_FLAGS);
2148	freefile->fx_list.wk_type = D_FREEFILE;
2149	freefile->fx_list.wk_state = 0;
2150	freefile->fx_mode = mode;
2151	freefile->fx_oldinum = ino;
2152	freefile->fx_devvp = ip->i_devvp;
2153	freefile->fx_mnt = ITOV(ip)->v_mount;
2154	if ((ip->i_flag & IN_SPACECOUNTED) == 0)
2155		ip->i_fs->fs_pendinginodes += 1;
2156
2157	/*
2158	 * If the inodedep does not exist, then the zero'ed inode has
2159	 * been written to disk. If the allocated inode has never been
2160	 * written to disk, then the on-disk inode is zero'ed. In either
2161	 * case we can free the file immediately.
2162	 */
2163	ACQUIRE_LOCK(&lk);
2164	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2165	    check_inode_unwritten(inodedep)) {
2166		FREE_LOCK(&lk);
2167		handle_workitem_freefile(freefile);
2168		return;
2169	}
2170	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2171	FREE_LOCK(&lk);
2172}
2173
2174/*
2175 * Check to see if an inode has never been written to disk. If
2176 * so, free the inodedep and return success; otherwise return failure.
2177 * This routine must be called with splbio interrupts blocked.
2178 *
2179 * If we still have a bitmap dependency, then the inode has never
2180 * been written to disk. Drop the dependency as it is no longer
2181 * necessary since the inode is being deallocated. We set the
2182 * ALLCOMPLETE flags since the bitmap now properly shows that the
2183 * inode is not allocated. Even if the inode is actively being
2184 * written, it has been rolled back to its zero'ed state, so we
2185 * are ensured that a zero inode is what is on the disk. For short
2186 * are assured that a zero'ed inode is what is on the disk. For
2187 * short-lived files, this change will usually result in removing all the
2188 */
2189static int
2190check_inode_unwritten(inodedep)
2191	struct inodedep *inodedep;
2192{
2193
2194	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2195	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2196	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2197	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
2198	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2199	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2200	    inodedep->id_nlinkdelta != 0)
2201		return (0);
2202	inodedep->id_state |= ALLCOMPLETE;
2203	LIST_REMOVE(inodedep, id_deps);
2204	inodedep->id_buf = NULL;
2205	if (inodedep->id_state & ONWORKLIST)
2206		WORKLIST_REMOVE(&inodedep->id_list);
2207	if (inodedep->id_savedino1 != NULL) {
2208		FREE(inodedep->id_savedino1, M_INODEDEP);
2209		inodedep->id_savedino1 = NULL;
2210	}
2211	if (free_inodedep(inodedep) == 0) {
2212		FREE_LOCK(&lk);
2213		panic("check_inode_unwritten: busy inode");
2214	}
2215	return (1);
2216}
2217
2218/*
2219 * Try to free an inodedep structure. Return 1 if it could be freed.
2220 */
2221static int
2222free_inodedep(inodedep)
2223	struct inodedep *inodedep;
2224{
2225
2226	if ((inodedep->id_state & ONWORKLIST) != 0 ||
2227	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2228	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2229	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2230	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
2231	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2232	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2233	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2234		return (0);
2235	LIST_REMOVE(inodedep, id_hash);
2236	WORKITEM_FREE(inodedep, D_INODEDEP);
2237	num_inodedep -= 1;
2238	return (1);
2239}
2240
2241/*
2242 * This workitem routine performs the block de-allocation.
2243 * The workitem is added to the pending list after the updated
2244 * inode block has been written to disk.  As mentioned above,
2245 * checks regarding the number of blocks de-allocated (compared
2246 * to the number of blocks allocated for the file) are also
2247 * performed in this function.
2248 */
2249static void
2250handle_workitem_freeblocks(freeblks, flags)
2251	struct freeblks *freeblks;
2252	int flags;
2253{
2254	struct inode *ip;
2255	struct vnode *vp;
2256	struct fs *fs;
2257	int i, nblocks, level, bsize;
2258	ufs2_daddr_t bn, blocksreleased = 0;
2259	int error, allerror = 0;
2260	ufs_lbn_t baselbns[NIADDR], tmpval;
2261
2262	fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
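	/*
	 * Compute the logical block number of the first block mapped
	 * by each level of indirect block.
	 */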
2263	tmpval = 1;
2264	baselbns[0] = NDADDR;
2265	for (i = 1; i < NIADDR; i++) {
2266		tmpval *= NINDIR(fs);
2267		baselbns[i] = baselbns[i - 1] + tmpval;
2268	}
2269	nblocks = btodb(fs->fs_bsize);
2270	blocksreleased = 0;
2271	/*
2272	 * Indirect blocks first.
2273	 */
2274	for (level = (NIADDR - 1); level >= 0; level--) {
2275		if ((bn = freeblks->fb_iblks[level]) == 0)
2276			continue;
2277		if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), level,
2278		    baselbns[level], &blocksreleased)) != 0)
2279			allerror = error;
2280		ffs_blkfree(fs, freeblks->fb_devvp, bn, fs->fs_bsize,
2281		    freeblks->fb_previousinum);
2282		fs->fs_pendingblocks -= nblocks;
2283		blocksreleased += nblocks;
2284	}
2285	/*
2286	 * All direct blocks or frags.
2287	 */
2288	for (i = (NDADDR - 1); i >= 0; i--) {
2289		if ((bn = freeblks->fb_dblks[i]) == 0)
2290			continue;
2291		bsize = sblksize(fs, freeblks->fb_oldsize, i);
2292		ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize,
2293		    freeblks->fb_previousinum);
2294		fs->fs_pendingblocks -= btodb(bsize);
2295		blocksreleased += btodb(bsize);
2296	}
2297	/*
2298	 * If we still have not finished background cleanup, then check
2299	 * to see if the block count needs to be adjusted.
2300	 */
2301	if (freeblks->fb_chkcnt != blocksreleased &&
2302	    (fs->fs_flags & FS_UNCLEAN) != 0 &&
2303	    VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum,
2304	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
2305		ip = VTOI(vp);
2306		DIP(ip, i_blocks) += freeblks->fb_chkcnt - blocksreleased;
2307		ip->i_flag |= IN_CHANGE;
2308		vput(vp);
2309	}
2310
2311#ifdef DIAGNOSTIC
2312	if (freeblks->fb_chkcnt != blocksreleased &&
2313	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2314		printf("handle_workitem_freeblocks: block count\n");
2315	if (allerror)
2316		softdep_error("handle_workitem_freeblks", allerror);
2317#endif /* DIAGNOSTIC */
2318
2319	WORKITEM_FREE(freeblks, D_FREEBLKS);
2320}
2321
2322/*
2323 * Release blocks associated with the inode ip and stored in the indirect
2324 * block dbn. If level is greater than SINGLE, the block is an indirect block
2325 * and recursive calls to indirtrunc must be used to cleanse other indirect
2326 * and recursive calls to indir_trunc must be used to cleanse other indirect
2327 */
2328static int
2329indir_trunc(freeblks, dbn, level, lbn, countp)
2330	struct freeblks *freeblks;
2331	ufs2_daddr_t dbn;
2332	int level;
2333	ufs_lbn_t lbn;
2334	ufs2_daddr_t *countp;
2335{
2336	struct buf *bp;
2337	struct fs *fs;
2338	struct worklist *wk;
2339	struct indirdep *indirdep;
2340	ufs1_daddr_t *bap1 = 0;
2341	ufs2_daddr_t nb, *bap2 = 0;
2342	ufs_lbn_t lbnadd;
2343	int i, nblocks, ufs1fmt;
2344	int error, allerror = 0;
2345
2346	fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2347	lbnadd = 1;
2348	for (i = level; i > 0; i--)
2349		lbnadd *= NINDIR(fs);
2350	/*
2351	 * Get buffer of block pointers to be freed. This routine is not
2352	 * called until the zero'ed inode has been written, so it is safe
2353	 * to free blocks as they are encountered. Because the inode has
2354	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2355	 * have to use the on-disk address and the block device for the
2356	 * filesystem to look them up. If the file was deleted before its
2357	 * indirect blocks were all written to disk, the routine that set
2358	 * us up (deallocate_dependencies) will have arranged to leave
2359	 * a complete copy of the indirect block in memory for our use.
2360	 * Otherwise we have to read the blocks in from the disk.
2361	 */
2362	ACQUIRE_LOCK(&lk);
2363	if ((bp = incore(freeblks->fb_devvp, dbn)) != NULL &&
2364	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2365		if (wk->wk_type != D_INDIRDEP ||
2366		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2367		    (indirdep->ir_state & GOINGAWAY) == 0) {
2368			FREE_LOCK(&lk);
2369			panic("indir_trunc: lost indirdep");
2370		}
2371		WORKLIST_REMOVE(wk);
2372		WORKITEM_FREE(indirdep, D_INDIRDEP);
2373		if (LIST_FIRST(&bp->b_dep) != NULL) {
2374			FREE_LOCK(&lk);
2375			panic("indir_trunc: dangling dep");
2376		}
2377		FREE_LOCK(&lk);
2378	} else {
2379		FREE_LOCK(&lk);
2380		error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
2381		    NOCRED, &bp);
2382		if (error) {
2383			brelse(bp);
2384			return (error);
2385		}
2386	}
2387	/*
2388	 * Recursively free indirect blocks.
2389	 */
2390	if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UFS1) {
2391		ufs1fmt = 1;
2392		bap1 = (ufs1_daddr_t *)bp->b_data;
2393	} else {
2394		ufs1fmt = 0;
2395		bap2 = (ufs2_daddr_t *)bp->b_data;
2396	}
2397	nblocks = btodb(fs->fs_bsize);
2398	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2399		if (ufs1fmt)
2400			nb = bap1[i];
2401		else
2402			nb = bap2[i];
2403		if (nb == 0)
2404			continue;
2405		if (level != 0) {
2406			if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
2407			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2408				allerror = error;
2409		}
2410		ffs_blkfree(fs, freeblks->fb_devvp, nb, fs->fs_bsize,
2411		    freeblks->fb_previousinum);
2412		fs->fs_pendingblocks -= nblocks;
2413		*countp += nblocks;
2414	}
2415	bp->b_flags |= B_INVAL | B_NOCACHE;
2416	brelse(bp);
2417	return (allerror);
2418}
2419
2420/*
2421 * Free an allocindir.
2422 * This routine must be called with splbio interrupts blocked.
2423 */
2424static void
2425free_allocindir(aip, inodedep)
2426	struct allocindir *aip;
2427	struct inodedep *inodedep;
2428{
2429	struct freefrag *freefrag;
2430
2431#ifdef DEBUG
2432	if (lk.lkt_held == NOHOLDER)
2433		panic("free_allocindir: lock not held");
2434#endif
2435	if ((aip->ai_state & DEPCOMPLETE) == 0)
2436		LIST_REMOVE(aip, ai_deps);
2437	if (aip->ai_state & ONWORKLIST)
2438		WORKLIST_REMOVE(&aip->ai_list);
2439	LIST_REMOVE(aip, ai_next);
2440	if ((freefrag = aip->ai_freefrag) != NULL) {
2441		if (inodedep == NULL)
2442			add_to_worklist(&freefrag->ff_list);
2443		else
2444			WORKLIST_INSERT(&inodedep->id_bufwait,
2445			    &freefrag->ff_list);
2446	}
2447	WORKITEM_FREE(aip, D_ALLOCINDIR);
2448}
2449
2450/*
2451 * Directory entry addition dependencies.
2452 *
2453 * When adding a new directory entry, the inode (with its incremented link
2454 * count) must be written to disk before the directory entry's pointer to it.
2455 * Also, if the inode is newly allocated, the corresponding freemap must be
2456 * updated (on disk) before the directory entry's pointer. These requirements
2457 * are met via undo/redo on the directory entry's pointer, which consists
2458 * simply of the inode number.
2459 *
2460 * As directory entries are added and deleted, the free space within a
2461 * directory block can become fragmented.  The ufs filesystem will compact
2462 * a fragmented directory block to make space for a new entry. When this
2463 * occurs, the offsets of previously added entries change. Any "diradd"
2464 * dependency structures corresponding to these entries must be updated with
2465 * the new offsets.
2466 */
2467
2468/*
2469 * This routine is called after the in-memory inode's link
2470 * count has been incremented, but before the directory entry's
2471 * pointer to the inode has been set.
2472 */
2473int
2474softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2475	struct buf *bp;		/* buffer containing directory block */
2476	struct inode *dp;	/* inode for directory */
2477	off_t diroffset;	/* offset of new entry in directory */
2478	ino_t newinum;		/* inode referenced by new directory entry */
2479	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2480	int isnewblk;		/* entry is in a newly allocated block */
2481{
2482	int offset;		/* offset of new entry within directory block */
2483	ufs_lbn_t lbn;		/* block in directory containing new entry */
2484	struct fs *fs;
2485	struct diradd *dap;
2486	struct allocdirect *adp;
2487	struct pagedep *pagedep;
2488	struct inodedep *inodedep;
2489	struct newdirblk *newdirblk = 0;
2490	struct mkdir *mkdir1, *mkdir2;
2491
2492	/*
2493	 * Whiteouts have no dependencies.
2494	 */
2495	if (newinum == WINO) {
2496		if (newdirbp != NULL)
2497			bdwrite(newdirbp);
2498		return (0);
2499	}
2500
2501	fs = dp->i_fs;
2502	lbn = lblkno(fs, diroffset);
2503	offset = blkoff(fs, diroffset);
2504	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2505		M_SOFTDEP_FLAGS|M_ZERO);
2506	dap->da_list.wk_type = D_DIRADD;
2507	dap->da_offset = offset;
2508	dap->da_newinum = newinum;
2509	dap->da_state = ATTACHED;
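	/*
	 * A newdirblk structure is needed only when this entry begins
	 * a newly allocated direct block; it is used to track that
	 * block until it has been committed to disk.
	 */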
2510	if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2511		MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
2512		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
2513		newdirblk->db_list.wk_type = D_NEWDIRBLK;
2514		newdirblk->db_state = 0;
2515	}
2516	if (newdirbp == NULL) {
2517		dap->da_state |= DEPCOMPLETE;
2518		ACQUIRE_LOCK(&lk);
2519	} else {
2520		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2521		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2522		    M_SOFTDEP_FLAGS);
2523		mkdir1->md_list.wk_type = D_MKDIR;
2524		mkdir1->md_state = MKDIR_BODY;
2525		mkdir1->md_diradd = dap;
2526		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2527		    M_SOFTDEP_FLAGS);
2528		mkdir2->md_list.wk_type = D_MKDIR;
2529		mkdir2->md_state = MKDIR_PARENT;
2530		mkdir2->md_diradd = dap;
2531		/*
2532		 * Dependency on "." and ".." being written to disk.
2533		 */
2534		mkdir1->md_buf = newdirbp;
2535		ACQUIRE_LOCK(&lk);
2536		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2537		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2538		FREE_LOCK(&lk);
2539		bdwrite(newdirbp);
2540		/*
2541		 * Dependency on link count increase for parent directory
2542		 */
2543		ACQUIRE_LOCK(&lk);
2544		if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
2545		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2546			dap->da_state &= ~MKDIR_PARENT;
2547			WORKITEM_FREE(mkdir2, D_MKDIR);
2548		} else {
2549			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2550			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2551		}
2552	}
2553	/*
2554	 * Link into parent directory pagedep to await its being written.
2555	 */
2556	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2557		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2558	dap->da_pagedep = pagedep;
2559	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2560	    da_pdlist);
2561	/*
2562	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2563	 * is not yet written. If it is written, do the post-inode write
2564	 * processing to put it on the id_pendinghd list.
2565	 */
2566	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2567	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2568		diradd_inode_written(dap, inodedep);
2569	else
2570		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2571	if (isnewblk) {
2572		/*
2573		 * Directories growing into indirect blocks are rare
2574		 * enough, and new block allocation in those cases rarer
2575		 * still, that we choose not to bother tracking them.
2576		 * Rather, we simply force the new directory entry
2577		 * to disk.
2578		 */
2579		if (lbn >= NDADDR) {
2580			FREE_LOCK(&lk);
2581			/*
2582			 * We only have a new allocation when at the
2583			 * beginning of a new block, not when we are
2584			 * expanding into an existing block.
2585			 */
2586			if (blkoff(fs, diroffset) == 0)
2587				return (1);
2588			return (0);
2589		}
2590		/*
2591		 * We only have a new allocation when at the beginning
2592		 * of a new fragment, not when we are expanding into an
2593		 * existing fragment. Also, there is nothing to do if we
2594		 * are already tracking this block.
2595		 */
2596		if (fragoff(fs, diroffset) != 0) {
2597			FREE_LOCK(&lk);
2598			return (0);
2599		}
2600		if ((pagedep->pd_state & NEWBLOCK) != 0) {
2601			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2602			FREE_LOCK(&lk);
2603			return (0);
2604		}
2605		/*
2606		 * Find our associated allocdirect and have it track us.
2607		 */
2608		if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
2609			panic("softdep_setup_directory_add: lost inodedep");
2610		adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
2611		if (adp == NULL || adp->ad_lbn != lbn) {
2612			FREE_LOCK(&lk);
2613			panic("softdep_setup_directory_add: lost entry");
2614		}
2615		pagedep->pd_state |= NEWBLOCK;
2616		newdirblk->db_pagedep = pagedep;
2617		WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
2618	}
2619	FREE_LOCK(&lk);
2620	return (0);
2621}
2622
2623/*
2624 * This procedure is called to change the offset of a directory
2625 * entry when compacting a directory block, which must be owned
2626 * exclusively by the caller. Note that the actual entry movement
2627 * must be done in this procedure to ensure that no I/O completions
2628 * occur while the move is in progress.
2629 */
2630void
2631softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2632	struct inode *dp;	/* inode for directory */
2633	caddr_t base;		/* address of dp->i_offset */
2634	caddr_t oldloc;		/* address of old directory location */
2635	caddr_t newloc;		/* address of new directory location */
2636	int entrysize;		/* size of directory entry */
2637{
2638	int offset, oldoffset, newoffset;
2639	struct pagedep *pagedep;
2640	struct diradd *dap;
2641	ufs_lbn_t lbn;
2642
2643	ACQUIRE_LOCK(&lk);
2644	lbn = lblkno(dp->i_fs, dp->i_offset);
2645	offset = blkoff(dp->i_fs, dp->i_offset);
2646	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2647		goto done;
2648	oldoffset = offset + (oldloc - base);
2649	newoffset = offset + (newloc - base);
2650
2651	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2652		if (dap->da_offset != oldoffset)
2653			continue;
2654		dap->da_offset = newoffset;
2655		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2656			break;
2657		LIST_REMOVE(dap, da_pdlist);
2658		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2659		    dap, da_pdlist);
2660		break;
2661	}
2662	if (dap == NULL) {
2663
2664		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2665			if (dap->da_offset == oldoffset) {
2666				dap->da_offset = newoffset;
2667				break;
2668			}
2669		}
2670	}
2671done:
2672	bcopy(oldloc, newloc, entrysize);
2673	FREE_LOCK(&lk);
2674}
2675
2676/*
2677 * Free a diradd dependency structure. This routine must be called
2678 * with splbio interrupts blocked.
2679 */
2680static void
2681free_diradd(dap)
2682	struct diradd *dap;
2683{
2684	struct dirrem *dirrem;
2685	struct pagedep *pagedep;
2686	struct inodedep *inodedep;
2687	struct mkdir *mkdir, *nextmd;
2688
2689#ifdef DEBUG
2690	if (lk.lkt_held == NOHOLDER)
2691		panic("free_diradd: lock not held");
2692#endif
2693	WORKLIST_REMOVE(&dap->da_list);
2694	LIST_REMOVE(dap, da_pdlist);
2695	if ((dap->da_state & DIRCHG) == 0) {
2696		pagedep = dap->da_pagedep;
2697	} else {
2698		dirrem = dap->da_previous;
2699		pagedep = dirrem->dm_pagedep;
2700		dirrem->dm_dirinum = pagedep->pd_ino;
2701		add_to_worklist(&dirrem->dm_list);
2702	}
2703	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2704	    0, &inodedep) != 0)
2705		(void) free_inodedep(inodedep);
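	/*
	 * If the diradd is for a new directory, release any mkdir
	 * dependencies that still reference it.
	 */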
2706	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2707		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2708			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2709			if (mkdir->md_diradd != dap)
2710				continue;
2711			dap->da_state &= ~mkdir->md_state;
2712			WORKLIST_REMOVE(&mkdir->md_list);
2713			LIST_REMOVE(mkdir, md_mkdirs);
2714			WORKITEM_FREE(mkdir, D_MKDIR);
2715		}
2716		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2717			FREE_LOCK(&lk);
2718			panic("free_diradd: unfound ref");
2719		}
2720	}
2721	WORKITEM_FREE(dap, D_DIRADD);
2722}
2723
2724/*
2725 * Directory entry removal dependencies.
2726 *
2727 * When removing a directory entry, the entry's inode pointer must be
2728 * zero'ed on disk before the corresponding inode's link count is decremented
2729 * (possibly freeing the inode for re-use). This dependency is handled by
2730 * updating the directory entry but delaying the inode count reduction until
2731 * after the directory block has been written to disk. After this point, the
2732 * inode count can be decremented whenever it is convenient.
2733 */
2734
2735/*
2736 * This routine should be called immediately after removing
2737 * a directory entry.  The inode's link count should not be
2738 * decremented by the calling procedure -- the soft updates
2739 * code will do this task when it is safe.
2740 */
2741void
2742softdep_setup_remove(bp, dp, ip, isrmdir)
2743	struct buf *bp;		/* buffer containing directory block */
2744	struct inode *dp;	/* inode for the directory being modified */
2745	struct inode *ip;	/* inode for directory entry being removed */
2746	int isrmdir;		/* indicates if doing RMDIR */
2747{
2748	struct dirrem *dirrem, *prevdirrem;
2749
2750	/*
2751	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2752	 */
2753	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2754
2755	/*
2756	 * If the COMPLETE flag is clear, then there were no active
2757	 * entries and we want to roll back to a zeroed entry until
2758	 * the new inode is committed to disk. If the COMPLETE flag is
2759	 * set then we have deleted an entry that never made it to
2760	 * disk. If the entry we deleted resulted from a name change,
2761	 * then the old name still resides on disk. We cannot delete
2762	 * its inode (returned to us in prevdirrem) until the zeroed
2763	 * directory entry gets to disk. The new inode has never been
2764	 * referenced on the disk, so can be deleted immediately.
2765	 */
2766	if ((dirrem->dm_state & COMPLETE) == 0) {
2767		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2768		    dm_next);
2769		FREE_LOCK(&lk);
2770	} else {
2771		if (prevdirrem != NULL)
2772			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2773			    prevdirrem, dm_next);
2774		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2775		FREE_LOCK(&lk);
2776		handle_workitem_remove(dirrem, NULL);
2777	}
2778}
2779
2780/*
2781 * Allocate a new dirrem if appropriate and return it along with
2782 * its associated pagedep. Called without a lock, returns with lock.
2783 */
2784static long num_dirrem;		/* number of dirrem allocated */
2785static struct dirrem *
2786newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2787	struct buf *bp;		/* buffer containing directory block */
2788	struct inode *dp;	/* inode for the directory being modified */
2789	struct inode *ip;	/* inode for directory entry being removed */
2790	int isrmdir;		/* indicates if doing RMDIR */
2791	struct dirrem **prevdirremp; /* previously referenced inode, if any */
2792{
2793	int offset;
2794	ufs_lbn_t lbn;
2795	struct diradd *dap;
2796	struct dirrem *dirrem;
2797	struct pagedep *pagedep;
2798
2799	/*
2800	 * Whiteouts have no deletion dependencies.
2801	 */
2802	if (ip == NULL)
2803		panic("newdirrem: whiteout");
2804	/*
2805	 * If we are over our limit, try to improve the situation.
2806	 * Limiting the number of dirrem structures will also limit
2807	 * the number of freefile and freeblks structures.
2808	 */
2809	if (num_dirrem > max_softdeps / 2)
2810		(void) request_cleanup(FLUSH_REMOVE, 0);
2811	num_dirrem += 1;
2812	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2813		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
2814	dirrem->dm_list.wk_type = D_DIRREM;
2815	dirrem->dm_state = isrmdir ? RMDIR : 0;
2816	dirrem->dm_mnt = ITOV(ip)->v_mount;
2817	dirrem->dm_oldinum = ip->i_number;
2818	*prevdirremp = NULL;
2819
2820	ACQUIRE_LOCK(&lk);
2821	lbn = lblkno(dp->i_fs, dp->i_offset);
2822	offset = blkoff(dp->i_fs, dp->i_offset);
2823	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2824		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2825	dirrem->dm_pagedep = pagedep;
2826	/*
2827	 * Check for a diradd dependency for the same directory entry.
2828	 * If present, then both dependencies become obsolete and can
2829	 * be de-allocated. Check for an entry on both the pd_diraddhd
2830	 * list and the pd_pendinghd list.
2831	 */
2832
2833	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2834		if (dap->da_offset == offset)
2835			break;
2836	if (dap == NULL) {
2837
2838		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2839			if (dap->da_offset == offset)
2840				break;
2841		if (dap == NULL)
2842			return (dirrem);
2843	}
2844	/*
2845	 * Must be ATTACHED at this point.
2846	 */
2847	if ((dap->da_state & ATTACHED) == 0) {
2848		FREE_LOCK(&lk);
2849		panic("newdirrem: not ATTACHED");
2850	}
2851	if (dap->da_newinum != ip->i_number) {
2852		FREE_LOCK(&lk);
2853		panic("newdirrem: inum %d should be %d",
2854		    ip->i_number, dap->da_newinum);
2855	}
2856	/*
2857	 * If we are deleting a changed name that never made it to disk,
2858	 * then return the dirrem describing the previous inode (which
2859	 * represents the inode currently referenced from this entry on disk).
2860	 */
2861	if ((dap->da_state & DIRCHG) != 0) {
2862		*prevdirremp = dap->da_previous;
2863		dap->da_state &= ~DIRCHG;
2864		dap->da_pagedep = pagedep;
2865	}
2866	/*
2867	 * We are deleting an entry that never made it to disk.
2868	 * Mark it COMPLETE so we can delete its inode immediately.
2869	 */
2870	dirrem->dm_state |= COMPLETE;
2871	free_diradd(dap);
2872	return (dirrem);
2873}
2874
2875/*
2876 * Directory entry change dependencies.
2877 *
2878 * Changing an existing directory entry requires that an add operation
2879 * be completed first followed by a deletion. The semantics for the addition
2880 * are identical to the description of adding a new entry above except
2881 * that the rollback is to the old inode number rather than zero. Once
2882 * the addition dependency is completed, the removal is done as described
2883 * in the removal routine above.
2884 */
2885
2886/*
2887 * This routine should be called immediately after changing
2888 * a directory entry.  The inode's link count should not be
2889 * decremented by the calling procedure -- the soft updates
2890 * code will perform this task when it is safe.
2891 */
2892void
2893softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2894	struct buf *bp;		/* buffer containing directory block */
2895	struct inode *dp;	/* inode for the directory being modified */
2896	struct inode *ip;	/* inode for directory entry being removed */
2897	ino_t newinum;		/* new inode number for changed entry */
2898	int isrmdir;		/* indicates if doing RMDIR */
2899{
2900	int offset;
2901	struct diradd *dap = NULL;
2902	struct dirrem *dirrem, *prevdirrem;
2903	struct pagedep *pagedep;
2904	struct inodedep *inodedep;
2905
2906	offset = blkoff(dp->i_fs, dp->i_offset);
2907
2908	/*
2909	 * Whiteouts do not need diradd dependencies.
2910	 */
2911	if (newinum != WINO) {
2912		MALLOC(dap, struct diradd *, sizeof(struct diradd),
2913		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
2914		dap->da_list.wk_type = D_DIRADD;
2915		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2916		dap->da_offset = offset;
2917		dap->da_newinum = newinum;
2918	}
2919
2920	/*
2921	 * Allocate a new dirrem and ACQUIRE_LOCK.
2922	 */
2923	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2924	pagedep = dirrem->dm_pagedep;
2925	/*
2926	 * The possible values for isrmdir:
2927	 *	0 - non-directory file rename
2928	 *	1 - directory rename within same directory
2929	 *   inum - directory rename to new directory of given inode number
2930	 * When renaming to a new directory, we are both deleting and
2931	 * creating a new directory entry, so the link count on the new
2932	 * directory should not change. Thus we do not need the followup
2933	 * dirrem which is usually done in handle_workitem_remove. We set
2934	 * the DIRCHG flag to tell handle_workitem_remove to skip the
2935	 * followup dirrem.
2936	 */
2937	if (isrmdir > 1)
2938		dirrem->dm_state |= DIRCHG;
2939
2940	/*
2941	 * Whiteouts have no additional dependencies,
2942	 * so just put the dirrem on the correct list.
2943	 */
2944	if (newinum == WINO) {
2945		if ((dirrem->dm_state & COMPLETE) == 0) {
2946			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
2947			    dm_next);
2948		} else {
2949			dirrem->dm_dirinum = pagedep->pd_ino;
2950			add_to_worklist(&dirrem->dm_list);
2951		}
2952		FREE_LOCK(&lk);
2953		return;
2954	}
2955
2956	/*
2957	 * If the COMPLETE flag is clear, then there were no active
2958	 * entries and we want to roll back to the previous inode until
2959	 * the new inode is committed to disk. If the COMPLETE flag is
2960	 * set, then we have deleted an entry that never made it to disk.
2961	 * If the entry we deleted resulted from a name change, then the old
2962	 * inode reference still resides on disk. Any rollback that we do
2963	 * needs to be to that old inode (returned to us in prevdirrem). If
2964	 * the entry we deleted resulted from a create, then there is
2965	 * no entry on the disk, so we want to roll back to zero rather
2966	 * than the uncommitted inode. In either of the COMPLETE cases we
2967	 * want to immediately free the unwritten and unreferenced inode.
2968	 */
2969	if ((dirrem->dm_state & COMPLETE) == 0) {
2970		dap->da_previous = dirrem;
2971	} else {
2972		if (prevdirrem != NULL) {
2973			dap->da_previous = prevdirrem;
2974		} else {
2975			dap->da_state &= ~DIRCHG;
2976			dap->da_pagedep = pagedep;
2977		}
2978		dirrem->dm_dirinum = pagedep->pd_ino;
2979		add_to_worklist(&dirrem->dm_list);
2980	}
2981	/*
2982	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2983	 * is not yet written. If it is written, do the post-inode write
2984	 * processing to put it on the id_pendinghd list.
2985	 */
2986	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
2987	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2988		dap->da_state |= COMPLETE;
2989		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
2990		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2991	} else {
2992		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
2993		    dap, da_pdlist);
2994		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2995	}
2996	FREE_LOCK(&lk);
2997}
2998
2999/*
3000 * Called whenever the link count on an inode is changed.
3001 * It creates an inode dependency so that the new reference(s)
3002 * to the inode cannot be committed to disk until the updated
3003 * inode has been written.
3004 */
3005void
3006softdep_change_linkcnt(ip)
3007	struct inode *ip;	/* the inode with the increased link count */
3008{
3009	struct inodedep *inodedep;
3010
3011	ACQUIRE_LOCK(&lk);
3012	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
3013	if (ip->i_nlink < ip->i_effnlink) {
3014		FREE_LOCK(&lk);
3015		panic("softdep_change_linkcnt: bad delta");
3016	}
3017	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3018	FREE_LOCK(&lk);
3019}
3020
3021/*
3022 * Called when the effective link count and the reference count
3023 * on an inode drops to zero. At this point there are no names
3024 * referencing the file in the filesystem and no active file
3025 * references. The space associated with the file will be freed
3026 * as soon as the necessary soft dependencies are cleared.
3027 */
3028void
3029softdep_releasefile(ip)
3030	struct inode *ip;	/* inode with the zero effective link count */
3031{
3032	struct inodedep *inodedep;
3033
3034	if (ip->i_effnlink > 0)
3035		panic("softdep_releasefile: file still referenced");
3036	/*
3037	 * We may be called several times as the real reference count
3038	 * drops to zero. We only want to account for the space once.
3039	 */
3040	if (ip->i_flag & IN_SPACECOUNTED)
3041		return;
3042	/*
3043	 * We have to deactivate a snapshot; otherwise copy-on-write may
3044	 * add blocks and the cleanup may remove blocks after we have
3045	 * tried to account for them.
3046	 */
3047	if ((ip->i_flags & SF_SNAPSHOT) != 0)
3048		ffs_snapremove(ITOV(ip));
3049	/*
3050	 * If we are tracking an nlinkdelta, we have to also remember
3051	 * whether we accounted for the freed space yet.
3052	 */
3053	ACQUIRE_LOCK(&lk);
3054	if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep)))
3055		inodedep->id_state |= SPACECOUNTED;
3056	FREE_LOCK(&lk);
3057	ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks);
3058	ip->i_fs->fs_pendinginodes += 1;
3059	ip->i_flag |= IN_SPACECOUNTED;
3060}
3061
3062/*
3063 * This workitem decrements the inode's link count.
3064 * If the link count reaches zero, the file is removed.
3065 */
3066static void
3067handle_workitem_remove(dirrem, xp)
3068	struct dirrem *dirrem;
3069	struct vnode *xp;
3070{
3071	struct thread *td = curthread;
3072	struct inodedep *inodedep;
3073	struct vnode *vp;
3074	struct inode *ip;
3075	ino_t oldinum;
3076	int error;
3077
3078	if ((vp = xp) == NULL &&
3079	    (error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, LK_EXCLUSIVE,
3080	     &vp)) != 0) {
3081		softdep_error("handle_workitem_remove: vget", error);
3082		return;
3083	}
3084	ip = VTOI(vp);
3085	ACQUIRE_LOCK(&lk);
3086	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
3087		FREE_LOCK(&lk);
3088		panic("handle_workitem_remove: lost inodedep");
3089	}
3090	/*
3091	 * Normal file deletion.
3092	 */
3093	if ((dirrem->dm_state & RMDIR) == 0) {
3094		ip->i_nlink--;
3095		DIP(ip, i_nlink) = ip->i_nlink;
3096		ip->i_flag |= IN_CHANGE;
3097		if (ip->i_nlink < ip->i_effnlink) {
3098			FREE_LOCK(&lk);
3099			panic("handle_workitem_remove: bad file delta");
3100		}
3101		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3102		FREE_LOCK(&lk);
3103		vput(vp);
3104		num_dirrem -= 1;
3105		WORKITEM_FREE(dirrem, D_DIRREM);
3106		return;
3107	}
3108	/*
3109	 * Directory deletion. Decrement reference count for both the
3110	 * just deleted parent directory entry and the reference for ".".
3111	 * Next truncate the directory to length zero. When the
3112	 * truncation completes, arrange to have the reference count on
3113	 * the parent decremented to account for the loss of "..".
3114	 */
3115	ip->i_nlink -= 2;
3116	DIP(ip, i_nlink) = ip->i_nlink;
3117	ip->i_flag |= IN_CHANGE;
3118	if (ip->i_nlink < ip->i_effnlink) {
3119		FREE_LOCK(&lk);
3120		panic("handle_workitem_remove: bad dir delta");
3121	}
3122	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3123	FREE_LOCK(&lk);
3124	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
3125		softdep_error("handle_workitem_remove: truncate", error);
3126	/*
3127	 * Rename a directory to a new parent. Since we are both deleting
3128	 * and creating a new directory entry, the link count on the new
3129	 * directory should not change. Thus we skip the followup dirrem.
3130	 */
3131	if (dirrem->dm_state & DIRCHG) {
3132		vput(vp);
3133		num_dirrem -= 1;
3134		WORKITEM_FREE(dirrem, D_DIRREM);
3135		return;
3136	}
3137	/*
3138	 * If the inodedep does not exist, then the zero'ed inode has
3139	 * been written to disk. If the allocated inode has never been
3140	 * written to disk, then the on-disk inode is zero'ed. In either
3141	 * case we can remove the file immediately.
3142	 */
3143	ACQUIRE_LOCK(&lk);
3144	dirrem->dm_state = 0;
3145	oldinum = dirrem->dm_oldinum;
3146	dirrem->dm_oldinum = dirrem->dm_dirinum;
3147	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
3148	    check_inode_unwritten(inodedep)) {
3149		FREE_LOCK(&lk);
3150		vput(vp);
3151		handle_workitem_remove(dirrem, NULL);
3152		return;
3153	}
3154	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3155	FREE_LOCK(&lk);
3156	vput(vp);
3157}
3158
3159/*
3160 * Inode de-allocation dependencies.
3161 *
3162 * When an inode's link count is reduced to zero, it can be de-allocated. We
3163 * found it convenient to postpone de-allocation until after the inode is
3164 * written to disk with its new link count (zero).  At this point, all of the
3165 * on-disk inode's block pointers are nullified and, with careful dependency
3166 * list ordering, all dependencies related to the inode will be satisfied and
3167 * the corresponding dependency structures de-allocated.  So, if/when the
3168 * inode is reused, there will be no mixing of old dependencies with new
3169 * ones.  This artificial dependency is set up by the block de-allocation
3170 * procedure above (softdep_setup_freeblocks) and completed by the
3171 * following procedure.
3172 */
3173static void
3174handle_workitem_freefile(freefile)
3175	struct freefile *freefile;
3176{
3177	struct fs *fs;
3178	struct inodedep *idp;
3179	int error;
3180
3181	fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
3182#ifdef DEBUG
3183	ACQUIRE_LOCK(&lk);
3184	error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
3185	FREE_LOCK(&lk);
3186	if (error)
3187		panic("handle_workitem_freefile: inodedep survived");
3188#endif
3189	fs->fs_pendinginodes -= 1;
3190	if ((error = ffs_freefile(fs, freefile->fx_devvp, freefile->fx_oldinum,
3191	     freefile->fx_mode)) != 0)
3192		softdep_error("handle_workitem_freefile", error);
3193	WORKITEM_FREE(freefile, D_FREEFILE);
3194}
3195
3196/*
3197 * Disk writes.
3198 *
3199 * The dependency structures constructed above are most actively used when file
3200 * system blocks are written to disk.  No constraints are placed on when a
3201 * block can be written, but unsatisfied update dependencies are made safe by
3202 * modifying (or replacing) the source memory for the duration of the disk
3203 * write.  When the disk write completes, the memory block is again brought
3204 * up-to-date.
3205 *
3206 * In-core inode structure reclamation.
3207 *
3208 * Because there are a finite number of "in-core" inode structures, they are
3209 * reused regularly.  By transferring all inode-related dependencies to the
3210 * in-memory inode block and indexing them separately (via "inodedep"s), we
3211 * can allow "in-core" inode structures to be reused at any time and avoid
3212 * any increase in contention.
3213 *
3214 * Called just before entering the device driver to initiate a new disk I/O.
3215 * The buffer must be locked, thus, no I/O completion operations can occur
3216 * while we are manipulating its associated dependencies.
3217 */
3218static void
3219softdep_disk_io_initiation(bp)
3220	struct buf *bp;		/* structure describing disk write to occur */
3221{
3222	struct worklist *wk, *nextwk;
3223	struct indirdep *indirdep;
3224	struct inodedep *inodedep;
3225
3226	/*
3227	 * We only care about write operations. There should never
3228	 * be dependencies for reads.
3229	 */
3230	if (bp->b_iocmd == BIO_READ)
3231		panic("softdep_disk_io_initiation: read");
3232	/*
3233	 * Do any necessary pre-I/O processing.
3234	 */
3235	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
3236		nextwk = LIST_NEXT(wk, wk_list);
3237		switch (wk->wk_type) {
3238
3239		case D_PAGEDEP:
3240			initiate_write_filepage(WK_PAGEDEP(wk), bp);
3241			continue;
3242
3243		case D_INODEDEP:
3244			inodedep = WK_INODEDEP(wk);
3245			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3246				initiate_write_inodeblock_ufs1(inodedep, bp);
3247			else
3248				initiate_write_inodeblock_ufs2(inodedep, bp);
3249			continue;
3250
3251		case D_INDIRDEP:
3252			indirdep = WK_INDIRDEP(wk);
3253			if (indirdep->ir_state & GOINGAWAY)
3254				panic("disk_io_initiation: indirdep gone");
3255			/*
3256			 * If there are no remaining dependencies, then this
3257			 * write will contain the real block pointers, so the
3258			 * dependency can be freed.
3259			 */
3260			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3261				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
3262				brelse(indirdep->ir_savebp);
3263				/* inline expand WORKLIST_REMOVE(wk); */
3264				wk->wk_state &= ~ONWORKLIST;
3265				LIST_REMOVE(wk, wk_list);
3266				WORKITEM_FREE(indirdep, D_INDIRDEP);
3267				continue;
3268			}
3269			/*
3270			 * Replace up-to-date version with safe version.
3271			 */
3272			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3273			    M_INDIRDEP, M_SOFTDEP_FLAGS);
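			/*
			 * Save the current contents and substitute the safe
			 * copy (which omits block pointers whose dependencies
			 * are not yet satisfied) for the duration of the
			 * write. The saved data is copied back by
			 * softdep_disk_write_complete once the write of the
			 * safe version has finished.
			 */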
3274			ACQUIRE_LOCK(&lk);
3275			indirdep->ir_state &= ~ATTACHED;
3276			indirdep->ir_state |= UNDONE;
3277			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3278			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3279			    bp->b_bcount);
3280			FREE_LOCK(&lk);
3281			continue;
3282
3283		case D_MKDIR:
3284		case D_BMSAFEMAP:
3285		case D_ALLOCDIRECT:
3286		case D_ALLOCINDIR:
3287			continue;
3288
3289		default:
3290			panic("handle_disk_io_initiation: Unexpected type %s",
3291			    TYPENAME(wk->wk_type));
3292			/* NOTREACHED */
3293		}
3294	}
3295}
3296
3297/*
3298 * Called from within the procedure above to deal with unsatisfied
3299 * allocation dependencies in a directory. The buffer must be locked,
3300 * thus, no I/O completion operations can occur while we are
3301 * manipulating its associated dependencies.
3302 */
3303static void
3304initiate_write_filepage(pagedep, bp)
3305	struct pagedep *pagedep;
3306	struct buf *bp;
3307{
3308	struct diradd *dap;
3309	struct direct *ep;
3310	int i;
3311
3312	if (pagedep->pd_state & IOSTARTED) {
3313		/*
3314		 * This can only happen if there is a driver that does not
3315		 * understand chaining. Here biodone will reissue the call
3316		 * to strategy for the incomplete buffers.
3317		 */
3318		printf("initiate_write_filepage: already started\n");
3319		return;
3320	}
3321	pagedep->pd_state |= IOSTARTED;
3322	ACQUIRE_LOCK(&lk);
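	/*
	 * Roll back each uncommitted directory addition for the duration
	 * of the write: a changed entry reverts to the previous inode
	 * number and a new entry reverts to zero, since an entry may not
	 * reference an inode before that inode has been written to disk.
	 */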
3323	for (i = 0; i < DAHASHSZ; i++) {
3324		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3325			ep = (struct direct *)
3326			    ((char *)bp->b_data + dap->da_offset);
3327			if (ep->d_ino != dap->da_newinum) {
3328				FREE_LOCK(&lk);
3329				panic("%s: dir inum %d != new %d",
3330				    "initiate_write_filepage",
3331				    ep->d_ino, dap->da_newinum);
3332			}
3333			if (dap->da_state & DIRCHG)
3334				ep->d_ino = dap->da_previous->dm_oldinum;
3335			else
3336				ep->d_ino = 0;
3337			dap->da_state &= ~ATTACHED;
3338			dap->da_state |= UNDONE;
3339		}
3340	}
3341	FREE_LOCK(&lk);
3342}
3343
3344/*
3345 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
3346 * Note that any bug fixes made to this routine must be done in the
3347 * version found below.
3348 *
3349 * Called from within the procedure above to deal with unsatisfied
3350 * allocation dependencies in an inodeblock. The buffer must be
3351 * locked, thus, no I/O completion operations can occur while we
3352 * are manipulating its associated dependencies.
3353 */
3354static void
3355initiate_write_inodeblock_ufs1(inodedep, bp)
3356	struct inodedep *inodedep;
3357	struct buf *bp;			/* The inode block */
3358{
3359	struct allocdirect *adp, *lastadp;
3360	struct ufs1_dinode *dp;
3361	struct fs *fs;
3362	ufs_lbn_t i, prevlbn = 0;
3363	int deplist;
3364
3365	if (inodedep->id_state & IOSTARTED)
3366		panic("initiate_write_inodeblock_ufs1: already started");
3367	inodedep->id_state |= IOSTARTED;
3368	fs = inodedep->id_fs;
3369	dp = (struct ufs1_dinode *)bp->b_data +
3370	    ino_to_fsbo(fs, inodedep->id_ino);
3371	/*
3372	 * If the bitmap is not yet written, then the allocated
3373	 * inode cannot be written to disk.
3374	 */
3375	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3376		if (inodedep->id_savedino1 != NULL)
3377			panic("initiate_write_inodeblock_ufs1: I/O underway");
3378		MALLOC(inodedep->id_savedino1, struct ufs1_dinode *,
3379		    sizeof(struct ufs1_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3380		*inodedep->id_savedino1 = *dp;
3381		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3382		return;
3383	}
3384	/*
3385	 * If no dependencies, then there is nothing to roll back.
3386	 */
3387	inodedep->id_savedsize = dp->di_size;
3388	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3389		return;
3390	/*
3391	 * Set the dependencies to busy.
3392	 */
3393	ACQUIRE_LOCK(&lk);
3394	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3395	     adp = TAILQ_NEXT(adp, ad_next)) {
3396#ifdef DIAGNOSTIC
3397		if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3398			FREE_LOCK(&lk);
3399			panic("softdep_write_inodeblock: lbn order");
3400		}
3401		prevlbn = adp->ad_lbn;
3402		if (adp->ad_lbn < NDADDR &&
3403		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3404			FREE_LOCK(&lk);
3405			panic("%s: direct pointer #%lld mismatch %d != %lld",
3406			    "softdep_write_inodeblock",
3407			    (intmax_t)adp->ad_lbn,
3408			    dp->di_db[adp->ad_lbn],
3409			    (intmax_t)adp->ad_newblkno);
3410		}
3411		if (adp->ad_lbn >= NDADDR &&
3412		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3413			FREE_LOCK(&lk);
3414			panic("%s: indirect pointer #%lld mismatch %d != %lld",
3415			    "softdep_write_inodeblock",
3416			    (intmax_t)adp->ad_lbn - NDADDR,
3417			    dp->di_ib[adp->ad_lbn - NDADDR],
3418			    (intmax_t)adp->ad_newblkno);
3419		}
3420		deplist |= 1 << adp->ad_lbn;
3421		if ((adp->ad_state & ATTACHED) == 0) {
3422			FREE_LOCK(&lk);
3423			panic("softdep_write_inodeblock: Unknown state 0x%x",
3424			    adp->ad_state);
3425		}
3426#endif /* DIAGNOSTIC */
3427		adp->ad_state &= ~ATTACHED;
3428		adp->ad_state |= UNDONE;
3429	}
3430	/*
3431	 * The on-disk inode cannot claim to be any larger than the last
3432	 * fragment that has been written. Otherwise, the on-disk inode
3433	 * might have fragments that were not the last block in the file,
3434	 * which would corrupt the filesystem.
3435	 */
3436	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3437	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3438		if (adp->ad_lbn >= NDADDR)
3439			break;
3440		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3441		/* keep going until hitting a rollback to a frag */
3442		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3443			continue;
3444		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3445		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3446#ifdef DIAGNOSTIC
3447			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3448				FREE_LOCK(&lk);
3449				panic("softdep_write_inodeblock: lost dep1");
3450			}
3451#endif /* DIAGNOSTIC */
3452			dp->di_db[i] = 0;
3453		}
3454		for (i = 0; i < NIADDR; i++) {
3455#ifdef DIAGNOSTIC
3456			if (dp->di_ib[i] != 0 &&
3457			    (deplist & ((1 << NDADDR) << i)) == 0) {
3458				FREE_LOCK(&lk);
3459				panic("softdep_write_inodeblock: lost dep2");
3460			}
3461#endif /* DIAGNOSTIC */
3462			dp->di_ib[i] = 0;
3463		}
3464		FREE_LOCK(&lk);
3465		return;
3466	}
3467	/*
3468	 * If we have zero'ed out the last allocated block of the file,
3469	 * roll back the size to the last currently allocated block.
3470	 * We know that this last allocated block is full-sized, as
3471	 * we already checked for fragments in the loop above.
3472	 */
3473	if (lastadp != NULL &&
3474	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3475		for (i = lastadp->ad_lbn; i >= 0; i--)
3476			if (dp->di_db[i] != 0)
3477				break;
3478		dp->di_size = (i + 1) * fs->fs_bsize;
3479	}
3480	/*
3481	 * The only dependencies are for indirect blocks.
3482	 *
3483	 * The file size for indirect block additions is not guaranteed.
3484	 * Such a guarantee would be non-trivial to achieve. The conventional
3485	 * synchronous write implementation also does not make this guarantee.
3486	 * Fsck should catch and fix discrepancies. Arguably, the file size
3487	 * can be over-estimated without destroying integrity when the file
3488	 * moves into the indirect blocks (i.e., is large). If we want to
3489	 * postpone fsck, we are stuck with this argument.
3490	 */
3491	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3492		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3493	FREE_LOCK(&lk);
3494}
3495
3496/*
3497 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
3498 * Note that any bug fixes made to this routine must be done in the
3499 * version found above.
3500 *
3501 * Called from within the procedure above to deal with unsatisfied
3502 * allocation dependencies in an inodeblock. The buffer must be
3503 * locked, thus, no I/O completion operations can occur while we
3504 * are manipulating its associated dependencies.
3505 */
3506static void
3507initiate_write_inodeblock_ufs2(inodedep, bp)
3508	struct inodedep *inodedep;
3509	struct buf *bp;			/* The inode block */
3510{
3511	struct allocdirect *adp, *lastadp;
3512	struct ufs2_dinode *dp;
3513	struct fs *fs;
3514	ufs_lbn_t i, prevlbn = 0;
3515	int deplist;
3516
3517	if (inodedep->id_state & IOSTARTED)
3518		panic("initiate_write_inodeblock_ufs2: already started");
3519	inodedep->id_state |= IOSTARTED;
3520	fs = inodedep->id_fs;
3521	dp = (struct ufs2_dinode *)bp->b_data +
3522	    ino_to_fsbo(fs, inodedep->id_ino);
3523	/*
3524	 * If the bitmap is not yet written, then the allocated
3525	 * inode cannot be written to disk.
3526	 */
3527	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3528		if (inodedep->id_savedino2 != NULL)
3529			panic("initiate_write_inodeblock_ufs2: I/O underway");
3530		MALLOC(inodedep->id_savedino2, struct ufs2_dinode *,
3531		    sizeof(struct ufs2_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3532		*inodedep->id_savedino2 = *dp;
3533		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
3534		return;
3535	}
3536	/*
3537	 * If no dependencies, then there is nothing to roll back.
3538	 */
3539	inodedep->id_savedsize = dp->di_size;
3540	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3541		return;
3542	/*
3543	 * Set the dependencies to busy.
3544	 */
3545	ACQUIRE_LOCK(&lk);
3546	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3547	     adp = TAILQ_NEXT(adp, ad_next)) {
3548#ifdef DIAGNOSTIC
3549		if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3550			FREE_LOCK(&lk);
3551			panic("softdep_write_inodeblock: lbn order");
3552		}
3553		prevlbn = adp->ad_lbn;
3554		if (adp->ad_lbn < NDADDR &&
3555		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3556			FREE_LOCK(&lk);
3557			panic("%s: direct pointer #%lld mismatch %lld != %lld",
3558			    "softdep_write_inodeblock",
3559			    (intmax_t)adp->ad_lbn,
3560			    (intmax_t)dp->di_db[adp->ad_lbn],
3561			    (intmax_t)adp->ad_newblkno);
3562		}
3563		if (adp->ad_lbn >= NDADDR &&
3564		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3565			FREE_LOCK(&lk);
3566			panic("%s: indirect pointer #%lld mismatch %lld != %lld",
3567			    "softdep_write_inodeblock",
3568			    (intmax_t)adp->ad_lbn - NDADDR,
3569			    (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
3570			    (intmax_t)adp->ad_newblkno);
3571		}
3572		deplist |= 1 << adp->ad_lbn;
3573		if ((adp->ad_state & ATTACHED) == 0) {
3574			FREE_LOCK(&lk);
3575			panic("softdep_write_inodeblock: Unknown state 0x%x",
3576			    adp->ad_state);
3577		}
3578#endif /* DIAGNOSTIC */
3579		adp->ad_state &= ~ATTACHED;
3580		adp->ad_state |= UNDONE;
3581	}
3582	/*
3583	 * The on-disk inode cannot claim to be any larger than the last
3584	 * fragment that has been written. Otherwise, the on-disk inode
3585	 * might have fragments that were not the last block in the file,
3586	 * which would corrupt the filesystem.
3587	 */
3588	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3589	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3590		if (adp->ad_lbn >= NDADDR)
3591			break;
3592		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3593		/* keep going until hitting a rollback to a frag */
3594		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3595			continue;
3596		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3597		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3598#ifdef DIAGNOSTIC
3599			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3600				FREE_LOCK(&lk);
3601				panic("softdep_write_inodeblock: lost dep1");
3602			}
3603#endif /* DIAGNOSTIC */
3604			dp->di_db[i] = 0;
3605		}
3606		for (i = 0; i < NIADDR; i++) {
3607#ifdef DIAGNOSTIC
3608			if (dp->di_ib[i] != 0 &&
3609			    (deplist & ((1 << NDADDR) << i)) == 0) {
3610				FREE_LOCK(&lk);
3611				panic("softdep_write_inodeblock: lost dep2");
3612			}
3613#endif /* DIAGNOSTIC */
3614			dp->di_ib[i] = 0;
3615		}
3616		FREE_LOCK(&lk);
3617		return;
3618	}
3619	/*
3620	 * If we have zero'ed out the last allocated block of the file,
3621	 * roll back the size to the last currently allocated block.
3622	 * We know that this last allocated block is full-sized, as
3623	 * we already checked for fragments in the loop above.
3624	 */
3625	if (lastadp != NULL &&
3626	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3627		for (i = lastadp->ad_lbn; i >= 0; i--)
3628			if (dp->di_db[i] != 0)
3629				break;
3630		dp->di_size = (i + 1) * fs->fs_bsize;
3631	}
3632	/*
3633	 * The only dependencies are for indirect blocks.
3634	 *
3635	 * The file size for indirect block additions is not guaranteed.
3636	 * Such a guarantee would be non-trivial to achieve. The conventional
3637	 * synchronous write implementation also does not make this guarantee.
3638	 * Fsck should catch and fix discrepancies. Arguably, the file size
3639	 * can be over-estimated without destroying integrity when the file
3640	 * moves into the indirect blocks (i.e., is large). If we want to
3641	 * postpone fsck, we are stuck with this argument.
3642	 */
3643	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3644		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3645	FREE_LOCK(&lk);
3646}
3647
3648/*
3649 * This routine is called during the completion interrupt
3650 * service routine for a disk write (from the procedure called
3651 * by the device driver to inform the filesystem caches of
3652 * a request completion).  It should be called early in this
3653 * procedure, before the block is made available to other
3654 * processes or other routines are called.
3655 */
3656static void
3657softdep_disk_write_complete(bp)
3658	struct buf *bp;		/* describes the completed disk write */
3659{
3660	struct worklist *wk;
3661	struct workhead reattach;
3662	struct newblk *newblk;
3663	struct allocindir *aip;
3664	struct allocdirect *adp;
3665	struct indirdep *indirdep;
3666	struct inodedep *inodedep;
3667	struct bmsafemap *bmsafemap;
3668
3669#ifdef DEBUG
3670	if (lk.lkt_held != NOHOLDER)
3671		panic("softdep_disk_write_complete: lock is held");
3672	lk.lkt_held = SPECIAL_FLAG;
3673#endif
3674	LIST_INIT(&reattach);
3675	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3676		WORKLIST_REMOVE(wk);
3677		switch (wk->wk_type) {
3678
3679		case D_PAGEDEP:
3680			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3681				WORKLIST_INSERT(&reattach, wk);
3682			continue;
3683
3684		case D_INODEDEP:
3685			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3686				WORKLIST_INSERT(&reattach, wk);
3687			continue;
3688
3689		case D_BMSAFEMAP:
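			/*
			 * The cylinder group map has been written, so the
			 * inodes and blocks allocated from it no longer
			 * depend on the bitmap; mark each dependent
			 * structure DEPCOMPLETE and detach it from the map.
			 */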
3690			bmsafemap = WK_BMSAFEMAP(wk);
3691			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3692				newblk->nb_state |= DEPCOMPLETE;
3693				newblk->nb_bmsafemap = NULL;
3694				LIST_REMOVE(newblk, nb_deps);
3695			}
3696			while ((adp =
3697			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3698				adp->ad_state |= DEPCOMPLETE;
3699				adp->ad_buf = NULL;
3700				LIST_REMOVE(adp, ad_deps);
3701				handle_allocdirect_partdone(adp);
3702			}
3703			while ((aip =
3704			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3705				aip->ai_state |= DEPCOMPLETE;
3706				aip->ai_buf = NULL;
3707				LIST_REMOVE(aip, ai_deps);
3708				handle_allocindir_partdone(aip);
3709			}
3710			while ((inodedep =
3711			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3712				inodedep->id_state |= DEPCOMPLETE;
3713				LIST_REMOVE(inodedep, id_deps);
3714				inodedep->id_buf = NULL;
3715			}
3716			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3717			continue;
3718
3719		case D_MKDIR:
3720			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3721			continue;
3722
3723		case D_ALLOCDIRECT:
3724			adp = WK_ALLOCDIRECT(wk);
3725			adp->ad_state |= COMPLETE;
3726			handle_allocdirect_partdone(adp);
3727			continue;
3728
3729		case D_ALLOCINDIR:
3730			aip = WK_ALLOCINDIR(wk);
3731			aip->ai_state |= COMPLETE;
3732			handle_allocindir_partdone(aip);
3733			continue;
3734
3735		case D_INDIRDEP:
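			/*
			 * The safe copy of the indirect block has been
			 * written. Restore the saved up-to-date contents,
			 * enter any block pointers that completed during
			 * the write, and redirty the buffer so the real
			 * pointers reach the disk in a later write.
			 */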
3736			indirdep = WK_INDIRDEP(wk);
3737			if (indirdep->ir_state & GOINGAWAY) {
3738				lk.lkt_held = NOHOLDER;
3739				panic("disk_write_complete: indirdep gone");
3740			}
3741			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3742			FREE(indirdep->ir_saveddata, M_INDIRDEP);
3743			indirdep->ir_saveddata = 0;
3744			indirdep->ir_state &= ~UNDONE;
3745			indirdep->ir_state |= ATTACHED;
3746			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3747				handle_allocindir_partdone(aip);
3748				if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
3749					lk.lkt_held = NOHOLDER;
3750					panic("disk_write_complete: not gone");
3751				}
3752			}
3753			WORKLIST_INSERT(&reattach, wk);
3754			if ((bp->b_flags & B_DELWRI) == 0)
3755				stat_indir_blk_ptrs++;
3756			bdirty(bp);
3757			continue;
3758
3759		default:
3760			lk.lkt_held = NOHOLDER;
3761			panic("handle_disk_write_complete: Unknown type %s",
3762			    TYPENAME(wk->wk_type));
3763			/* NOTREACHED */
3764		}
3765	}
3766	/*
3767	 * Reattach any requests that must be redone.
3768	 */
3769	while ((wk = LIST_FIRST(&reattach)) != NULL) {
3770		WORKLIST_REMOVE(wk);
3771		WORKLIST_INSERT(&bp->b_dep, wk);
3772	}
3773#ifdef DEBUG
3774	if (lk.lkt_held != SPECIAL_FLAG)
3775		panic("softdep_disk_write_complete: lock lost");
3776	lk.lkt_held = NOHOLDER;
3777#endif
3778}
3779
3780/*
3781 * Called from within softdep_disk_write_complete above. Note that
3782 * this routine is always called from interrupt level with further
3783 * splbio interrupts blocked.
3784 */
3785static void
3786handle_allocdirect_partdone(adp)
3787	struct allocdirect *adp;	/* the completed allocdirect */
3788{
3789	struct allocdirect *listadp;
3790	struct inodedep *inodedep;
3791	long bsize, delay;
3792
3793	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3794		return;
3795	if (adp->ad_buf != NULL) {
3796		lk.lkt_held = NOHOLDER;
3797		panic("handle_allocdirect_partdone: dangling dep");
3798	}
3799	/*
3800	 * The on-disk inode cannot claim to be any larger than the last
3801	 * fragment that has been written. Otherwise, the on-disk inode
3802	 * might have fragments that were not the last block in the file,
3803	 * which would corrupt the filesystem. Thus, we cannot free any
3804	 * allocdirects after one whose ad_oldblkno claims a fragment, as
3805	 * these blocks must be rolled back to zero before writing the inode.
3806	 * We check the currently active set of allocdirects in id_inoupdt.
3807	 */
3808	inodedep = adp->ad_inodedep;
3809	bsize = inodedep->id_fs->fs_bsize;
3810	TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3811		/* found our block */
3812		if (listadp == adp)
3813			break;
3814		/* continue if the old allocation is not a fragment */
3815		if (listadp->ad_oldsize == 0 ||
3816		    listadp->ad_oldsize == bsize)
3817			continue;
3818		/* hit a fragment */
3819		return;
3820	}
3821	/*
3822	 * If we have reached the end of the current list without
3823	 * finding the just finished dependency, then it must be
3824	 * on the future dependency list. Future dependencies cannot
3825	 * be freed until they are moved to the current list.
3826	 */
3827	if (listadp == NULL) {
3828#ifdef DEBUG
3829		TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3830			/* found our block */
3831			if (listadp == adp)
3832				break;
3833		if (listadp == NULL) {
3834			lk.lkt_held = NOHOLDER;
3835			panic("handle_allocdirect_partdone: lost dep");
3836		}
3837#endif /* DEBUG */
3838		return;
3839	}
3840	/*
3841	 * If we have found the just finished dependency, then free
3842	 * it along with anything that follows it that is complete.
3843	 * If the inode still has a bitmap dependency, then it has
3844	 * never been written to disk, hence the on-disk inode cannot
3845	 * reference the old fragment so we can free it without delay.
3846	 */
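	/* delay is nonzero only if the inode's bitmap has been written. */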
3847	delay = (inodedep->id_state & DEPCOMPLETE);
3848	for (; adp; adp = listadp) {
3849		listadp = TAILQ_NEXT(adp, ad_next);
3850		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3851			return;
3852		free_allocdirect(&inodedep->id_inoupdt, adp, delay);
3853	}
3854}
3855
3856/*
3857 * Called from within softdep_disk_write_complete above. Note that
3858 * this routine is always called from interrupt level with further
3859 * splbio interrupts blocked.
3860 */
3861static void
3862handle_allocindir_partdone(aip)
3863	struct allocindir *aip;		/* the completed allocindir */
3864{
3865	struct indirdep *indirdep;
3866
3867	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3868		return;
3869	if (aip->ai_buf != NULL) {
3870		lk.lkt_held = NOHOLDER;
3871		panic("handle_allocindir_partdone: dangling dependency");
3872	}
3873	indirdep = aip->ai_indirdep;
3874	if (indirdep->ir_state & UNDONE) {
3875		LIST_REMOVE(aip, ai_next);
3876		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3877		return;
3878	}
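	/*
	 * The indirect block is not being written at the moment, so the
	 * new block pointer can be entered directly into the saved safe
	 * copy, using the pointer size for the filesystem format in use.
	 */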
3879	if (indirdep->ir_state & UFS1FMT)
3880		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3881		    aip->ai_newblkno;
3882	else
3883		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3884		    aip->ai_newblkno;
3885	LIST_REMOVE(aip, ai_next);
3886	if (aip->ai_freefrag != NULL)
3887		add_to_worklist(&aip->ai_freefrag->ff_list);
3888	WORKITEM_FREE(aip, D_ALLOCINDIR);
3889}
3890
3891/*
3892 * Called from within softdep_disk_write_complete above to restore
3893 * in-memory inode block contents to their most up-to-date state. Note
3894 * that this routine is always called from interrupt level with further
3895 * splbio interrupts blocked.
3896 */
3897static int
3898handle_written_inodeblock(inodedep, bp)
3899	struct inodedep *inodedep;
3900	struct buf *bp;		/* buffer containing the inode block */
3901{
3902	struct worklist *wk, *filefree;
3903	struct allocdirect *adp, *nextadp;
3904	struct ufs1_dinode *dp1 = NULL;
3905	struct ufs2_dinode *dp2 = NULL;
3906	int hadchanges, fstype;
3907
3908	if ((inodedep->id_state & IOSTARTED) == 0) {
3909		lk.lkt_held = NOHOLDER;
3910		panic("handle_written_inodeblock: not started");
3911	}
3912	inodedep->id_state &= ~IOSTARTED;
3913	inodedep->id_state |= COMPLETE;
3914	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
3915		fstype = UFS1;
3916		dp1 = (struct ufs1_dinode *)bp->b_data +
3917		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3918	} else {
3919		fstype = UFS2;
3920		dp2 = (struct ufs2_dinode *)bp->b_data +
3921		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3922	}
3923	/*
3924	 * If we had to rollback the inode allocation because of
3925	 * bitmaps being incomplete, then simply restore it.
3926	 * Keep the block dirty so that it will not be reclaimed until
3927	 * all associated dependencies have been cleared and the
3928	 * corresponding updates written to disk.
3929	 */
3930	if (inodedep->id_savedino1 != NULL) {
3931		if (fstype == UFS1)
3932			*dp1 = *inodedep->id_savedino1;
3933		else
3934			*dp2 = *inodedep->id_savedino2;
3935		FREE(inodedep->id_savedino1, M_INODEDEP);
3936		inodedep->id_savedino1 = NULL;
3937		if ((bp->b_flags & B_DELWRI) == 0)
3938			stat_inode_bitmap++;
3939		bdirty(bp);
3940		return (1);
3941	}
3942	/*
3943	 * Roll forward anything that had to be rolled back before
3944	 * the inode could be updated.
3945	 */
3946	hadchanges = 0;
3947	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3948		nextadp = TAILQ_NEXT(adp, ad_next);
3949		if (adp->ad_state & ATTACHED) {
3950			lk.lkt_held = NOHOLDER;
3951			panic("handle_written_inodeblock: new entry");
3952		}
3953		if (fstype == UFS1) {
3954			if (adp->ad_lbn < NDADDR) {
3955				if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) {
3956					lk.lkt_held = NOHOLDER;
3957					panic("%s %s #%lld mismatch %d != %lld",
3958					    "handle_written_inodeblock:",
3959					    "direct pointer",
3960					    (intmax_t)adp->ad_lbn,
3961					    dp1->di_db[adp->ad_lbn],
3962					    (intmax_t)adp->ad_oldblkno);
3963				}
3964				dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
3965			} else {
3966				if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) {
3967					lk.lkt_held = NOHOLDER;
3968					panic("%s: %s #%lld allocated as %d",
3969					    "handle_written_inodeblock",
3970					    "indirect pointer",
3971					    (intmax_t)adp->ad_lbn - NDADDR,
3972					    dp1->di_ib[adp->ad_lbn - NDADDR]);
3973				}
3974				dp1->di_ib[adp->ad_lbn - NDADDR] =
3975				    adp->ad_newblkno;
3976			}
3977		} else {
3978			if (adp->ad_lbn < NDADDR) {
3979				if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) {
3980					lk.lkt_held = NOHOLDER;
3981					panic("%s: %s #%lld %s %lld != %lld",
3982					    "handle_written_inodeblock",
3983					    "direct pointer",
3984					    (intmax_t)adp->ad_lbn, "mismatch",
3985					    (intmax_t)dp2->di_db[adp->ad_lbn],
3986					    (intmax_t)adp->ad_oldblkno);
3987				}
3988				dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
3989			} else {
3990				if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) {
3991					lk.lkt_held = NOHOLDER;
3992					panic("%s: %s #%lld allocated as %lld",
3993					    "handle_written_inodeblock",
3994					    "indirect pointer",
3995					    (intmax_t)adp->ad_lbn - NDADDR,
3996					    (intmax_t)
3997					    dp2->di_ib[adp->ad_lbn - NDADDR]);
3998				}
3999				dp2->di_ib[adp->ad_lbn - NDADDR] =
4000				    adp->ad_newblkno;
4001			}
4002		}
4003		adp->ad_state &= ~UNDONE;
4004		adp->ad_state |= ATTACHED;
4005		hadchanges = 1;
4006	}
4007	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4008		stat_direct_blk_ptrs++;
4009	/*
4010	 * Reset the file size to its most up-to-date value.
4011	 */
4012	if (inodedep->id_savedsize == -1) {
4013		lk.lkt_held = NOHOLDER;
4014		panic("handle_written_inodeblock: bad size");
4015	}
4016	if (fstype == UFS1) {
4017		if (dp1->di_size != inodedep->id_savedsize) {
4018			dp1->di_size = inodedep->id_savedsize;
4019			hadchanges = 1;
4020		}
4021	} else {
4022		if (dp2->di_size != inodedep->id_savedsize) {
4023			dp2->di_size = inodedep->id_savedsize;
4024			hadchanges = 1;
4025		}
4026	}
4027	inodedep->id_savedsize = -1;
4028	/*
4029	 * If there were any rollbacks in the inode block, then it must be
4030	 * marked dirty so that it will eventually get written back in
4031	 * its correct form.
4032	 */
4033	if (hadchanges)
4034		bdirty(bp);
4035	/*
4036	 * Process any allocdirects that completed during the update.
4037	 */
4038	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4039		handle_allocdirect_partdone(adp);
4040	/*
4041	 * Process deallocations that were held pending until the
4042	 * inode had been written to disk. Freeing of the inode
4043	 * is delayed until after all blocks have been freed to
4044	 * avoid creation of new <vfsid, inum, lbn> triples
4045	 * before the old ones have been deleted.
4046	 */
4047	filefree = NULL;
4048	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4049		WORKLIST_REMOVE(wk);
4050		switch (wk->wk_type) {
4051
4052		case D_FREEFILE:
4053			/*
4054			 * We defer adding filefree to the worklist until
4055			 * all other additions have been made to ensure
4056			 * that it will be done after all the old blocks
4057			 * have been freed.
4058			 */
4059			if (filefree != NULL) {
4060				lk.lkt_held = NOHOLDER;
4061				panic("handle_written_inodeblock: filefree");
4062			}
4063			filefree = wk;
4064			continue;
4065
4066		case D_MKDIR:
4067			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4068			continue;
4069
4070		case D_DIRADD:
4071			diradd_inode_written(WK_DIRADD(wk), inodedep);
4072			continue;
4073
4074		case D_FREEBLKS:
4075		case D_FREEFRAG:
4076		case D_DIRREM:
4077			add_to_worklist(wk);
4078			continue;
4079
4080		case D_NEWDIRBLK:
4081			free_newdirblk(WK_NEWDIRBLK(wk));
4082			continue;
4083
4084		default:
4085			lk.lkt_held = NOHOLDER;
4086			panic("handle_written_inodeblock: Unknown type %s",
4087			    TYPENAME(wk->wk_type));
4088			/* NOTREACHED */
4089		}
4090	}
4091	if (filefree != NULL) {
4092		if (free_inodedep(inodedep) == 0) {
4093			lk.lkt_held = NOHOLDER;
4094			panic("handle_written_inodeblock: live inodedep");
4095		}
4096		add_to_worklist(filefree);
4097		return (0);
4098	}
4099
4100	/*
4101	 * If no outstanding dependencies, free it.
4102	 */
4103	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
4104		return (0);
4105	return (hadchanges);
4106}
4107
4108/*
4109 * Process a diradd entry after its dependent inode has been written.
4110 * This routine must be called with splbio interrupts blocked.
4111 */
4112static void
4113diradd_inode_written(dap, inodedep)
4114	struct diradd *dap;
4115	struct inodedep *inodedep;
4116{
4117	struct pagedep *pagedep;
4118
4119	dap->da_state |= COMPLETE;
4120	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4121		if (dap->da_state & DIRCHG)
4122			pagedep = dap->da_previous->dm_pagedep;
4123		else
4124			pagedep = dap->da_pagedep;
4125		LIST_REMOVE(dap, da_pdlist);
4126		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4127	}
4128	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4129}
4130
4131/*
4132 * Handle the completion of a mkdir dependency.
4133 */
4134static void
4135handle_written_mkdir(mkdir, type)
4136	struct mkdir *mkdir;
4137	int type;
4138{
4139	struct diradd *dap;
4140	struct pagedep *pagedep;
4141
4142	if (mkdir->md_state != type) {
4143		lk.lkt_held = NOHOLDER;
4144		panic("handle_written_mkdir: bad type");
4145	}
4146	dap = mkdir->md_diradd;
4147	dap->da_state &= ~type;
4148	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4149		dap->da_state |= DEPCOMPLETE;
4150	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4151		if (dap->da_state & DIRCHG)
4152			pagedep = dap->da_previous->dm_pagedep;
4153		else
4154			pagedep = dap->da_pagedep;
4155		LIST_REMOVE(dap, da_pdlist);
4156		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4157	}
4158	LIST_REMOVE(mkdir, md_mkdirs);
4159	WORKITEM_FREE(mkdir, D_MKDIR);
4160}
4161
4162/*
4163 * Called from within softdep_disk_write_complete above.
4164 * A write operation was just completed. Removed inodes can
4165 * now be freed and associated block pointers may be committed.
4166 * Note that this routine is always called from interrupt level
4167 * with further splbio interrupts blocked.
4168 */
4169static int
4170handle_written_filepage(pagedep, bp)
4171	struct pagedep *pagedep;
4172	struct buf *bp;		/* buffer containing the written page */
4173{
4174	struct dirrem *dirrem;
4175	struct diradd *dap, *nextdap;
4176	struct direct *ep;
4177	int i, chgs;
4178
4179	if ((pagedep->pd_state & IOSTARTED) == 0) {
4180		lk.lkt_held = NOHOLDER;
4181		panic("handle_written_filepage: not started");
4182	}
4183	pagedep->pd_state &= ~IOSTARTED;
4184	/*
4185	 * Process any directory removals that have been committed.
4186	 */
4187	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4188		LIST_REMOVE(dirrem, dm_next);
4189		dirrem->dm_dirinum = pagedep->pd_ino;
4190		add_to_worklist(&dirrem->dm_list);
4191	}
4192	/*
4193	 * Free any directory additions that have been committed.
4194	 * If it is a newly allocated block, we have to wait until
4195	 * the on-disk directory inode claims the new block.
4196	 */
4197	if ((pagedep->pd_state & NEWBLOCK) == 0)
4198		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4199			free_diradd(dap);
4200	/*
4201	 * Uncommitted directory entries must be restored.
4202	 */
4203	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4204		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4205		     dap = nextdap) {
4206			nextdap = LIST_NEXT(dap, da_pdlist);
4207			if (dap->da_state & ATTACHED) {
4208				lk.lkt_held = NOHOLDER;
4209				panic("handle_written_filepage: attached");
4210			}
4211			ep = (struct direct *)
4212			    ((char *)bp->b_data + dap->da_offset);
4213			ep->d_ino = dap->da_newinum;
4214			dap->da_state &= ~UNDONE;
4215			dap->da_state |= ATTACHED;
4216			chgs = 1;
4217			/*
4218			 * If the inode referenced by the directory has
4219			 * been written out, then the dependency can be
4220			 * moved to the pending list.
4221			 */
4222			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4223				LIST_REMOVE(dap, da_pdlist);
4224				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4225				    da_pdlist);
4226			}
4227		}
4228	}
4229	/*
4230	 * If there were any rollbacks in the directory, then it must be
4231	 * marked dirty so that it will eventually get written back in
4232	 * its correct form.
4233	 */
4234	if (chgs) {
4235		if ((bp->b_flags & B_DELWRI) == 0)
4236			stat_dir_entry++;
4237		bdirty(bp);
4238		return (1);
4239	}
4240	/*
4241	 * If we are not waiting for a new directory block to be
4242	 * claimed by its inode, then the pagedep will be freed.
4243	 * Otherwise it will remain to track any new entries on
4244	 * the page in case they are fsync'ed.
4245	 */
4246	if ((pagedep->pd_state & NEWBLOCK) == 0) {
4247		LIST_REMOVE(pagedep, pd_hash);
4248		WORKITEM_FREE(pagedep, D_PAGEDEP);
4249	}
4250	return (0);
4251}
4252
4253/*
4254 * Writing back in-core inode structures.
4255 *
4256 * The filesystem only accesses an inode's contents when it occupies an
4257 * "in-core" inode structure.  These "in-core" structures are separate from
4258 * the page frames used to cache inode blocks.  Only the latter are
4259 * transferred to/from the disk.  So, when the updated contents of the
4260 * "in-core" inode structure are copied to the corresponding in-memory inode
4261 * block, the dependencies are also transferred.  The following procedure is
4262 * called when copying a dirty "in-core" inode to a cached inode block.
4263 */
4264
4265/*
4266 * Called when an inode is loaded from disk. If the effective link count
4267 * differed from the actual link count when it was last flushed, then we
4268 * need to ensure that the correct effective link count is put back.
4269 */
4270void
4271softdep_load_inodeblock(ip)
4272	struct inode *ip;	/* the "in_core" copy of the inode */
4273{
4274	struct inodedep *inodedep;
4275
4276	/*
4277	 * Check for alternate nlink count.
4278	 */
4279	ip->i_effnlink = ip->i_nlink;
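	/*
	 * Start from the on-disk link count; the difference recorded in
	 * the inodedep (id_nlinkdelta) is subtracted below to recover
	 * the effective link count.
	 */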
4280	ACQUIRE_LOCK(&lk);
4281	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4282		FREE_LOCK(&lk);
4283		return;
4284	}
4285	ip->i_effnlink -= inodedep->id_nlinkdelta;
4286	if (inodedep->id_state & SPACECOUNTED)
4287		ip->i_flag |= IN_SPACECOUNTED;
4288	FREE_LOCK(&lk);
4289}
4290
4291/*
4292 * This routine is called just before the "in-core" inode
4293 * information is to be copied to the in-memory inode block.
4294 * Recall that an inode block contains several inodes. If
4295 * the force flag is set, then the dependencies will be
4296 * cleared so that the update can always be made. Note that
4297 * the buffer is locked when this routine is called, so we
4298 * will never be in the middle of writing the inode block
4299 * to disk.
4300 */
4301void
4302softdep_update_inodeblock(ip, bp, waitfor)
4303	struct inode *ip;	/* the "in_core" copy of the inode */
4304	struct buf *bp;		/* the buffer containing the inode block */
4305	int waitfor;		/* nonzero => update must be allowed */
4306{
4307	struct inodedep *inodedep;
4308	struct worklist *wk;
4309	int error, gotit;
4310
4311	/*
4312	 * If the effective link count is not equal to the actual link
4313	 * count, then we must track the difference in an inodedep while
4314	 * the inode is (potentially) tossed out of the cache. Otherwise,
4315	 * if there is no existing inodedep, then there are no dependencies
4316	 * to track.
4317	 */
4318	ACQUIRE_LOCK(&lk);
4319	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4320		FREE_LOCK(&lk);
4321		if (ip->i_effnlink != ip->i_nlink)
4322			panic("softdep_update_inodeblock: bad link count");
4323		return;
4324	}
4325	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
4326		FREE_LOCK(&lk);
4327		panic("softdep_update_inodeblock: bad delta");
4328	}
4329	/*
4330	 * Changes have been initiated. Anything depending on these
4331	 * changes cannot occur until this inode has been written.
4332	 */
4333	inodedep->id_state &= ~COMPLETE;
4334	if ((inodedep->id_state & ONWORKLIST) == 0)
4335		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4336	/*
4337	 * Any new dependencies associated with the incore inode must
4338	 * now be moved to the list associated with the buffer holding
4339	 * the in-memory copy of the inode. Once merged process any
4340	 * allocdirects that are completed by the merger.
4341	 */
4342	merge_inode_lists(inodedep);
4343	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
4344		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4345	/*
4346	 * Now that the inode has been pushed into the buffer, the
4347	 * operations dependent on the inode being written to disk
4348	 * can be moved to the id_bufwait so that they will be
4349	 * processed when the buffer I/O completes.
4350	 */
4351	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4352		WORKLIST_REMOVE(wk);
4353		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4354	}
4355	/*
4356	 * Newly allocated inodes cannot be written until the bitmap
4357	 * that allocates them has been written (indicated by
4358	 * DEPCOMPLETE being set in id_state). If we are doing a
4359	 * forced sync (e.g., an fsync on a file), we force the bitmap
4360	 * to be written so that the update can be done.
4361	 */
4362	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
4363		FREE_LOCK(&lk);
4364		return;
4365	}
4366	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
4367	FREE_LOCK(&lk);
4368	if (gotit &&
4369	    (error = BUF_WRITE(inodedep->id_buf)) != 0)
4370		softdep_error("softdep_update_inodeblock: bwrite", error);
4371	if ((inodedep->id_state & DEPCOMPLETE) == 0)
4372		panic("softdep_update_inodeblock: update failed");
4373}
4374
4375/*
4376 * Merge the new inode dependency list (id_newinoupdt) into the old
4377 * inode dependency list (id_inoupdt). This routine must be called
4378 * with splbio interrupts blocked.
4379 */
4380static void
4381merge_inode_lists(inodedep)
4382	struct inodedep *inodedep;
4383{
4384	struct allocdirect *listadp, *newadp;
4385
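	/*
	 * Both lists are kept sorted by logical block number. Each new
	 * entry is inserted before the first old entry with an equal or
	 * greater lbn; when the lbns match, the two dependencies are
	 * combined by allocdirect_merge. Any new entries that remain are
	 * appended to the tail of the old list.
	 */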
4386	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4387	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
4388		if (listadp->ad_lbn < newadp->ad_lbn) {
4389			listadp = TAILQ_NEXT(listadp, ad_next);
4390			continue;
4391		}
4392		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4393		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4394		if (listadp->ad_lbn == newadp->ad_lbn) {
4395			allocdirect_merge(&inodedep->id_inoupdt, newadp,
4396			    listadp);
4397			listadp = newadp;
4398		}
4399		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4400	}
4401	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
4402		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4403		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
4404	}
4405}
4406
4407/*
4408 * If we are doing an fsync, then we must ensure that any directory
4409 * entries for the inode have been written after the inode gets to disk.
4410 */
4411int
4412softdep_fsync(vp)
4413	struct vnode *vp;	/* the "in_core" copy of the inode */
4414{
4415	struct inodedep *inodedep;
4416	struct pagedep *pagedep;
4417	struct worklist *wk;
4418	struct diradd *dap;
4419	struct mount *mnt;
4420	struct vnode *pvp;
4421	struct inode *ip;
4422	struct buf *bp;
4423	struct fs *fs;
4424	struct thread *td = curthread;
4425	int error, flushparent;
4426	ino_t parentino;
4427	ufs_lbn_t lbn;
4428
4429	ip = VTOI(vp);
4430	fs = ip->i_fs;
4431	ACQUIRE_LOCK(&lk);
4432	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
4433		FREE_LOCK(&lk);
4434		return (0);
4435	}
4436	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
4437	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
4438	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
4439	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
4440		FREE_LOCK(&lk);
4441		panic("softdep_fsync: pending ops");
4442	}
4443	for (error = 0, flushparent = 0; ; ) {
4444		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
4445			break;
4446		if (wk->wk_type != D_DIRADD) {
4447			FREE_LOCK(&lk);
4448			panic("softdep_fsync: Unexpected type %s",
4449			    TYPENAME(wk->wk_type));
4450		}
4451		dap = WK_DIRADD(wk);
4452		/*
4453		 * Flush our parent if this directory entry has a MKDIR_PARENT
4454		 * dependency or is contained in a newly allocated block.
4455		 */
4456		if (dap->da_state & DIRCHG)
4457			pagedep = dap->da_previous->dm_pagedep;
4458		else
4459			pagedep = dap->da_pagedep;
4460		mnt = pagedep->pd_mnt;
4461		parentino = pagedep->pd_ino;
4462		lbn = pagedep->pd_lbn;
4463		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4464			FREE_LOCK(&lk);
4465			panic("softdep_fsync: dirty");
4466		}
4467		if ((dap->da_state & MKDIR_PARENT) ||
4468		    (pagedep->pd_state & NEWBLOCK))
4469			flushparent = 1;
4470		else
4471			flushparent = 0;
4472		/*
4473		 * If we are being fsync'ed as part of vgone'ing this vnode,
4474		 * then we will not be able to release and recover the
4475		 * vnode below, so we just have to give up on writing its
4476		 * directory entry out. It will eventually be written, just
4477		 * not now, but then the user was not asking to have it
4478		 * written, so we are not breaking any promises.
4479		 */
4480		if (vp->v_flag & VXLOCK)
4481			break;
4482		/*
4483		 * We prevent deadlock by always fetching inodes from the
4484		 * root, moving down the directory tree. Thus, when fetching
4485		 * our parent directory, we first try to get the lock. If
4486		 * that fails, we must unlock ourselves before requesting
4487		 * the lock on our parent. See the comment in ufs_lookup
4488		 * for details on possible races.
4489		 */
4490		FREE_LOCK(&lk);
4491		if (VFS_VGET(mnt, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
4492			VOP_UNLOCK(vp, 0, td);
4493			error = VFS_VGET(mnt, parentino, LK_EXCLUSIVE, &pvp);
4494			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
4495			if (error != 0)
4496				return (error);
4497		}
4498		/*
4499		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
4500		 * that are contained in direct blocks will be resolved by
4501		 * doing a UFS_UPDATE. Pagedeps contained in indirect blocks
4502		 * may require a complete sync'ing of the directory. So, we
4503		 * try the cheap and fast UFS_UPDATE first, and if that fails,
4504		 * then we do the slower VOP_FSYNC of the directory.
4505		 */
4506		if (flushparent) {
4507			if ((error = UFS_UPDATE(pvp, 1)) != 0) {
4508				vput(pvp);
4509				return (error);
4510			}
4511			if ((pagedep->pd_state & NEWBLOCK) &&
4512			    (error = VOP_FSYNC(pvp, td->td_ucred, MNT_WAIT, td))) {
4513				vput(pvp);
4514				return (error);
4515			}
4516		}
4517		/*
4518		 * Flush directory page containing the inode's name.
4519		 */
4520		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
4521		    &bp);
4522		if (error == 0)
4523			error = BUF_WRITE(bp);
4524		else
4525			brelse(bp);
4526		vput(pvp);
4527		if (error != 0)
4528			return (error);
4529		ACQUIRE_LOCK(&lk);
4530		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4531			break;
4532	}
4533	FREE_LOCK(&lk);
4534	return (0);
4535}
4536
4537/*
4538 * Flush all the dirty bitmaps associated with the block device
4539 * before flushing the rest of the dirty blocks so as to reduce
4540 * the number of dependencies that will have to be rolled back.
4541 */
4542void
4543softdep_fsync_mountdev(vp)
4544	struct vnode *vp;
4545{
4546	struct buf *bp, *nbp;
4547	struct worklist *wk;
4548
4549	if (!vn_isdisk(vp, NULL))
4550		panic("softdep_fsync_mountdev: vnode not a disk");
4551	ACQUIRE_LOCK(&lk);
4552	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4553		nbp = TAILQ_NEXT(bp, b_vnbufs);
4554		/*
4555		 * If it is already scheduled, skip to the next buffer.
4556		 */
4557		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
4558			continue;
4559		if ((bp->b_flags & B_DELWRI) == 0) {
4560			FREE_LOCK(&lk);
4561			panic("softdep_fsync_mountdev: not dirty");
4562		}
4563		/*
4564		 * We are only interested in bitmaps with outstanding
4565		 * dependencies.
4566		 */
4567		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4568		    wk->wk_type != D_BMSAFEMAP ||
4569		    (bp->b_xflags & BX_BKGRDINPROG)) {
4570			BUF_UNLOCK(bp);
4571			continue;
4572		}
4573		bremfree(bp);
4574		FREE_LOCK(&lk);
4575		(void) bawrite(bp);
4576		ACQUIRE_LOCK(&lk);
4577		/*
4578		 * Since we may have slept during the I/O, we need
4579		 * to start from a known point.
4580		 */
4581		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4582	}
4583	drain_output(vp, 1);
4584	FREE_LOCK(&lk);
4585}
4586
4587/*
4588 * This routine is called when we are trying to synchronously flush a
4589 * file. This routine must eliminate any filesystem metadata dependencies
4590 * so that the syncing routine can succeed by pushing the dirty blocks
4591 * associated with the file. If any I/O errors occur, they are returned.
4592 */
4593int
4594softdep_sync_metadata(ap)
4595	struct vop_fsync_args /* {
4596		struct vnode *a_vp;
4597		struct ucred *a_cred;
4598		int a_waitfor;
4599		struct thread *a_td;
4600	} */ *ap;
4601{
4602	struct vnode *vp = ap->a_vp;
4603	struct pagedep *pagedep;
4604	struct allocdirect *adp;
4605	struct allocindir *aip;
4606	struct buf *bp, *nbp;
4607	struct worklist *wk;
4608	int i, error, waitfor;
4609
4610	/*
4611	 * Check whether this vnode is involved in a filesystem
4612	 * that is doing soft dependency processing.
4613	 */
4614	if (!vn_isdisk(vp, NULL)) {
4615		if (!DOINGSOFTDEP(vp))
4616			return (0);
4617	} else
4618		if (vp->v_rdev->si_mountpoint == NULL ||
4619		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4620			return (0);
4621	/*
4622	 * Ensure that any direct block dependencies have been cleared.
4623	 */
4624	ACQUIRE_LOCK(&lk);
4625	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4626		FREE_LOCK(&lk);
4627		return (error);
4628	}
4629	/*
4630	 * For most files, the only metadata dependencies are the
4631	 * cylinder group maps that allocate their inode or blocks.
4632	 * The block allocation dependencies can be found by traversing
4633	 * the dependency lists for any buffers that remain on their
4634	 * dirty buffer list. The inode allocation dependency will
4635	 * be resolved when the inode is updated with MNT_WAIT.
4636	 * This work is done in two passes. The first pass grabs most
4637	 * of the buffers and begins asynchronously writing them. The
4638	 * only way to wait for these asynchronous writes is to sleep
4639	 * on the filesystem vnode which may stay busy for a long time
4640	 * if the filesystem is active. So, instead, we make a second
4641	 * pass over the dependencies blocking on each write. In the
4642	 * usual case we will be blocking against a write that we
4643	 * initiated, so when it is done the dependency will have been
4644	 * resolved. Thus the second pass is expected to end quickly.
4645	 */
4646	waitfor = MNT_NOWAIT;
4647top:
4648	/*
4649	 * We must wait for any I/O in progress to finish so that
4650	 * all potential buffers on the dirty list will be visible.
4651	 */
4652	drain_output(vp, 1);
4653	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
4654		FREE_LOCK(&lk);
4655		return (0);
4656	}
4657	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4658	/* While syncing snapshots, we must allow recursive lookups */
4659	bp->b_lock.lk_flags |= LK_CANRECURSE;
4660loop:
4661	/*
4662	 * As we hold the buffer locked, none of its dependencies
4663	 * will disappear.
4664	 */
4665	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4666		switch (wk->wk_type) {
4667
4668		case D_ALLOCDIRECT:
4669			adp = WK_ALLOCDIRECT(wk);
4670			if (adp->ad_state & DEPCOMPLETE)
4671				continue;
4672			nbp = adp->ad_buf;
4673			if (getdirtybuf(&nbp, waitfor) == 0)
4674				continue;
4675			FREE_LOCK(&lk);
4676			if (waitfor == MNT_NOWAIT) {
4677				bawrite(nbp);
4678			} else if ((error = BUF_WRITE(nbp)) != 0) {
4679				break;
4680			}
4681			ACQUIRE_LOCK(&lk);
4682			continue;
4683
4684		case D_ALLOCINDIR:
4685			aip = WK_ALLOCINDIR(wk);
4686			if (aip->ai_state & DEPCOMPLETE)
4687				continue;
4688			nbp = aip->ai_buf;
4689			if (getdirtybuf(&nbp, waitfor) == 0)
4690				continue;
4691			FREE_LOCK(&lk);
4692			if (waitfor == MNT_NOWAIT) {
4693				bawrite(nbp);
4694			} else if ((error = BUF_WRITE(nbp)) != 0) {
4695				break;
4696			}
4697			ACQUIRE_LOCK(&lk);
4698			continue;
4699
4700		case D_INDIRDEP:
4701		restart:
4702
4703			LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4704				if (aip->ai_state & DEPCOMPLETE)
4705					continue;
4706				nbp = aip->ai_buf;
4707				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
4708					goto restart;
4709				FREE_LOCK(&lk);
4710				if ((error = BUF_WRITE(nbp)) != 0) {
4711					break;
4712				}
4713				ACQUIRE_LOCK(&lk);
4714				goto restart;
4715			}
			/* On error the lock was released above; stop now. */
			if (error != 0)
				break;
4716			continue;
4717
4718		case D_INODEDEP:
4719			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4720			    WK_INODEDEP(wk)->id_ino)) != 0) {
4721				FREE_LOCK(&lk);
4722				break;
4723			}
4724			continue;
4725
4726		case D_PAGEDEP:
4727			/*
4728			 * We are trying to sync a directory that may
4729			 * have dependencies on both its own metadata
4730			 * and/or dependencies on the inodes of any
4731			 * recently allocated files. We walk its diradd
4732			 * lists pushing out the associated inode.
4733			 */
4734			pagedep = WK_PAGEDEP(wk);
4735			for (i = 0; i < DAHASHSZ; i++) {
4736				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4737					continue;
4738				if ((error =
4739				    flush_pagedep_deps(vp, pagedep->pd_mnt,
4740						&pagedep->pd_diraddhd[i]))) {
4741					FREE_LOCK(&lk);
4742					break;
4743				}
4744			}
			/* On error the lock was released above; stop now. */
			if (error != 0)
				break;
4745			continue;
4746
4747		case D_MKDIR:
4748			/*
4749			 * This case should never happen if the vnode has
4750			 * been properly sync'ed. However, if this function
4751			 * is used at a place where the vnode has not yet
4752			 * been sync'ed, this dependency can show up. So,
4753			 * rather than panic, just flush it.
4754			 */
4755			nbp = WK_MKDIR(wk)->md_buf;
4756			if (getdirtybuf(&nbp, waitfor) == 0)
4757				continue;
4758			FREE_LOCK(&lk);
4759			if (waitfor == MNT_NOWAIT) {
4760				bawrite(nbp);
4761			} else if ((error = BUF_WRITE(nbp)) != 0) {
4762				break;
4763			}
4764			ACQUIRE_LOCK(&lk);
4765			continue;
4766
4767		case D_BMSAFEMAP:
4768			/*
4769			 * This case should never happen if the vnode has
4770			 * been properly sync'ed. However, if this function
4771			 * is used at a place where the vnode has not yet
4772			 * been sync'ed, this dependency can show up. So,
4773			 * rather than panic, just flush it.
4774			 */
4775			nbp = WK_BMSAFEMAP(wk)->sm_buf;
4776			if (getdirtybuf(&nbp, waitfor) == 0)
4777				continue;
4778			FREE_LOCK(&lk);
4779			if (waitfor == MNT_NOWAIT) {
4780				bawrite(nbp);
4781			} else if ((error = BUF_WRITE(nbp)) != 0) {
4782				break;
4783			}
4784			ACQUIRE_LOCK(&lk);
4785			continue;
4786
4787		default:
4788			FREE_LOCK(&lk);
4789			panic("softdep_sync_metadata: Unknown type %s",
4790			    TYPENAME(wk->wk_type));
4791			/* NOTREACHED */
4792		}
4793		/* We reach here only in error and unlocked */
4794		if (error == 0)
4795			panic("softdep_sync_metadata: zero error");
4796		bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4797		bawrite(bp);
4798		return (error);
4799	}
4800	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4801	nbp = TAILQ_NEXT(bp, b_vnbufs);
4802	FREE_LOCK(&lk);
4803	bp->b_lock.lk_flags &= ~LK_CANRECURSE;
4804	bawrite(bp);
4805	ACQUIRE_LOCK(&lk);
4806	if (nbp != NULL) {
4807		bp = nbp;
4808		goto loop;
4809	}
4810	/*
4811	 * The brief unlock is to allow any pent up dependency
4812	 * processing to be done. Then proceed with the second pass.
4813	 */
4814	if (waitfor == MNT_NOWAIT) {
4815		waitfor = MNT_WAIT;
4816		FREE_LOCK(&lk);
4817		ACQUIRE_LOCK(&lk);
4818		goto top;
4819	}
4820
4821	/*
4822	 * If we have managed to get rid of all the dirty buffers,
4823	 * then we are done. For certain directories and block
4824	 * devices, we may need to do further work.
4825	 *
4826	 * We must wait for any I/O in progress to finish so that
4827	 * all potential buffers on the dirty list will be visible.
4828	 */
4829	drain_output(vp, 1);
4830	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4831		FREE_LOCK(&lk);
4832		return (0);
4833	}
4834
4835	FREE_LOCK(&lk);
4836	/*
4837	 * If we are trying to sync a block device, some of its buffers may
4838	 * contain metadata that cannot be written until the contents of some
4839	 * partially written files have been written to disk. The only easy
4840	 * way to accomplish this is to sync the entire filesystem (luckily
4841	 * this happens rarely).
4842	 */
4843	if (vn_isdisk(vp, NULL) &&
4844	    vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
4845	    (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
4846	     ap->a_td)) != 0)
4847		return (error);
4848	return (0);
4849}
4850
4851/*
4852 * Flush the dependencies associated with an inodedep.
4853 * Called with splbio blocked.
4854 */
4855static int
4856flush_inodedep_deps(fs, ino)
4857	struct fs *fs;
4858	ino_t ino;
4859{
4860	struct inodedep *inodedep;
4861	struct allocdirect *adp;
4862	int error, waitfor;
4863	struct buf *bp;
4864
4865	/*
4866	 * This work is done in two passes. The first pass grabs most
4867	 * of the buffers and begins asynchronously writing them. The
4868	 * only way to wait for these asynchronous writes is to sleep
4869	 * on the filesystem vnode which may stay busy for a long time
4870	 * if the filesystem is active. So, instead, we make a second
4871	 * pass over the dependencies blocking on each write. In the
4872	 * usual case we will be blocking against a write that we
4873	 * initiated, so when it is done the dependency will have been
4874	 * resolved. Thus the second pass is expected to end quickly.
4875	 * We give a brief window at the top of the loop to allow
4876	 * any pending I/O to complete.
4877	 */
4878	for (waitfor = MNT_NOWAIT; ; ) {
4879		FREE_LOCK(&lk);
4880		ACQUIRE_LOCK(&lk);
4881		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4882			return (0);
4883		TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
4884			if (adp->ad_state & DEPCOMPLETE)
4885				continue;
4886			bp = adp->ad_buf;
4887			if (getdirtybuf(&bp, waitfor) == 0) {
4888				if (waitfor == MNT_NOWAIT)
4889					continue;
4890				break;
4891			}
4892			FREE_LOCK(&lk);
4893			if (waitfor == MNT_NOWAIT) {
4894				bawrite(bp);
4895			} else if ((error = BUF_WRITE(bp)) != 0) {
4896				ACQUIRE_LOCK(&lk);
4897				return (error);
4898			}
4899			ACQUIRE_LOCK(&lk);
4900			break;
4901		}
4902		if (adp != NULL)
4903			continue;
4904		TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
4905			if (adp->ad_state & DEPCOMPLETE)
4906				continue;
4907			bp = adp->ad_buf;
4908			if (getdirtybuf(&bp, waitfor) == 0) {
4909				if (waitfor == MNT_NOWAIT)
4910					continue;
4911				break;
4912			}
4913			FREE_LOCK(&lk);
4914			if (waitfor == MNT_NOWAIT) {
4915				bawrite(bp);
4916			} else if ((error = BUF_WRITE(bp)) != 0) {
4917				ACQUIRE_LOCK(&lk);
4918				return (error);
4919			}
4920			ACQUIRE_LOCK(&lk);
4921			break;
4922		}
4923		if (adp != NULL)
4924			continue;
4925		/*
4926		 * If we have completed the second (MNT_WAIT) pass, we are
		 * done; otherwise switch to it and make another pass.
4927		 */
4928		if (waitfor == MNT_WAIT)
4929			break;
4930		waitfor = MNT_WAIT;
4931	}
4932	/*
4933	 * Try freeing inodedep in case all dependencies have been removed.
4934	 */
4935	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4936		(void) free_inodedep(inodedep);
4937	return (0);
4938}
4939
4940/*
4941 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4942 * Called with splbio blocked.
4943 */
4944static int
4945flush_pagedep_deps(pvp, mp, diraddhdp)
4946	struct vnode *pvp;
4947	struct mount *mp;
4948	struct diraddhd *diraddhdp;
4949{
4950	struct thread *td = curthread;
4951	struct inodedep *inodedep;
4952	struct ufsmount *ump;
4953	struct diradd *dap;
4954	struct vnode *vp;
4955	int gotit, error = 0;
4956	struct buf *bp;
4957	ino_t inum;
4958
4959	ump = VFSTOUFS(mp);
4960	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4961		/*
4962		 * Flush ourselves if this directory entry
4963		 * has a MKDIR_PARENT dependency.
4964		 */
4965		if (dap->da_state & MKDIR_PARENT) {
4966			FREE_LOCK(&lk);
4967			if ((error = UFS_UPDATE(pvp, 1)) != 0)
4968				break;
4969			ACQUIRE_LOCK(&lk);
4970			/*
4971			 * If that cleared dependencies, go on to next.
4972			 */
4973			if (dap != LIST_FIRST(diraddhdp))
4974				continue;
4975			if (dap->da_state & MKDIR_PARENT) {
4976				FREE_LOCK(&lk);
4977				panic("flush_pagedep_deps: MKDIR_PARENT");
4978			}
4979		}
4980		/*
4981		 * A newly allocated directory must have its "." and
4982		 * ".." entries written out before its name can be
4983		 * committed in its parent. We do not want or need
4984		 * the full semantics of a synchronous VOP_FSYNC as
4985		 * that may end up here again, once for each directory
4986		 * level in the filesystem. Instead, we push the blocks
4987		 * and wait for them to clear. We have to fsync twice
4988		 * because the first call may choose to defer blocks
4989		 * that still have dependencies, but deferral will
4990		 * happen at most once.
4991		 */
4992		inum = dap->da_newinum;
4993		if (dap->da_state & MKDIR_BODY) {
4994			FREE_LOCK(&lk);
4995			if ((error = VFS_VGET(mp, inum, LK_EXCLUSIVE, &vp)))
4996				break;
4997			if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)) ||
4998			    (error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td))) {
4999				vput(vp);
5000				break;
5001			}
5002			drain_output(vp, 0);
5003			vput(vp);
5004			ACQUIRE_LOCK(&lk);
5005			/*
5006			 * If that cleared dependencies, go on to next.
5007			 */
5008			if (dap != LIST_FIRST(diraddhdp))
5009				continue;
5010			if (dap->da_state & MKDIR_BODY) {
5011				FREE_LOCK(&lk);
5012				panic("flush_pagedep_deps: MKDIR_BODY");
5013			}
5014		}
5015		/*
5016		 * Flush the inode on which the directory entry depends.
5017		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5018		 * the only remaining dependency is that the updated inode
5019		 * link count must get pushed to disk. The inode has already
5020		 * been pushed into its inode buffer (via VOP_UPDATE) at
5021		 * the time of the reference count change. So we need only
5022		 * locate that buffer, ensure that there will be no rollback
5023		 * caused by a bitmap dependency, then write the inode buffer.
5024		 */
5025		if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
5026			FREE_LOCK(&lk);
5027			panic("flush_pagedep_deps: lost inode");
5028		}
5029		/*
5030		 * If the inode still has bitmap dependencies,
5031		 * push them to disk.
5032		 */
5033		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5034			gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
5035			FREE_LOCK(&lk);
5036			if (gotit &&
5037			    (error = BUF_WRITE(inodedep->id_buf)) != 0)
5038				break;
5039			ACQUIRE_LOCK(&lk);
5040			if (dap != LIST_FIRST(diraddhdp))
5041				continue;
5042		}
5043		/*
5044		 * If the inode is still sitting in a buffer waiting
5045		 * to be written, push it to disk.
5046		 */
5047		FREE_LOCK(&lk);
5048		if ((error = bread(ump->um_devvp,
5049		    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5050		    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5051			brelse(bp);
5052			break;
5053		}
5054		if ((error = BUF_WRITE(bp)) != 0)
5055			break;
5056		ACQUIRE_LOCK(&lk);
5057		/*
5058		 * If we have failed to get rid of all the dependencies,
5059		 * then something is seriously wrong.
5060		 */
5061		if (dap == LIST_FIRST(diraddhdp)) {
5062			FREE_LOCK(&lk);
5063			panic("flush_pagedep_deps: flush failed");
5064		}
5065	}
5066	if (error)
5067		ACQUIRE_LOCK(&lk);
5068	return (error);
5069}
5070
5071/*
5072 * A large burst of file addition or deletion activity can drive the
5073 * memory load excessively high. First attempt to slow things down
5074 * using the techniques below. If that fails, this routine requests
5075 * the offending operations to fall back to running synchronously
5076 * until the memory load returns to a reasonable level.
5077 */
5078int
5079softdep_slowdown(vp)
5080	struct vnode *vp;
5081{
5082	int max_softdeps_hard;
5083
5084	max_softdeps_hard = max_softdeps * 11 / 10;
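	/*
	 * The hard limit is 10% above max_softdeps. Request a slowdown
	 * once inode dependencies reach the hard limit or removal
	 * dependencies reach half of it.
	 */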
5085	if (num_dirrem < max_softdeps_hard / 2 &&
5086	    num_inodedep < max_softdeps_hard)
5087		return (0);
5088	stat_sync_limit_hit += 1;
5089	return (1);
5090}
5091
5092/*
5093 * Called by the allocation routines when they are about to fail
5094 * in the hope that we can free up some disk space.
5095 *
5096 * First check to see if the work list has anything on it. If it has,
5097 * clean up entries until we successfully free some space. Because this
5098 * process holds inodes locked, we cannot handle any remove requests
5099 * that might block on a locked inode as that could lead to deadlock.
5100 * If the worklist yields no free space, encourage the syncer daemon
5101 * to help us. In no event will we try for longer than tickdelay seconds.
5102 */
5103int
5104softdep_request_cleanup(fs, vp)
5105	struct fs *fs;
5106	struct vnode *vp;
5107{
5108	long starttime;
5109	ufs2_daddr_t needed;
5110
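	/*
	 * Keep working until the free block count rises more than
	 * fs_contigsumsize above its starting value, no reclaimable
	 * blocks remain pending, or tickdelay seconds have elapsed.
	 */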
5111	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
5112	starttime = time_second + tickdelay;
5113	if (UFS_UPDATE(vp, 1) != 0)
5114		return (0);
5115	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
5116		if (time_second > starttime)
5117			return (0);
5118		if (num_on_worklist > 0 &&
5119		    process_worklist_item(NULL, LK_NOWAIT) != -1) {
5120			stat_worklist_push += 1;
5121			continue;
5122		}
5123		request_cleanup(FLUSH_REMOVE_WAIT, 0);
5124	}
5125	return (1);
5126}
5127
5128/*
5129 * If memory utilization has gotten too high, deliberately slow things
5130 * down and speed up the I/O processing.
5131 */
5132static int
5133request_cleanup(resource, islocked)
5134	int resource;
5135	int islocked;
5136{
5137	struct thread *td = curthread;
5138
5139	/*
5140	 * We never hold up the filesystem syncer process.
5141	 */
5142	if (td == filesys_syncer)
5143		return (0);
5144	/*
5145	 * First check to see if the work list has gotten backlogged.
5146	 * If it has, co-opt this process to help clean up two entries.
5147	 * Because this process may hold inodes locked, we cannot
5148	 * handle any remove requests that might block on a locked
5149	 * inode as that could lead to deadlock.
5150	 */
5151	if (num_on_worklist > max_softdeps / 10) {
5152		if (islocked)
5153			FREE_LOCK(&lk);
5154		process_worklist_item(NULL, LK_NOWAIT);
5155		process_worklist_item(NULL, LK_NOWAIT);
5156		stat_worklist_push += 2;
5157		if (islocked)
5158			ACQUIRE_LOCK(&lk);
5159		return (1);
5160	}
5161	/*
5162	 * Next, we attempt to speed up the syncer process. If that
5163	 * is successful, then we allow the process to continue.
5164	 */
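	/*
	 * Callers waiting on file removals (FLUSH_REMOVE_WAIT) must
	 * still block below even if the syncer has been sped up.
	 */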
5165	if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
5166		return (0);
5167	/*
5168	 * If we are resource constrained on inode dependencies, try
5169	 * flushing some dirty inodes. Otherwise, we are constrained
5170	 * by file deletions, so try accelerating flushes of directories
5171	 * with removal dependencies. We would like to do the cleanup
5172	 * here, but we probably hold an inode locked at this point and
5173	 * that might deadlock against one that we try to clean. So,
5174	 * the best that we can do is request the syncer daemon to do
5175	 * the cleanup for us.
5176	 */
5177	switch (resource) {
5178
5179	case FLUSH_INODES:
5180		stat_ino_limit_push += 1;
5181		req_clear_inodedeps += 1;
5182		stat_countp = &stat_ino_limit_hit;
5183		break;
5184
5185	case FLUSH_REMOVE:
5186	case FLUSH_REMOVE_WAIT:
5187		stat_blk_limit_push += 1;
5188		req_clear_remove += 1;
5189		stat_countp = &stat_blk_limit_hit;
5190		break;
5191
5192	default:
5193		if (islocked)
5194			FREE_LOCK(&lk);
5195		panic("request_cleanup: unknown type");
5196	}
5197	/*
5198	 * Hopefully the syncer daemon will catch up and awaken us.
5199	 * We wait at most tickdelay before proceeding in any case.
5200	 */
5201	if (islocked == 0)
5202		ACQUIRE_LOCK(&lk);
5203	proc_waiting += 1;
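	/* Arm the pause timer (at least two ticks) if it is not running. */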
5204	if (handle.callout == NULL)
5205		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5206	interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, PPAUSE,
5207	    "softupdate", 0);
5208	proc_waiting -= 1;
5209	if (islocked == 0)
5210		FREE_LOCK(&lk);
5211	return (1);
5212}
5213
5214/*
5215 * Awaken processes pausing in request_cleanup and clear proc_waiting
5216 * to indicate that there is no longer a timer running.
5217 */
5218void
5219pause_timer(arg)
5220	void *arg;
5221{
5222
5223	*stat_countp += 1;
5224	wakeup_one(&proc_waiting);
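	/*
	 * If processes are still waiting, restart the timer so that the
	 * remaining waiters are woken one per interval.
	 */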
5225	if (proc_waiting > 0)
5226		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5227	else
5228		handle.callout = NULL;
5229}
5230
5231/*
5232 * Flush out a directory with at least one removal dependency in an effort to
5233 * reduce the number of dirrem, freefile, and freeblks dependency structures.
5234 */
5235static void
5236clear_remove(td)
5237	struct thread *td;
5238{
5239	struct pagedep_hashhead *pagedephd;
5240	struct pagedep *pagedep;
5241	static int next = 0;
5242	struct mount *mp;
5243	struct vnode *vp;
5244	int error, cnt;
5245	ino_t ino;
5246
5247	ACQUIRE_LOCK(&lk);
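	/*
	 * Rotate through the pagedep hash chains, resuming where the
	 * previous call left off, until a pagedep with removal
	 * dependencies is found.
	 */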
5248	for (cnt = 0; cnt < pagedep_hash; cnt++) {
5249		pagedephd = &pagedep_hashtbl[next++];
5250		if (next >= pagedep_hash)
5251			next = 0;
5252		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5253			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
5254				continue;
5255			mp = pagedep->pd_mnt;
5256			ino = pagedep->pd_ino;
5257			FREE_LOCK(&lk);
5258			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
				ACQUIRE_LOCK(&lk);
5259				continue;
			}
5260			if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp))) {
5261				softdep_error("clear_remove: vget", error);
5262				vn_finished_write(mp);
5263				return;
5264			}
5265			if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)))
5266				softdep_error("clear_remove: fsync", error);
5267			drain_output(vp, 0);
5268			vput(vp);
5269			vn_finished_write(mp);
5270			return;
5271		}
5272	}
5273	FREE_LOCK(&lk);
5274}
5275
5276/*
5277 * Clear out a block of dirty inodes in an effort to reduce
5278 * the number of inodedep dependency structures.
5279 */
5280static void
5281clear_inodedeps(td)
5282	struct thread *td;
5283{
5284	struct inodedep_hashhead *inodedephd;
5285	struct inodedep *inodedep;
5286	static int next = 0;
5287	struct mount *mp;
5288	struct vnode *vp;
5289	struct fs *fs;
5290	int error, cnt;
5291	ino_t firstino, lastino, ino;
5292
5293	ACQUIRE_LOCK(&lk);
5294	/*
5295	 * Pick an inode dependency to clear (the hash chains are scanned
5296	 * in rotation). We will then gather up all the inodes in its
5297	 * block that have dependencies and flush them out.
5298	 */
5299	for (cnt = 0; cnt < inodedep_hash; cnt++) {
5300		inodedephd = &inodedep_hashtbl[next++];
5301		if (next >= inodedep_hash)
5302			next = 0;
5303		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5304			break;
5305	}
5306	if (inodedep == NULL) {
		FREE_LOCK(&lk);
5307		return;
	}
5308	/*
5309	 * Ugly code to find mount point given pointer to superblock.
5310	 */
5311	fs = inodedep->id_fs;
5312	TAILQ_FOREACH(mp, &mountlist, mnt_list)
5313		if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
5314			break;
5315	/*
5316	 * Find the last inode in the block with dependencies.
5317	 */
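	/*
	 * Round id_ino down to the first inode in its inode block;
	 * this relies on INOPB(fs) being a power of two.
	 */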
5318	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5319	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5320		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
5321			break;
5322	/*
5323	 * Asynchronously push all but the last inode with dependencies.
5324	 * Synchronously push the last inode with dependencies to ensure
5325	 * that the inode block gets written to free up the inodedeps.
5326	 */
5327	for (ino = firstino; ino <= lastino; ino++) {
5328		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5329			continue;
5330		FREE_LOCK(&lk);
5331		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
			ACQUIRE_LOCK(&lk);
5332			continue;
		}
5333		if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
5334			softdep_error("clear_inodedeps: vget", error);
5335			vn_finished_write(mp);
5336			return;
5337		}
5338		if (ino == lastino) {
5339			if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_WAIT, td)))
5340				softdep_error("clear_inodedeps: fsync1", error);
5341		} else {
5342			if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)))
5343				softdep_error("clear_inodedeps: fsync2", error);
5344			drain_output(vp, 0);
5345		}
5346		vput(vp);
5347		vn_finished_write(mp);
5348		ACQUIRE_LOCK(&lk);
5349	}
5350	FREE_LOCK(&lk);
5351}
5352
5353/*
5354 * Function to determine if the buffer has outstanding dependencies
5355 * that will cause a roll-back if the buffer is written. If wantcount
5356 * is set, return the number of dependencies; otherwise just yes or no.
5357 */
5358static int
5359softdep_count_dependencies(bp, wantcount)
5360	struct buf *bp;
5361	int wantcount;
5362{
5363	struct worklist *wk;
5364	struct inodedep *inodedep;
5365	struct indirdep *indirdep;
5366	struct allocindir *aip;
5367	struct pagedep *pagedep;
5368	struct diradd *dap;
5369	int i, retval;
5370
5371	retval = 0;
5372	ACQUIRE_LOCK(&lk);
5373	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5374		switch (wk->wk_type) {
5375
5376		case D_INODEDEP:
5377			inodedep = WK_INODEDEP(wk);
5378			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5379				/* bitmap allocation dependency */
5380				retval += 1;
5381				if (!wantcount)
5382					goto out;
5383			}
5384			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
5385				/* direct block pointer dependency */
5386				retval += 1;
5387				if (!wantcount)
5388					goto out;
5389			}
5390			continue;
5391
5392		case D_INDIRDEP:
5393			indirdep = WK_INDIRDEP(wk);
5394
5395			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
5396				/* indirect block pointer dependency */
5397				retval += 1;
5398				if (!wantcount)
5399					goto out;
5400			}
5401			continue;
5402
5403		case D_PAGEDEP:
5404			pagedep = WK_PAGEDEP(wk);
5405			for (i = 0; i < DAHASHSZ; i++) {
5406
5407				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
5408					/* directory entry dependency */
5409					retval += 1;
5410					if (!wantcount)
5411						goto out;
5412				}
5413			}
5414			continue;
5415
5416		case D_BMSAFEMAP:
5417		case D_ALLOCDIRECT:
5418		case D_ALLOCINDIR:
5419		case D_MKDIR:
5420			/* never a dependency on these blocks */
5421			continue;
5422
5423		default:
5424			FREE_LOCK(&lk);
5425			panic("softdep_count_dependencies: Unexpected type %s",
5426			    TYPENAME(wk->wk_type));
5427			/* NOTREACHED */
5428		}
5429	}
5430out:
5431	FREE_LOCK(&lk);
5432	return (retval);
5433}
5434
5435/*
5436 * Acquire exclusive access to a buffer.
5437 * Must be called with splbio blocked.
5438 * Return 1 if buffer was acquired.
5439 */
5440static int
5441getdirtybuf(bpp, waitfor)
5442	struct buf **bpp;
5443	int waitfor;
5444{
5445	struct buf *bp;
5446	int error;
5447
5448	for (;;) {
5449		if ((bp = *bpp) == NULL)
5450			return (0);
5451		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
5452			if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
5453				break;
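			/*
			 * A background write of this buffer is in progress.
			 * Release the lock we just obtained and, if allowed
			 * to wait, sleep until that write completes.
			 */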
5454			BUF_UNLOCK(bp);
5455			if (waitfor != MNT_WAIT)
5456				return (0);
5457			bp->b_xflags |= BX_BKGRDWAIT;
5458			interlocked_sleep(&lk, SLEEP, &bp->b_xflags, PRIBIO,
5459			    "getbuf", 0);
5460			continue;
5461		}
5462		if (waitfor != MNT_WAIT)
5463			return (0);
5464		error = interlocked_sleep(&lk, LOCKBUF, bp,
5465		    LK_EXCLUSIVE | LK_SLEEPFAIL, 0, 0);
5466		if (error != ENOLCK) {
5467			FREE_LOCK(&lk);
5468			panic("getdirtybuf: inconsistent lock");
5469		}
5470	}
5471	if ((bp->b_flags & B_DELWRI) == 0) {
5472		BUF_UNLOCK(bp);
5473		return (0);
5474	}
5475	bremfree(bp);
5476	return (1);
5477}
5478
5479/*
5480 * Wait for pending output on a vnode to complete.
5481 * Must be called with vnode locked.
5482 */
5483static void
5484drain_output(vp, islocked)
5485	struct vnode *vp;
5486	int islocked;
5487{
5488
5489	if (!islocked)
5490		ACQUIRE_LOCK(&lk);
5491	while (vp->v_numoutput) {
5492		vp->v_flag |= VBWAIT;
5493		interlocked_sleep(&lk, SLEEP, (caddr_t)&vp->v_numoutput,
5494		    PRIBIO + 1, "drainvp", 0);
5495	}
5496	if (!islocked)
5497		FREE_LOCK(&lk);
5498}
5499
5500/*
5501 * Called whenever a buffer that is being invalidated or reallocated
5502 * contains dependencies. This should only happen if an I/O error has
5503 * occurred. The routine is called with the buffer locked.
5504 */
5505static void
5506softdep_deallocate_dependencies(bp)
5507	struct buf *bp;
5508{
5509
5510	if ((bp->b_ioflags & BIO_ERROR) == 0)
5511		panic("softdep_deallocate_dependencies: dangling deps");
5512	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5513	panic("softdep_deallocate_dependencies: unrecovered I/O error");
5514}
5515
5516/*
5517 * Function to handle asynchronous write errors in the filesystem.
5518 */
5519void
5520softdep_error(func, error)
5521	char *func;
5522	int error;
5523{
5524
5525	/* XXX should do something better! */
5526	printf("%s: got error %d while accessing filesystem\n", func, error);
5527}
5528