ffs_softdep.c revision 119601
1/*
2 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * The soft updates code is derived from the appendix of a University
5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6 * "Soft Updates: A Solution to the Metadata Update Problem in File
7 * Systems", CSE-TR-254-95, August 1995).
8 *
9 * Further information about soft updates can be obtained from:
10 *
11 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
12 *	1614 Oxford Street		mckusick@mckusick.com
13 *	Berkeley, CA 94709-1608		+1-510-843-9542
14 *	USA
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 *
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 *
26 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
27 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
30 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 119601 2003-08-31 07:29:34Z jeff $");
43
44/*
45 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
46 */
47#ifndef DIAGNOSTIC
48#define DIAGNOSTIC
49#endif
50#ifndef DEBUG
51#define DEBUG
52#endif
53
54#include <sys/param.h>
55#include <sys/kernel.h>
56#include <sys/systm.h>
57#include <sys/bio.h>
58#include <sys/buf.h>
59#include <sys/malloc.h>
60#include <sys/mount.h>
61#include <sys/proc.h>
62#include <sys/stat.h>
63#include <sys/syslog.h>
64#include <sys/vnode.h>
65#include <sys/conf.h>
66#include <ufs/ufs/dir.h>
67#include <ufs/ufs/extattr.h>
68#include <ufs/ufs/quota.h>
69#include <ufs/ufs/inode.h>
70#include <ufs/ufs/ufsmount.h>
71#include <ufs/ffs/fs.h>
72#include <ufs/ffs/softdep.h>
73#include <ufs/ffs/ffs_extern.h>
74#include <ufs/ufs/ufs_extern.h>
75
76/*
77 * These definitions need to be adapted to the system to which
78 * this file is being ported.
79 */
80/*
81 * malloc types defined for the softdep system.
82 */
83static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
84static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
85static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
86static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
87static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
88static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
89static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
90static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
91static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
92static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
93static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
94static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
95static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
96static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
97
98#define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)
99
100#define	D_PAGEDEP	0
101#define	D_INODEDEP	1
102#define	D_NEWBLK	2
103#define	D_BMSAFEMAP	3
104#define	D_ALLOCDIRECT	4
105#define	D_INDIRDEP	5
106#define	D_ALLOCINDIR	6
107#define	D_FREEFRAG	7
108#define	D_FREEBLKS	8
109#define	D_FREEFILE	9
110#define	D_DIRADD	10
111#define	D_MKDIR		11
112#define	D_DIRREM	12
113#define	D_NEWDIRBLK	13
114#define	D_LAST		D_NEWDIRBLK
115
116/*
117 * translate from workitem type to memory type
118 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
119 */
120static struct malloc_type *memtype[] = {
121	M_PAGEDEP,
122	M_INODEDEP,
123	M_NEWBLK,
124	M_BMSAFEMAP,
125	M_ALLOCDIRECT,
126	M_INDIRDEP,
127	M_ALLOCINDIR,
128	M_FREEFRAG,
129	M_FREEBLKS,
130	M_FREEFILE,
131	M_DIRADD,
132	M_MKDIR,
133	M_DIRREM,
134	M_NEWDIRBLK
135};
136
137#define DtoM(type) (memtype[type])
138
139/*
140 * Names of malloc types.
141 */
142#define TYPENAME(type)  \
143	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
144/*
145 * End system adaptation definitions.
146 */
147
148/*
149 * Internal function prototypes.
150 */
151static	void softdep_error(char *, int);
152static	void drain_output(struct vnode *, int);
153static	struct buf *getdirtybuf(struct buf **, struct mtx *, int);
154static	void clear_remove(struct thread *);
155static	void clear_inodedeps(struct thread *);
156static	int flush_pagedep_deps(struct vnode *, struct mount *,
157	    struct diraddhd *);
158static	int flush_inodedep_deps(struct fs *, ino_t);
159static	int flush_deplist(struct allocdirectlst *, int, int *);
160static	int handle_written_filepage(struct pagedep *, struct buf *);
161static  void diradd_inode_written(struct diradd *, struct inodedep *);
162static	int handle_written_inodeblock(struct inodedep *, struct buf *);
163static	void handle_allocdirect_partdone(struct allocdirect *);
164static	void handle_allocindir_partdone(struct allocindir *);
165static	void initiate_write_filepage(struct pagedep *, struct buf *);
166static	void handle_written_mkdir(struct mkdir *, int);
167static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
168static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
169static	void handle_workitem_freefile(struct freefile *);
170static	void handle_workitem_remove(struct dirrem *, struct vnode *);
171static	struct dirrem *newdirrem(struct buf *, struct inode *,
172	    struct inode *, int, struct dirrem **);
173static	void free_diradd(struct diradd *);
174static	void free_allocindir(struct allocindir *, struct inodedep *);
175static	void free_newdirblk(struct newdirblk *);
176static	int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
177	    ufs2_daddr_t *);
178static	void deallocate_dependencies(struct buf *, struct inodedep *);
179static	void free_allocdirect(struct allocdirectlst *,
180	    struct allocdirect *, int);
181static	int check_inode_unwritten(struct inodedep *);
182static	int free_inodedep(struct inodedep *);
183static	void handle_workitem_freeblocks(struct freeblks *, int);
184static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
185static	void setup_allocindir_phase2(struct buf *, struct inode *,
186	    struct allocindir *);
187static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
188	    ufs2_daddr_t);
189static	void handle_workitem_freefrag(struct freefrag *);
190static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
191static	void allocdirect_merge(struct allocdirectlst *,
192	    struct allocdirect *, struct allocdirect *);
193static	struct bmsafemap *bmsafemap_lookup(struct buf *);
194static	int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
195static	int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
196static	int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
197static	void pause_timer(void *);
198static	int request_cleanup(int, int);
199static	int process_worklist_item(struct mount *, int);
200static	void add_to_worklist(struct worklist *);
201
202/*
203 * Exported softdep operations.
204 */
205static	void softdep_disk_io_initiation(struct buf *);
206static	void softdep_disk_write_complete(struct buf *);
207static	void softdep_deallocate_dependencies(struct buf *);
208static	void softdep_move_dependencies(struct buf *, struct buf *);
209static	int softdep_count_dependencies(struct buf *bp, int);
210
211/*
212 * Locking primitives.
213 *
214 * For a uniprocessor, all we need to do is protect against disk
215 * interrupts. For a multiprocessor, this lock would have to be
216 * a mutex. A single mutex is used throughout this file, though
217 * finer grain locking could be used if contention warranted it.
218 *
219 * For a multiprocessor, the sleep call would accept a lock and
220 * release it after the sleep processing was complete. In a uniprocessor
221 * implementation there is no such interlock, so we simply mark
222 * the places where it needs to be done with the `interlocked' form
223 * of the lock calls. Since the uniprocessor sleep already interlocks
224 * the spl, there is nothing that really needs to be done.
225 */
226#ifndef /* NOT */ DEBUG
227static struct lockit {
228	int	lkt_spl;
229} lk = { 0 };
230#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
231#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
232
233#else /* DEBUG */
234#define NOHOLDER	((struct thread *)-1)
235#define SPECIAL_FLAG	((struct thread *)-2)
236static struct lockit {
237	int	lkt_spl;
238	struct	thread *lkt_held;
239} lk = { 0, NOHOLDER };
240
241static	void acquire_lock(struct lockit *);
242static	void free_lock(struct lockit *);
243void	softdep_panic(char *);
244
245#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
246#define FREE_LOCK(lk)			free_lock(lk)
247
248static void
249acquire_lock(lk)
250	struct lockit *lk;
251{
252	struct thread *holder;
253
254	if (lk->lkt_held != NOHOLDER) {
255		holder = lk->lkt_held;
256		FREE_LOCK(lk);
257		if (holder == curthread)
258			panic("softdep_lock: locking against myself");
259		else
260			panic("softdep_lock: lock held by %p", holder);
261	}
262	lk->lkt_spl = splbio();
263	lk->lkt_held = curthread;
264}
265
266static void
267free_lock(lk)
268	struct lockit *lk;
269{
270
271	if (lk->lkt_held == NOHOLDER)
272		panic("softdep_unlock: lock not held");
273	lk->lkt_held = NOHOLDER;
274	splx(lk->lkt_spl);
275}
276
277/*
278 * Function to release soft updates lock and panic.
279 */
280void
281softdep_panic(msg)
282	char *msg;
283{
284
285	if (lk.lkt_held != NOHOLDER)
286		FREE_LOCK(&lk);
287	panic(msg);
288}
289#endif /* DEBUG */
290
291static	int interlocked_sleep(struct lockit *, int, void *, struct mtx *, int,
292	    const char *, int);
293
294/*
295 * When going to sleep, we must save our SPL so that it does
296 * not get lost if some other process uses the lock while we
297 * are sleeping. We restore it after we have slept. This routine
298 * wraps the interlocking with functions that sleep. The list
299 * below enumerates the available set of operations.
300 */
301#define	UNKNOWN		0
302#define	SLEEP		1
303#define	LOCKBUF		2
304
305static int
306interlocked_sleep(lk, op, ident, mtx, flags, wmesg, timo)
307	struct lockit *lk;
308	int op;
309	void *ident;
310	struct mtx *mtx;
311	int flags;
312	const char *wmesg;
313	int timo;
314{
315	struct thread *holder;
316	int s, retval;
317
318	s = lk->lkt_spl;
319#	ifdef DEBUG
320	if (lk->lkt_held == NOHOLDER)
321		panic("interlocked_sleep: lock not held");
322	lk->lkt_held = NOHOLDER;
323#	endif /* DEBUG */
324	switch (op) {
325	case SLEEP:
326		retval = msleep(ident, mtx, flags, wmesg, timo);
327		break;
328	case LOCKBUF:
329		retval = BUF_LOCK((struct buf *)ident, flags, mtx);
330		break;
331	default:
332		panic("interlocked_sleep: unknown operation");
333	}
334#	ifdef DEBUG
335	if (lk->lkt_held != NOHOLDER) {
336		holder = lk->lkt_held;
337		FREE_LOCK(lk);
338		if (holder == curthread)
339			panic("interlocked_sleep: locking against self");
340		else
341			panic("interlocked_sleep: lock held by %p", holder);
342	}
343	lk->lkt_held = curthread;
344#	endif /* DEBUG */
345	lk->lkt_spl = s;
346	return (retval);
347}
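
/*
 * Two illustrative uses of the operations above (sketches only: the SLEEP
 * form mirrors the call made by sema_get() below, while the flag value in
 * the LOCKBUF form is just an example):
 *
 *	(void) interlocked_sleep(&lk, SLEEP, semap, NULL,
 *	    semap->prio, semap->name, semap->timo);
 *
 *	(void) interlocked_sleep(&lk, LOCKBUF, bp, NULL,
 *	    LK_EXCLUSIVE, NULL, 0);
 *
 * In both cases the spl saved in the softdep lock is preserved across the
 * potentially blocking call, so the caller may continue to treat the lock
 * as held once the routine returns.
 */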
348
349/*
350 * Place holder for real semaphores.
351 */
352struct sema {
353	int	value;
354	struct	thread *holder;
355	char	*name;
356	int	prio;
357	int	timo;
358};
359static	void sema_init(struct sema *, char *, int, int);
360static	int sema_get(struct sema *, struct lockit *);
361static	void sema_release(struct sema *);
362
363static void
364sema_init(semap, name, prio, timo)
365	struct sema *semap;
366	char *name;
367	int prio, timo;
368{
369
370	semap->holder = NOHOLDER;
371	semap->value = 0;
372	semap->name = name;
373	semap->prio = prio;
374	semap->timo = timo;
375}
376
377static int
378sema_get(semap, interlock)
379	struct sema *semap;
380	struct lockit *interlock;
381{
382
383	if (semap->value++ > 0) {
384		if (interlock != NULL) {
385			interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
386			    NULL, semap->prio, semap->name,
387			    semap->timo);
388			FREE_LOCK(interlock);
389		} else {
390			tsleep(semap, semap->prio, semap->name,
391			    semap->timo);
392		}
393		return (0);
394	}
395	semap->holder = curthread;
396	if (interlock != NULL)
397		FREE_LOCK(interlock);
398	return (1);
399}
400
401static void
402sema_release(semap)
403	struct sema *semap;
404{
405
406	if (semap->value <= 0 || semap->holder != curthread) {
407		if (lk.lkt_held != NOHOLDER)
408			FREE_LOCK(&lk);
409		panic("sema_release: not held");
410	}
411	if (--semap->value > 0) {
412		semap->value = 0;
413		wakeup(semap);
414	}
415	semap->holder = NOHOLDER;
416}
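
/*
 * A sketch of the pattern the lookup routines below use with these
 * semaphores ("foo_in_progress", "hashhead", "item", and "hash_field" are
 * placeholder names): the first thread through sema_get() proceeds to
 * allocate the new structure; any later thread sleeps inside sema_get()
 * and, once awakened, retries its hash lookup from the top. Note that
 * sema_get() returns with the softdep lock released in either case.
 *
 *	if (sema_get(&foo_in_progress, &lk) == 0) {
 *		ACQUIRE_LOCK(&lk);
 *		goto top;
 *	}
 *	... allocate and initialize the new structure ...
 *	ACQUIRE_LOCK(&lk);
 *	LIST_INSERT_HEAD(hashhead, item, hash_field);
 *	sema_release(&foo_in_progress);
 */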
417
418/*
419 * Worklist queue management.
420 * These routines require that the lock be held.
421 */
422#ifndef /* NOT */ DEBUG
423#define WORKLIST_INSERT(head, item) do {	\
424	(item)->wk_state |= ONWORKLIST;		\
425	LIST_INSERT_HEAD(head, item, wk_list);	\
426} while (0)
427#define WORKLIST_REMOVE(item) do {		\
428	(item)->wk_state &= ~ONWORKLIST;	\
429	LIST_REMOVE(item, wk_list);		\
430} while (0)
431#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
432
433#else /* DEBUG */
434static	void worklist_insert(struct workhead *, struct worklist *);
435static	void worklist_remove(struct worklist *);
436static	void workitem_free(struct worklist *, int);
437
438#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
439#define WORKLIST_REMOVE(item) worklist_remove(item)
440#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
441
442static void
443worklist_insert(head, item)
444	struct workhead *head;
445	struct worklist *item;
446{
447
448	if (lk.lkt_held == NOHOLDER)
449		panic("worklist_insert: lock not held");
450	if (item->wk_state & ONWORKLIST) {
451		FREE_LOCK(&lk);
452		panic("worklist_insert: already on list");
453	}
454	item->wk_state |= ONWORKLIST;
455	LIST_INSERT_HEAD(head, item, wk_list);
456}
457
458static void
459worklist_remove(item)
460	struct worklist *item;
461{
462
463	if (lk.lkt_held == NOHOLDER)
464		panic("worklist_remove: lock not held");
465	if ((item->wk_state & ONWORKLIST) == 0) {
466		FREE_LOCK(&lk);
467		panic("worklist_remove: not on list");
468	}
469	item->wk_state &= ~ONWORKLIST;
470	LIST_REMOVE(item, wk_list);
471}
472
473static void
474workitem_free(item, type)
475	struct worklist *item;
476	int type;
477{
478
479	if (item->wk_state & ONWORKLIST) {
480		if (lk.lkt_held != NOHOLDER)
481			FREE_LOCK(&lk);
482		panic("workitem_free: still on list");
483	}
484	if (item->wk_type != type) {
485		if (lk.lkt_held != NOHOLDER)
486			FREE_LOCK(&lk);
487		panic("workitem_free: type mismatch");
488	}
489	FREE(item, DtoM(type));
490}
491#endif /* DEBUG */
492
493/*
494 * Workitem queue management
495 */
496static struct workhead softdep_workitem_pending;
497static struct worklist *worklist_tail;
498static int num_on_worklist;	/* number of worklist items to be processed */
499static int softdep_worklist_busy; /* 1 => trying to do unmount */
500static int softdep_worklist_req; /* serialized waiters */
501static int max_softdeps;	/* maximum number of structs before slowdown */
502static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
503static int tickdelay = 2;	/* number of ticks to pause during slowdown */
504static int proc_waiting;	/* tracks whether we have a timeout posted */
505static int *stat_countp;	/* statistic to count in proc_waiting timeout */
506static struct callout_handle handle; /* handle on posted proc_waiting timeout */
507static struct thread *filesys_syncer; /* proc of filesystem syncer process */
508static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
509#define FLUSH_INODES		1
510static int req_clear_remove;	/* syncer process flush some freeblks */
511#define FLUSH_REMOVE		2
512#define FLUSH_REMOVE_WAIT	3
513/*
514 * runtime statistics
515 */
516static int stat_worklist_push;	/* number of worklist cleanups */
517static int stat_blk_limit_push;	/* number of times block limit neared */
518static int stat_ino_limit_push;	/* number of times inode limit neared */
519static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
520static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
521static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
522static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
523static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
524static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
525static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
526#ifdef DEBUG
527#include <vm/vm.h>
528#include <sys/sysctl.h>
529SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
530SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
531SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
532SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
533SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
534SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
535SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
536SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
537SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
538SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
539SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
540SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
541SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
542#endif /* DEBUG */
543
544/*
545 * Add an item to the end of the work queue.
546 * This routine requires that the lock be held.
547 * This is the only routine that adds items to the list.
548 * The following routine is the only one that removes items
549 * and does so in order from first to last.
550 */
551static void
552add_to_worklist(wk)
553	struct worklist *wk;
554{
555
556	if (wk->wk_state & ONWORKLIST) {
557		if (lk.lkt_held != NOHOLDER)
558			FREE_LOCK(&lk);
559		panic("add_to_worklist: already on list");
560	}
561	wk->wk_state |= ONWORKLIST;
562	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
563		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
564	else
565		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
566	worklist_tail = wk;
567	num_on_worklist += 1;
568}
569
570/*
571 * Process that runs once per second to handle items in the background queue.
572 *
573 * Note that we ensure that items are processed in the order in which they
574 * appear in the queue. The code below depends on this property to ensure
575 * that blocks of a file are freed before the inode itself is freed. This
576 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
577 * until all the old ones have been purged from the dependency lists.
578 */
579int
580softdep_process_worklist(matchmnt)
581	struct mount *matchmnt;
582{
583	struct thread *td = curthread;
584	int cnt, matchcnt, loopcount;
585	long starttime;
586
587	/*
588	 * Record the process identifier of our caller so that we can give
589	 * this process preferential treatment in request_cleanup below.
590	 */
591	filesys_syncer = td;
592	matchcnt = 0;
593
594	/*
595	 * There is no danger of having multiple processes run this
596	 * code, but we have to single-thread it when softdep_flushfiles()
597	 * is in operation to get an accurate count of the number of items
598	 * related to its mount point that are in the list.
599	 */
600	if (matchmnt == NULL) {
601		if (softdep_worklist_busy < 0)
602			return(-1);
603		softdep_worklist_busy += 1;
604	}
605
606	/*
607	 * If requested, try removing inode or removal dependencies.
608	 */
609	if (req_clear_inodedeps) {
610		clear_inodedeps(td);
611		req_clear_inodedeps -= 1;
612		wakeup_one(&proc_waiting);
613	}
614	if (req_clear_remove) {
615		clear_remove(td);
616		req_clear_remove -= 1;
617		wakeup_one(&proc_waiting);
618	}
619	loopcount = 1;
620	starttime = time_second;
621	while (num_on_worklist > 0) {
622		if ((cnt = process_worklist_item(matchmnt, 0)) == -1)
623			break;
624		else
625			matchcnt += cnt;
626
627		/*
628		 * If a umount operation wants to run the worklist
629		 * accurately, abort.
630		 */
631		if (softdep_worklist_req && matchmnt == NULL) {
632			matchcnt = -1;
633			break;
634		}
635
636		/*
637		 * If requested, try removing inode or removal dependencies.
638		 */
639		if (req_clear_inodedeps) {
640			clear_inodedeps(td);
641			req_clear_inodedeps -= 1;
642			wakeup_one(&proc_waiting);
643		}
644		if (req_clear_remove) {
645			clear_remove(td);
646			req_clear_remove -= 1;
647			wakeup_one(&proc_waiting);
648		}
649		/*
650		 * We do not generally want to stop for buffer space, but if
651		 * we are really being a buffer hog, we will stop and wait.
652		 */
653		if (loopcount++ % 128 == 0)
654			bwillwrite();
655		/*
656		 * Never allow processing to run for more than one
657		 * second. Otherwise the other syncer tasks may get
658		 * excessively backlogged.
659		 */
660		if (starttime != time_second && matchmnt == NULL) {
661			matchcnt = -1;
662			break;
663		}
664	}
665	if (matchmnt == NULL) {
666		softdep_worklist_busy -= 1;
667		if (softdep_worklist_req && softdep_worklist_busy == 0)
668			wakeup(&softdep_worklist_req);
669	}
670	return (matchcnt);
671}
672
673/*
674 * Process one item on the worklist.
675 */
676static int
677process_worklist_item(matchmnt, flags)
678	struct mount *matchmnt;
679	int flags;
680{
681	struct worklist *wk, *wkend;
682	struct mount *mp;
683	struct vnode *vp;
684	int matchcnt = 0;
685
686	/*
687	 * If we are being called because of a process doing a
688	 * copy-on-write, then it is not safe to write as we may
689	 * recurse into the copy-on-write routine.
690	 */
691	if (curthread->td_proc->p_flag & P_COWINPROGRESS)
692		return (-1);
693	ACQUIRE_LOCK(&lk);
694	/*
695	 * Normally we just process each item on the worklist in order.
696	 * However, if we are in a situation where we cannot lock any
697	 * inodes, we have to skip over any dirrem requests whose
698	 * vnodes are resident and locked.
699	 */
700	vp = NULL;
701	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
702		if (wk->wk_state & INPROGRESS)
703			continue;
704		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
705			break;
706		wk->wk_state |= INPROGRESS;
707		FREE_LOCK(&lk);
708		VFS_VGET(WK_DIRREM(wk)->dm_mnt, WK_DIRREM(wk)->dm_oldinum,
709		    LK_NOWAIT | LK_EXCLUSIVE, &vp);
710		ACQUIRE_LOCK(&lk);
711		wk->wk_state &= ~INPROGRESS;
712		if (vp != NULL)
713			break;
714	}
715	if (wk == 0) {
716		FREE_LOCK(&lk);
717		return (-1);
718	}
719	/*
720	 * Remove the item to be processed. If we are removing the last
721	 * item on the list, we need to recalculate the tail pointer.
722	 * As this happens rarely and usually when the list is short,
723	 * we just run down the list to find it rather than tracking it
724	 * in the above loop.
725	 */
726	WORKLIST_REMOVE(wk);
727	if (wk == worklist_tail) {
728		LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
729			if (LIST_NEXT(wkend, wk_list) == NULL)
730				break;
731		worklist_tail = wkend;
732	}
733	num_on_worklist -= 1;
734	FREE_LOCK(&lk);
735	switch (wk->wk_type) {
736
737	case D_DIRREM:
738		/* removal of a directory entry */
739		mp = WK_DIRREM(wk)->dm_mnt;
740		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
741			panic("%s: dirrem on suspended filesystem",
742				"process_worklist_item");
743		if (mp == matchmnt)
744			matchcnt += 1;
745		handle_workitem_remove(WK_DIRREM(wk), vp);
746		break;
747
748	case D_FREEBLKS:
749		/* releasing blocks and/or fragments from a file */
750		mp = WK_FREEBLKS(wk)->fb_mnt;
751		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
752			panic("%s: freeblks on suspended filesystem",
753				"process_worklist_item");
754		if (mp == matchmnt)
755			matchcnt += 1;
756		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
757		break;
758
759	case D_FREEFRAG:
760		/* releasing a fragment when replaced as a file grows */
761		mp = WK_FREEFRAG(wk)->ff_mnt;
762		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
763			panic("%s: freefrag on suspended filesystem",
764				"process_worklist_item");
765		if (mp == matchmnt)
766			matchcnt += 1;
767		handle_workitem_freefrag(WK_FREEFRAG(wk));
768		break;
769
770	case D_FREEFILE:
771		/* releasing an inode when its link count drops to 0 */
772		mp = WK_FREEFILE(wk)->fx_mnt;
773		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
774			panic("%s: freefile on suspended filesystem",
775				"process_worklist_item");
776		if (mp == matchmnt)
777			matchcnt += 1;
778		handle_workitem_freefile(WK_FREEFILE(wk));
779		break;
780
781	default:
782		panic("%s_process_worklist: Unknown type %s",
783		    "softdep", TYPENAME(wk->wk_type));
784		/* NOTREACHED */
785	}
786	return (matchcnt);
787}
788
789/*
790 * Move dependencies from one buffer to another.
791 */
792static void
793softdep_move_dependencies(oldbp, newbp)
794	struct buf *oldbp;
795	struct buf *newbp;
796{
797	struct worklist *wk, *wktail;
798
799	if (LIST_FIRST(&newbp->b_dep) != NULL)
800		panic("softdep_move_dependencies: need merge code");
801	wktail = 0;
802	ACQUIRE_LOCK(&lk);
803	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
804		LIST_REMOVE(wk, wk_list);
805		if (wktail == 0)
806			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
807		else
808			LIST_INSERT_AFTER(wktail, wk, wk_list);
809		wktail = wk;
810	}
811	FREE_LOCK(&lk);
812}
813
814/*
815 * Purge the work list of all items associated with a particular mount point.
816 */
817int
818softdep_flushworklist(oldmnt, countp, td)
819	struct mount *oldmnt;
820	int *countp;
821	struct thread *td;
822{
823	struct vnode *devvp;
824	int count, error = 0;
825
826	/*
827	 * Await our turn to clear out the queue, then serialize access.
828	 */
829	while (softdep_worklist_busy) {
830		softdep_worklist_req += 1;
831		tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
832		softdep_worklist_req -= 1;
833	}
834	softdep_worklist_busy = -1;
835	/*
836	 * Alternately flush the block device associated with the mount
837	 * point and process any dependencies that the flushing
838	 * creates. We continue until no more worklist dependencies
839	 * are found.
840	 */
841	*countp = 0;
842	devvp = VFSTOUFS(oldmnt)->um_devvp;
843	while ((count = softdep_process_worklist(oldmnt)) > 0) {
844		*countp += count;
845		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
846		error = VOP_FSYNC(devvp, td->td_ucred, MNT_WAIT, td);
847		VOP_UNLOCK(devvp, 0, td);
848		if (error)
849			break;
850	}
851	softdep_worklist_busy = 0;
852	if (softdep_worklist_req)
853		wakeup(&softdep_worklist_req);
854	return (error);
855}
856
857/*
858 * Flush all vnodes and worklist items associated with a specified mount point.
859 */
860int
861softdep_flushfiles(oldmnt, flags, td)
862	struct mount *oldmnt;
863	int flags;
864	struct thread *td;
865{
866	int error, count, loopcnt;
867
868	error = 0;
869
870	/*
871	 * Alternately flush the vnodes associated with the mount
872	 * point and process any dependencies that the flushing
873	 * creates. In theory, this loop can happen at most twice,
874	 * but we give it a few extra just to be sure.
875	 */
876	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
877		/*
878		 * Do another flush in case any vnodes were brought in
879		 * as part of the cleanup operations.
880		 */
881		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
882			break;
883		if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
884		    count == 0)
885			break;
886	}
887	/*
888	 * If we are unmounting then it is an error to fail. If we
889	 * are simply trying to downgrade to read-only, then filesystem
890	 * activity can keep us busy forever, so we just fail with EBUSY.
891	 */
892	if (loopcnt == 0) {
893		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
894			panic("softdep_flushfiles: looping");
895		error = EBUSY;
896	}
897	return (error);
898}
899
900/*
901 * Structure hashing.
902 *
903 * There are three types of structures that can be looked up:
904 *	1) pagedep structures identified by mount point, inode number,
905 *	   and logical block.
906 *	2) inodedep structures identified by mount point and inode number.
907 *	3) newblk structures identified by mount point and
908 *	   physical block number.
909 *
910 * The "pagedep" and "inodedep" dependency structures are hashed
911 * separately from the file blocks and inodes to which they correspond.
912 * This separation helps when the in-memory copy of an inode or
913 * file block must be replaced. It also obviates the need to access
914 * an inode or file page when simply updating (or de-allocating)
915 * dependency structures. Lookup of newblk structures is needed to
916 * find newly allocated blocks when trying to associate them with
917 * their allocdirect or allocindir structure.
918 *
919 * The lookup routines optionally create and hash a new instance when
920 * an existing entry is not found.
921 */
922#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
923#define NODELAY		0x0002	/* cannot do background work */
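
/*
 * A minimal sketch of the lookup-or-create pattern implemented by the
 * routines below ("fs" and "inum" stand in for whatever handles the caller
 * already has; error handling omitted):
 *
 *	struct inodedep *inodedep;
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(fs, inum, DEPALLOC, &inodedep) == 0) {
 *		... a new inodedep was allocated and hashed ...
 *	} else {
 *		... an existing inodedep was found ...
 *	}
 *	FREE_LOCK(&lk);
 *
 * Passing 0 instead of DEPALLOC makes the lookup purely advisory: when no
 * entry exists the routine returns 0 and sets the result pointer to NULL.
 */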
924
925/*
926 * Structures and routines associated with pagedep caching.
927 */
928LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
929u_long	pagedep_hash;		/* size of hash table - 1 */
930#define	PAGEDEP_HASH(mp, inum, lbn) \
931	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
932	    pagedep_hash])
933static struct sema pagedep_in_progress;
934
935/*
936 * Look up a pagedep. Return 1 if found, 0 if not found or found
937 * when asked to allocate but not associated with any buffer.
938 * If not found, allocate if DEPALLOC flag is passed.
939 * Found or allocated entry is returned in pagedeppp.
940 * This routine must be called with splbio interrupts blocked.
941 */
942static int
943pagedep_lookup(ip, lbn, flags, pagedeppp)
944	struct inode *ip;
945	ufs_lbn_t lbn;
946	int flags;
947	struct pagedep **pagedeppp;
948{
949	struct pagedep *pagedep;
950	struct pagedep_hashhead *pagedephd;
951	struct mount *mp;
952	int i;
953
954#ifdef DEBUG
955	if (lk.lkt_held == NOHOLDER)
956		panic("pagedep_lookup: lock not held");
957#endif
958	mp = ITOV(ip)->v_mount;
959	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
960top:
961	LIST_FOREACH(pagedep, pagedephd, pd_hash)
962		if (ip->i_number == pagedep->pd_ino &&
963		    lbn == pagedep->pd_lbn &&
964		    mp == pagedep->pd_mnt)
965			break;
966	if (pagedep) {
967		*pagedeppp = pagedep;
968		if ((flags & DEPALLOC) != 0 &&
969		    (pagedep->pd_state & ONWORKLIST) == 0)
970			return (0);
971		return (1);
972	}
973	if ((flags & DEPALLOC) == 0) {
974		*pagedeppp = NULL;
975		return (0);
976	}
977	if (sema_get(&pagedep_in_progress, &lk) == 0) {
978		ACQUIRE_LOCK(&lk);
979		goto top;
980	}
981	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
982		M_SOFTDEP_FLAGS|M_ZERO);
983	pagedep->pd_list.wk_type = D_PAGEDEP;
984	pagedep->pd_mnt = mp;
985	pagedep->pd_ino = ip->i_number;
986	pagedep->pd_lbn = lbn;
987	LIST_INIT(&pagedep->pd_dirremhd);
988	LIST_INIT(&pagedep->pd_pendinghd);
989	for (i = 0; i < DAHASHSZ; i++)
990		LIST_INIT(&pagedep->pd_diraddhd[i]);
991	ACQUIRE_LOCK(&lk);
992	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
993	sema_release(&pagedep_in_progress);
994	*pagedeppp = pagedep;
995	return (0);
996}
997
998/*
999 * Structures and routines associated with inodedep caching.
1000 */
1001LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1002static u_long	inodedep_hash;	/* size of hash table - 1 */
1003static long	num_inodedep;	/* number of inodedep allocated */
1004#define	INODEDEP_HASH(fs, inum) \
1005      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1006static struct sema inodedep_in_progress;
1007
1008/*
1009 * Look up an inodedep. Return 1 if found, 0 if not found.
1010 * If not found, allocate if DEPALLOC flag is passed.
1011 * Found or allocated entry is returned in inodedeppp.
1012 * This routine must be called with splbio interrupts blocked.
1013 */
1014static int
1015inodedep_lookup(fs, inum, flags, inodedeppp)
1016	struct fs *fs;
1017	ino_t inum;
1018	int flags;
1019	struct inodedep **inodedeppp;
1020{
1021	struct inodedep *inodedep;
1022	struct inodedep_hashhead *inodedephd;
1023	int firsttry;
1024
1025#ifdef DEBUG
1026	if (lk.lkt_held == NOHOLDER)
1027		panic("inodedep_lookup: lock not held");
1028#endif
1029	firsttry = 1;
1030	inodedephd = INODEDEP_HASH(fs, inum);
1031top:
1032	LIST_FOREACH(inodedep, inodedephd, id_hash)
1033		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1034			break;
1035	if (inodedep) {
1036		*inodedeppp = inodedep;
1037		return (1);
1038	}
1039	if ((flags & DEPALLOC) == 0) {
1040		*inodedeppp = NULL;
1041		return (0);
1042	}
1043	/*
1044	 * If we are over our limit, try to improve the situation.
1045	 */
1046	if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
1047	    request_cleanup(FLUSH_INODES, 1)) {
1048		firsttry = 0;
1049		goto top;
1050	}
1051	if (sema_get(&inodedep_in_progress, &lk) == 0) {
1052		ACQUIRE_LOCK(&lk);
1053		goto top;
1054	}
1055	num_inodedep += 1;
1056	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
1057		M_INODEDEP, M_SOFTDEP_FLAGS);
1058	inodedep->id_list.wk_type = D_INODEDEP;
1059	inodedep->id_fs = fs;
1060	inodedep->id_ino = inum;
1061	inodedep->id_state = ALLCOMPLETE;
1062	inodedep->id_nlinkdelta = 0;
1063	inodedep->id_savedino1 = NULL;
1064	inodedep->id_savedsize = -1;
1065	inodedep->id_savedextsize = -1;
1066	inodedep->id_buf = NULL;
1067	LIST_INIT(&inodedep->id_pendinghd);
1068	LIST_INIT(&inodedep->id_inowait);
1069	LIST_INIT(&inodedep->id_bufwait);
1070	TAILQ_INIT(&inodedep->id_inoupdt);
1071	TAILQ_INIT(&inodedep->id_newinoupdt);
1072	TAILQ_INIT(&inodedep->id_extupdt);
1073	TAILQ_INIT(&inodedep->id_newextupdt);
1074	ACQUIRE_LOCK(&lk);
1075	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1076	sema_release(&inodedep_in_progress);
1077	*inodedeppp = inodedep;
1078	return (0);
1079}
1080
1081/*
1082 * Structures and routines associated with newblk caching.
1083 */
1084LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1085u_long	newblk_hash;		/* size of hash table - 1 */
1086#define	NEWBLK_HASH(fs, inum) \
1087	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1088static struct sema newblk_in_progress;
1089
1090/*
1091 * Look up a newblk. Return 1 if found, 0 if not found.
1092 * If not found, allocate if DEPALLOC flag is passed.
1093 * Found or allocated entry is returned in newblkpp.
1094 */
1095static int
1096newblk_lookup(fs, newblkno, flags, newblkpp)
1097	struct fs *fs;
1098	ufs2_daddr_t newblkno;
1099	int flags;
1100	struct newblk **newblkpp;
1101{
1102	struct newblk *newblk;
1103	struct newblk_hashhead *newblkhd;
1104
1105	newblkhd = NEWBLK_HASH(fs, newblkno);
1106top:
1107	LIST_FOREACH(newblk, newblkhd, nb_hash)
1108		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1109			break;
1110	if (newblk) {
1111		*newblkpp = newblk;
1112		return (1);
1113	}
1114	if ((flags & DEPALLOC) == 0) {
1115		*newblkpp = NULL;
1116		return (0);
1117	}
1118	if (sema_get(&newblk_in_progress, 0) == 0)
1119		goto top;
1120	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1121		M_NEWBLK, M_SOFTDEP_FLAGS);
1122	newblk->nb_state = 0;
1123	newblk->nb_fs = fs;
1124	newblk->nb_newblkno = newblkno;
1125	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1126	sema_release(&newblk_in_progress);
1127	*newblkpp = newblk;
1128	return (0);
1129}
1130
1131/*
1132 * Executed during filesystem initialization before
1133 * mounting any filesystems.
1134 */
1135void
1136softdep_initialize()
1137{
1138
1139	LIST_INIT(&mkdirlisthd);
1140	LIST_INIT(&softdep_workitem_pending);
1141	max_softdeps = desiredvnodes * 4;
1142	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1143	    &pagedep_hash);
1144	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1145	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1146	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1147	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1148	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1149
1150	/* hooks through which the main kernel code calls us */
1151	softdep_process_worklist_hook = softdep_process_worklist;
1152	softdep_fsync_hook = softdep_fsync;
1153
1154	/* initialise bioops hack */
1155	bioops.io_start = softdep_disk_io_initiation;
1156	bioops.io_complete = softdep_disk_write_complete;
1157	bioops.io_deallocate = softdep_deallocate_dependencies;
1158	bioops.io_movedeps = softdep_move_dependencies;
1159	bioops.io_countdeps = softdep_count_dependencies;
1160}
1161
1162/*
1163 * Executed after all filesystems have been unmounted during
1164 * filesystem module unload.
1165 */
1166void
1167softdep_uninitialize()
1168{
1169
1170	softdep_process_worklist_hook = NULL;
1171	softdep_fsync_hook = NULL;
1172	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
1173	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
1174	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
1175}
1176
1177/*
1178 * Called at mount time to notify the dependency code that a
1179 * filesystem wishes to use it.
1180 */
1181int
1182softdep_mount(devvp, mp, fs, cred)
1183	struct vnode *devvp;
1184	struct mount *mp;
1185	struct fs *fs;
1186	struct ucred *cred;
1187{
1188	struct csum_total cstotal;
1189	struct cg *cgp;
1190	struct buf *bp;
1191	int error, cyl;
1192
1193	mp->mnt_flag &= ~MNT_ASYNC;
1194	mp->mnt_flag |= MNT_SOFTDEP;
1195	/*
1196	 * When doing soft updates, the counters in the
1197	 * superblock may have gotten out of sync, so we have
1198	 * to scan the cylinder groups and recalculate them.
1199	 */
1200	if (fs->fs_clean != 0)
1201		return (0);
1202	bzero(&cstotal, sizeof cstotal);
1203	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1204		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1205		    fs->fs_cgsize, cred, &bp)) != 0) {
1206			brelse(bp);
1207			return (error);
1208		}
1209		cgp = (struct cg *)bp->b_data;
1210		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1211		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1212		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1213		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1214		fs->fs_cs(fs, cyl) = cgp->cg_cs;
1215		brelse(bp);
1216	}
1217#ifdef DEBUG
1218	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1219		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1220#endif
1221	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1222	return (0);
1223}
1224
1225/*
1226 * Protecting the freemaps (or bitmaps).
1227 *
1228 * To eliminate the need to execute fsck before mounting a filesystem
1229 * after a power failure, one must (conservatively) guarantee that the
1230 * on-disk copy of the bitmaps never indicates that a live inode or block is
1231 * free.  So, when a block or inode is allocated, the bitmap should be
1232 * updated (on disk) before any new pointers.  When a block or inode is
1233 * freed, the bitmap should not be updated until all pointers have been
1234 * reset.  The latter dependency is handled by the delayed de-allocation
1235 * approach described below for block and inode de-allocation.  The former
1236 * dependency is handled by calling the following procedure when a block or
1237 * inode is allocated. When an inode is allocated an "inodedep" is created
1238 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1239 * Each "inodedep" is also inserted into the hash indexing structure so
1240 * that any additional link additions can be made dependent on the inode
1241 * allocation.
1242 *
1243 * The ufs filesystem maintains a number of free block counts (e.g., per
1244 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1245 * in addition to the bitmaps.  These counts are used to improve efficiency
1246 * during allocation and therefore must be consistent with the bitmaps.
1247 * There is no convenient way to guarantee post-crash consistency of these
1248 * counts with simple update ordering, for two main reasons: (1) The counts
1249 * and bitmaps for a single cylinder group block are not in the same disk
1250 * sector.  If a disk write is interrupted (e.g., by power failure), one may
1251 * be written and the other not.  (2) Some of the counts are located in the
1252 * superblock rather than the cylinder group block. So, we focus our soft
1253 * updates implementation on protecting the bitmaps. When mounting a
1254 * filesystem, we recompute the auxiliary counts from the bitmaps.
1255 */
1256
1257/*
1258 * Called just after updating the cylinder group block to allocate an inode.
1259 */
1260void
1261softdep_setup_inomapdep(bp, ip, newinum)
1262	struct buf *bp;		/* buffer for cylgroup block with inode map */
1263	struct inode *ip;	/* inode related to allocation */
1264	ino_t newinum;		/* new inode number being allocated */
1265{
1266	struct inodedep *inodedep;
1267	struct bmsafemap *bmsafemap;
1268
1269	/*
1270	 * Create a dependency for the newly allocated inode.
1271	 * Panic if it already exists as something is seriously wrong.
1272	 * Otherwise add it to the dependency list for the buffer holding
1273	 * the cylinder group map from which it was allocated.
1274	 */
1275	ACQUIRE_LOCK(&lk);
1276	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
1277		FREE_LOCK(&lk);
1278		panic("softdep_setup_inomapdep: found inode");
1279	}
1280	inodedep->id_buf = bp;
1281	inodedep->id_state &= ~DEPCOMPLETE;
1282	bmsafemap = bmsafemap_lookup(bp);
1283	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1284	FREE_LOCK(&lk);
1285}
1286
1287/*
1288 * Called just after updating the cylinder group block to
1289 * allocate block or fragment.
1290 */
1291void
1292softdep_setup_blkmapdep(bp, fs, newblkno)
1293	struct buf *bp;		/* buffer for cylgroup block with block map */
1294	struct fs *fs;		/* filesystem doing allocation */
1295	ufs2_daddr_t newblkno;	/* number of newly allocated block */
1296{
1297	struct newblk *newblk;
1298	struct bmsafemap *bmsafemap;
1299
1300	/*
1301	 * Create a dependency for the newly allocated block.
1302	 * Add it to the dependency list for the buffer holding
1303	 * the cylinder group map from which it was allocated.
1304	 */
1305	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1306		panic("softdep_setup_blkmapdep: found block");
1307	ACQUIRE_LOCK(&lk);
1308	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1309	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1310	FREE_LOCK(&lk);
1311}
1312
1313/*
1314 * Find the bmsafemap associated with a cylinder group buffer.
1315 * If none exists, create one. The buffer must be locked when
1316 * this routine is called and this routine must be called with
1317 * splbio interrupts blocked.
1318 */
1319static struct bmsafemap *
1320bmsafemap_lookup(bp)
1321	struct buf *bp;
1322{
1323	struct bmsafemap *bmsafemap;
1324	struct worklist *wk;
1325
1326#ifdef DEBUG
1327	if (lk.lkt_held == NOHOLDER)
1328		panic("bmsafemap_lookup: lock not held");
1329#endif
1330	LIST_FOREACH(wk, &bp->b_dep, wk_list)
1331		if (wk->wk_type == D_BMSAFEMAP)
1332			return (WK_BMSAFEMAP(wk));
1333	FREE_LOCK(&lk);
1334	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1335		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1336	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1337	bmsafemap->sm_list.wk_state = 0;
1338	bmsafemap->sm_buf = bp;
1339	LIST_INIT(&bmsafemap->sm_allocdirecthd);
1340	LIST_INIT(&bmsafemap->sm_allocindirhd);
1341	LIST_INIT(&bmsafemap->sm_inodedephd);
1342	LIST_INIT(&bmsafemap->sm_newblkhd);
1343	ACQUIRE_LOCK(&lk);
1344	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1345	return (bmsafemap);
1346}
1347
1348/*
1349 * Direct block allocation dependencies.
1350 *
1351 * When a new block is allocated, the corresponding disk locations must be
1352 * initialized (with zeros or new data) before the on-disk inode points to
1353 * them.  Also, the freemap from which the block was allocated must be
1354 * updated (on disk) before the inode's pointer. These two dependencies are
1355 * independent of each other and are needed for all file blocks and indirect
1356 * blocks that are pointed to directly by the inode.  Just before the
1357 * "in-core" version of the inode is updated with a newly allocated block
1358 * number, a procedure (below) is called to setup allocation dependency
1359 * structures.  These structures are removed when the corresponding
1360 * dependencies are satisfied or when the block allocation becomes obsolete
1361 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1362 * fragment that gets upgraded).  All of these cases are handled in
1363 * procedures described later.
1364 *
1365 * When a file extension causes a fragment to be upgraded, either to a larger
1366 * fragment or to a full block, the on-disk location may change (if the
1367 * previous fragment could not simply be extended). In this case, the old
1368 * fragment must be de-allocated, but not until after the inode's pointer has
1369 * been updated. In most cases, this is handled by later procedures, which
1370 * will construct a "freefrag" structure to be added to the workitem queue
1371 * when the inode update is complete (or obsolete).  The main exception to
1372 * this is when an allocation occurs while a pending allocation dependency
1373 * (for the same block pointer) remains.  This case is handled in the main
1374 * allocation dependency setup procedure by immediately freeing the
1375 * unreferenced fragments.
1376 */
1377void
1378softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1379	struct inode *ip;	/* inode to which block is being added */
1380	ufs_lbn_t lbn;		/* block pointer within inode */
1381	ufs2_daddr_t newblkno;	/* disk block number being added */
1382	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
1383	long newsize;		/* size of new block */
1384	long oldsize;		/* size of old block */
1385	struct buf *bp;		/* bp for allocated block */
1386{
1387	struct allocdirect *adp, *oldadp;
1388	struct allocdirectlst *adphead;
1389	struct bmsafemap *bmsafemap;
1390	struct inodedep *inodedep;
1391	struct pagedep *pagedep;
1392	struct newblk *newblk;
1393
1394	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1395		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1396	adp->ad_list.wk_type = D_ALLOCDIRECT;
1397	adp->ad_lbn = lbn;
1398	adp->ad_newblkno = newblkno;
1399	adp->ad_oldblkno = oldblkno;
1400	adp->ad_newsize = newsize;
1401	adp->ad_oldsize = oldsize;
1402	adp->ad_state = ATTACHED;
1403	LIST_INIT(&adp->ad_newdirblk);
1404	if (newblkno == oldblkno)
1405		adp->ad_freefrag = NULL;
1406	else
1407		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1408
1409	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1410		panic("softdep_setup_allocdirect: lost block");
1411
1412	ACQUIRE_LOCK(&lk);
1413	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1414	adp->ad_inodedep = inodedep;
1415
1416	if (newblk->nb_state == DEPCOMPLETE) {
1417		adp->ad_state |= DEPCOMPLETE;
1418		adp->ad_buf = NULL;
1419	} else {
1420		bmsafemap = newblk->nb_bmsafemap;
1421		adp->ad_buf = bmsafemap->sm_buf;
1422		LIST_REMOVE(newblk, nb_deps);
1423		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1424	}
1425	LIST_REMOVE(newblk, nb_hash);
1426	FREE(newblk, M_NEWBLK);
1427
1428	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1429	if (lbn >= NDADDR) {
1430		/* allocating an indirect block */
1431		if (oldblkno != 0) {
1432			FREE_LOCK(&lk);
1433			panic("softdep_setup_allocdirect: non-zero indir");
1434		}
1435	} else {
1436		/*
1437		 * Allocating a direct block.
1438		 *
1439		 * If we are allocating a directory block, then we must
1440		 * allocate an associated pagedep to track additions and
1441		 * deletions.
1442		 */
1443		if ((ip->i_mode & IFMT) == IFDIR &&
1444		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1445			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1446	}
1447	/*
1448	 * The list of allocdirects must be kept in sorted and ascending
1449	 * order so that the rollback routines can quickly determine the
1450	 * first uncommitted block (the size of the file stored on disk
1451	 * ends at the end of the lowest committed fragment, or if there
1452	 * are no fragments, at the end of the highest committed block).
1453	 * Since files generally grow, the typical case is that the new
1454	 * block is to be added at the end of the list. We speed this
1455	 * special case by checking against the last allocdirect in the
1456	 * list before laboriously traversing the list looking for the
1457	 * insertion point.
1458	 */
1459	adphead = &inodedep->id_newinoupdt;
1460	oldadp = TAILQ_LAST(adphead, allocdirectlst);
1461	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1462		/* insert at end of list */
1463		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1464		if (oldadp != NULL && oldadp->ad_lbn == lbn)
1465			allocdirect_merge(adphead, adp, oldadp);
1466		FREE_LOCK(&lk);
1467		return;
1468	}
1469	TAILQ_FOREACH(oldadp, adphead, ad_next) {
1470		if (oldadp->ad_lbn >= lbn)
1471			break;
1472	}
1473	if (oldadp == NULL) {
1474		FREE_LOCK(&lk);
1475		panic("softdep_setup_allocdirect: lost entry");
1476	}
1477	/* insert in middle of list */
1478	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1479	if (oldadp->ad_lbn == lbn)
1480		allocdirect_merge(adphead, adp, oldadp);
1481	FREE_LOCK(&lk);
1482}
1483
1484/*
1485 * Replace an old allocdirect dependency with a newer one.
1486 * This routine must be called with splbio interrupts blocked.
1487 */
1488static void
1489allocdirect_merge(adphead, newadp, oldadp)
1490	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
1491	struct allocdirect *newadp;	/* allocdirect being added */
1492	struct allocdirect *oldadp;	/* existing allocdirect being checked */
1493{
1494	struct worklist *wk;
1495	struct freefrag *freefrag;
1496	struct newdirblk *newdirblk;
1497
1498#ifdef DEBUG
1499	if (lk.lkt_held == NOHOLDER)
1500		panic("allocdirect_merge: lock not held");
1501#endif
1502	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1503	    newadp->ad_oldsize != oldadp->ad_newsize ||
1504	    newadp->ad_lbn >= NDADDR) {
1505		FREE_LOCK(&lk);
1506		panic("%s %jd != new %jd || old size %ld != new %ld",
1507		    "allocdirect_merge: old blkno",
1508		    (intmax_t)newadp->ad_oldblkno,
1509		    (intmax_t)oldadp->ad_newblkno,
1510		    newadp->ad_oldsize, oldadp->ad_newsize);
1511	}
1512	newadp->ad_oldblkno = oldadp->ad_oldblkno;
1513	newadp->ad_oldsize = oldadp->ad_oldsize;
1514	/*
1515	 * If the old dependency had a fragment to free or had never
1516	 * previously had a block allocated, then the new dependency
1517	 * can immediately post its freefrag and adopt the old freefrag.
1518	 * This action is done by swapping the freefrag dependencies.
1519	 * The new dependency gains the old one's freefrag, and the
1520	 * old one gets the new one and then immediately puts it on
1521	 * the worklist when it is freed by free_allocdirect. It is
1522	 * not possible to do this swap when the old dependency had a
1523	 * non-zero size but no previous fragment to free. This condition
1524	 * arises when the new block is an extension of the old block.
1525	 * Here, the first part of the fragment allocated to the new
1526	 * dependency is part of the block currently claimed on disk by
1527	 * the old dependency, so cannot legitimately be freed until the
1528	 * conditions for the new dependency are fulfilled.
1529	 */
1530	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1531		freefrag = newadp->ad_freefrag;
1532		newadp->ad_freefrag = oldadp->ad_freefrag;
1533		oldadp->ad_freefrag = freefrag;
1534	}
1535	/*
1536	 * If we are tracking a new directory-block allocation,
1537	 * move it from the old allocdirect to the new allocdirect.
1538	 */
1539	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1540		newdirblk = WK_NEWDIRBLK(wk);
1541		WORKLIST_REMOVE(&newdirblk->db_list);
1542		if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
1543			panic("allocdirect_merge: extra newdirblk");
1544		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1545	}
1546	free_allocdirect(adphead, oldadp, 0);
1547}
1548
1549/*
1550 * Allocate a new freefrag structure if needed.
1551 */
1552static struct freefrag *
1553newfreefrag(ip, blkno, size)
1554	struct inode *ip;
1555	ufs2_daddr_t blkno;
1556	long size;
1557{
1558	struct freefrag *freefrag;
1559	struct fs *fs;
1560
1561	if (blkno == 0)
1562		return (NULL);
1563	fs = ip->i_fs;
1564	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1565		panic("newfreefrag: frag size");
1566	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1567		M_FREEFRAG, M_SOFTDEP_FLAGS);
1568	freefrag->ff_list.wk_type = D_FREEFRAG;
1569	freefrag->ff_state = 0;
1570	freefrag->ff_inum = ip->i_number;
1571	freefrag->ff_mnt = ITOV(ip)->v_mount;
1572	freefrag->ff_blkno = blkno;
1573	freefrag->ff_fragsize = size;
1574	return (freefrag);
1575}
1576
1577/*
1578 * This workitem de-allocates fragments that were replaced during
1579 * file block allocation.
1580 */
1581static void
1582handle_workitem_freefrag(freefrag)
1583	struct freefrag *freefrag;
1584{
1585	struct ufsmount *ump = VFSTOUFS(freefrag->ff_mnt);
1586
1587	ffs_blkfree(ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
1588	    freefrag->ff_fragsize, freefrag->ff_inum);
1589	FREE(freefrag, M_FREEFRAG);
1590}
1591
1592/*
1593 * Set up a dependency structure for an external attributes data block.
1594 * This routine follows much of the structure of softdep_setup_allocdirect.
1595 * See the description of softdep_setup_allocdirect above for details.
1596 */
1597void
1598softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1599	struct inode *ip;
1600	ufs_lbn_t lbn;
1601	ufs2_daddr_t newblkno;
1602	ufs2_daddr_t oldblkno;
1603	long newsize;
1604	long oldsize;
1605	struct buf *bp;
1606{
1607	struct allocdirect *adp, *oldadp;
1608	struct allocdirectlst *adphead;
1609	struct bmsafemap *bmsafemap;
1610	struct inodedep *inodedep;
1611	struct newblk *newblk;
1612
1613	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1614		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1615	adp->ad_list.wk_type = D_ALLOCDIRECT;
1616	adp->ad_lbn = lbn;
1617	adp->ad_newblkno = newblkno;
1618	adp->ad_oldblkno = oldblkno;
1619	adp->ad_newsize = newsize;
1620	adp->ad_oldsize = oldsize;
1621	adp->ad_state = ATTACHED | EXTDATA;
1622	LIST_INIT(&adp->ad_newdirblk);
1623	if (newblkno == oldblkno)
1624		adp->ad_freefrag = NULL;
1625	else
1626		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1627
1628	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1629		panic("softdep_setup_allocext: lost block");
1630
1631	ACQUIRE_LOCK(&lk);
1632	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1633	adp->ad_inodedep = inodedep;
1634
1635	if (newblk->nb_state == DEPCOMPLETE) {
1636		adp->ad_state |= DEPCOMPLETE;
1637		adp->ad_buf = NULL;
1638	} else {
1639		bmsafemap = newblk->nb_bmsafemap;
1640		adp->ad_buf = bmsafemap->sm_buf;
1641		LIST_REMOVE(newblk, nb_deps);
1642		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1643	}
1644	LIST_REMOVE(newblk, nb_hash);
1645	FREE(newblk, M_NEWBLK);
1646
1647	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1648	if (lbn >= NXADDR) {
1649		FREE_LOCK(&lk);
1650		panic("softdep_setup_allocext: lbn %lld > NXADDR",
1651		    (long long)lbn);
1652	}
1653	/*
1654	 * The list of allocdirects must be kept sorted in ascending
1655	 * order so that the rollback routines can quickly determine the
1656	 * first uncommitted block (the size of the file stored on disk
1657	 * ends at the end of the lowest committed fragment, or if there
1658	 * are no fragments, at the end of the highest committed block).
1659	 * Since files generally grow, the typical case is that the new
1660	 * block is to be added at the end of the list. We speed this
1661	 * special case by checking against the last allocdirect in the
1662	 * list before laboriously traversing the list looking for the
1663	 * insertion point.
1664	 */
1665	adphead = &inodedep->id_newextupdt;
1666	oldadp = TAILQ_LAST(adphead, allocdirectlst);
1667	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1668		/* insert at end of list */
1669		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1670		if (oldadp != NULL && oldadp->ad_lbn == lbn)
1671			allocdirect_merge(adphead, adp, oldadp);
1672		FREE_LOCK(&lk);
1673		return;
1674	}
1675	TAILQ_FOREACH(oldadp, adphead, ad_next) {
1676		if (oldadp->ad_lbn >= lbn)
1677			break;
1678	}
1679	if (oldadp == NULL) {
1680		FREE_LOCK(&lk);
1681		panic("softdep_setup_allocext: lost entry");
1682	}
1683	/* insert in middle of list */
1684	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1685	if (oldadp->ad_lbn == lbn)
1686		allocdirect_merge(adphead, adp, oldadp);
1687	FREE_LOCK(&lk);
1688}
1689
1690/*
1691 * Indirect block allocation dependencies.
1692 *
1693 * The same dependencies that exist for a direct block also exist when
1694 * a new block is allocated and pointed to by an entry in a block of
1695 * indirect pointers. The undo/redo states described above are also
1696 * used here. Because an indirect block contains many pointers that
1697 * may have dependencies, a second copy of the entire in-memory indirect
1698 * block is kept. The buffer cache copy is always completely up-to-date.
1699 * The second copy, which is used only as a source for disk writes,
1700 * contains only the safe pointers (i.e., those that have no remaining
1701 * update dependencies). The second copy is freed when all pointers
1702 * are safe. The cache is not allowed to replace indirect blocks with
1703 * pending update dependencies. If a buffer containing an indirect
1704 * block with dependencies is written, these routines will mark it
1705 * dirty again. It can only be successfully written once all the
1706 * dependencies are removed. The ffs_fsync routine in conjunction with
1707 * softdep_sync_metadata work together to get all the dependencies
1708 * removed so that a file can be successfully written to disk. Three
1709 * procedures are used when setting up indirect block pointer
1710 * dependencies. The division is necessary because of the organization
1711 * of the "balloc" routine and because of the distinction between file
1712 * pages and file metadata blocks.
1713 */
1714
1715/*
1716 * Allocate a new allocindir structure.
1717 */
1718static struct allocindir *
1719newallocindir(ip, ptrno, newblkno, oldblkno)
1720	struct inode *ip;	/* inode for file being extended */
1721	int ptrno;		/* offset of pointer in indirect block */
1722	ufs2_daddr_t newblkno;	/* disk block number being added */
1723	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
1724{
1725	struct allocindir *aip;
1726
1727	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1728		M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1729	aip->ai_list.wk_type = D_ALLOCINDIR;
1730	aip->ai_state = ATTACHED;
1731	aip->ai_offset = ptrno;
1732	aip->ai_newblkno = newblkno;
1733	aip->ai_oldblkno = oldblkno;
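	/*
	 * Blocks reached through indirect blocks are always full-sized,
	 * so any replaced block is released as a whole block.
	 */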
1734	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1735	return (aip);
1736}
1737
1738/*
1739 * Called just before setting an indirect block pointer
1740 * to a newly allocated file page.
1741 */
1742void
1743softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1744	struct inode *ip;	/* inode for file being extended */
1745	ufs_lbn_t lbn;		/* allocated block number within file */
1746	struct buf *bp;		/* buffer with indirect blk referencing page */
1747	int ptrno;		/* offset of pointer in indirect block */
1748	ufs2_daddr_t newblkno;	/* disk block number being added */
1749	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
1750	struct buf *nbp;	/* buffer holding allocated page */
1751{
1752	struct allocindir *aip;
1753	struct pagedep *pagedep;
1754
1755	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1756	ACQUIRE_LOCK(&lk);
1757	/*
1758	 * If we are allocating a directory page, then we must
1759	 * allocate an associated pagedep to track additions and
1760	 * deletions.
1761	 */
1762	if ((ip->i_mode & IFMT) == IFDIR &&
1763	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1764		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1765	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1766	FREE_LOCK(&lk);
1767	setup_allocindir_phase2(bp, ip, aip);
1768}
1769
1770/*
1771 * Called just before setting an indirect block pointer to a
1772 * newly allocated indirect block.
1773 */
1774void
1775softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1776	struct buf *nbp;	/* newly allocated indirect block */
1777	struct inode *ip;	/* inode for file being extended */
1778	struct buf *bp;		/* indirect block referencing allocated block */
1779	int ptrno;		/* offset of pointer in indirect block */
1780	ufs2_daddr_t newblkno;	/* disk block number being added */
1781{
1782	struct allocindir *aip;
1783
1784	aip = newallocindir(ip, ptrno, newblkno, 0);
1785	ACQUIRE_LOCK(&lk);
1786	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1787	FREE_LOCK(&lk);
1788	setup_allocindir_phase2(bp, ip, aip);
1789}
1790
1791/*
1792 * Called to finish the allocation of the "aip" allocated
1793 * by one of the two routines above.
1794 */
1795static void
1796setup_allocindir_phase2(bp, ip, aip)
1797	struct buf *bp;		/* in-memory copy of the indirect block */
1798	struct inode *ip;	/* inode for file being extended */
1799	struct allocindir *aip;	/* allocindir allocated by the above routines */
1800{
1801	struct worklist *wk;
1802	struct indirdep *indirdep, *newindirdep;
1803	struct bmsafemap *bmsafemap;
1804	struct allocindir *oldaip;
1805	struct freefrag *freefrag;
1806	struct newblk *newblk;
1807	ufs2_daddr_t blkno;
1808
1809	if (bp->b_lblkno >= 0)
1810		panic("setup_allocindir_phase2: not indir blk");
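	/*
	 * Find or create the indirdep for this indirect block. Each pass
	 * searches the buffer's dependency list under the softdep lock and
	 * attaches an indirdep allocated on a previous pass if one exists.
	 * If none is found, a new indirdep with a saved copy of the block
	 * is built at the bottom of the loop and the search is retried.
	 */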
1811	for (indirdep = NULL, newindirdep = NULL; ; ) {
1812		ACQUIRE_LOCK(&lk);
1813		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1814			if (wk->wk_type != D_INDIRDEP)
1815				continue;
1816			indirdep = WK_INDIRDEP(wk);
1817			break;
1818		}
1819		if (indirdep == NULL && newindirdep) {
1820			indirdep = newindirdep;
1821			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1822			newindirdep = NULL;
1823		}
1824		FREE_LOCK(&lk);
1825		if (indirdep) {
1826			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1827			    &newblk) == 0)
1828				panic("setup_allocindir: lost block");
1829			ACQUIRE_LOCK(&lk);
1830			if (newblk->nb_state == DEPCOMPLETE) {
1831				aip->ai_state |= DEPCOMPLETE;
1832				aip->ai_buf = NULL;
1833			} else {
1834				bmsafemap = newblk->nb_bmsafemap;
1835				aip->ai_buf = bmsafemap->sm_buf;
1836				LIST_REMOVE(newblk, nb_deps);
1837				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1838				    aip, ai_deps);
1839			}
1840			LIST_REMOVE(newblk, nb_hash);
1841			FREE(newblk, M_NEWBLK);
1842			aip->ai_indirdep = indirdep;
1843			/*
1844			 * Check to see if there is an existing dependency
1845			 * for this block. If there is, merge the old
1846			 * dependency into the new one.
1847			 */
1848			if (aip->ai_oldblkno == 0)
1849				oldaip = NULL;
1850			else
1851				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
1852				    ai_next)
1853					if (oldaip->ai_offset == aip->ai_offset)
1854						break;
1855			freefrag = NULL;
1856			if (oldaip != NULL) {
1857				if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1858					FREE_LOCK(&lk);
1859					panic("setup_allocindir_phase2: blkno");
1860				}
1861				aip->ai_oldblkno = oldaip->ai_oldblkno;
1862				freefrag = aip->ai_freefrag;
1863				aip->ai_freefrag = oldaip->ai_freefrag;
1864				oldaip->ai_freefrag = NULL;
1865				free_allocindir(oldaip, NULL);
1866			}
1867			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1868			if (ip->i_ump->um_fstype == UFS1)
1869				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
1870				    [aip->ai_offset] = aip->ai_oldblkno;
1871			else
1872				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
1873				    [aip->ai_offset] = aip->ai_oldblkno;
1874			FREE_LOCK(&lk);
1875			if (freefrag != NULL)
1876				handle_workitem_freefrag(freefrag);
1877		}
1878		if (newindirdep) {
1879			brelse(newindirdep->ir_savebp);
1880			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
1881		}
1882		if (indirdep)
1883			break;
1884		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
1885			M_INDIRDEP, M_SOFTDEP_FLAGS);
1886		newindirdep->ir_list.wk_type = D_INDIRDEP;
1887		newindirdep->ir_state = ATTACHED;
1888		if (ip->i_ump->um_fstype == UFS1)
1889			newindirdep->ir_state |= UFS1FMT;
1890		LIST_INIT(&newindirdep->ir_deplisthd);
1891		LIST_INIT(&newindirdep->ir_donehd);
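		/*
		 * If the indirect block has not yet been mapped to a
		 * physical block number, resolve it now. The saved copy
		 * below is keyed on the physical address so that it can
		 * still be found (see deallocate_dependencies and
		 * indir_trunc) after the inode mapping it is cleared.
		 */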
1892		if (bp->b_blkno == bp->b_lblkno) {
1893			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
1894			    NULL, NULL);
1895			bp->b_blkno = blkno;
1896		}
1897		newindirdep->ir_savebp =
1898		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
1899		BUF_KERNPROC(newindirdep->ir_savebp);
1900		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1901	}
1902}
1903
1904/*
1905 * Block de-allocation dependencies.
1906 *
1907 * When blocks are de-allocated, the on-disk pointers must be nullified before
1908 * the blocks are made available for use by other files.  (The true
1909 * requirement is that old pointers must be nullified before new on-disk
1910 * pointers are set.  We chose this slightly more stringent requirement to
1911 * reduce complexity.) Our implementation handles this dependency by updating
1912 * the inode (or indirect block) appropriately but delaying the actual block
1913 * de-allocation (i.e., freemap and free space count manipulation) until
1914 * after the updated versions reach stable storage.  After the disk is
1915 * updated, the blocks can be safely de-allocated whenever it is convenient.
1916 * This implementation handles only the common case of reducing a file's
1917 * length to zero. Other cases are handled by the conventional synchronous
1918 * write approach.
1919 *
1920 * The ffs implementation with which we worked double-checks
1921 * the state of the block pointers and file size as it reduces
1922 * a file's length.  Some of this code is replicated here in our
1923 * soft updates implementation.  The freeblks->fb_chkcnt field is
1924 * used to transfer a part of this information to the procedure
1925 * that eventually de-allocates the blocks.
1926 *
1927 * This routine should be called from the routine that shortens
1928 * a file's length, before the inode's size or block pointers
1929 * are modified. It will save the block pointer information for
1930 * later release and zero the inode so that the calling routine
1931 * can release it.
1932 */
1933void
1934softdep_setup_freeblocks(ip, length, flags)
1935	struct inode *ip;	/* The inode whose length is to be reduced */
1936	off_t length;		/* The new length for the file */
1937	int flags;		/* IO_EXT and/or IO_NORMAL */
1938{
1939	struct freeblks *freeblks;
1940	struct inodedep *inodedep;
1941	struct allocdirect *adp;
1942	struct vnode *vp;
1943	struct buf *bp;
1944	struct fs *fs;
1945	ufs2_daddr_t extblocks, datablocks;
1946	int i, delay, error;
1947
1948	fs = ip->i_fs;
1949	if (length != 0)
1950		panic("softdep_setup_freeblocks: non-zero length");
1951	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
1952		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
1953	freeblks->fb_list.wk_type = D_FREEBLKS;
1954	freeblks->fb_uid = ip->i_uid;
1955	freeblks->fb_previousinum = ip->i_number;
1956	freeblks->fb_devvp = ip->i_devvp;
1957	freeblks->fb_mnt = ITOV(ip)->v_mount;
1958	extblocks = 0;
1959	if (fs->fs_magic == FS_UFS2_MAGIC)
1960		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
1961	datablocks = DIP(ip, i_blocks) - extblocks;
1962	if ((flags & IO_NORMAL) == 0) {
1963		freeblks->fb_oldsize = 0;
1964		freeblks->fb_chkcnt = 0;
1965	} else {
1966		freeblks->fb_oldsize = ip->i_size;
1967		ip->i_size = 0;
1968		DIP(ip, i_size) = 0;
1969		freeblks->fb_chkcnt = datablocks;
1970		for (i = 0; i < NDADDR; i++) {
1971			freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
1972			DIP(ip, i_db[i]) = 0;
1973		}
1974		for (i = 0; i < NIADDR; i++) {
1975			freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
1976			DIP(ip, i_ib[i]) = 0;
1977		}
1978		/*
1979		 * If the file was removed, then the space being freed was
1980		 * accounted for then (see softdep_releasefile()). If the
1981		 * file is merely being truncated, then we account for it now.
1982		 */
1983		if ((ip->i_flag & IN_SPACECOUNTED) == 0)
1984			fs->fs_pendingblocks += datablocks;
1985	}
1986	if ((flags & IO_EXT) == 0) {
1987		freeblks->fb_oldextsize = 0;
1988	} else {
1989		freeblks->fb_oldextsize = ip->i_din2->di_extsize;
1990		ip->i_din2->di_extsize = 0;
1991		freeblks->fb_chkcnt += extblocks;
1992		for (i = 0; i < NXADDR; i++) {
1993			freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
1994			ip->i_din2->di_extb[i] = 0;
1995		}
1996	}
1997	DIP(ip, i_blocks) -= freeblks->fb_chkcnt;
1998	/*
1999	 * Push the zero'ed inode to its disk buffer so that we are free
2000	 * to delete its dependencies below. Once the dependencies are gone
2001	 * the buffer can be safely released.
2002	 */
2003	if ((error = bread(ip->i_devvp,
2004	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
2005	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
2006		brelse(bp);
2007		softdep_error("softdep_setup_freeblocks", error);
2008	}
2009	if (ip->i_ump->um_fstype == UFS1)
2010		*((struct ufs1_dinode *)bp->b_data +
2011		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
2012	else
2013		*((struct ufs2_dinode *)bp->b_data +
2014		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
2015	/*
2016	 * Find and eliminate any inode dependencies.
2017	 */
2018	ACQUIRE_LOCK(&lk);
2019	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
2020	if ((inodedep->id_state & IOSTARTED) != 0) {
2021		FREE_LOCK(&lk);
2022		panic("softdep_setup_freeblocks: inode busy");
2023	}
2024	/*
2025	 * Add the freeblks structure to the list of operations that
2026	 * must await the zero'ed inode being written to disk. If we
2027	 * still have a bitmap dependency (delay == 0), then the inode
2028	 * has never been written to disk, so we can process the
2029	 * freeblks below once we have deleted the dependencies.
2030	 */
2031	delay = (inodedep->id_state & DEPCOMPLETE);
2032	if (delay)
2033		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
2034	/*
2035	 * Because the file length has been truncated to zero, any
2036	 * pending block allocation dependency structures associated
2037	 * with this inode are obsolete and can simply be de-allocated.
2038	 * We must first merge the two dependency lists to get rid of
2039	 * any duplicate freefrag structures, then purge the merged list.
2040	 * If we still have a bitmap dependency, then the inode has never
2041	 * been written to disk, so we can free any fragments without delay.
2042	 */
2043	if (flags & IO_NORMAL) {
2044		merge_inode_lists(&inodedep->id_newinoupdt,
2045		    &inodedep->id_inoupdt);
2046		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2047			free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2048	}
2049	if (flags & IO_EXT) {
2050		merge_inode_lists(&inodedep->id_newextupdt,
2051		    &inodedep->id_extupdt);
2052		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
2053			free_allocdirect(&inodedep->id_extupdt, adp, delay);
2054	}
2055	FREE_LOCK(&lk);
2056	bdwrite(bp);
2057	/*
2058	 * We must wait for any I/O in progress to finish so that
2059	 * all potential buffers on the dirty list will be visible.
2060	 * Once they are all there, walk the list and get rid of
2061	 * any dependencies.
2062	 */
2063	vp = ITOV(ip);
2064	ACQUIRE_LOCK(&lk);
2065	VI_LOCK(vp);
2066	drain_output(vp, 1);
2067restart:
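	/*
	 * Only process buffers holding the data being freed: extended
	 * attribute buffers (BX_ALTDATA) are handled only when IO_EXT is
	 * requested and regular data buffers only when IO_NORMAL is
	 * requested.
	 */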
2068	TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
2069		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
2070		    ((flags & IO_NORMAL) == 0 &&
2071		      (bp->b_xflags & BX_ALTDATA) == 0))
2072			continue;
2073		if ((bp = getdirtybuf(&bp, VI_MTX(vp), MNT_WAIT)) == NULL)
2074			goto restart;
2075		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
2076		deallocate_dependencies(bp, inodedep);
2077		bp->b_flags |= B_INVAL | B_NOCACHE;
2078		FREE_LOCK(&lk);
2079		brelse(bp);
2080		ACQUIRE_LOCK(&lk);
2081		VI_LOCK(vp);
2082		goto restart;
2083	}
2084	VI_UNLOCK(vp);
2085	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
2086		(void) free_inodedep(inodedep);
2087	FREE_LOCK(&lk);
2088	/*
2089	 * If the inode has never been written to disk (delay == 0),
2090	 * then we can process the freeblks now that we have deleted
2091	 * the dependencies.
2092	 */
2093	if (!delay)
2094		handle_workitem_freeblocks(freeblks, 0);
2095}
2096
2097/*
2098 * Reclaim any dependency structures from a buffer that is about to
2099 * be reallocated to a new vnode. The buffer must be locked; thus, no I/O
2100 * completion operations can occur while we are manipulating its
2101 * associated dependencies. The mutex is held so that other I/Os
2102 * associated with related dependencies do not occur.
2103 */
2104static void
2105deallocate_dependencies(bp, inodedep)
2106	struct buf *bp;
2107	struct inodedep *inodedep;
2108{
2109	struct worklist *wk;
2110	struct indirdep *indirdep;
2111	struct allocindir *aip;
2112	struct pagedep *pagedep;
2113	struct dirrem *dirrem;
2114	struct diradd *dap;
2115	int i;
2116
2117	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2118		switch (wk->wk_type) {
2119
2120		case D_INDIRDEP:
2121			indirdep = WK_INDIRDEP(wk);
2122			/*
2123			 * None of the indirect pointers will ever be visible,
2124			 * so they can simply be tossed. GOINGAWAY ensures
2125			 * that allocated pointers will be saved in the buffer
2126			 * cache until they are freed. Note that they will
2127			 * only be able to be found by their physical address
2128			 * since the inode mapping the logical address will
2129			 * be gone. The save buffer used for the safe copy
2130			 * was allocated in setup_allocindir_phase2 using
2131			 * the physical address so it could be used for this
2132			 * purpose. Hence we swap the safe copy with the real
2133			 * copy, allowing the safe copy to be freed and holding
2134			 * on to the real copy for later use in indir_trunc.
2135			 */
2136			if (indirdep->ir_state & GOINGAWAY) {
2137				FREE_LOCK(&lk);
2138				panic("deallocate_dependencies: already gone");
2139			}
2140			indirdep->ir_state |= GOINGAWAY;
2141			VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
2142			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2143				free_allocindir(aip, inodedep);
2144			if (bp->b_lblkno >= 0 ||
2145			    bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
2146				FREE_LOCK(&lk);
2147				panic("deallocate_dependencies: not indir");
2148			}
2149			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2150			    bp->b_bcount);
2151			WORKLIST_REMOVE(wk);
2152			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2153			continue;
2154
2155		case D_PAGEDEP:
2156			pagedep = WK_PAGEDEP(wk);
2157			/*
2158			 * None of the directory additions will ever be
2159			 * visible, so they can simply be tossed.
2160			 */
2161			for (i = 0; i < DAHASHSZ; i++)
2162				while ((dap =
2163				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
2164					free_diradd(dap);
2165			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2166				free_diradd(dap);
2167			/*
2168			 * Copy any directory remove dependencies to the list
2169			 * to be processed after the zero'ed inode is written.
2170			 * If the inode has already been written, then they
2171			 * can be dumped directly onto the work list.
2172			 */
2173			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
2174				LIST_REMOVE(dirrem, dm_next);
2175				dirrem->dm_dirinum = pagedep->pd_ino;
2176				if (inodedep == NULL ||
2177				    (inodedep->id_state & ALLCOMPLETE) ==
2178				     ALLCOMPLETE)
2179					add_to_worklist(&dirrem->dm_list);
2180				else
2181					WORKLIST_INSERT(&inodedep->id_bufwait,
2182					    &dirrem->dm_list);
2183			}
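			/*
			 * A newly allocated directory block is tracked by
			 * a newdirblk work item. Find it on the inodedep's
			 * buffer wait list and release it before tossing
			 * the pagedep.
			 */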
2184			if ((pagedep->pd_state & NEWBLOCK) != 0) {
2185				LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2186					if (wk->wk_type == D_NEWDIRBLK &&
2187					    WK_NEWDIRBLK(wk)->db_pagedep ==
2188					      pagedep)
2189						break;
2190				if (wk != NULL) {
2191					WORKLIST_REMOVE(wk);
2192					free_newdirblk(WK_NEWDIRBLK(wk));
2193				} else {
2194					FREE_LOCK(&lk);
2195					panic("deallocate_dependencies: "
2196					      "lost pagedep");
2197				}
2198			}
2199			WORKLIST_REMOVE(&pagedep->pd_list);
2200			LIST_REMOVE(pagedep, pd_hash);
2201			WORKITEM_FREE(pagedep, D_PAGEDEP);
2202			continue;
2203
2204		case D_ALLOCINDIR:
2205			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2206			continue;
2207
2208		case D_ALLOCDIRECT:
2209		case D_INODEDEP:
2210			FREE_LOCK(&lk);
2211			panic("deallocate_dependencies: Unexpected type %s",
2212			    TYPENAME(wk->wk_type));
2213			/* NOTREACHED */
2214
2215		default:
2216			FREE_LOCK(&lk);
2217			panic("deallocate_dependencies: Unknown type %s",
2218			    TYPENAME(wk->wk_type));
2219			/* NOTREACHED */
2220		}
2221	}
2222}
2223
2224/*
2225 * Free an allocdirect. Generate a new freefrag work request if appropriate.
2226 * This routine must be called with splbio interrupts blocked.
2227 */
2228static void
2229free_allocdirect(adphead, adp, delay)
2230	struct allocdirectlst *adphead;
2231	struct allocdirect *adp;
2232	int delay;
2233{
2234	struct newdirblk *newdirblk;
2235	struct worklist *wk;
2236
2237#ifdef DEBUG
2238	if (lk.lkt_held == NOHOLDER)
2239		panic("free_allocdirect: lock not held");
2240#endif
2241	if ((adp->ad_state & DEPCOMPLETE) == 0)
2242		LIST_REMOVE(adp, ad_deps);
2243	TAILQ_REMOVE(adphead, adp, ad_next);
2244	if ((adp->ad_state & COMPLETE) == 0)
2245		WORKLIST_REMOVE(&adp->ad_list);
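	/*
	 * Queue any pending freefrag; if delay is set it must await the
	 * inode block being written and is placed on the inodedep's
	 * buffer wait list, otherwise it is handed to the work daemon.
	 */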
2246	if (adp->ad_freefrag != NULL) {
2247		if (delay)
2248			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2249			    &adp->ad_freefrag->ff_list);
2250		else
2251			add_to_worklist(&adp->ad_freefrag->ff_list);
2252	}
2253	if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2254		newdirblk = WK_NEWDIRBLK(wk);
2255		WORKLIST_REMOVE(&newdirblk->db_list);
2256		if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
2257			panic("free_allocdirect: extra newdirblk");
2258		if (delay)
2259			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2260			    &newdirblk->db_list);
2261		else
2262			free_newdirblk(newdirblk);
2263	}
2264	WORKITEM_FREE(adp, D_ALLOCDIRECT);
2265}
2266
2267/*
2268 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2269 * This routine must be called with splbio interrupts blocked.
2270 */
2271static void
2272free_newdirblk(newdirblk)
2273	struct newdirblk *newdirblk;
2274{
2275	struct pagedep *pagedep;
2276	struct diradd *dap;
2277	int i;
2278
2279#ifdef DEBUG
2280	if (lk.lkt_held == NOHOLDER)
2281		panic("free_newdirblk: lock not held");
2282#endif
2283	/*
2284	 * If the pagedep is still linked onto the directory buffer
2285	 * dependency chain, then some of the entries on the
2286	 * pd_pendinghd list may not be committed to disk yet. In
2287	 * this case, we will simply clear the NEWBLOCK flag and
2288	 * let the pd_pendinghd list be processed when the pagedep
2289	 * is next written. If the pagedep is no longer on the buffer
2290	 * dependency chain, then all the entries on the pd_pending
2291	 * list are committed to disk and we can free them here.
2292	 */
2293	pagedep = newdirblk->db_pagedep;
2294	pagedep->pd_state &= ~NEWBLOCK;
2295	if ((pagedep->pd_state & ONWORKLIST) == 0)
2296		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2297			free_diradd(dap);
2298	/*
2299	 * If no dependencies remain, the pagedep will be freed.
2300	 */
2301	for (i = 0; i < DAHASHSZ; i++)
2302		if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
2303			break;
2304	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2305		LIST_REMOVE(pagedep, pd_hash);
2306		WORKITEM_FREE(pagedep, D_PAGEDEP);
2307	}
2308	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2309}
2310
2311/*
2312 * Prepare an inode to be freed. The actual free operation is not
2313 * done until the zero'ed inode has been written to disk.
2314 */
2315void
2316softdep_freefile(pvp, ino, mode)
2317	struct vnode *pvp;
2318	ino_t ino;
2319	int mode;
2320{
2321	struct inode *ip = VTOI(pvp);
2322	struct inodedep *inodedep;
2323	struct freefile *freefile;
2324
2325	/*
2326	 * This sets up the inode de-allocation dependency.
2327	 */
2328	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
2329		M_FREEFILE, M_SOFTDEP_FLAGS);
2330	freefile->fx_list.wk_type = D_FREEFILE;
2331	freefile->fx_list.wk_state = 0;
2332	freefile->fx_mode = mode;
2333	freefile->fx_oldinum = ino;
2334	freefile->fx_devvp = ip->i_devvp;
2335	freefile->fx_mnt = ITOV(ip)->v_mount;
2336	if ((ip->i_flag & IN_SPACECOUNTED) == 0)
2337		ip->i_fs->fs_pendinginodes += 1;
2338
2339	/*
2340	 * If the inodedep does not exist, then the zero'ed inode has
2341	 * been written to disk. If the allocated inode has never been
2342	 * written to disk, then the on-disk inode is zero'ed. In either
2343	 * case we can free the file immediately.
2344	 */
2345	ACQUIRE_LOCK(&lk);
2346	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2347	    check_inode_unwritten(inodedep)) {
2348		FREE_LOCK(&lk);
2349		handle_workitem_freefile(freefile);
2350		return;
2351	}
2352	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2353	FREE_LOCK(&lk);
2354}
2355
2356/*
2357 * Check to see if an inode has never been written to disk. If
2358 * so, free the inodedep and return success; otherwise return failure.
2359 * This routine must be called with splbio interrupts blocked.
2360 *
2361 * If we still have a bitmap dependency, then the inode has never
2362 * been written to disk. Drop the dependency as it is no longer
2363 * necessary since the inode is being deallocated. We set the
2364 * ALLCOMPLETE flags since the bitmap now properly shows that the
2365 * inode is not allocated. Even if the inode is actively being
2366 * written, it has been rolled back to its zero'ed state, so we
2367 * are ensured that a zero inode is what is on the disk. For short
2368 * lived files, this change will usually result in removing all the
2369 * dependencies from the inode so that it can be freed immediately.
2370 */
2371static int
2372check_inode_unwritten(inodedep)
2373	struct inodedep *inodedep;
2374{
2375
2376	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2377	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2378	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2379	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
2380	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2381	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2382	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
2383	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
2384	    inodedep->id_nlinkdelta != 0)
2385		return (0);
2386	inodedep->id_state |= ALLCOMPLETE;
2387	LIST_REMOVE(inodedep, id_deps);
2388	inodedep->id_buf = NULL;
2389	if (inodedep->id_state & ONWORKLIST)
2390		WORKLIST_REMOVE(&inodedep->id_list);
2391	if (inodedep->id_savedino1 != NULL) {
2392		FREE(inodedep->id_savedino1, M_INODEDEP);
2393		inodedep->id_savedino1 = NULL;
2394	}
2395	if (free_inodedep(inodedep) == 0) {
2396		FREE_LOCK(&lk);
2397		panic("check_inode_unwritten: busy inode");
2398	}
2399	return (1);
2400}
2401
2402/*
2403 * Try to free an inodedep structure. Return 1 if it could be freed.
2404 */
2405static int
2406free_inodedep(inodedep)
2407	struct inodedep *inodedep;
2408{
2409
2410	if ((inodedep->id_state & ONWORKLIST) != 0 ||
2411	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2412	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2413	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2414	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
2415	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2416	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2417	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
2418	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
2419	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2420		return (0);
2421	LIST_REMOVE(inodedep, id_hash);
2422	WORKITEM_FREE(inodedep, D_INODEDEP);
2423	num_inodedep -= 1;
2424	return (1);
2425}
2426
2427/*
2428 * This workitem routine performs the block de-allocation.
2429 * The workitem is added to the pending list after the updated
2430 * inode block has been written to disk.  As mentioned above,
2431 * checks regarding the number of blocks de-allocated (compared
2432 * to the number of blocks allocated for the file) are also
2433 * performed in this function.
2434 */
2435static void
2436handle_workitem_freeblocks(freeblks, flags)
2437	struct freeblks *freeblks;
2438	int flags;
2439{
2440	struct inode *ip;
2441	struct vnode *vp;
2442	struct fs *fs;
2443	int i, nblocks, level, bsize;
2444	ufs2_daddr_t bn, blocksreleased = 0;
2445	int error, allerror = 0;
2446	ufs_lbn_t baselbns[NIADDR], tmpval;
2447
2448	fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
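	/*
	 * Compute the first logical block number mapped by each level of
	 * indirect block: the single indirect begins at NDADDR and each
	 * deeper level begins where the span of the previous one ends.
	 */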
2449	tmpval = 1;
2450	baselbns[0] = NDADDR;
2451	for (i = 1; i < NIADDR; i++) {
2452		tmpval *= NINDIR(fs);
2453		baselbns[i] = baselbns[i - 1] + tmpval;
2454	}
2455	nblocks = btodb(fs->fs_bsize);
2456	blocksreleased = 0;
2457	/*
2458	 * Release all extended attribute blocks or frags.
2459	 */
2460	if (freeblks->fb_oldextsize > 0) {
2461		for (i = (NXADDR - 1); i >= 0; i--) {
2462			if ((bn = freeblks->fb_eblks[i]) == 0)
2463				continue;
2464			bsize = sblksize(fs, freeblks->fb_oldextsize, i);
2465			ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize,
2466			    freeblks->fb_previousinum);
2467			blocksreleased += btodb(bsize);
2468		}
2469	}
2470	/*
2471	 * Release all data blocks or frags.
2472	 */
2473	if (freeblks->fb_oldsize > 0) {
2474		/*
2475		 * Indirect blocks first.
2476		 */
2477		for (level = (NIADDR - 1); level >= 0; level--) {
2478			if ((bn = freeblks->fb_iblks[level]) == 0)
2479				continue;
2480			if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
2481			    level, baselbns[level], &blocksreleased)) != 0)
2482				allerror = error;
2483			ffs_blkfree(fs, freeblks->fb_devvp, bn, fs->fs_bsize,
2484			    freeblks->fb_previousinum);
2485			fs->fs_pendingblocks -= nblocks;
2486			blocksreleased += nblocks;
2487		}
2488		/*
2489		 * All direct blocks or frags.
2490		 */
2491		for (i = (NDADDR - 1); i >= 0; i--) {
2492			if ((bn = freeblks->fb_dblks[i]) == 0)
2493				continue;
2494			bsize = sblksize(fs, freeblks->fb_oldsize, i);
2495			ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize,
2496			    freeblks->fb_previousinum);
2497			fs->fs_pendingblocks -= btodb(bsize);
2498			blocksreleased += btodb(bsize);
2499		}
2500	}
2501	/*
2502	 * If we still have not finished background cleanup, then check
2503	 * to see if the block count needs to be adjusted.
2504	 */
2505	if (freeblks->fb_chkcnt != blocksreleased &&
2506	    (fs->fs_flags & FS_UNCLEAN) != 0 &&
2507	    VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum,
2508	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
2509		ip = VTOI(vp);
2510		DIP(ip, i_blocks) += freeblks->fb_chkcnt - blocksreleased;
2511		ip->i_flag |= IN_CHANGE;
2512		vput(vp);
2513	}
2514
2515#ifdef DIAGNOSTIC
2516	if (freeblks->fb_chkcnt != blocksreleased &&
2517	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2518		printf("handle_workitem_freeblocks: block count\n");
2519	if (allerror)
2520		softdep_error("handle_workitem_freeblocks", allerror);
2521#endif /* DIAGNOSTIC */
2522
2523	WORKITEM_FREE(freeblks, D_FREEBLKS);
2524}
2525
2526/*
2527 * Release the blocks referenced by the indirect block dbn, which belongs
2528 * to the file described by freeblks. If level is greater than zero, the
2529 * referenced blocks are themselves indirect blocks and indir_trunc is
2530 * called recursively to release the blocks below them.
2531 */
2532static int
2533indir_trunc(freeblks, dbn, level, lbn, countp)
2534	struct freeblks *freeblks;
2535	ufs2_daddr_t dbn;
2536	int level;
2537	ufs_lbn_t lbn;
2538	ufs2_daddr_t *countp;
2539{
2540	struct buf *bp;
2541	struct fs *fs;
2542	struct worklist *wk;
2543	struct indirdep *indirdep;
2544	ufs1_daddr_t *bap1 = 0;
2545	ufs2_daddr_t nb, *bap2 = 0;
2546	ufs_lbn_t lbnadd;
2547	int i, nblocks, ufs1fmt;
2548	int error, allerror = 0;
2549
2550	fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
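	/*
	 * Each pointer in an indirect block at this level spans
	 * NINDIR(fs)^level logical blocks; lbnadd is used to compute the
	 * logical block numbers passed to recursive calls.
	 */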
2551	lbnadd = 1;
2552	for (i = level; i > 0; i--)
2553		lbnadd *= NINDIR(fs);
2554	/*
2555	 * Get buffer of block pointers to be freed. This routine is not
2556	 * called until the zero'ed inode has been written, so it is safe
2557	 * to free blocks as they are encountered. Because the inode has
2558	 * been zero'ed, calls to bmap on these blocks will fail. So, we
2559	 * have to use the on-disk address and the block device for the
2560	 * filesystem to look them up. If the file was deleted before its
2561	 * indirect blocks were all written to disk, the routine that set
2562	 * us up (deallocate_dependencies) will have arranged to leave
2563	 * a complete copy of the indirect block in memory for our use.
2564	 * Otherwise we have to read the blocks in from the disk.
2565	 */
2566	ACQUIRE_LOCK(&lk);
2567	/* XXX Buf not locked! */
2568	if ((bp = incore(freeblks->fb_devvp, dbn)) != NULL &&
2569	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2570		if (wk->wk_type != D_INDIRDEP ||
2571		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2572		    (indirdep->ir_state & GOINGAWAY) == 0) {
2573			FREE_LOCK(&lk);
2574			panic("indir_trunc: lost indirdep");
2575		}
2576		WORKLIST_REMOVE(wk);
2577		WORKITEM_FREE(indirdep, D_INDIRDEP);
2578		if (LIST_FIRST(&bp->b_dep) != NULL) {
2579			FREE_LOCK(&lk);
2580			panic("indir_trunc: dangling dep");
2581		}
2582		VFSTOUFS(freeblks->fb_mnt)->um_numindirdeps -= 1;
2583		FREE_LOCK(&lk);
2584	} else {
2585		FREE_LOCK(&lk);
2586		error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
2587		    NOCRED, &bp);
2588		if (error) {
2589			brelse(bp);
2590			return (error);
2591		}
2592	}
2593	/*
2594	 * Recursively free indirect blocks.
2595	 */
2596	if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UFS1) {
2597		ufs1fmt = 1;
2598		bap1 = (ufs1_daddr_t *)bp->b_data;
2599	} else {
2600		ufs1fmt = 0;
2601		bap2 = (ufs2_daddr_t *)bp->b_data;
2602	}
2603	nblocks = btodb(fs->fs_bsize);
2604	for (i = NINDIR(fs) - 1; i >= 0; i--) {
2605		if (ufs1fmt)
2606			nb = bap1[i];
2607		else
2608			nb = bap2[i];
2609		if (nb == 0)
2610			continue;
2611		if (level != 0) {
2612			if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
2613			     level - 1, lbn + (i * lbnadd), countp)) != 0)
2614				allerror = error;
2615		}
2616		ffs_blkfree(fs, freeblks->fb_devvp, nb, fs->fs_bsize,
2617		    freeblks->fb_previousinum);
2618		fs->fs_pendingblocks -= nblocks;
2619		*countp += nblocks;
2620	}
2621	bp->b_flags |= B_INVAL | B_NOCACHE;
2622	brelse(bp);
2623	return (allerror);
2624}
2625
2626/*
2627 * Free an allocindir.
2628 * This routine must be called with splbio interrupts blocked.
2629 */
2630static void
2631free_allocindir(aip, inodedep)
2632	struct allocindir *aip;
2633	struct inodedep *inodedep;
2634{
2635	struct freefrag *freefrag;
2636
2637#ifdef DEBUG
2638	if (lk.lkt_held == NOHOLDER)
2639		panic("free_allocindir: lock not held");
2640#endif
2641	if ((aip->ai_state & DEPCOMPLETE) == 0)
2642		LIST_REMOVE(aip, ai_deps);
2643	if (aip->ai_state & ONWORKLIST)
2644		WORKLIST_REMOVE(&aip->ai_list);
2645	LIST_REMOVE(aip, ai_next);
2646	if ((freefrag = aip->ai_freefrag) != NULL) {
2647		if (inodedep == NULL)
2648			add_to_worklist(&freefrag->ff_list);
2649		else
2650			WORKLIST_INSERT(&inodedep->id_bufwait,
2651			    &freefrag->ff_list);
2652	}
2653	WORKITEM_FREE(aip, D_ALLOCINDIR);
2654}
2655
2656/*
2657 * Directory entry addition dependencies.
2658 *
2659 * When adding a new directory entry, the inode (with its incremented link
2660 * count) must be written to disk before the directory entry's pointer to it.
2661 * Also, if the inode is newly allocated, the corresponding freemap must be
2662 * updated (on disk) before the directory entry's pointer. These requirements
2663 * are met via undo/redo on the directory entry's pointer, which consists
2664 * simply of the inode number.
2665 *
2666 * As directory entries are added and deleted, the free space within a
2667 * directory block can become fragmented.  The ufs filesystem will compact
2668 * a fragmented directory block to make space for a new entry. When this
2669 * occurs, the offsets of previously added entries change. Any "diradd"
2670 * dependency structures corresponding to these entries must be updated with
2671 * the new offsets.
2672 */
2673
2674/*
2675 * This routine is called after the in-memory inode's link
2676 * count has been incremented, but before the directory entry's
2677 * pointer to the inode has been set.
2678 */
2679int
2680softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2681	struct buf *bp;		/* buffer containing directory block */
2682	struct inode *dp;	/* inode for directory */
2683	off_t diroffset;	/* offset of new entry in directory */
2684	ino_t newinum;		/* inode referenced by new directory entry */
2685	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
2686	int isnewblk;		/* entry is in a newly allocated block */
2687{
2688	int offset;		/* offset of new entry within directory block */
2689	ufs_lbn_t lbn;		/* block in directory containing new entry */
2690	struct fs *fs;
2691	struct diradd *dap;
2692	struct allocdirect *adp;
2693	struct pagedep *pagedep;
2694	struct inodedep *inodedep;
2695	struct newdirblk *newdirblk = 0;
2696	struct mkdir *mkdir1, *mkdir2;
2697
2698	/*
2699	 * Whiteouts have no dependencies.
2700	 */
2701	if (newinum == WINO) {
2702		if (newdirbp != NULL)
2703			bdwrite(newdirbp);
2704		return (0);
2705	}
2706
2707	fs = dp->i_fs;
2708	lbn = lblkno(fs, diroffset);
2709	offset = blkoff(fs, diroffset);
2710	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2711		M_SOFTDEP_FLAGS|M_ZERO);
2712	dap->da_list.wk_type = D_DIRADD;
2713	dap->da_offset = offset;
2714	dap->da_newinum = newinum;
2715	dap->da_state = ATTACHED;
2716	if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2717		MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
2718		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
2719		newdirblk->db_list.wk_type = D_NEWDIRBLK;
2720		newdirblk->db_state = 0;
2721	}
2722	if (newdirbp == NULL) {
2723		dap->da_state |= DEPCOMPLETE;
2724		ACQUIRE_LOCK(&lk);
2725	} else {
2726		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
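		/*
		 * A mkdir needs two extra dependencies: MKDIR_BODY, cleared
		 * once the new directory's block holding "." and ".."
		 * (newdirbp) has been written, and MKDIR_PARENT, cleared
		 * once the parent's increased link count has been written.
		 */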
2727		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2728		    M_SOFTDEP_FLAGS);
2729		mkdir1->md_list.wk_type = D_MKDIR;
2730		mkdir1->md_state = MKDIR_BODY;
2731		mkdir1->md_diradd = dap;
2732		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
2733		    M_SOFTDEP_FLAGS);
2734		mkdir2->md_list.wk_type = D_MKDIR;
2735		mkdir2->md_state = MKDIR_PARENT;
2736		mkdir2->md_diradd = dap;
2737		/*
2738		 * Dependency on "." and ".." being written to disk.
2739		 */
2740		mkdir1->md_buf = newdirbp;
2741		ACQUIRE_LOCK(&lk);
2742		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2743		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2744		FREE_LOCK(&lk);
2745		bdwrite(newdirbp);
2746		/*
2747		 * Dependency on link count increase for parent directory
2748		 */
2749		ACQUIRE_LOCK(&lk);
2750		if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
2751		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2752			dap->da_state &= ~MKDIR_PARENT;
2753			WORKITEM_FREE(mkdir2, D_MKDIR);
2754		} else {
2755			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2756			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2757		}
2758	}
2759	/*
2760	 * Link into parent directory pagedep to await its being written.
2761	 */
2762	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2763		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2764	dap->da_pagedep = pagedep;
2765	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2766	    da_pdlist);
2767	/*
2768	 * Link into its inodedep. Put it on the id_bufwait list if the inode
2769	 * is not yet written. If it is written, do the post-inode write
2770	 * processing to put it on the id_pendinghd list.
2771	 */
2772	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2773	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2774		diradd_inode_written(dap, inodedep);
2775	else
2776		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2777	if (isnewblk) {
2778		/*
2779		 * Directories growing into indirect blocks are rare
2780		 * enough, and new block allocation in those cases is
2781		 * rarer still, so we choose not
2782		 * to bother tracking them. Rather we simply force the
2783		 * new directory entry to disk.
2784		 */
2785		if (lbn >= NDADDR) {
2786			FREE_LOCK(&lk);
2787			/*
2788			 * We only have a new allocation when at the
2789			 * beginning of a new block, not when we are
2790			 * expanding into an existing block.
2791			 */
2792			if (blkoff(fs, diroffset) == 0)
2793				return (1);
2794			return (0);
2795		}
2796		/*
2797		 * We only have a new allocation when at the beginning
2798		 * of a new fragment, not when we are expanding into an
2799		 * existing fragment. Also, there is nothing to do if we
2800		 * are already tracking this block.
2801		 */
2802		if (fragoff(fs, diroffset) != 0) {
2803			FREE_LOCK(&lk);
2804			return (0);
2805		}
2806		if ((pagedep->pd_state & NEWBLOCK) != 0) {
2807			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2808			FREE_LOCK(&lk);
2809			return (0);
2810		}
2811		/*
2812		 * Find our associated allocdirect and have it track us.
2813		 */
2814		if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
2815			panic("softdep_setup_directory_add: lost inodedep");
2816		adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
2817		if (adp == NULL || adp->ad_lbn != lbn) {
2818			FREE_LOCK(&lk);
2819			panic("softdep_setup_directory_add: lost entry");
2820		}
2821		pagedep->pd_state |= NEWBLOCK;
2822		newdirblk->db_pagedep = pagedep;
2823		WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
2824	}
2825	FREE_LOCK(&lk);
2826	return (0);
2827}
2828
2829/*
2830 * This procedure is called to change the offset of a directory
2831 * entry when compacting a directory block which must be owned
2832 * exclusively by the caller. Note that the actual entry movement
2833 * must be done in this procedure to ensure that no I/O completions
2834 * occur while the move is in progress.
2835 */
2836void
2837softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2838	struct inode *dp;	/* inode for directory */
2839	caddr_t base;		/* address of dp->i_offset */
2840	caddr_t oldloc;		/* address of old directory location */
2841	caddr_t newloc;		/* address of new directory location */
2842	int entrysize;		/* size of directory entry */
2843{
2844	int offset, oldoffset, newoffset;
2845	struct pagedep *pagedep;
2846	struct diradd *dap;
2847	ufs_lbn_t lbn;
2848
2849	ACQUIRE_LOCK(&lk);
2850	lbn = lblkno(dp->i_fs, dp->i_offset);
2851	offset = blkoff(dp->i_fs, dp->i_offset);
2852	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2853		goto done;
2854	oldoffset = offset + (oldloc - base);
2855	newoffset = offset + (newloc - base);
2856
2857	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2858		if (dap->da_offset != oldoffset)
2859			continue;
2860		dap->da_offset = newoffset;
2861		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2862			break;
2863		LIST_REMOVE(dap, da_pdlist);
2864		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2865		    dap, da_pdlist);
2866		break;
2867	}
2868	if (dap == NULL) {
2869
2870		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2871			if (dap->da_offset == oldoffset) {
2872				dap->da_offset = newoffset;
2873				break;
2874			}
2875		}
2876	}
2877done:
2878	bcopy(oldloc, newloc, entrysize);
2879	FREE_LOCK(&lk);
2880}
2881
2882/*
2883 * Free a diradd dependency structure. This routine must be called
2884 * with splbio interrupts blocked.
2885 */
2886static void
2887free_diradd(dap)
2888	struct diradd *dap;
2889{
2890	struct dirrem *dirrem;
2891	struct pagedep *pagedep;
2892	struct inodedep *inodedep;
2893	struct mkdir *mkdir, *nextmd;
2894
2895#ifdef DEBUG
2896	if (lk.lkt_held == NOHOLDER)
2897		panic("free_diradd: lock not held");
2898#endif
2899	WORKLIST_REMOVE(&dap->da_list);
2900	LIST_REMOVE(dap, da_pdlist);
2901	if ((dap->da_state & DIRCHG) == 0) {
2902		pagedep = dap->da_pagedep;
2903	} else {
2904		dirrem = dap->da_previous;
2905		pagedep = dirrem->dm_pagedep;
2906		dirrem->dm_dirinum = pagedep->pd_ino;
2907		add_to_worklist(&dirrem->dm_list);
2908	}
2909	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2910	    0, &inodedep) != 0)
2911		(void) free_inodedep(inodedep);
2912	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2913		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2914			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2915			if (mkdir->md_diradd != dap)
2916				continue;
2917			dap->da_state &= ~mkdir->md_state;
2918			WORKLIST_REMOVE(&mkdir->md_list);
2919			LIST_REMOVE(mkdir, md_mkdirs);
2920			WORKITEM_FREE(mkdir, D_MKDIR);
2921		}
2922		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2923			FREE_LOCK(&lk);
2924			panic("free_diradd: unfound ref");
2925		}
2926	}
2927	WORKITEM_FREE(dap, D_DIRADD);
2928}
2929
2930/*
2931 * Directory entry removal dependencies.
2932 *
2933 * When removing a directory entry, the entry's inode pointer must be
2934 * zero'ed on disk before the corresponding inode's link count is decremented
2935 * (possibly freeing the inode for re-use). This dependency is handled by
2936 * updating the directory entry but delaying the inode count reduction until
2937 * after the directory block has been written to disk. After this point, the
2938 * inode count can be decremented whenever it is convenient.
2939 */
2940
2941/*
2942 * This routine should be called immediately after removing
2943 * a directory entry.  The inode's link count should not be
2944 * decremented by the calling procedure -- the soft updates
2945 * code will do this task when it is safe.
2946 */
2947void
2948softdep_setup_remove(bp, dp, ip, isrmdir)
2949	struct buf *bp;		/* buffer containing directory block */
2950	struct inode *dp;	/* inode for the directory being modified */
2951	struct inode *ip;	/* inode for directory entry being removed */
2952	int isrmdir;		/* indicates if doing RMDIR */
2953{
2954	struct dirrem *dirrem, *prevdirrem;
2955
2956	/*
2957	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2958	 */
2959	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2960
2961	/*
2962	 * If the COMPLETE flag is clear, then there were no active
2963	 * entries and we want to roll back to a zeroed entry until
2964	 * the new inode is committed to disk. If the COMPLETE flag is
2965	 * set then we have deleted an entry that never made it to
2966	 * disk. If the entry we deleted resulted from a name change,
2967	 * then the old name still resides on disk. We cannot delete
2968	 * its inode (returned to us in prevdirrem) until the zeroed
2969	 * directory entry gets to disk. The new inode has never been
2970	 * referenced on the disk, so can be deleted immediately.
2971	 */
2972	if ((dirrem->dm_state & COMPLETE) == 0) {
2973		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2974		    dm_next);
2975		FREE_LOCK(&lk);
2976	} else {
2977		if (prevdirrem != NULL)
2978			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2979			    prevdirrem, dm_next);
2980		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2981		FREE_LOCK(&lk);
2982		handle_workitem_remove(dirrem, NULL);
2983	}
2984}
2985
2986/*
2987 * Allocate a new dirrem if appropriate and return it along with
2988 * its associated pagedep. Called without a lock, returns with lock.
2989 */
2990static long num_dirrem;		/* number of dirrem allocated */
2991static struct dirrem *
2992newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2993	struct buf *bp;		/* buffer containing directory block */
2994	struct inode *dp;	/* inode for the directory being modified */
2995	struct inode *ip;	/* inode for directory entry being removed */
2996	int isrmdir;		/* indicates if doing RMDIR */
2997	struct dirrem **prevdirremp; /* previously referenced inode, if any */
2998{
2999	int offset;
3000	ufs_lbn_t lbn;
3001	struct diradd *dap;
3002	struct dirrem *dirrem;
3003	struct pagedep *pagedep;
3004
3005	/*
3006	 * Whiteouts have no deletion dependencies.
3007	 */
3008	if (ip == NULL)
3009		panic("newdirrem: whiteout");
3010	/*
3011	 * If we are over our limit, try to improve the situation.
3012	 * Limiting the number of dirrem structures will also limit
3013	 * the number of freefile and freeblks structures.
3014	 */
3015	if (num_dirrem > max_softdeps / 2)
3016		(void) request_cleanup(FLUSH_REMOVE, 0);
3017	num_dirrem += 1;
3018	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
3019		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
3020	dirrem->dm_list.wk_type = D_DIRREM;
3021	dirrem->dm_state = isrmdir ? RMDIR : 0;
3022	dirrem->dm_mnt = ITOV(ip)->v_mount;
3023	dirrem->dm_oldinum = ip->i_number;
3024	*prevdirremp = NULL;
3025
3026	ACQUIRE_LOCK(&lk);
3027	lbn = lblkno(dp->i_fs, dp->i_offset);
3028	offset = blkoff(dp->i_fs, dp->i_offset);
3029	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3030		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3031	dirrem->dm_pagedep = pagedep;
3032	/*
3033	 * Check for a diradd dependency for the same directory entry.
3034	 * If present, then both dependencies become obsolete and can
3035	 * be de-allocated. Check for an entry on both the pd_diraddhd
3036	 * list and the pd_pendinghd list.
3037	 */
3038
3039	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
3040		if (dap->da_offset == offset)
3041			break;
3042	if (dap == NULL) {
3043
3044		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
3045			if (dap->da_offset == offset)
3046				break;
3047		if (dap == NULL)
3048			return (dirrem);
3049	}
3050	/*
3051	 * Must be ATTACHED at this point.
3052	 */
3053	if ((dap->da_state & ATTACHED) == 0) {
3054		FREE_LOCK(&lk);
3055		panic("newdirrem: not ATTACHED");
3056	}
3057	if (dap->da_newinum != ip->i_number) {
3058		FREE_LOCK(&lk);
3059		panic("newdirrem: inum %d should be %d",
3060		    ip->i_number, dap->da_newinum);
3061	}
3062	/*
3063	 * If we are deleting a changed name that never made it to disk,
3064	 * then return the dirrem describing the previous inode (which
3065	 * represents the inode currently referenced from this entry on disk).
3066	 */
3067	if ((dap->da_state & DIRCHG) != 0) {
3068		*prevdirremp = dap->da_previous;
3069		dap->da_state &= ~DIRCHG;
3070		dap->da_pagedep = pagedep;
3071	}
3072	/*
3073	 * We are deleting an entry that never made it to disk.
3074	 * Mark it COMPLETE so we can delete its inode immediately.
3075	 */
3076	dirrem->dm_state |= COMPLETE;
3077	free_diradd(dap);
3078	return (dirrem);
3079}
3080
3081/*
3082 * Directory entry change dependencies.
3083 *
3084 * Changing an existing directory entry requires that an add operation
3085 * be completed first followed by a deletion. The semantics for the addition
3086 * are identical to the description of adding a new entry above except
3087 * that the rollback is to the old inode number rather than zero. Once
3088 * the addition dependency is completed, the removal is done as described
3089 * in the removal routine above.
3090 */
3091
3092/*
3093 * This routine should be called immediately after changing
3094 * a directory entry.  The inode's link count should not be
3095 * decremented by the calling procedure -- the soft updates
3096 * code will perform this task when it is safe.
3097 */
3098void
3099softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3100	struct buf *bp;		/* buffer containing directory block */
3101	struct inode *dp;	/* inode for the directory being modified */
3102	struct inode *ip;	/* inode for directory entry being removed */
3103	ino_t newinum;		/* new inode number for changed entry */
3104	int isrmdir;		/* indicates if doing RMDIR */
3105{
3106	int offset;
3107	struct diradd *dap = NULL;
3108	struct dirrem *dirrem, *prevdirrem;
3109	struct pagedep *pagedep;
3110	struct inodedep *inodedep;
3111
3112	offset = blkoff(dp->i_fs, dp->i_offset);
3113
3114	/*
3115	 * Whiteouts do not need diradd dependencies.
3116	 */
3117	if (newinum != WINO) {
3118		MALLOC(dap, struct diradd *, sizeof(struct diradd),
3119		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
3120		dap->da_list.wk_type = D_DIRADD;
3121		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3122		dap->da_offset = offset;
3123		dap->da_newinum = newinum;
3124	}
3125
3126	/*
3127	 * Allocate a new dirrem and ACQUIRE_LOCK.
3128	 */
3129	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3130	pagedep = dirrem->dm_pagedep;
3131	/*
3132	 * The possible values for isrmdir:
3133	 *	0 - non-directory file rename
3134	 *	1 - directory rename within same directory
3135	 *   inum - directory rename to new directory of given inode number
3136	 * When renaming to a new directory, we are both deleting and
3137	 * creating a new directory entry, so the link count on the new
3138	 * directory should not change. Thus we do not need the followup
3139	 * dirrem which is usually done in handle_workitem_remove. We set
3140	 * the DIRCHG flag to tell handle_workitem_remove to skip the
3141	 * followup dirrem.
3142	 */
3143	if (isrmdir > 1)
3144		dirrem->dm_state |= DIRCHG;
3145
3146	/*
3147	 * Whiteouts have no additional dependencies,
3148	 * so just put the dirrem on the correct list.
3149	 */
3150	if (newinum == WINO) {
3151		if ((dirrem->dm_state & COMPLETE) == 0) {
3152			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
3153			    dm_next);
3154		} else {
3155			dirrem->dm_dirinum = pagedep->pd_ino;
3156			add_to_worklist(&dirrem->dm_list);
3157		}
3158		FREE_LOCK(&lk);
3159		return;
3160	}
3161
3162	/*
3163	 * If the COMPLETE flag is clear, then there were no active
3164	 * entries and we want to roll back to the previous inode until
3165	 * the new inode is committed to disk. If the COMPLETE flag is
3166	 * set, then we have deleted an entry that never made it to disk.
3167	 * If the entry we deleted resulted from a name change, then the old
3168	 * inode reference still resides on disk. Any rollback that we do
3169	 * needs to be to that old inode (returned to us in prevdirrem). If
3170	 * the entry we deleted resulted from a create, then there is
3171	 * no entry on the disk, so we want to roll back to zero rather
3172	 * than the uncommitted inode. In either of the COMPLETE cases we
3173	 * want to immediately free the unwritten and unreferenced inode.
3174	 */
3175	if ((dirrem->dm_state & COMPLETE) == 0) {
3176		dap->da_previous = dirrem;
3177	} else {
3178		if (prevdirrem != NULL) {
3179			dap->da_previous = prevdirrem;
3180		} else {
3181			dap->da_state &= ~DIRCHG;
3182			dap->da_pagedep = pagedep;
3183		}
3184		dirrem->dm_dirinum = pagedep->pd_ino;
3185		add_to_worklist(&dirrem->dm_list);
3186	}
3187	/*
3188	 * Link into its inodedep. Put it on the id_bufwait list if the inode
3189	 * is not yet written. If it is written, do the post-inode write
3190	 * processing to put it on the id_pendinghd list.
3191	 */
3192	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
3193	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3194		dap->da_state |= COMPLETE;
3195		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3196		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3197	} else {
3198		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3199		    dap, da_pdlist);
3200		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3201	}
3202	FREE_LOCK(&lk);
3203}
3204
3205/*
3206 * Called whenever the link count on an inode is changed.
3207 * It creates an inode dependency so that the new reference(s)
3208 * to the inode cannot be committed to disk until the updated
3209 * inode has been written.
3210 */
3211void
3212softdep_change_linkcnt(ip)
3213	struct inode *ip;	/* the inode with the increased link count */
3214{
3215	struct inodedep *inodedep;
3216
3217	ACQUIRE_LOCK(&lk);
3218	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
3219	if (ip->i_nlink < ip->i_effnlink) {
3220		FREE_LOCK(&lk);
3221		panic("softdep_change_linkcnt: bad delta");
3222	}
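	/*
	 * Record how far the on-disk link count is ahead of the effective
	 * count; softdep_load_inodeblock subtracts this delta to restore
	 * i_effnlink the next time the inode is read in from disk.
	 */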
3223	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3224	FREE_LOCK(&lk);
3225}
3226
3227/*
3228 * Called when the effective link count and the reference count
3229 * on an inode drops to zero. At this point there are no names
3230 * referencing the file in the filesystem and no active file
3231 * references. The space associated with the file will be freed
3232 * as soon as the necessary soft dependencies are cleared.
3233 */
3234void
3235softdep_releasefile(ip)
3236	struct inode *ip;	/* inode with the zero effective link count */
3237{
3238	struct inodedep *inodedep;
3239	struct fs *fs;
3240	int extblocks;
3241
3242	if (ip->i_effnlink > 0)
3243		panic("softdep_releasefile: file still referenced");
3244	/*
3245	 * We may be called several times as the real reference count
3246	 * drops to zero. We only want to account for the space once.
3247	 */
3248	if (ip->i_flag & IN_SPACECOUNTED)
3249		return;
3250	/*
3251	 * We have to deactivate a snapshot; otherwise copy-on-write may
3252	 * add blocks and the cleanup may remove blocks after we have
3253	 * tried to account for them.
3254	 */
3255	if ((ip->i_flags & SF_SNAPSHOT) != 0)
3256		ffs_snapremove(ITOV(ip));
3257	/*
3258	 * If we are tracking an nlinkdelta, we have to also remember
3259	 * whether we accounted for the freed space yet.
3260	 */
3261	ACQUIRE_LOCK(&lk);
3262	if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep)))
3263		inodedep->id_state |= SPACECOUNTED;
3264	FREE_LOCK(&lk);
3265	fs = ip->i_fs;
3266	extblocks = 0;
3267	if (fs->fs_magic == FS_UFS2_MAGIC)
3268		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
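	/*
	 * Charge the file's blocks (less any UFS2 extended attribute blocks
	 * counted above) and its inode to the filesystem's pending-release
	 * totals; IN_SPACECOUNTED ensures the accounting is done only once.
	 */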
3269	ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
3270	ip->i_fs->fs_pendinginodes += 1;
3271	ip->i_flag |= IN_SPACECOUNTED;
3272}
3273
3274/*
3275 * This workitem decrements the inode's link count.
3276 * If the link count reaches zero, the file is removed.
3277 */
3278static void
3279handle_workitem_remove(dirrem, xp)
3280	struct dirrem *dirrem;
3281	struct vnode *xp;
3282{
3283	struct thread *td = curthread;
3284	struct inodedep *inodedep;
3285	struct vnode *vp;
3286	struct inode *ip;
3287	ino_t oldinum;
3288	int error;
3289
3290	if ((vp = xp) == NULL &&
3291	    (error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, LK_EXCLUSIVE,
3292	     &vp)) != 0) {
3293		softdep_error("handle_workitem_remove: vget", error);
3294		return;
3295	}
3296	ip = VTOI(vp);
3297	ACQUIRE_LOCK(&lk);
3298	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
3299		FREE_LOCK(&lk);
3300		panic("handle_workitem_remove: lost inodedep");
3301	}
3302	/*
3303	 * Normal file deletion.
3304	 */
3305	if ((dirrem->dm_state & RMDIR) == 0) {
3306		ip->i_nlink--;
3307		DIP(ip, i_nlink) = ip->i_nlink;
3308		ip->i_flag |= IN_CHANGE;
3309		if (ip->i_nlink < ip->i_effnlink) {
3310			FREE_LOCK(&lk);
3311			panic("handle_workitem_remove: bad file delta");
3312		}
3313		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3314		FREE_LOCK(&lk);
3315		vput(vp);
3316		num_dirrem -= 1;
3317		WORKITEM_FREE(dirrem, D_DIRREM);
3318		return;
3319	}
3320	/*
3321	 * Directory deletion. Decrement reference count for both the
3322	 * just deleted parent directory entry and the reference for ".".
3323	 * Next truncate the directory to length zero. When the
3324	 * truncation completes, arrange to have the reference count on
3325	 * the parent decremented to account for the loss of "..".
3326	 */
3327	ip->i_nlink -= 2;
3328	DIP(ip, i_nlink) = ip->i_nlink;
3329	ip->i_flag |= IN_CHANGE;
3330	if (ip->i_nlink < ip->i_effnlink) {
3331		FREE_LOCK(&lk);
3332		panic("handle_workitem_remove: bad dir delta");
3333	}
3334	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3335	FREE_LOCK(&lk);
3336	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
3337		softdep_error("handle_workitem_remove: truncate", error);
3338	/*
3339	 * Rename a directory to a new parent. Since we are both deleting
3340	 * and creating a new directory entry, the link count on the new
3341	 * directory should not change. Thus we skip the followup dirrem.
3342	 */
3343	if (dirrem->dm_state & DIRCHG) {
3344		vput(vp);
3345		num_dirrem -= 1;
3346		WORKITEM_FREE(dirrem, D_DIRREM);
3347		return;
3348	}
3349	/*
3350	 * If the inodedep does not exist, then the zero'ed inode has
3351	 * been written to disk. If the allocated inode has never been
3352	 * written to disk, then the on-disk inode is zero'ed. In either
3353	 * case we can remove the file immediately.
3354	 */
3355	ACQUIRE_LOCK(&lk);
3356	dirrem->dm_state = 0;
3357	oldinum = dirrem->dm_oldinum;
3358	dirrem->dm_oldinum = dirrem->dm_dirinum;
3359	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
3360	    check_inode_unwritten(inodedep)) {
3361		FREE_LOCK(&lk);
3362		vput(vp);
3363		handle_workitem_remove(dirrem, NULL);
3364		return;
3365	}
3366	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3367	FREE_LOCK(&lk);
3368	vput(vp);
3369}
3370
3371/*
3372 * Inode de-allocation dependencies.
3373 *
3374 * When an inode's link count is reduced to zero, it can be de-allocated. We
3375 * found it convenient to postpone de-allocation until after the inode is
3376 * written to disk with its new link count (zero).  At this point, all of the
3377 * on-disk inode's block pointers are nullified and, with careful dependency
3378 * list ordering, all dependencies related to the inode will be satisfied and
3379 * the corresponding dependency structures de-allocated.  So, if/when the
3380 * inode is reused, there will be no mixing of old dependencies with new
3381 * ones.  This artificial dependency is set up by the block de-allocation
3382 * procedure above (softdep_setup_freeblocks) and completed by the
3383 * following procedure.
3384 */
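/*
 * In the common case the freefile work item created by softdep_setup_freeblocks
 * is held until the inode block containing the zeroed dinode has been written;
 * handle_written_inodeblock then queues it after all of the freed blocks so
 * that the routine below returns the inode to the cylinder group map last.
 */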
3385static void
3386handle_workitem_freefile(freefile)
3387	struct freefile *freefile;
3388{
3389	struct fs *fs;
3390	struct inodedep *idp;
3391	int error;
3392
3393	fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
3394#ifdef DEBUG
3395	ACQUIRE_LOCK(&lk);
3396	error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
3397	FREE_LOCK(&lk);
3398	if (error)
3399		panic("handle_workitem_freefile: inodedep survived");
3400#endif
3401	fs->fs_pendinginodes -= 1;
3402	if ((error = ffs_freefile(fs, freefile->fx_devvp, freefile->fx_oldinum,
3403	     freefile->fx_mode)) != 0)
3404		softdep_error("handle_workitem_freefile", error);
3405	WORKITEM_FREE(freefile, D_FREEFILE);
3406}
3407
3408/*
3409 * Disk writes.
3410 *
3411 * The dependency structures constructed above are most actively used when file
3412 * system blocks are written to disk.  No constraints are placed on when a
3413 * block can be written, but unsatisfied update dependencies are made safe by
3414 * modifying (or replacing) the source memory for the duration of the disk
3415 * write.  When the disk write completes, the memory block is again brought
3416 * up-to-date.
3417 *
3418 * In-core inode structure reclamation.
3419 *
3420 * Because there are a finite number of "in-core" inode structures, they are
3421 * reused regularly.  By transferring all inode-related dependencies to the
3422 * in-memory inode block and indexing them separately (via "inodedep"s), we
3423 * can allow "in-core" inode structures to be reused at any time and avoid
3424 * any increase in contention.
3425 *
3426 * Called just before entering the device driver to initiate a new disk I/O.
3427 * The buffer must be locked, thus, no I/O completion operations can occur
3428 * while we are manipulating its associated dependencies.
3429 */
3430static void
3431softdep_disk_io_initiation(bp)
3432	struct buf *bp;		/* structure describing disk write to occur */
3433{
3434	struct worklist *wk, *nextwk;
3435	struct indirdep *indirdep;
3436	struct inodedep *inodedep;
3437
3438	/*
3439	 * We only care about write operations. There should never
3440	 * be dependencies for reads.
3441	 */
3442	if (bp->b_iocmd == BIO_READ)
3443		panic("softdep_disk_io_initiation: read");
3444	/*
3445	 * Do any necessary pre-I/O processing.
3446	 */
3447	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
3448		nextwk = LIST_NEXT(wk, wk_list);
3449		switch (wk->wk_type) {
3450
3451		case D_PAGEDEP:
3452			initiate_write_filepage(WK_PAGEDEP(wk), bp);
3453			continue;
3454
3455		case D_INODEDEP:
3456			inodedep = WK_INODEDEP(wk);
3457			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3458				initiate_write_inodeblock_ufs1(inodedep, bp);
3459			else
3460				initiate_write_inodeblock_ufs2(inodedep, bp);
3461			continue;
3462
3463		case D_INDIRDEP:
3464			indirdep = WK_INDIRDEP(wk);
3465			if (indirdep->ir_state & GOINGAWAY)
3466				panic("disk_io_initiation: indirdep gone");
3467			/*
3468			 * If there are no remaining dependencies, this
3469			 * will be writing the real pointers, so the
3470			 * dependency can be freed.
3471			 */
3472			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3473				indirdep->ir_savebp->b_flags |=
3474				    B_INVAL | B_NOCACHE;
3475				brelse(indirdep->ir_savebp);
3476				/* inline expansion of WORKLIST_REMOVE(wk) */
3477				wk->wk_state &= ~ONWORKLIST;
3478				LIST_REMOVE(wk, wk_list);
3479				WORKITEM_FREE(indirdep, D_INDIRDEP);
3480				continue;
3481			}
3482			/*
3483			 * Replace up-to-date version with safe version.
3484			 */
3485			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3486			    M_INDIRDEP, M_SOFTDEP_FLAGS);
3487			ACQUIRE_LOCK(&lk);
3488			indirdep->ir_state &= ~ATTACHED;
3489			indirdep->ir_state |= UNDONE;
3490			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3491			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3492			    bp->b_bcount);
3493			FREE_LOCK(&lk);
3494			continue;
3495
3496		case D_MKDIR:
3497		case D_BMSAFEMAP:
3498		case D_ALLOCDIRECT:
3499		case D_ALLOCINDIR:
3500			continue;
3501
3502		default:
3503			panic("handle_disk_io_initiation: Unexpected type %s",
3504			    TYPENAME(wk->wk_type));
3505			/* NOTREACHED */
3506		}
3507	}
3508}
3509
3510/*
3511 * Called from within the procedure above to deal with unsatisfied
3512 * allocation dependencies in a directory. The buffer must be locked,
3513 * thus, no I/O completion operations can occur while we are
3514 * manipulating its associated dependencies.
3515 */
3516static void
3517initiate_write_filepage(pagedep, bp)
3518	struct pagedep *pagedep;
3519	struct buf *bp;
3520{
3521	struct diradd *dap;
3522	struct direct *ep;
3523	int i;
3524
3525	if (pagedep->pd_state & IOSTARTED) {
3526		/*
3527		 * This can only happen if there is a driver that does not
3528		 * understand chaining. Here biodone will reissue the call
3529		 * to strategy for the incomplete buffers.
3530		 */
3531		printf("initiate_write_filepage: already started\n");
3532		return;
3533	}
3534	pagedep->pd_state |= IOSTARTED;
3535	ACQUIRE_LOCK(&lk);
3536	for (i = 0; i < DAHASHSZ; i++) {
3537		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3538			ep = (struct direct *)
3539			    ((char *)bp->b_data + dap->da_offset);
3540			if (ep->d_ino != dap->da_newinum) {
3541				FREE_LOCK(&lk);
3542				panic("%s: dir inum %d != new %d",
3543				    "initiate_write_filepage",
3544				    ep->d_ino, dap->da_newinum);
3545			}
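			/*
			 * Roll the entry back until the new inode reaches
			 * the disk: to the previous inode number for a name
			 * change (DIRCHG), otherwise to an empty (zero) entry.
			 */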
3546			if (dap->da_state & DIRCHG)
3547				ep->d_ino = dap->da_previous->dm_oldinum;
3548			else
3549				ep->d_ino = 0;
3550			dap->da_state &= ~ATTACHED;
3551			dap->da_state |= UNDONE;
3552		}
3553	}
3554	FREE_LOCK(&lk);
3555}
3556
3557/*
3558 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
3559 * Note that any bug fixes made to this routine must be done in the
3560 * version found below.
3561 *
3562 * Called from within the procedure above to deal with unsatisfied
3563 * allocation dependencies in an inodeblock. The buffer must be
3564 * locked, thus, no I/O completion operations can occur while we
3565 * are manipulating its associated dependencies.
3566 */
3567static void
3568initiate_write_inodeblock_ufs1(inodedep, bp)
3569	struct inodedep *inodedep;
3570	struct buf *bp;			/* The inode block */
3571{
3572	struct allocdirect *adp, *lastadp;
3573	struct ufs1_dinode *dp;
3574	struct fs *fs;
3575	ufs_lbn_t i, prevlbn = 0;
3576	int deplist;
3577
3578	if (inodedep->id_state & IOSTARTED)
3579		panic("initiate_write_inodeblock_ufs1: already started");
3580	inodedep->id_state |= IOSTARTED;
3581	fs = inodedep->id_fs;
3582	dp = (struct ufs1_dinode *)bp->b_data +
3583	    ino_to_fsbo(fs, inodedep->id_ino);
3584	/*
3585	 * If the bitmap is not yet written, then the allocated
3586	 * inode cannot be written to disk.
3587	 */
3588	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3589		if (inodedep->id_savedino1 != NULL)
3590			panic("initiate_write_inodeblock_ufs1: I/O underway");
3591		MALLOC(inodedep->id_savedino1, struct ufs1_dinode *,
3592		    sizeof(struct ufs1_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3593		*inodedep->id_savedino1 = *dp;
3594		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3595		return;
3596	}
3597	/*
3598	 * If no dependencies, then there is nothing to roll back.
3599	 */
3600	inodedep->id_savedsize = dp->di_size;
3601	inodedep->id_savedextsize = 0;
3602	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3603		return;
3604	/*
3605	 * Set the dependencies to busy.
3606	 */
3607	ACQUIRE_LOCK(&lk);
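	/*
	 * deplist is a DIAGNOSTIC-only bitmask of the logical block numbers
	 * that have dependencies (bit ad_lbn for direct blocks, bit
	 * NDADDR + i for the i'th indirect pointer); the rollback loops
	 * below use it to verify that every pointer being zeroed has a
	 * matching dependency.
	 */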
3608	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3609	     adp = TAILQ_NEXT(adp, ad_next)) {
3610#ifdef DIAGNOSTIC
3611		if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3612			FREE_LOCK(&lk);
3613			panic("softdep_write_inodeblock: lbn order");
3614		}
3615		prevlbn = adp->ad_lbn;
3616		if (adp->ad_lbn < NDADDR &&
3617		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3618			FREE_LOCK(&lk);
3619			panic("%s: direct pointer #%jd mismatch %d != %jd",
3620			    "softdep_write_inodeblock",
3621			    (intmax_t)adp->ad_lbn,
3622			    dp->di_db[adp->ad_lbn],
3623			    (intmax_t)adp->ad_newblkno);
3624		}
3625		if (adp->ad_lbn >= NDADDR &&
3626		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3627			FREE_LOCK(&lk);
3628			panic("%s: indirect pointer #%jd mismatch %d != %jd",
3629			    "softdep_write_inodeblock",
3630			    (intmax_t)adp->ad_lbn - NDADDR,
3631			    dp->di_ib[adp->ad_lbn - NDADDR],
3632			    (intmax_t)adp->ad_newblkno);
3633		}
3634		deplist |= 1 << adp->ad_lbn;
3635		if ((adp->ad_state & ATTACHED) == 0) {
3636			FREE_LOCK(&lk);
3637			panic("softdep_write_inodeblock: Unknown state 0x%x",
3638			    adp->ad_state);
3639		}
3640#endif /* DIAGNOSTIC */
3641		adp->ad_state &= ~ATTACHED;
3642		adp->ad_state |= UNDONE;
3643	}
3644	/*
3645	 * The on-disk inode cannot claim to be any larger than the last
3646	 * fragment that has been written. Otherwise, the on-disk inode
3647	 * might have fragments that were not the last block in the file
3648	 * which would corrupt the filesystem.
3649	 */
3650	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3651	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3652		if (adp->ad_lbn >= NDADDR)
3653			break;
3654		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3655		/* keep going until hitting a rollback to a frag */
3656		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3657			continue;
3658		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3659		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3660#ifdef DIAGNOSTIC
3661			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3662				FREE_LOCK(&lk);
3663				panic("softdep_write_inodeblock: lost dep1");
3664			}
3665#endif /* DIAGNOSTIC */
3666			dp->di_db[i] = 0;
3667		}
3668		for (i = 0; i < NIADDR; i++) {
3669#ifdef DIAGNOSTIC
3670			if (dp->di_ib[i] != 0 &&
3671			    (deplist & ((1 << NDADDR) << i)) == 0) {
3672				FREE_LOCK(&lk);
3673				panic("softdep_write_inodeblock: lost dep2");
3674			}
3675#endif /* DIAGNOSTIC */
3676			dp->di_ib[i] = 0;
3677		}
3678		FREE_LOCK(&lk);
3679		return;
3680	}
3681	/*
3682	 * If we have zero'ed out the last allocated block of the file,
3683	 * roll back the size to the last currently allocated block.
3684	 * We know that this last allocated block is full-sized as
3685	 * we already checked for fragments in the loop above.
3686	 */
3687	if (lastadp != NULL &&
3688	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3689		for (i = lastadp->ad_lbn; i >= 0; i--)
3690			if (dp->di_db[i] != 0)
3691				break;
3692		dp->di_size = (i + 1) * fs->fs_bsize;
3693	}
3694	/*
3695	 * The only dependencies are for indirect blocks.
3696	 *
3697	 * The file size for indirect block additions is not guaranteed.
3698	 * Such a guarantee would be non-trivial to achieve. The conventional
3699	 * synchronous write implementation also does not make this guarantee.
3700	 * Fsck should catch and fix discrepancies. Arguably, the file size
3701	 * can be over-estimated without destroying integrity when the file
3702	 * moves into the indirect blocks (i.e., is large). If we want to
3703	 * postpone fsck, we are stuck with this argument.
3704	 */
3705	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3706		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3707	FREE_LOCK(&lk);
3708}
3709
3710/*
3711 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
3712 * Note that any bug fixes made to this routine must be done in the
3713 * version found above.
3714 *
3715 * Called from within the procedure above to deal with unsatisfied
3716 * allocation dependencies in an inodeblock. The buffer must be
3717 * locked, thus, no I/O completion operations can occur while we
3718 * are manipulating its associated dependencies.
3719 */
3720static void
3721initiate_write_inodeblock_ufs2(inodedep, bp)
3722	struct inodedep *inodedep;
3723	struct buf *bp;			/* The inode block */
3724{
3725	struct allocdirect *adp, *lastadp;
3726	struct ufs2_dinode *dp;
3727	struct fs *fs;
3728	ufs_lbn_t i, prevlbn = 0;
3729	int deplist;
3730
3731	if (inodedep->id_state & IOSTARTED)
3732		panic("initiate_write_inodeblock_ufs2: already started");
3733	inodedep->id_state |= IOSTARTED;
3734	fs = inodedep->id_fs;
3735	dp = (struct ufs2_dinode *)bp->b_data +
3736	    ino_to_fsbo(fs, inodedep->id_ino);
3737	/*
3738	 * If the bitmap is not yet written, then the allocated
3739	 * inode cannot be written to disk.
3740	 */
3741	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3742		if (inodedep->id_savedino2 != NULL)
3743			panic("initiate_write_inodeblock_ufs2: I/O underway");
3744		MALLOC(inodedep->id_savedino2, struct ufs2_dinode *,
3745		    sizeof(struct ufs2_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
3746		*inodedep->id_savedino2 = *dp;
3747		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
3748		return;
3749	}
3750	/*
3751	 * If no dependencies, then there is nothing to roll back.
3752	 */
3753	inodedep->id_savedsize = dp->di_size;
3754	inodedep->id_savedextsize = dp->di_extsize;
3755	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
3756	    TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
3757		return;
3758	/*
3759	 * Set the ext data dependencies to busy.
3760	 */
3761	ACQUIRE_LOCK(&lk);
3762	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3763	     adp = TAILQ_NEXT(adp, ad_next)) {
3764#ifdef DIAGNOSTIC
3765		if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3766			FREE_LOCK(&lk);
3767			panic("softdep_write_inodeblock: lbn order");
3768		}
3769		prevlbn = adp->ad_lbn;
3770		if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) {
3771			FREE_LOCK(&lk);
3772			panic("%s: direct pointer #%jd mismatch %jd != %jd",
3773			    "softdep_write_inodeblock",
3774			    (intmax_t)adp->ad_lbn,
3775			    (intmax_t)dp->di_extb[adp->ad_lbn],
3776			    (intmax_t)adp->ad_newblkno);
3777		}
3778		deplist |= 1 << adp->ad_lbn;
3779		if ((adp->ad_state & ATTACHED) == 0) {
3780			FREE_LOCK(&lk);
3781			panic("softdep_write_inodeblock: Unknown state 0x%x",
3782			    adp->ad_state);
3783		}
3784#endif /* DIAGNOSTIC */
3785		adp->ad_state &= ~ATTACHED;
3786		adp->ad_state |= UNDONE;
3787	}
3788	/*
3789	 * The on-disk inode cannot claim to be any larger than the last
3790	 * fragment that has been written. Otherwise, the on-disk inode
3791	 * might have fragments that were not the last block in the ext
3792	 * data which would corrupt the filesystem.
3793	 */
3794	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3795	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3796		dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
3797		/* keep going until hitting a rollback to a frag */
3798		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3799			continue;
3800		dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3801		for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
3802#ifdef DIAGNOSTIC
3803			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) {
3804				FREE_LOCK(&lk);
3805				panic("softdep_write_inodeblock: lost dep1");
3806			}
3807#endif /* DIAGNOSTIC */
3808			dp->di_extb[i] = 0;
3809		}
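		/*
		 * The ext size has been rolled back to this fragment, so
		 * clear lastadp to skip the full-block size adjustment below
		 * and stop scanning.
		 */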
3810		lastadp = NULL;
3811		break;
3812	}
3813	/*
3814	 * If we have zero'ed out the last allocated block of the ext
3815	 * data, roll back the size to the last currently allocated block.
3816	 * We know that this last allocated block is full-sized as
3817	 * we already checked for fragments in the loop above.
3818	 */
3819	if (lastadp != NULL &&
3820	    dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3821		for (i = lastadp->ad_lbn; i >= 0; i--)
3822			if (dp->di_extb[i] != 0)
3823				break;
3824		dp->di_extsize = (i + 1) * fs->fs_bsize;
3825	}
3826	/*
3827	 * Set the file data dependencies to busy.
3828	 */
3829	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3830	     adp = TAILQ_NEXT(adp, ad_next)) {
3831#ifdef DIAGNOSTIC
3832		if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3833			FREE_LOCK(&lk);
3834			panic("softdep_write_inodeblock: lbn order");
3835		}
3836		prevlbn = adp->ad_lbn;
3837		if (adp->ad_lbn < NDADDR &&
3838		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
3839			FREE_LOCK(&lk);
3840			panic("%s: direct pointer #%jd mismatch %jd != %jd",
3841			    "softdep_write_inodeblock",
3842			    (intmax_t)adp->ad_lbn,
3843			    (intmax_t)dp->di_db[adp->ad_lbn],
3844			    (intmax_t)adp->ad_newblkno);
3845		}
3846		if (adp->ad_lbn >= NDADDR &&
3847		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
3848			FREE_LOCK(&lk);
3849			panic("%s: indirect pointer #%jd mismatch %jd != %jd",
3850			    "softdep_write_inodeblock",
3851			    (intmax_t)adp->ad_lbn - NDADDR,
3852			    (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
3853			    (intmax_t)adp->ad_newblkno);
3854		}
3855		deplist |= 1 << adp->ad_lbn;
3856		if ((adp->ad_state & ATTACHED) == 0) {
3857			FREE_LOCK(&lk);
3858			panic("softdep_write_inodeblock: Unknown state 0x%x",
3859			    adp->ad_state);
3860		}
3861#endif /* DIAGNOSTIC */
3862		adp->ad_state &= ~ATTACHED;
3863		adp->ad_state |= UNDONE;
3864	}
3865	/*
3866	 * The on-disk inode cannot claim to be any larger than the last
3867	 * fragment that has been written. Otherwise, the on-disk inode
3868	 * might have fragments that were not the last block in the file
3869	 * which would corrupt the filesystem.
3870	 */
3871	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3872	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3873		if (adp->ad_lbn >= NDADDR)
3874			break;
3875		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3876		/* keep going until hitting a rollback to a frag */
3877		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3878			continue;
3879		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3880		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3881#ifdef DIAGNOSTIC
3882			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3883				FREE_LOCK(&lk);
3884				panic("softdep_write_inodeblock: lost dep2");
3885			}
3886#endif /* DIAGNOSTIC */
3887			dp->di_db[i] = 0;
3888		}
3889		for (i = 0; i < NIADDR; i++) {
3890#ifdef DIAGNOSTIC
3891			if (dp->di_ib[i] != 0 &&
3892			    (deplist & ((1 << NDADDR) << i)) == 0) {
3893				FREE_LOCK(&lk);
3894				panic("softdep_write_inodeblock: lost dep3");
3895			}
3896#endif /* DIAGNOSTIC */
3897			dp->di_ib[i] = 0;
3898		}
3899		FREE_LOCK(&lk);
3900		return;
3901	}
3902	/*
3903	 * If we have zero'ed out the last allocated block of the file,
3904	 * roll back the size to the last currently allocated block.
3905	 * We know that this last allocated block is full-sized as
3906	 * we already checked for fragments in the loop above.
3907	 */
3908	if (lastadp != NULL &&
3909	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3910		for (i = lastadp->ad_lbn; i >= 0; i--)
3911			if (dp->di_db[i] != 0)
3912				break;
3913		dp->di_size = (i + 1) * fs->fs_bsize;
3914	}
3915	/*
3916	 * The only dependencies are for indirect blocks.
3917	 *
3918	 * The file size for indirect block additions is not guaranteed.
3919	 * Such a guarantee would be non-trivial to achieve. The conventional
3920	 * synchronous write implementation also does not make this guarantee.
3921	 * Fsck should catch and fix discrepancies. Arguably, the file size
3922	 * can be over-estimated without destroying integrity when the file
3923	 * moves into the indirect blocks (i.e., is large). If we want to
3924	 * postpone fsck, we are stuck with this argument.
3925	 */
3926	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3927		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3928	FREE_LOCK(&lk);
3929}
3930
3931/*
3932 * This routine is called during the completion interrupt
3933 * service routine for a disk write (from the procedure called
3934 * by the device driver to inform the filesystem caches of
3935 * a request completion).  It should be called early in this
3936 * procedure, before the block is made available to other
3937 * processes or other routines are called.
3938 */
3939static void
3940softdep_disk_write_complete(bp)
3941	struct buf *bp;		/* describes the completed disk write */
3942{
3943	struct worklist *wk;
3944	struct workhead reattach;
3945	struct newblk *newblk;
3946	struct allocindir *aip;
3947	struct allocdirect *adp;
3948	struct indirdep *indirdep;
3949	struct inodedep *inodedep;
3950	struct bmsafemap *bmsafemap;
3951
3952	/*
3953	 * If an error occurred while doing the write, then the data
3954	 * has not hit the disk and the dependencies cannot be unrolled.
3955	 */
3956	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
3957		return;
3958#ifdef DEBUG
3959	if (lk.lkt_held != NOHOLDER)
3960		panic("softdep_disk_write_complete: lock is held");
3961	lk.lkt_held = SPECIAL_FLAG;
3962#endif
3963	LIST_INIT(&reattach);
3964	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3965		WORKLIST_REMOVE(wk);
3966		switch (wk->wk_type) {
3967
3968		case D_PAGEDEP:
3969			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3970				WORKLIST_INSERT(&reattach, wk);
3971			continue;
3972
3973		case D_INODEDEP:
3974			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3975				WORKLIST_INSERT(&reattach, wk);
3976			continue;
3977
3978		case D_BMSAFEMAP:
3979			bmsafemap = WK_BMSAFEMAP(wk);
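			/*
			 * The cylinder group map has reached the disk, so
			 * every allocation waiting on it (new blocks,
			 * allocdirects, allocindirs, and inodes) now has its
			 * bitmap dependency (DEPCOMPLETE) satisfied.
			 */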
3980			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3981				newblk->nb_state |= DEPCOMPLETE;
3982				newblk->nb_bmsafemap = NULL;
3983				LIST_REMOVE(newblk, nb_deps);
3984			}
3985			while ((adp =
3986			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3987				adp->ad_state |= DEPCOMPLETE;
3988				adp->ad_buf = NULL;
3989				LIST_REMOVE(adp, ad_deps);
3990				handle_allocdirect_partdone(adp);
3991			}
3992			while ((aip =
3993			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3994				aip->ai_state |= DEPCOMPLETE;
3995				aip->ai_buf = NULL;
3996				LIST_REMOVE(aip, ai_deps);
3997				handle_allocindir_partdone(aip);
3998			}
3999			while ((inodedep =
4000			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
4001				inodedep->id_state |= DEPCOMPLETE;
4002				LIST_REMOVE(inodedep, id_deps);
4003				inodedep->id_buf = NULL;
4004			}
4005			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4006			continue;
4007
4008		case D_MKDIR:
4009			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
4010			continue;
4011
4012		case D_ALLOCDIRECT:
4013			adp = WK_ALLOCDIRECT(wk);
4014			adp->ad_state |= COMPLETE;
4015			handle_allocdirect_partdone(adp);
4016			continue;
4017
4018		case D_ALLOCINDIR:
4019			aip = WK_ALLOCINDIR(wk);
4020			aip->ai_state |= COMPLETE;
4021			handle_allocindir_partdone(aip);
4022			continue;
4023
4024		case D_INDIRDEP:
4025			indirdep = WK_INDIRDEP(wk);
4026			if (indirdep->ir_state & GOINGAWAY) {
4027				lk.lkt_held = NOHOLDER;
4028				panic("disk_write_complete: indirdep gone");
4029			}
4030			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
4031			FREE(indirdep->ir_saveddata, M_INDIRDEP);
4032			indirdep->ir_saveddata = 0;
4033			indirdep->ir_state &= ~UNDONE;
4034			indirdep->ir_state |= ATTACHED;
4035			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
4036				handle_allocindir_partdone(aip);
4037				if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
4038					lk.lkt_held = NOHOLDER;
4039					panic("disk_write_complete: not gone");
4040				}
4041			}
4042			WORKLIST_INSERT(&reattach, wk);
4043			if ((bp->b_flags & B_DELWRI) == 0)
4044				stat_indir_blk_ptrs++;
4045			bdirty(bp);
4046			continue;
4047
4048		default:
4049			lk.lkt_held = NOHOLDER;
4050			panic("handle_disk_write_complete: Unknown type %s",
4051			    TYPENAME(wk->wk_type));
4052			/* NOTREACHED */
4053		}
4054	}
4055	/*
4056	 * Reattach any requests that must be redone.
4057	 */
4058	while ((wk = LIST_FIRST(&reattach)) != NULL) {
4059		WORKLIST_REMOVE(wk);
4060		WORKLIST_INSERT(&bp->b_dep, wk);
4061	}
4062#ifdef DEBUG
4063	if (lk.lkt_held != SPECIAL_FLAG)
4064		panic("softdep_disk_write_complete: lock lost");
4065	lk.lkt_held = NOHOLDER;
4066#endif
4067}
4068
4069/*
4070 * Called from within softdep_disk_write_complete above. Note that
4071 * this routine is always called from interrupt level with further
4072 * splbio interrupts blocked.
4073 */
4074static void
4075handle_allocdirect_partdone(adp)
4076	struct allocdirect *adp;	/* the completed allocdirect */
4077{
4078	struct allocdirectlst *listhead;
4079	struct allocdirect *listadp;
4080	struct inodedep *inodedep;
4081	long bsize, delay;
4082
4083	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4084		return;
4085	if (adp->ad_buf != NULL) {
4086		lk.lkt_held = NOHOLDER;
4087		panic("handle_allocdirect_partdone: dangling dep");
4088	}
4089	/*
4090	 * The on-disk inode cannot claim to be any larger than the last
4091	 * fragment that has been written. Otherwise, the on-disk inode
4092	 * might have fragments that were not the last block in the file
4093	 * which would corrupt the filesystem. Thus, we cannot free any
4094	 * allocdirects after one whose ad_oldblkno claims a fragment as
4095	 * these blocks must be rolled back to zero before writing the inode.
4096	 * We check the currently active set of allocdirects in id_inoupdt
4097	 * or id_extupdt as appropriate.
4098	 */
4099	inodedep = adp->ad_inodedep;
4100	bsize = inodedep->id_fs->fs_bsize;
4101	if (adp->ad_state & EXTDATA)
4102		listhead = &inodedep->id_extupdt;
4103	else
4104		listhead = &inodedep->id_inoupdt;
4105	TAILQ_FOREACH(listadp, listhead, ad_next) {
4106		/* found our block */
4107		if (listadp == adp)
4108			break;
4109		/* continue if the old block was not a fragment */
4110		if (listadp->ad_oldsize == 0 ||
4111		    listadp->ad_oldsize == bsize)
4112			continue;
4113		/* hit a fragment */
4114		return;
4115	}
4116	/*
4117	 * If we have reached the end of the current list without
4118	 * finding the just finished dependency, then it must be
4119	 * on the future dependency list. Future dependencies cannot
4120	 * be freed until they are moved to the current list.
4121	 */
4122	if (listadp == NULL) {
4123#ifdef DEBUG
4124		if (adp->ad_state & EXTDATA)
4125			listhead = &inodedep->id_newextupdt;
4126		else
4127			listhead = &inodedep->id_newinoupdt;
4128		TAILQ_FOREACH(listadp, listhead, ad_next)
4129			/* found our block */
4130			if (listadp == adp)
4131				break;
4132		if (listadp == NULL) {
4133			lk.lkt_held = NOHOLDER;
4134			panic("handle_allocdirect_partdone: lost dep");
4135		}
4136#endif /* DEBUG */
4137		return;
4138	}
4139	/*
4140	 * If we have found the just finished dependency, then free
4141	 * it along with anything that follows it that is complete.
4142	 * If the inode still has a bitmap dependency, then it has
4143	 * never been written to disk, hence the on-disk inode cannot
4144	 * reference the old fragment so we can free it without delay.
4145	 */
4146	delay = (inodedep->id_state & DEPCOMPLETE);
4147	for (; adp; adp = listadp) {
4148		listadp = TAILQ_NEXT(adp, ad_next);
4149		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4150			return;
4151		free_allocdirect(listhead, adp, delay);
4152	}
4153}
4154
4155/*
4156 * Called from within softdep_disk_write_complete above. Note that
4157 * this routine is always called from interrupt level with further
4158 * splbio interrupts blocked.
4159 */
4160static void
4161handle_allocindir_partdone(aip)
4162	struct allocindir *aip;		/* the completed allocindir */
4163{
4164	struct indirdep *indirdep;
4165
4166	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4167		return;
4168	if (aip->ai_buf != NULL) {
4169		lk.lkt_held = NOHOLDER;
4170		panic("handle_allocindir_partdone: dangling dependency");
4171	}
4172	indirdep = aip->ai_indirdep;
4173	if (indirdep->ir_state & UNDONE) {
4174		LIST_REMOVE(aip, ai_next);
4175		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4176		return;
4177	}
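	/*
	 * The indirect block is not currently being written, so the new
	 * block number can be entered directly into the saved copy
	 * (ir_savebp) using the pointer size of the filesystem format.
	 */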
4178	if (indirdep->ir_state & UFS1FMT)
4179		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4180		    aip->ai_newblkno;
4181	else
4182		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4183		    aip->ai_newblkno;
4184	LIST_REMOVE(aip, ai_next);
4185	if (aip->ai_freefrag != NULL)
4186		add_to_worklist(&aip->ai_freefrag->ff_list);
4187	WORKITEM_FREE(aip, D_ALLOCINDIR);
4188}
4189
4190/*
4191 * Called from within softdep_disk_write_complete above to restore
4192 * in-memory inode block contents to their most up-to-date state. Note
4193 * that this routine is always called from interrupt level with further
4194 * splbio interrupts blocked.
4195 */
4196static int
4197handle_written_inodeblock(inodedep, bp)
4198	struct inodedep *inodedep;
4199	struct buf *bp;		/* buffer containing the inode block */
4200{
4201	struct worklist *wk, *filefree;
4202	struct allocdirect *adp, *nextadp;
4203	struct ufs1_dinode *dp1 = NULL;
4204	struct ufs2_dinode *dp2 = NULL;
4205	int hadchanges, fstype;
4206
4207	if ((inodedep->id_state & IOSTARTED) == 0) {
4208		lk.lkt_held = NOHOLDER;
4209		panic("handle_written_inodeblock: not started");
4210	}
4211	inodedep->id_state &= ~IOSTARTED;
4212	inodedep->id_state |= COMPLETE;
4213	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4214		fstype = UFS1;
4215		dp1 = (struct ufs1_dinode *)bp->b_data +
4216		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4217	} else {
4218		fstype = UFS2;
4219		dp2 = (struct ufs2_dinode *)bp->b_data +
4220		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4221	}
4222	/*
4223	 * If we had to rollback the inode allocation because of
4224	 * bitmaps being incomplete, then simply restore it.
4225	 * Keep the block dirty so that it will not be reclaimed until
4226	 * all associated dependencies have been cleared and the
4227	 * corresponding updates written to disk.
4228	 */
4229	if (inodedep->id_savedino1 != NULL) {
4230		if (fstype == UFS1)
4231			*dp1 = *inodedep->id_savedino1;
4232		else
4233			*dp2 = *inodedep->id_savedino2;
4234		FREE(inodedep->id_savedino1, M_INODEDEP);
4235		inodedep->id_savedino1 = NULL;
4236		if ((bp->b_flags & B_DELWRI) == 0)
4237			stat_inode_bitmap++;
4238		bdirty(bp);
4239		return (1);
4240	}
4241	/*
4242	 * Roll forward anything that had to be rolled back before
4243	 * the inode could be updated.
4244	 */
4245	hadchanges = 0;
4246	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4247		nextadp = TAILQ_NEXT(adp, ad_next);
4248		if (adp->ad_state & ATTACHED) {
4249			lk.lkt_held = NOHOLDER;
4250			panic("handle_written_inodeblock: new entry");
4251		}
4252		if (fstype == UFS1) {
4253			if (adp->ad_lbn < NDADDR) {
4254				if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) {
4255					lk.lkt_held = NOHOLDER;
4256					panic("%s: %s #%jd mismatch %d != %jd",
4257					    "handle_written_inodeblock",
4258					    "direct pointer",
4259					    (intmax_t)adp->ad_lbn,
4260					    dp1->di_db[adp->ad_lbn],
4261					    (intmax_t)adp->ad_oldblkno);
4262				}
4263				dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4264			} else {
4265				if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) {
4266					lk.lkt_held = NOHOLDER;
4267					panic("%s: %s #%jd allocated as %d",
4268					    "handle_written_inodeblock",
4269					    "indirect pointer",
4270					    (intmax_t)adp->ad_lbn - NDADDR,
4271					    dp1->di_ib[adp->ad_lbn - NDADDR]);
4272				}
4273				dp1->di_ib[adp->ad_lbn - NDADDR] =
4274				    adp->ad_newblkno;
4275			}
4276		} else {
4277			if (adp->ad_lbn < NDADDR) {
4278				if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) {
4279					lk.lkt_held = NOHOLDER;
4280					panic("%s: %s #%jd %s %jd != %jd",
4281					    "handle_written_inodeblock",
4282					    "direct pointer",
4283					    (intmax_t)adp->ad_lbn, "mismatch",
4284					    (intmax_t)dp2->di_db[adp->ad_lbn],
4285					    (intmax_t)adp->ad_oldblkno);
4286				}
4287				dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4288			} else {
4289				if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) {
4290					lk.lkt_held = NOHOLDER;
4291					panic("%s: %s #%jd allocated as %jd",
4292					    "handle_written_inodeblock",
4293					    "indirect pointer",
4294					    (intmax_t)adp->ad_lbn - NDADDR,
4295					    (intmax_t)
4296					    dp2->di_ib[adp->ad_lbn - NDADDR]);
4297				}
4298				dp2->di_ib[adp->ad_lbn - NDADDR] =
4299				    adp->ad_newblkno;
4300			}
4301		}
4302		adp->ad_state &= ~UNDONE;
4303		adp->ad_state |= ATTACHED;
4304		hadchanges = 1;
4305	}
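	/*
	 * Roll forward any external attribute block pointers in the same
	 * way (the id_extupdt list is only populated for UFS2 inodes).
	 */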
4306	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
4307		nextadp = TAILQ_NEXT(adp, ad_next);
4308		if (adp->ad_state & ATTACHED) {
4309			lk.lkt_held = NOHOLDER;
4310			panic("handle_written_inodeblock: new entry");
4311		}
4312		if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) {
4313			lk.lkt_held = NOHOLDER;
4314			panic("%s: direct pointers #%jd %s %jd != %jd",
4315			    "handle_written_inodeblock",
4316			    (intmax_t)adp->ad_lbn, "mismatch",
4317			    (intmax_t)dp2->di_extb[adp->ad_lbn],
4318			    (intmax_t)adp->ad_oldblkno);
4319		}
4320		dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
4321		adp->ad_state &= ~UNDONE;
4322		adp->ad_state |= ATTACHED;
4323		hadchanges = 1;
4324	}
4325	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4326		stat_direct_blk_ptrs++;
4327	/*
4328	 * Reset the file size to its most up-to-date value.
4329	 */
4330	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) {
4331		lk.lkt_held = NOHOLDER;
4332		panic("handle_written_inodeblock: bad size");
4333	}
4334	if (fstype == UFS1) {
4335		if (dp1->di_size != inodedep->id_savedsize) {
4336			dp1->di_size = inodedep->id_savedsize;
4337			hadchanges = 1;
4338		}
4339	} else {
4340		if (dp2->di_size != inodedep->id_savedsize) {
4341			dp2->di_size = inodedep->id_savedsize;
4342			hadchanges = 1;
4343		}
4344		if (dp2->di_extsize != inodedep->id_savedextsize) {
4345			dp2->di_extsize = inodedep->id_savedextsize;
4346			hadchanges = 1;
4347		}
4348	}
4349	inodedep->id_savedsize = -1;
4350	inodedep->id_savedextsize = -1;
4351	/*
4352	 * If there were any rollbacks in the inode block, then it must be
4353	 * marked dirty so that it will eventually get written back in
4354	 * its correct form.
4355	 */
4356	if (hadchanges)
4357		bdirty(bp);
4358	/*
4359	 * Process any allocdirects that completed during the update.
4360	 */
4361	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4362		handle_allocdirect_partdone(adp);
4363	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
4364		handle_allocdirect_partdone(adp);
4365	/*
4366	 * Process deallocations that were held pending until the
4367	 * inode had been written to disk. Freeing of the inode
4368	 * is delayed until after all blocks have been freed to
4369	 * avoid creation of new <vfsid, inum, lbn> triples
4370	 * before the old ones have been deleted.
4371	 */
4372	filefree = NULL;
4373	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4374		WORKLIST_REMOVE(wk);
4375		switch (wk->wk_type) {
4376
4377		case D_FREEFILE:
4378			/*
4379			 * We defer adding filefree to the worklist until
4380			 * all other additions have been made to ensure
4381			 * that it will be done after all the old blocks
4382			 * have been freed.
4383			 */
4384			if (filefree != NULL) {
4385				lk.lkt_held = NOHOLDER;
4386				panic("handle_written_inodeblock: filefree");
4387			}
4388			filefree = wk;
4389			continue;
4390
4391		case D_MKDIR:
4392			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4393			continue;
4394
4395		case D_DIRADD:
4396			diradd_inode_written(WK_DIRADD(wk), inodedep);
4397			continue;
4398
4399		case D_FREEBLKS:
4400		case D_FREEFRAG:
4401		case D_DIRREM:
4402			add_to_worklist(wk);
4403			continue;
4404
4405		case D_NEWDIRBLK:
4406			free_newdirblk(WK_NEWDIRBLK(wk));
4407			continue;
4408
4409		default:
4410			lk.lkt_held = NOHOLDER;
4411			panic("handle_written_inodeblock: Unknown type %s",
4412			    TYPENAME(wk->wk_type));
4413			/* NOTREACHED */
4414		}
4415	}
4416	if (filefree != NULL) {
4417		if (free_inodedep(inodedep) == 0) {
4418			lk.lkt_held = NOHOLDER;
4419			panic("handle_written_inodeblock: live inodedep");
4420		}
4421		add_to_worklist(filefree);
4422		return (0);
4423	}
4424
4425	/*
4426	 * If no outstanding dependencies, free it.
4427	 */
4428	if (free_inodedep(inodedep) ||
4429	    (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
4430	     TAILQ_FIRST(&inodedep->id_extupdt) == 0))
4431		return (0);
4432	return (hadchanges);
4433}
4434
4435/*
4436 * Process a diradd entry after its dependent inode has been written.
4437 * This routine must be called with splbio interrupts blocked.
4438 */
4439static void
4440diradd_inode_written(dap, inodedep)
4441	struct diradd *dap;
4442	struct inodedep *inodedep;
4443{
4444	struct pagedep *pagedep;
4445
4446	dap->da_state |= COMPLETE;
4447	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4448		if (dap->da_state & DIRCHG)
4449			pagedep = dap->da_previous->dm_pagedep;
4450		else
4451			pagedep = dap->da_pagedep;
4452		LIST_REMOVE(dap, da_pdlist);
4453		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4454	}
4455	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4456}
4457
4458/*
4459 * Handle the completion of a mkdir dependency.
4460 */
4461static void
4462handle_written_mkdir(mkdir, type)
4463	struct mkdir *mkdir;
4464	int type;
4465{
4466	struct diradd *dap;
4467	struct pagedep *pagedep;
4468
4469	if (mkdir->md_state != type) {
4470		lk.lkt_held = NOHOLDER;
4471		panic("handle_written_mkdir: bad type");
4472	}
4473	dap = mkdir->md_diradd;
4474	dap->da_state &= ~type;
4475	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4476		dap->da_state |= DEPCOMPLETE;
4477	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4478		if (dap->da_state & DIRCHG)
4479			pagedep = dap->da_previous->dm_pagedep;
4480		else
4481			pagedep = dap->da_pagedep;
4482		LIST_REMOVE(dap, da_pdlist);
4483		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4484	}
4485	LIST_REMOVE(mkdir, md_mkdirs);
4486	WORKITEM_FREE(mkdir, D_MKDIR);
4487}
4488
4489/*
4490 * Called from within softdep_disk_write_complete above.
4491 * A write operation was just completed. Removed inodes can
4492 * now be freed and associated block pointers may be committed.
4493 * Note that this routine is always called from interrupt level
4494 * with further splbio interrupts blocked.
4495 */
4496static int
4497handle_written_filepage(pagedep, bp)
4498	struct pagedep *pagedep;
4499	struct buf *bp;		/* buffer containing the written page */
4500{
4501	struct dirrem *dirrem;
4502	struct diradd *dap, *nextdap;
4503	struct direct *ep;
4504	int i, chgs;
4505
4506	if ((pagedep->pd_state & IOSTARTED) == 0) {
4507		lk.lkt_held = NOHOLDER;
4508		panic("handle_written_filepage: not started");
4509	}
4510	pagedep->pd_state &= ~IOSTARTED;
4511	/*
4512	 * Process any directory removals that have been committed.
4513	 */
4514	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4515		LIST_REMOVE(dirrem, dm_next);
4516		dirrem->dm_dirinum = pagedep->pd_ino;
4517		add_to_worklist(&dirrem->dm_list);
4518	}
4519	/*
4520	 * Free any directory additions that have been committed.
4521	 * If it is a newly allocated block, we have to wait until
4522	 * the on-disk directory inode claims the new block.
4523	 */
4524	if ((pagedep->pd_state & NEWBLOCK) == 0)
4525		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4526			free_diradd(dap);
4527	/*
4528	 * Uncommitted directory entries must be restored.
4529	 */
4530	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4531		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4532		     dap = nextdap) {
4533			nextdap = LIST_NEXT(dap, da_pdlist);
4534			if (dap->da_state & ATTACHED) {
4535				lk.lkt_held = NOHOLDER;
4536				panic("handle_written_filepage: attached");
4537			}
4538			ep = (struct direct *)
4539			    ((char *)bp->b_data + dap->da_offset);
4540			ep->d_ino = dap->da_newinum;
4541			dap->da_state &= ~UNDONE;
4542			dap->da_state |= ATTACHED;
4543			chgs = 1;
4544			/*
4545			 * If the inode referenced by the directory has
4546			 * been written out, then the dependency can be
4547			 * moved to the pending list.
4548			 */
4549			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4550				LIST_REMOVE(dap, da_pdlist);
4551				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4552				    da_pdlist);
4553			}
4554		}
4555	}
4556	/*
4557	 * If there were any rollbacks in the directory, then it must be
4558	 * marked dirty so that it will eventually get written back in
4559	 * its correct form.
4560	 */
4561	if (chgs) {
4562		if ((bp->b_flags & B_DELWRI) == 0)
4563			stat_dir_entry++;
4564		bdirty(bp);
4565		return (1);
4566	}
4567	/*
4568	 * If we are not waiting for a new directory block to be
4569	 * claimed by its inode, then the pagedep will be freed.
4570	 * Otherwise it will remain to track any new entries on
4571	 * the page in case they are fsync'ed.
4572	 */
4573	if ((pagedep->pd_state & NEWBLOCK) == 0) {
4574		LIST_REMOVE(pagedep, pd_hash);
4575		WORKITEM_FREE(pagedep, D_PAGEDEP);
4576	}
4577	return (0);
4578}
4579
4580/*
4581 * Writing back in-core inode structures.
4582 *
4583 * The filesystem only accesses an inode's contents when it occupies an
4584 * "in-core" inode structure.  These "in-core" structures are separate from
4585 * the page frames used to cache inode blocks.  Only the latter are
4586 * transferred to/from the disk.  So, when the updated contents of the
4587 * "in-core" inode structure are copied to the corresponding in-memory inode
4588 * block, the dependencies are also transferred.  The following procedure is
4589 * called when copying a dirty "in-core" inode to a cached inode block.
4590 */
4591
4592/*
4593 * Called when an inode is loaded from disk. If the effective link count
4594 * differed from the actual link count when it was last flushed, then we
4595 * need to ensure that the correct effective link count is put back.
4596 */
4597void
4598softdep_load_inodeblock(ip)
4599	struct inode *ip;	/* the "in_core" copy of the inode */
4600{
4601	struct inodedep *inodedep;
4602
4603	/*
4604	 * Check for alternate nlink count.
4605	 */
4606	ip->i_effnlink = ip->i_nlink;
4607	ACQUIRE_LOCK(&lk);
4608	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4609		FREE_LOCK(&lk);
4610		return;
4611	}
4612	ip->i_effnlink -= inodedep->id_nlinkdelta;
4613	if (inodedep->id_state & SPACECOUNTED)
4614		ip->i_flag |= IN_SPACECOUNTED;
4615	FREE_LOCK(&lk);
4616}
4617
4618/*
4619 * This routine is called just before the "in-core" inode
4620 * information is to be copied to the in-memory inode block.
4621 * Recall that an inode block contains several inodes. If
4622	 * the waitfor flag is set, then the dependencies will be
4623 * cleared so that the update can always be made. Note that
4624 * the buffer is locked when this routine is called, so we
4625 * will never be in the middle of writing the inode block
4626 * to disk.
4627 */
4628void
4629softdep_update_inodeblock(ip, bp, waitfor)
4630	struct inode *ip;	/* the "in_core" copy of the inode */
4631	struct buf *bp;		/* the buffer containing the inode block */
4632	int waitfor;		/* nonzero => update must be allowed */
4633{
4634	struct inodedep *inodedep;
4635	struct worklist *wk;
4636	struct buf *ibp;
4637	int error;
4638
4639	/*
4640	 * If the effective link count is not equal to the actual link
4641	 * count, then we must track the difference in an inodedep while
4642	 * the inode is (potentially) tossed out of the cache. Otherwise,
4643	 * if there is no existing inodedep, then there are no dependencies
4644	 * to track.
4645	 */
4646	ACQUIRE_LOCK(&lk);
4647	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4648		FREE_LOCK(&lk);
4649		if (ip->i_effnlink != ip->i_nlink)
4650			panic("softdep_update_inodeblock: bad link count");
4651		return;
4652	}
4653	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
4654		FREE_LOCK(&lk);
4655		panic("softdep_update_inodeblock: bad delta");
4656	}
4657	/*
4658	 * Changes have been initiated. Anything depending on these
4659	 * changes cannot occur until this inode has been written.
4660	 */
4661	inodedep->id_state &= ~COMPLETE;
4662	if ((inodedep->id_state & ONWORKLIST) == 0)
4663		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4664	/*
4665	 * Any new dependencies associated with the incore inode must
4666	 * now be moved to the list associated with the buffer holding
4667	 * the in-memory copy of the inode. Once merged process any
4668	 * allocdirects that are completed by the merger.
4669	 */
4670	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
4671	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
4672		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4673	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
4674	if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL)
4675		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
4676	/*
4677	 * Now that the inode has been pushed into the buffer, the
4678	 * operations dependent on the inode being written to disk
4679	 * can be moved to the id_bufwait so that they will be
4680	 * processed when the buffer I/O completes.
4681	 */
4682	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4683		WORKLIST_REMOVE(wk);
4684		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4685	}
4686	/*
4687	 * Newly allocated inodes cannot be written until the bitmap
4688	 * that allocates them has been written (indicated by
4689	 * DEPCOMPLETE being set in id_state). If we are doing a
4690	 * forced sync (e.g., an fsync on a file), we force the bitmap
4691	 * to be written so that the update can be done.
4692	 */
4693	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
4694		FREE_LOCK(&lk);
4695		return;
4696	}
4697	ibp = getdirtybuf(&inodedep->id_buf, NULL, MNT_WAIT);
4698	FREE_LOCK(&lk);
4699	if (ibp && (error = BUF_WRITE(ibp)) != 0)
4700		softdep_error("softdep_update_inodeblock: bwrite", error);
4701	if ((inodedep->id_state & DEPCOMPLETE) == 0)
4702		panic("softdep_update_inodeblock: update failed");
4703}
4704
4705/*
4706 * Merge a new inode dependency list (such as id_newinoupdt) into an
4707 * old inode dependency list (such as id_inoupdt). This routine must be
4708 * called with splbio interrupts blocked.
4709 */
4710static void
4711merge_inode_lists(newlisthead, oldlisthead)
4712	struct allocdirectlst *newlisthead;
4713	struct allocdirectlst *oldlisthead;
4714{
4715	struct allocdirect *listadp, *newadp;
4716
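	/*
	 * Both lists are kept sorted by logical block number, so this is a
	 * simple list merge; entries for the same block are combined by
	 * allocdirect_merge().
	 */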
4717	newadp = TAILQ_FIRST(newlisthead);
4718	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
4719		if (listadp->ad_lbn < newadp->ad_lbn) {
4720			listadp = TAILQ_NEXT(listadp, ad_next);
4721			continue;
4722		}
4723		TAILQ_REMOVE(newlisthead, newadp, ad_next);
4724		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4725		if (listadp->ad_lbn == newadp->ad_lbn) {
4726			allocdirect_merge(oldlisthead, newadp,
4727			    listadp);
4728			listadp = newadp;
4729		}
4730		newadp = TAILQ_FIRST(newlisthead);
4731	}
4732	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
4733		TAILQ_REMOVE(newlisthead, newadp, ad_next);
4734		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
4735	}
4736}
4737
4738/*
4739 * If we are doing an fsync, then we must ensure that any directory
4740 * entries for the inode have been written after the inode gets to disk.
4741 */
4742int
4743softdep_fsync(vp)
4744	struct vnode *vp;	/* the "in_core" copy of the inode */
4745{
4746	struct inodedep *inodedep;
4747	struct pagedep *pagedep;
4748	struct worklist *wk;
4749	struct diradd *dap;
4750	struct mount *mnt;
4751	struct vnode *pvp;
4752	struct inode *ip;
4753	struct buf *bp;
4754	struct fs *fs;
4755	struct thread *td = curthread;
4756	int error, flushparent;
4757	ino_t parentino;
4758	ufs_lbn_t lbn;
4759
4760	ip = VTOI(vp);
4761	fs = ip->i_fs;
4762	ACQUIRE_LOCK(&lk);
4763	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
4764		FREE_LOCK(&lk);
4765		return (0);
4766	}
4767	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
4768	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
4769	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
4770	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
4771	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
4772	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
4773		FREE_LOCK(&lk);
4774		panic("softdep_fsync: pending ops");
4775	}
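	/*
	 * Loop until every directory entry naming this inode (each tracked
	 * by a diradd on id_pendinghd) has been written, flushing the parent
	 * directory first whenever the entry depends on a mkdir or lies in
	 * a newly allocated directory block.
	 */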
4776	for (error = 0, flushparent = 0; ; ) {
4777		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
4778			break;
4779		if (wk->wk_type != D_DIRADD) {
4780			FREE_LOCK(&lk);
4781			panic("softdep_fsync: Unexpected type %s",
4782			    TYPENAME(wk->wk_type));
4783		}
4784		dap = WK_DIRADD(wk);
4785		/*
4786		 * Flush our parent if this directory entry has a MKDIR_PARENT
4787		 * dependency or is contained in a newly allocated block.
4788		 */
4789		if (dap->da_state & DIRCHG)
4790			pagedep = dap->da_previous->dm_pagedep;
4791		else
4792			pagedep = dap->da_pagedep;
4793		mnt = pagedep->pd_mnt;
4794		parentino = pagedep->pd_ino;
4795		lbn = pagedep->pd_lbn;
4796		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4797			FREE_LOCK(&lk);
4798			panic("softdep_fsync: dirty");
4799		}
4800		if ((dap->da_state & MKDIR_PARENT) ||
4801		    (pagedep->pd_state & NEWBLOCK))
4802			flushparent = 1;
4803		else
4804			flushparent = 0;
4805		/*
4806		 * If we are being fsync'ed as part of vgone'ing this vnode,
4807		 * then we will not be able to release and recover the
4808		 * vnode below, so we just have to give up on writing its
4809		 * directory entry out. It will eventually be written, just
4810		 * not now, but then the user was not asking to have it
4811		 * written, so we are not breaking any promises.
4812		 */
4813		mp_fixme("This operation is not atomic wrt the rest of the code");
4814		VI_LOCK(vp);
4815		if (vp->v_iflag & VI_XLOCK) {
4816			VI_UNLOCK(vp);
4817			break;
4818		} else
4819			VI_UNLOCK(vp);
4820		/*
4821		 * We prevent deadlock by always fetching inodes from the
4822		 * root, moving down the directory tree. Thus, when fetching
4823		 * our parent directory, we first try to get the lock. If
4824		 * that fails, we must unlock ourselves before requesting
4825		 * the lock on our parent. See the comment in ufs_lookup
4826		 * for details on possible races.
4827		 */
4828		FREE_LOCK(&lk);
4829		if (VFS_VGET(mnt, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
4830			VOP_UNLOCK(vp, 0, td);
4831			error = VFS_VGET(mnt, parentino, LK_EXCLUSIVE, &pvp);
4832			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
4833			if (error != 0)
4834				return (error);
4835		}
4836		/*
4837		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
4838		 * that are contained in direct blocks will be resolved by
4839		 * doing a UFS_UPDATE. Pagedeps contained in indirect blocks
4840		 * may require a complete sync'ing of the directory. So, we
4841		 * try the cheap and fast UFS_UPDATE first, and if that fails,
4842		 * then we do the slower VOP_FSYNC of the directory.
4843		 */
4844		if (flushparent) {
4845			if ((error = UFS_UPDATE(pvp, 1)) != 0) {
4846				vput(pvp);
4847				return (error);
4848			}
4849			if ((pagedep->pd_state & NEWBLOCK) &&
4850			    (error = VOP_FSYNC(pvp, td->td_ucred, MNT_WAIT, td))) {
4851				vput(pvp);
4852				return (error);
4853			}
4854		}
4855		/*
4856		 * Flush directory page containing the inode's name.
4857		 */
4858		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
4859		    &bp);
4860		if (error == 0)
4861			error = BUF_WRITE(bp);
4862		else
4863			brelse(bp);
4864		vput(pvp);
4865		if (error != 0)
4866			return (error);
4867		ACQUIRE_LOCK(&lk);
4868		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4869			break;
4870	}
4871	FREE_LOCK(&lk);
4872	return (0);
4873}
4874
4875/*
4876 * Flush all the dirty bitmaps associated with the block device
4877 * before flushing the rest of the dirty blocks so as to reduce
4878 * the number of dependencies that will have to be rolled back.
4879 */
4880void
4881softdep_fsync_mountdev(vp)
4882	struct vnode *vp;
4883{
4884	struct buf *bp, *nbp;
4885	struct worklist *wk;
4886
4887	if (!vn_isdisk(vp, NULL))
4888		panic("softdep_fsync_mountdev: vnode not a disk");
4889	ACQUIRE_LOCK(&lk);
4890	VI_LOCK(vp);
4891	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4892		nbp = TAILQ_NEXT(bp, b_vnbufs);
4893		/*
4894		 * If it is already scheduled, skip to the next buffer.
4895		 */
4896		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
4897			continue;
4898
4899		if ((bp->b_flags & B_DELWRI) == 0) {
4900			FREE_LOCK(&lk);
4901			panic("softdep_fsync_mountdev: not dirty");
4902		}
4903		/*
4904		 * We are only interested in bitmaps with outstanding
4905		 * dependencies.
4906		 */
4907		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4908		    wk->wk_type != D_BMSAFEMAP ||
4909		    (bp->b_vflags & BV_BKGRDINPROG)) {
4910			BUF_UNLOCK(bp);
4911			continue;
4912		}
4913		VI_UNLOCK(vp);
4914		bremfree(bp);
4915		FREE_LOCK(&lk);
4916		(void) bawrite(bp);
4917		ACQUIRE_LOCK(&lk);
4918		/*
4919		 * Since we may have slept during the I/O, we need
4920		 * to start from a known point.
4921		 */
4922		VI_LOCK(vp);
4923		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
4924	}
4925	drain_output(vp, 1);
4926	VI_UNLOCK(vp);
4927	FREE_LOCK(&lk);
4928}
4929
4930/*
4931 * This routine is called when we are trying to synchronously flush a
4932 * file. This routine must eliminate any filesystem metadata dependencies
4933 * so that the syncing routine can succeed by pushing the dirty blocks
4934 * associated with the file. If any I/O errors occur, they are returned.
4935 */
4936int
4937softdep_sync_metadata(ap)
4938	struct vop_fsync_args /* {
4939		struct vnode *a_vp;
4940		struct ucred *a_cred;
4941		int a_waitfor;
4942		struct thread *a_td;
4943	} */ *ap;
4944{
4945	struct vnode *vp = ap->a_vp;
4946	struct pagedep *pagedep;
4947	struct allocdirect *adp;
4948	struct allocindir *aip;
4949	struct buf *bp, *nbp;
4950	struct worklist *wk;
4951	int i, error, waitfor;
4952
4953	/*
4954	 * Check whether this vnode is involved in a filesystem
4955	 * that is doing soft dependency processing.
4956	 */
4957	if (!vn_isdisk(vp, NULL)) {
4958		if (!DOINGSOFTDEP(vp))
4959			return (0);
4960	} else
4961		if (vp->v_rdev->si_mountpoint == NULL ||
4962		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4963			return (0);
4964	/*
4965	 * Ensure that any direct block dependencies have been cleared.
4966	 */
4967	ACQUIRE_LOCK(&lk);
4968	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4969		FREE_LOCK(&lk);
4970		return (error);
4971	}
4972	/*
4973	 * For most files, the only metadata dependencies are the
4974	 * cylinder group maps that allocate their inode or blocks.
4975	 * The block allocation dependencies can be found by traversing
4976	 * the dependency lists for any buffers that remain on their
4977	 * dirty buffer list. The inode allocation dependency will
4978	 * be resolved when the inode is updated with MNT_WAIT.
4979	 * This work is done in two passes. The first pass grabs most
4980	 * of the buffers and begins asynchronously writing them. The
4981	 * only way to wait for these asynchronous writes is to sleep
4982	 * on the filesystem vnode which may stay busy for a long time
4983	 * if the filesystem is active. So, instead, we make a second
4984	 * pass over the dependencies blocking on each write. In the
4985	 * usual case we will be blocking against a write that we
4986	 * initiated, so when it is done the dependency will have been
4987	 * resolved. Thus the second pass is expected to end quickly.
4988	 */
4989	waitfor = MNT_NOWAIT;
4990top:
4991	/*
4992	 * We must wait for any I/O in progress to finish so that
4993	 * all potential buffers on the dirty list will be visible.
4994	 */
4995	VI_LOCK(vp);
4996	drain_output(vp, 1);
4997	bp = getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd),
4998	    VI_MTX(vp), MNT_WAIT);
4999	if (bp == NULL) {
5000		VI_UNLOCK(vp);
5001		FREE_LOCK(&lk);
5002		return (0);
5003	}
5004	/* While syncing snapshots, we must allow recursive lookups */
5005	bp->b_lock.lk_flags |= LK_CANRECURSE;
5006loop:
5007	/*
5008	 * As we hold the buffer locked, none of its dependencies
5009	 * will disappear.
5010	 */
5011	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5012		switch (wk->wk_type) {
5013
5014		case D_ALLOCDIRECT:
5015			adp = WK_ALLOCDIRECT(wk);
5016			if (adp->ad_state & DEPCOMPLETE)
5017				continue;
5018			nbp = getdirtybuf(&adp->ad_buf, NULL, waitfor);
5019			if (nbp == NULL)
5020				continue;
5021			FREE_LOCK(&lk);
5022			if (waitfor == MNT_NOWAIT) {
5023				bawrite(nbp);
5024			} else if ((error = BUF_WRITE(nbp)) != 0) {
5025				break;
5026			}
5027			ACQUIRE_LOCK(&lk);
5028			continue;
5029
5030		case D_ALLOCINDIR:
5031			aip = WK_ALLOCINDIR(wk);
5032			if (aip->ai_state & DEPCOMPLETE)
5033				continue;
5034			nbp = getdirtybuf(&aip->ai_buf, NULL, waitfor);
5035			if (nbp == NULL)
5036				continue;
5037			FREE_LOCK(&lk);
5038			if (waitfor == MNT_NOWAIT) {
5039				bawrite(nbp);
5040			} else if ((error = BUF_WRITE(nbp)) != 0) {
5041				break;
5042			}
5043			ACQUIRE_LOCK(&lk);
5044			continue;
5045
5046		case D_INDIRDEP:
5047		restart:
5048
5049			LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
5050				if (aip->ai_state & DEPCOMPLETE)
5051					continue;
5052				nbp = getdirtybuf(&aip->ai_buf, NULL, MNT_WAIT);
5053				if (nbp == NULL)
5054					goto restart;
5055				FREE_LOCK(&lk);
5056				if ((error = BUF_WRITE(nbp)) != 0) {
5057					break;
5058				}
5059				ACQUIRE_LOCK(&lk);
5060				goto restart;
5061			}
			/*
			 * A failed BUF_WRITE above has already released the
			 * softdep lock, so hand the error to the common error
			 * handling below instead of continuing the dependency
			 * walk unlocked.
			 */
			if (error != 0)
				break;
5062			continue;
5063
5064		case D_INODEDEP:
5065			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
5066			    WK_INODEDEP(wk)->id_ino)) != 0) {
5067				FREE_LOCK(&lk);
5068				break;
5069			}
5070			continue;
5071
5072		case D_PAGEDEP:
5073			/*
5074			 * We are trying to sync a directory that may
5075			 * have dependencies on both its own metadata
5076			 * and/or dependencies on the inodes of any
5077			 * recently allocated files. We walk its diradd
5078			 * lists pushing out the associated inode.
5079			 */
5080			pagedep = WK_PAGEDEP(wk);
5081			for (i = 0; i < DAHASHSZ; i++) {
5082				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
5083					continue;
5084				if ((error =
5085				    flush_pagedep_deps(vp, pagedep->pd_mnt,
5086						&pagedep->pd_diraddhd[i]))) {
5087					FREE_LOCK(&lk);
5088					break;
5089				}
5090			}
			/*
			 * A failed flush_pagedep_deps call above has already
			 * released the softdep lock, so hand the error to the
			 * common error handling below instead of continuing
			 * the dependency walk unlocked.
			 */
			if (error != 0)
				break;
5091			continue;
5092
5093		case D_MKDIR:
5094			/*
5095			 * This case should never happen if the vnode has
5096			 * been properly sync'ed. However, if this function
5097			 * is used at a place where the vnode has not yet
5098			 * been sync'ed, this dependency can show up. So,
5099			 * rather than panic, just flush it.
5100			 */
5101			nbp = getdirtybuf(&WK_MKDIR(wk)->md_buf, NULL, waitfor);
5102			if (nbp == NULL)
5103				continue;
5104			FREE_LOCK(&lk);
5105			if (waitfor == MNT_NOWAIT) {
5106				bawrite(nbp);
5107			} else if ((error = BUF_WRITE(nbp)) != 0) {
5108				break;
5109			}
5110			ACQUIRE_LOCK(&lk);
5111			continue;
5112
5113		case D_BMSAFEMAP:
5114			/*
5115			 * This case should never happen if the vnode has
5116			 * been properly sync'ed. However, if this function
5117			 * is used at a place where the vnode has not yet
5118			 * been sync'ed, this dependency can show up. So,
5119			 * rather than panic, just flush it.
5120			 */
5121			nbp = getdirtybuf(&WK_BMSAFEMAP(wk)->sm_buf,
5122			    NULL, waitfor);
5123			if (nbp == NULL)
5124				continue;
5125			FREE_LOCK(&lk);
5126			if (waitfor == MNT_NOWAIT) {
5127				bawrite(nbp);
5128			} else if ((error = BUF_WRITE(nbp)) != 0) {
5129				break;
5130			}
5131			ACQUIRE_LOCK(&lk);
5132			continue;
5133
5134		default:
5135			FREE_LOCK(&lk);
5136			panic("softdep_sync_metadata: Unknown type %s",
5137			    TYPENAME(wk->wk_type));
5138			/* NOTREACHED */
5139		}
5140		/* We reach this point only on error, with the softdep lock released. */
5141		if (error == 0)
5142			panic("softdep_sync_metadata: zero error");
5143		bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5144		bawrite(bp);
5145		return (error);
5146	}
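	/*
	 * Pick up the next dirty buffer on the vnode (if any) before
	 * starting an asynchronous write of the current one, then
	 * continue the scan with that buffer.
	 */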
5147	VI_LOCK(vp);
5148	nbp = getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), VI_MTX(vp), MNT_WAIT);
5149	if (nbp == NULL)
5150		VI_UNLOCK(vp);
5151	FREE_LOCK(&lk);
5152	bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5153	bawrite(bp);
5154	ACQUIRE_LOCK(&lk);
5155	if (nbp != NULL) {
5156		bp = nbp;
5157		goto loop;
5158	}
5159	/*
5160	 * The brief unlock is to allow any pent up dependency
5161	 * processing to be done. Then proceed with the second pass.
5162	 */
5163	if (waitfor == MNT_NOWAIT) {
5164		waitfor = MNT_WAIT;
5165		FREE_LOCK(&lk);
5166		ACQUIRE_LOCK(&lk);
5167		goto top;
5168	}
5169
5170	/*
5171	 * If we have managed to get rid of all the dirty buffers,
5172	 * then we are done. For certain directories and block
5173	 * devices, we may need to do further work.
5174	 *
5175	 * We must wait for any I/O in progress to finish so that
5176	 * all potential buffers on the dirty list will be visible.
5177	 */
5178	VI_LOCK(vp);
5179	drain_output(vp, 1);
5180	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
5181		VI_UNLOCK(vp);
5182		FREE_LOCK(&lk);
5183		return (0);
5184	}
5185	VI_UNLOCK(vp);
5186
5187	FREE_LOCK(&lk);
5188	/*
5189	 * If we are trying to sync a block device, some of its buffers may
5190	 * contain metadata that cannot be written until the contents of some
5191	 * partially written files have been written to disk. The only easy
5192	 * way to accomplish this is to sync the entire filesystem (luckily
5193	 * this happens rarely).
5194	 */
5195	if (vn_isdisk(vp, NULL) &&
5196	    vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
5197	    (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
5198	     ap->a_td)) != 0)
5199		return (error);
5200	return (0);
5201}
5202
5203/*
5204 * Flush the dependencies associated with an inodedep.
5205 * Called with splbio blocked.
5206 */
5207static int
5208flush_inodedep_deps(fs, ino)
5209	struct fs *fs;
5210	ino_t ino;
5211{
5212	struct inodedep *inodedep;
5213	int error, waitfor;
5214
5215	/*
5216	 * This work is done in two passes. The first pass grabs most
5217	 * of the buffers and begins asynchronously writing them. The
5218	 * only way to wait for these asynchronous writes is to sleep
5219	 * on the filesystem vnode which may stay busy for a long time
5220	 * if the filesystem is active. So, instead, we make a second
5221	 * pass over the dependencies blocking on each write. In the
5222	 * usual case we will be blocking against a write that we
5223	 * initiated, so when it is done the dependency will have been
5224	 * resolved. Thus the second pass is expected to end quickly.
5225	 * We give a brief window at the top of the loop to allow
5226	 * any pending I/O to complete.
5227	 */
5228	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
5229		if (error)
5230			return (error);
5231		FREE_LOCK(&lk);
5232		ACQUIRE_LOCK(&lk);
5233		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5234			return (0);
5235		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
5236		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
5237		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
5238		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
5239			continue;
5240		/*
5241		 * If this was the second (MNT_WAIT) pass, we are done;
		 * otherwise move on to the second pass.
5242		 */
5243		if (waitfor == MNT_WAIT)
5244			break;
5245		waitfor = MNT_WAIT;
5246	}
5247	/*
5248	 * Try freeing inodedep in case all dependencies have been removed.
5249	 */
5250	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
5251		(void) free_inodedep(inodedep);
5252	return (0);
5253}
5254
5255/*
5256 * Flush an inode dependency list.
5257 * Called with splbio blocked.
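 * A non-zero return means a buffer was pushed (or, when waiting, could not
 * be obtained) and the softdep lock may have been dropped, so the caller
 * should rescan the lists; zero means every remaining entry is DEPCOMPLETE.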
5258 */
5259static int
5260flush_deplist(listhead, waitfor, errorp)
5261	struct allocdirectlst *listhead;
5262	int waitfor;
5263	int *errorp;
5264{
5265	struct allocdirect *adp;
5266	struct buf *bp;
5267
5268	TAILQ_FOREACH(adp, listhead, ad_next) {
5269		if (adp->ad_state & DEPCOMPLETE)
5270			continue;
5271		bp = getdirtybuf(&adp->ad_buf, NULL, waitfor);
5272		if (bp == NULL) {
5273			if (waitfor == MNT_NOWAIT)
5274				continue;
5275			return (1);
5276		}
5277		FREE_LOCK(&lk);
5278		if (waitfor == MNT_NOWAIT) {
5279			bawrite(bp);
5280		} else if ((*errorp = BUF_WRITE(bp)) != 0) {
5281			ACQUIRE_LOCK(&lk);
5282			return (1);
5283		}
5284		ACQUIRE_LOCK(&lk);
5285		return (1);
5286	}
5287	return (0);
5288}
5289
5290/*
5291 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5292 * Called with splbio blocked.
5293 */
5294static int
5295flush_pagedep_deps(pvp, mp, diraddhdp)
5296	struct vnode *pvp;
5297	struct mount *mp;
5298	struct diraddhd *diraddhdp;
5299{
5300	struct thread *td = curthread;
5301	struct inodedep *inodedep;
5302	struct ufsmount *ump;
5303	struct diradd *dap;
5304	struct vnode *vp;
5305	int error = 0;
5306	struct buf *bp;
5307	ino_t inum;
5308
5309	ump = VFSTOUFS(mp);
5310	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5311		/*
5312		 * Flush ourselves if this directory entry
5313		 * has a MKDIR_PARENT dependency.
5314		 */
5315		if (dap->da_state & MKDIR_PARENT) {
5316			FREE_LOCK(&lk);
5317			if ((error = UFS_UPDATE(pvp, 1)) != 0)
5318				break;
5319			ACQUIRE_LOCK(&lk);
5320			/*
5321			 * If that cleared dependencies, go on to next.
5322			 */
5323			if (dap != LIST_FIRST(diraddhdp))
5324				continue;
5325			if (dap->da_state & MKDIR_PARENT) {
5326				FREE_LOCK(&lk);
5327				panic("flush_pagedep_deps: MKDIR_PARENT");
5328			}
5329		}
5330		/*
5331		 * A newly allocated directory must have its "." and
5332		 * ".." entries written out before its name can be
5333		 * committed in its parent. We do not want or need
5334		 * the full semantics of a synchronous VOP_FSYNC as
5335		 * that may end up here again, once for each directory
5336		 * level in the filesystem. Instead, we push the blocks
5337		 * and wait for them to clear. We have to fsync twice
5338		 * because the first call may choose to defer blocks
5339		 * that still have dependencies, but deferral will
5340		 * happen at most once.
5341		 */
5342		inum = dap->da_newinum;
5343		if (dap->da_state & MKDIR_BODY) {
5344			FREE_LOCK(&lk);
5345			if ((error = VFS_VGET(mp, inum, LK_EXCLUSIVE, &vp)))
5346				break;
5347			if ((error=VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)) ||
5348			    (error=VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td))) {
5349				vput(vp);
5350				break;
5351			}
5352			VI_LOCK(vp);
5353			drain_output(vp, 0);
5354			VI_UNLOCK(vp);
5355			vput(vp);
5356			ACQUIRE_LOCK(&lk);
5357			/*
5358			 * If that cleared dependencies, go on to next.
5359			 */
5360			if (dap != LIST_FIRST(diraddhdp))
5361				continue;
5362			if (dap->da_state & MKDIR_BODY) {
5363				FREE_LOCK(&lk);
5364				panic("flush_pagedep_deps: MKDIR_BODY");
5365			}
5366		}
5367		/*
5368		 * Flush the inode on which the directory entry depends.
5369		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5370		 * the only remaining dependency is that the updated inode
5371		 * count must get pushed to disk. The inode has already
5372		 * been pushed into its inode buffer (via VOP_UPDATE) at
5373		 * the time of the reference count change. So we need only
5374		 * locate that buffer, ensure that there will be no rollback
5375		 * caused by a bitmap dependency, then write the inode buffer.
5376		 */
5377		if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
5378			FREE_LOCK(&lk);
5379			panic("flush_pagedep_deps: lost inode");
5380		}
5381		/*
5382		 * If the inode still has bitmap dependencies,
5383		 * push them to disk.
5384		 */
5385		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5386			bp = getdirtybuf(&inodedep->id_buf, NULL, MNT_WAIT);
5387			FREE_LOCK(&lk);
5388			if (bp && (error = BUF_WRITE(bp)) != 0)
5389				break;
5390			ACQUIRE_LOCK(&lk);
5391			if (dap != LIST_FIRST(diraddhdp))
5392				continue;
5393		}
5394		/*
5395		 * If the inode is still sitting in a buffer waiting
5396		 * to be written, push it to disk.
5397		 */
5398		FREE_LOCK(&lk);
5399		if ((error = bread(ump->um_devvp,
5400		    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5401		    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5402			brelse(bp);
5403			break;
5404		}
5405		if ((error = BUF_WRITE(bp)) != 0)
5406			break;
5407		ACQUIRE_LOCK(&lk);
5408		/*
5409		 * If we have failed to get rid of all the dependencies
5410		 * then something is seriously wrong.
5411		 */
5412		if (dap == LIST_FIRST(diraddhdp)) {
5413			FREE_LOCK(&lk);
5414			panic("flush_pagedep_deps: flush failed");
5415		}
5416	}
5417	if (error)
5418		ACQUIRE_LOCK(&lk);
5419	return (error);
5420}
5421
5422/*
5423 * A large burst of file addition or deletion activity can drive the
5424 * memory load excessively high. First attempt to slow things down
5425 * using the techniques below. If that fails, this routine requests
5426 * the offending operations to fall back to running synchronously
5427 * until the memory load returns to a reasonable level.
5428 */
5429int
5430softdep_slowdown(vp)
5431	struct vnode *vp;
5432{
5433	int max_softdeps_hard;
5434
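	/*
	 * Allow the dependency counts to run 10% over max_softdeps before
	 * forcing the offending operations to run synchronously.
	 */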
5435	max_softdeps_hard = max_softdeps * 11 / 10;
5436	if (num_dirrem < max_softdeps_hard / 2 &&
5437	    num_inodedep < max_softdeps_hard &&
5438	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps)
5439		return (0);
5440	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
5441		speedup_syncer();
5442	stat_sync_limit_hit += 1;
5443	return (1);
5444}
5445
5446/*
5447 * Called by the allocation routines when they are about to fail
5448 * in the hope that we can free up some disk space.
5449 *
5450 * First check to see if the work list has anything on it. If it has,
5451 * clean up entries until we successfully free some space. Because this
5452 * process holds inodes locked, we cannot handle any remove requests
5453 * that might block on a locked inode as that could lead to deadlock.
5454 * If the worklist yields no free space, encourage the syncer daemon
5455 * to help us. In no event will we try for longer than tickdelay seconds.
5456 */
5457int
5458softdep_request_cleanup(fs, vp)
5459	struct fs *fs;
5460	struct vnode *vp;
5461{
5462	long starttime;
5463	ufs2_daddr_t needed;
5464
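	/*
	 * Keep working until the free block count has grown by at least
	 * fs_contigsumsize blocks, no block-freeing work remains pending,
	 * or tickdelay seconds have elapsed.
	 */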
5465	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
5466	starttime = time_second + tickdelay;
5467	/*
5468	 * If we are being called because of a process doing a
5469	 * copy-on-write, then it is not safe to update the vnode
5470	 * as we may recurse into the copy-on-write routine.
5471	 */
5472	if ((curthread->td_proc->p_flag & P_COWINPROGRESS) == 0 &&
5473	    UFS_UPDATE(vp, 1) != 0)
5474		return (0);
5475	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
5476		if (time_second > starttime)
5477			return (0);
5478		if (num_on_worklist > 0 &&
5479		    process_worklist_item(NULL, LK_NOWAIT) != -1) {
5480			stat_worklist_push += 1;
5481			continue;
5482		}
5483		request_cleanup(FLUSH_REMOVE_WAIT, 0);
5484	}
5485	return (1);
5486}
5487
5488/*
5489 * If memory utilization has gotten too high, deliberately slow things
5490 * down and speed up the I/O processing.
5491 */
5492static int
5493request_cleanup(resource, islocked)
5494	int resource;
5495	int islocked;
5496{
5497	struct thread *td = curthread;
5498
5499	/*
5500	 * We never hold up the filesystem syncer process.
5501	 */
5502	if (td == filesys_syncer)
5503		return (0);
5504	/*
5505	 * First check to see if the work list has gotten backlogged.
5506	 * If it has, co-opt this process to help clean up two entries.
5507	 * Because this process may hold inodes locked, we cannot
5508	 * handle any remove requests that might block on a locked
5509	 * inode as that could lead to deadlock.
5510	 */
5511	if (num_on_worklist > max_softdeps / 10) {
5512		if (islocked)
5513			FREE_LOCK(&lk);
5514		process_worklist_item(NULL, LK_NOWAIT);
5515		process_worklist_item(NULL, LK_NOWAIT);
5516		stat_worklist_push += 2;
5517		if (islocked)
5518			ACQUIRE_LOCK(&lk);
5519		return(1);
5520	}
5521	/*
5522	 * Next, we attempt to speed up the syncer process. If that
5523	 * is successful, then we allow the process to continue.
5524	 */
5525	if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
5526		return(0);
5527	/*
5528	 * If we are resource constrained on inode dependencies, try
5529	 * flushing some dirty inodes. Otherwise, we are constrained
5530	 * by file deletions, so try accelerating flushes of directories
5531	 * with removal dependencies. We would like to do the cleanup
5532	 * here, but we probably hold an inode locked at this point and
5533	 * that might deadlock against one that we try to clean. So,
5534	 * the best that we can do is request the syncer daemon to do
5535	 * the cleanup for us.
5536	 */
5537	switch (resource) {
5538
5539	case FLUSH_INODES:
5540		stat_ino_limit_push += 1;
5541		req_clear_inodedeps += 1;
5542		stat_countp = &stat_ino_limit_hit;
5543		break;
5544
5545	case FLUSH_REMOVE:
5546	case FLUSH_REMOVE_WAIT:
5547		stat_blk_limit_push += 1;
5548		req_clear_remove += 1;
5549		stat_countp = &stat_blk_limit_hit;
5550		break;
5551
5552	default:
5553		if (islocked)
5554			FREE_LOCK(&lk);
5555		panic("request_cleanup: unknown type");
5556	}
5557	/*
5558	 * Hopefully the syncer daemon will catch up and awaken us.
5559	 * We wait at most tickdelay before proceeding in any case.
5560	 */
5561	if (islocked == 0)
5562		ACQUIRE_LOCK(&lk);
5563	proc_waiting += 1;
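	/* Arm the pause timer, if not already running, for at least two ticks. */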
5564	if (handle.callout == NULL)
5565		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5566	interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, NULL, PPAUSE,
5567	    "softupdate", 0);
5568	proc_waiting -= 1;
5569	if (islocked == 0)
5570		FREE_LOCK(&lk);
5571	return (1);
5572}
5573
5574/*
5575 * Awaken processes pausing in request_cleanup and clear proc_waiting
5576 * to indicate that there is no longer a timer running.
5577 */
5578static void
5579pause_timer(arg)
5580	void *arg;
5581{
5582
5583	*stat_countp += 1;
5584	wakeup_one(&proc_waiting);
5585	if (proc_waiting > 0)
5586		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5587	else
5588		handle.callout = NULL;
5589}
5590
5591/*
5592 * Flush out a directory with at least one removal dependency in an effort to
5593 * reduce the number of dirrem, freefile, and freeblks dependency structures.
5594 */
5595static void
5596clear_remove(td)
5597	struct thread *td;
5598{
5599	struct pagedep_hashhead *pagedephd;
5600	struct pagedep *pagedep;
5601	static int next = 0;
5602	struct mount *mp;
5603	struct vnode *vp;
5604	int error, cnt;
5605	ino_t ino;
5606
5607	ACQUIRE_LOCK(&lk);
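	/*
	 * Scan the pagedep hash chains round-robin, resuming where the
	 * previous call left off, until a pagedep with a pending removal
	 * dependency is found.
	 */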
5608	for (cnt = 0; cnt < pagedep_hash; cnt++) {
5609		pagedephd = &pagedep_hashtbl[next++];
5610		if (next >= pagedep_hash)
5611			next = 0;
5612		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5613			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
5614				continue;
5615			mp = pagedep->pd_mnt;
5616			ino = pagedep->pd_ino;
5617			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5618				continue;
5619			FREE_LOCK(&lk);
5620			if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp))) {
5621				softdep_error("clear_remove: vget", error);
5622				vn_finished_write(mp);
5623				return;
5624			}
5625			if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)))
5626				softdep_error("clear_remove: fsync", error);
5627			VI_LOCK(vp);
5628			drain_output(vp, 0);
5629			VI_UNLOCK(vp);
5630			vput(vp);
5631			vn_finished_write(mp);
5632			return;
5633		}
5634	}
5635	FREE_LOCK(&lk);
5636}
5637
5638/*
5639 * Clear out a block of dirty inodes in an effort to reduce
5640 * the number of inodedep dependency structures.
5641 */
5642static void
5643clear_inodedeps(td)
5644	struct thread *td;
5645{
5646	struct inodedep_hashhead *inodedephd;
5647	struct inodedep *inodedep;
5648	static int next = 0;
5649	struct mount *mp;
5650	struct vnode *vp;
5651	struct fs *fs;
5652	int error, cnt;
5653	ino_t firstino, lastino, ino;
5654
5655	ACQUIRE_LOCK(&lk);
5656	/*
5657	 * Pick a random inode dependency to be cleared.
5658	 * We will then gather up all the inodes in its block
5659	 * that have dependencies and flush them out.
5660	 */
5661	for (cnt = 0; cnt < inodedep_hash; cnt++) {
5662		inodedephd = &inodedep_hashtbl[next++];
5663		if (next >= inodedep_hash)
5664			next = 0;
5665		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5666			break;
5667	}
5668	if (inodedep == NULL) {
		FREE_LOCK(&lk);
5669		return;
	}
5670	/*
5671	 * Ugly code to find mount point given pointer to superblock.
5672	 */
5673	fs = inodedep->id_fs;
5674	TAILQ_FOREACH(mp, &mountlist, mnt_list)
5675		if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
5676			break;
5677	/*
5678	 * Find the last inode in the block with dependencies.
5679	 */
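	/*
	 * INOPB(fs) is a power of two, so the mask rounds id_ino down to
	 * the first inode of its inode block.
	 */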
5680	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5681	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5682		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
5683			break;
5684	/*
5685	 * Asynchronously push all but the last inode with dependencies.
5686	 * Synchronously push the last inode with dependencies to ensure
5687	 * that the inode block gets written to free up the inodedeps.
5688	 */
5689	for (ino = firstino; ino <= lastino; ino++) {
5690		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5691			continue;
5692		FREE_LOCK(&lk);
5693		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
			/*
			 * Reacquire the softdep lock before moving on so the
			 * next inodedep_lookup and the final FREE_LOCK stay
			 * balanced.
			 */
			ACQUIRE_LOCK(&lk);
5694			continue;
		}
5695		if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
5696			softdep_error("clear_inodedeps: vget", error);
5697			vn_finished_write(mp);
5698			return;
5699		}
5700		if (ino == lastino) {
5701			if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_WAIT, td)))
5702				softdep_error("clear_inodedeps: fsync1", error);
5703		} else {
5704			if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)))
5705				softdep_error("clear_inodedeps: fsync2", error);
5706			VI_LOCK(vp);
5707			drain_output(vp, 0);
5708			VI_UNLOCK(vp);
5709		}
5710		vput(vp);
5711		vn_finished_write(mp);
5712		ACQUIRE_LOCK(&lk);
5713	}
5714	FREE_LOCK(&lk);
5715}
5716
5717/*
5718 * Function to determine if the buffer has outstanding dependencies
5719 * that will cause a roll-back if the buffer is written. If wantcount
5720 * is set, return number of dependencies, otherwise just yes or no.
5721 */
5722static int
5723softdep_count_dependencies(bp, wantcount)
5724	struct buf *bp;
5725	int wantcount;
5726{
5727	struct worklist *wk;
5728	struct inodedep *inodedep;
5729	struct indirdep *indirdep;
5730	struct allocindir *aip;
5731	struct pagedep *pagedep;
5732	struct diradd *dap;
5733	int i, retval;
5734
5735	retval = 0;
5736	ACQUIRE_LOCK(&lk);
5737	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5738		switch (wk->wk_type) {
5739
5740		case D_INODEDEP:
5741			inodedep = WK_INODEDEP(wk);
5742			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5743				/* bitmap allocation dependency */
5744				retval += 1;
5745				if (!wantcount)
5746					goto out;
5747			}
5748			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
5749				/* direct block pointer dependency */
5750				retval += 1;
5751				if (!wantcount)
5752					goto out;
5753			}
5754			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
5755				/* direct block pointer dependency (extended attribute data) */
5756				retval += 1;
5757				if (!wantcount)
5758					goto out;
5759			}
5760			continue;
5761
5762		case D_INDIRDEP:
5763			indirdep = WK_INDIRDEP(wk);
5764
5765			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
5766				/* indirect block pointer dependency */
5767				retval += 1;
5768				if (!wantcount)
5769					goto out;
5770			}
5771			continue;
5772
5773		case D_PAGEDEP:
5774			pagedep = WK_PAGEDEP(wk);
5775			for (i = 0; i < DAHASHSZ; i++) {
5776
5777				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
5778					/* directory entry dependency */
5779					retval += 1;
5780					if (!wantcount)
5781						goto out;
5782				}
5783			}
5784			continue;
5785
5786		case D_BMSAFEMAP:
5787		case D_ALLOCDIRECT:
5788		case D_ALLOCINDIR:
5789		case D_MKDIR:
5790			/* never a dependency on these blocks */
5791			continue;
5792
5793		default:
5794			FREE_LOCK(&lk);
5795			panic("softdep_count_dependencies: Unexpected type %s",
5796			    TYPENAME(wk->wk_type));
5797			/* NOTREACHED */
5798		}
5799	}
5800out:
5801	FREE_LOCK(&lk);
5802	return (retval);
5803}
5804
5805/*
5806 * Acquire exclusive access to a buffer.
5807 * Must be called with splbio blocked.
5808 * Return acquired buffer or NULL on failure.  mtx, if provided, will be
5809 * released on success but held on failure.
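 * With MNT_WAIT the routine sleeps until the buffer can be locked; otherwise
 * a busy buffer yields an immediate NULL return.  A buffer that is no longer
 * dirty is returned as NULL in either case.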
5810 */
5811static struct buf *
5812getdirtybuf(bpp, mtx, waitfor)
5813	struct buf **bpp;
5814	struct mtx *mtx;
5815	int waitfor;
5816{
5817	struct buf *bp;
5818	int error;
5819
5820	/*
5821	 * XXX This code and the code that calls it need to be reviewed to
5822	 * verify its use of the vnode interlock.
5823	 */
5824
5825	for (;;) {
5826		if ((bp = *bpp) == NULL)
5827			return (NULL);
5828		if (bp->b_vp == NULL)
5829			backtrace();
5830		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
5831			if ((bp->b_vflags & BV_BKGRDINPROG) == 0)
5832				break;
5833			BUF_UNLOCK(bp);
5834			if (waitfor != MNT_WAIT)
5835				return (NULL);
5836			/*
5837			 * The mtx argument must be bp->b_vp's mutex in
5838			 * this case.
5839			 */
5840			ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
5841			bp->b_vflags |= BV_BKGRDWAIT;
5842			interlocked_sleep(&lk, SLEEP, &bp->b_xflags, mtx,
5843			    PRIBIO, "getbuf", 0);
5844			continue;
5845		}
5846		if (waitfor != MNT_WAIT)
5847			return (NULL);
5848		if (mtx) {
5849			error = interlocked_sleep(&lk, LOCKBUF, bp, mtx,
5850			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 0, 0);
5851			mtx_lock(mtx);
5852		} else
5853			error = interlocked_sleep(&lk, LOCKBUF, bp, NULL,
5854			    LK_EXCLUSIVE | LK_SLEEPFAIL, 0, 0);
5855		if (error != ENOLCK) {
5856			FREE_LOCK(&lk);
5857			panic("getdirtybuf: inconsistent lock");
5858		}
5859	}
5860	if ((bp->b_flags & B_DELWRI) == 0) {
5861		BUF_UNLOCK(bp);
5862		return (NULL);
5863	}
5864	if (mtx)
5865		mtx_unlock(mtx);
5866	bremfree(bp);
5867	return (bp);
5868}
5869
5870/*
5871 * Wait for pending output on a vnode to complete.
5872 * Must be called with vnode lock and interlock locked.
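 * The islocked argument indicates whether the caller already holds the
 * softdep lock; if not, it is acquired and released here.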
5873 */
5874static void
5875drain_output(vp, islocked)
5876	struct vnode *vp;
5877	int islocked;
5878{
5879	ASSERT_VOP_LOCKED(vp, "drain_output");
5880	ASSERT_VI_LOCKED(vp, "drain_output");
5881
5882	if (!islocked)
5883		ACQUIRE_LOCK(&lk);
5884	while (vp->v_numoutput) {
5885		vp->v_iflag |= VI_BWAIT;
5886		interlocked_sleep(&lk, SLEEP, (caddr_t)&vp->v_numoutput,
5887		    VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
5888	}
5889	if (!islocked)
5890		FREE_LOCK(&lk);
5891}
5892
5893/*
5894 * Called whenever a buffer that is being invalidated or reallocated
5895 * contains dependencies. This should only happen if an I/O error has
5896 * occurred. The routine is called with the buffer locked.
5897 */
5898static void
5899softdep_deallocate_dependencies(bp)
5900	struct buf *bp;
5901{
5902
5903	if ((bp->b_ioflags & BIO_ERROR) == 0)
5904		panic("softdep_deallocate_dependencies: dangling deps");
5905	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5906	panic("softdep_deallocate_dependencies: unrecovered I/O error");
5907}
5908
5909/*
5910 * Function to handle asynchronous write errors in the filesystem.
5911 */
5912static void
5913softdep_error(func, error)
5914	char *func;
5915	int error;
5916{
5917
5918	/* XXX should do something better! */
5919	printf("%s: got error %d while accessing filesystem\n", func, error);
5920}
5921