/*
 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * The following are the copyrights and redistribution conditions that
 * apply to this copy of the soft update software. For a license
 * to use, redistribute or sell the soft update software under
 * conditions other than those described here, please contact the
 * author at one of the following addresses:
 *
 *	Marshall Kirk McKusick		mckusick@mckusick.com
 *	1614 Oxford Street		+1-510-843-9542
 *	Berkeley, CA 94709-1608
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. None of the names of McKusick, Ganger, Patt, or the University of
 *    Michigan may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 * 4. Redistributions in any form must be accompanied by information on
 *    how to obtain complete source code for any accompanying software
 *    that uses this software. This source code must either be included
 *    in the distribution or be available for no more than the cost of
 *    distribution plus a nominal fee, and must be freely redistributable
 *    under reasonable conditions. For an executable file, complete
 *    source code means the source code for all modules it contains.
 *    It does not mean source code for modules or files that typically
 *    accompany the operating system on which the executable file runs,
 *    e.g., standard library modules or system header files.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ffs_softdep.c	9.53 (McKusick) 1/16/00
 * $FreeBSD: head/sys/ufs/ffs/ffs_softdep.c 56149 2000-01-17 06:28:18Z mckusick $
 */

/*
 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
 */
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

/*
 * These definitions need to be adapted to the system to which
 * this file is being ported.
 */
/*
 * malloc types defined for the softdep system.
 */
MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");

#define	D_PAGEDEP	0
#define	D_INODEDEP	1
#define	D_NEWBLK	2
#define	D_BMSAFEMAP	3
#define	D_ALLOCDIRECT	4
#define	D_INDIRDEP	5
#define	D_ALLOCINDIR	6
#define	D_FREEFRAG	7
#define	D_FREEBLKS	8
#define	D_FREEFILE	9
#define	D_DIRADD	10
#define	D_MKDIR		11
#define	D_DIRREM	12
#define D_LAST		D_DIRREM

/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 */
static struct malloc_type *memtype[] = {
	M_PAGEDEP,
	M_INODEDEP,
	M_NEWBLK,
	M_BMSAFEMAP,
	M_ALLOCDIRECT,
	M_INDIRDEP,
	M_ALLOCINDIR,
	M_FREEFRAG,
	M_FREEBLKS,
	M_FREEFILE,
	M_DIRADD,
	M_MKDIR,
	M_DIRREM
};

#define DtoM(type) (memtype[type])

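/*
 * For example, DtoM(D_PAGEDEP) yields M_PAGEDEP. The workitem_free
 * routine below relies on this correspondence when returning a
 * completed workitem's memory to the proper malloc type.
 */
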
/*
 * Names of malloc types.
 */
#define TYPENAME(type)  \
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
#define CURPROC curproc
/*
 * End system adaptation definitions.
 */

/*
 * Internal function prototypes.
 */
static	void softdep_error __P((char *, int));
static	void drain_output __P((struct vnode *, int));
static	int getdirtybuf __P((struct buf **, int));
static	void clear_remove __P((struct proc *));
static	void clear_inodedeps __P((struct proc *));
static	int flush_pagedep_deps __P((struct vnode *, struct mount *,
	    struct diraddhd *));
static	int flush_inodedep_deps __P((struct fs *, ino_t));
static	int handle_written_filepage __P((struct pagedep *, struct buf *));
static  void diradd_inode_written __P((struct diradd *, struct inodedep *));
static	int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_allocdirect_partdone __P((struct allocdirect *));
static	void handle_allocindir_partdone __P((struct allocindir *));
static	void initiate_write_filepage __P((struct pagedep *, struct buf *));
static	void handle_written_mkdir __P((struct mkdir *, int));
static	void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_workitem_freefile __P((struct freefile *));
static	void handle_workitem_remove __P((struct dirrem *));
static	struct dirrem *newdirrem __P((struct buf *, struct inode *,
	    struct inode *, int, struct dirrem **));
static	void free_diradd __P((struct diradd *));
static	void free_allocindir __P((struct allocindir *, struct inodedep *));
static	int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	    long *));
static	void deallocate_dependencies __P((struct buf *, struct inodedep *));
static	void free_allocdirect __P((struct allocdirectlst *,
	    struct allocdirect *, int));
static	int free_inodedep __P((struct inodedep *));
static	void handle_workitem_freeblocks __P((struct freeblks *));
static	void merge_inode_lists __P((struct inodedep *));
static	void setup_allocindir_phase2 __P((struct buf *, struct inode *,
	    struct allocindir *));
static	struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
	    ufs_daddr_t));
static	void handle_workitem_freefrag __P((struct freefrag *));
static	struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
static	void allocdirect_merge __P((struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *));
static	struct bmsafemap *bmsafemap_lookup __P((struct buf *));
static	int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	    struct newblk **));
static	int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	    struct pagedep **));
static	void pause_timer __P((void *));
static	int request_cleanup __P((int, int));
static	void add_to_worklist __P((struct worklist *));

/*
 * Exported softdep operations.
 */
static	void softdep_disk_io_initiation __P((struct buf *));
static	void softdep_disk_write_complete __P((struct buf *));
static	void softdep_deallocate_dependencies __P((struct buf *));
static	int softdep_fsync __P((struct vnode *));
static	int softdep_process_worklist __P((struct mount *));
static	void softdep_move_dependencies __P((struct buf *, struct buf *));
static	int softdep_count_dependencies __P((struct buf *bp, int));

struct bio_ops bioops = {
	softdep_disk_io_initiation,		/* io_start */
	softdep_disk_write_complete,		/* io_complete */
	softdep_deallocate_dependencies,	/* io_deallocate */
	softdep_fsync,				/* io_fsync */
	softdep_process_worklist,		/* io_sync */
	softdep_move_dependencies,		/* io_movedeps */
	softdep_count_dependencies,		/* io_countdeps */
};

/*
 * Locking primitives.
 *
 * For a uniprocessor, all we need to do is protect against disk
 * interrupts. For a multiprocessor, this lock would have to be
 * a mutex. A single mutex is used throughout this file, though
 * finer grain locking could be used if contention warranted it.
 *
 * For a multiprocessor, the sleep call would accept a lock and
 * release it after the sleep processing was complete. In a uniprocessor
 * implementation there is no such interlock, so we simply mark
 * the places where it needs to be done with the `interlocked' form
 * of the lock calls. Since the uniprocessor sleep already interlocks
 * the spl, there is nothing that really needs to be done.
 */
#ifndef /* NOT */ DEBUG
static struct lockit {
	int	lkt_spl;
} lk = { 0 };
#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
#define ACQUIRE_LOCK_INTERLOCKED(lk)
#define FREE_LOCK_INTERLOCKED(lk)

#else /* DEBUG */
static struct lockit {
	int	lkt_spl;
	pid_t	lkt_held;
} lk = { 0, -1 };
static int lockcnt;

static	void acquire_lock __P((struct lockit *));
static	void free_lock __P((struct lockit *));
static	void acquire_lock_interlocked __P((struct lockit *));
static	void free_lock_interlocked __P((struct lockit *));

#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
#define FREE_LOCK(lk)			free_lock(lk)
#define ACQUIRE_LOCK_INTERLOCKED(lk)	acquire_lock_interlocked(lk)
#define FREE_LOCK_INTERLOCKED(lk)	free_lock_interlocked(lk)

static void
acquire_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1) {
		if (lk->lkt_held == CURPROC->p_pid)
			panic("softdep_lock: locking against myself");
		else
			panic("softdep_lock: lock held by %d", lk->lkt_held);
	}
	lk->lkt_spl = splbio();
	lk->lkt_held = CURPROC->p_pid;
	lockcnt++;
}

static void
free_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock: lock not held");
	lk->lkt_held = -1;
	splx(lk->lkt_spl);
}

static void
acquire_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1) {
		if (lk->lkt_held == CURPROC->p_pid)
			panic("softdep_lock_interlocked: locking against self");
		else
			panic("softdep_lock_interlocked: lock held by %d",
			    lk->lkt_held);
	}
	lk->lkt_held = CURPROC->p_pid;
	lockcnt++;
}

static void
free_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock_interlocked: lock not held");
	lk->lkt_held = -1;
}
#endif /* DEBUG */

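/*
 * Illustrative sketch (not additional kernel code): dependency
 * structures are always manipulated with the lock held, e.g.
 *
 *	ACQUIRE_LOCK(&lk);
 *	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 *	FREE_LOCK(&lk);
 *
 * In the DEBUG case the holder's pid is recorded, so a recursive
 * acquisition panics with "locking against myself" rather than
 * silently clobbering lkt_spl.
 */
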
/*
 * Place holder for real semaphores.
 */
struct sema {
	int	value;
	pid_t	holder;
	char	*name;
	int	prio;
	int	timo;
};
static	void sema_init __P((struct sema *, char *, int, int));
static	int sema_get __P((struct sema *, struct lockit *));
static	void sema_release __P((struct sema *));

static void
sema_init(semap, name, prio, timo)
	struct sema *semap;
	char *name;
	int prio, timo;
{

	semap->holder = -1;
	semap->value = 0;
	semap->name = name;
	semap->prio = prio;
	semap->timo = timo;
}

static int
sema_get(semap, interlock)
	struct sema *semap;
	struct lockit *interlock;
{

	if (semap->value++ > 0) {
		if (interlock != NULL)
			FREE_LOCK_INTERLOCKED(interlock);
		tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
		if (interlock != NULL) {
			ACQUIRE_LOCK_INTERLOCKED(interlock);
			FREE_LOCK(interlock);
		}
		return (0);
	}
	semap->holder = CURPROC->p_pid;
	if (interlock != NULL)
		FREE_LOCK(interlock);
	return (1);
}

static void
sema_release(semap)
	struct sema *semap;
{

	if (semap->value <= 0 || semap->holder != CURPROC->p_pid)
		panic("sema_release: not held");
	if (--semap->value > 0) {
		semap->value = 0;
		wakeup(semap);
	}
	semap->holder = -1;
}

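/*
 * Sketch of the protocol used by the lookup routines below: a return
 * of 0 from sema_get means we slept while another process held the
 * semaphore, so the hash chain must be rescanned from the top before
 * deciding to allocate:
 *
 *	if (sema_get(&pagedep_in_progress, &lk) == 0) {
 *		ACQUIRE_LOCK(&lk);
 *		goto top;
 *	}
 *	... allocate and initialize the new structure ...
 *	sema_release(&pagedep_in_progress);
 */
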
/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))

#else /* DEBUG */
static	void worklist_insert __P((struct workhead *, struct worklist *));
static	void worklist_remove __P((struct worklist *));
static	void workitem_free __P((struct worklist *, int));

#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)

static void
worklist_insert(head, item)
	struct workhead *head;
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_insert: lock not held");
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: already on list");
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item)
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_remove: lock not held");
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: not on list");
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{

	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: still on list");
	if (item->wk_type != type)
		panic("workitem_free: type mismatch");
	FREE(item, DtoM(type));
}
#endif /* DEBUG */

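/*
 * For example, bmsafemap_lookup below attaches its structure to a
 * cylinder group buffer with
 *
 *	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
 *
 * and a finished item of type D_XXX is released with
 * WORKITEM_FREE(item, D_XXX); in the DEBUG case workitem_free also
 * verifies that the item is off its list and that wk_type matches.
 */
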
/*
 * Workitem queue management
 */
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;
static int max_softdeps;	/* maximum number of structs before slowdown */
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
static int proc_waiting;	/* tracks whether we have a timeout posted */
static struct proc *filesys_syncer; /* proc of filesystem syncer process */
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
#define FLUSH_INODES	1
static int req_clear_remove;	/* syncer process flush some freeblks */
#define FLUSH_REMOVE	2
/*
 * runtime statistics
 */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
#endif /* DEBUG */

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{
	static struct worklist *worklist_tail;

	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	worklist_tail = wk;
}

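/*
 * The static worklist_tail pointer makes each append O(1). Caching it
 * is safe because this is the only routine that inserts and the
 * routine below removes strictly from the head of the queue.
 */
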
/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that items are handled in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 */
static int
softdep_process_worklist(matchmnt)
	struct mount *matchmnt;
{
	struct proc *p = CURPROC;
	struct worklist *wk;
	struct fs *matchfs;
	int matchcnt, loopcount;

	/*
	 * Record the process identifier of our caller so that we can give
	 * this process preferential treatment in request_cleanup below.
	 */
	filesys_syncer = p;
	matchcnt = 0;
	matchfs = NULL;
	if (matchmnt != NULL)
		matchfs = VFSTOUFS(matchmnt)->um_fs;
	/*
	 * There is no danger of having multiple processes run this
	 * code. It is single threaded solely so that softdep_flushfiles
	 * (below) can get an accurate count of the number of items
	 * related to its mount point that are in the list.
	 */
	if (softdep_worklist_busy && matchmnt == NULL)
		return (-1);
	/*
	 * If requested, try removing inode or removal dependencies.
	 */
	if (req_clear_inodedeps) {
		clear_inodedeps(p);
		req_clear_inodedeps = 0;
		wakeup(&proc_waiting);
	}
	if (req_clear_remove) {
		clear_remove(p);
		req_clear_remove = 0;
		wakeup(&proc_waiting);
	}
	ACQUIRE_LOCK(&lk);
	loopcount = 1;
	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
		WORKLIST_REMOVE(wk);
		FREE_LOCK(&lk);
		switch (wk->wk_type) {

		case D_DIRREM:
			/* removal of a directory entry */
			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
				matchcnt += 1;
			handle_workitem_remove(WK_DIRREM(wk));
			break;

		case D_FREEBLKS:
			/* releasing blocks and/or fragments from a file */
			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freeblocks(WK_FREEBLKS(wk));
			break;

		case D_FREEFRAG:
			/* releasing a fragment when replaced as a file grows */
			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefrag(WK_FREEFRAG(wk));
			break;

		case D_FREEFILE:
			/* releasing an inode when its link count drops to 0 */
			if (WK_FREEFILE(wk)->fx_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefile(WK_FREEFILE(wk));
			break;

		default:
			panic("%s_process_worklist: Unknown type %s",
			    "softdep", TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		if (softdep_worklist_busy && matchmnt == NULL)
			return (-1);
		/*
		 * If requested, try removing inode or removal dependencies.
		 */
		if (req_clear_inodedeps) {
			clear_inodedeps(p);
			req_clear_inodedeps = 0;
			wakeup(&proc_waiting);
		}
		if (req_clear_remove) {
			clear_remove(p);
			req_clear_remove = 0;
			wakeup(&proc_waiting);
		}
		/*
		 * We do not generally want to stop for buffer space, but if
		 * we are really being a buffer hog, we will stop and wait.
		 */
		if (loopcount++ % 128 == 0)
			bwillwrite();
		ACQUIRE_LOCK(&lk);
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}

/*
 * Move dependencies from one buffer to another.
 */
static void
softdep_move_dependencies(oldbp, newbp)
	struct buf *oldbp;
	struct buf *newbp;
{
	struct worklist *wk, *wktail;

	if (LIST_FIRST(&newbp->b_dep) != NULL)
		panic("softdep_move_dependencies: need merge code");
	wktail = 0;
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
		LIST_REMOVE(wk, wk_list);
		if (wktail == 0)
			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
		else
			LIST_INSERT_AFTER(wktail, wk, wk_list);
		wktail = wk;
	}
	FREE_LOCK(&lk);
}

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushfiles(oldmnt, flags, p)
	struct mount *oldmnt;
	int flags;
	struct proc *p;
{
	struct vnode *devvp;
	int error, loopcnt;

	/*
	 * Await our turn to clear out the queue.
	 */
	while (softdep_worklist_busy)
		tsleep(&lbolt, PRIBIO, "softflush", 0);
	softdep_worklist_busy = 1;
	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
		softdep_worklist_busy = 0;
		return (error);
	}
	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	devvp = VFSTOUFS(oldmnt)->um_devvp;
	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
		if (softdep_process_worklist(oldmnt) == 0) {
			/*
			 * Do another flush in case any vnodes were brought in
			 * as part of the cleanup operations.
			 */
			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
				break;
			/*
			 * If we still found nothing to do, we are really done.
			 */
			if (softdep_process_worklist(oldmnt) == 0)
				break;
		}
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
		VOP_UNLOCK(devvp, 0, p);
		if (error)
			break;
	}
	softdep_worklist_busy = 0;
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	return (error);
}

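/*
 * Example of the failure mode guarded against above: when downgrading
 * a busy filesystem to read-only, other processes can keep dirtying
 * vnodes, so the flush/process cycle might never terminate; the
 * loopcnt bound converts that case into EBUSY instead of a hang.
 */
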
/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */

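/*
 * Illustrative call pattern for the lookup routines below: a caller
 * that must have the structure passes DEPALLOC, while one that only
 * probes passes 0 and checks for a NULL result, e.g.
 *
 *	if (pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 *		... a new pagedep was allocated and hashed ...
 */
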
/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])
static struct sema pagedep_in_progress;

/*
 * Look up a pagedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(ip, lbn, flags, pagedeppp)
	struct inode *ip;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	struct mount *mp;
	int i;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("pagedep_lookup: lock not held");
#endif
	mp = ITOV(ip)->v_mount;
	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
top:
	for (pagedep = LIST_FIRST(pagedephd); pagedep;
	     pagedep = LIST_NEXT(pagedep, pd_hash))
		if (ip->i_number == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_mnt)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*pagedeppp = NULL;
		return (0);
	}
	if (sema_get(&pagedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
		M_WAITOK);
	bzero(pagedep, sizeof(struct pagedep));
	pagedep->pd_list.wk_type = D_PAGEDEP;
	pagedep->pd_mnt = mp;
	pagedep->pd_ino = ip->i_number;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	sema_release(&pagedep_in_progress);
	*pagedeppp = pagedep;
	return (0);
}

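/*
 * Note the return convention above: 1 means a preexisting pagedep was
 * found, 0 means a new one was allocated. softdep_setup_allocdirect
 * uses the 0 case to attach a new pagedep to its buffer exactly once.
 */
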
/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long	inodedep_hash;	/* size of hash table - 1 */
static long	num_inodedep;	/* number of inodedep allocated */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;

/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(fs, inum, flags, inodedeppp)
	struct fs *fs;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;
	int firsttry;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("inodedep_lookup: lock not held");
#endif
	firsttry = 1;
	inodedephd = INODEDEP_HASH(fs, inum);
top:
	for (inodedep = LIST_FIRST(inodedephd); inodedep;
	     inodedep = LIST_NEXT(inodedep, id_hash))
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*inodedeppp = NULL;
		return (0);
	}
	/*
	 * If we are over our limit, try to improve the situation.
	 */
	if (num_inodedep > max_softdeps && firsttry && speedup_syncer() == 0 &&
	    request_cleanup(FLUSH_INODES, 1)) {
		firsttry = 0;
		goto top;
	}
	if (sema_get(&inodedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	num_inodedep += 1;
	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
		M_INODEDEP, M_WAITOK);
	inodedep->id_list.wk_type = D_INODEDEP;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_buf = NULL;
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	LIST_INIT(&inodedep->id_bufwait);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	sema_release(&inodedep_in_progress);
	*inodedeppp = inodedep;
	return (0);
}

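/*
 * The max_softdeps test above implements the slowdown policy: once
 * more inodedeps exist than the limit set in softdep_initialize
 * (desiredvnodes * 8), a new allocation first tries to speed up the
 * syncer and to flush existing dependencies before adding more.
 */
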
/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static struct sema newblk_in_progress;

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(fs, newblkno, flags, newblkpp)
	struct fs *fs;
	ufs_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(fs, newblkno);
top:
	for (newblk = LIST_FIRST(newblkhd); newblk;
	     newblk = LIST_NEXT(newblk, nb_hash))
		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
			break;
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*newblkpp = NULL;
		return (0);
	}
	if (sema_get(&newblk_in_progress, 0) == 0)
		goto top;
	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
		M_NEWBLK, M_WAITOK);
	newblk->nb_state = 0;
	newblk->nb_fs = fs;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	sema_release(&newblk_in_progress);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during file system initialization before
 * mounting any file systems.
 */
void
softdep_initialize()
{

	LIST_INIT(&mkdirlisthd);
	LIST_INIT(&softdep_workitem_pending);
	max_softdeps = desiredvnodes * 8;
	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
	    &pagedep_hash);
	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
}

/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum cstotal;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	mp->mnt_flag &= ~MNT_ASYNC;
	mp->mnt_flag |= MNT_SOFTDEP;
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync, so we have
	 * to scan the cylinder groups and recalculate them.
	 */
	if (fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("ffs_mountfs: superblock updated for soft updates\n");
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a file system
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicates that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs file system maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not.  (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */

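/*
 * A minimal sketch of the resulting write ordering (illustrative):
 *
 *	allocation:	cg bitmap written  -->  pointer to block/inode written
 *	deallocation:	pointers reset     -->  cg bitmap written
 *
 * The two routines that follow establish the first ordering; the
 * delayed de-allocation code later in this file enforces the second.
 */
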
/*
 * Called just after updating the cylinder group block to allocate an inode.
 */
void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;		/* buffer for cylgroup block with inode map */
	struct inode *ip;	/* inode related to allocation */
	ino_t newinum;		/* new inode number being allocated */
{
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated inode.
	 * Panic if it already exists as something is seriously wrong.
	 * Otherwise add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
		panic("softdep_setup_inomapdep: found inode");
	inodedep->id_buf = bp;
	inodedep->id_state &= ~DEPCOMPLETE;
	bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

/*
 * Called just after updating the cylinder group block to
 * allocate a block or fragment.
 */
void
softdep_setup_blkmapdep(bp, fs, newblkno)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct fs *fs;		/* filesystem doing allocation */
	ufs_daddr_t newblkno;	/* number of newly allocated block */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
		panic("softdep_setup_blkmapdep: found block");
	ACQUIRE_LOCK(&lk);
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	FREE_LOCK(&lk);
}

/*
 * Find the bmsafemap associated with a cylinder group buffer.
 * If none exists, create one. The buffer must be locked when
 * this routine is called and this routine must be called with
 * splbio interrupts blocked.
 */
static struct bmsafemap *
bmsafemap_lookup(bp)
	struct buf *bp;
{
	struct bmsafemap *bmsafemap;
	struct worklist *wk;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("bmsafemap_lookup: lock not held");
#endif
	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list))
		if (wk->wk_type == D_BMSAFEMAP)
			return (WK_BMSAFEMAP(wk));
	FREE_LOCK(&lk);
	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
		M_BMSAFEMAP, M_WAITOK);
	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
	bmsafemap->sm_list.wk_state = 0;
	bmsafemap->sm_buf = bp;
	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	LIST_INIT(&bmsafemap->sm_allocindirhd);
	LIST_INIT(&bmsafemap->sm_inodedephd);
	LIST_INIT(&bmsafemap->sm_newblkhd);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	return (bmsafemap);
}

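/*
 * At most one bmsafemap is ever attached to a given cylinder group
 * buffer: the scan of bp->b_dep above returns the existing one, so
 * all newblk and inodedep dependencies for allocations from that map
 * hang off a single structure.
 */
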
/*
 * Direct block allocation dependencies.
 *
 * When a new block is allocated, the corresponding disk locations must be
 * initialized (with zeros or new data) before the on-disk inode points to
 * them.  Also, the freemap from which the block was allocated must be
 * updated (on disk) before the inode's pointer. These two dependencies are
 * independent of each other and are needed for all file blocks and indirect
 * blocks that are pointed to directly by the inode.  Just before the
 * "in-core" version of the inode is updated with a newly allocated block
 * number, a procedure (below) is called to set up allocation dependency
 * structures.  These structures are removed when the corresponding
 * dependencies are satisfied or when the block allocation becomes obsolete
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
 * fragment that gets upgraded).  All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete).  The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains.  This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
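/*
 * Hypothetical example (for illustration only): if logical block 0 of
 * a file grows from a fragment at oldblkno into a full block at a new
 * location, the routine below records both block numbers and queues a
 * freefrag workitem, so the old fragment is released only after the
 * updated inode has been written to disk.
 */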
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_WAITOK);
	bzero(adp, sizeof(struct allocdirect));
	adp->ad_list.wk_type = D_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	adp->ad_inodedep = inodedep;

	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	for (oldadp = TAILQ_FIRST(adphead); oldadp;
	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
1259	/* insert in middle of list */
1260	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1261	if (oldadp->ad_lbn == lbn)
1262		allocdirect_merge(adphead, adp, oldadp);
1263	FREE_LOCK(&lk);
1264}
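
/*
 * Illustrative sketch (compiled out; not part of the kernel build):
 * the tail-first sorted insertion used above, reduced to a
 * self-contained userland program over a TAILQ of long keys. All
 * names here (struct entry, insert_sorted, and so on) are
 * hypothetical. The fast path makes the common "file grows" case
 * O(1); only out-of-order inserts walk the list. (Equal keys, which
 * the kernel hands to allocdirect_merge, are simply inserted here.)
 */
#if 0
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	long lbn;			/* key, kept in ascending order */
	TAILQ_ENTRY(entry) link;
};
TAILQ_HEAD(entryhd, entry);

static void
insert_sorted(struct entryhd *head, long lbn)
{
	struct entry *ep, *new;

	if ((new = malloc(sizeof(*new))) == NULL)
		abort();
	new->lbn = lbn;
	/* Fast path: check the tail before walking the whole list. */
	ep = TAILQ_LAST(head, entryhd);
	if (ep == NULL || ep->lbn <= lbn) {
		TAILQ_INSERT_TAIL(head, new, link);
		return;
	}
	/* Slow path: find the first entry with a key at least as large. */
	TAILQ_FOREACH(ep, head, link)
		if (ep->lbn >= lbn)
			break;
	TAILQ_INSERT_BEFORE(ep, new, link);
}

int
main(void)
{
	struct entryhd head = TAILQ_HEAD_INITIALIZER(head);
	struct entry *ep;
	long keys[] = { 0, 1, 2, 5, 3, 4 };
	size_t i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
		insert_sorted(&head, keys[i]);
	TAILQ_FOREACH(ep, &head, link)
		printf("%ld ", ep->lbn);	/* prints: 0 1 2 3 4 5 */
	printf("\n");
	return (0);
}
#endif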

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("allocdirect_merge: lock not held");
#endif
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
		    NDADDR);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_allocdirect. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so it cannot legitimately be freed until
	 * the conditions for the new dependency are fulfilled.
	 */
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		freefrag = newadp->ad_freefrag;
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	free_allocdirect(adphead, oldadp, 0);
}

/*
 * Allocate a new freefrag structure if needed.
 */
static struct freefrag *
newfreefrag(ip, blkno, size)
	struct inode *ip;
	ufs_daddr_t blkno;
	long size;
{
	struct freefrag *freefrag;
	struct fs *fs;

	if (blkno == 0)
		return (NULL);
	fs = ip->i_fs;
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
		M_FREEFRAG, M_WAITOK);
	freefrag->ff_list.wk_type = D_FREEFRAG;
	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_fs = fs;
	freefrag->ff_devvp = ip->i_devvp;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;
	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct inode tip;

	tip.i_fs = freefrag->ff_fs;
	tip.i_devvp = freefrag->ff_devvp;
	tip.i_dev = freefrag->ff_devvp->v_rdev;
	tip.i_number = freefrag->ff_inum;
	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	FREE(freefrag, M_FREEFRAG);
}

/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here. Because an indirect block contains many pointers that
 * may have dependencies, a second copy of the entire in-memory indirect
 * block is kept. The buffer cache copy is always completely up-to-date.
 * The second copy, which is used only as a source for disk writes,
 * contains only the safe pointers (i.e., those that have no remaining
 * update dependencies). The second copy is freed when all pointers
 * are safe. The cache is not allowed to replace indirect blocks with
 * pending update dependencies. If a buffer containing an indirect
 * block with dependencies is written, these routines will mark it
 * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine and
 * softdep_sync_metadata work together to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
 * procedures are used when setting up indirect block pointer
 * dependencies. The division is necessary because of the organization
 * of the "balloc" routine and because of the distinction between file
 * pages and file metadata blocks. (A stand-alone sketch of the
 * safe-copy idea follows this comment.)
 */
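
/*
 * Illustrative sketch (compiled out; not part of the kernel build):
 * the "second, safe copy" idea above, reduced to a hypothetical
 * userland model. The in-core table of pointers is always fully
 * up-to-date; a shadow copy masks out entries whose dependencies are
 * still pending, and only the shadow is handed to the (simulated)
 * disk write. All names here are invented for the example.
 */
#if 0
#include <stdio.h>
#include <string.h>

#define NPTRS	8

struct shadowed_block {
	long ptr[NPTRS];		/* up-to-date, in-core pointers */
	long safe[NPTRS];		/* what the disk is allowed to see */
};

/* Record a new pointer whose dependency has not yet been satisfied. */
static void
set_pending(struct shadowed_block *sb, int i, long blkno)
{
	sb->ptr[i] = blkno;		/* cache copy sees it at once */
	sb->safe[i] = 0;		/* disk copy rolls back to zero */
}

/* A dependency completed; the real pointer may now reach the disk. */
static void
complete(struct shadowed_block *sb, int i)
{
	sb->safe[i] = sb->ptr[i];
}

static void
write_to_disk(const struct shadowed_block *sb)
{
	int i;

	for (i = 0; i < NPTRS; i++)	/* only safe values are written */
		printf("%ld ", sb->safe[i]);
	printf("\n");
}

int
main(void)
{
	struct shadowed_block sb;

	memset(&sb, 0, sizeof(sb));
	set_pending(&sb, 0, 100);
	set_pending(&sb, 1, 200);
	complete(&sb, 0);
	write_to_disk(&sb);	/* prints "100 0 ..."; slot 1 rolled back */
	return (0);
}
#endif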

/*
 * Allocate a new allocindir structure.
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
{
	struct allocindir *aip;

	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
		M_ALLOCINDIR, M_WAITOK);
	bzero(aip, sizeof(struct allocindir));
	aip->ai_list.wk_type = D_ALLOCINDIR;
	aip->ai_state = ATTACHED;
	aip->ai_offset = ptrno;
	aip->ai_newblkno = newblkno;
	aip->ai_oldblkno = oldblkno;
	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
 */
void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;	/* inode for file being extended */
	ufs_lbn_t lbn;		/* allocated block number within file */
	struct buf *bp;		/* buffer with indirect blk referencing page */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
	struct buf *nbp;	/* buffer holding allocated page */
{
	struct allocindir *aip;
	struct pagedep *pagedep;

	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	ACQUIRE_LOCK(&lk);
	/*
	 * If we are allocating a directory page, then we must
	 * allocate an associated pagedep to track additions and
	 * deletions.
	 */
	if ((ip->i_mode & IFMT) == IFDIR &&
	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called just before setting an indirect block pointer to a
 * newly allocated indirect block.
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
{
	struct allocindir *aip;

	aip = newallocindir(ip, ptrno, newblkno, 0);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
 */
static void
setup_allocindir_phase2(bp, ip, aip)
	struct buf *bp;		/* in-memory copy of the indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct allocindir *aip;	/* allocindir allocated by the above routines */
{
	struct worklist *wk;
	struct indirdep *indirdep, *newindirdep;
	struct bmsafemap *bmsafemap;
	struct allocindir *oldaip;
	struct freefrag *freefrag;
	struct newblk *newblk;

	if (bp->b_lblkno >= 0)
		panic("setup_allocindir_phase2: not indir blk");
	for (indirdep = NULL, newindirdep = NULL; ; ) {
		ACQUIRE_LOCK(&lk);
		for (wk = LIST_FIRST(&bp->b_dep); wk;
		     wk = LIST_NEXT(wk, wk_list)) {
			if (wk->wk_type != D_INDIRDEP)
				continue;
			indirdep = WK_INDIRDEP(wk);
			break;
		}
		if (indirdep == NULL && newindirdep) {
			indirdep = newindirdep;
			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
			newindirdep = NULL;
		}
		FREE_LOCK(&lk);
		if (indirdep) {
			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
			    &newblk) == 0)
				panic("setup_allocindir: lost block");
			ACQUIRE_LOCK(&lk);
			if (newblk->nb_state == DEPCOMPLETE) {
				aip->ai_state |= DEPCOMPLETE;
				aip->ai_buf = NULL;
			} else {
				bmsafemap = newblk->nb_bmsafemap;
				aip->ai_buf = bmsafemap->sm_buf;
				LIST_REMOVE(newblk, nb_deps);
				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
				    aip, ai_deps);
			}
			LIST_REMOVE(newblk, nb_hash);
			FREE(newblk, M_NEWBLK);
			aip->ai_indirdep = indirdep;
			/*
			 * Check to see if there is an existing dependency
			 * for this block. If there is, merge the old
			 * dependency into the new one.
			 */
			if (aip->ai_oldblkno == 0)
				oldaip = NULL;
			else
				for (oldaip = LIST_FIRST(&indirdep->ir_deplisthd);
				    oldaip; oldaip = LIST_NEXT(oldaip, ai_next))
					if (oldaip->ai_offset == aip->ai_offset)
						break;
			if (oldaip != NULL) {
				if (oldaip->ai_newblkno != aip->ai_oldblkno)
					panic("setup_allocindir_phase2: blkno");
				aip->ai_oldblkno = oldaip->ai_oldblkno;
				freefrag = oldaip->ai_freefrag;
				oldaip->ai_freefrag = aip->ai_freefrag;
				aip->ai_freefrag = freefrag;
				free_allocindir(oldaip, NULL);
			}
			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
			    [aip->ai_offset] = aip->ai_oldblkno;
			FREE_LOCK(&lk);
		}
		if (newindirdep) {
			if (newindirdep->ir_savebp != NULL)
				brelse(newindirdep->ir_savebp);
			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
		}
		if (indirdep)
			break;
		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
			M_INDIRDEP, M_WAITOK);
		newindirdep->ir_list.wk_type = D_INDIRDEP;
		newindirdep->ir_state = ATTACHED;
		LIST_INIT(&newindirdep->ir_deplisthd);
		LIST_INIT(&newindirdep->ir_donehd);
		if (bp->b_blkno == bp->b_lblkno) {
			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
				NULL, NULL);
		}
		newindirdep->ir_savebp =
		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
		BUF_KERNPROC(newindirdep->ir_savebp);
		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
	}
}

/*
 * Block de-allocation dependencies.
 *
 * When blocks are de-allocated, the on-disk pointers must be nullified before
 * the blocks are made available for use by other files.  (The true
 * requirement is that old pointers must be nullified before new on-disk
 * pointers are set.  We chose this slightly more stringent requirement to
 * reduce complexity.) Our implementation handles this dependency by updating
 * the inode (or indirect block) appropriately but delaying the actual block
 * de-allocation (i.e., freemap and free space count manipulation) until
 * after the updated versions reach stable storage.  After the disk is
 * updated, the blocks can be safely de-allocated whenever it is convenient.
 * This implementation handles only the common case of reducing a file's
 * length to zero. Other cases are handled by the conventional synchronous
 * write approach.
 *
 * The ffs implementation with which we worked double-checks
 * the state of the block pointers and file size as it reduces
 * a file's length.  Some of this code is replicated here in our
 * soft updates implementation.  The freeblks->fb_chkcnt field is
 * used to transfer a part of this information to the procedure
 * that eventually de-allocates the blocks.
 *
 * This routine should be called from the routine that shortens
 * a file's length, before the inode's size or block pointers
 * are modified. It will save the block pointer information for
 * later release and zero the inode so that the calling routine
 * can release it. (A stand-alone sketch of this deferred release
 * appears after the following routine.)
 */
void
softdep_setup_freeblocks(ip, length)
	struct inode *ip;	/* The inode whose length is to be reduced */
	off_t length;		/* The new length for the file */
{
	struct freeblks *freeblks;
	struct inodedep *inodedep;
	struct allocdirect *adp;
	struct vnode *vp;
	struct buf *bp;
	struct fs *fs;
	int i, error;

	fs = ip->i_fs;
	if (length != 0)
		panic("softdep_setup_freeblocks: non-zero length");
	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
		M_FREEBLKS, M_WAITOK);
	bzero(freeblks, sizeof(struct freeblks));
	freeblks->fb_list.wk_type = D_FREEBLKS;
	freeblks->fb_uid = ip->i_uid;
	freeblks->fb_previousinum = ip->i_number;
	freeblks->fb_devvp = ip->i_devvp;
	freeblks->fb_fs = fs;
	freeblks->fb_oldsize = ip->i_size;
	freeblks->fb_newsize = length;
	freeblks->fb_chkcnt = ip->i_blocks;
	for (i = 0; i < NDADDR; i++) {
		freeblks->fb_dblks[i] = ip->i_db[i];
		ip->i_db[i] = 0;
	}
	for (i = 0; i < NIADDR; i++) {
		freeblks->fb_iblks[i] = ip->i_ib[i];
		ip->i_ib[i] = 0;
	}
	ip->i_blocks = 0;
	ip->i_size = 0;
	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
	 * to delete its dependencies below. Once the dependencies are gone
	 * the buffer can be safely released.
	 */
	if ((error = bread(ip->i_devvp,
	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, NOCRED, &bp)) != 0)
		softdep_error("softdep_setup_freeblocks", error);
	*((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
	    ip->i_din;
	/*
	 * Find and eliminate any inode dependencies.
	 */
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & IOSTARTED) != 0)
		panic("softdep_setup_freeblocks: inode busy");
	/*
	 * Because the file length has been truncated to zero, any
	 * pending block allocation dependency structures associated
	 * with this inode are obsolete and can simply be de-allocated.
	 * We must first merge the two dependency lists to get rid of
	 * any duplicate freefrag structures, then purge the merged list.
	 */
	merge_inode_lists(inodedep);
	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	FREE_LOCK(&lk);
	bdwrite(bp);
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 * Once they are all there, walk the list and get rid of
	 * any dependencies.
	 */
	vp = ITOV(ip);
	ACQUIRE_LOCK(&lk);
	drain_output(vp, 1);
	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
		deallocate_dependencies(bp, inodedep);
		bp->b_flags |= B_INVAL | B_NOCACHE;
		FREE_LOCK(&lk);
		brelse(bp);
		ACQUIRE_LOCK(&lk);
	}
	/*
	 * Add the freeblks structure to the list of operations that
	 * must await the zero'ed inode being written to disk. If we
	 * still have a bitmap dependency, then the inode has never been
	 * written to disk, so we can process the freeblks immediately.
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written and we can also proceed.
	 */
	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0 ||
	    free_inodedep(inodedep) ||
	    (inodedep->id_state & DEPCOMPLETE) == 0) {
		FREE_LOCK(&lk);
		handle_workitem_freeblocks(freeblks);
	} else {
		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
		FREE_LOCK(&lk);
	}
}
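
/*
 * Illustrative sketch (compiled out; not part of the kernel build):
 * the deferred release pattern used by softdep_setup_freeblocks, as a
 * hypothetical userland model. Block numbers are captured and queued
 * at "truncate" time, but they are returned to the free pool only
 * from the callback that runs once the zeroed "inode" is known to be
 * on disk. All names here are invented for the example.
 */
#if 0
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct freerec {
	long blks[4];			/* captured block pointers */
	int nblks;
	STAILQ_ENTRY(freerec) link;
};
static STAILQ_HEAD(, freerec) workq = STAILQ_HEAD_INITIALIZER(workq);

/* "Truncate": capture and clear the pointers, defer the actual free. */
static void
setup_freeblocks(long *blks, int nblks)
{
	struct freerec *frp;
	int i;

	if ((frp = calloc(1, sizeof(*frp))) == NULL)
		abort();
	for (i = 0; i < nblks && i < 4; i++) {
		if (blks[i] != 0)
			frp->blks[frp->nblks++] = blks[i];
		blks[i] = 0;		/* in-core inode is zeroed now */
	}
	STAILQ_INSERT_TAIL(&workq, frp, link);
}

/* Runs only after the zeroed inode has "reached stable storage". */
static void
inode_written_callback(void)
{
	struct freerec *frp;
	int i;

	while ((frp = STAILQ_FIRST(&workq)) != NULL) {
		STAILQ_REMOVE_HEAD(&workq, link);
		for (i = 0; i < frp->nblks; i++)
			printf("freeing block %ld\n", frp->blks[i]);
		free(frp);
	}
}

int
main(void)
{
	long db[4] = { 11, 12, 13, 0 };

	setup_freeblocks(db, 4);
	/* ... disk write of the zeroed inode happens here ... */
	inode_written_callback();	/* frees 11, 12 and 13 */
	return (0);
}
#endif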

/*
 * Reclaim any dependency structures from a buffer that is about to
 * be reallocated to a new vnode. The buffer must be locked; thus,
 * no I/O completion operations can occur while we are manipulating
 * its associated dependencies. The mutex is held so that other I/Os
 * associated with related dependencies do not occur.
 */
static void
deallocate_dependencies(bp, inodedep)
	struct buf *bp;
	struct inodedep *inodedep;
{
	struct worklist *wk;
	struct indirdep *indirdep;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct diradd *dap;
	int i;

	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		switch (wk->wk_type) {

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			/*
			 * None of the indirect pointers will ever be visible,
			 * so they can simply be tossed. GOINGAWAY ensures
			 * that allocated pointers will be saved in the buffer
			 * cache until they are freed. Note that they will
			 * only be able to be found by their physical address
			 * since the inode mapping the logical address will
			 * be gone. The save buffer used for the safe copy
			 * was allocated in setup_allocindir_phase2 using
			 * the physical address so it could be used for this
			 * purpose. Hence we swap the safe copy with the real
			 * copy, allowing the safe copy to be freed and holding
			 * on to the real copy for later use in indir_trunc.
			 */
			if (indirdep->ir_state & GOINGAWAY)
				panic("deallocate_dependencies: already gone");
			indirdep->ir_state |= GOINGAWAY;
			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
				free_allocindir(aip, inodedep);
			if (bp->b_lblkno >= 0 ||
			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
				panic("deallocate_dependencies: not indir");
			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
			    bp->b_bcount);
			WORKLIST_REMOVE(wk);
			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			/*
			 * None of the directory additions will ever be
			 * visible, so they can simply be tossed.
			 */
			for (i = 0; i < DAHASHSZ; i++)
				while ((dap =
				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
					free_diradd(dap);
			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
				free_diradd(dap);
			/*
			 * Copy any directory remove dependencies to the list
			 * to be processed after the zero'ed inode is written.
			 * If the inode has already been written, then they
			 * can be dumped directly onto the work list.
			 */
			while ((dirrem =
			    LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
				LIST_REMOVE(dirrem, dm_next);
				dirrem->dm_dirinum = pagedep->pd_ino;
				if (inodedep == NULL)
					add_to_worklist(&dirrem->dm_list);
				else
					WORKLIST_INSERT(&inodedep->id_bufwait,
					    &dirrem->dm_list);
			}
			WORKLIST_REMOVE(&pagedep->pd_list);
			LIST_REMOVE(pagedep, pd_hash);
			WORKITEM_FREE(pagedep, D_PAGEDEP);
			continue;

		case D_ALLOCINDIR:
			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
			continue;

		case D_ALLOCDIRECT:
		case D_INODEDEP:
			panic("deallocate_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */

		default:
			panic("deallocate_dependencies: Unknown type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
}

/*
 * Free an allocdirect. Generate a new freefrag work request if appropriate.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_allocdirect(adphead, adp, delay)
	struct allocdirectlst *adphead;
	struct allocdirect *adp;
	int delay;
{

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("free_allocdirect: lock not held");
#endif
	if ((adp->ad_state & DEPCOMPLETE) == 0)
		LIST_REMOVE(adp, ad_deps);
	TAILQ_REMOVE(adphead, adp, ad_next);
	if ((adp->ad_state & COMPLETE) == 0)
		WORKLIST_REMOVE(&adp->ad_list);
	if (adp->ad_freefrag != NULL) {
		if (delay)
			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
			    &adp->ad_freefrag->ff_list);
		else
			add_to_worklist(&adp->ad_freefrag->ff_list);
	}
	WORKITEM_FREE(adp, D_ALLOCDIRECT);
}

/*
 * Prepare an inode to be freed. The actual free operation is not
 * done until the zero'ed inode has been written to disk.
 */
void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;
	ino_t ino;
	int mode;
{
	struct inode *ip = VTOI(pvp);
	struct inodedep *inodedep;
	struct freefile *freefile;

	/*
	 * This sets up the inode de-allocation dependency.
	 */
	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
		M_FREEFILE, M_WAITOK);
	freefile->fx_list.wk_type = D_FREEFILE;
	freefile->fx_list.wk_state = 0;
	freefile->fx_mode = mode;
	freefile->fx_oldinum = ino;
	freefile->fx_devvp = ip->i_devvp;
	freefile->fx_fs = ip->i_fs;

	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk and we can free the file immediately.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		handle_workitem_freefile(freefile);
		return;
	}

	/*
	 * If we still have a bitmap dependency, then the inode has never
	 * been written to disk. Drop the dependency as it is no longer
	 * necessary since the inode is being deallocated. We set the
	 * ALLCOMPLETE flags since the bitmap now properly shows that the
	 * inode is not allocated. Even if the inode is actively being
	 * written, it has been rolled back to its zero'ed state, so we
	 * are ensured that a zero inode is what is on the disk. For
	 * short-lived files, this change will usually result in removing
	 * all the dependencies from the inode so that it can be freed
	 * immediately.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		inodedep->id_state |= ALLCOMPLETE;
		LIST_REMOVE(inodedep, id_deps);
		inodedep->id_buf = NULL;
		WORKLIST_REMOVE(&inodedep->id_list);
	}
	if (free_inodedep(inodedep) == 0) {
		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
		FREE_LOCK(&lk);
	} else {
		FREE_LOCK(&lk);
		handle_workitem_freefile(freefile);
	}
}

/*
 * Try to free an inodedep structure. Return 1 if it could be freed.
 */
static int
free_inodedep(inodedep)
	struct inodedep *inodedep;
{

	if ((inodedep->id_state & ONWORKLIST) != 0 ||
	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
		return (0);
	LIST_REMOVE(inodedep, id_hash);
	WORKITEM_FREE(inodedep, D_INODEDEP);
	num_inodedep -= 1;
	return (1);
}

/*
 * This workitem routine performs the block de-allocation.
 * The workitem is added to the pending list after the updated
 * inode block has been written to disk.  As mentioned above,
 * checks regarding the number of blocks de-allocated (compared
 * to the number of blocks allocated for the file) are also
 * performed in this function.
 */
static void
handle_workitem_freeblocks(freeblks)
	struct freeblks *freeblks;
{
	struct inode tip;
	ufs_daddr_t bn;
	struct fs *fs;
	int i, level, bsize;
	long nblocks, blocksreleased = 0;
	int error, allerror = 0;
	ufs_lbn_t baselbns[NIADDR], tmpval;

	tip.i_number = freeblks->fb_previousinum;
	tip.i_devvp = freeblks->fb_devvp;
	tip.i_dev = freeblks->fb_devvp->v_rdev;
	tip.i_fs = freeblks->fb_fs;
	tip.i_size = freeblks->fb_oldsize;
	tip.i_uid = freeblks->fb_uid;
	fs = freeblks->fb_fs;
	/*
	 * Compute the first logical block number covered by each level
	 * of indirection: baselbns[i] = NDADDR + NINDIR + ... + NINDIR^i.
	 * For example, with NDADDR == 12 and NINDIR(fs) == 2048 this
	 * yields 12, 2060, and 4196364.
	 */
	tmpval = 1;
	baselbns[0] = NDADDR;
	for (i = 1; i < NIADDR; i++) {
		tmpval *= NINDIR(fs);
		baselbns[i] = baselbns[i - 1] + tmpval;
	}
	nblocks = btodb(fs->fs_bsize);
	blocksreleased = 0;
	/*
	 * Indirect blocks first.
	 */
	for (level = (NIADDR - 1); level >= 0; level--) {
		if ((bn = freeblks->fb_iblks[level]) == 0)
			continue;
		if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
		    baselbns[level], &blocksreleased)) != 0)
			allerror = error;
		ffs_blkfree(&tip, bn, fs->fs_bsize);
		blocksreleased += nblocks;
	}
	/*
	 * All direct blocks or frags.
	 */
	for (i = (NDADDR - 1); i >= 0; i--) {
		if ((bn = freeblks->fb_dblks[i]) == 0)
			continue;
		bsize = blksize(fs, &tip, i);
		ffs_blkfree(&tip, bn, bsize);
		blocksreleased += btodb(bsize);
	}

#ifdef DIAGNOSTIC
	if (freeblks->fb_chkcnt != blocksreleased)
		panic("handle_workitem_freeblocks: block count");
	if (allerror)
		softdep_error("handle_workitem_freeblocks", allerror);
#endif /* DIAGNOSTIC */
	WORKITEM_FREE(freeblks, D_FREEBLKS);
}

/*
 * Release blocks associated with the inode ip and stored in the indirect
 * block dbn. If level is greater than SINGLE, the block is an indirect block
 * and recursive calls to indir_trunc must be used to cleanse other indirect
 * blocks. (A stand-alone sketch of this walk appears after the function.)
 */
static int
indir_trunc(ip, dbn, level, lbn, countp)
	struct inode *ip;
	ufs_daddr_t dbn;
	int level;
	ufs_lbn_t lbn;
	long *countp;
{
	struct buf *bp;
	ufs_daddr_t *bap;
	ufs_daddr_t nb;
	struct fs *fs;
	struct worklist *wk;
	struct indirdep *indirdep;
	int i, lbnadd, nblocks;
	int error, allerror = 0;

	fs = ip->i_fs;
	lbnadd = 1;
	for (i = level; i > 0; i--)
		lbnadd *= NINDIR(fs);
	/*
	 * Get buffer of block pointers to be freed. This routine is not
	 * called until the zero'ed inode has been written, so it is safe
	 * to free blocks as they are encountered. Because the inode has
	 * been zero'ed, calls to bmap on these blocks will fail. So, we
	 * have to use the on-disk address and the block device for the
	 * filesystem to look them up. If the file was deleted before its
	 * indirect blocks were all written to disk, the routine that set
	 * us up (deallocate_dependencies) will have arranged to leave
	 * a complete copy of the indirect block in memory for our use.
	 * Otherwise we have to read the blocks in from the disk.
	 */
	ACQUIRE_LOCK(&lk);
	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		if (wk->wk_type != D_INDIRDEP ||
		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
		    (indirdep->ir_state & GOINGAWAY) == 0)
			panic("indir_trunc: lost indirdep");
		WORKLIST_REMOVE(wk);
		WORKITEM_FREE(indirdep, D_INDIRDEP);
		if (LIST_FIRST(&bp->b_dep) != NULL)
			panic("indir_trunc: dangling dep");
		FREE_LOCK(&lk);
	} else {
		FREE_LOCK(&lk);
		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error)
			return (error);
	}
	/*
	 * Recursively free indirect blocks.
	 */
	bap = (ufs_daddr_t *)bp->b_data;
	nblocks = btodb(fs->fs_bsize);
	for (i = NINDIR(fs) - 1; i >= 0; i--) {
		if ((nb = bap[i]) == 0)
			continue;
		if (level != 0) {
			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
			     level - 1, lbn + (i * lbnadd), countp)) != 0)
				allerror = error;
		}
		ffs_blkfree(ip, nb, fs->fs_bsize);
		*countp += nblocks;
	}
	bp->b_flags |= B_INVAL | B_NOCACHE;
	brelse(bp);
	return (allerror);
}
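
/*
 * Illustrative sketch (compiled out; not part of the kernel build):
 * the recursion above, reduced to a hypothetical userland walk of an
 * indirect-block tree. Each level multiplies the file-block span of
 * one pointer by NINDIR, the same arithmetic indir_trunc performs
 * with lbnadd; children are visited deepest-first, and the top-level
 * indirect block itself is freed by the caller, as in the kernel.
 */
#if 0
#include <stdio.h>

#define NINDIR	4		/* pointers per indirect block (toy value) */

/* Free everything below "blk", which maps file blocks starting at lbn. */
static void
trunc_walk(long blk, int level, long lbn)
{
	long lbnadd = 1;
	int i;

	for (i = level; i > 0; i--)
		lbnadd *= NINDIR;	/* file blocks spanned per pointer */
	for (i = NINDIR - 1; i >= 0; i--) {
		long child = blk * NINDIR + i;	/* stand-in for bap[i] */

		if (level != 0)		/* descend before freeing */
			trunc_walk(child, level - 1, lbn + i * lbnadd);
		printf("free block %ld (lbn %ld, level %d)\n",
		    child, lbn + i * lbnadd, level);
	}
}

int
main(void)
{
	/* A double-indirect block covering file blocks 12..27. */
	trunc_walk(1, 1, 12);
	return (0);
}
#endif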

/*
 * Free an allocindir.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_allocindir(aip, inodedep)
	struct allocindir *aip;
	struct inodedep *inodedep;
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("free_allocindir: lock not held");
#endif
	if ((aip->ai_state & DEPCOMPLETE) == 0)
		LIST_REMOVE(aip, ai_deps);
	if (aip->ai_state & ONWORKLIST)
		WORKLIST_REMOVE(&aip->ai_list);
	LIST_REMOVE(aip, ai_next);
	if ((freefrag = aip->ai_freefrag) != NULL) {
		if (inodedep == NULL)
			add_to_worklist(&freefrag->ff_list);
		else
			WORKLIST_INSERT(&inodedep->id_bufwait,
			    &freefrag->ff_list);
	}
	WORKITEM_FREE(aip, D_ALLOCINDIR);
}

/*
 * Directory entry addition dependencies.
 *
 * When adding a new directory entry, the inode (with its incremented link
 * count) must be written to disk before the directory entry's pointer to it.
 * Also, if the inode is newly allocated, the corresponding freemap must be
 * updated (on disk) before the directory entry's pointer. These requirements
 * are met via undo/redo on the directory entry's pointer, which consists
 * simply of the inode number. (A stand-alone sketch of this undo/redo
 * follows this comment.)
 *
 * As directory entries are added and deleted, the free space within a
 * directory block can become fragmented.  The ufs file system will compact
 * a fragmented directory block to make space for a new entry. When this
 * occurs, the offsets of previously added entries change. Any "diradd"
 * dependency structures corresponding to these entries must be updated with
 * the new offsets.
 */
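
/*
 * Illustrative sketch (compiled out; not part of the kernel build):
 * the undo/redo idea above, as a hypothetical userland model. If a
 * directory block must be written while the inode a new entry points
 * at is not yet on disk, the entry's inode number is rolled back to
 * zero in the write image and is exposed ("rolled forward") only once
 * the inode has been written. All names here are invented.
 */
#if 0
#include <stdio.h>
#include <string.h>

struct dirent_img {
	unsigned long d_ino;		/* 0 means "unused entry" on disk */
	char d_name[16];
};

/* Build the image handed to the disk for one entry. */
static struct dirent_img
roll_back(const struct dirent_img *ep, int inode_on_disk)
{
	struct dirent_img img = *ep;

	if (!inode_on_disk)
		img.d_ino = 0;		/* never expose a dangling pointer */
	return (img);
}

int
main(void)
{
	struct dirent_img ep;

	ep.d_ino = 42;
	strcpy(ep.d_name, "newfile");
	/* First write: inode 42 not yet on disk; entry goes out as free. */
	printf("write 1: ino %lu\n", roll_back(&ep, 0).d_ino);	/* 0 */
	/* After inode 42 reaches disk, the entry may be exposed. */
	printf("write 2: ino %lu\n", roll_back(&ep, 1).d_ino);	/* 42 */
	return (0);
}
#endif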

/*
 * This routine is called after the in-memory inode's link
 * count has been incremented, but before the directory entry's
 * pointer to the inode has been set.
 */
void
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for directory */
	off_t diroffset;	/* offset of new entry in directory */
	long newinum;		/* inode referenced by new directory entry */
	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
{
	int offset;		/* offset of new entry within directory block */
	ufs_lbn_t lbn;		/* block in directory containing new entry */
	struct fs *fs;
	struct diradd *dap;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mkdir *mkdir1, *mkdir2;

	/*
	 * Whiteouts have no dependencies.
	 */
	if (newinum == WINO) {
		if (newdirbp != NULL)
			bdwrite(newdirbp);
		return;
	}

	fs = dp->i_fs;
	lbn = lblkno(fs, diroffset);
	offset = blkoff(fs, diroffset);
	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
	bzero(dap, sizeof(struct diradd));
	dap->da_list.wk_type = D_DIRADD;
	dap->da_offset = offset;
	dap->da_newinum = newinum;
	dap->da_state = ATTACHED;
	if (newdirbp == NULL) {
		dap->da_state |= DEPCOMPLETE;
		ACQUIRE_LOCK(&lk);
	} else {
		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
		    M_WAITOK);
		mkdir1->md_list.wk_type = D_MKDIR;
		mkdir1->md_state = MKDIR_BODY;
		mkdir1->md_diradd = dap;
		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
		    M_WAITOK);
		mkdir2->md_list.wk_type = D_MKDIR;
		mkdir2->md_state = MKDIR_PARENT;
		mkdir2->md_diradd = dap;
		/*
		 * Dependency on "." and ".." being written to disk.
		 */
		mkdir1->md_buf = newdirbp;
		ACQUIRE_LOCK(&lk);
		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
		FREE_LOCK(&lk);
		bdwrite(newdirbp);
		/*
		 * Dependency on link count increase for parent directory.
		 */
		ACQUIRE_LOCK(&lk);
		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
			dap->da_state &= ~MKDIR_PARENT;
			WORKITEM_FREE(mkdir2, D_MKDIR);
		} else {
			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
			WORKLIST_INSERT(&inodedep->id_bufwait,
			    &mkdir2->md_list);
		}
	}
	/*
	 * Link into parent directory pagedep to await its being written.
	 */
	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	dap->da_pagedep = pagedep;
	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
	    da_pdlist);
	/*
	 * Link into its inodedep. Put it on the id_bufwait list if the inode
	 * is not yet written. If it is written, do the post-inode write
	 * processing to put it on the id_pendinghd list.
	 */
	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
		diradd_inode_written(dap, inodedep);
	else
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	FREE_LOCK(&lk);
}

/*
 * This procedure is called to change the offset of a directory
 * entry when compacting a directory block, which must be owned
 * exclusively by the caller. Note that the actual entry movement
 * must be done in this procedure to ensure that no I/O completions
 * occur while the move is in progress.
 */
void
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
	struct inode *dp;	/* inode for directory */
	caddr_t base;		/* address of dp->i_offset */
	caddr_t oldloc;		/* address of old directory location */
	caddr_t newloc;		/* address of new directory location */
	int entrysize;		/* size of directory entry */
{
	int offset, oldoffset, newoffset;
	struct pagedep *pagedep;
	struct diradd *dap;
	ufs_lbn_t lbn;

	ACQUIRE_LOCK(&lk);
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
		goto done;
	oldoffset = offset + (oldloc - base);
	newoffset = offset + (newloc - base);
	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
		if (dap->da_offset != oldoffset)
			continue;
		dap->da_offset = newoffset;
		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
			break;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
		    dap, da_pdlist);
		break;
	}
	if (dap == NULL) {
		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
		     dap; dap = LIST_NEXT(dap, da_pdlist)) {
			if (dap->da_offset == oldoffset) {
				dap->da_offset = newoffset;
				break;
			}
		}
	}
done:
	bcopy(oldloc, newloc, entrysize);
	FREE_LOCK(&lk);
}

/*
 * Free a diradd dependency structure. This routine must be called
 * with splbio interrupts blocked.
 */
static void
free_diradd(dap)
	struct diradd *dap;
{
	struct dirrem *dirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;
	struct mkdir *mkdir, *nextmd;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("free_diradd: lock not held");
#endif
	WORKLIST_REMOVE(&dap->da_list);
	LIST_REMOVE(dap, da_pdlist);
	if ((dap->da_state & DIRCHG) == 0) {
		pagedep = dap->da_pagedep;
	} else {
		dirrem = dap->da_previous;
		pagedep = dirrem->dm_pagedep;
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
	    0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != dap)
				continue;
			dap->da_state &= ~mkdir->md_state;
			WORKLIST_REMOVE(&mkdir->md_list);
			LIST_REMOVE(mkdir, md_mkdirs);
			WORKITEM_FREE(mkdir, D_MKDIR);
		}
		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("free_diradd: unfound ref");
	}
	WORKITEM_FREE(dap, D_DIRADD);
}

/*
 * Directory entry removal dependencies.
 *
 * When removing a directory entry, the entry's inode pointer must be
 * zero'ed on disk before the corresponding inode's link count is decremented
 * (possibly freeing the inode for re-use). This dependency is handled by
 * updating the directory entry but delaying the inode count reduction until
 * after the directory block has been written to disk. After this point, the
 * inode count can be decremented whenever it is convenient. (A stand-alone
 * sketch of this ordering follows this comment.)
 */
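
/*
 * Illustrative sketch (compiled out; not part of the kernel build):
 * the removal ordering above, as a hypothetical userland model. The
 * link-count decrement is queued as a workitem and runs only from the
 * callback that fires after the directory block with the cleared
 * entry has been written, so the on-disk entry never outlives the
 * inode it points at. All names here are invented.
 */
#if 0
#include <stdio.h>

static unsigned long buf_entry_ino = 42;	/* entry in the dir buffer */
static int nlink = 1;				/* inode's link count */
static int pending_dec;				/* deferred workitem */

static void
remove_entry(void)
{
	buf_entry_ino = 0;	/* clear the entry in the buffer... */
	pending_dec = 1;	/* ...but defer the inode update */
}

/* Runs once the directory block write has completed. */
static void
dirblock_written_callback(void)
{
	if (pending_dec) {
		pending_dec = 0;
		if (--nlink == 0)
			printf("inode may now be freed\n");
	}
}

int
main(void)
{
	remove_entry();
	printf("buffer entry %lu, nlink still %d\n", buf_entry_ino, nlink);
	/* ... disk write of the directory block happens here ... */
	dirblock_written_callback();
	return (0);
}
#endif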

/*
 * This routine should be called immediately after removing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will do this task when it is safe.
 */
void
softdep_setup_remove(bp, dp, ip, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
{
	struct dirrem *dirrem, *prevdirrem;

	/*
	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to a zeroed entry until
	 * the new inode is committed to disk. If the COMPLETE flag is
	 * set, then we have deleted an entry that never made it to
	 * disk. If the entry we deleted resulted from a name change,
	 * then the old name still resides on disk. We cannot delete
	 * its inode (returned to us in prevdirrem) until the zeroed
	 * directory entry gets to disk. The new inode has never been
	 * referenced on the disk, so it can be deleted immediately.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
		    dm_next);
		FREE_LOCK(&lk);
	} else {
		if (prevdirrem != NULL)
			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
			    prevdirrem, dm_next);
		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
		FREE_LOCK(&lk);
		handle_workitem_remove(dirrem);
	}
}

/*
 * Allocate a new dirrem if appropriate and return it along with
 * its associated pagedep. Called without a lock, returns with lock.
 */
static long num_dirrem;		/* number of dirrem allocated */
static struct dirrem *
newdirrem(bp, dp, ip, isrmdir, prevdirremp)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	int isrmdir;		/* indicates if doing RMDIR */
	struct dirrem **prevdirremp; /* previously referenced inode, if any */
{
	int offset;
	ufs_lbn_t lbn;
	struct diradd *dap;
	struct dirrem *dirrem;
	struct pagedep *pagedep;

	/*
	 * Whiteouts have no deletion dependencies.
	 */
	if (ip == NULL)
		panic("newdirrem: whiteout");
	/*
	 * If we are over our limit, try to improve the situation.
	 * Limiting the number of dirrem structures will also limit
	 * the number of freefile and freeblks structures.
	 */
	if (num_dirrem > max_softdeps / 2 && speedup_syncer() == 0)
		(void) request_cleanup(FLUSH_REMOVE, 0);
	num_dirrem += 1;
	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
		M_DIRREM, M_WAITOK);
	bzero(dirrem, sizeof(struct dirrem));
	dirrem->dm_list.wk_type = D_DIRREM;
	dirrem->dm_state = isrmdir ? RMDIR : 0;
	dirrem->dm_mnt = ITOV(ip)->v_mount;
	dirrem->dm_oldinum = ip->i_number;
	*prevdirremp = NULL;

	ACQUIRE_LOCK(&lk);
	lbn = lblkno(dp->i_fs, dp->i_offset);
	offset = blkoff(dp->i_fs, dp->i_offset);
	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	dirrem->dm_pagedep = pagedep;
	/*
	 * Check for a diradd dependency for the same directory entry.
	 * If present, then both dependencies become obsolete and can
	 * be de-allocated. Check for an entry on both the pd_diraddhd
	 * list and the pd_pendinghd list.
	 */
	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
	     dap; dap = LIST_NEXT(dap, da_pdlist))
		if (dap->da_offset == offset)
			break;
	if (dap == NULL) {
		for (dap = LIST_FIRST(&pagedep->pd_pendinghd);
		     dap; dap = LIST_NEXT(dap, da_pdlist))
			if (dap->da_offset == offset)
				break;
		if (dap == NULL)
			return (dirrem);
	}
	/*
	 * Must be ATTACHED at this point.
	 */
	if ((dap->da_state & ATTACHED) == 0)
		panic("newdirrem: not ATTACHED");
	if (dap->da_newinum != ip->i_number)
		panic("newdirrem: inum %d should be %d",
		    ip->i_number, dap->da_newinum);
	/*
	 * If we are deleting a changed name that never made it to disk,
	 * then return the dirrem describing the previous inode (which
	 * represents the inode currently referenced from this entry on disk).
	 */
	if ((dap->da_state & DIRCHG) != 0) {
		*prevdirremp = dap->da_previous;
		dap->da_state &= ~DIRCHG;
		dap->da_pagedep = pagedep;
	}
	/*
	 * We are deleting an entry that never made it to disk.
	 * Mark it COMPLETE so we can delete its inode immediately.
	 */
	dirrem->dm_state |= COMPLETE;
	free_diradd(dap);
	return (dirrem);
}

/*
 * Directory entry change dependencies.
 *
 * Changing an existing directory entry requires that an add operation
 * be completed first followed by a deletion. The semantics for the addition
 * are identical to the description of adding a new entry above except
 * that the rollback is to the old inode number rather than zero. Once
 * the addition dependency is completed, the removal is done as described
 * in the removal routine above.
 */

/*
 * This routine should be called immediately after changing
 * a directory entry.  The inode's link count should not be
 * decremented by the calling procedure -- the soft updates
 * code will perform this task when it is safe.
 */
void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
	struct buf *bp;		/* buffer containing directory block */
	struct inode *dp;	/* inode for the directory being modified */
	struct inode *ip;	/* inode for directory entry being removed */
	long newinum;		/* new inode number for changed entry */
	int isrmdir;		/* indicates if doing RMDIR */
{
	int offset;
	struct diradd *dap = NULL;
	struct dirrem *dirrem, *prevdirrem;
	struct pagedep *pagedep;
	struct inodedep *inodedep;

	offset = blkoff(dp->i_fs, dp->i_offset);

	/*
	 * Whiteouts do not need diradd dependencies.
	 */
	if (newinum != WINO) {
		MALLOC(dap, struct diradd *, sizeof(struct diradd),
		    M_DIRADD, M_WAITOK);
		bzero(dap, sizeof(struct diradd));
		dap->da_list.wk_type = D_DIRADD;
		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
		dap->da_offset = offset;
		dap->da_newinum = newinum;
	}

	/*
	 * Allocate a new dirrem and ACQUIRE_LOCK.
	 */
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
	pagedep = dirrem->dm_pagedep;
	/*
	 * The possible values for isrmdir:
	 *	0 - non-directory file rename
	 *	1 - directory rename within same directory
	 *   inum - directory rename to new directory of given inode number
	 * When renaming to a new directory, we are both deleting and
	 * creating a new directory entry, so the link count on the new
	 * directory should not change. Thus we do not need the followup
	 * dirrem which is usually done in handle_workitem_remove. We set
	 * the DIRCHG flag to tell handle_workitem_remove to skip the
	 * followup dirrem.
	 */
	if (isrmdir > 1)
		dirrem->dm_state |= DIRCHG;

	/*
	 * Whiteouts have no additional dependencies,
	 * so just put the dirrem on the correct list.
	 */
	if (newinum == WINO) {
		if ((dirrem->dm_state & COMPLETE) == 0) {
			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
			    dm_next);
		} else {
			dirrem->dm_dirinum = pagedep->pd_ino;
			add_to_worklist(&dirrem->dm_list);
		}
		FREE_LOCK(&lk);
		return;
	}

	/*
	 * If the COMPLETE flag is clear, then there were no active
	 * entries and we want to roll back to the previous inode until
	 * the new inode is committed to disk. If the COMPLETE flag is
	 * set, then we have deleted an entry that never made it to disk.
	 * If the entry we deleted resulted from a name change, then the old
	 * inode reference still resides on disk. Any rollback that we do
	 * needs to be to that old inode (returned to us in prevdirrem). If
	 * the entry we deleted resulted from a create, then there is
	 * no entry on the disk, so we want to roll back to zero rather
	 * than the uncommitted inode. In either of the COMPLETE cases we
	 * want to immediately free the unwritten and unreferenced inode.
	 */
	if ((dirrem->dm_state & COMPLETE) == 0) {
		dap->da_previous = dirrem;
	} else {
		if (prevdirrem != NULL) {
			dap->da_previous = prevdirrem;
		} else {
			dap->da_state &= ~DIRCHG;
			dap->da_pagedep = pagedep;
		}
		dirrem->dm_dirinum = pagedep->pd_ino;
		add_to_worklist(&dirrem->dm_list);
	}
	/*
	 * Link into its inodedep. Put it on the id_bufwait list if the inode
	 * is not yet written. If it is written, do the post-inode write
	 * processing to put it on the id_pendinghd list.
	 */
	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		dap->da_state |= COMPLETE;
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	} else {
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	}
	FREE_LOCK(&lk);
}

/*
 * Called whenever the link count on an inode is changed.
 * It creates an inode dependency so that the new reference(s)
 * to the inode cannot be committed to disk until the updated
 * inode has been written.
 */
void
softdep_change_linkcnt(ip)
	struct inode *ip;	/* the inode with the increased link count */
{
	struct inodedep *inodedep;

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_change_linkcnt: bad delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
}

/*
 * This workitem decrements the inode's link count.
 * If the link count reaches zero, the file is removed.
 */
static void
handle_workitem_remove(dirrem)
	struct dirrem *dirrem;
{
	struct proc *p = CURPROC;	/* XXX */
	struct inodedep *inodedep;
	struct vnode *vp;
	struct inode *ip;
	ino_t oldinum;
	int error;

	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
		softdep_error("handle_workitem_remove: vget", error);
		return;
	}
	ip = VTOI(vp);
	ACQUIRE_LOCK(&lk);
	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
		panic("handle_workitem_remove: lost inodedep");
	/*
	 * Normal file deletion.
	 */
	if ((dirrem->dm_state & RMDIR) == 0) {
		ip->i_nlink--;
		ip->i_flag |= IN_CHANGE;
		if (ip->i_nlink < ip->i_effnlink)
			panic("handle_workitem_remove: bad file delta");
		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
		FREE_LOCK(&lk);
		vput(vp);
		num_dirrem -= 1;
		WORKITEM_FREE(dirrem, D_DIRREM);
		return;
	}
	/*
	 * Directory deletion. Decrement reference count for both the
	 * just deleted parent directory entry and the reference for ".".
	 * Next truncate the directory to length zero. When the
	 * truncation completes, arrange to have the reference count on
	 * the parent decremented to account for the loss of "..".
	 */
	ip->i_nlink -= 2;
	ip->i_flag |= IN_CHANGE;
	if (ip->i_nlink < ip->i_effnlink)
		panic("handle_workitem_remove: bad dir delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
	if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
		softdep_error("handle_workitem_remove: truncate", error);
	/*
	 * Rename a directory to a new parent. Since we are both deleting
	 * and creating a new directory entry, the link count on the new
	 * directory should not change. Thus we skip the followup dirrem.
	 */
	if (dirrem->dm_state & DIRCHG) {
		vput(vp);
		num_dirrem -= 1;
		WORKITEM_FREE(dirrem, D_DIRREM);
		return;
	}
	/*
	 * If there is no inode dependency, then we can free immediately.
	 * If we still have a bitmap dependency, then the inode has never
	 * been written to disk. Drop the dependency as it is no longer
	 * necessary since the inode is being deallocated. We set the
	 * ALLCOMPLETE flags since the bitmap now properly shows that the
	 * inode is not allocated. Even if the inode is actively being
	 * written, it has been rolled back to its zero'ed state, so we
	 * are ensured that a zero inode is what is on the disk. For
	 * short-lived files, this change will usually result in removing
	 * all the dependencies from the inode so that it can be freed
	 * immediately.
	 */
	ACQUIRE_LOCK(&lk);
	dirrem->dm_state = 0;
	oldinum = dirrem->dm_oldinum;
	dirrem->dm_oldinum = dirrem->dm_dirinum;
	if ((inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep)) == 0)
		goto out;
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
		inodedep->id_state |= ALLCOMPLETE;
		LIST_REMOVE(inodedep, id_deps);
		inodedep->id_buf = NULL;
		WORKLIST_REMOVE(&inodedep->id_list);
	}
	if (free_inodedep(inodedep) == 0) {
		WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
		FREE_LOCK(&lk);
		vput(vp);
		return;
	}
out:
	FREE_LOCK(&lk);
	vput(vp);
	handle_workitem_remove(dirrem);
}
2700
2701/*
2702 * Inode de-allocation dependencies.
2703 *
2704 * When an inode's link count is reduced to zero, it can be de-allocated. We
2705 * found it convenient to postpone de-allocation until after the inode is
2706 * written to disk with its new link count (zero).  At this point, all of the
2707 * on-disk inode's block pointers are nullified and, with careful dependency
2708 * list ordering, all dependencies related to the inode will be satisfied and
2709 * the corresponding dependency structures de-allocated.  So, if/when the
2710 * inode is reused, there will be no mixing of old dependencies with new
2711 * ones.  This artificial dependency is set up by the block de-allocation
2712 * procedure above (softdep_setup_freeblocks) and completed by the
2713 * following procedure.
2714 */
2715static void
2716handle_workitem_freefile(freefile)
2717	struct freefile *freefile;
2718{
2719	struct vnode vp;
2720	struct inode tip;
2721	struct inodedep *idp;
2722	int error;
2723
2724#ifdef DEBUG
2725	ACQUIRE_LOCK(&lk);
2726	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
2727		panic("handle_workitem_freefile: inodedep survived");
2728	FREE_LOCK(&lk);
2729#endif
2730	tip.i_devvp = freefile->fx_devvp;
2731	tip.i_dev = freefile->fx_devvp->v_rdev;
2732	tip.i_fs = freefile->fx_fs;
2733	vp.v_data = &tip;
	if ((error = ffs_freefile(&vp, freefile->fx_oldinum,
	    freefile->fx_mode)) != 0)
2735		softdep_error("handle_workitem_freefile", error);
2736	WORKITEM_FREE(freefile, D_FREEFILE);
2737}
2738
2739/*
2740 * Disk writes.
2741 *
2742 * The dependency structures constructed above are most actively used when file
2743 * system blocks are written to disk.  No constraints are placed on when a
2744 * block can be written, but unsatisfied update dependencies are made safe by
2745 * modifying (or replacing) the source memory for the duration of the disk
2746 * write.  When the disk write completes, the memory block is again brought
2747 * up-to-date.
2748 *
2749 * In-core inode structure reclamation.
2750 *
2751 * Because there are a finite number of "in-core" inode structures, they are
2752 * reused regularly.  By transferring all inode-related dependencies to the
2753 * in-memory inode block and indexing them separately (via "inodedep"s), we
2754 * can allow "in-core" inode structures to be reused at any time and avoid
2755 * any increase in contention.
2756 *
2757 * Called just before entering the device driver to initiate a new disk I/O.
2758 * The buffer must be locked, thus, no I/O completion operations can occur
2759 * while we are manipulating its associated dependencies.
2760 */
2761static void
2762softdep_disk_io_initiation(bp)
2763	struct buf *bp;		/* structure describing disk write to occur */
2764{
2765	struct worklist *wk, *nextwk;
2766	struct indirdep *indirdep;
2767
2768	/*
2769	 * We only care about write operations. There should never
2770	 * be dependencies for reads.
2771	 */
2772	if (bp->b_flags & B_READ)
2773		panic("softdep_disk_io_initiation: read");
2774	/*
2775	 * Do any necessary pre-I/O processing.
2776	 */
2777	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2778		nextwk = LIST_NEXT(wk, wk_list);
2779		switch (wk->wk_type) {
2780
2781		case D_PAGEDEP:
2782			initiate_write_filepage(WK_PAGEDEP(wk), bp);
2783			continue;
2784
2785		case D_INODEDEP:
2786			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2787			continue;
2788
2789		case D_INDIRDEP:
2790			indirdep = WK_INDIRDEP(wk);
2791			if (indirdep->ir_state & GOINGAWAY)
2792				panic("disk_io_initiation: indirdep gone");
2793			/*
2794			 * If there are no remaining dependencies, this
2795			 * will be writing the real pointers, so the
2796			 * dependency can be freed.
2797			 */
2798			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2799				indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2800				brelse(indirdep->ir_savebp);
2801				/* inline expand WORKLIST_REMOVE(wk); */
2802				wk->wk_state &= ~ONWORKLIST;
2803				LIST_REMOVE(wk, wk_list);
2804				WORKITEM_FREE(indirdep, D_INDIRDEP);
2805				continue;
2806			}
2807			/*
2808			 * Replace up-to-date version with safe version.
2809			 */
2810			ACQUIRE_LOCK(&lk);
2811			indirdep->ir_state &= ~ATTACHED;
2812			indirdep->ir_state |= UNDONE;
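			/*
			 * Stash the up-to-date contents in ir_saveddata
			 * and write the safe copy kept in ir_savebp in
			 * its place.  softdep_disk_write_complete()
			 * copies ir_saveddata back when the write is done.
			 */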
2813			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
2814			    M_INDIRDEP, M_WAITOK);
2815			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
2816			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
2817			    bp->b_bcount);
2818			FREE_LOCK(&lk);
2819			continue;
2820
2821		case D_MKDIR:
2822		case D_BMSAFEMAP:
2823		case D_ALLOCDIRECT:
2824		case D_ALLOCINDIR:
2825			continue;
2826
2827		default:
			panic("softdep_disk_io_initiation: Unexpected type %s",
2829			    TYPENAME(wk->wk_type));
2830			/* NOTREACHED */
2831		}
2832	}
2833}
2834
2835/*
2836 * Called from within the procedure above to deal with unsatisfied
2837 * allocation dependencies in a directory. The buffer must be locked,
2838 * thus, no I/O completion operations can occur while we are
2839 * manipulating its associated dependencies.
2840 */
2841static void
2842initiate_write_filepage(pagedep, bp)
2843	struct pagedep *pagedep;
2844	struct buf *bp;
2845{
2846	struct diradd *dap;
2847	struct direct *ep;
2848	int i;
2849
2850	if (pagedep->pd_state & IOSTARTED) {
2851		/*
2852		 * This can only happen if there is a driver that does not
2853		 * understand chaining. Here biodone will reissue the call
2854		 * to strategy for the incomplete buffers.
2855		 */
2856		printf("initiate_write_filepage: already started\n");
2857		return;
2858	}
2859	pagedep->pd_state |= IOSTARTED;
2860	ACQUIRE_LOCK(&lk);
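	/*
	 * Roll back each uncommitted directory entry: an entry being
	 * changed (DIRCHG) is written with its previous inumber and a
	 * brand new entry is written as unallocated (d_ino == 0).
	 * handle_written_filepage() rolls them forward again once the
	 * write has completed.
	 */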
2861	for (i = 0; i < DAHASHSZ; i++) {
2862		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
2863		     dap = LIST_NEXT(dap, da_pdlist)) {
2864			ep = (struct direct *)
2865			    ((char *)bp->b_data + dap->da_offset);
2866			if (ep->d_ino != dap->da_newinum)
2867				panic("%s: dir inum %d != new %d",
2868				    "initiate_write_filepage",
2869				    ep->d_ino, dap->da_newinum);
2870			if (dap->da_state & DIRCHG)
2871				ep->d_ino = dap->da_previous->dm_oldinum;
2872			else
2873				ep->d_ino = 0;
2874			dap->da_state &= ~ATTACHED;
2875			dap->da_state |= UNDONE;
2876		}
2877	}
2878	FREE_LOCK(&lk);
2879}
2880
2881/*
2882 * Called from within the procedure above to deal with unsatisfied
2883 * allocation dependencies in an inodeblock. The buffer must be
2884 * locked, thus, no I/O completion operations can occur while we
2885 * are manipulating its associated dependencies.
2886 */
2887static void
2888initiate_write_inodeblock(inodedep, bp)
2889	struct inodedep *inodedep;
2890	struct buf *bp;			/* The inode block */
2891{
2892	struct allocdirect *adp, *lastadp;
2893	struct dinode *dp;
2894	struct fs *fs;
2895	ufs_lbn_t prevlbn = 0;
2896	int i, deplist;
2897
2898	if (inodedep->id_state & IOSTARTED)
2899		panic("initiate_write_inodeblock: already started");
2900	inodedep->id_state |= IOSTARTED;
2901	fs = inodedep->id_fs;
2902	dp = (struct dinode *)bp->b_data +
2903	    ino_to_fsbo(fs, inodedep->id_ino);
2904	/*
2905	 * If the bitmap is not yet written, then the allocated
2906	 * inode cannot be written to disk.
2907	 */
2908	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2909		if (inodedep->id_savedino != NULL)
2910			panic("initiate_write_inodeblock: already doing I/O");
2911		MALLOC(inodedep->id_savedino, struct dinode *,
2912		    sizeof(struct dinode), M_INODEDEP, M_WAITOK);
2913		*inodedep->id_savedino = *dp;
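		/*
		 * The inode goes to disk as all zeroes; the copy saved
		 * in id_savedino is rolled forward again by
		 * handle_written_inodeblock() when this write completes.
		 */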
2914		bzero((caddr_t)dp, sizeof(struct dinode));
2915		return;
2916	}
2917	/*
2918	 * If no dependencies, then there is nothing to roll back.
2919	 */
2920	inodedep->id_savedsize = dp->di_size;
2921	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
2922		return;
2923	/*
2924	 * Set the dependencies to busy.
2925	 */
2926	ACQUIRE_LOCK(&lk);
2927	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2928	     adp = TAILQ_NEXT(adp, ad_next)) {
2929#ifdef DIAGNOSTIC
2930		if (deplist != 0 && prevlbn >= adp->ad_lbn)
2931			panic("softdep_write_inodeblock: lbn order");
2932		prevlbn = adp->ad_lbn;
2933		if (adp->ad_lbn < NDADDR &&
2934		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
2935			panic("%s: direct pointer #%ld mismatch %d != %d",
2936			    "softdep_write_inodeblock", adp->ad_lbn,
2937			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
2938		if (adp->ad_lbn >= NDADDR &&
2939		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
2940			panic("%s: indirect pointer #%ld mismatch %d != %d",
2941			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
2942			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
2943		deplist |= 1 << adp->ad_lbn;
2944		if ((adp->ad_state & ATTACHED) == 0)
2945			panic("softdep_write_inodeblock: Unknown state 0x%x",
2946			    adp->ad_state);
2947#endif /* DIAGNOSTIC */
2948		adp->ad_state &= ~ATTACHED;
2949		adp->ad_state |= UNDONE;
2950	}
2951	/*
2952	 * The on-disk inode cannot claim to be any larger than the last
2953	 * fragment that has been written. Otherwise, the on-disk inode
2954	 * might have fragments that were not the last block in the file
2955	 * which would corrupt the filesystem.
2956	 */
2957	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2958	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
2959		if (adp->ad_lbn >= NDADDR)
2960			break;
2961		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
2962		/* keep going until hitting a rollback to a frag */
2963		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
2964			continue;
2965		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
2966		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
2967#ifdef DIAGNOSTIC
2968			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
2969				panic("softdep_write_inodeblock: lost dep1");
2970#endif /* DIAGNOSTIC */
2971			dp->di_db[i] = 0;
2972		}
2973		for (i = 0; i < NIADDR; i++) {
2974#ifdef DIAGNOSTIC
2975			if (dp->di_ib[i] != 0 &&
2976			    (deplist & ((1 << NDADDR) << i)) == 0)
2977				panic("softdep_write_inodeblock: lost dep2");
2978#endif /* DIAGNOSTIC */
2979			dp->di_ib[i] = 0;
2980		}
2981		FREE_LOCK(&lk);
2982		return;
2983	}
2984	/*
2985	 * If we have zero'ed out the last allocated block of the file,
2986	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is a full-sized block, as
2988	 * we already checked for fragments in the loop above.
2989	 */
2990	if (lastadp != NULL &&
2991	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
2992		for (i = lastadp->ad_lbn; i >= 0; i--)
2993			if (dp->di_db[i] != 0)
2994				break;
2995		dp->di_size = (i + 1) * fs->fs_bsize;
2996	}
2997	/*
2998	 * The only dependencies are for indirect blocks.
2999	 *
3000	 * The file size for indirect block additions is not guaranteed.
3001	 * Such a guarantee would be non-trivial to achieve. The conventional
3002	 * synchronous write implementation also does not make this guarantee.
3003	 * Fsck should catch and fix discrepancies. Arguably, the file size
3004	 * can be over-estimated without destroying integrity when the file
3005	 * moves into the indirect blocks (i.e., is large). If we want to
3006	 * postpone fsck, we are stuck with this argument.
3007	 */
3008	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3009		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3010	FREE_LOCK(&lk);
3011}
3012
3013/*
3014 * This routine is called during the completion interrupt
3015 * service routine for a disk write (from the procedure called
3016 * by the device driver to inform the file system caches of
3017 * a request completion).  It should be called early in this
3018 * procedure, before the block is made available to other
3019 * processes or other routines are called.
3020 */
3021static void
3022softdep_disk_write_complete(bp)
3023	struct buf *bp;		/* describes the completed disk write */
3024{
3025	struct worklist *wk;
3026	struct workhead reattach;
3027	struct newblk *newblk;
3028	struct allocindir *aip;
3029	struct allocdirect *adp;
3030	struct indirdep *indirdep;
3031	struct inodedep *inodedep;
3032	struct bmsafemap *bmsafemap;
3033
3034#ifdef DEBUG
3035	if (lk.lkt_held != -1)
3036		panic("softdep_disk_write_complete: lock is held");
3037	lk.lkt_held = -2;
3038#endif
3039	LIST_INIT(&reattach);
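	/*
	 * Work items that could not be fully resolved are collected on
	 * the local "reattach" list and hung back off bp->b_dep below
	 * so that they are reconsidered the next time the buffer is
	 * written.
	 */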
3040	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3041		WORKLIST_REMOVE(wk);
3042		switch (wk->wk_type) {
3043
3044		case D_PAGEDEP:
3045			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3046				WORKLIST_INSERT(&reattach, wk);
3047			continue;
3048
3049		case D_INODEDEP:
3050			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3051				WORKLIST_INSERT(&reattach, wk);
3052			continue;
3053
3054		case D_BMSAFEMAP:
3055			bmsafemap = WK_BMSAFEMAP(wk);
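			/*
			 * The cylinder group bitmap is now on disk, so
			 * every newblk, allocdirect, allocindir, and
			 * inodedep that was waiting on it can be marked
			 * DEPCOMPLETE and detached from the bitmap buffer.
			 */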
3056			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3057				newblk->nb_state |= DEPCOMPLETE;
3058				newblk->nb_bmsafemap = NULL;
3059				LIST_REMOVE(newblk, nb_deps);
3060			}
3061			while ((adp =
3062			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3063				adp->ad_state |= DEPCOMPLETE;
3064				adp->ad_buf = NULL;
3065				LIST_REMOVE(adp, ad_deps);
3066				handle_allocdirect_partdone(adp);
3067			}
3068			while ((aip =
3069			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3070				aip->ai_state |= DEPCOMPLETE;
3071				aip->ai_buf = NULL;
3072				LIST_REMOVE(aip, ai_deps);
3073				handle_allocindir_partdone(aip);
3074			}
3075			while ((inodedep =
3076			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3077				inodedep->id_state |= DEPCOMPLETE;
3078				LIST_REMOVE(inodedep, id_deps);
3079				inodedep->id_buf = NULL;
3080			}
3081			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3082			continue;
3083
3084		case D_MKDIR:
3085			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3086			continue;
3087
3088		case D_ALLOCDIRECT:
3089			adp = WK_ALLOCDIRECT(wk);
3090			adp->ad_state |= COMPLETE;
3091			handle_allocdirect_partdone(adp);
3092			continue;
3093
3094		case D_ALLOCINDIR:
3095			aip = WK_ALLOCINDIR(wk);
3096			aip->ai_state |= COMPLETE;
3097			handle_allocindir_partdone(aip);
3098			continue;
3099
3100		case D_INDIRDEP:
3101			indirdep = WK_INDIRDEP(wk);
3102			if (indirdep->ir_state & GOINGAWAY)
3103				panic("disk_write_complete: indirdep gone");
3104			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3105			FREE(indirdep->ir_saveddata, M_INDIRDEP);
3106			indirdep->ir_saveddata = 0;
3107			indirdep->ir_state &= ~UNDONE;
3108			indirdep->ir_state |= ATTACHED;
3109			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3110				handle_allocindir_partdone(aip);
3111				if (aip == LIST_FIRST(&indirdep->ir_donehd))
3112					panic("disk_write_complete: not gone");
3113			}
3114			WORKLIST_INSERT(&reattach, wk);
3115			if ((bp->b_flags & B_DELWRI) == 0)
3116				stat_indir_blk_ptrs++;
3117			bdirty(bp);
3118			continue;
3119
3120		default:
			panic("softdep_disk_write_complete: Unknown type %s",
3122			    TYPENAME(wk->wk_type));
3123			/* NOTREACHED */
3124		}
3125	}
3126	/*
3127	 * Reattach any requests that must be redone.
3128	 */
3129	while ((wk = LIST_FIRST(&reattach)) != NULL) {
3130		WORKLIST_REMOVE(wk);
3131		WORKLIST_INSERT(&bp->b_dep, wk);
3132	}
3133#ifdef DEBUG
3134	if (lk.lkt_held != -2)
3135		panic("softdep_disk_write_complete: lock lost");
3136	lk.lkt_held = -1;
3137#endif
3138}
3139
3140/*
3141 * Called from within softdep_disk_write_complete above. Note that
3142 * this routine is always called from interrupt level with further
3143 * splbio interrupts blocked.
3144 */
3145static void
3146handle_allocdirect_partdone(adp)
3147	struct allocdirect *adp;	/* the completed allocdirect */
3148{
3149	struct allocdirect *listadp;
3150	struct inodedep *inodedep;
3151	long bsize;
3152
3153	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3154		return;
3155	if (adp->ad_buf != NULL)
3156		panic("handle_allocdirect_partdone: dangling dep");
3157	/*
3158	 * The on-disk inode cannot claim to be any larger than the last
3159	 * fragment that has been written. Otherwise, the on-disk inode
3160	 * might have fragments that were not the last block in the file
3161	 * which would corrupt the filesystem. Thus, we cannot free any
3162	 * allocdirects after one whose ad_oldblkno claims a fragment as
3163	 * these blocks must be rolled back to zero before writing the inode.
3164	 * We check the currently active set of allocdirects in id_inoupdt.
3165	 */
3166	inodedep = adp->ad_inodedep;
3167	bsize = inodedep->id_fs->fs_bsize;
3168	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp;
3169	     listadp = TAILQ_NEXT(listadp, ad_next)) {
3170		/* found our block */
3171		if (listadp == adp)
3172			break;
		/* continue if the old block is not a fragment */
3174		if (listadp->ad_oldsize == 0 ||
3175		    listadp->ad_oldsize == bsize)
3176			continue;
3177		/* hit a fragment */
3178		return;
3179	}
3180	/*
3181	 * If we have reached the end of the current list without
3182	 * finding the just finished dependency, then it must be
3183	 * on the future dependency list. Future dependencies cannot
3184	 * be freed until they are moved to the current list.
3185	 */
3186	if (listadp == NULL) {
3187#ifdef DEBUG
3188		for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp;
3189		     listadp = TAILQ_NEXT(listadp, ad_next))
3190			/* found our block */
3191			if (listadp == adp)
3192				break;
3193		if (listadp == NULL)
3194			panic("handle_allocdirect_partdone: lost dep");
3195#endif /* DEBUG */
3196		return;
3197	}
3198	/*
3199	 * If we have found the just finished dependency, then free
3200	 * it along with anything that follows it that is complete.
3201	 */
3202	for (; adp; adp = listadp) {
3203		listadp = TAILQ_NEXT(adp, ad_next);
3204		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3205			return;
3206		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
3207	}
3208}
3209
3210/*
3211 * Called from within softdep_disk_write_complete above. Note that
3212 * this routine is always called from interrupt level with further
3213 * splbio interrupts blocked.
3214 */
3215static void
3216handle_allocindir_partdone(aip)
3217	struct allocindir *aip;		/* the completed allocindir */
3218{
3219	struct indirdep *indirdep;
3220
3221	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
3222		return;
3223	if (aip->ai_buf != NULL)
3224		panic("handle_allocindir_partdone: dangling dependency");
3225	indirdep = aip->ai_indirdep;
3226	if (indirdep->ir_state & UNDONE) {
3227		LIST_REMOVE(aip, ai_next);
3228		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
3229		return;
3230	}
3231	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
3232	    aip->ai_newblkno;
3233	LIST_REMOVE(aip, ai_next);
3234	if (aip->ai_freefrag != NULL)
3235		add_to_worklist(&aip->ai_freefrag->ff_list);
3236	WORKITEM_FREE(aip, D_ALLOCINDIR);
3237}
3238
3239/*
3240 * Called from within softdep_disk_write_complete above to restore
3241 * in-memory inode block contents to their most up-to-date state. Note
3242 * that this routine is always called from interrupt level with further
3243 * splbio interrupts blocked.
3244 */
3245static int
3246handle_written_inodeblock(inodedep, bp)
3247	struct inodedep *inodedep;
3248	struct buf *bp;		/* buffer containing the inode block */
3249{
3250	struct worklist *wk, *filefree;
3251	struct allocdirect *adp, *nextadp;
3252	struct dinode *dp;
3253	int hadchanges;
3254
3255	if ((inodedep->id_state & IOSTARTED) == 0)
3256		panic("handle_written_inodeblock: not started");
3257	inodedep->id_state &= ~IOSTARTED;
3258	inodedep->id_state |= COMPLETE;
3259	dp = (struct dinode *)bp->b_data +
3260	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
3261	/*
3262	 * If we had to rollback the inode allocation because of
3263	 * bitmaps being incomplete, then simply restore it.
3264	 * Keep the block dirty so that it will not be reclaimed until
3265	 * all associated dependencies have been cleared and the
3266	 * corresponding updates written to disk.
3267	 */
3268	if (inodedep->id_savedino != NULL) {
3269		*dp = *inodedep->id_savedino;
3270		FREE(inodedep->id_savedino, M_INODEDEP);
3271		inodedep->id_savedino = NULL;
3272		if ((bp->b_flags & B_DELWRI) == 0)
3273			stat_inode_bitmap++;
3274		bdirty(bp);
3275		return (1);
3276	}
3277	/*
3278	 * Roll forward anything that had to be rolled back before
3279	 * the inode could be updated.
3280	 */
3281	hadchanges = 0;
3282	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
3283		nextadp = TAILQ_NEXT(adp, ad_next);
3284		if (adp->ad_state & ATTACHED)
3285			panic("handle_written_inodeblock: new entry");
3286		if (adp->ad_lbn < NDADDR) {
3287			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
3288				panic("%s: %s #%ld mismatch %d != %d",
3289				    "handle_written_inodeblock",
3290				    "direct pointer", adp->ad_lbn,
3291				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
3292			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
3293		} else {
3294			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
3295				panic("%s: %s #%ld allocated as %d",
3296				    "handle_written_inodeblock",
3297				    "indirect pointer", adp->ad_lbn - NDADDR,
3298				    dp->di_ib[adp->ad_lbn - NDADDR]);
3299			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
3300		}
3301		adp->ad_state &= ~UNDONE;
3302		adp->ad_state |= ATTACHED;
3303		hadchanges = 1;
3304	}
3305	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
3306		stat_direct_blk_ptrs++;
3307	/*
3308	 * Reset the file size to its most up-to-date value.
3309	 */
3310	if (inodedep->id_savedsize == -1)
3311		panic("handle_written_inodeblock: bad size");
3312	if (dp->di_size != inodedep->id_savedsize) {
3313		dp->di_size = inodedep->id_savedsize;
3314		hadchanges = 1;
3315	}
3316	inodedep->id_savedsize = -1;
3317	/*
3318	 * If there were any rollbacks in the inode block, then it must be
	 * marked dirty so that it will eventually get written back in
3320	 * its correct form.
3321	 */
3322	if (hadchanges)
3323		bdirty(bp);
3324	/*
3325	 * Process any allocdirects that completed during the update.
3326	 */
3327	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
3328		handle_allocdirect_partdone(adp);
3329	/*
3330	 * Process deallocations that were held pending until the
3331	 * inode had been written to disk. Freeing of the inode
3332	 * is delayed until after all blocks have been freed to
3333	 * avoid creation of new <vfsid, inum, lbn> triples
3334	 * before the old ones have been deleted.
3335	 */
3336	filefree = NULL;
3337	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
3338		WORKLIST_REMOVE(wk);
3339		switch (wk->wk_type) {
3340
3341		case D_FREEFILE:
3342			/*
3343			 * We defer adding filefree to the worklist until
3344			 * all other additions have been made to ensure
3345			 * that it will be done after all the old blocks
3346			 * have been freed.
3347			 */
3348			if (filefree != NULL)
3349				panic("handle_written_inodeblock: filefree");
3350			filefree = wk;
3351			continue;
3352
3353		case D_MKDIR:
3354			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3355			continue;
3356
3357		case D_DIRADD:
3358			diradd_inode_written(WK_DIRADD(wk), inodedep);
3359			continue;
3360
3361		case D_FREEBLKS:
3362		case D_FREEFRAG:
3363		case D_DIRREM:
3364			add_to_worklist(wk);
3365			continue;
3366
3367		default:
3368			panic("handle_written_inodeblock: Unknown type %s",
3369			    TYPENAME(wk->wk_type));
3370			/* NOTREACHED */
3371		}
3372	}
3373	if (filefree != NULL) {
3374		if (free_inodedep(inodedep) == 0)
3375			panic("handle_written_inodeblock: live inodedep");
3376		add_to_worklist(filefree);
3377		return (0);
3378	}
3379
3380	/*
3381	 * If no outstanding dependencies, free it.
3382	 */
3383	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3384		return (0);
3385	return (hadchanges);
3386}
3387
3388/*
3389 * Process a diradd entry after its dependent inode has been written.
3390 * This routine must be called with splbio interrupts blocked.
3391 */
3392static void
3393diradd_inode_written(dap, inodedep)
3394	struct diradd *dap;
3395	struct inodedep *inodedep;
3396{
3397	struct pagedep *pagedep;
3398
3399	dap->da_state |= COMPLETE;
3400	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3401		if (dap->da_state & DIRCHG)
3402			pagedep = dap->da_previous->dm_pagedep;
3403		else
3404			pagedep = dap->da_pagedep;
3405		LIST_REMOVE(dap, da_pdlist);
3406		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3407	}
3408	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3409}
3410
3411/*
3412 * Handle the completion of a mkdir dependency.
3413 */
3414static void
3415handle_written_mkdir(mkdir, type)
3416	struct mkdir *mkdir;
3417	int type;
3418{
3419	struct diradd *dap;
3420	struct pagedep *pagedep;
3421
3422	if (mkdir->md_state != type)
3423		panic("handle_written_mkdir: bad type");
3424	dap = mkdir->md_diradd;
3425	dap->da_state &= ~type;
3426	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3427		dap->da_state |= DEPCOMPLETE;
3428	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3429		if (dap->da_state & DIRCHG)
3430			pagedep = dap->da_previous->dm_pagedep;
3431		else
3432			pagedep = dap->da_pagedep;
3433		LIST_REMOVE(dap, da_pdlist);
3434		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3435	}
3436	LIST_REMOVE(mkdir, md_mkdirs);
3437	WORKITEM_FREE(mkdir, D_MKDIR);
3438}
3439
3440/*
3441 * Called from within softdep_disk_write_complete above.
3442 * A write operation was just completed. Removed inodes can
3443 * now be freed and associated block pointers may be committed.
3444 * Note that this routine is always called from interrupt level
3445 * with further splbio interrupts blocked.
3446 */
3447static int
3448handle_written_filepage(pagedep, bp)
3449	struct pagedep *pagedep;
3450	struct buf *bp;		/* buffer containing the written page */
3451{
3452	struct dirrem *dirrem;
3453	struct diradd *dap, *nextdap;
3454	struct direct *ep;
3455	int i, chgs;
3456
3457	if ((pagedep->pd_state & IOSTARTED) == 0)
3458		panic("handle_written_filepage: not started");
3459	pagedep->pd_state &= ~IOSTARTED;
3460	/*
3461	 * Process any directory removals that have been committed.
3462	 */
3463	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3464		LIST_REMOVE(dirrem, dm_next);
3465		dirrem->dm_dirinum = pagedep->pd_ino;
3466		add_to_worklist(&dirrem->dm_list);
3467	}
3468	/*
3469	 * Free any directory additions that have been committed.
3470	 */
3471	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3472		free_diradd(dap);
3473	/*
3474	 * Uncommitted directory entries must be restored.
3475	 */
3476	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3477		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3478		     dap = nextdap) {
3479			nextdap = LIST_NEXT(dap, da_pdlist);
3480			if (dap->da_state & ATTACHED)
3481				panic("handle_written_filepage: attached");
3482			ep = (struct direct *)
3483			    ((char *)bp->b_data + dap->da_offset);
3484			ep->d_ino = dap->da_newinum;
3485			dap->da_state &= ~UNDONE;
3486			dap->da_state |= ATTACHED;
3487			chgs = 1;
3488			/*
3489			 * If the inode referenced by the directory has
3490			 * been written out, then the dependency can be
3491			 * moved to the pending list.
3492			 */
3493			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3494				LIST_REMOVE(dap, da_pdlist);
3495				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3496				    da_pdlist);
3497			}
3498		}
3499	}
3500	/*
3501	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
3503	 * its correct form.
3504	 */
3505	if (chgs) {
3506		if ((bp->b_flags & B_DELWRI) == 0)
3507			stat_dir_entry++;
3508		bdirty(bp);
3509	}
3510	/*
3511	 * If no dependencies remain, the pagedep will be freed.
3512	 * Otherwise it will remain to update the page before it
3513	 * is written back to disk.
3514	 */
3515	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3516		for (i = 0; i < DAHASHSZ; i++)
3517			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3518				break;
3519		if (i == DAHASHSZ) {
3520			LIST_REMOVE(pagedep, pd_hash);
3521			WORKITEM_FREE(pagedep, D_PAGEDEP);
3522			return (0);
3523		}
3524	}
3525	return (1);
3526}
3527
3528/*
3529 * Writing back in-core inode structures.
3530 *
3531 * The file system only accesses an inode's contents when it occupies an
3532 * "in-core" inode structure.  These "in-core" structures are separate from
3533 * the page frames used to cache inode blocks.  Only the latter are
3534 * transferred to/from the disk.  So, when the updated contents of the
3535 * "in-core" inode structure are copied to the corresponding in-memory inode
3536 * block, the dependencies are also transferred.  The following procedure is
3537 * called when copying a dirty "in-core" inode to a cached inode block.
3538 */
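
/*
 * A worked example of the link-count bookkeeping used below
 * (illustrative numbers only): consider an inode whose on-disk link
 * count i_nlink is 2 while one already-committed removal is still
 * pending, so its effective count i_effnlink is 1.  The difference,
 * i_nlink - i_effnlink == 1, is recorded in id_nlinkdelta.  If the
 * inode is reloaded from disk, softdep_load_inodeblock() reconstructs
 * i_effnlink as i_nlink - id_nlinkdelta == 1.
 */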
3539
3540/*
3541 * Called when an inode is loaded from disk. If the effective link count
3542 * differed from the actual link count when it was last flushed, then we
3543 * need to ensure that the correct effective link count is put back.
3544 */
3545void
3546softdep_load_inodeblock(ip)
	struct inode *ip;	/* the "in-core" copy of the inode */
3548{
3549	struct inodedep *inodedep;
3550
3551	/*
3552	 * Check for alternate nlink count.
3553	 */
3554	ip->i_effnlink = ip->i_nlink;
3555	ACQUIRE_LOCK(&lk);
3556	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3557		FREE_LOCK(&lk);
3558		return;
3559	}
3560	ip->i_effnlink -= inodedep->id_nlinkdelta;
3561	FREE_LOCK(&lk);
3562}
3563
3564/*
3565 * This routine is called just before the "in-core" inode
3566 * information is to be copied to the in-memory inode block.
3567 * Recall that an inode block contains several inodes. If
3568 * the force flag is set, then the dependencies will be
3569 * cleared so that the update can always be made. Note that
3570 * the buffer is locked when this routine is called, so we
3571 * will never be in the middle of writing the inode block
3572 * to disk.
3573 */
3574void
3575softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;	/* the "in-core" copy of the inode */
3577	struct buf *bp;		/* the buffer containing the inode block */
3578	int waitfor;		/* nonzero => update must be allowed */
3579{
3580	struct inodedep *inodedep;
3581	struct worklist *wk;
3582	int error, gotit;
3583
3584	/*
3585	 * If the effective link count is not equal to the actual link
3586	 * count, then we must track the difference in an inodedep while
3587	 * the inode is (potentially) tossed out of the cache. Otherwise,
3588	 * if there is no existing inodedep, then there are no dependencies
3589	 * to track.
3590	 */
3591	ACQUIRE_LOCK(&lk);
3592	if (ip->i_effnlink != ip->i_nlink) {
3593		(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC,
3594		    &inodedep);
3595	} else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3596		FREE_LOCK(&lk);
3597		return;
3598	}
3599	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
3600		panic("softdep_update_inodeblock: bad delta");
3601	/*
3602	 * Changes have been initiated. Anything depending on these
3603	 * changes cannot occur until this inode has been written.
3604	 */
3605	inodedep->id_state &= ~COMPLETE;
3606	if ((inodedep->id_state & ONWORKLIST) == 0)
3607		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3608	/*
3609	 * Any new dependencies associated with the incore inode must
3610	 * now be moved to the list associated with the buffer holding
3611	 * the in-memory copy of the inode. Once merged process any
3612	 * allocdirects that are completed by the merger.
3613	 */
3614	merge_inode_lists(inodedep);
3615	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3616		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3617	/*
3618	 * Now that the inode has been pushed into the buffer, the
3619	 * operations dependent on the inode being written to disk
3620	 * can be moved to the id_bufwait so that they will be
3621	 * processed when the buffer I/O completes.
3622	 */
3623	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
3624		WORKLIST_REMOVE(wk);
3625		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
3626	}
3627	/*
3628	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them has been written (indicated by
3630	 * DEPCOMPLETE being set in id_state). If we are doing a
3631	 * forced sync (e.g., an fsync on a file), we force the bitmap
3632	 * to be written so that the update can be done.
3633	 */
3634	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3635		FREE_LOCK(&lk);
3636		return;
3637	}
3638	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3639	FREE_LOCK(&lk);
3640	if (gotit &&
3641	    (error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0)
3642		softdep_error("softdep_update_inodeblock: bwrite", error);
3643	if ((inodedep->id_state & DEPCOMPLETE) == 0)
3644		panic("softdep_update_inodeblock: update failed");
3645}
3646
3647/*
3648 * Merge the new inode dependency list (id_newinoupdt) into the old
3649 * inode dependency list (id_inoupdt). This routine must be called
3650 * with splbio interrupts blocked.
3651 */
3652static void
3653merge_inode_lists(inodedep)
3654	struct inodedep *inodedep;
3655{
3656	struct allocdirect *listadp, *newadp;
3657
3658	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
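	/*
	 * Both lists are sorted by logical block number.  Step listadp
	 * forward past entries with smaller lbns, splice each new entry
	 * in front of the first old entry with an lbn at least as
	 * large, and merge the two when the lbns are equal.  Any new
	 * entries left over are appended in the loop below.
	 */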
3659	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3660		if (listadp->ad_lbn < newadp->ad_lbn) {
3661			listadp = TAILQ_NEXT(listadp, ad_next);
3662			continue;
3663		}
3664		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3665		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3666		if (listadp->ad_lbn == newadp->ad_lbn) {
3667			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3668			    listadp);
3669			listadp = newadp;
3670		}
3671		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3672	}
3673	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3674		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3675		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3676	}
3677}
3678
3679/*
3680 * If we are doing an fsync, then we must ensure that any directory
3681 * entries for the inode have been written after the inode gets to disk.
3682 */
3683static int
3684softdep_fsync(vp)
	struct vnode *vp;	/* vnode of the file being fsync'ed */
3686{
3687	struct inodedep *inodedep;
3688	struct pagedep *pagedep;
3689	struct worklist *wk;
3690	struct diradd *dap;
3691	struct mount *mnt;
3692	struct vnode *pvp;
3693	struct inode *ip;
3694	struct buf *bp;
3695	struct fs *fs;
3696	struct proc *p = CURPROC;		/* XXX */
3697	int error, flushparent;
3698	ino_t parentino;
3699	ufs_lbn_t lbn;
3700
3701	ip = VTOI(vp);
3702	fs = ip->i_fs;
3703	ACQUIRE_LOCK(&lk);
3704	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
3705		FREE_LOCK(&lk);
3706		return (0);
3707	}
3708	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3709	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
3710	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3711	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
3712		panic("softdep_fsync: pending ops");
3713	for (error = 0, flushparent = 0; ; ) {
3714		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3715			break;
3716		if (wk->wk_type != D_DIRADD)
3717			panic("softdep_fsync: Unexpected type %s",
3718			    TYPENAME(wk->wk_type));
3719		dap = WK_DIRADD(wk);
3720		/*
3721		 * Flush our parent if this directory entry
3722		 * has a MKDIR_PARENT dependency.
3723		 */
3724		if (dap->da_state & DIRCHG)
3725			pagedep = dap->da_previous->dm_pagedep;
3726		else
3727			pagedep = dap->da_pagedep;
3728		mnt = pagedep->pd_mnt;
3729		parentino = pagedep->pd_ino;
3730		lbn = pagedep->pd_lbn;
3731		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
3732			panic("softdep_fsync: dirty");
3733		flushparent = dap->da_state & MKDIR_PARENT;
3734		/*
3735		 * If we are being fsync'ed as part of vgone'ing this vnode,
3736		 * then we will not be able to release and recover the
3737		 * vnode below, so we just have to give up on writing its
3738		 * directory entry out. It will eventually be written, just
3739		 * not now, but then the user was not asking to have it
3740		 * written, so we are not breaking any promises.
3741		 */
3742		if (vp->v_flag & VXLOCK)
3743			break;
3744		/*
3745		 * We prevent deadlock by always fetching inodes from the
3746		 * root, moving down the directory tree. Thus, when fetching
3747		 * our parent directory, we must unlock ourselves before
3748		 * requesting the lock on our parent. See the comment in
3749		 * ufs_lookup for details on possible races.
3750		 */
3751		FREE_LOCK(&lk);
3752		VOP_UNLOCK(vp, 0, p);
3753		error = VFS_VGET(mnt, parentino, &pvp);
3754		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3755		if (error != 0)
3756			return (error);
3757		if (flushparent) {
3758			if ((error = UFS_UPDATE(pvp, 1)) != 0) {
3759				vput(pvp);
3760				return (error);
3761			}
3762		}
3763		/*
3764		 * Flush directory page containing the inode's name.
3765		 */
3766		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3767		    &bp);
3768		if (error == 0)
3769			error = VOP_BWRITE(bp->b_vp, bp);
3770		vput(pvp);
3771		if (error != 0)
3772			return (error);
3773		ACQUIRE_LOCK(&lk);
3774		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
3775			break;
3776	}
3777	FREE_LOCK(&lk);
3778	return (0);
3779}
3780
3781/*
3782 * Flush all the dirty bitmaps associated with the block device
3783 * before flushing the rest of the dirty blocks so as to reduce
3784 * the number of dependencies that will have to be rolled back.
3785 */
3786void
3787softdep_fsync_mountdev(vp)
3788	struct vnode *vp;
3789{
3790	struct buf *bp, *nbp;
3791	struct worklist *wk;
3792
3793	if (!vn_isdisk(vp, NULL))
3794		panic("softdep_fsync_mountdev: vnode not a disk");
3795	ACQUIRE_LOCK(&lk);
3796	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
3797		nbp = TAILQ_NEXT(bp, b_vnbufs);
3798		/*
3799		 * If it is already scheduled, skip to the next buffer.
3800		 */
3801		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
3802			continue;
3803		if ((bp->b_flags & B_DELWRI) == 0)
3804			panic("softdep_fsync_mountdev: not dirty");
3805		/*
3806		 * We are only interested in bitmaps with outstanding
3807		 * dependencies.
3808		 */
3809		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
3810		    wk->wk_type != D_BMSAFEMAP) {
3811			BUF_UNLOCK(bp);
3812			continue;
3813		}
3814		bremfree(bp);
3815		FREE_LOCK(&lk);
3816		(void) bawrite(bp);
3817		ACQUIRE_LOCK(&lk);
3818		/*
3819		 * Since we may have slept during the I/O, we need
3820		 * to start from a known point.
3821		 */
3822		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3823	}
3824	drain_output(vp, 1);
3825	FREE_LOCK(&lk);
3826}
3827
3828/*
3829 * This routine is called when we are trying to synchronously flush a
3830 * file. This routine must eliminate any filesystem metadata dependencies
3831 * so that the syncing routine can succeed by pushing the dirty blocks
3832 * associated with the file. If any I/O errors occur, they are returned.
3833 */
3834int
3835softdep_sync_metadata(ap)
3836	struct vop_fsync_args /* {
3837		struct vnode *a_vp;
3838		struct ucred *a_cred;
3839		int a_waitfor;
3840		struct proc *a_p;
3841	} */ *ap;
3842{
3843	struct vnode *vp = ap->a_vp;
3844	struct pagedep *pagedep;
3845	struct allocdirect *adp;
3846	struct allocindir *aip;
3847	struct buf *bp, *nbp;
3848	struct worklist *wk;
3849	int i, error, waitfor;
3850
3851	/*
3852	 * Check whether this vnode is involved in a filesystem
3853	 * that is doing soft dependency processing.
3854	 */
3855	if (!vn_isdisk(vp, NULL)) {
3856		if (!DOINGSOFTDEP(vp))
3857			return (0);
3858	} else
3859		if (vp->v_specmountpoint == NULL ||
3860		    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
3861			return (0);
3862	/*
3863	 * Ensure that any direct block dependencies have been cleared.
3864	 */
3865	ACQUIRE_LOCK(&lk);
3866	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
3867		FREE_LOCK(&lk);
3868		return (error);
3869	}
3870	/*
3871	 * For most files, the only metadata dependencies are the
3872	 * cylinder group maps that allocate their inode or blocks.
3873	 * The block allocation dependencies can be found by traversing
3874	 * the dependency lists for any buffers that remain on their
3875	 * dirty buffer list. The inode allocation dependency will
3876	 * be resolved when the inode is updated with MNT_WAIT.
3877	 * This work is done in two passes. The first pass grabs most
3878	 * of the buffers and begins asynchronously writing them. The
3879	 * only way to wait for these asynchronous writes is to sleep
3880	 * on the filesystem vnode which may stay busy for a long time
3881	 * if the filesystem is active. So, instead, we make a second
3882	 * pass over the dependencies blocking on each write. In the
3883	 * usual case we will be blocking against a write that we
3884	 * initiated, so when it is done the dependency will have been
3885	 * resolved. Thus the second pass is expected to end quickly.
3886	 */
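	/*
	 * In outline (a sketch, not the exact control flow below):
	 *
	 *	waitfor = MNT_NOWAIT;
	 * top:	for each dirty buffer and each of its dependencies
	 *		start (pass 1) or wait for (pass 2) the write;
	 *	drain outstanding output;
	 *	if (waitfor == MNT_NOWAIT) { waitfor = MNT_WAIT; goto top; }
	 */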
3887	waitfor = MNT_NOWAIT;
3888top:
3889	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
3890		FREE_LOCK(&lk);
3891		return (0);
3892	}
3893	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
3894loop:
3895	/*
3896	 * As we hold the buffer locked, none of its dependencies
3897	 * will disappear.
3898	 */
3899	for (wk = LIST_FIRST(&bp->b_dep); wk;
3900	     wk = LIST_NEXT(wk, wk_list)) {
3901		switch (wk->wk_type) {
3902
3903		case D_ALLOCDIRECT:
3904			adp = WK_ALLOCDIRECT(wk);
3905			if (adp->ad_state & DEPCOMPLETE)
3906				break;
3907			nbp = adp->ad_buf;
3908			if (getdirtybuf(&nbp, waitfor) == 0)
3909				break;
3910			FREE_LOCK(&lk);
3911			if (waitfor == MNT_NOWAIT) {
3912				bawrite(nbp);
3913			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3914				bawrite(bp);
3915				return (error);
3916			}
3917			ACQUIRE_LOCK(&lk);
3918			break;
3919
3920		case D_ALLOCINDIR:
3921			aip = WK_ALLOCINDIR(wk);
3922			if (aip->ai_state & DEPCOMPLETE)
3923				break;
3924			nbp = aip->ai_buf;
3925			if (getdirtybuf(&nbp, waitfor) == 0)
3926				break;
3927			FREE_LOCK(&lk);
3928			if (waitfor == MNT_NOWAIT) {
3929				bawrite(nbp);
3930			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3931				bawrite(bp);
3932				return (error);
3933			}
3934			ACQUIRE_LOCK(&lk);
3935			break;
3936
3937		case D_INDIRDEP:
3938		restart:
3939			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
3940			     aip; aip = LIST_NEXT(aip, ai_next)) {
3941				if (aip->ai_state & DEPCOMPLETE)
3942					continue;
3943				nbp = aip->ai_buf;
3944				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
3945					goto restart;
3946				FREE_LOCK(&lk);
3947				if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
3948					bawrite(bp);
3949					return (error);
3950				}
3951				ACQUIRE_LOCK(&lk);
3952				goto restart;
3953			}
3954			break;
3955
3956		case D_INODEDEP:
3957			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
3958			    WK_INODEDEP(wk)->id_ino)) != 0) {
3959				FREE_LOCK(&lk);
3960				bawrite(bp);
3961				return (error);
3962			}
3963			break;
3964
3965		case D_PAGEDEP:
3966			/*
3967			 * We are trying to sync a directory that may
3968			 * have dependencies on both its own metadata
3969			 * and/or dependencies on the inodes of any
3970			 * recently allocated files. We walk its diradd
3971			 * lists pushing out the associated inode.
3972			 */
3973			pagedep = WK_PAGEDEP(wk);
3974			for (i = 0; i < DAHASHSZ; i++) {
3975				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
3976					continue;
3977				if ((error =
3978				    flush_pagedep_deps(vp, pagedep->pd_mnt,
3979						&pagedep->pd_diraddhd[i]))) {
3980					FREE_LOCK(&lk);
3981					bawrite(bp);
3982					return (error);
3983				}
3984			}
3985			break;
3986
3987		case D_MKDIR:
3988			/*
3989			 * This case should never happen if the vnode has
3990			 * been properly sync'ed. However, if this function
3991			 * is used at a place where the vnode has not yet
3992			 * been sync'ed, this dependency can show up. So,
3993			 * rather than panic, just flush it.
3994			 */
3995			nbp = WK_MKDIR(wk)->md_buf;
3996			if (getdirtybuf(&nbp, waitfor) == 0)
3997				break;
3998			FREE_LOCK(&lk);
3999			if (waitfor == MNT_NOWAIT) {
4000				bawrite(nbp);
4001			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4002				bawrite(bp);
4003				return (error);
4004			}
4005			ACQUIRE_LOCK(&lk);
4006			break;
4007
4008		case D_BMSAFEMAP:
4009			/*
4010			 * This case should never happen if the vnode has
4011			 * been properly sync'ed. However, if this function
4012			 * is used at a place where the vnode has not yet
4013			 * been sync'ed, this dependency can show up. So,
4014			 * rather than panic, just flush it.
4015			 */
4016			nbp = WK_BMSAFEMAP(wk)->sm_buf;
4017			if (getdirtybuf(&nbp, waitfor) == 0)
4018				break;
4019			FREE_LOCK(&lk);
4020			if (waitfor == MNT_NOWAIT) {
4021				bawrite(nbp);
4022			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
4023				bawrite(bp);
4024				return (error);
4025			}
4026			ACQUIRE_LOCK(&lk);
4027			break;
4028
4029		default:
4030			panic("softdep_sync_metadata: Unknown type %s",
4031			    TYPENAME(wk->wk_type));
4032			/* NOTREACHED */
4033		}
4034	}
4035	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
4036	nbp = TAILQ_NEXT(bp, b_vnbufs);
4037	FREE_LOCK(&lk);
4038	bawrite(bp);
4039	ACQUIRE_LOCK(&lk);
4040	if (nbp != NULL) {
4041		bp = nbp;
4042		goto loop;
4043	}
4044	/*
4045	 * We must wait for any I/O in progress to finish so that
4046	 * all potential buffers on the dirty list will be visible.
4047	 * Once they are all there, proceed with the second pass
4048	 * which will wait for the I/O as per above.
4049	 */
4050	drain_output(vp, 1);
4051	/*
4052	 * The brief unlock is to allow any pent up dependency
4053	 * processing to be done.
4054	 */
4055	if (waitfor == MNT_NOWAIT) {
4056		waitfor = MNT_WAIT;
4057		FREE_LOCK(&lk);
4058		ACQUIRE_LOCK(&lk);
4059		goto top;
4060	}
4061
4062	/*
4063	 * If we have managed to get rid of all the dirty buffers,
4064	 * then we are done. For certain directories and block
4065	 * devices, we may need to do further work.
4066	 */
4067	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
4068		FREE_LOCK(&lk);
4069		return (0);
4070	}
4071
4072	FREE_LOCK(&lk);
4073	/*
4074	 * If we are trying to sync a block device, some of its buffers may
4075	 * contain metadata that cannot be written until the contents of some
4076	 * partially written files have been written to disk. The only easy
4077	 * way to accomplish this is to sync the entire filesystem (luckily
4078	 * this happens rarely).
4079	 */
4080	if (vn_isdisk(vp, NULL) &&
4081	    vp->v_specmountpoint && !VOP_ISLOCKED(vp, NULL) &&
4082	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
4083	     ap->a_p)) != 0)
4084		return (error);
4085	return (0);
4086}
4087
4088/*
4089 * Flush the dependencies associated with an inodedep.
4090 * Called with splbio blocked.
4091 */
4092static int
4093flush_inodedep_deps(fs, ino)
4094	struct fs *fs;
4095	ino_t ino;
4096{
4097	struct inodedep *inodedep;
4098	struct allocdirect *adp;
4099	int error, waitfor;
4100	struct buf *bp;
4101
4102	/*
4103	 * This work is done in two passes. The first pass grabs most
4104	 * of the buffers and begins asynchronously writing them. The
4105	 * only way to wait for these asynchronous writes is to sleep
4106	 * on the filesystem vnode which may stay busy for a long time
4107	 * if the filesystem is active. So, instead, we make a second
4108	 * pass over the dependencies blocking on each write. In the
4109	 * usual case we will be blocking against a write that we
4110	 * initiated, so when it is done the dependency will have been
4111	 * resolved. Thus the second pass is expected to end quickly.
4112	 * We give a brief window at the top of the loop to allow
4113	 * any pending I/O to complete.
4114	 */
4115	for (waitfor = MNT_NOWAIT; ; ) {
4116		FREE_LOCK(&lk);
4117		ACQUIRE_LOCK(&lk);
4118		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
4119			return (0);
4120		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4121		     adp = TAILQ_NEXT(adp, ad_next)) {
4122			if (adp->ad_state & DEPCOMPLETE)
4123				continue;
4124			bp = adp->ad_buf;
4125			if (getdirtybuf(&bp, waitfor) == 0) {
4126				if (waitfor == MNT_NOWAIT)
4127					continue;
4128				break;
4129			}
4130			FREE_LOCK(&lk);
4131			if (waitfor == MNT_NOWAIT) {
4132				bawrite(bp);
4133			} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
4134				ACQUIRE_LOCK(&lk);
4135				return (error);
4136			}
4137			ACQUIRE_LOCK(&lk);
4138			break;
4139		}
4140		if (adp != NULL)
4141			continue;
4142		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
4143		     adp = TAILQ_NEXT(adp, ad_next)) {
4144			if (adp->ad_state & DEPCOMPLETE)
4145				continue;
4146			bp = adp->ad_buf;
4147			if (getdirtybuf(&bp, waitfor) == 0) {
4148				if (waitfor == MNT_NOWAIT)
4149					continue;
4150				break;
4151			}
4152			FREE_LOCK(&lk);
4153			if (waitfor == MNT_NOWAIT) {
4154				bawrite(bp);
4155			} else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) {
4156				ACQUIRE_LOCK(&lk);
4157				return (error);
4158			}
4159			ACQUIRE_LOCK(&lk);
4160			break;
4161		}
4162		if (adp != NULL)
4163			continue;
4164		/*
	 * If this was pass 2, we are done; otherwise begin pass 2.
4166		 */
4167		if (waitfor == MNT_WAIT)
4168			break;
4169		waitfor = MNT_WAIT;
4170	}
4171	/*
4172	 * Try freeing inodedep in case all dependencies have been removed.
4173	 */
4174	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
4175		(void) free_inodedep(inodedep);
4176	return (0);
4177}
4178
4179/*
4180 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
4181 * Called with splbio blocked.
4182 */
4183static int
4184flush_pagedep_deps(pvp, mp, diraddhdp)
4185	struct vnode *pvp;
4186	struct mount *mp;
4187	struct diraddhd *diraddhdp;
4188{
4189	struct proc *p = CURPROC;	/* XXX */
4190	struct inodedep *inodedep;
4191	struct ufsmount *ump;
4192	struct diradd *dap;
4193	struct vnode *vp;
4194	int gotit, error = 0;
4195	struct buf *bp;
4196	ino_t inum;
4197
4198	ump = VFSTOUFS(mp);
4199	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
4200		/*
4201		 * Flush ourselves if this directory entry
4202		 * has a MKDIR_PARENT dependency.
4203		 */
4204		if (dap->da_state & MKDIR_PARENT) {
4205			FREE_LOCK(&lk);
4206			if ((error = UFS_UPDATE(pvp, 1)) != 0)
4207				break;
4208			ACQUIRE_LOCK(&lk);
4209			/*
4210			 * If that cleared dependencies, go on to next.
4211			 */
4212			if (dap != LIST_FIRST(diraddhdp))
4213				continue;
4214			if (dap->da_state & MKDIR_PARENT)
4215				panic("flush_pagedep_deps: MKDIR");
4216		}
4217		/*
4218		 * Flush the file on which the directory entry depends.
4219		 * If the inode has already been pushed out of the cache,
4220		 * then all the block dependencies will have been flushed
4221		 * leaving only inode dependencies (e.g., bitmaps). Thus,
4222		 * we do a ufs_ihashget to check for the vnode in the cache.
4223		 * If it is there, we do a full flush. If it is no longer
4224		 * there we need only dispose of any remaining bitmap
4225		 * dependencies and write the inode to disk.
4226		 */
4227		inum = dap->da_newinum;
4228		FREE_LOCK(&lk);
4229		if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) {
4230			ACQUIRE_LOCK(&lk);
4231			if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0
4232			    && dap == LIST_FIRST(diraddhdp))
4233				panic("flush_pagedep_deps: flush 1 failed");
4234			/*
4235			 * If the inode still has bitmap dependencies,
4236			 * push them to disk.
4237			 */
4238			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4239				gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT);
4240				FREE_LOCK(&lk);
4241				if (gotit &&
4242				    (error = VOP_BWRITE(inodedep->id_buf->b_vp,
4243				     inodedep->id_buf)) != 0)
4244					break;
4245				ACQUIRE_LOCK(&lk);
4246			}
4247			if (dap != LIST_FIRST(diraddhdp))
4248				continue;
4249			/*
4250			 * If the inode is still sitting in a buffer waiting
4251			 * to be written, push it to disk.
4252			 */
4253			FREE_LOCK(&lk);
4254			if ((error = bread(ump->um_devvp,
4255			    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
4256			    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0)
4257				break;
4258			if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0)
4259				break;
4260			ACQUIRE_LOCK(&lk);
4261			if (dap == LIST_FIRST(diraddhdp))
4262				panic("flush_pagedep_deps: flush 2 failed");
4263			continue;
4264		}
4265		if (vp->v_type == VDIR) {
4266			/*
4267			 * A newly allocated directory must have its "." and
4268			 * ".." entries written out before its name can be
4269			 * committed in its parent. We do not want or need
4270			 * the full semantics of a synchronous VOP_FSYNC as
4271			 * that may end up here again, once for each directory
4272			 * level in the filesystem. Instead, we push the blocks
4273			 * and wait for them to clear. We have to fsync twice
4274			 * because the first call may choose to defer blocks
4275			 * that still have dependencies, but deferral will
4276			 * happen at most once.
4277			 */
4278			if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
4279			    (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
4280				vput(vp);
4281				break;
4282			}
4283			drain_output(vp, 0);
4284		}
4285		error = UFS_UPDATE(vp, 1);
4286		vput(vp);
4287		if (error)
4288			break;
4289		/*
4290		 * If we have failed to get rid of all the dependencies
4291		 * then something is seriously wrong.
4292		 */
4293		if (dap == LIST_FIRST(diraddhdp))
4294			panic("flush_pagedep_deps: flush 3 failed");
4295		ACQUIRE_LOCK(&lk);
4296	}
4297	if (error)
4298		ACQUIRE_LOCK(&lk);
4299	return (error);
4300}
4301
4302/*
4303 * A large burst of file addition or deletion activity can drive the
4304 * memory load excessively high. Therefore we deliberately slow things
4305 * down and speed up the I/O processing if we find ourselves with too
4306 * many dependencies in progress.
4307 */
4308static int
4309request_cleanup(resource, islocked)
4310	int resource;
4311	int islocked;
4312{
4313	struct callout_handle handle;
4314	struct proc *p = CURPROC;
4315
4316	/*
4317	 * We never hold up the filesystem syncer process.
4318	 */
4319	if (p == filesys_syncer)
4320		return (0);
4321	/*
4322	 * If we are resource constrained on inode dependencies, try
4323	 * flushing some dirty inodes. Otherwise, we are constrained
4324	 * by file deletions, so try accelerating flushes of directories
4325	 * with removal dependencies. We would like to do the cleanup
4326	 * here, but we probably hold an inode locked at this point and
4327	 * that might deadlock against one that we try to clean. So,
4328	 * the best that we can do is request the syncer daemon to do
4329	 * the cleanup for us.
4330	 */
4331	switch (resource) {
4332
4333	case FLUSH_INODES:
4334		stat_ino_limit_push += 1;
4335		req_clear_inodedeps = 1;
4336		break;
4337
4338	case FLUSH_REMOVE:
4339		stat_blk_limit_push += 1;
4340		req_clear_remove = 1;
4341		break;
4342
4343	default:
4344		panic("request_cleanup: unknown type");
4345	}
4346	/*
4347	 * Hopefully the syncer daemon will catch up and awaken us.
	 * We wait at most tickdelay ticks before proceeding in any case.
4349	 */
4350	if (islocked == 0)
4351		ACQUIRE_LOCK(&lk);
4352	if (proc_waiting == 0) {
4353		proc_waiting = 1;
4354		handle = timeout(pause_timer, NULL,
4355		    tickdelay > 2 ? tickdelay : 2);
4356	}
4357	FREE_LOCK_INTERLOCKED(&lk);
4358	(void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
4359	ACQUIRE_LOCK_INTERLOCKED(&lk);
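	/*
	 * If proc_waiting is still set, something other than the timer
	 * woke us, so cancel the timeout.  Otherwise pause_timer() has
	 * already fired, meaning we waited the full tickdelay and the
	 * resource limit was genuinely hit.
	 */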
4360	if (proc_waiting) {
4361		untimeout(pause_timer, NULL, handle);
4362		proc_waiting = 0;
4363	} else {
4364		switch (resource) {
4365
4366		case FLUSH_INODES:
4367			stat_ino_limit_hit += 1;
4368			break;
4369
4370		case FLUSH_REMOVE:
4371			stat_blk_limit_hit += 1;
4372			break;
4373		}
4374	}
4375	if (islocked == 0)
4376		FREE_LOCK(&lk);
4377	return (1);
4378}
4379
4380/*
4381 * Awaken processes pausing in request_cleanup and clear proc_waiting
4382 * to indicate that there is no longer a timer running.
4383 */
4384void
4385pause_timer(arg)
4386	void *arg;
4387{
4388
4389	proc_waiting = 0;
4390	wakeup(&proc_waiting);
4391}
4392
4393/*
4394 * Flush out a directory with at least one removal dependency in an effort to
4395 * reduce the number of dirrem, freefile, and freeblks dependency structures.
4396 */
4397static void
4398clear_remove(p)
4399	struct proc *p;
4400{
4401	struct pagedep_hashhead *pagedephd;
4402	struct pagedep *pagedep;
4403	static int next = 0;
4404	struct mount *mp;
4405	struct vnode *vp;
4406	int error, cnt;
4407	ino_t ino;
4408
4409	ACQUIRE_LOCK(&lk);
4410	for (cnt = 0; cnt < pagedep_hash; cnt++) {
4411		pagedephd = &pagedep_hashtbl[next++];
4412		if (next >= pagedep_hash)
4413			next = 0;
4414		for (pagedep = LIST_FIRST(pagedephd); pagedep;
4415		     pagedep = LIST_NEXT(pagedep, pd_hash)) {
4416			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
4417				continue;
4418			mp = pagedep->pd_mnt;
4419			ino = pagedep->pd_ino;
4420			FREE_LOCK(&lk);
4421			if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
4422				softdep_error("clear_remove: vget", error);
4423				return;
4424			}
4425			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
4426				softdep_error("clear_remove: fsync", error);
4427			drain_output(vp, 0);
4428			vput(vp);
4429			return;
4430		}
4431	}
4432	FREE_LOCK(&lk);
4433}

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 */
static void
clear_inodedeps(p)
	struct proc *p;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	ACQUIRE_LOCK(&lk);
	/*
	 * Pick an inode dependency to be cleared, cycling round-robin
	 * through the hash buckets. We will then gather up all the
	 * inodes in its block that have dependencies and flush them
	 * out. If every bucket turns out to be empty, there is
	 * nothing to do.
	 */
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[next++];
		if (next >= inodedep_hash)
			next = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL) {
		FREE_LOCK(&lk);
		return;
	}
	/*
	 * Ugly code to find mount point given pointer to superblock.
	 */
	fs = inodedep->id_fs;
	TAILQ_FOREACH(mp, &mountlist, mnt_list)
		if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
			break;
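	/*
	 * A filesystem cannot be unmounted while it still has
	 * dependencies outstanding, so the search above is expected
	 * to find a matching soft-updates mount.
	 */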
	/*
	 * Find the last inode in the block with dependencies. Inodes
	 * are allocated INOPB(fs) to a block and INOPB is a power of
	 * two, so masking off the low bits of the inode number yields
	 * the first inode of its block.
	 */
	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
			continue;
		FREE_LOCK(&lk);
		if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			return;
		}
		if (ino == lastino) {
			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
				softdep_error("clear_inodedeps: fsync2", error);
			drain_output(vp, 0);
		}
		vput(vp);
		ACQUIRE_LOCK(&lk);
	}
	FREE_LOCK(&lk);
}

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written. If wantcount
 * is set, return the number of dependencies, otherwise just yes or no.
 */
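/*
 * An unsatisfied dependency forces the disk copy of the buffer to be
 * rolled back to a safe state before the write and rolled forward
 * again afterward, leaving the buffer still dirty. Callers can use
 * the count to decide whether writing the buffer now is worthwhile.
 */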
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;
{
	struct worklist *wk;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct diradd *dap;
	int i, retval;

	retval = 0;
	ACQUIRE_LOCK(&lk);
	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) {
		switch (wk->wk_type) {

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			for (aip = LIST_FIRST(&indirdep->ir_deplisthd);
			     aip; aip = LIST_NEXT(aip, ai_next)) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			for (i = 0; i < DAHASHSZ; i++) {
				for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]);
				     dap; dap = LIST_NEXT(dap, da_pdlist)) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
		case D_MKDIR:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(&lk);
	return (retval);
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with splbio blocked.
 * Return 1 if buffer was acquired.
 */
static int
getdirtybuf(bpp, waitfor)
	struct buf **bpp;
	int waitfor;
{
	struct buf *bp;

	for (;;) {
		if ((bp = *bpp) == NULL)
			return (0);
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
			/*
			 * Got the buffer lock. If a background copy of
			 * the buffer is being written, release the lock
			 * and wait for that write to complete.
			 */
			if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
				break;
			BUF_UNLOCK(bp);
			if (waitfor != MNT_WAIT)
				return (0);
			bp->b_xflags |= BX_BKGRDWAIT;
			FREE_LOCK_INTERLOCKED(&lk);
			tsleep(&bp->b_xflags, PRIBIO, "getbuf", 0);
			ACQUIRE_LOCK_INTERLOCKED(&lk);
			continue;
		}
		if (waitfor != MNT_WAIT)
			return (0);
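		/*
		 * The buffer is locked by someone else. LK_SLEEPFAIL
		 * sleeps until the lock would have been acquired and
		 * then fails with ENOLCK instead of taking it, so the
		 * loop re-reads *bpp, which may have changed while we
		 * slept without the softdep lock held.
		 */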
		FREE_LOCK_INTERLOCKED(&lk);
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK)
			panic("getdirtybuf: inconsistent lock");
		ACQUIRE_LOCK_INTERLOCKED(&lk);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (0);
	}
	bremfree(bp);
	return (1);
}

/*
 * Wait for pending output on a vnode to complete.
 * Must be called with vnode locked.
 */
static void
drain_output(vp, islocked)
	struct vnode *vp;
	int islocked;
{

	if (!islocked)
		ACQUIRE_LOCK(&lk);
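	/*
	 * Setting VBWAIT asks the I/O completion path (vwakeup()) to
	 * issue a wakeup on &vp->v_numoutput once the count of writes
	 * in progress drains to zero.
	 */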
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		FREE_LOCK_INTERLOCKED(&lk);
		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0);
		ACQUIRE_LOCK_INTERLOCKED(&lk);
	}
	if (!islocked)
		FREE_LOCK(&lk);
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies. This should only happen if an I/O error has
 * occurred. The routine is called with the buffer locked.
 */
static void
softdep_deallocate_dependencies(bp)
	struct buf *bp;
{

	if ((bp->b_flags & B_ERROR) == 0)
		panic("softdep_deallocate_dependencies: dangling deps");
	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	panic("softdep_deallocate_dependencies: unrecovered I/O error");
}

/*
 * Function to handle asynchronous write errors in the filesystem.
 */
void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}
