ffs_softdep.c revision 36207
/*
 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
 *
 * The soft updates code is derived from the appendix of a University
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
 * "Soft Updates: A Solution to the Metadata Update Problem in File
 * Systems", CSE-TR-254-95, August 1995).
 *
 * The following are the copyrights and redistribution conditions that
 * apply to this copy of the soft update software. For a license
 * to use, redistribute or sell the soft update software under
 * conditions other than those described here, please contact the
 * author at one of the following addresses:
 *
 *	Marshall Kirk McKusick		mckusick@mckusick.com
 *	1614 Oxford Street		+1-510-843-9542
 *	Berkeley, CA 94709-1608
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. None of the names of McKusick, Ganger, Patt, or the University of
 *    Michigan may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 * 4. Redistributions in any form must be accompanied by information on
 *    how to obtain complete source code for any accompanying software
 *    that uses this software. This source code must either be included
 *    in the distribution or be available for no more than the cost of
 *    distribution plus a nominal fee, and must be freely redistributable
 *    under reasonable conditions. For an executable file, complete
 *    source code means the source code for all modules it contains.
 *    It does not mean source code for modules or files that typically
 *    accompany the operating system on which the executable file runs,
 *    e.g., standard library modules or system header files.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *	from: @(#)ffs_softdep.c	9.14 (McKusick) 1/15/98
 */

/*
 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
 */
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <machine/pcpu.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>

/*
 * Internal function prototypes.
 */
static	void softdep_error __P((char *, int));
static	int getdirtybuf __P((struct buf **, int));
static	int flush_pagedep_deps __P((struct vnode *, struct mount *,
	    struct diraddhd *));
static	int flush_inodedep_deps __P((struct fs *, ino_t));
static	int handle_written_filepage __P((struct pagedep *, struct buf *));
static	int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_allocdirect_partdone __P((struct allocdirect *));
static	void handle_allocindir_partdone __P((struct allocindir *));
static	void initiate_write_filepage __P((struct pagedep *, struct buf *));
static	void handle_written_mkdir __P((struct mkdir *, int));
static	void initiate_write_inodeblock __P((struct inodedep *, struct buf *));
static	void handle_workitem_freefile __P((struct freefile *));
static	void handle_workitem_remove __P((struct dirrem *));
static	struct dirrem *newdirrem __P((struct buf *, struct inode *,
	    struct inode *, int));
static	void free_diradd __P((struct diradd *));
static	void free_allocindir __P((struct allocindir *, struct inodedep *));
static	int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t,
	    long *));
static	void deallocate_dependencies __P((struct buf *, struct inodedep *));
static	void free_allocdirect __P((struct allocdirectlst *,
	    struct allocdirect *, int));
static	int free_inodedep __P((struct inodedep *));
static	void handle_workitem_freeblocks __P((struct freeblks *));
static	void merge_inode_lists __P((struct inodedep *));
static	void setup_allocindir_phase2 __P((struct buf *, struct inode *,
	    struct allocindir *));
static	struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t,
	    ufs_daddr_t));
static	void handle_workitem_freefrag __P((struct freefrag *));
static	struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long));
static	void allocdirect_merge __P((struct allocdirectlst *,
	    struct allocdirect *, struct allocdirect *));
static	struct bmsafemap *bmsafemap_lookup __P((struct buf *));
static	int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
	    struct newblk **));
static	int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static	int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
	    struct pagedep **));
static	void add_to_worklist __P((struct worklist *));

/*
 * Exported softdep operations.
 */
struct bio_ops bioops = {
	softdep_disk_io_initiation,		/* io_start */
	softdep_disk_write_complete,		/* io_complete */
	softdep_deallocate_dependencies,	/* io_deallocate */
	softdep_process_worklist,		/* io_sync */
};

/*
 * Names of malloc types.
 */
extern char *memname[];
#define TYPENAME(type) ((unsigned)(type) < M_LAST ? memname[type] : "???")

/*
 * Locking primitives.
 *
 * For a uniprocessor, all we need to do is protect against disk
 * interrupts. For a multiprocessor, this lock would have to be
 * a mutex. A single mutex is used throughout this file, though
 * finer grain locking could be used if contention warranted it.
 *
 * For a multiprocessor, the sleep call would accept a lock and
 * release it after the sleep processing was complete. In a uniprocessor
 * implementation there is no such interlock, so we simply mark
 * the places where it needs to be done with the `interlocked' form
 * of the lock calls. Since the uniprocessor sleep already interlocks
 * the spl, there is nothing that really needs to be done.
 */
#ifndef /* NOT */ DEBUG
static struct lockit {
	int	lkt_spl;
} lk = { 0 };
#define ACQUIRE_LOCK(lk)		(lk)->lkt_spl = splbio()
#define FREE_LOCK(lk)			splx((lk)->lkt_spl)
#define ACQUIRE_LOCK_INTERLOCKED(lk)
#define FREE_LOCK_INTERLOCKED(lk)

#else /* DEBUG */
static struct lockit {
	int	lkt_spl;
	pid_t	lkt_held;
} lk = { 0, -1 };
static int lockcnt;

static	void acquire_lock __P((struct lockit *));
static	void free_lock __P((struct lockit *));
static	void acquire_lock_interlocked __P((struct lockit *));
static	void free_lock_interlocked __P((struct lockit *));

#define ACQUIRE_LOCK(lk)		acquire_lock(lk)
#define FREE_LOCK(lk)			free_lock(lk)
#define ACQUIRE_LOCK_INTERLOCKED(lk)	acquire_lock_interlocked(lk)
#define FREE_LOCK_INTERLOCKED(lk)	free_lock_interlocked(lk)

static void
acquire_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1) {
		if (lk->lkt_held == curproc->p_pid)
			panic("softdep_lock: locking against myself");
		else
			panic("softdep_lock: lock held by %d", lk->lkt_held);
	}
	lk->lkt_spl = splbio();
	lk->lkt_held = curproc->p_pid;
	lockcnt++;
}

static void
free_lock(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock: lock not held");
	lk->lkt_held = -1;
	splx(lk->lkt_spl);
}

static void
acquire_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held != -1) {
		if (lk->lkt_held == curproc->p_pid)
			panic("softdep_lock_interlocked: locking against self");
		else
			panic("softdep_lock_interlocked: lock held by %d",
			    lk->lkt_held);
	}
	lk->lkt_held = curproc->p_pid;
	lockcnt++;
}

static void
free_lock_interlocked(lk)
	struct lockit *lk;
{

	if (lk->lkt_held == -1)
		panic("softdep_unlock_interlocked: lock not held");
	lk->lkt_held = -1;
}
#endif /* DEBUG */
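
/*
 * Illustrative sketch only (never compiled in): the pattern used
 * throughout this file. Every examination or modification of the
 * dependency structures is bracketed by ACQUIRE_LOCK/FREE_LOCK so
 * that a disk interrupt cannot observe a half-updated structure.
 */
#if 0
static void
example_locked_update(wk)
	struct worklist *wk;
{

	ACQUIRE_LOCK(&lk);
	/* ... examine or modify dependency structures here ... */
	FREE_LOCK(&lk);
}
#endif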

/*
 * Place holder for real semaphores.
 */
struct sema {
	int	value;
	pid_t	holder;
	char	*name;
	int	prio;
	int	timo;
};
static	void sema_init __P((struct sema *, char *, int, int));
static	int sema_get __P((struct sema *, struct lockit *));
static	void sema_release __P((struct sema *));

static void
sema_init(semap, name, prio, timo)
	struct sema *semap;
	char *name;
	int prio, timo;
{

	semap->holder = -1;
	semap->value = 0;
	semap->name = name;
	semap->prio = prio;
	semap->timo = timo;
}

static int
sema_get(semap, interlock)
	struct sema *semap;
	struct lockit *interlock;
{

	if (semap->value++ > 0) {
		if (interlock != NULL)
			FREE_LOCK_INTERLOCKED(interlock);
		tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
		if (interlock != NULL) {
			ACQUIRE_LOCK_INTERLOCKED(interlock);
			FREE_LOCK(interlock);
		}
		return (0);
	}
	semap->holder = curproc->p_pid;
	if (interlock != NULL)
		FREE_LOCK(interlock);
	return (1);
}

static void
sema_release(semap)
	struct sema *semap;
{

	if (semap->value <= 0 || semap->holder != curproc->p_pid)
		panic("sema_release: not held");
	if (--semap->value > 0) {
		semap->value = 0;
		wakeup(semap);
	}
	semap->holder = -1;
}
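
/*
 * Illustrative sketch only (never compiled in): the protocol followed
 * by the lookup routines later in this file. sema_get() returns 0 when
 * the caller had to sleep, so the caller must re-take the lock and
 * retry its search; a return of 1 means the semaphore was won with the
 * lock dropped, making it safe to do a blocking allocation.
 */
#if 0
static void
example_sema_use(semap)
	struct sema *semap;
{

	ACQUIRE_LOCK(&lk);
top:
	/* ... search for an existing structure; return if found ... */
	if (sema_get(semap, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	/* ... MALLOC and initialize the new structure ... */
	ACQUIRE_LOCK(&lk);
	/* ... insert it into its hash chain ... */
	sema_release(semap);
	FREE_LOCK(&lk);
}
#endif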

/*
 * Worklist queue management.
 * These routines require that the lock be held.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {	\
	(item)->wk_state |= ONWORKLIST;		\
	LIST_INSERT_HEAD(head, item, wk_list);	\
} while (0)
#define WORKLIST_REMOVE(item) do {		\
	(item)->wk_state &= ~ONWORKLIST;	\
	LIST_REMOVE(item, wk_list);		\
} while (0)
#define WORKITEM_FREE(item, type) FREE(item, type)

#else /* DEBUG */
static	void worklist_insert __P((struct workhead *, struct worklist *));
static	void worklist_remove __P((struct worklist *));
static	void workitem_free __P((struct worklist *, int));

#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)

static void
worklist_insert(head, item)
	struct workhead *head;
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_insert: lock not held");
	if (item->wk_state & ONWORKLIST)
		panic("worklist_insert: already on list");
	item->wk_state |= ONWORKLIST;
	LIST_INSERT_HEAD(head, item, wk_list);
}

static void
worklist_remove(item)
	struct worklist *item;
{

	if (lk.lkt_held == -1)
		panic("worklist_remove: lock not held");
	if ((item->wk_state & ONWORKLIST) == 0)
		panic("worklist_remove: not on list");
	item->wk_state &= ~ONWORKLIST;
	LIST_REMOVE(item, wk_list);
}

static void
workitem_free(item, type)
	struct worklist *item;
	int type;
{

	if (item->wk_state & ONWORKLIST)
		panic("workitem_free: still on list");
	if (item->wk_type != type)
		panic("workitem_free: type mismatch");
	FREE(item, type);
}
#endif /* DEBUG */

/*
 * Workitem queue management
 */
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;

/*
 * Add an item to the end of the work queue.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
static void
add_to_worklist(wk)
	struct worklist *wk;
{
	static struct worklist *worklist_tail;

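	/*
	 * The static tail pointer is safe only because this routine is
	 * the sole appender: it can go stale when the queue drains to
	 * empty, but the LIST_FIRST() check below re-anchors the list
	 * head before the stale pointer is ever dereferenced.
	 */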
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_worklist: already on list");
	wk->wk_state |= ONWORKLIST;
	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
	else
		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
	worklist_tail = wk;
}

/*
 * Process that runs once per second to handle items in the background queue.
 *
 * Note that we ensure that items are processed in the order in which they
 * appear in the queue. The code below depends on this property to ensure
 * that blocks of a file are freed before the inode itself is freed. This
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 * until all the old ones have been purged from the dependency lists.
 */
int
softdep_process_worklist(matchmnt)
	struct mount *matchmnt;
{
	struct worklist *wk;
	struct fs *matchfs;
	int matchcnt;

	matchcnt = 0;
	matchfs = NULL;
	if (matchmnt != NULL)
		matchfs = VFSTOUFS(matchmnt)->um_fs;
	/*
	 * There is no danger of having multiple processes run this
	 * code. It is single threaded solely so that softdep_flushfiles
	 * (below) can get an accurate count of the number of items
	 * related to its mount point that are in the list.
	 */
	if (softdep_worklist_busy && matchmnt == NULL)
		return (-1);
	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) {
		WORKLIST_REMOVE(wk);
		FREE_LOCK(&lk);
		switch (wk->wk_type) {

		case M_DIRREM:
			/* removal of a directory entry */
			if (WK_DIRREM(wk)->dm_mnt == matchmnt)
				matchcnt += 1;
			handle_workitem_remove(WK_DIRREM(wk));
			break;

		case M_FREEBLKS:
			/* releasing blocks and/or fragments from a file */
			if (WK_FREEBLKS(wk)->fb_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freeblocks(WK_FREEBLKS(wk));
			break;

		case M_FREEFRAG:
			/* releasing a fragment when replaced as a file grows */
			if (WK_FREEFRAG(wk)->ff_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefrag(WK_FREEFRAG(wk));
			break;

		case M_FREEFILE:
			/* releasing an inode when its link count drops to 0 */
			if (WK_FREEFILE(wk)->fx_fs == matchfs)
				matchcnt += 1;
			handle_workitem_freefile(WK_FREEFILE(wk));
			break;

		default:
			panic("%s_process_worklist: Unknown type %s",
			    "softdep", TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
		if (softdep_worklist_busy && matchmnt == NULL)
			return (-1);
		ACQUIRE_LOCK(&lk);
	}
	FREE_LOCK(&lk);
	return (matchcnt);
}
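
/*
 * Illustrative sketch only (never compiled in): a possible caller,
 * such as a kernel daemon, draining the background queue once per
 * second as described above. A return of -1 means the worklist was
 * busy; otherwise the count of items matching the mount is returned.
 */
#if 0
static void
example_worklist_daemon()
{

	for (;;) {
		(void) softdep_process_worklist(NULL);
		tsleep(&lbolt, PRIBIO, "sdwork", 0);
	}
}
#endif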

/*
 * Purge the work list of all items associated with a particular mount point.
 */
int
softdep_flushfiles(oldmnt, flags, p)
	struct mount *oldmnt;
	int flags;
	struct proc *p;
{
	struct vnode *devvp;
	int error, loopcnt;

	/*
	 * Await our turn to clear out the queue.
	 */
	while (softdep_worklist_busy)
		sleep(&lbolt, PRIBIO);
	softdep_worklist_busy = 1;
	if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) {
		softdep_worklist_busy = 0;
		return (error);
	}
	/*
	 * Alternately flush the block device associated with the mount
	 * point and process any dependencies that the flushing
	 * creates. In theory, this loop can happen at most twice,
	 * but we give it a few extra just to be sure.
	 */
	devvp = VFSTOUFS(oldmnt)->um_devvp;
	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
		if (softdep_process_worklist(oldmnt) == 0) {
			/*
			 * Do another flush in case any vnodes were brought in
			 * as part of the cleanup operations.
			 */
			if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
				break;
			/*
			 * If we still found nothing to do, we are really done.
			 */
			if (softdep_process_worklist(oldmnt) == 0)
				break;
		}
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
		VOP_UNLOCK(devvp, 0, p);
		if (error)
			break;
	}
	softdep_worklist_busy = 0;
	/*
	 * If we are unmounting then it is an error to fail. If we
	 * are simply trying to downgrade to read-only, then filesystem
	 * activity can keep us busy forever, so we just fail with EBUSY.
	 */
	if (loopcnt == 0) {
		if (oldmnt->mnt_flag & MNT_UNMOUNT)
			panic("softdep_flushfiles: looping");
		error = EBUSY;
	}
	return (error);
}

/*
 * Structure hashing.
 *
 * There are three types of structures that can be looked up:
 *	1) pagedep structures identified by mount point, inode number,
 *	   and logical block.
 *	2) inodedep structures identified by mount point and inode number.
 *	3) newblk structures identified by mount point and
 *	   physical block number.
 *
 * The "pagedep" and "inodedep" dependency structures are hashed
 * separately from the file blocks and inodes to which they correspond.
 * This separation helps when the in-memory copy of an inode or
 * file block must be replaced. It also obviates the need to access
 * an inode or file page when simply updating (or de-allocating)
 * dependency structures. Lookup of newblk structures is needed to
 * find newly allocated blocks when trying to associate them with
 * their allocdirect or allocindir structure.
 *
 * The lookup routines optionally create and hash a new instance when
 * an existing entry is not found.
 */
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */

/*
 * Structures and routines associated with pagedep caching.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long	pagedep_hash;		/* size of hash table - 1 */
#define	PAGEDEP_HASH(mp, inum, lbn) \
	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
	    pagedep_hash])
static struct sema pagedep_in_progress;

/*
 * Look up a pagedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in pagedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
pagedep_lookup(ip, lbn, flags, pagedeppp)
	struct inode *ip;
	ufs_lbn_t lbn;
	int flags;
	struct pagedep **pagedeppp;
{
	struct pagedep *pagedep;
	struct pagedep_hashhead *pagedephd;
	struct mount *mp;
	int i;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("pagedep_lookup: lock not held");
#endif
	mp = ITOV(ip)->v_mount;
	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
top:
	for (pagedep = LIST_FIRST(pagedephd); pagedep;
	     pagedep = LIST_NEXT(pagedep, pd_hash))
		if (ip->i_number == pagedep->pd_ino &&
		    lbn == pagedep->pd_lbn &&
		    mp == pagedep->pd_mnt)
			break;
	if (pagedep) {
		*pagedeppp = pagedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*pagedeppp = NULL;
		return (0);
	}
	if (sema_get(&pagedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
		M_WAITOK);
	bzero(pagedep, sizeof(struct pagedep));
	pagedep->pd_list.wk_type = M_PAGEDEP;
	pagedep->pd_mnt = mp;
	pagedep->pd_ino = ip->i_number;
	pagedep->pd_lbn = lbn;
	LIST_INIT(&pagedep->pd_dirremhd);
	LIST_INIT(&pagedep->pd_pendinghd);
	for (i = 0; i < DAHASHSZ; i++)
		LIST_INIT(&pagedep->pd_diraddhd[i]);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
	sema_release(&pagedep_in_progress);
	*pagedeppp = pagedep;
	return (0);
}
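
/*
 * Illustrative sketch only (never compiled in): the contract shared
 * by all three lookup routines above and below. A return of 1 means
 * an existing entry was found; a return of 0 means either that no
 * entry exists (when DEPALLOC is not set) or that a fresh entry was
 * created (when DEPALLOC is set), letting callers distinguish a new
 * structure from a cached one.
 */
#if 0
static void
example_pagedep_use(ip, lbn)
	struct inode *ip;
	ufs_lbn_t lbn;
{
	struct pagedep *pagedep;

	ACQUIRE_LOCK(&lk);
	if (pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) {
		/* pagedep was newly created; hook it up to its buffer */
	}
	FREE_LOCK(&lk);
}
#endif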

/*
 * Structures and routines associated with inodedep caching.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
u_long	inodedep_hash;		/* size of hash table - 1 */
#define	INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;

/*
 * Look up an inodedep. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in inodedeppp.
 * This routine must be called with splbio interrupts blocked.
 */
static int
inodedep_lookup(fs, inum, flags, inodedeppp)
	struct fs *fs;
	ino_t inum;
	int flags;
	struct inodedep **inodedeppp;
{
	struct inodedep *inodedep;
	struct inodedep_hashhead *inodedephd;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("inodedep_lookup: lock not held");
#endif
	inodedephd = INODEDEP_HASH(fs, inum);
top:
	for (inodedep = LIST_FIRST(inodedephd); inodedep;
	     inodedep = LIST_NEXT(inodedep, id_hash))
		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
			break;
	if (inodedep) {
		*inodedeppp = inodedep;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*inodedeppp = NULL;
		return (0);
	}
	if (sema_get(&inodedep_in_progress, &lk) == 0) {
		ACQUIRE_LOCK(&lk);
		goto top;
	}
	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
		M_INODEDEP, M_WAITOK);
	inodedep->id_list.wk_type = M_INODEDEP;
	inodedep->id_fs = fs;
	inodedep->id_ino = inum;
	inodedep->id_state = ALLCOMPLETE;
	inodedep->id_nlinkdelta = 0;
	inodedep->id_savedino = NULL;
	inodedep->id_savedsize = -1;
	inodedep->id_buf = NULL;
	LIST_INIT(&inodedep->id_pendinghd);
	LIST_INIT(&inodedep->id_inowait);
	TAILQ_INIT(&inodedep->id_inoupdt);
	TAILQ_INIT(&inodedep->id_newinoupdt);
	ACQUIRE_LOCK(&lk);
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
	sema_release(&inodedep_in_progress);
	*inodedeppp = inodedep;
	return (0);
}

/*
 * Structures and routines associated with newblk caching.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long	newblk_hash;		/* size of hash table - 1 */
#define	NEWBLK_HASH(fs, inum) \
	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static struct sema newblk_in_progress;

/*
 * Look up a newblk. Return 1 if found, 0 if not found.
 * If not found, allocate if DEPALLOC flag is passed.
 * Found or allocated entry is returned in newblkpp.
 */
static int
newblk_lookup(fs, newblkno, flags, newblkpp)
	struct fs *fs;
	ufs_daddr_t newblkno;
	int flags;
	struct newblk **newblkpp;
{
	struct newblk *newblk;
	struct newblk_hashhead *newblkhd;

	newblkhd = NEWBLK_HASH(fs, newblkno);
top:
	for (newblk = LIST_FIRST(newblkhd); newblk;
	     newblk = LIST_NEXT(newblk, nb_hash))
		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
			break;
	if (newblk) {
		*newblkpp = newblk;
		return (1);
	}
	if ((flags & DEPALLOC) == 0) {
		*newblkpp = NULL;
		return (0);
	}
	if (sema_get(&newblk_in_progress, 0) == 0)
		goto top;
	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
		M_NEWBLK, M_WAITOK);
	newblk->nb_state = 0;
	newblk->nb_fs = fs;
	newblk->nb_newblkno = newblkno;
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
	sema_release(&newblk_in_progress);
	*newblkpp = newblk;
	return (0);
}

/*
 * Executed during file system initialization before
 * mounting any file systems.
 */
void
softdep_initialize()
{

	LIST_INIT(&mkdirlisthd);
	LIST_INIT(&softdep_workitem_pending);
	pagedep_hashtbl = hashinit(desiredvnodes / 10, M_PAGEDEP,
	    &pagedep_hash);
	sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
	inodedep_hashtbl = hashinit(desiredvnodes / 2, M_INODEDEP,
	    &inodedep_hash);
	sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
	sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
}

/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.
 */
int
softdep_mount(devvp, mp, fs, cred)
	struct vnode *devvp;
	struct mount *mp;
	struct fs *fs;
	struct ucred *cred;
{
	struct csum cstotal;
	struct cg *cgp;
	struct buf *bp;
	int error, cyl;

	mp->mnt_flag |= MNT_SOFTDEP;
	/*
	 * When doing soft updates, the counters in the
	 * superblock may have gotten out of sync, so we have
	 * to scan the cylinder groups and recalculate them.
	 */
	if (fs->fs_clean != 0)
		return (0);
	bzero(&cstotal, sizeof cstotal);
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
		    fs->fs_cgsize, cred, &bp)) != 0) {
			brelse(bp);
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
		brelse(bp);
	}
#ifdef DEBUG
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
		printf("ffs_mountfs: superblock updated\n");
#endif
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
	return (0);
}

/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a file system
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicates that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs file system maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not.  (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */
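
/*
 * Concretely: when block B is allocated for inode I, the buffer
 * holding the cylinder group bitmap that marks B in use must reach
 * the disk before any buffer containing a pointer to B. The
 * bmsafemap structure set up below ties each new inode or block to
 * its bitmap buffer until that buffer has been written.
 */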

/*
 * Called just after updating the cylinder group block to allocate an inode.
 */
void
softdep_setup_inomapdep(bp, ip, newinum)
	struct buf *bp;		/* buffer for cylgroup block with inode map */
	struct inode *ip;	/* inode related to allocation */
	ino_t newinum;		/* new inode number being allocated */
{
	struct inodedep *inodedep;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated inode.
	 * Panic if it already exists as something is seriously wrong.
	 * Otherwise add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
		panic("softdep_setup_inomapdep: found inode");
	inodedep->id_buf = bp;
	inodedep->id_state &= ~DEPCOMPLETE;
	bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
	FREE_LOCK(&lk);
}

/*
 * Called just after updating the cylinder group block to
 * allocate block or fragment.
 */
void
softdep_setup_blkmapdep(bp, fs, newblkno)
	struct buf *bp;		/* buffer for cylgroup block with block map */
	struct fs *fs;		/* filesystem doing allocation */
	ufs_daddr_t newblkno;	/* number of newly allocated block */
{
	struct newblk *newblk;
	struct bmsafemap *bmsafemap;

	/*
	 * Create a dependency for the newly allocated block.
	 * Add it to the dependency list for the buffer holding
	 * the cylinder group map from which it was allocated.
	 */
	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
		panic("softdep_setup_blkmapdep: found block");
	ACQUIRE_LOCK(&lk);
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
	FREE_LOCK(&lk);
}

/*
 * Find the bmsafemap associated with a cylinder group buffer.
 * If none exists, create one. The buffer must be locked when
 * this routine is called and this routine must be called with
 * splbio interrupts blocked.
 */
static struct bmsafemap *
bmsafemap_lookup(bp)
	struct buf *bp;
{
	struct bmsafemap *bmsafemap;
	struct worklist *wk;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("bmsafemap_lookup: lock not held");
#endif
	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list))
		if (wk->wk_type == M_BMSAFEMAP)
			return (WK_BMSAFEMAP(wk));
	FREE_LOCK(&lk);
	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
		M_BMSAFEMAP, M_WAITOK);
	bmsafemap->sm_list.wk_type = M_BMSAFEMAP;
	bmsafemap->sm_list.wk_state = 0;
	bmsafemap->sm_buf = bp;
	LIST_INIT(&bmsafemap->sm_allocdirecthd);
	LIST_INIT(&bmsafemap->sm_allocindirhd);
	LIST_INIT(&bmsafemap->sm_inodedephd);
	LIST_INIT(&bmsafemap->sm_newblkhd);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
	return (bmsafemap);
}

/*
 * Direct block allocation dependencies.
 *
 * When a new block is allocated, the corresponding disk locations must be
 * initialized (with zeros or new data) before the on-disk inode points to
 * them.  Also, the freemap from which the block was allocated must be
 * updated (on disk) before the inode's pointer. These two dependencies are
 * independent of each other and are needed for all file blocks and indirect
 * blocks that are pointed to directly by the inode.  Just before the
 * "in-core" version of the inode is updated with a newly allocated block
 * number, a procedure (below) is called to set up allocation dependency
 * structures.  These structures are removed when the corresponding
 * dependencies are satisfied or when the block allocation becomes obsolete
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
 * fragment that gets upgraded).  All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete).  The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains.  This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
	struct inode *ip;	/* inode to which block is being added */
	ufs_lbn_t lbn;		/* block pointer within inode */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 unless frag */
	long newsize;		/* size of new block */
	long oldsize;		/* size of old block */
	struct buf *bp;		/* bp for allocated block */
{
	struct allocdirect *adp, *oldadp;
	struct allocdirectlst *adphead;
	struct bmsafemap *bmsafemap;
	struct inodedep *inodedep;
	struct pagedep *pagedep;
	struct newblk *newblk;

	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
		M_ALLOCDIRECT, M_WAITOK);
	bzero(adp, sizeof(struct allocdirect));
	adp->ad_list.wk_type = M_ALLOCDIRECT;
	adp->ad_lbn = lbn;
	adp->ad_newblkno = newblkno;
	adp->ad_oldblkno = oldblkno;
	adp->ad_newsize = newsize;
	adp->ad_oldsize = oldsize;
	adp->ad_state = ATTACHED;
	if (newblkno == oldblkno)
		adp->ad_freefrag = NULL;
	else
		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);

	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
		panic("softdep_setup_allocdirect: lost block");

	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
	adp->ad_inodedep = inodedep;

	if (newblk->nb_state == DEPCOMPLETE) {
		adp->ad_state |= DEPCOMPLETE;
		adp->ad_buf = NULL;
	} else {
		bmsafemap = newblk->nb_bmsafemap;
		adp->ad_buf = bmsafemap->sm_buf;
		LIST_REMOVE(newblk, nb_deps);
		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
	}
	LIST_REMOVE(newblk, nb_hash);
	FREE(newblk, M_NEWBLK);

	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
	if (lbn >= NDADDR) {
		/* allocating an indirect block */
		if (oldblkno != 0)
			panic("softdep_setup_allocdirect: non-zero indir");
	} else {
		/*
		 * Allocating a direct block.
		 *
		 * If we are allocating a directory block, then we must
		 * allocate an associated pagedep to track additions and
		 * deletions.
		 */
		if ((ip->i_mode & IFMT) == IFDIR &&
		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
	}
	/*
	 * The list of allocdirects must be kept in sorted and ascending
	 * order so that the rollback routines can quickly determine the
	 * first uncommitted block (the size of the file stored on disk
	 * ends at the end of the lowest committed fragment, or if there
	 * are no fragments, at the end of the highest committed block).
	 * Since files generally grow, the typical case is that the new
	 * block is to be added at the end of the list. We speed this
	 * special case by checking against the last allocdirect in the
	 * list before laboriously traversing the list looking for the
	 * insertion point.
	 */
	adphead = &inodedep->id_newinoupdt;
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
		/* insert at end of list */
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
		if (oldadp != NULL && oldadp->ad_lbn == lbn)
			allocdirect_merge(adphead, adp, oldadp);
		FREE_LOCK(&lk);
		return;
	}
	for (oldadp = TAILQ_FIRST(adphead); oldadp;
	     oldadp = TAILQ_NEXT(oldadp, ad_next)) {
		if (oldadp->ad_lbn >= lbn)
			break;
	}
	if (oldadp == NULL)
		panic("softdep_setup_allocdirect: lost entry");
	/* insert in middle of list */
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
	if (oldadp->ad_lbn == lbn)
		allocdirect_merge(adphead, adp, oldadp);
	FREE_LOCK(&lk);
}

/*
 * Replace an old allocdirect dependency with a newer one.
 * This routine must be called with splbio interrupts blocked.
 */
static void
allocdirect_merge(adphead, newadp, oldadp)
	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
	struct allocdirect *newadp;	/* allocdirect being added */
	struct allocdirect *oldadp;	/* existing allocdirect being checked */
{
	struct freefrag *freefrag;

#ifdef DEBUG
	if (lk.lkt_held == -1)
		panic("allocdirect_merge: lock not held");
#endif
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
	    newadp->ad_oldsize != oldadp->ad_newsize ||
	    newadp->ad_lbn >= NDADDR)
		panic("allocdirect_merge: old %d != new %d || lbn %d >= %d",
		    newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
		    NDADDR);
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
	newadp->ad_oldsize = oldadp->ad_oldsize;
	/*
	 * If the old dependency had a fragment to free or had never
	 * previously had a block allocated, then the new dependency
	 * can immediately post its freefrag and adopt the old freefrag.
	 * This action is done by swapping the freefrag dependencies.
	 * The new dependency gains the old one's freefrag, and the
	 * old one gets the new one and then immediately puts it on
	 * the worklist when it is freed by free_allocdirect. It is
	 * not possible to do this swap when the old dependency had a
	 * non-zero size but no previous fragment to free. This condition
	 * arises when the new block is an extension of the old block.
	 * Here, the first part of the fragment allocated to the new
	 * dependency is part of the block currently claimed on disk by
	 * the old dependency, so cannot legitimately be freed until the
	 * conditions for the new dependency are fulfilled.
	 */
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
		freefrag = newadp->ad_freefrag;
		newadp->ad_freefrag = oldadp->ad_freefrag;
		oldadp->ad_freefrag = freefrag;
	}
	free_allocdirect(adphead, oldadp, 0);
}
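
/*
 * Worked example: suppose lbn 0 was pending as a 1K fragment at
 * blkno 100 with no block yet on disk (ad_oldblkno == 0), and the
 * file then grows so the same lbn is re-allocated as a 2K fragment
 * at blkno 200. The new allocdirect arrives carrying a freefrag for
 * blkno 100. After the merge above, the new dependency records
 * oldblkno 0, and free_allocdirect posts the freefrag for blkno 100
 * at once, since that block can never be referenced from the disk.
 */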

/*
 * Allocate a new freefrag structure if needed.
 */
static struct freefrag *
newfreefrag(ip, blkno, size)
	struct inode *ip;
	ufs_daddr_t blkno;
	long size;
{
	struct freefrag *freefrag;
	struct fs *fs;

	if (blkno == 0)
		return (NULL);
	fs = ip->i_fs;
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
		panic("newfreefrag: frag size");
	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
		M_FREEFRAG, M_WAITOK);
	freefrag->ff_list.wk_type = M_FREEFRAG;
	freefrag->ff_state = ip->i_uid & ~ONWORKLIST;	/* XXX - used below */
	freefrag->ff_inum = ip->i_number;
	freefrag->ff_fs = fs;
	freefrag->ff_devvp = ip->i_devvp;
	freefrag->ff_blkno = blkno;
	freefrag->ff_fragsize = size;
	return (freefrag);
}

/*
 * This workitem de-allocates fragments that were replaced during
 * file block allocation.
 */
static void
handle_workitem_freefrag(freefrag)
	struct freefrag *freefrag;
{
	struct inode tip;

	tip.i_fs = freefrag->ff_fs;
	tip.i_devvp = freefrag->ff_devvp;
	tip.i_dev = freefrag->ff_devvp->v_rdev;
	tip.i_number = freefrag->ff_inum;
	tip.i_uid = freefrag->ff_state & ~ONWORKLIST;	/* XXX - set above */
	ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
	FREE(freefrag, M_FREEFRAG);
}
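
/*
 * Note that "tip" above is a throw-away stack inode: only the fields
 * consulted by ffs_blkfree are filled in, since the real inode may
 * already be gone by the time this deferred de-allocation runs.
 */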

/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here. Because an indirect block contains many pointers that
 * may have dependencies, a second copy of the entire in-memory indirect
 * block is kept. The buffer cache copy is always completely up-to-date.
 * The second copy, which is used only as a source for disk writes,
 * contains only the safe pointers (i.e., those that have no remaining
 * update dependencies). The second copy is freed when all pointers
 * are safe. The cache is not allowed to replace indirect blocks with
 * pending update dependencies. If a buffer containing an indirect
 * block with dependencies is written, these routines will mark it
 * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine, in conjunction with
 * softdep_sync_metadata, works to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
 * procedures are used when setting up indirect block pointer
 * dependencies. The division is necessary because of the organization
 * of the "balloc" routine and because of the distinction between file
 * pages and file metadata blocks.
 */

/*
 * Allocate a new allocindir structure.
 */
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno)
	struct inode *ip;	/* inode for file being extended */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
{
	struct allocindir *aip;

	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
		M_ALLOCINDIR, M_WAITOK);
	bzero(aip, sizeof(struct allocindir));
	aip->ai_list.wk_type = M_ALLOCINDIR;
	aip->ai_state = ATTACHED;
	aip->ai_offset = ptrno;
	aip->ai_newblkno = newblkno;
	aip->ai_oldblkno = oldblkno;
	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
	return (aip);
}

/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
 */
void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
	struct inode *ip;	/* inode for file being extended */
	ufs_lbn_t lbn;		/* allocated block number within file */
	struct buf *bp;		/* buffer with indirect blk referencing page */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
	ufs_daddr_t oldblkno;	/* previous block number, 0 if none */
	struct buf *nbp;	/* buffer holding allocated page */
{
	struct allocindir *aip;
	struct pagedep *pagedep;

	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
	ACQUIRE_LOCK(&lk);
	/*
	 * If we are allocating a directory page, then we must
	 * allocate an associated pagedep to track additions and
	 * deletions.
	 */
	if ((ip->i_mode & IFMT) == IFDIR &&
	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called just before setting an indirect block pointer to a
 * newly allocated indirect block.
 */
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
	struct buf *nbp;	/* newly allocated indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct buf *bp;		/* indirect block referencing allocated block */
	int ptrno;		/* offset of pointer in indirect block */
	ufs_daddr_t newblkno;	/* disk block number being added */
{
	struct allocindir *aip;

	aip = newallocindir(ip, ptrno, newblkno, 0);
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
	FREE_LOCK(&lk);
	setup_allocindir_phase2(bp, ip, aip);
}

/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
 */
static void
setup_allocindir_phase2(bp, ip, aip)
	struct buf *bp;		/* in-memory copy of the indirect block */
	struct inode *ip;	/* inode for file being extended */
	struct allocindir *aip;	/* allocindir allocated by the above routines */
{
	struct worklist *wk;
	struct indirdep *indirdep, *newindirdep;
	struct bmsafemap *bmsafemap;
	struct allocindir *oldaip;
	struct freefrag *freefrag;
	struct newblk *newblk;

	if (bp->b_lblkno >= 0)
		panic("setup_allocindir_phase2: not indir blk");
	for (indirdep = NULL, newindirdep = NULL; ; ) {
		ACQUIRE_LOCK(&lk);
		for (wk = LIST_FIRST(&bp->b_dep); wk;
		     wk = LIST_NEXT(wk, wk_list)) {
			if (wk->wk_type != M_INDIRDEP)
				continue;
			indirdep = WK_INDIRDEP(wk);
			break;
		}
		if (indirdep == NULL && newindirdep) {
			indirdep = newindirdep;
			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
			newindirdep = NULL;
		}
		FREE_LOCK(&lk);
		if (indirdep) {
			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
			    &newblk) == 0)
				panic("setup_allocindir: lost block");
			ACQUIRE_LOCK(&lk);
			if (newblk->nb_state == DEPCOMPLETE) {
				aip->ai_state |= DEPCOMPLETE;
				aip->ai_buf = NULL;
			} else {
				bmsafemap = newblk->nb_bmsafemap;
				aip->ai_buf = bmsafemap->sm_buf;
				LIST_REMOVE(newblk, nb_deps);
				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
				    aip, ai_deps);
			}
			LIST_REMOVE(newblk, nb_hash);
			FREE(newblk, M_NEWBLK);
			aip->ai_indirdep = indirdep;
			/*
			 * Check to see if there is an existing dependency
			 * for this block. If there is, merge the old
			 * dependency into the new one.
			 */
			if (aip->ai_oldblkno == 0)
				oldaip = NULL;
			else
				for (oldaip =
				    LIST_FIRST(&indirdep->ir_deplisthd);
				    oldaip; oldaip = LIST_NEXT(oldaip, ai_next))
					if (oldaip->ai_offset == aip->ai_offset)
						break;
			if (oldaip != NULL) {
				if (oldaip->ai_newblkno != aip->ai_oldblkno)
					panic("setup_allocindir_phase2: blkno");
				aip->ai_oldblkno = oldaip->ai_oldblkno;
				freefrag = oldaip->ai_freefrag;
				oldaip->ai_freefrag = aip->ai_freefrag;
				aip->ai_freefrag = freefrag;
				free_allocindir(oldaip, NULL);
			}
			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
			((ufs_daddr_t *)indirdep->ir_savebp->b_data)
			    [aip->ai_offset] = aip->ai_oldblkno;
			FREE_LOCK(&lk);
		}
		if (newindirdep) {
			if (newindirdep->ir_savebp != NULL)
				brelse(newindirdep->ir_savebp);
			WORKITEM_FREE((caddr_t)newindirdep, M_INDIRDEP);
		}
		if (indirdep)
			break;
		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
			M_INDIRDEP, M_WAITOK);
		newindirdep->ir_list.wk_type = M_INDIRDEP;
		newindirdep->ir_state = ATTACHED;
		LIST_INIT(&newindirdep->ir_deplisthd);
		LIST_INIT(&newindirdep->ir_donehd);
		newindirdep->ir_saveddata = (ufs_daddr_t *)bp->b_data;
		newindirdep->ir_savebp =
		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
		bcopy((caddr_t)newindirdep->ir_saveddata,
		    newindirdep->ir_savebp->b_data, bp->b_bcount);
	}
}

/*
 * Block de-allocation dependencies.
 *
 * When blocks are de-allocated, the on-disk pointers must be nullified before
 * the blocks are made available for use by other files.  (The true
 * requirement is that old pointers must be nullified before new on-disk
 * pointers are set.  We chose this slightly more stringent requirement to
 * reduce complexity.) Our implementation handles this dependency by updating
 * the inode (or indirect block) appropriately but delaying the actual block
 * de-allocation (i.e., freemap and free space count manipulation) until
 * after the updated versions reach stable storage.  After the disk is
 * updated, the blocks can be safely de-allocated whenever it is convenient.
 * This implementation handles only the common case of reducing a file's
 * length to zero. Other cases are handled by the conventional synchronous
 * write approach.
 *
 * The ffs implementation with which we worked double-checks
 * the state of the block pointers and file size as it reduces
 * a file's length.  Some of this code is replicated here in our
 * soft updates implementation.  The freeblks->fb_chkcnt field is
 * used to transfer a part of this information to the procedure
 * that eventually de-allocates the blocks.
 *
 * This routine should be called from the routine that shortens
 * a file's length, before the inode's size or block pointers
 * are modified. It will save the block pointer information for
 * later release and zero the inode so that the calling routine
 * can release it.
 */
void
softdep_setup_freeblocks(ip, length)
	struct inode *ip;	/* The inode whose length is to be reduced */
	off_t length;		/* The new length for the file */
{
	struct freeblks *freeblks;
	struct inodedep *inodedep;
	struct allocdirect *adp;
	struct vnode *vp;
	struct buf *bp;
	struct fs *fs;
	int i, error;

	fs = ip->i_fs;
	if (length != 0)
		panic("softdep_setup_freeblocks: non-zero length");
	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
		M_FREEBLKS, M_WAITOK);
	bzero(freeblks, sizeof(struct freeblks));
	freeblks->fb_list.wk_type = M_FREEBLKS;
	freeblks->fb_uid = ip->i_uid;
	freeblks->fb_previousinum = ip->i_number;
	freeblks->fb_devvp = ip->i_devvp;
	freeblks->fb_fs = fs;
	freeblks->fb_oldsize = ip->i_size;
	freeblks->fb_newsize = length;
	freeblks->fb_chkcnt = ip->i_blocks;
	for (i = 0; i < NDADDR; i++) {
		freeblks->fb_dblks[i] = ip->i_db[i];
		ip->i_db[i] = 0;
	}
	for (i = 0; i < NIADDR; i++) {
		freeblks->fb_iblks[i] = ip->i_ib[i];
		ip->i_ib[i] = 0;
	}
	ip->i_blocks = 0;
	ip->i_size = 0;
	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
	 * to delete its dependencies below. Once the dependencies are gone
	 * the buffer can be safely released.
	 */
	if ((error = bread(ip->i_devvp,
	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, NOCRED, &bp)) != 0)
		softdep_error("softdep_setup_freeblocks", error);
	*((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) =
	    ip->i_din;
	/*
	 * Find and eliminate any inode dependencies.
	 */
	ACQUIRE_LOCK(&lk);
	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & IOSTARTED) != 0)
		panic("softdep_setup_freeblocks: inode busy");
	/*
	 * Add the freeblks structure to the list of operations that
	 * must await the zero'ed inode being written to disk.
	 */
	WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list);
	/*
	 * Because the file length has been truncated to zero, any
	 * pending block allocation dependency structures associated
	 * with this inode are obsolete and can simply be de-allocated.
	 * We must first merge the two dependency lists to get rid of
	 * any duplicate freefrag structures, then purge the merged list.
	 */
	merge_inode_lists(inodedep);
	while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
	bdwrite(bp);
	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 * Once they are all there, walk the list and get rid of
	 * any dependencies.
	 */
	vp = ITOV(ip);
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		FREE_LOCK_INTERLOCKED(&lk);
		sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1);
		ACQUIRE_LOCK_INTERLOCKED(&lk);
	}
	while (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
		bp = LIST_FIRST(&vp->v_dirtyblkhd);
		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
		deallocate_dependencies(bp, inodedep);
		bp->b_flags |= B_INVAL;
		brelse(bp);
	}
	/*
	 * Try freeing the inodedep in case that was the last dependency.
	 */
	if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0)
		(void) free_inodedep(inodedep);
	FREE_LOCK(&lk);
}

/*
 * Reclaim any dependency structures from a buffer that is about to
 * be reallocated to a new vnode. The buffer must be locked, thus,
 * no I/O completion operations can occur while we are manipulating
 * its associated dependencies. The mutex is held so that other I/O's
 * associated with related dependencies do not occur.
 */
static void
deallocate_dependencies(bp, inodedep)
	struct buf *bp;
	struct inodedep *inodedep;
{
	struct worklist *wk;
	struct indirdep *indirdep;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct diradd *dap;
	long tmpsize;
	caddr_t tmp;
	int i;

	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
		switch (wk->wk_type) {

		case M_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);
			/*
			 * None of the indirect pointers will ever be visible,
			 * so they can simply be tossed. GOINGAWAY ensures
			 * that allocated pointers will be saved in the buffer
			 * cache until they are freed. Note that they will
			 * only be able to be found by their physical address
			 * since the inode mapping the logical address will
			 * be gone. The save buffer used for the safe copy
			 * was allocated in setup_allocindir_phase2 using
			 * the physical address so it could be used for this
			 * purpose. Hence we swap the safe copy with the real
			 * copy, allowing the safe copy to be freed and holding
			 * on to the real copy for later use in indir_trunc.
			 */
			if (indirdep->ir_state & GOINGAWAY)
				panic("deallocate_dependencies: already gone");
			indirdep->ir_state |= GOINGAWAY;
			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
				free_allocindir(aip, inodedep);
			if (bp->b_lblkno >= 0 ||
			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
				panic("deallocate_dependencies: not indir");
			tmp = indirdep->ir_savebp->b_data;
			indirdep->ir_savebp->b_data = bp->b_data;
			bp->b_data = tmp;
			tmpsize = indirdep->ir_savebp->b_bufsize;
			indirdep->ir_savebp->b_bufsize = bp->b_bufsize;
			bp->b_bufsize = tmpsize;
			WORKLIST_REMOVE(wk);
			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
			continue;

		case M_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			/*
			 * None of the directory additions will ever be
			 * visible, so they can simply be tossed.
			 */
			for (i = 0; i < DAHASHSZ; i++)
				while ((dap =
				    LIST_FIRST(&pagedep->pd_diraddhd[i])) != 0)
					free_diradd(dap);
			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
				free_diradd(dap);
			/*
			 * Copy any directory remove dependencies to the list
			 * to be processed after the zero'ed inode is written.
			 * If the inode has already been written, then they
			 * can be dumped directly onto the work list.
			 */
1576			for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem;
1577			     dirrem = LIST_NEXT(dirrem, dm_next)) {
1578				LIST_REMOVE(dirrem, dm_next);
1579				dirrem->dm_dirinum = pagedep->pd_ino;
1580				if (inodedep == NULL)
1581					add_to_worklist(&dirrem->dm_list);
1582				else
1583					WORKLIST_INSERT(&inodedep->id_inowait,
1584					    &dirrem->dm_list);
1585			}
1586			WORKLIST_REMOVE(&pagedep->pd_list);
1587			LIST_REMOVE(pagedep, pd_hash);
1588			WORKITEM_FREE(pagedep, M_PAGEDEP);
1589			continue;
1590
1591		case M_ALLOCINDIR:
1592			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
1593			continue;
1594
1595		case M_ALLOCDIRECT:
1596		case M_INODEDEP:
1597			panic("deallocate_dependencies: Unexpected type %s",
1598			    TYPENAME(wk->wk_type));
1599			/* NOTREACHED */
1600
1601		default:
1602			panic("deallocate_dependencies: Unknown type %s",
1603			    TYPENAME(wk->wk_type));
1604			/* NOTREACHED */
1605		}
1606	}
1607}
1608
1609/*
1610 * Free an allocdirect. Generate a new freefrag work request if appropriate.
1611 * This routine must be called with splbio interrupts blocked.
1612 */
1613static void
1614free_allocdirect(adphead, adp, delay)
1615	struct allocdirectlst *adphead;
1616	struct allocdirect *adp;
1617	int delay;
1618{
1619
1620#ifdef DEBUG
1621	if (lk.lkt_held == -1)
1622		panic("free_allocdirect: lock not held");
1623#endif
1624	if ((adp->ad_state & DEPCOMPLETE) == 0)
1625		LIST_REMOVE(adp, ad_deps);
1626	TAILQ_REMOVE(adphead, adp, ad_next);
1627	if ((adp->ad_state & COMPLETE) == 0)
1628		WORKLIST_REMOVE(&adp->ad_list);
1629	if (adp->ad_freefrag != NULL) {
1630		if (delay)
1631			WORKLIST_INSERT(&adp->ad_inodedep->id_inowait,
1632			    &adp->ad_freefrag->ff_list);
1633		else
1634			add_to_worklist(&adp->ad_freefrag->ff_list);
1635	}
1636	WORKITEM_FREE(adp, M_ALLOCDIRECT);
1637}
1638
1639/*
1640 * Prepare an inode to be freed. The actual free operation is not
1641 * done until the zero'ed inode has been written to disk.
1642 */
1643void
1644softdep_freefile(ap)
1645	struct vop_vfree_args /* {
1646		struct vnode *a_pvp;
1647		ino_t a_ino;
1648		int a_mode;
1649	} */ *ap;
1650{
1651	struct inode *ip = VTOI(ap->a_pvp);
1652	struct inodedep *inodedep;
1653	struct freefile *freefile;
1654
1655	/*
1656	 * This sets up the inode de-allocation dependency.
1657	 */
1658	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
1659		M_FREEFILE, M_WAITOK);
1660	freefile->fx_list.wk_type = M_FREEFILE;
1661	freefile->fx_list.wk_state = 0;
1662	freefile->fx_mode = ap->a_mode;
1663	freefile->fx_oldinum = ap->a_ino;
1664	freefile->fx_devvp = ip->i_devvp;
1665	freefile->fx_fs = ip->i_fs;
1666
1667	/*
1668	 * If the inodedep does not exist, then the zero'ed inode has
1669	 * been written to disk and we can free the file immediately.
1670	 */
1671	ACQUIRE_LOCK(&lk);
1672	if (inodedep_lookup(ip->i_fs, ap->a_ino, 0, &inodedep) == 0) {
1673		add_to_worklist(&freefile->fx_list);
1674		FREE_LOCK(&lk);
1675		return;
1676	}
1677
1678	/*
1679	 * If we still have a bitmap dependency, then the inode has never
1680	 * been written to disk. Drop the dependency as it is no longer
1681	 * necessary since the inode is being deallocated. We could process
1682	 * the freefile immediately, but then we would have to clear the
1683	 * id_inowait dependencies here and it is easier just to let the
1684	 * zero'ed inode be written and let them be cleaned up in the
1685	 * normal followup actions that follow the inode write.
1686	 */
1687	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
1688		inodedep->id_state |= DEPCOMPLETE;
1689		LIST_REMOVE(inodedep, id_deps);
1690		inodedep->id_buf = NULL;
1691	}
1692	/*
1693	 * If the inodedep has no dependencies associated with it,
1694	 * then we must free it here and free the file immediately.
1695	 * This case arises when an early allocation fails (for
1696	 * example, the user is over their file quota).
1697	 */
1698	if (free_inodedep(inodedep) == 0)
1699		WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
1700	else
1701		add_to_worklist(&freefile->fx_list);
1702	FREE_LOCK(&lk);
1703}
1704
1705/*
1706 * Try to free an inodedep structure. Return 1 if it could be freed.
1707 */
1708static int
1709free_inodedep(inodedep)
1710	struct inodedep *inodedep;
1711{
1712
1713	if ((inodedep->id_state & ONWORKLIST) != 0 ||
1714	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
1715	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
1716	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
1717	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
1718	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
1719	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL)
1720		return (0);
1721	LIST_REMOVE(inodedep, id_hash);
1722	WORKITEM_FREE(inodedep, M_INODEDEP);
1723	return (1);
1724}
1725
1726/*
1727 * This workitem routine performs the block de-allocation.
1728 * The workitem is added to the pending list after the updated
1729 * inode block has been written to disk.  As mentioned above,
1730 * checks regarding the number of blocks de-allocated (compared
1731 * to the number of blocks allocated for the file) are also
1732 * performed in this function.
1733 */
1734static void
1735handle_workitem_freeblocks(freeblks)
1736	struct freeblks *freeblks;
1737{
1738	struct inode tip;
1739	ufs_daddr_t bn;
1740	struct fs *fs;
1741	int i, level, bsize;
1742	long nblocks, blocksreleased = 0;
1743	int error, allerror = 0;
1744	ufs_lbn_t baselbns[NIADDR], tmpval;
1745
1746	tip.i_number = freeblks->fb_previousinum;
1747	tip.i_devvp = freeblks->fb_devvp;
1748	tip.i_dev = freeblks->fb_devvp->v_rdev;
1749	tip.i_fs = freeblks->fb_fs;
1750	tip.i_size = freeblks->fb_oldsize;
1751	tip.i_uid = freeblks->fb_uid;
1752	fs = freeblks->fb_fs;
1753	tmpval = 1;
1754	baselbns[0] = NDADDR;
1755	for (i = 1; i < NIADDR; i++) {
1756		tmpval *= NINDIR(fs);
1757		baselbns[i] = baselbns[i - 1] + tmpval;
1758	}
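	/*
	 * Worked example (illustrative only): on an 8K-block filesystem,
	 * NINDIR(fs) is 8192 / sizeof(ufs_daddr_t) == 2048 and NDADDR is 12,
	 * so the loop above computes baselbns[] = { 12, 12 + 2048,
	 * 12 + 2048 + 2048 * 2048 }, the first logical block mapped by each
	 * level of indirection.
	 */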
1759	nblocks = btodb(fs->fs_bsize);
1760	blocksreleased = 0;
1761	/*
1762	 * Indirect blocks first.
1763	 */
1764	for (level = (NIADDR - 1); level >= 0; level--) {
1765		if ((bn = freeblks->fb_iblks[level]) == 0)
1766			continue;
1767		if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
1768		    baselbns[level], &blocksreleased)) != 0)
1769			allerror = error;
1770		ffs_blkfree(&tip, bn, fs->fs_bsize);
1771		blocksreleased += nblocks;
1772	}
1773	/*
1774	 * All direct blocks or frags.
1775	 */
1776	for (i = (NDADDR - 1); i >= 0; i--) {
1777		if ((bn = freeblks->fb_dblks[i]) == 0)
1778			continue;
1779		bsize = blksize(fs, &tip, i);
1780		ffs_blkfree(&tip, bn, bsize);
1781		blocksreleased += btodb(bsize);
1782	}
1783
1784#ifdef DIAGNOSTIC
1785	if (freeblks->fb_chkcnt != blocksreleased)
1786		panic("handle_workitem_freeblocks: block count");
1787	if (allerror)
1788		softdep_error("handle_workitem_freeblks", allerror);
1789#endif /* DIAGNOSTIC */
1790	WORKITEM_FREE(freeblks, M_FREEBLKS);
1791}
1792
1793/*
1794 * Release blocks associated with the inode ip and stored in the indirect
1795 * block dbn. If level is greater than SINGLE, the block is an indirect block
1796 * and recursive calls to indir_trunc must be used to cleanse other indirect
1797 * blocks.
1798 */
1799static int
1800indir_trunc(ip, dbn, level, lbn, countp)
1801	struct inode *ip;
1802	ufs_daddr_t dbn;
1803	int level;
1804	ufs_lbn_t lbn;
1805	long *countp;
1806{
1807	struct buf *bp;
1808	ufs_daddr_t *bap;
1809	ufs_daddr_t nb;
1810	struct fs *fs;
1811	struct worklist *wk;
1812	struct indirdep *indirdep;
1813	int i, lbnadd, nblocks;
1814	int error, allerror = 0;
1815
1816	fs = ip->i_fs;
1817	lbnadd = 1;
1818	for (i = level; i > 0; i--)
1819		lbnadd *= NINDIR(fs);
1820	/*
1821	 * Get buffer of block pointers to be freed. This routine is not
1822	 * called until the zero'ed inode has been written, so it is safe
1823	 * to free blocks as they are encountered. Because the inode has
1824	 * been zero'ed, calls to bmap on these blocks will fail. So, we
1825	 * have to use the on-disk address and the block device for the
1826	 * filesystem to look them up. If the file was deleted before its
1827	 * indirect blocks were all written to disk, the routine that set
1828	 * us up (deallocate_dependencies) will have arranged to leave
1829	 * a complete copy of the indirect block in memory for our use.
1830	 * Otherwise we have to read the blocks in from the disk.
1831	 */
1832	ACQUIRE_LOCK(&lk);
1833	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
1834	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
1835		if (wk->wk_type != M_INDIRDEP ||
1836		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
1837		    (indirdep->ir_state & GOINGAWAY) == 0)
1838			panic("indir_trunc: lost indirdep");
1839		WORKLIST_REMOVE(wk);
1840		WORKITEM_FREE(indirdep, M_INDIRDEP);
1841		if (LIST_FIRST(&bp->b_dep) != NULL)
1842			panic("indir_trunc: dangling dep");
1843		FREE_LOCK(&lk);
1844	} else {
1845		FREE_LOCK(&lk);
1846		error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
1847		if (error)
1848			return (error);
1849	}
1850	/*
1851	 * Recursively free indirect blocks.
1852	 */
1853	bap = (ufs_daddr_t *)bp->b_data;
1854	nblocks = btodb(fs->fs_bsize);
1855	for (i = NINDIR(fs) - 1; i >= 0; i--) {
1856		if ((nb = bap[i]) == 0)
1857			continue;
1858		if (level != 0) {
1859			if ((error = indir_trunc(ip, fsbtodb(fs, nb),
1860			     level - 1, lbn + (i * lbnadd), countp)) != 0)
1861				allerror = error;
1862		}
1863		ffs_blkfree(ip, nb, fs->fs_bsize);
1864		*countp += nblocks;
1865	}
1866	bp->b_flags |= B_INVAL;
1867	brelse(bp);
1868	return (allerror);
1869}
1870
1871/*
1872 * Free an allocindir.
1873 * This routine must be called with splbio interrupts blocked.
1874 */
1875static void
1876free_allocindir(aip, inodedep)
1877	struct allocindir *aip;
1878	struct inodedep *inodedep;
1879{
1880	struct freefrag *freefrag;
1881
1882#ifdef DEBUG
1883	if (lk.lkt_held == -1)
1884		panic("free_allocindir: lock not held");
1885#endif
1886	if ((aip->ai_state & DEPCOMPLETE) == 0)
1887		LIST_REMOVE(aip, ai_deps);
1888	if (aip->ai_state & ONWORKLIST)
1889		WORKLIST_REMOVE(&aip->ai_list);
1890	LIST_REMOVE(aip, ai_next);
1891	if ((freefrag = aip->ai_freefrag) != NULL) {
1892		if (inodedep == NULL)
1893			add_to_worklist(&freefrag->ff_list);
1894		else
1895			WORKLIST_INSERT(&inodedep->id_inowait,
1896			    &freefrag->ff_list);
1897	}
1898	WORKITEM_FREE(aip, M_ALLOCINDIR);
1899}
1900
1901/*
1902 * Directory entry addition dependencies.
1903 *
1904 * When adding a new directory entry, the inode (with its incremented link
1905 * count) must be written to disk before the directory entry's pointer to it.
1906 * Also, if the inode is newly allocated, the corresponding freemap must be
1907 * updated (on disk) before the directory entry's pointer. These requirements
1908 * are met via undo/redo on the directory entry's pointer, which consists
1909 * simply of the inode number.
1910 *
1911 * As directory entries are added and deleted, the free space within a
1912 * directory block can become fragmented.  The ufs file system will compact
1913 * a fragmented directory block to make space for a new entry. When this
1914 * occurs, the offsets of previously added entries change. Any "diradd"
1915 * dependency structures corresponding to these entries must be updated with
1916 * the new offsets.
1917 */
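
/*
 * An illustrative sketch (not part of the original source): the undo/redo
 * applied to a directory entry's inode pointer amounts to
 *
 *	ep = (struct direct *)((char *)bp->b_data + dap->da_offset);
 *	ep->d_ino = 0;			undone before the block is written
 *	...
 *	ep->d_ino = dap->da_newinum;	redone once the write completes
 *
 * as performed by initiate_write_filepage() and handle_written_filepage()
 * below, so an on-disk directory entry never points at an unwritten inode.
 */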
1918
1919/*
1920 * This routine is called after the in-memory inode's link
1921 * count has been incremented, but before the directory entry's
1922 * pointer to the inode has been set.
1923 */
1924void
1925softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
1926	struct buf *bp;		/* buffer containing directory block */
1927	struct inode *dp;	/* inode for directory */
1928	off_t diroffset;	/* offset of new entry in directory */
1929	long newinum;		/* inode referenced by new directory entry */
1930	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
1931{
1932	int offset;		/* offset of new entry within directory block */
1933	ufs_lbn_t lbn;		/* block in directory containing new entry */
1934	struct fs *fs;
1935	struct diradd *dap;
1936	struct pagedep *pagedep;
1937	struct inodedep *inodedep;
1938	struct mkdir *mkdir1, *mkdir2;
1939
1940	/*
1941	 * Whiteouts have no dependencies.
1942	 */
1943	if (newinum == WINO) {
1944		if (newdirbp != NULL)
1945			bdwrite(newdirbp);
1946		return;
1947	}
1948
1949	fs = dp->i_fs;
1950	lbn = lblkno(fs, diroffset);
1951	offset = blkoff(fs, diroffset);
1952	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
1953	bzero(dap, sizeof(struct diradd));
1954	dap->da_list.wk_type = M_DIRADD;
1955	dap->da_offset = offset;
1956	dap->da_newinum = newinum;
1957	dap->da_state = ATTACHED;
1958	if (newdirbp == NULL) {
1959		dap->da_state |= DEPCOMPLETE;
1960	} else {
1961		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
1962		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
1963		    M_WAITOK);
1964		mkdir1->md_list.wk_type = M_MKDIR;
1965		mkdir1->md_state = MKDIR_BODY;
1966		mkdir1->md_diradd = dap;
1967		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
1968		    M_WAITOK);
1969		mkdir2->md_list.wk_type = M_MKDIR;
1970		mkdir2->md_state = MKDIR_PARENT;
1971		mkdir2->md_diradd = dap;
1972
1974
1975	ACQUIRE_LOCK(&lk);
1976	/*
1977	 * If this directory entry references a new directory, create
1978	 * its two additional dependencies: its "." and ".." being written
1979	 * to disk and the link count increase for its parent directory.
1980	 */
1981	if (newdirbp != NULL) {
1982		/*
1983		 * Dependency on "." and ".." being written to disk
1984		 */
1985		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
1986		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
1987		bdwrite(newdirbp);
1988		/*
1989		 * Dependency on link count increase for parent directory
1990		 */
1991		if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0
1992		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
1993			dap->da_state &= ~MKDIR_PARENT;
1994			WORKITEM_FREE(mkdir2, M_MKDIR);
1995		} else {
1996			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
1997			WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list);
1998		}
1999	}
2000	/*
2001	 * Link into parent directory pagedep and new inode inodedep
2002	 * structures to await its being written.
2003	 */
2004	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2005		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2006	dap->da_pagedep = pagedep;
2007	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2008	    da_pdlist);
2009	if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 &&
2010	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2011		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
2012	else
2013		WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
2014	FREE_LOCK(&lk);
2015}
2016
2017/*
2018 * This procedure is called to change the offset of a directory
2019 * entry when compacting a directory block which must be owned
2020 * exclusively by the caller. Note that the actual entry movement
2021 * must be done in this procedure to ensure that no I/O completions
2022 * occur while the move is in progress.
2023 */
2024void
2025softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2026	struct inode *dp;	/* inode for directory */
2027	caddr_t base;		/* address of dp->i_offset */
2028	caddr_t oldloc;		/* address of old directory location */
2029	caddr_t newloc;		/* address of new directory location */
2030	int entrysize;		/* size of directory entry */
2031{
2032	int offset, oldoffset, newoffset;
2033	struct pagedep *pagedep;
2034	struct diradd *dap;
2035	ufs_lbn_t lbn;
2036
2037	ACQUIRE_LOCK(&lk);
2038	lbn = lblkno(dp->i_fs, dp->i_offset);
2039	offset = blkoff(dp->i_fs, dp->i_offset);
2040	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2041		goto done;
2042	oldoffset = offset + (oldloc - base);
2043	newoffset = offset + (newloc - base);
2044	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]);
2045	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2046		if (dap->da_offset != oldoffset)
2047			continue;
2048		dap->da_offset = newoffset;
2049		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2050			break;
2051		LIST_REMOVE(dap, da_pdlist);
2052		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2053		    dap, da_pdlist);
2054		break;
2055	}
2056done:
2057	bcopy(oldloc, newloc, entrysize);
2058	FREE_LOCK(&lk);
2059}
2060
2061/*
2062 * Free a diradd dependency structure. This routine must be called
2063 * with splbio interrupts blocked.
2064 */
2065static void
2066free_diradd(dap)
2067	struct diradd *dap;
2068{
2069	struct dirrem *dirrem;
2070	struct pagedep *pagedep;
2071	struct inodedep *inodedep;
2072	struct mkdir *mkdir, *nextmd;
2073
2074#ifdef DEBUG
2075	if (lk.lkt_held == -1)
2076		panic("free_diradd: lock not held");
2077#endif
2078	WORKLIST_REMOVE(&dap->da_list);
2079	LIST_REMOVE(dap, da_pdlist);
2080	if ((dap->da_state & DIRCHG) == 0) {
2081		pagedep = dap->da_pagedep;
2082	} else {
2083		dirrem = dap->da_previous;
2084		pagedep = dirrem->dm_pagedep;
2085		add_to_worklist(&dirrem->dm_list);
2086	}
2087	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2088	    0, &inodedep) != 0)
2089		(void) free_inodedep(inodedep);
2090	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2091		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2092			nextmd = LIST_NEXT(mkdir, md_mkdirs);
2093			if (mkdir->md_diradd != dap)
2094				continue;
2095			dap->da_state &= ~mkdir->md_state;
2096			WORKLIST_REMOVE(&mkdir->md_list);
2097			LIST_REMOVE(mkdir, md_mkdirs);
2098			WORKITEM_FREE(mkdir, M_MKDIR);
2099		}
2100		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
2101			panic("free_diradd: unfound ref");
2102	}
2103	WORKITEM_FREE(dap, M_DIRADD);
2104}
2105
2106/*
2107 * Directory entry removal dependencies.
2108 *
2109 * When removing a directory entry, the entry's inode pointer must be
2110 * zero'ed on disk before the corresponding inode's link count is decremented
2111 * (possibly freeing the inode for re-use). This dependency is handled by
2112 * updating the directory entry but delaying the inode count reduction until
2113 * after the directory block has been written to disk. After this point, the
2114 * inode count can be decremented whenever it is convenient.
2115 */
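
/*
 * Sketch of the ordering (illustrative only, not in the original source):
 *
 *	ep->d_ino = 0;			entry cleared in the buffer
 *	(directory block written)	dirrem waits on pd_dirremhd
 *	ip->i_nlink--;			handle_workitem_remove() below
 *
 * so the inode's on-disk link count is never lower than the number of
 * on-disk directory entries that reference it.
 */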
2116
2117/*
2118 * This routine should be called immediately after removing
2119 * a directory entry.  The inode's link count should not be
2120 * decremented by the calling procedure -- the soft updates
2121 * code will do this task when it is safe.
2122 */
2123void
2124softdep_setup_remove(bp, dp, ip, isrmdir)
2125	struct buf *bp;		/* buffer containing directory block */
2126	struct inode *dp;	/* inode for the directory being modified */
2127	struct inode *ip;	/* inode for directory entry being removed */
2128	int isrmdir;		/* indicates if doing RMDIR */
2129{
2130	struct dirrem *dirrem;
2131
2132	/*
2133	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2134	 */
2135	dirrem = newdirrem(bp, dp, ip, isrmdir);
2136	if ((dirrem->dm_state & COMPLETE) == 0) {
2137		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2138		    dm_next);
2139	} else {
2140		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2141		add_to_worklist(&dirrem->dm_list);
2142	}
2143	FREE_LOCK(&lk);
2144}
2145
2146/*
2147 * Allocate a new dirrem if appropriate and return it along with
2148 * its associated pagedep. Called without a lock, returns with lock.
2149 */
2150static struct dirrem *
2151newdirrem(bp, dp, ip, isrmdir)
2152	struct buf *bp;		/* buffer containing directory block */
2153	struct inode *dp;	/* inode for the directory being modified */
2154	struct inode *ip;	/* inode for directory entry being removed */
2155	int isrmdir;		/* indicates if doing RMDIR */
2156{
2157	int offset;
2158	ufs_lbn_t lbn;
2159	struct diradd *dap;
2160	struct dirrem *dirrem;
2161	struct pagedep *pagedep;
2162
2163	/*
2164	 * Whiteouts have no deletion dependencies.
2165	 */
2166	if (ip == NULL)
2167		panic("newdirrem: whiteout");
2168	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
2169		M_DIRREM, M_WAITOK);
2170	bzero(dirrem, sizeof(struct dirrem));
2171	dirrem->dm_list.wk_type = M_DIRREM;
2172	dirrem->dm_state = isrmdir ? RMDIR : 0;
2173	dirrem->dm_mnt = ITOV(ip)->v_mount;
2174	dirrem->dm_oldinum = ip->i_number;
2175
2176	ACQUIRE_LOCK(&lk);
2177	lbn = lblkno(dp->i_fs, dp->i_offset);
2178	offset = blkoff(dp->i_fs, dp->i_offset);
2179	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2180		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2181	dirrem->dm_pagedep = pagedep;
2182	for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]);
2183	     dap; dap = LIST_NEXT(dap, da_pdlist)) {
2184		/*
2185		 * Check for a diradd dependency for the same directory entry.
2186		 * If present, then both dependencies become obsolete and can
2187		 * be de-allocated.
2188		 */
2189		if (dap->da_offset != offset)
2190			continue;
2191		/*
2192		 * Must be ATTACHED at this point, so just delete it.
2193		 */
2194		if ((dap->da_state & ATTACHED) == 0)
2195			panic("newdirrem: not ATTACHED");
2196		if (dap->da_newinum != ip->i_number)
2197			panic("newdirrem: inum %d should be %d",
2198			    ip->i_number, dap->da_newinum);
2199		free_diradd(dap);
2200		dirrem->dm_state |= COMPLETE;
2201		break;
2202	}
2203	return (dirrem);
2204}
2205
2206/*
2207 * Directory entry change dependencies.
2208 *
2209 * Changing an existing directory entry requires that an add operation
2210 * be completed first followed by a deletion. The semantics for the addition
2211 * are identical to the description of adding a new entry above except
2212 * that the rollback is to the old inode number rather than zero. Once
2213 * the addition dependency is completed, the removal is done as described
2214 * in the removal routine above.
2215 */
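
/*
 * Illustrative rollback for the change case (not in the original source):
 * while the new inode remains unwritten, an entry marked DIRCHG is rolled
 * back to its previous inode number rather than to zero,
 *
 *	ep->d_ino = dap->da_previous->dm_oldinum;
 *
 * as done in initiate_write_filepage() below.
 */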
2216
2217/*
2218 * This routine should be called immediately after changing
2219 * a directory entry.  The inode's link count should not be
2220 * decremented by the calling procedure -- the soft updates
2221 * code will perform this task when it is safe.
2222 */
2223void
2224softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
2225	struct buf *bp;		/* buffer containing directory block */
2226	struct inode *dp;	/* inode for the directory being modified */
2227	struct inode *ip;	/* inode for directory entry being removed */
2228	long newinum;		/* new inode number for changed entry */
2229	int isrmdir;		/* indicates if doing RMDIR */
2230{
2231	int offset;
2232	struct diradd *dap;
2233	struct dirrem *dirrem;
2234	struct inodedep *inodedep;
2235
2236	offset = blkoff(dp->i_fs, dp->i_offset);
2237
2238	/*
2239	 * Whiteouts have no addition dependencies.
2240	 */
2241	if (newinum == WINO) {
2242		dap = NULL;
2243	} else {
2244		MALLOC(dap, struct diradd *, sizeof(struct diradd),
2245		    M_DIRADD, M_WAITOK);
2246		bzero(dap, sizeof(struct diradd));
2247		dap->da_list.wk_type = M_DIRADD;
2248		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
2249		dap->da_offset = offset;
2250		dap->da_newinum = newinum;
2251	}
2252
2253	/*
2254	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2255	 */
2256	dirrem = newdirrem(bp, dp, ip, isrmdir);
2257
2258	/*
2259	 * If the inode has already been written, then no addition
2260	 * dependency needs to be created.
2261	 */
2262	if (dap != NULL &&
	    (inodedep_lookup(dp->i_fs, newinum, 0, &inodedep) == 0 ||
2263	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)) {
2264		WORKITEM_FREE(dap, M_DIRADD);
2265		dap = NULL;
2266	}
2267
2268	if (dap) {
2269		dap->da_previous = dirrem;
2270		LIST_INSERT_HEAD(
2271		    &dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)],
2272		    dap, da_pdlist);
2273		WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
2274	} else if ((dirrem->dm_state & COMPLETE) == 0) {
2275		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2276		    dm_next);
2277	} else {
2278		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2279		add_to_worklist(&dirrem->dm_list);
2280	}
2281	FREE_LOCK(&lk);
2282}
2283
2284/*
2285 * Called whenever the link count on an inode is increased.
2286 * It creates an inode dependency so that the new reference(s)
2287 * to the inode cannot be committed to disk until the updated
2288 * inode has been written.
2289 */
2290void
2291softdep_increase_linkcnt(ip)
2292	struct inode *ip;	/* the inode with the increased link count */
2293{
2294	struct inodedep *inodedep;
2295
2296	ACQUIRE_LOCK(&lk);
2297	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
2298	FREE_LOCK(&lk);
2299}
2300
2301/*
2302 * This workitem decrements the inode's link count.
2303 * If the link count reaches zero, the file is removed.
2304 */
2305static void
2306handle_workitem_remove(dirrem)
2307	struct dirrem *dirrem;
2308{
2309	struct proc *p = curproc;	/* XXX */
2310	struct inodedep *inodedep;
2311	struct vnode *vp;
2312	struct inode *ip;
2313	int error;
2314
2315	if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
2316		softdep_error("handle_workitem_remove: vget", error);
2317		return;
2318	}
2319	ip = VTOI(vp);
2320	/*
2321	 * Normal file deletion.
2322	 */
2323	if ((dirrem->dm_state & RMDIR) == 0) {
2324		ip->i_nlink--;
2325		if (ip->i_nlink < ip->i_effnlink) {
2326#ifdef DIAGNOSTIC
2327			vprint("handle_workitem_remove: bad file delta", vp);
2328#endif
2329			ip->i_effnlink = ip->i_nlink;
2330		}
2331		ip->i_flag |= IN_CHANGE;
2332		vput(vp);
2333		WORKITEM_FREE(dirrem, M_DIRREM);
2334		return;
2335	}
2336	/*
2337	 * Directory deletion. Decrement reference count for both the
2338	 * just deleted parent directory entry and the reference for ".".
2339	 * Next truncate the directory to length zero. When the
2340	 * truncation completes, arrange to have the reference count on
2341	 * the parent decremented to account for the loss of "..".
2342	 */
2343	ip->i_nlink -= 2;
2344	if (ip->i_nlink < ip->i_effnlink)
2345		panic("handle_workitem_remove: bad dir delta");
2346	ip->i_flag |= IN_CHANGE;
2347	if ((error = VOP_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0)
2348		softdep_error("handle_workitem_remove: truncate", error);
2349	ACQUIRE_LOCK(&lk);
2350	(void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC,
2351	    &inodedep);
2352	dirrem->dm_state = 0;
2353	dirrem->dm_oldinum = dirrem->dm_dirinum;
2354	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
2355	FREE_LOCK(&lk);
2356	vput(vp);
2357}
2358
2359/*
2360 * Inode de-allocation dependencies.
2361 *
2362 * When an inode's link count is reduced to zero, it can be de-allocated. We
2363 * found it convenient to postpone de-allocation until after the inode is
2364 * written to disk with its new link count (zero).  At this point, all of the
2365 * on-disk inode's block pointers are nullified and, with careful dependency
2366 * list ordering, all dependencies related to the inode will be satisfied and
2367 * the corresponding dependency structures de-allocated.  So, if/when the
2368 * inode is reused, there will be no mixing of old dependencies with new
2369 * ones.  This artificial dependency is set up by the block de-allocation
2370 * procedure above (softdep_setup_freeblocks) and completed by the
2371 * following procedure.
2372 */
2373static void
2374handle_workitem_freefile(freefile)
2375	struct freefile *freefile;
2376{
2377	struct vnode vp;
2378	struct inode tip;
2379	struct inodedep *idp;
2380	struct vop_vfree_args args;
2381	int error;
2382
2383#ifdef DEBUG
2384	ACQUIRE_LOCK(&lk);
2385	if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp))
2386		panic("handle_workitem_freefile: inodedep survived");
2387	FREE_LOCK(&lk);
2388#endif
2389	tip.i_devvp = freefile->fx_devvp;
2390	tip.i_dev = freefile->fx_devvp->v_rdev;
2391	tip.i_fs = freefile->fx_fs;
2392	vp.v_data = &tip;
2393	args.a_pvp = &vp;
2394	args.a_ino = freefile->fx_oldinum;
2395	args.a_mode = freefile->fx_mode;
2396	if ((error = ffs_freefile(&args)) != 0)
2397		softdep_error("handle_workitem_freefile", error);
2398	WORKITEM_FREE(freefile, M_FREEFILE);
2399}
2400
2401/*
2402 * Disk writes.
2403 *
2404 * The dependency structures constructed above are most actively used when file
2405 * system blocks are written to disk.  No constraints are placed on when a
2406 * block can be written, but unsatisfied update dependencies are made safe by
2407 * modifying (or replacing) the source memory for the duration of the disk
2408 * write.  When the disk write completes, the memory block is again brought
2409 * up-to-date.
2410 *
2411 * In-core inode structure reclamation.
2412 *
2413 * Because there are a finite number of "in-core" inode structures, they are
2414 * reused regularly.  By transferring all inode-related dependencies to the
2415 * in-memory inode block and indexing them separately (via "inodedep"s), we
2416 * can allow "in-core" inode structures to be reused at any time and avoid
2417 * any increase in contention.
2418 *
2419 * Called just before entering the device driver to initiate a new disk I/O.
2420 * The buffer must be locked, thus, no I/O completion operations can occur
2421 * while we are manipulating its associated dependencies.
2422 */
2423void
2424softdep_disk_io_initiation(bp)
2425	struct buf *bp;		/* structure describing disk write to occur */
2426{
2427	struct worklist *wk, *nextwk;
2428	struct indirdep *indirdep;
2429
2430	/*
2431	 * We only care about write operations. There should never
2432	 * be dependencies for reads.
2433	 */
2434	if (bp->b_flags & B_READ)
2435		panic("softdep_disk_io_initiation: read");
2436	/*
2437	 * Do any necessary pre-I/O processing.
2438	 */
2439	for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
2440		nextwk = LIST_NEXT(wk, wk_list);
2441		switch (wk->wk_type) {
2442
2443		case M_PAGEDEP:
2444			initiate_write_filepage(WK_PAGEDEP(wk), bp);
2445			continue;
2446
2447		case M_INODEDEP:
2448			initiate_write_inodeblock(WK_INODEDEP(wk), bp);
2449			continue;
2450
2451		case M_INDIRDEP:
2452			indirdep = WK_INDIRDEP(wk);
2453			if (indirdep->ir_state & GOINGAWAY)
2454				panic("disk_io_initiation: indirdep gone");
2455			/*
2456			 * If there are no remaining dependencies, this
2457			 * will be writing the real pointers, so the
2458			 * dependency can be freed.
2459			 */
2460			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
2461				brelse(indirdep->ir_savebp);
2462				/* inline expand WORKLIST_REMOVE(wk); */
2463				wk->wk_state &= ~ONWORKLIST;
2464				LIST_REMOVE(wk, wk_list);
2465				WORKITEM_FREE(indirdep, M_INDIRDEP);
2466				continue;
2467			}
2468			/*
2469			 * Replace up-to-date version with safe version.
2470			 */
2471			ACQUIRE_LOCK(&lk);
2472			indirdep->ir_state &= ~ATTACHED;
2473			indirdep->ir_state |= UNDONE;
			/* Save the real copy so write completion can restore it. */
			indirdep->ir_saveddata = bp->b_data;
2474			bp->b_data = indirdep->ir_savebp->b_data;
2475			FREE_LOCK(&lk);
2476			continue;
2477
2478		case M_MKDIR:
2479		case M_BMSAFEMAP:
2480		case M_ALLOCDIRECT:
2481		case M_ALLOCINDIR:
2482			continue;
2483
2484		default:
2485			panic("handle_disk_io_initiation: Unexpected type %s",
2486			    TYPENAME(wk->wk_type));
2487			/* NOTREACHED */
2488		}
2489	}
2490}
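
/*
 * Compact view of the indirect-block handling above (illustrative only):
 *
 *	initiation:	indirdep->ir_saveddata = bp->b_data;
 *			bp->b_data = indirdep->ir_savebp->b_data;
 *	completion:	bp->b_data = indirdep->ir_saveddata;
 *
 * The completion half lives in softdep_disk_write_complete() below; the
 * effect is that a block on disk never holds a pointer whose dependencies
 * are still unsatisfied.
 */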
2491
2492/*
2493 * Called from within the procedure above to deal with unsatisfied
2494 * allocation dependencies in a directory. The buffer must be locked,
2495 * thus, no I/O completion operations can occur while we are
2496 * manipulating its associated dependencies.
2497 */
2498static void
2499initiate_write_filepage(pagedep, bp)
2500	struct pagedep *pagedep;
2501	struct buf *bp;
2502{
2503	struct diradd *dap;
2504	struct direct *ep;
2505	int i;
2506
2507	if (pagedep->pd_state & IOSTARTED) {
2508		/*
2509		 * This can only happen if there is a driver that does not
2510		 * understand chaining. Here biodone will reissue the call
2511		 * to strategy for the incomplete buffers.
2512		 */
2513		printf("initiate_write_filepage: already started\n");
2514		return;
2515	}
2516	pagedep->pd_state |= IOSTARTED;
2517	ACQUIRE_LOCK(&lk);
2518	for (i = 0; i < DAHASHSZ; i++) {
2519		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
2520		     dap = LIST_NEXT(dap, da_pdlist)) {
2521			ep = (struct direct *)
2522			    ((char *)bp->b_data + dap->da_offset);
2523			if (ep->d_ino != dap->da_newinum)
2524				panic("%s: dir inum %d != new %d",
2525				    "initiate_write_filepage",
2526				    ep->d_ino, dap->da_newinum);
2527			if (dap->da_state & DIRCHG)
2528				ep->d_ino = dap->da_previous->dm_oldinum;
2529			else
2530				ep->d_ino = 0;
2531			dap->da_state &= ~ATTACHED;
2532			dap->da_state |= UNDONE;
2533		}
2534	}
2535	FREE_LOCK(&lk);
2536}
2537
2538/*
2539 * Called from within the procedure above to deal with unsatisfied
2540 * allocation dependencies in an inodeblock. The buffer must be
2541 * locked, thus, no I/O completion operations can occur while we
2542 * are manipulating its associated dependencies.
2543 */
2544static void
2545initiate_write_inodeblock(inodedep, bp)
2546	struct inodedep *inodedep;
2547	struct buf *bp;			/* The inode block */
2548{
2549	struct allocdirect *adp, *lastadp;
2550	struct dinode *dp;
2551	struct fs *fs;
2552	ufs_lbn_t prevlbn;
2553	int i, deplist;
2554
2555	if (inodedep->id_state & IOSTARTED)
2556		panic("initiate_write_inodeblock: already started");
2557	inodedep->id_state |= IOSTARTED;
2558	fs = inodedep->id_fs;
2559	dp = (struct dinode *)bp->b_data +
2560	    ino_to_fsbo(fs, inodedep->id_ino);
2561	/*
2562	 * If the bitmap is not yet written, then the allocated
2563	 * inode cannot be written to disk.
2564	 */
2565	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
2566		if (inodedep->id_savedino != NULL)
2567			panic("initiate_write_inodeblock: already doing I/O");
2568		MALLOC(inodedep->id_savedino, struct dinode *,
2569		    sizeof(struct dinode), M_INODEDEP, M_WAITOK);
2570		*inodedep->id_savedino = *dp;
2571		bzero((caddr_t)dp, sizeof(struct dinode));
2572		return;
2573	}
2574	/*
2575	 * If no dependencies, then there is nothing to roll back.
2576	 */
2577	inodedep->id_savedsize = dp->di_size;
2578	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
2579		return;
2580	/*
2581	 * Set the dependencies to busy.
2582	 */
2583	ACQUIRE_LOCK(&lk);
2584	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2585	     adp = TAILQ_NEXT(adp, ad_next)) {
2586#ifdef DIAGNOSTIC
2587		if (deplist != 0 && prevlbn >= adp->ad_lbn)
2588			panic("softdep_write_inodeblock: lbn order");
2589		prevlbn = adp->ad_lbn;
2590		if (adp->ad_lbn < NDADDR &&
2591		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
2592			panic("%s: direct pointer #%d mismatch %d != %d",
2593			    "softdep_write_inodeblock", adp->ad_lbn,
2594			    dp->di_db[adp->ad_lbn], adp->ad_newblkno);
2595		if (adp->ad_lbn >= NDADDR &&
2596		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
2597			panic("%s: indirect pointer #%d mismatch %d != %d",
2598			    "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
2599			    dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno);
2600		deplist |= 1 << adp->ad_lbn;
2601		if ((adp->ad_state & ATTACHED) == 0)
2602			panic("softdep_write_inodeblock: Unknown state 0x%x",
2603			    adp->ad_state);
2604#endif /* DIAGNOSTIC */
2605		adp->ad_state &= ~ATTACHED;
2606		adp->ad_state |= UNDONE;
2607	}
2608	/*
2609	 * The on-disk inode cannot claim to be any larger than the last
2610	 * fragment that has been written. Otherwise, the on-disk inode
2611	 * might have fragments that were not the last block in the file
2612	 * which would corrupt the filesystem.
2613	 */
2614	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
2615	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
2616		if (adp->ad_lbn >= NDADDR)
2617			break;
2618		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
2619		/* keep going until hitting a rollback to a frag */
2620		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
2621			continue;
2622		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
2623		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
2624#ifdef DIAGNOSTIC
2625			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
2626				panic("softdep_write_inodeblock: lost dep1");
2627#endif /* DIAGNOSTIC */
2628			dp->di_db[i] = 0;
2629		}
2630		for (i = 0; i < NIADDR; i++) {
2631#ifdef DIAGNOSTIC
2632			if (dp->di_ib[i] != 0 &&
2633			    (deplist & ((1 << NDADDR) << i)) == 0)
2634				panic("softdep_write_inodeblock: lost dep2");
2635#endif /* DIAGNOSTIC */
2636			dp->di_ib[i] = 0;
2637		}
2638		FREE_LOCK(&lk);
2639		return;
2640	}
2641	/*
2642	 * If we have zero'ed out the last allocated block of the file,
2643	 * roll back the size to the last currently allocated block.
2644	 * We know that this last allocated block is full-sized, as
2645	 * we already checked for fragments in the loop above.
2646	 */
2647	if (lastadp != NULL &&
2648	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
2649		for (i = lastadp->ad_lbn; i >= 0; i--)
2650			if (dp->di_db[i] != 0)
2651				break;
2652		dp->di_size = (i + 1) * fs->fs_bsize;
2653	}
2654	/*
2655	 * The only dependencies are for indirect blocks.
2656	 *
2657	 * The file size for indirect block additions is not guaranteed.
2658	 * Such a guarantee would be non-trivial to achieve. The conventional
2659	 * synchronous write implementation also does not make this guarantee.
2660	 * Fsck should catch and fix discrepancies. Arguably, the file size
2661	 * can be over-estimated without destroying integrity when the file
2662	 * moves into the indirect blocks (i.e., is large). If we want to
2663	 * postpone fsck, we are stuck with this argument.
2664	 */
2665	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
2666		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
2667	FREE_LOCK(&lk);
2668}
2669
2670/*
2671 * This routine is called during the completion interrupt
2672 * service routine for a disk write (from the procedure called
2673 * by the device driver to inform the file system caches of
2674 * a request completion).  It should be called early in this
2675 * procedure, before the block is made available to other
2676 * processes or other routines are called.
2677 */
2678void
2679softdep_disk_write_complete(bp)
2680	struct buf *bp;		/* describes the completed disk write */
2681{
2682	struct worklist *wk;
2683	struct workhead reattach;
2684	struct newblk *newblk;
2685	struct allocindir *aip;
2686	struct allocdirect *adp;
2687	struct indirdep *indirdep;
2688	struct inodedep *inodedep;
2689	struct bmsafemap *bmsafemap;
2690
2691#ifdef DEBUG
2692	if (lk.lkt_held != -1)
2693		panic("softdep_disk_write_complete: lock is held");
2694	lk.lkt_held = -2;
2695#endif
2696	LIST_INIT(&reattach);
2697	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2698		WORKLIST_REMOVE(wk);
2699		switch (wk->wk_type) {
2700
2701		case M_PAGEDEP:
2702			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
2703				WORKLIST_INSERT(&reattach, wk);
2704			continue;
2705
2706		case M_INODEDEP:
2707			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
2708				WORKLIST_INSERT(&reattach, wk);
2709			continue;
2710
2711		case M_BMSAFEMAP:
2712			bmsafemap = WK_BMSAFEMAP(wk);
2713			while (newblk = LIST_FIRST(&bmsafemap->sm_newblkhd)) {
2714				newblk->nb_state |= DEPCOMPLETE;
2715				newblk->nb_bmsafemap = NULL;
2716				LIST_REMOVE(newblk, nb_deps);
2717			}
2718			while (adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd)) {
2719				adp->ad_state |= DEPCOMPLETE;
2720				adp->ad_buf = NULL;
2721				LIST_REMOVE(adp, ad_deps);
2722				handle_allocdirect_partdone(adp);
2723			}
2724			while (aip = LIST_FIRST(&bmsafemap->sm_allocindirhd)) {
2725				aip->ai_state |= DEPCOMPLETE;
2726				aip->ai_buf = NULL;
2727				LIST_REMOVE(aip, ai_deps);
2728				handle_allocindir_partdone(aip);
2729			}
2730			while ((inodedep =
2731			       LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
2732				inodedep->id_state |= DEPCOMPLETE;
2733				LIST_REMOVE(inodedep, id_deps);
2734				inodedep->id_buf = NULL;
2735			}
2736			WORKITEM_FREE(bmsafemap, M_BMSAFEMAP);
2737			continue;
2738
2739		case M_MKDIR:
2740			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
2741			continue;
2742
2743		case M_ALLOCDIRECT:
2744			adp = WK_ALLOCDIRECT(wk);
2745			adp->ad_state |= COMPLETE;
2746			handle_allocdirect_partdone(adp);
2747			continue;
2748
2749		case M_ALLOCINDIR:
2750			aip = WK_ALLOCINDIR(wk);
2751			aip->ai_state |= COMPLETE;
2752			handle_allocindir_partdone(aip);
2753			continue;
2754
2755		case M_INDIRDEP:
2756			indirdep = WK_INDIRDEP(wk);
2757			if (indirdep->ir_state & GOINGAWAY)
2758				panic("disk_write_complete: indirdep gone");
2759			bp->b_data = (caddr_t)indirdep->ir_saveddata;
2760			indirdep->ir_state &= ~UNDONE;
2761			indirdep->ir_state |= ATTACHED;
2762			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
2763				LIST_REMOVE(aip, ai_next);
2764				handle_allocindir_partdone(aip);
2765			}
2766			WORKLIST_INSERT(&reattach, wk);
2767			bdirty(bp);
2768			continue;
2769
2770		default:
2771			panic("handle_disk_write_complete: Unknown type %s",
2772			    TYPENAME(wk->wk_type));
2773			/* NOTREACHED */
2774		}
2775	}
2776	/*
2777	 * Reattach any requests that must be redone.
2778	 */
2779	while ((wk = LIST_FIRST(&reattach)) != NULL) {
2780		WORKLIST_REMOVE(wk);
2781		WORKLIST_INSERT(&bp->b_dep, wk);
2782	}
2783#ifdef DEBUG
2784	if (lk.lkt_held != -2)
2785		panic("softdep_disk_write_complete: lock lost");
2786	lk.lkt_held = -1;
2787#endif
2788}
2789
2790/*
2791 * Called from within softdep_disk_write_complete above. Note that
2792 * this routine is always called from interrupt level with further
2793 * splbio interrupts blocked.
2794 */
2795static void
2796handle_allocdirect_partdone(adp)
2797	struct allocdirect *adp;	/* the completed allocdirect */
2798{
2799	struct allocdirect *listadp;
2800	struct inodedep *inodedep;
2801	long bsize;
2802
2803	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
2804		return;
2805	if (adp->ad_buf != NULL)
2806		panic("handle_allocdirect_partdone: dangling dep");
2807	/*
2808	 * The on-disk inode cannot claim to be any larger than the last
2809	 * fragment that has been written. Otherwise, the on-disk inode
2810	 * might have fragments that were not the last block in the file
2811	 * which would corrupt the filesystem. Thus, we cannot free any
2812	 * allocdirects after one whose ad_oldblkno claims a fragment as
2813	 * these blocks must be rolled back to zero before writing the inode.
2814	 * We check the currently active set of allocdirects in id_inoupdt.
2815	 */
2816	inodedep = adp->ad_inodedep;
2817	bsize = inodedep->id_fs->fs_bsize;
2818	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp;
2819	     listadp = TAILQ_NEXT(listadp, ad_next)) {
2820		/* found our block */
2821		if (listadp == adp)
2822			break;
2823		/* continue if ad_oldlbn is not a fragment */
2824		if (listadp->ad_oldsize == 0 ||
2825		    listadp->ad_oldsize == bsize)
2826			continue;
2827		/* hit a fragment */
2828		return;
2829	}
2830	/*
2831	 * If we have reached the end of the current list without
2832	 * finding the just finished dependency, then it must be
2833	 * on the future dependency list. Future dependencies cannot
2834	 * be freed until they are moved to the current list.
2835	 */
2836	if (listadp == NULL) {
2837#ifdef DEBUG
2838		for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp;
2839		     listadp = TAILQ_NEXT(listadp, ad_next))
2840			/* found our block */
2841			if (listadp == adp)
2842				break;
2843		if (listadp == NULL)
2844			panic("handle_allocdirect_partdone: lost dep");
2845#endif /* DEBUG */
2846		return;
2847	}
2848	/*
2849	 * If we have found the just finished dependency, then free
2850	 * it along with anything that follows it that is complete.
2851	 */
2852	for (; adp; adp = listadp) {
2853		listadp = TAILQ_NEXT(adp, ad_next);
2854		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
2855			return;
2856		free_allocdirect(&inodedep->id_inoupdt, adp, 1);
2857	}
2858}
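
/*
 * Example of the fragment rule above (illustrative only): if id_inoupdt
 * holds allocdirects for lbns 0, 1, and 2, and the entry for lbn 1 rolls
 * back to a fragment (0 < ad_oldsize < fs_bsize), a completed lbn 2 may
 * not be freed yet; the inode must first reach disk with its size clamped
 * at that fragment, as arranged by initiate_write_inodeblock() above.
 */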
2859
2860/*
2861 * Called from within softdep_disk_write_complete above. Note that
2862 * this routine is always called from interrupt level with further
2863 * splbio interrupts blocked.
2864 */
2865static void
2866handle_allocindir_partdone(aip)
2867	struct allocindir *aip;		/* the completed allocindir */
2868{
2869	struct indirdep *indirdep;
2870
2871	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
2872		return;
2873	if (aip->ai_buf != NULL)
2874		panic("handle_allocindir_partdone: dangling dependency");
2875	indirdep = aip->ai_indirdep;
2876	if (indirdep->ir_state & UNDONE) {
2877		LIST_REMOVE(aip, ai_next);
2878		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
2879		return;
2880	}
2881	((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
2882	    aip->ai_newblkno;
2883	LIST_REMOVE(aip, ai_next);
2884	if (aip->ai_freefrag != NULL)
2885		add_to_worklist(&aip->ai_freefrag->ff_list);
2886	WORKITEM_FREE(aip, M_ALLOCINDIR);
2887}
2888
2889/*
2890 * Called from within softdep_disk_write_complete above to restore
2891 * in-memory inode block contents to their most up-to-date state. Note
2892 * that this routine is always called from interrupt level with further
2893 * splbio interrupts blocked.
2894 */
2895static int
2896handle_written_inodeblock(inodedep, bp)
2897	struct inodedep *inodedep;
2898	struct buf *bp;		/* buffer containing the inode block */
2899{
2900	struct pagedep *pagedep;
2901	struct worklist *wk, *filefree;
2902	struct allocdirect *adp, *nextadp;
2903	struct dinode *dp;
2904	struct diradd *dap;
2905	int hadchanges;
2906
2907	if ((inodedep->id_state & IOSTARTED) == 0)
2908		panic("handle_written_inodeblock: not started");
2909	inodedep->id_state &= ~IOSTARTED;
2910	inodedep->id_state |= COMPLETE;
2911	dp = (struct dinode *)bp->b_data +
2912	    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
2913	/*
2914	 * If we had to roll back the inode allocation because of
2915	 * bitmaps being incomplete, then simply restore it.
2916	 * Keep the block dirty so that it will not be reclaimed until
2917	 * all associated dependencies have been cleared and the
2918	 * corresponding updates written to disk.
2919	 */
2920	if (inodedep->id_savedino != NULL) {
2921		*dp = *inodedep->id_savedino;
2922		FREE(inodedep->id_savedino, M_INODEDEP);
2923		inodedep->id_savedino = NULL;
2924		bdirty(bp);
2925		return (1);
2926	}
2927	/*
2928	 * Roll forward anything that had to be rolled back before
2929	 * the inode could be updated.
2930	 */
2931	hadchanges = 0;
2932	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
2933		nextadp = TAILQ_NEXT(adp, ad_next);
2934		if (adp->ad_state & ATTACHED)
2935			panic("handle_written_inodeblock: new entry");
2936		if (adp->ad_lbn < NDADDR) {
2937			if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno)
2938				panic("%s: %s #%d mismatch %d != %d",
2939				    "handle_written_inodeblock",
2940				    "direct pointer", adp->ad_lbn,
2941				    dp->di_db[adp->ad_lbn], adp->ad_oldblkno);
2942			dp->di_db[adp->ad_lbn] = adp->ad_newblkno;
2943		} else {
2944			if (dp->di_ib[adp->ad_lbn - NDADDR] != 0)
2945				panic("%s: %s #%d allocated as %d",
2946				    "handle_written_inodeblock",
2947				    "indirect pointer", adp->ad_lbn - NDADDR,
2948				    dp->di_ib[adp->ad_lbn - NDADDR]);
2949			dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno;
2950		}
2951		adp->ad_state &= ~UNDONE;
2952		adp->ad_state |= ATTACHED;
2953		hadchanges = 1;
2954	}
2955	/*
2956	 * Reset the file size to its most up-to-date value.
2957	 */
2958	if (inodedep->id_savedsize == -1)
2959		panic("handle_written_inodeblock: bad size");
2960	if (dp->di_size != inodedep->id_savedsize) {
2961		dp->di_size = inodedep->id_savedsize;
2962		hadchanges = 1;
2963	}
2964	inodedep->id_savedsize = -1;
2965	/*
2966	 * If there were any rollbacks in the inode block, then it must be
2967	 * marked dirty so that it will eventually get written back in
2968	 * its correct form.
2969	 */
2970	if (hadchanges)
2971		bdirty(bp);
2972	/*
2973	 * Process any allocdirects that completed during the update.
2974	 */
2975	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
2976		handle_allocdirect_partdone(adp);
2977	/*
2978	 * Process deallocations that were held pending until the
2979	 * inode had been written to disk. Freeing of the inode
2980	 * is delayed until after all blocks have been freed to
2981	 * avoid creation of new <vfsid, inum, lbn> triples
2982	 * before the old ones have been deleted.
2983	 */
2984	filefree = NULL;
2985	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
2986		WORKLIST_REMOVE(wk);
2987		switch (wk->wk_type) {
2988
2989		case M_FREEFILE:
2990			/*
2991			 * We defer adding filefree to the worklist until
2992			 * all other additions have been made to ensure
2993			 * that it will be done after all the old blocks
2994			 * have been freed.
2995			 */
2996			if (filefree != NULL)
2997				panic("handle_written_inodeblock: filefree");
2998			filefree = wk;
2999			continue;
3000
3001		case M_MKDIR:
3002			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
3003			continue;
3004
3005		case M_DIRADD:
3006			dap = WK_DIRADD(wk);
3007			dap->da_state |= COMPLETE;
3008			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3009				if (dap->da_state & DIRCHG)
3010					pagedep = dap->da_previous->dm_pagedep;
3011				else
3012					pagedep = dap->da_pagedep;
3013				LIST_REMOVE(dap, da_pdlist);
3014				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3015				    da_pdlist);
3016			}
3017			WORKLIST_INSERT(&inodedep->id_pendinghd, wk);
3018			continue;
3019
3020		case M_FREEBLKS:
3021		case M_FREEFRAG:
3022		case M_DIRREM:
3023			add_to_worklist(wk);
3024			continue;
3025
3026		default:
3027			panic("handle_written_inodeblock: Unknown type %s",
3028			    TYPENAME(wk->wk_type));
3029			/* NOTREACHED */
3030		}
3031	}
3032	if (filefree != NULL)
3033		add_to_worklist(filefree);
3034
3035	/*
3036	 * If no outstanding dependencies, free it.
3037	 */
3038	if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
3039		return (0);
3040	return (hadchanges);
3041}
3042
3043/*
3044 * Handle the completion of a mkdir dependency.
3045 */
3046static void
3047handle_written_mkdir(mkdir, type)
3048	struct mkdir *mkdir;
3049	int type;
3050{
3051	struct diradd *dap;
3052	struct pagedep *pagedep;
3053
3054	if (mkdir->md_state != type)
3055		panic("handle_written_mkdir: bad type");
3056	dap = mkdir->md_diradd;
3057	dap->da_state &= ~type;
3058	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
3059		dap->da_state |= DEPCOMPLETE;
3060	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3061		if (dap->da_state & DIRCHG)
3062			pagedep = dap->da_previous->dm_pagedep;
3063		else
3064			pagedep = dap->da_pagedep;
3065		LIST_REMOVE(dap, da_pdlist);
3066		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3067	}
3068	LIST_REMOVE(mkdir, md_mkdirs);
3069	WORKITEM_FREE(mkdir, M_MKDIR);
3070}
3071
3072/*
3073 * Called from within softdep_disk_write_complete above.
3074 * A write operation was just completed. Removed inodes can
3075 * now be freed and associated block pointers may be committed.
3076 * Note that this routine is always called from interrupt level
3077 * with further splbio interrupts blocked.
3078 */
3079static int
3080handle_written_filepage(pagedep, bp)
3081	struct pagedep *pagedep;
3082	struct buf *bp;		/* buffer containing the written page */
3083{
3084	struct dirrem *dirrem;
3085	struct diradd *dap, *nextdap;
3086	struct direct *ep;
3087	int i, chgs;
3088
3089	if ((pagedep->pd_state & IOSTARTED) == 0)
3090		panic("handle_written_filepage: not started");
3091	pagedep->pd_state &= ~IOSTARTED;
3092	/*
3093	 * Process any directory removals that have been committed.
3094	 */
3095	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
3096		LIST_REMOVE(dirrem, dm_next);
3097		dirrem->dm_dirinum = pagedep->pd_ino;
3098		add_to_worklist(&dirrem->dm_list);
3099	}
3100	/*
3101	 * Free any directory additions that have been committed.
3102	 */
3103	while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
3104		free_diradd(dap);
3105	/*
3106	 * Uncommitted directory entries must be restored.
3107	 */
3108	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
3109		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
3110		     dap = nextdap) {
3111			nextdap = LIST_NEXT(dap, da_pdlist);
3112			if (dap->da_state & ATTACHED)
3113				panic("handle_written_filepage: attached");
3114			ep = (struct direct *)
3115			    ((char *)bp->b_data + dap->da_offset);
3116			ep->d_ino = dap->da_newinum;
3117			dap->da_state &= ~UNDONE;
3118			dap->da_state |= ATTACHED;
3119			chgs = 1;
3120			/*
3121			 * If the inode referenced by the directory has
3122			 * been written out, then the dependency can be
3123			 * moved to the pending list.
3124			 */
3125			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
3126				LIST_REMOVE(dap, da_pdlist);
3127				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
3128				    da_pdlist);
3129			}
3130		}
3131	}
3132	/*
3133	 * If there were any rollbacks in the directory, then it must be
3134	 * marked dirty so that it will eventually get written back in
3135	 * its correct form.
3136	 */
3137	if (chgs)
3138		bdirty(bp);
3139	/*
3140	 * If no dependencies remain, the pagedep will be freed.
3141	 * Otherwise it will remain to update the page before it
3142	 * is written back to disk.
3143	 */
3144	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
3145		for (i = 0; i < DAHASHSZ; i++)
3146			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
3147				break;
3148		if (i == DAHASHSZ) {
3149			LIST_REMOVE(pagedep, pd_hash);
3150			WORKITEM_FREE(pagedep, M_PAGEDEP);
3151			return (0);
3152		}
3153	}
3154	return (1);
3155}
3156
3157/*
3158 * Writing back in-core inode structures.
3159 *
3160 * The file system only accesses an inode's contents when it occupies an
3161 * "in-core" inode structure.  These "in-core" structures are separate from
3162 * the page frames used to cache inode blocks.  Only the latter are
3163 * transferred to/from the disk.  So, when the updated contents of the
3164 * "in-core" inode structure are copied to the corresponding in-memory inode
3165 * block, the dependencies are also transferred.  The following procedure is
3166 * called when copying a dirty "in-core" inode to a cached inode block.
3167 */
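
/*
 * Illustrative caller sketch (an approximation of ffs_update(), which
 * lives in ffs_inode.c, not in this file):
 *
 *	bread(ip->i_devvp, ..., &bp);			fetch inode block
 *	softdep_update_inodeblock(ip, bp, waitfor);
 *	*((struct dinode *)bp->b_data +
 *	    ino_to_fsbo(fs, ip->i_number)) = ip->i_din;
 *	waitfor ? bwrite(bp) : bdwrite(bp);
 *
 * The dependencies are moved onto the buffer just before the "in-core"
 * contents are copied into it.
 */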
3168
3169/*
3170 * Called when an inode is loaded from disk. If the effective link count
3171 * differed from the actual link count when it was last flushed, then we
3172 * need to ensure that the correct effective link count is put back.
3173 */
3174void
3175softdep_load_inodeblock(ip)
3176	struct inode *ip;	/* the "in_core" copy of the inode */
3177{
3178	struct inodedep *inodedep;
3180
3181	/*
3182	 * Check for alternate nlink count.
3183	 */
3184	ip->i_effnlink = ip->i_nlink;
3185	ACQUIRE_LOCK(&lk);
3186	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3187		FREE_LOCK(&lk);
3188		return;
3189	}
3190	if (inodedep->id_nlinkdelta != 0) {
3191		ip->i_effnlink -= inodedep->id_nlinkdelta;
3192		inodedep->id_nlinkdelta = 0;
3193		(void) free_inodedep(inodedep);
3194	}
3195	FREE_LOCK(&lk);
3196}
3197
3198/*
3199 * This routine is called just before the "in-core" inode
3200 * information is to be copied to the in-memory inode block.
3201 * Recall that an inode block contains several inodes. If
3202 * the force flag is set, then the dependencies will be
3203 * cleared so that the update can always be made. Note that
3204 * the buffer is locked when this routine is called, so we
3205 * will never be in the middle of writing the inode block
3206 * to disk.
3207 */
3208void
3209softdep_update_inodeblock(ip, bp, waitfor)
3210	struct inode *ip;	/* the "in_core" copy of the inode */
3211	struct buf *bp;		/* the buffer containing the inode block */
3212	int waitfor;		/* 1 => update must be allowed */
3213{
3214	struct inodedep *inodedep;
3215	int error, gotit;
3216
3217	/*
3218	 * If the effective link count is not equal to the actual link
3219	 * count, then we must track the difference in an inodedep while
3220	 * the inode is (potentially) tossed out of the cache. Otherwise,
3221	 * if there is no existing inodedep, then there are no dependencies
3222	 * to track.
3223	 */
3224	ACQUIRE_LOCK(&lk);
3225	if (ip->i_effnlink != ip->i_nlink) {
3226		(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC,
3227		    &inodedep);
3228	} else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
3229		FREE_LOCK(&lk);
3230		return;
3231	}
3232	if (ip->i_nlink < ip->i_effnlink)
3233		panic("softdep_update_inodeblock: bad delta");
3234	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3235	/*
3236	 * If the last remaining use for the inodedep was to track the
3237	 * link count, and there is no difference between the effective
3238	 * and actual link count, then we can free the inodedep.
3239	 */
3240	if (free_inodedep(inodedep)) {
3241		FREE_LOCK(&lk);
3242		return;
3243	}
3244	/*
3245	 * Changes have been initiated. Anything depending on these
3246	 * changes cannot occur until this inode has been written.
3247	 */
3248	inodedep->id_state &= ~COMPLETE;
3249	if ((inodedep->id_state & ONWORKLIST) == 0)
3250		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
3251	/*
3252	 * Any new dependencies associated with the incore inode must
3253	 * now be moved to the list associated with the buffer holding
3254	 * the in-memory copy of the inode. Once merged, process any
3255	 * allocdirects that are completed by the merger.
3256	 */
3257	merge_inode_lists(inodedep);
3258	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
3259		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
3260	/*
3261	 * Newly allocated inodes cannot be written until the bitmap
3262	 * that allocates them has been written (indicated by
3263	 * DEPCOMPLETE being set in id_state). If we are doing a
3264	 * forced sync (e.g., an fsync on a file), we force the bitmap
3265	 * to be written so that the update can be done.
3266	 */
3267	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
3268		FREE_LOCK(&lk);
3269		return;
3270	}
3271	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
3272	FREE_LOCK(&lk);
3273	if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0)
3274		softdep_error("softdep_update_inodeblock: bwrite", error);
3275	if ((inodedep->id_state & DEPCOMPLETE) == 0)
3276		panic("softdep_update_inodeblock: update failed");
3277}
3278
3279/*
3280 * Merge the new inode dependency list (id_newinoupdt) into the old
3281 * inode dependency list (id_inoupdt). This routine must be called
3282 * with splbio interrupts blocked.
3283 */
3284static void
3285merge_inode_lists(inodedep)
3286	struct inodedep *inodedep;
3287{
3288	struct allocdirect *listadp, *newadp;
3289
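	/*
	 * Both lists are kept sorted by logical block number.  Walk the
	 * old list once, splicing each new entry in ahead of the first
	 * old entry with an equal or greater lbn.
	 */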
3290	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3291	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
3292		if (listadp->ad_lbn < newadp->ad_lbn) {
3293			listadp = TAILQ_NEXT(listadp, ad_next);
3294			continue;
3295		}
3296		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3297		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
3298		if (listadp->ad_lbn == newadp->ad_lbn) {
3299			allocdirect_merge(&inodedep->id_inoupdt, newadp,
3300			    listadp);
3301			listadp = newadp;
3302		}
3303		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
3304	}
3305	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
3306		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
3307		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
3308	}
3309}
3310
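/*
 * Worked example (illustrative lbn values): merging a new list with lbns
 * {0, 2, 5} into an old list with lbns {2, 3} leaves id_inoupdt holding
 * {0, 2, 3, 5}, where the entry for lbn 2 is the new dependency, the old
 * one having been folded into it by allocdirect_merge().
 */
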
3311/*
3312 * If we are doing an fsync, then we must ensure that any directory
3313 * entries for the inode have been written after the inode gets to disk.
3314 */
3315int
3316softdep_fsync(vp)
3317	struct vnode *vp;	/* the vnode of the file to be fsync'ed */
3318{
3319	struct diradd *dap, *olddap;
3320	struct inodedep *inodedep;
3321	struct pagedep *pagedep;
3322	struct worklist *wk;
3323	struct mount *mnt;
3324	struct vnode *pvp;
3325	struct inode *ip;
3326	struct buf *bp;
3327	struct fs *fs;
3328	struct proc *p = curproc;		/* XXX */
3329	int error, flushparent;
3330	struct timeval tv;
3331	ino_t parentino;
3332	ufs_lbn_t lbn;
3333
3334	ip = VTOI(vp);
3335	fs = ip->i_fs;
3336	for (error = 0, flushparent = 0, olddap = NULL; ; ) {
3337		ACQUIRE_LOCK(&lk);
3338		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
3339			break;
3340		if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
3341		    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
3342		    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
3343			panic("softdep_fsync: pending ops");
3344		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
3345			break;
3346		if (wk->wk_type != M_DIRADD)
3347			panic("softdep_fsync: Unexpected type %s",
3348			    TYPENAME(wk->wk_type));
3349		dap = WK_DIRADD(wk);
3350		/*
3351		 * If we have failed to get rid of all the dependencies
3352		 * then something is seriously wrong.
3353		 */
3354		if (dap == olddap)
3355			panic("softdep_fsync: flush failed");
3356		olddap = dap;
3357		/*
3358		 * Flush our parent if this directory entry
3359		 * has a MKDIR_PARENT dependency.
3360		 */
3361		if (dap->da_state & DIRCHG)
3362			pagedep = dap->da_previous->dm_pagedep;
3363		else
3364			pagedep = dap->da_pagedep;
3365		mnt = pagedep->pd_mnt;
3366		parentino = pagedep->pd_ino;
3367		lbn = pagedep->pd_lbn;
3368		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
3369			panic("softdep_fsync: dirty");
3370		flushparent = dap->da_state & MKDIR_PARENT;
3371		/*
3372		 * If we are being fsync'ed as part of vgone'ing this vnode,
3373		 * then we will not be able to release and recover the
3374		 * vnode below, so we just have to give up on writing its
3375		 * directory entry out. It will eventually be written, just
3376		 * not now, but then the user was not asking to have it
3377		 * written, so we are not breaking any promises.
3378		 */
3379		if (vp->v_flag & VXLOCK)
3380			break;
3381		/*
3382		 * We prevent deadlock by always fetching inodes from the
3383		 * root, moving down the directory tree. Thus, when fetching
3384		 * our parent directory, we must unlock ourselves before
3385		 * requesting the lock on our parent. See the comment in
3386		 * ufs_lookup for details on possible races.
3387		 */
3388		FREE_LOCK(&lk);
3389		VOP_UNLOCK(vp, 0, p);
3390		if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) {
3391			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3392			return (error);
3393		}
3394		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
3395		if (flushparent) {
3396			tv = time;
3397			if ((error = VOP_UPDATE(pvp, &tv, &tv, MNT_WAIT)) != 0) {
3398				vput(pvp);
3399				return (error);
3400			}
3401		}
3402		/*
3403		 * Flush directory page containing the inode's name.
3404		 */
3405		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
3406		    &bp);
3407		vput(pvp);
3408		if (error != 0) {
3409			brelse(bp);
3410			return (error);
3411		}
3412		if ((error = VOP_BWRITE(bp)) != 0)
			return (error);
3413	}
3414	FREE_LOCK(&lk);
3415	return (0);
3416}
3417
3418/*
3419 * This routine is called when we are trying to synchronously flush a
3420 * file. This routine must eliminate any filesystem metadata dependencies
3421 * so that the syncing routine can succeed by pushing the dirty blocks
3422 * associated with the file. If any I/O errors occur, they are returned.
3423 */
3424int
3425softdep_sync_metadata(ap)
3426	struct vop_fsync_args /* {
3427		struct vnode *a_vp;
3428		struct ucred *a_cred;
3429		int a_waitfor;
3430		struct proc *a_p;
3431	} */ *ap;
3432{
3433	struct vnode *vp = ap->a_vp;
3434	struct pagedep *pagedep;
3435	struct allocdirect *adp;
3436	struct allocindir *aip;
3437	struct buf *bp, *nbp;
3438	struct worklist *wk;
3439	int i, error, waitfor;
3440
3441	/*
3442	 * Check whether this vnode is involved in a filesystem
3443	 * that is doing soft dependency processing.
3444	 */
3445	if (vp->v_type != VBLK) {
3446		if (!DOINGSOFTDEP(vp))
3447			return (0);
3448	} else if (vp->v_specmountpoint == NULL ||
3449	    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
3450		return (0);
3452	/*
3453	 * Ensure that any direct block dependencies have been cleared.
3454	 */
3455	ACQUIRE_LOCK(&lk);
3456	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs,
	    VTOI(vp)->i_number)) != 0) {
3457		FREE_LOCK(&lk);
3458		return (error);
3459	}
3460	/*
3461	 * For most files, the only metadata dependencies are the
3462	 * cylinder group maps that allocate their inode or blocks.
3463	 * The block allocation dependencies can be found by traversing
3464	 * the dependency lists for any buffers that remain on their
3465	 * dirty buffer list. The inode allocation dependency will
3466	 * be resolved when the inode is updated with MNT_WAIT.
3467	 * This work is done in two passes. The first pass grabs most
3468	 * of the buffers and begins asynchronously writing them. The
3469	 * only way to wait for these asynchronous writes is to sleep
3470	 * on the filesystem vnode which may stay busy for a long time
3471	 * if the filesystem is active. So, instead, we make a second
3472	 * pass over the dependencies blocking on each write. In the
3473	 * usual case we will be blocking against a write that we
3474	 * initiated, so when it is done the dependency will have been
3475	 * resolved. Thus the second pass is expected to end quickly.
3476	 */
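	/*
	 * In outline (pseudocode for the scan below): pass 1 runs with
	 * waitfor == MNT_NOWAIT and bawrite()s each dependency buffer;
	 * pass 2 repeats the scan with waitfor == MNT_WAIT, using
	 * VOP_BWRITE() to block on each buffer that remains.
	 */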
3477	waitfor = MNT_NOWAIT;
3478top:
3479	if (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
3480		FREE_LOCK(&lk);
3481		return (0);
3482	}
3483	bp = LIST_FIRST(&vp->v_dirtyblkhd);
3484loop:
3485	/*
3486	 * As we hold the buffer locked, none of its dependencies
3487	 * will disappear.
3488	 */
3489	for (wk = LIST_FIRST(&bp->b_dep); wk;
3490	     wk = LIST_NEXT(wk, wk_list)) {
3491		switch (wk->wk_type) {
3492
3493		case M_ALLOCDIRECT:
3494			adp = WK_ALLOCDIRECT(wk);
3495			if (adp->ad_state & DEPCOMPLETE)
3496				break;
3497			nbp = adp->ad_buf;
3498			if (getdirtybuf(&nbp, waitfor) == 0)
3499				break;
3500			FREE_LOCK(&lk);
3501			if (waitfor == MNT_NOWAIT) {
3502				bawrite(nbp);
3503			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3504				bawrite(bp);
3505				return (error);
3506			}
3507			ACQUIRE_LOCK(&lk);
3508			break;
3509
3510		case M_ALLOCINDIR:
3511			aip = WK_ALLOCINDIR(wk);
3512			if (aip->ai_state & DEPCOMPLETE)
3513				break;
3514			nbp = aip->ai_buf;
3515			if (getdirtybuf(&nbp, waitfor) == 0)
3516				break;
3517			FREE_LOCK(&lk);
3518			if (waitfor == MNT_NOWAIT) {
3519				bawrite(nbp);
3520			} else if ((error = VOP_BWRITE(nbp)) != 0) {
3521				bawrite(bp);
3522				return (error);
3523			}
3524			ACQUIRE_LOCK(&lk);
3525			break;
3526
3527		case M_INDIRDEP:
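			/*
			 * The lock is dropped for each write below, during
			 * which the indirect dependency list may change, so
			 * the scan restarts from the head of the list after
			 * every successful write.
			 */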
3528		restart:
3529			for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd);
3530			     aip; aip = LIST_NEXT(aip, ai_next)) {
3531				if (aip->ai_state & DEPCOMPLETE)
3532					continue;
3533				nbp = aip->ai_buf;
3534				if (getdirtybuf(&nbp, MNT_WAIT) == 0)
3535					goto restart;
3536				FREE_LOCK(&lk);
3537				if ((error = VOP_BWRITE(nbp)) != 0) {
3538					bawrite(bp);
3539					return (error);
3540				}
3541				ACQUIRE_LOCK(&lk);
3542				goto restart;
3543			}
3544			break;
3545
3546		case M_INODEDEP:
3547			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
3548			    WK_INODEDEP(wk)->id_ino)) != 0) {
3549				FREE_LOCK(&lk);
3550				bawrite(bp);
3551				return (error);
3552			}
3553			break;
3554
3555		case M_PAGEDEP:
3556			/*
3557			 * We are trying to sync a directory that may
3558			 * have dependencies on both its own metadata
3559			 * and/or dependencies on the inodes of any
3560			 * recently allocated files. We walk its diradd
3561			 * lists pushing out the associated inode.
3562			 */
3563			pagedep = WK_PAGEDEP(wk);
3564			for (i = 0; i < DAHASHSZ; i++) {
3565				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL)
3566					continue;
3567				if ((error = flush_pagedep_deps(vp,
3568				    pagedep->pd_mnt,
				    &pagedep->pd_diraddhd[i])) != 0) {
3569					FREE_LOCK(&lk);
3570					bawrite(bp);
3571					return (error);
3572				}
3573			}
3574			break;
3575
3576		default:
3577			panic("softdep_sync_metadata: Unknown type %s",
3578			    TYPENAME(wk->wk_type));
3579			/* NOTREACHED */
3580		}
3581	}
3582	(void) getdirtybuf(&LIST_NEXT(bp, b_vnbufs), MNT_WAIT);
3583	nbp = LIST_NEXT(bp, b_vnbufs);
3584	FREE_LOCK(&lk);
3585	bawrite(bp);
3586	ACQUIRE_LOCK(&lk);
3587	if (nbp != NULL) {
3588		bp = nbp;
3589		goto loop;
3590	}
3591	/*
3592	 * We must wait for any I/O in progress to finish so that
3593	 * all potential buffers on the dirty list will be visible.
3594	 * Once they are all there, proceed with the second pass
3595	 * which will wait for the I/O as per above.
3596	 */
3597	while (vp->v_numoutput) {
3598		vp->v_flag |= VBWAIT;
3599		FREE_LOCK_INTERLOCKED(&lk);
3600		sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1);
3601		ACQUIRE_LOCK_INTERLOCKED(&lk);
3602	}
3603	/*
3604	 * The brief unlock is to allow any pent-up dependency
3605	 * processing to be done.
3606	 */
3607	if (waitfor == MNT_NOWAIT) {
3608		waitfor = MNT_WAIT;
3609		FREE_LOCK(&lk);
3610		ACQUIRE_LOCK(&lk);
3611		goto top;
3612	}
3613
3614	/*
3615	 * If we have managed to get rid of all the dirty buffers,
3616	 * then we are done. For certain directories and block
3617	 * devices, we may need to do further work.
3618	 */
3619	if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
3620		FREE_LOCK(&lk);
3621		return (0);
3622	}
3623
3624	FREE_LOCK(&lk);
3625	/*
3626	 * If we are trying to sync a block device, some of its buffers may
3627	 * contain metadata that cannot be written until the contents of some
3628	 * partially written files have been written to disk. The only easy
3629	 * way to accomplish this is to sync the entire filesystem (luckily
3630	 * this happens rarely).
3631	 */
3632	if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
3633	    (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
3634	     ap->a_p)) != 0)
3635		return (error);
3636	return (0);
3637}
3638
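/*
 * Illustrative call pattern (an assumption about the caller, which lives
 * in the FFS fsync code rather than in this file): the fsync routine
 * clears the metadata dependencies first, e.g.
 *
 *	if (DOINGSOFTDEP(vp) && (error = softdep_sync_metadata(ap)) != 0)
 *		return (error);
 *
 * and only then loops over the vnode's remaining dirty data buffers.
 */
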
3639/*
3640 * Flush the dependencies associated with an inodedep.
3641 * Called with splbio blocked.
3642 */
3643static int
3644flush_inodedep_deps(fs, ino)
3645	struct fs *fs;
3646	ino_t ino;
3647{
3648	struct inodedep *inodedep;
3649	struct allocdirect *adp;
3650	int error, waitfor;
3651	struct buf *bp;
3652
3653	/*
3654	 * This work is done in two passes. The first pass grabs most
3655	 * of the buffers and begins asynchronously writing them. The
3656	 * only way to wait for these asynchronous writes is to sleep
3657	 * on the filesystem vnode which may stay busy for a long time
3658	 * if the filesystem is active. So, instead, we make a second
3659	 * pass over the dependencies blocking on each write. In the
3660	 * usual case we will be blocking against a write that we
3661	 * initiated, so when it is done the dependency will have been
3662	 * resolved. Thus the second pass is expected to end quickly.
3663	 * We give a brief window at the top of the loop to allow
3664	 * any pending I/O to complete.
3665	 */
3666	for (waitfor = MNT_NOWAIT; ; ) {
3667		FREE_LOCK(&lk);
3668		ACQUIRE_LOCK(&lk);
3669		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
3670			return (0);
3671		for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3672		     adp = TAILQ_NEXT(adp, ad_next)) {
3673			if (adp->ad_state & DEPCOMPLETE)
3674				continue;
3675			bp = adp->ad_buf;
3676			if (getdirtybuf(&bp, waitfor) == 0)
3677				break;
3678			FREE_LOCK(&lk);
3679			if (waitfor == MNT_NOWAIT) {
3680				bawrite(bp);
3681			} else if ((error = VOP_BWRITE(bp)) != 0) {
3682				ACQUIRE_LOCK(&lk);
3683				return (error);
3684			}
3685			ACQUIRE_LOCK(&lk);
3686			break;
3687		}
3688		if (adp != NULL)
3689			continue;
3690		for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp;
3691		     adp = TAILQ_NEXT(adp, ad_next)) {
3692			if (adp->ad_state & DEPCOMPLETE)
3693				continue;
3694			bp = adp->ad_buf;
3695			if (getdirtybuf(&bp, waitfor) == 0)
3696				break;
3697			FREE_LOCK(&lk);
3698			if (waitfor == MNT_NOWAIT) {
3699				bawrite(bp);
3700			} else if ((error = VOP_BWRITE(bp)) != 0) {
3701				ACQUIRE_LOCK(&lk);
3702				return (error);
3703			}
3704			ACQUIRE_LOCK(&lk);
3705			break;
3706		}
3707		if (adp != NULL)
3708			continue;
3709		/*
3710	 * If we have just finished pass 2, we are done; otherwise begin pass 2.
3711		 */
3712		if (waitfor == MNT_WAIT)
3713			break;
3714		waitfor = MNT_WAIT;
3715	}
3716	/*
3717	 * Try freeing inodedep in case all dependencies have been removed.
3718	 */
3719	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
3720		(void) free_inodedep(inodedep);
3721	return (0);
3722}
3723
3724/*
3725 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
3726 * Called with splbio blocked.
3727 */
3728static int
3729flush_pagedep_deps(pvp, mp, diraddhdp)
3730	struct vnode *pvp;
3731	struct mount *mp;
3732	struct diraddhd *diraddhdp;
3733{
3734	struct proc *p = curproc;	/* XXX */
3735	struct inodedep *inodedep;
3736	struct ufsmount *ump;
3737	struct diradd *dap;
3738	struct timeval tv;
3739	struct vnode *vp;
3740	int gotit, error;
3741	struct buf *bp;
3742	ino_t inum;
3743
3744	ump = VFSTOUFS(mp);
3745	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
3746		/*
3747		 * Flush ourselves if this directory entry
3748		 * has a MKDIR_PARENT dependency.
3749		 */
3750		if (dap->da_state & MKDIR_PARENT) {
3751			tv = time;
3752			FREE_LOCK(&lk);
3753			if ((error = VOP_UPDATE(pvp, &tv, &tv, MNT_WAIT)) != 0)
3754				break;
3755			ACQUIRE_LOCK(&lk);
3756			/*
3757			 * If that cleared the dependencies, go on to the next one.
3758			 */
3759			if (dap != LIST_FIRST(diraddhdp))
3760				continue;
3761			if (dap->da_state & MKDIR_PARENT)
3762				panic("flush_pagedep_deps: MKDIR");
3763		}
3764		/*
3765		 * Flush the file on which the directory entry depends.
3766		 * If the inode has already been pushed out of the cache,
3767		 * then all the block dependencies will have been flushed
3768		 * leaving only inode dependencies (e.g., bitmaps). Thus,
3769		 * we do a ufs_ihashget to check for the vnode in the cache.
3770		 * If it is there, we do a full flush. If it is no longer
3771		 * there we need only dispose of any remaining bitmap
3772		 * dependencies and write the inode to disk.
3773		 */
3774		inum = dap->da_newinum;
3775		FREE_LOCK(&lk);
3776		if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) {
3777			ACQUIRE_LOCK(&lk);
3778			if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
3779				if (dap == LIST_FIRST(diraddhdp))
3780					panic("flush_pagedep_deps: flush 1 failed");
				/* The dependencies were resolved while unlocked. */
				continue;
			}
3781			/*
3782			 * If the inode still has bitmap dependencies,
3783			 * push them to disk.
3784			 */
3785			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3786				gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT);
3787				FREE_LOCK(&lk);
3788				if (gotit &&
3789				    (error = VOP_BWRITE(inodedep->id_buf)) != 0)
3790					break;
3791				ACQUIRE_LOCK(&lk);
3792			}
3793			if (dap != LIST_FIRST(diraddhdp))
3794				continue;
3795			/*
3796			 * If the inode is still sitting in a buffer waiting
3797			 * to be written, push it to disk.
3798			 */
3799			FREE_LOCK(&lk);
3800			if ((error = bread(ump->um_devvp,
3801			    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
3802			    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
3803				brelse(bp);
				break;
			}
3804			if ((error = VOP_BWRITE(bp)) != 0)
3805				break;
3806			ACQUIRE_LOCK(&lk);
3807			if (dap == LIST_FIRST(diraddhdp))
3808				panic("flush_pagedep_deps: flush 2 failed");
3809			continue;
3810		}
3811		if (vp->v_type == VDIR) {
3812			/*
3813			 * A newly allocated directory must have its "." and
3814			 * ".." entries written out before its name can be
3815			 * committed in its parent. We do not want or need
3816			 * the full semantics of a synchronous VOP_FSYNC as
3817			 * that may end up here again, once for each directory
3818			 * level in the filesystem. Instead, we push the blocks
3819			 * and wait for them to clear.
3820			 */
3821			if ((error = VOP_FSYNC(vp, p->p_cred, MNT_NOWAIT, p)) != 0) {
3822				vput(vp);
3823				break;
3824			}
3825			ACQUIRE_LOCK(&lk);
3826			while (vp->v_numoutput) {
3827				vp->v_flag |= VBWAIT;
3828				FREE_LOCK_INTERLOCKED(&lk);
3829				sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1);
3830				ACQUIRE_LOCK_INTERLOCKED(&lk);
3831			}
3832			FREE_LOCK(&lk);
3833		}
3834		tv = time;
3835		error = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT);
3836		vput(vp);
3837		if (error)
3838			break;
3839		/*
3840		 * If we have failed to get rid of all the dependencies
3841		 * then something is seriously wrong.
3842		 */
3843		if (dap == LIST_FIRST(diraddhdp))
3844			panic("flush_pagedep_deps: flush 3 failed");
3845		ACQUIRE_LOCK(&lk);
3846	}
3847	if (error)
3848		ACQUIRE_LOCK(&lk);
3849	return (error);
3850}
3851
3852/*
3853 * Acquire exclusive access to a buffer.
3854 * Must be called with splbio blocked.
3855 * Return 1 if buffer was acquired.
3856 */
3857static int
3858getdirtybuf(bpp, waitfor)
3859	struct buf **bpp;
3860	int waitfor;
3861{
3862	struct buf *bp;
3863
3864	for (;;) {
3865		if ((bp = *bpp) == NULL)
3866			return (0);
3867		if ((bp->b_flags & B_BUSY) == 0)
3868			break;
3869		if (waitfor != MNT_WAIT)
3870			return (0);
3871		bp->b_flags |= B_WANTED;
3872		FREE_LOCK_INTERLOCKED(&lk);
3873		sleep((caddr_t)bp, PRIBIO + 1);
3874		ACQUIRE_LOCK_INTERLOCKED(&lk);
3875	}
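	/* Only delayed-write (dirty) buffers are of interest to callers. */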
3876	if ((bp->b_flags & B_DELWRI) == 0)
3877		return (0);
3878	bremfree(bp);
3879	bp->b_flags |= B_BUSY;
3880	return (1);
3881}
3882
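/*
 * Typical use, as in the callers above: claim a dependency's buffer and
 * push it synchronously while the soft update lock is released, e.g.
 *
 *	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
 *	FREE_LOCK(&lk);
 *	if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0)
 *		softdep_error("caller: bwrite", error);
 */
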
3883/*
3884 * Called whenever a buffer that is being invalidated or reallocated
3885 * contains dependencies. This should only happen if an I/O error has
3886 * occurred. The routine is called with the buffer locked.
3887 */
3888void
3889softdep_deallocate_dependencies(bp)
3890	struct buf *bp;
3891{
3892	struct worklist *wk;
3893
3894	if ((bp->b_flags & B_ERROR) == 0)
3895		panic("softdep_deallocate_dependencies: dangling deps");
3896	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
3897	ACQUIRE_LOCK(&lk);
3898	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3899		WORKLIST_REMOVE(wk);
3900		FREE_LOCK(&lk);
3901		switch (wk->wk_type) {
3902		/*
3903		 * XXX - should really clean up, but for now we will
3904		 * just leak memory and not worry about it. Also should
3905		 * mark the filesystem permanently dirty so that it will
3906		 * force fsck to be run (though this would best be done
3907		 * in the mainline code).
3908		 */
3909		case M_PAGEDEP:
3910		case M_INODEDEP:
3911		case M_BMSAFEMAP:
3912		case M_ALLOCDIRECT:
3913		case M_INDIRDEP:
3914		case M_ALLOCINDIR:
3915		case M_MKDIR:
3916#ifdef DEBUG
3917			printf("Lost type %s\n", TYPENAME(wk->wk_type));
3918#endif
3919			break;
3920		default:
3921			panic("%s: Unexpected type %s",
3922			    "softdep_deallocate_dependencies",
3923			    TYPENAME(wk->wk_type));
3924			/* NOTREACHED */
3925		}
3926		ACQUIRE_LOCK(&lk);
3927	}
3928	FREE_LOCK(&lk);
3929}
3930
3931/*
3932 * Function to handle asynchronous write errors in the filesystem.
3933 */
3934void
3935softdep_error(func, error)
3936	char *func;
3937	int error;
3938{
3939
3940	/* XXX should do something better! */
3941	log(LOG_ERR, "%s: got error %d while accessing filesystem\n",
3942	    func, error);
3943}
3944